falcon : fix CUDA inference by making K and Q contiguous (#2830)

author Georgi Gerganov <redacted>

Sun, 27 Aug 2023 13:40:48 +0000 (16:40 +0300)

committer GitHub <redacted>

Sun, 27 Aug 2023 13:40:48 +0000 (16:40 +0300)
author Georgi Gerganov <redacted>
Sun, 27 Aug 2023 13:40:48 +0000 (16:40 +0300)
committer GitHub <redacted>
Sun, 27 Aug 2023 13:40:48 +0000 (16:40 +0300)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index d83aefc9af33c964543e391e767fcb1de8f9cabc..d76a25dc287dc9c9a9fb6d6699b2af2ec98c06cb 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6337,9 +6337,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
  
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
  
      const int mode = ((int32_t *) dst->op_params)[2];
      const bool is_glm = mode & 4;
+
      ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
  }
  
diff --git a/llama.cpp b/llama.cpp

index e9868f5d0da77fc096adedaa5e02fcb60049d5a0..0d12d9cca031f4b5b0bb15d9006e78a808781b0d 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -2642,18 +2642,20 @@ static struct ggml_cgraph * llm_build_falcon(
  
              const size_t wsize = ggml_type_size(cur->type);
  
-            struct ggml_tensor * tmpq = ggml_view_3d(
+            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+            //       non-contiguous views is added for the rope operator
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                  ctx0, cur, n_embd_head, n_head, N,
                  wsize * n_embd_head,
                  wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0);
+                0));
              offload_func_kq(tmpq);
  
-            struct ggml_tensor * tmpk = ggml_view_3d(
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                  ctx0, cur, n_embd_head, n_head_kv, N,
                  wsize * n_embd_head,
                  wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head *  n_head);
+                wsize * n_embd_head *  n_head));
              offload_func_kq(tmpk);
  
              struct ggml_tensor * tmpv = ggml_view_3d(
author	Georgi Gerganov <redacted>
	Sun, 27 Aug 2023 13:40:48 +0000 (16:40 +0300)
committer	GitHub <redacted>
	Sun, 27 Aug 2023 13:40:48 +0000 (16:40 +0300)
ggml-cuda.cu		patch | blob | history
llama.cpp		patch | blob | history