LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,
LLM_ARCH_PHI2,
+ LLM_ARCH_PLAMO,
LLM_ARCH_UNKNOWN,
};
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PLAMO, "plamo" },
};
enum llm_kv {
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV,
+ LLM_KV_ATTENTION_KEY_LENGTH,
+ LLM_KV_ATTENTION_VALUE_LENGTH,
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_ACT,
LLM_TENSOR_FFN_DOWN_EXP,
LLM_TENSOR_FFN_GATE_EXP,
LLM_TENSOR_FFN_UP_EXP,
LLM_ARCH_GPT2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
{
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
},
},
{
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_PLAMO,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
- throw std::runtime_error(std::string("unexpectedly reached end of file"));
+ throw std::runtime_error("unexpectedly reached end of file");
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
- llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
- (void) numa;
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+ GGML_UNUSED(numa);
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
- DWORD error = GetLastError();
if (hMapping == NULL) {
+ DWORD error = GetLastError();
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
- error = GetLastError();
+ DWORD error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
- if (prefetch) {
+ if (prefetch > 0) {
// PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
// advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
- range.NumberOfBytes = (SIZE_T)size;
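+ // prefetch is now a byte count ((size_t) -1 requests the whole file), so advise the kernel about at most that much of the mapping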
+ range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+ LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
- fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+ LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
- llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
- (void) file;
- (void) prefetch;
- (void) numa;
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+ GGML_UNUSED(file);
+ GGML_UNUSED(prefetch);
+ GGML_UNUSED(numa);
- throw std::runtime_error(std::string("mmap not supported"));
+ throw std::runtime_error("mmap not supported");
}
- void unmap(size_t offset, size_t len) {
- (void) offset;
- (void) len;
+ void unmap_fragment(size_t first, size_t last) {
+ GGML_UNUSED(first);
+ GGML_UNUSED(last);
- throw std::runtime_error(std::string("mmap not supported"));
+ throw std::runtime_error("mmap not supported");
}
#endif
};
}
static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
- return ggml_backend_metal_buffer_type();
+ buft = ggml_backend_metal_buffer_type();
}
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
if (n_gpu_layers > 0) {
- return ggml_backend_cuda_buffer_type(0);
+ buft = ggml_backend_cuda_buffer_type(0);
}
#elif defined(GGML_USE_CUBLAS)
- return ggml_backend_cuda_host_buffer_type();
+ buft = ggml_backend_cuda_host_buffer_type();
#elif defined(GGML_USE_CPU_HBM)
- return ggml_backend_cpu_hbm_buffer_type();
+ buft = ggml_backend_cpu_hbm_buffer_type();
#endif
- return ggml_backend_cpu_buffer_type();
+ if (buft == nullptr) {
+ buft = ggml_backend_cpu_buffer_type();
+ }
+
+ return buft;
GGML_UNUSED(n_gpu_layers);
}
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_SMALL,
+ MODEL_MEDIUM,
+ MODEL_LARGE,
+ MODEL_XL,
};
static const size_t kiB = 1024;
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
+ uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+ uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_ff;
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
float f_clamp_kqv;
float f_max_alibi_bias;
+
bool operator!=(const llama_hparams & other) const {
if (this->vocab_only != other.vocab_only) return true;
if (this->n_vocab != other.n_vocab) return true;
if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true;
+ if (this->n_embd_head_k != other.n_embd_head_k) return true;
+ if (this->n_embd_head_v != other.n_embd_head_v) return true;
if (this->n_ff != other.n_ff) return true;
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
- const float EPSILON = 1e-9;
+ const float EPSILON = 1e-9f;
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
return n_head/n_head_kv;
}
- uint32_t n_embd_head() const {
- return n_embd/n_head;
+ uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+ return n_embd_head_k * n_head_kv;
}
- uint32_t n_embd_gqa() const {
- return n_embd/n_gqa();
+ uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+ return n_embd_head_v * n_head_kv;
}
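+ // e.g. LLaMA-2-70B has n_head = 64, n_head_kv = 8 and a head size of 128, so each per-layer K (and V) cache row is n_embd_k_gqa = 1024 values instead of n_embd = 8192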
};
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
+ struct ggml_tensor * ffn_act;
};
struct llama_kv_cell {
uint32_t n_ctx,
int n_gpu_layers,
bool offload) {
- const uint32_t n_embd = hparams.n_embd_gqa();
- const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const uint32_t n_layer = hparams.n_layer;
cache.has_shift = false;
const int i_gpu_start = (int) n_layer - n_gpu_layers;
for (int i = 0; i < (int) n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
- ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
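+ // K and V rows are sized independently, so models with attention.key_length != attention.value_length get a correctly sized cache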
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
static const char * llama_model_type_name(e_model type) {
switch (type) {
- case MODEL_1B: return "1B";
- case MODEL_3B: return "3B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_13B: return "13B";
- case MODEL_15B: return "15B";
- case MODEL_30B: return "30B";
- case MODEL_34B: return "34B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- default: return "?B";
+ case MODEL_1B: return "1B";
+ case MODEL_3B: return "3B";
+ case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
+ case MODEL_13B: return "13B";
+ case MODEL_15B: return "15B";
+ case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
+ case MODEL_40B: return "40B";
+ case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
+ case MODEL_SMALL: return "0.1B";
+ case MODEL_MEDIUM: return "0.4B";
+ case MODEL_LARGE: return "0.8B";
+ case MODEL_XL: return "1.5B";
+ default: return "?B";
}
}
// gpt-j n_rot = rotary_dim
}
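+ // per-head K/V sizes default to n_embd / n_head and can be overridden by the optional attention.key_length / attention.value_length keys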
+ hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+ hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
// arch-specific KVs
switch (model.arch) {
case LLM_ARCH_LLAMA:
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_PLAMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_13B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 12: model.type = e_model::MODEL_SMALL; break;
+ case 24: model.type = e_model::MODEL_MEDIUM; break;
+ case 36: model.type = e_model::MODEL_LARGE; break;
+ case 48: model.type = e_model::MODEL_XL; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
default: (void)0;
}
LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
// create tensors for the weights
{
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_vocab = hparams.n_vocab;
const auto tn = LLM_TN(model.arch);
switch (model.arch) {
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
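+ // sanity check: the per-layer tensor shapes below still use a single n_embd_gqa, which assumes equal key and value head sizes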
const int i_gpu_start = n_layer - n_gpu_layers;
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
case LLM_ARCH_MPT:
{
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-
// output
{
ggml_backend_type backend_norm;
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ // AWQ ScaleActivation layer
+ layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
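+ // the trailing 'false' marks ffn_act as optional: it is only present in AWQ-converted models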
}
} break;
case LLM_ARCH_STABLELM:
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
}
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
}
} break;
+ case LLM_ARCH_PLAMO:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ backend_norm = llama_backend_offload;
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ backend_norm = llama_backend_offload;
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
return inpL;
}
-// Persimmon: n_rot = n_embd_head/2
-// Other: n_rot = n_embd_head
+// Persimmon: n_rot = n_embd_head_k/2
+// Other: n_rot = n_embd_head_k
static void llm_build_k_shift(
struct ggml_context * ctx,
const llama_hparams & hparams,
float freq_base,
float freq_scale,
const llm_build_cb & cb) {
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const int64_t n_embd_head = hparams.n_embd_head();
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
- const float ext_factor = cparams.yarn_ext_factor;
- const float attn_factor = cparams.yarn_attn_factor;
- const float beta_fast = cparams.yarn_beta_fast;
- const float beta_slow = cparams.yarn_beta_slow;
-
- GGML_ASSERT(n_embd_head % n_rot == 0);
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
+ const float ext_factor = cparams.yarn_ext_factor;
+ const float attn_factor = cparams.yarn_attn_factor;
+ const float beta_fast = cparams.yarn_beta_fast;
+ const float beta_slow = cparams.yarn_beta_slow;
+
+ GGML_ASSERT(n_embd_head_k % n_rot == 0);
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
cb(K_shift, "K_shift", -1);
// we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k_l[il],
- n_embd_head, n_head_kv, n_ctx,
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+ n_embd_head_k, n_head_kv, n_ctx,
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
0),
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
int32_t kv_head,
const llm_build_cb & cb,
int64_t il) {
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
// compute the transposed [n_tokens, n_embd] V matrix
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
cb(v_cur_t, "v_cur_t", il);
- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
- (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
+ (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
cb(k_cache_view, "k_cache_view", il);
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
( n_ctx)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
struct ggml_tensor * gate_b,
struct ggml_tensor * down,
struct ggml_tensor * down_b,
+ struct ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
const llm_build_cb & cb,
{
cur = ggml_gelu(ctx, cur);
cb(cur, "ffn_gelu", il);
+ if (act_scales != NULL) {
+ cur = ggml_div(ctx, cur, act_scales);
+ cb(cur, "ffn_act", il);
+ }
} break;
case LLM_FFN_RELU:
{
float kq_scale,
const llm_build_cb & cb,
int il) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_head = hparams.n_head;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
cb(q, "q", il);
struct ggml_tensor * k =
ggml_view_3d(ctx, kv.k_l[il],
- n_embd_head, n_kv, n_head_kv,
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
+ n_embd_head_k, n_kv, n_head_kv,
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
0);
cb(k, "k", il);
// split cached v into n_head heads
struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il],
- n_kv, n_embd_head, n_head_kv,
+ n_kv, n_embd_head_v, n_head_kv,
ggml_element_size(kv.v_l[il])*n_ctx,
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
0);
cb(v, "v", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
+ struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
cur = ggml_mul_mat(ctx, wo, cur);
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
- const int64_t n_embd_head;
- const int64_t n_embd_gqa;
+ const int64_t n_embd_head_k;
+ const int64_t n_embd_k_gqa;
+ const int64_t n_embd_head_v;
+ const int64_t n_embd_v_gqa;
const int64_t n_expert;
const int64_t n_expert_used;
n_ctx (cparams.n_ctx),
n_head (hparams.n_head),
n_head_kv (hparams.n_head_kv),
- n_embd_head (hparams.n_embd_head()),
- n_embd_gqa (hparams.n_embd_gqa()),
+ n_embd_head_k (hparams.n_embd_head_k),
+ n_embd_k_gqa (hparams.n_embd_k_gqa()),
+ n_embd_head_v (hparams.n_embd_head_v),
+ n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert),
n_expert_used (hparams.n_expert_used),
freq_base (cparams.rope_freq_base),
struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
model.layers[il].ffn_up, NULL,
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
+ NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
} else {
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, NULL,
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
+ NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_falcon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, NULL,
NULL, NULL,
model.layers[il].ffn_down, NULL,
+ NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_starcoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
struct ggml_tensor * cur;
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_persimmon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
- const int64_t n_rot = n_embd_head / 2;
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
+ const int64_t n_rot = n_embd_head_k / 2;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, NULL,
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
+ NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_bloom() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_mpt() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
NULL,
LLM_NORM, cb, il);
cb(cur, "ffn_norm", il);
-
cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL,
NULL, NULL,
model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_act,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_stablelm() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, NULL,
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
+ NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
model.layers[il].ffn_up, NULL,
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
+ NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
struct ggml_tensor * cur;
struct ggml_tensor * attn_norm_output;
struct ggml_tensor * ffn_output;
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(ffn_output, "ffn_out", il);
}
return gf;
}
+
+ struct ggml_cgraph * build_plamo() {
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ struct ggml_tensor * attention_norm = cur;
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+ struct ggml_tensor * sa_out = cur;
+
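+ // PLaMo runs attention and the feed-forward network in parallel on the same normalized input; both outputs and the residual are summed below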
+ cur = attention_norm;
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_gpt2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * pos;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
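+ // GPT-2 adds learned absolute position embeddings to the token embeddings before the first block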
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+ cb(pos, "pos_embd", -1);
+
+ inpL = ggml_add(ctx0, inpL, pos);
+ cb(inpL, "inpL", -1);
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
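+ // split the fused QKV result per token: the first n_embd values are Q, followed by n_embd_gqa values each for K and V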
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ // add the input
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
};
//
{ "ffn_gate", OFFLOAD_FUNC },
{ "ffn_gate_b", OFFLOAD_FUNC },
{ "ffn_gate_par", OFFLOAD_FUNC },
+ { "ffn_act", OFFLOAD_FUNC },
{ "ffn_down", OFFLOAD_FUNC },
{ "ffn_down_b", OFFLOAD_FUNC },
{ "ffn_out", OFFLOAD_FUNC },
{
result = llm.build_phi2();
} break;
+ case LLM_ARCH_PLAMO:
+ {
+ result = llm.build_plamo();
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ result = llm.build_gpt2();
+ } break;
default:
GGML_ASSERT(false);
}
}
}
-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
const int64_t t_start_sample_us = ggml_time_us();
k = std::max(k, (int) min_keep);
}
}
-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
GGML_ASSERT(ctx);
auto N = float(llama_n_vocab(llama_get_model(ctx)));
return result;
}
-int llama_max_devices(void) {
+int32_t llama_max_devices(void) {
return LLAMA_MAX_DEVICES;
}
const ggml_type type_k = params.type_k;
const ggml_type type_v = params.type_v;
- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
+ GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+ GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
// reserve memory for context buffers
if (!hparams.vocab_only) {
ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
if (model->n_gpu_layers > 0) {
- ggml_cuda_set_scratch_size(alloc_size);
+ // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
+ ggml_cuda_set_scratch_size(alloc_size + 64);
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
// calculate total VRAM usage
return model->vocab.type;
}
-int llama_n_vocab(const struct llama_model * model) {
+int32_t llama_n_vocab(const struct llama_model * model) {
return model->vocab.id_to_token.size();
}
-int llama_n_ctx_train(const struct llama_model * model) {
+int32_t llama_n_ctx_train(const struct llama_model * model) {
return model->hparams.n_ctx_train;
}
-int llama_n_embd(const struct llama_model * model) {
+int32_t llama_n_embd(const struct llama_model * model) {
return model->hparams.n_embd;
}
return model->hparams.rope_freq_scale_train;
}
-int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
const auto & it = model->gguf_kv.find(key);
if (it == model->gguf_kv.end()) {
if (buf_size > 0) {
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
-int llama_model_meta_count(const struct llama_model * model) {
+int32_t llama_model_meta_count(const struct llama_model * model) {
return (int)model->gguf_kv.size();
}
-int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)model->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
return snprintf(buf, buf_size, "%s", it->first.c_str());
}
-int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)model->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
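+// with the expert count included, a mixture-of-experts model such as an 8-expert 7B is described as "llama 8x7B ..." instead of "llama 7B ..."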
-int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
- return snprintf(buf, buf_size, "%s %s %s",
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+ return snprintf(buf, buf_size, "%s %s%s %s",
llama_model_arch_name(model->arch).c_str(),
+ model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
llama_model_type_name(model->type),
llama_model_ftype_name(model->ftype).c_str());
}
return ggml_get_tensor(model->ctx, name);
}
-int llama_model_quantize(
+uint32_t llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params * params) {
}
}
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
try {
return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
} catch (const std::exception & err) {
}
}
-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
try {
return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
} catch (const std::exception & err) {
}
}
-int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
int result = 0;
for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
return result;
}
-int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
return ctx->kv_self.used;
}
const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams;
- const auto n_layer = hparams.n_layer;
- const auto n_embd = hparams.n_embd_gqa();
- const auto n_ctx = cparams.n_ctx;
+ const auto n_layer = hparams.n_layer;
+ const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const auto n_ctx = cparams.n_ctx;
const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
const uint32_t kv_head = kv_self.head;
std::vector<struct ggml_tensor *> vout2d(n_layer);
for (int il = 0; il < (int) n_layer; ++il) {
- kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+ vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd, kv_head,
- elt_size*n_embd, 0);
+ n_embd_k_gqa, kv_head,
+ elt_size*n_embd_k_gqa, 0);
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd,
+ kv_head, n_embd_v_gqa,
elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams;
- const int n_layer = hparams.n_layer;
- const int n_embd = hparams.n_embd_gqa();
- const int n_ctx = cparams.n_ctx;
+ const int n_layer = hparams.n_layer;
+ const int n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int n_ctx = cparams.n_ctx;
size_t kv_buf_size;
uint32_t kv_head;
std::vector<struct ggml_tensor *> vin2d(n_layer);
for (int il = 0; il < n_layer; ++il) {
- kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+ vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd, kv_head,
- elt_size*n_embd, 0);
+ n_embd_k_gqa, kv_head,
+ elt_size*n_embd_k_gqa, 0);
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd,
+ kv_head, n_embd_v_gqa,
elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
struct llama_context * ctx,
llama_token * tokens,
int32_t n_tokens,
- int n_past) {
+ int32_t n_past) {
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
struct llama_context * ctx,
float * embd,
int32_t n_tokens,
- int n_past) {
+ int32_t n_past) {
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
if (batch.logits) free(batch.logits);
}
-int llama_decode(
+int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
const int ret = llama_decode_internal(*ctx, batch);
return model->vocab.linefeed_id;
}
-int llama_add_bos_token(const struct llama_model * model) {
+int32_t llama_add_bos_token(const struct llama_model * model) {
return model->vocab.special_add_bos;
}
-int llama_add_eos_token(const struct llama_model * model) {
+int32_t llama_add_eos_token(const struct llama_model * model) {
return model->vocab.special_add_eos;
}
return model->vocab.special_eot_id;
}
-int llama_tokenize(
+int32_t llama_tokenize(
const struct llama_model * model,
const char * text,
- int text_len,
+ int32_t text_len,
llama_token * tokens,
- int n_max_tokens,
+ int32_t n_max_tokens,
bool add_bos,
bool special) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
}
// does not write null-terminator to buf
-int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
if (0 <= token && token < llama_n_vocab(model)) {
switch (llama_vocab_get_type(model->vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
std::string result = model->vocab.id_to_token[token].text;
llama_unescape_whitespace(result);
if (length < (int) result.length()) {
- return -result.length();
+ return -(int) result.length();
}
memcpy(buf, result.c_str(), result.length());
return result.length();
std::string result = model->vocab.id_to_token[token].text;
result = llama_decode_text(result);
if (length < (int) result.length()) {
- return -result.length();
+ return -(int) result.length();
}
memcpy(buf, result.c_str(), result.length());
return result.length();
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+ s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";