mpt : add optional bias tensors (#5638)
author     Dat Quoc Nguyen <redacted>
           Thu, 22 Feb 2024 08:15:13 +0000 (18:15 +1000)
committer  GitHub <redacted>
           Thu, 22 Feb 2024 08:15:13 +0000 (10:15 +0200)
Update MPT to support optional bias tensors, so that PhoGPT and SEA-LION models that were pre-trained with 'bias' also work.
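
The core mechanism is the optional-tensor flag on create_tensor: the trailing boolean marks a tensor as not required, so loading succeeds with a null pointer when a checkpoint (e.g. plain MPT) has no bias, and the graph builder applies the bias only when it was loaded. A minimal sketch of the pattern, using names from the diff (reading the final boolean as a "required" flag is an assumption based on its usage here):

    // Load the QKV bias as optional: with the trailing `false`, models
    // without this tensor load fine and bqkv stays nullptr.
    layer.bqkv = ml.create_tensor(ctx_layer,
            tn(LLM_TENSOR_ATTN_QKV, "bias", i),
            {n_embd + 2*n_embd_gqa}, /*required =*/ false);

    // At graph-build time, apply the bias only if it exists; helpers such
    // as llm_build_norm and llm_build_kv accept a null bias and skip it.
    if (model.layers[il].bqkv) {
        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
    }

This keeps a single MPT build path for both biased and bias-free checkpoints instead of introducing a separate architecture.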

llama.cpp

index 259f2a3a3ea00db212956d4e8582aa1ab27493a5..9cae8c761f3acec8c3875c9f6c235f061db984ca 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -4054,6 +4054,8 @@ static bool llm_load_tensors(
                     // output
                     {
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
+
                         model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
                     }
 
@@ -4063,14 +4065,23 @@ static bool llm_load_tensors(
 
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, false);
 
                         layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);
+
                         layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, false);
 
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, false);
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, false);
+
+                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, false);
 
                         // AWQ ScaleActivation layer
                         layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -6171,7 +6182,7 @@ struct llm_build_context {
 
             attn_norm = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
-                    NULL,
+                    model.layers[il].attn_norm_b,
                     LLM_NORM, cb, il);
             cb(attn_norm, "attn_norm", il);
 
@@ -6181,6 +6192,11 @@ struct llm_build_context {
 
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
+
+                if (model.layers[il].bqkv) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
 
                 if (hparams.f_clamp_kqv > 0.0f) {
                     cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -6198,7 +6214,7 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
@@ -6211,13 +6227,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                         NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -6234,7 +6250,7 @@ struct llm_build_context {
 
         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-                NULL,
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);