imatrix : offload to GPU support (#4957)

author Georgi Gerganov <redacted>

Wed, 17 Jan 2024 16:46:30 +0000 (18:46 +0200)

committer GitHub <redacted>

Wed, 17 Jan 2024 16:46:30 +0000 (18:46 +0200)
author Georgi Gerganov <redacted>
Wed, 17 Jan 2024 16:46:30 +0000 (18:46 +0200)
committer GitHub <redacted>
Wed, 17 Jan 2024 16:46:30 +0000 (18:46 +0200)
diff --git a/ci/run.sh b/ci/run.sh

index 86293f0dbdfd6a48632f51efb71db62dbe43f74f..f3a8ff774afbcb24b1c38526759abc44ecb17d22 100755 (executable)
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -216,6 +216,8 @@ function gg_run_open_llama_3b_v2 {
      (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
      (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
  
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
      (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
  
      function check_ppl {
@@ -243,6 +245,8 @@ function gg_run_open_llama_3b_v2 {
      check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
      check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
  
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
      # lora
      function compare_ppl {
          qnt="$1"
@@ -284,7 +288,6 @@ function gg_run_open_llama_3b_v2 {
      (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
      compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
  
-
      set +e
  }
  
@@ -294,6 +297,7 @@ function gg_sum_open_llama_3b_v2 {
      gg_printf 'OpenLLaMA 3B-v2:\n'
      gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
      gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
      gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
      gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
      gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
@@ -393,6 +397,8 @@ function gg_run_open_llama_7b_v2 {
      (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
      (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
  
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
      (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
  
      function check_ppl {
@@ -420,6 +426,8 @@ function gg_run_open_llama_7b_v2 {
      check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
      check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
  
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
      # lora
      function compare_ppl {
          qnt="$1"
@@ -471,6 +479,7 @@ function gg_sum_open_llama_7b_v2 {
      gg_printf 'OpenLLaMA 7B-v2:\n'
      gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
      gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
      gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
      gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
      gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp

index 1461bc96376a7c3436086366d027da13392c094a..af78711c5ab66be6274eb022baf5a470d390f757 100644 (file)
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -33,43 +33,120 @@ class IMatrixCollector {
  public:
      IMatrixCollector() = default;
      void set_parameters(StatParams&& params) { m_params = std::move(params); }
-    void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
      void save_imatrix() const;
  private:
      std::unordered_map<std::string, Stats> m_stats;
      StatParams                             m_params;
      std::mutex                             m_mutex;
      int                                    m_last_call = 0;
+    std::vector<float>                     m_src1_data;
+    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
  };
  
-void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
-    if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto& e = m_stats[src0->name];
-    if (e.values.empty()) {
-        e.values.resize(src1->ne[0], 0);
-    }
-    else if (e.values.size() != (size_t)src1->ne[0]) {
-        fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); //GGML_ASSERT(false);
+bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
+    GGML_UNUSED(user_data);
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
+    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
+    if (ask) {
+        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
+        if (t->op != GGML_OP_MUL_MAT) return false;
+        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
+        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        return true;
      }
-    ++e.ncall;
-    if (m_params.verbosity > 1) {
-        printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
+
+    if (!is_host) {
+        m_src1_data.resize(ggml_nelements(src1));
+        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
      }
-    for (int row = 0; row < (int)src1->ne[1]; ++row) {
-        const float * x = (const float *)src1->data + row * src1->ne[0];
-        for (int j = 0; j < (int)src1->ne[0]; ++j) {
-            e.values[j] += x[j]*x[j];
+
+    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
+
+    if (t->op == GGML_OP_MUL_MAT_ID) {
+        const int idx  = ((int32_t *) t->op_params)[0];
+        const int n_as = ((int32_t *) t->op_params)[1];
+
+        // the top-k selected expert ids are stored in the src0 tensor
+        // for simplicity, always copy src0 to host, because it is small
+        // take into account that src0 is not contiguous!
+        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
+        GGML_ASSERT(n_as*ggml_nrows(src0));
+        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
+        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
+
+        // loop over all possible experts, regardless if they are used or not in the batch
+        // this is necessary to guarantee equal number of "ncall" for each tensor
+        for (int ex = 0; ex < n_as; ++ex) {
+            src0 = t->src[2 + ex];
+            auto& e = m_stats[src0->name];
+            if (e.values.empty()) {
+                e.values.resize(src1->ne[0], 0);
+            }
+            else if (e.values.size() != (size_t)src1->ne[0]) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                exit(1); //GGML_ASSERT(false);
+            }
+            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+            //       using the following line, we can correct for that if needed
+            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+            ++e.ncall;
+            if (m_params.verbosity > 1) {
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            }
+            for (int row = 0; row < (int)src1->ne[1]; ++row) {
+                const int excur = m_ids[row*n_as + idx];
+                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+                if (excur != ex) continue;
+                const float * x = data + row * src1->ne[0];
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    e.values[j] += x[j]*x[j];
+                }
+            }
+            if (e.ncall > m_last_call) {
+                m_last_call = e.ncall;
+                if (m_last_call % m_params.n_output_frequency == 0) {
+                    save_imatrix();
+                }
+            }
          }
-    }
-    if (e.ncall > m_last_call) {
-        m_last_call = e.ncall;
-        if (m_last_call % m_params.n_output_frequency == 0) {
-            save_imatrix();
+    } else {
+        auto& e = m_stats[src0->name];
+        if (e.values.empty()) {
+            e.values.resize(src1->ne[0], 0);
+        }
+        else if (e.values.size() != (size_t)src1->ne[0]) {
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            exit(1); //GGML_ASSERT(false);
+        }
+        ++e.ncall;
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        }
+        for (int row = 0; row < (int)src1->ne[1]; ++row) {
+            const float * x = data + row * src1->ne[0];
+            for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                e.values[j] += x[j]*x[j];
+            }
+        }
+        if (e.ncall > m_last_call) {
+            m_last_call = e.ncall;
+            if (m_last_call % m_params.n_output_frequency == 0) {
+                save_imatrix();
+            }
          }
      }
+
+    return true;
  }
  
  void IMatrixCollector::save_imatrix() const {
@@ -93,8 +170,8 @@ void IMatrixCollector::save_imatrix() const {
  
  static IMatrixCollector g_collector;
  
-static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    g_collector.collect_imatrix(src0, src1);
+static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { // free-function wrapper; main() installs it as cparams.cb_eval
+    return g_collector.collect_imatrix(t, ask, user_data); // forward every argument to the file-level g_collector and return its answer
  }
  
  
@@ -320,8 +397,6 @@ int main(int argc, char ** argv) {
  
      g_collector.set_parameters(std::move(sparams));
  
-    ggml_set_imatrix_collection(ik_collect_imatrix);
-
      params.logits_all = true;
      params.n_batch = std::min(params.n_batch, params.n_ctx);
  
@@ -340,16 +415,27 @@ int main(int argc, char ** argv) {
  
      llama_backend_init(params.numa);
  
-    llama_model * model;
-    llama_context * ctx;
+    llama_model_params mparams = llama_model_params_from_gpt_params(params);
  
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
      if (model == NULL) {
          fprintf(stderr, "%s: error: unable to load model\n", __func__);
          return 1;
      }
  
+    llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    cparams.cb_eval = ik_collect_imatrix;
+    cparams.cb_eval_user_data = NULL;
+
+    llama_context * ctx = llama_new_context_with_model(model, cparams);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+        return 1;
+    }
+
      const int n_ctx_train = llama_n_ctx_train(model);
      if (params.n_ctx > n_ctx_train) {
          fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
diff --git a/ggml.c b/ggml.c

index d7e01b81f01792b13e251b44e2fe67042f51734d..35fd29a9ec2dc899525d8d0f1d20d5480b17f3f0 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
  static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
  static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
  
-ggml_collect_imatrix_t g_imatrix_collect = NULL;
-
-void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
-    g_imatrix_collect = imatrix_collect;
-}
-
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
      [GGML_TYPE_I8] = {
          .type_name                = "i8",
@@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat(
      const int ith = params->ith;
      const int nth = params->nth;
  
-    if (ith == 1 && g_imatrix_collect) {
-        g_imatrix_collect(src0, src1);
-    }
-
      const enum ggml_type type = src0->type;
  
      const bool src1_cont = ggml_is_contiguous(src1);
@@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id(
  
          const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
  
-        if (ith == 1 && g_imatrix_collect) {
-            g_imatrix_collect(src0_cur, src1);
-        }
-
          const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
          const size_t row_size = ggml_row_size(vec_dot_type, ne10);
  
diff --git a/ggml.h b/ggml.h

index 837c52e68c90cefc813e66f1a9817433ca3ce990..27daf6fd1e12b6ccda8513a62e1e6a420e18930f 100644 (file)
--- a/ggml.h
+++ b/ggml.h
@@ -2085,12 +2085,6 @@ extern "C" {
      GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
      GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
  
-    //
-    // Importance matrix
-    //
-    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
-    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
-
      //
      // gguf
      //
author	Georgi Gerganov <redacted>
	Wed, 17 Jan 2024 16:46:30 +0000 (18:46 +0200)
committer	GitHub <redacted>
	Wed, 17 Jan 2024 16:46:30 +0000 (18:46 +0200)
ci/run.sh		patch \| blob \| history
examples/imatrix/imatrix.cpp		patch \| blob \| history
ggml.c		patch \| blob \| history
ggml.h		patch \| blob \| history