std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
- std::vector<int64_t> hist_all(1 << 4, 0);
-
while (true) {
int32_t n_dims;
int32_t length;
work.resize(nelements); // for quantization
size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
- cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
+ cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
- hist_all[i] += hist_cur[i];
- }
-
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
- printf("%5.3f ", hist_cur[i] / (float)nelements);
- }
- printf("\n");
+ printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
- {
- int64_t sum_all = 0;
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
-
- printf("%s: hist: ", __func__);
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
-
return true;
}
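
The quantization hunk above reflects two coupled changes: the per-block histograms (hist_cur, hist_all) and their printouts are dropped, and ggml_quantize_chunk no longer takes a histogram array; its trailing pointer is now an optional importance matrix, passed as nullptr when unused. A minimal sketch of the new call pattern, using a hypothetical quantize_rows helper and assuming ggml_row_size is available to size the output buffer:

#include "ggml.h"

#include <cstdint>
#include <vector>

// hypothetical helper: quantize nrows rows of n_per_row floats into qtype.
// note the new ggml_quantize_chunk signature: no histogram argument; the last
// parameter is an optional importance matrix (nullptr = plain quantization)
static std::vector<uint8_t> quantize_rows(const std::vector<float> & src,
                                          int64_t nrows, int64_t n_per_row,
                                          ggml_type qtype) {
    std::vector<uint8_t> dst(ggml_row_size(qtype, n_per_row) * nrows);
    const size_t cur_size = ggml_quantize_chunk(qtype, src.data(), dst.data(),
                                                /*start     =*/ 0,
                                                /*nrows     =*/ nrows,
                                                /*n_per_row =*/ n_per_row,
                                                /*imatrix   =*/ nullptr);
    dst.resize(cur_size);
    return dst;
}
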
// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.kv_cache.buffer);
+ ggml_tallocr alloc = ggml_tallocr_new(model.kv_cache.buffer);
// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.kv_cache.k);
- ggml_tallocr_alloc(alloc, model.kv_cache.v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.k);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.v);
}
}
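
The allocator hunks here and below all follow one pattern: ggml_tallocr is now a plain struct returned by value from ggml_tallocr_new, ggml_tallocr_alloc takes its address, and ggml_tallocr_free is gone because nothing is heap-allocated. A minimal sketch of the new usage, with a hypothetical helper and assuming tensors created in a no_alloc ggml_context plus a backend buffer to place them in:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// hypothetical helper: place two tensors from a no_alloc context into a backend buffer
static void alloc_kv_tensors(ggml_backend_buffer_t buffer,
                             struct ggml_tensor * k, struct ggml_tensor * v) {
    // the allocator is a value on the stack now; no ggml_tallocr_free at the end
    struct ggml_tallocr alloc = ggml_tallocr_new(buffer);
    ggml_tallocr_alloc(&alloc, k);
    ggml_tallocr_alloc(&alloc, v);
}

The same rewrite applies verbatim to the weight, input and test-model allocations further down.
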
// load weights
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_w);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_w);
size_t total_size = 0;
return false;
}
- ggml_tallocr_alloc(alloc, tensor);
+ ggml_tallocr_alloc(&alloc, tensor);
if (ggml_backend_is_cpu (model.backend)
#ifdef GGML_USE_METAL
total_size += ggml_nbytes(tensor);
}
- ggml_tallocr_free(alloc);
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}
}
// allocate buffers
- std::map<ggml_backend_t, std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>> backend_buffers;
+ std::map<ggml_backend_t, ggml_tallocr> backend_buffers;
for (auto backend : model.backends) {
// compute the size of the buffer
size_t size = 0;
model.buffers_w.push_back(buffer);
// create an allocator for the buffer to allocate the tensors
- auto alloc = std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>(ggml_tallocr_new(buffer), ggml_tallocr_free);
+ auto alloc = ggml_tallocr_new(buffer);
backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
} else {
model.buffers_w.push_back(NULL);
// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_kv);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_kv);
// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.memory_k);
- ggml_tallocr_alloc(alloc, model.memory_v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.memory_k);
+ ggml_tallocr_alloc(&alloc, model.memory_v);
}
}
// allocate the tensor
ggml_backend_t backend = tensor_backends[name];
- ggml_tallocr * alloc = backend_buffers.find(backend)->second.get();
+ ggml_tallocr * alloc = &backend_buffers.find(backend)->second;
ggml_tallocr_alloc(alloc, tensor);
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
- ggml_tallocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
+ ggml_tallocr * alloc_head = &backend_buffers.find(tensor_backends["model/lm_head"])->second;
+ ggml_tallocr_alloc(alloc_head, model.lm_head);
//printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
ggml_backend_tensor_copy(tensor, model.lm_head);
total_size += ggml_nbytes(model.lm_head);
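
Since the allocator is now copyable, the backend_buffers map stores it by value instead of through a unique_ptr with a ggml_tallocr_free deleter, and call sites take the address of the mapped element whenever a ggml_tallocr * is needed. A small sketch of that pattern, with a hypothetical helper and assuming the backend, buffer and tensor come from the surrounding loader code:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include <map>

// hypothetical helper: one allocator per backend, stored by value in the map
static void alloc_on_backend(std::map<ggml_backend_t, ggml_tallocr> & backend_buffers,
                             ggml_backend_t backend, ggml_backend_buffer_t buffer,
                             struct ggml_tensor * tensor) {
    if (backend_buffers.find(backend) == backend_buffers.end()) {
        // store the allocator directly; no unique_ptr/custom deleter needed anymore
        backend_buffers.insert(std::make_pair(backend, ggml_tallocr_new(buffer)));
    }
    // take the address of the stored allocator where a ggml_tallocr * is expected
    ggml_tallocr * alloc = &backend_buffers.find(backend)->second;
    ggml_tallocr_alloc(alloc, tensor);
}
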
printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);
// allocate the tensors into the backend buffer
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_input);
- ggml_tallocr_alloc(alloc, model.embd);
- ggml_tallocr_alloc(alloc, model.position);
- ggml_tallocr_free(alloc);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_input);
+ ggml_tallocr_alloc(&alloc, model.embd);
+ ggml_tallocr_alloc(&alloc, model.position);
}
return true;
struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);
// run the computation
+ ggml_backend_sched_reset(sched);
ggml_backend_sched_graph_compute(sched, gf);
//if (n_past%100 == 0) {
ggml_backend_sched_t sched;
{
// initialize the scheduler
- sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
+ sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES, false);
// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
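
The scheduler hunks combine two changes: ggml_backend_sched_new takes an extra boolean that selects parallel evaluation across backends (false here keeps the previous sequential behavior), and ggml_backend_sched_reset is called before computing a graph to clear the scheduler's state from the previous run. A minimal sketch under those assumptions, with hypothetical make_sched/compute helpers standing in for the example's setup and inference code:

#include "ggml.h"
#include "ggml-backend.h"

#include <vector>

// hypothetical setup: the last argument is the new parallel flag (false = sequential)
static ggml_backend_sched_t make_sched(std::vector<ggml_backend_t> & backends, size_t graph_size) {
    return ggml_backend_sched_new(backends.data(), NULL, (int) backends.size(), graph_size, false);
}

// hypothetical compute step: reset the scheduler before computing the next graph
static void compute(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_graph_compute(sched, gf);
}
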
GGML_ASSERT(ggml_backend_buffer_get_alloc_size(buffer, tensor) >= n * sizeof(float));
- ggml_tallocr_t allocr = ggml_tallocr_new(buffer);
- ggml_tallocr_alloc(allocr, tensor);
+ struct ggml_tallocr allocr = ggml_tallocr_new(buffer);
+ ggml_tallocr_alloc(&allocr, tensor);
GGML_ASSERT(tensor->data != NULL);
GGML_ASSERT(memcmp(data, data2, sizeof(data)) == 0);
- ggml_tallocr_free(allocr);
ggml_backend_buffer_free(buffer);
ggml_free(ctx);
}
model.b = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, IL, IC, N);
// create an allocator
- ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
// alloc memory
- ggml_tallocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(&alloc, model.a);
// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
}
// alloc memory
- ggml_tallocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(&alloc, model.b);
if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
} else {
ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
}
-
- ggml_tallocr_free(alloc);
}
struct ggml_cgraph * build_graph(const test_model& model) {
model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
// create an allocator
- ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+ struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
// alloc memory
- ggml_tallocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(&alloc, model.a);
// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
}
// alloc memory
- ggml_tallocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(&alloc, model.b);
if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
} else {
ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
}
-
- ggml_tallocr_free(alloc);
}
struct ggml_cgraph * build_graph(const test_model& model) {
printf("Matrix B: [%i, %i]\n", K, N);
// create an allocator
- ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+ struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
// alloc memory
- ggml_tallocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(&alloc, model.a);
// load data to buffer
if(ggml_backend_is_cpu(model.backend)
}
// alloc memory
- ggml_tallocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(&alloc, model.b);
if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
} else {
ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); // cuda requires copy the data directly to device
}
-
- ggml_tallocr_free(alloc);
}
struct ggml_cgraph * build_graph(const test_model& model) {