llama-bench : add model sizes (#2771)

author slaren <redacted>

Fri, 25 Aug 2023 13:16:19 +0000 (15:16 +0200)

committer GitHub <redacted>

Fri, 25 Aug 2023 13:16:19 +0000 (15:16 +0200)
author slaren <redacted>
Fri, 25 Aug 2023 13:16:19 +0000 (15:16 +0200)
committer GitHub <redacted>
Fri, 25 Aug 2023 13:16:19 +0000 (15:16 +0200)
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp

index 7a28115841fc3f0fb179d15486e9ace85b084391..d0fe6d90d3bbee09541b628758ff4f98f82f573d 100755 (executable)
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -441,6 +441,8 @@ struct test {
      static const std::string gpu_info;
      std::string model_filename;
      std::string model_type;
+    uint64_t model_size;
+    uint64_t model_n_params;
      int n_batch;
      int n_threads;
      bool f32_kv;
@@ -457,8 +459,10 @@ struct test {
      test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
          model_filename = inst.model;
          char buf[128];
-        llama_model_type(lmodel, buf, sizeof(buf));
+        llama_model_desc(lmodel, buf, sizeof(buf));
          model_type = buf;
+        model_size = llama_model_size(lmodel);
+        model_n_params = llama_model_n_params(lmodel);
          n_batch = inst.n_batch;
          n_threads = inst.n_threads;
          f32_kv = inst.f32_kv;
@@ -524,7 +528,7 @@ struct test {
              "build_commit", "build_number",
              "cuda", "opencl", "metal", "gpu_blas", "blas",
              "cpu_info", "gpu_info",
-            "model_filename", "model_type",
+            "model_filename", "model_type", "model_size", "model_n_params",
              "n_batch", "n_threads", "f16_kv",
              "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
              "n_prompt", "n_gen", "test_time",
@@ -538,6 +542,7 @@ struct test {
  
      static field_type get_field_type(const std::string & field) {
          if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+            field == "model_size" || field == "model_n_params" ||
              field == "n_gpu_layers" || field == "main_gpu" ||
              field == "n_prompt" || field == "n_gen" ||
              field == "avg_ns" || field == "stddev_ns") {
@@ -573,7 +578,7 @@ struct test {
              build_commit, std::to_string(build_number),
              std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
              cpu_info, gpu_info,
-            model_filename, model_type,
+            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
              std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
              std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
              std::to_string(n_prompt), std::to_string(n_gen), test_time,
@@ -709,8 +714,15 @@ struct markdown_printer : public printer {
              return -30;
          }
          if (field == "t/s") {
-            return 15;
+            return 16;
          }
+        if (field == "size" || field == "params") {
+            return 10;
+        }
+        if (field == "n_gpu_layers") {
+            return 3;
+        }
+
          int width = std::max((int)field.length(), 10);
  
          if (test::get_field_type(field) == test::STRING) {
@@ -719,9 +731,28 @@ struct markdown_printer : public printer {
          return width;
      }
  
+    static std::string get_field_display_name(const std::string & field) { // maps an internal field name to the column title printed in the markdown header row
+        if (field == "n_gpu_layers") {
+            return "ngl";
+        }
+        if (field == "n_threads") {
+            return "threads";
+        }
+        if (field == "mul_mat_q") {
+            return "mmq";
+        }
+        if (field == "tensor_split") {
+            return "ts";
+        }
+        return field; // every other field is displayed under its own name
+    }
+
      void print_header(const cmd_params & params) override {
          // select fields to print
-        fields = { "model", "backend" };
+        fields.push_back("model");
+        fields.push_back("size");
+        fields.push_back("params");
+        fields.push_back("backend");
          bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
          if (!is_cpu_backend) {
              fields.push_back("n_gpu_layers");
@@ -752,7 +783,7 @@ struct markdown_printer : public printer {
  
          fprintf(fout, "|");
          for (const auto & field : fields) {
-            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
+            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
          }
          fprintf(fout, "\n");
          fprintf(fout, "|");
@@ -769,12 +800,26 @@ struct markdown_printer : public printer {
          fprintf(fout, "|");
          for (const auto & field : fields) {
              std::string value;
+            char buf[128];
              if (field == "model") {
                  value = t.model_type;
+            } else if (field == "size") {
+                if (t.model_size < 1024*1024*1024) {
+                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
+                }
+                value = buf;
+            } else if (field == "params") {
+                if (t.model_n_params < 1000*1000*1000) {
+                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
+                }
+                value = buf;
              } else if (field == "backend") {
                  value = test::get_backend();
              } else if (field == "test") {
-                char buf[128];
                  if (t.n_prompt > 0 && t.n_gen == 0) {
                      snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                  } else if (t.n_gen > 0 && t.n_prompt == 0) {
@@ -785,7 +830,6 @@ struct markdown_printer : public printer {
                  }
                  value = buf;
              } else if (field == "t/s") {
-                char buf[128];
                  snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                  value = buf;
              } else if (vmap.find(field) != vmap.end()) {
diff --git a/llama.cpp b/llama.cpp

index d12b6d1cb0713f5a8836f6d77373d50532569e07..4529ac82288549f2cebab140e810c11bc27f48b6 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -5297,13 +5297,29 @@ int llama_model_n_embd(const struct llama_model * model) {
      return model->hparams.n_embd;
  }
  
-int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { // writes "<name> <type name> <ftype name>" into buf (at most buf_size bytes) and returns snprintf's result
      return snprintf(buf, buf_size, "%s %s %s",
              model->name.c_str(),
              llama_model_type_name(model->type),
              llama_model_ftype_name(model->ftype).c_str());
  }
  
+uint64_t llama_model_size(const struct llama_model * model) { // total of ggml_nbytes() over every entry in model->tensors_by_name
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second); // it.second is the tensor handle passed to ggml
+    }
+    return size; // 0 when tensors_by_name is empty
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) { // total of ggml_nelements() over every entry in model->tensors_by_name
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second); // element count of one tensor, accumulated as uint64_t
+    }
+    return nparams; // 0 when tensors_by_name is empty
+}
+
  int llama_model_quantize(
          const char * fname_inp,
          const char * fname_out,
diff --git a/llama.h b/llama.h

index 2bcf94e0f3fd20478f83515a3445acad348714ec..d474681725ff8389d31f7127e603cb3f16c23f0b 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -254,7 +254,11 @@ extern "C" {
      LLAMA_API int llama_model_n_embd (const struct llama_model * model);
  
      // Get a string describing the model type
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
  
      // Returns 0 on success
      LLAMA_API int llama_model_quantize(
author	slaren <redacted>
	Fri, 25 Aug 2023 13:16:19 +0000 (15:16 +0200)
committer	GitHub <redacted>
	Fri, 25 Aug 2023 13:16:19 +0000 (15:16 +0200)
examples/llama-bench/llama-bench.cpp		patch \| blob \| history
llama.cpp		patch \| blob \| history
llama.h		patch \| blob \| history