-#include "ggml.h"
-#include "llama.h"
#include "build-info.h"
+#include "llama.h"
+
#include <cstdio>
#include <map>
#include <string>
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
//
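// example (illustrative file names; "q4_0" resolves to LLAMA_FTYPE_MOSTLY_Q4_0
// through LLAMA_FTYPE_MAP):
//
//  ./quantize models/llama/ggml-model-f16.bin models/llama/ggml-model-q4_0.bin q4_0 8
//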
int main(int argc, char ** argv) {
- ggml_time_init();
-
if (argc < 3) {
fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
    for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
        fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
    }
    return 1;
}
- // needed to initialize f16 tables
- {
- struct ggml_init_params params = { 0, NULL, false };
- struct ggml_context * ctx = ggml_init(params);
- ggml_free(ctx);
- }
+ llama_init_backend();
// parse command line arguments
const std::string fname_inp = argv[1];
}
fprintf(stderr, "\n");
- const int64_t t_main_start_us = ggml_time_us();
+ const int64_t t_main_start_us = llama_time_us();
int64_t t_quantize_us = 0;
// load the model
{
- const int64_t t_start_us = ggml_time_us();
+ const int64_t t_start_us = llama_time_us();
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
- t_quantize_us = ggml_time_us() - t_start_us;
+ t_quantize_us = llama_time_us() - t_start_us;
}
// report timing
{
- const int64_t t_main_end_us = ggml_time_us();
+ const int64_t t_main_end_us = llama_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
typedef int llama_token;
typedef struct llama_token_data {
- llama_token id; // token id
- float logit; // log-odds of the token
- float p; // probability of the token
+ llama_token id; // token id
+ float logit; // log-odds of the token
+ float p; // probability of the token
} llama_token_data;
typedef struct llama_token_data_array {
    llama_token_data * data;
    size_t size;
    bool sorted;
} llama_token_data_array;
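// Illustrative sketch (not part of this change): building a llama_token_data_array
// from raw logits so it can be handed to the sampling functions. Here n_vocab and
// logits are assumed to come from llama_n_vocab()/llama_get_logits() on an
// already-evaluated context.
//
//     std::vector<llama_token_data> candidates;
//     candidates.reserve(n_vocab);
//     for (llama_token id = 0; id < n_vocab; id++) {
//         candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
//     }
//     llama_token_data_array candidates_a = { candidates.data(), candidates.size(), false };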
// model file types
enum llama_ftype {
- LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_ALL_F32 = 0,
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
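// Illustrative sketch (not part of this change): the quantize example resolves
// the `type` argument by name via a map along these lines (an assumption about
// quantize.cpp, not something defined in this header):
//
//     static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
//         {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
//         {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
//         {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
//         {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
//         {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
//     };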
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
+ // TODO: not great API - very likely to change
+ // Initialize the llama + ggml backend
+ // Call once at the start of the program
+ LLAMA_API void llama_init_backend();
+
+ LLAMA_API int64_t llama_time_us();
+
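// Illustrative sketch (not part of this change): the intended call pattern for
// the two declarations above, mirroring the quantize example. Only
// llama_init_backend() and llama_time_us() are from this header; the rest is a
// hypothetical caller.
//
//     int main() {
//         llama_init_backend();                  // once, at program start
//         const int64_t t0 = llama_time_us();
//         // ... load, quantize or evaluate a model ...
//         fprintf(stderr, "elapsed: %.2f ms\n", (llama_time_us() - t0) / 1000.0);
//         return 0;
//     }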
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure