llama : make model stateless and context stateful (llama_state) (#1797)

author Didzis Gosko <redacted>

Sat, 24 Jun 2023 08:47:58 +0000 (11:47 +0300)

committer GitHub <redacted>

Sat, 24 Jun 2023 08:47:58 +0000 (11:47 +0300)
author Didzis Gosko <redacted>
Sat, 24 Jun 2023 08:47:58 +0000 (11:47 +0300)
committer GitHub <redacted>
Sat, 24 Jun 2023 08:47:58 +0000 (11:47 +0300)
diff --git a/examples/common.cpp b/examples/common.cpp

index fed24e027d8a814146defdb07f02837e172c3664..6ac484555917231d53918c3da6dd1195799a0ea5 100644 (file)
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
      return res;
  }
  
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
      auto lparams = llama_context_default_params();
  
      lparams.n_ctx        = params.n_ctx;
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
      lparams.logits_all   = params.perplexity;
      lparams.embedding    = params.embedding;
  
-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return std::make_tuple(nullptr, nullptr);
+    }
  
+    llama_context * lctx = llama_new_context_with_model(model, lparams);
      if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
      }
  
      if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
+        int err = llama_model_apply_lora_from_file(model,
                                               params.lora_adapter.c_str(),
                                               params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                               params.n_threads);
          if (err != 0) {
              fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
          }
      }
  
-    return lctx;
+    return std::make_tuple(model, lctx);
  }
  
  void console_init(console_state & con_st) {
diff --git a/examples/common.h b/examples/common.h

index 6c2953cb2a7c672c5eb00eea003c1ba0360bfaaf..713320179e2bea5c21edf80ee9fe727ba8c48d0e 100644 (file)
--- a/examples/common.h
+++ b/examples/common.h
@@ -9,6 +9,7 @@
  #include <random>
  #include <thread>
  #include <unordered_map>
+#include <tuple>
  
  #if !defined (_WIN32)
  #include <stdio.h>
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
  // Model utils
  //
  
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
  
  //
  // Console utils
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp

index 860f99f672c9cab71f4e91ac28caf1c5bdafab7d..369eac1d1c391c22831d5fd2f999254e5956be57 100644 (file)
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
  
      llama_init_backend();
  
+    llama_model * model;
      llama_context * ctx;
  
      // load the model
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
          fprintf(stderr, "%s: error: unable to load model\n", __func__);
          return 1;
      }
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
  
      llama_print_timings(ctx);
      llama_free(ctx);
+    llama_free_model(model);
  
      return 0;
  }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index 941312f9cc756c8d90d8b055950d2064ae7cc67e..c1e6bf126804e4391c965daa79105f2fc7d7f414 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
  
      llama_init_backend();
  
+    llama_model * model;
      llama_context * ctx;
      g_ctx = &ctx;
  
      // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
          fprintf(stderr, "%s: error: unable to load model\n", __func__);
          return 1;
      }
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
  
          llama_print_timings(ctx);
          llama_free(ctx);
+        llama_free_model(model);
  
          return 0;
      }
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
      if (params.export_cgraph) {
          llama_eval_export(ctx, "llama.ggml");
          llama_free(ctx);
+        llama_free_model(model);
  
          return 0;
      }
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
  
      llama_print_timings(ctx);
      llama_free(ctx);
+    llama_free_model(model);
  
      return 0;
  }
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp

index ae8cfe0afc0b7099e97324a08d8b19bc8c1df0db..b59f5971e3dd278be831c3955228d8a5e087e82d 100644 (file)
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
  
      llama_init_backend();
  
+    llama_model * model;
      llama_context * ctx;
  
      // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
          fprintf(stderr, "%s: error: unable to load model\n", __func__);
          return 1;
      }
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
  
      llama_print_timings(ctx);
      llama_free(ctx);
+    llama_free_model(model);
  
      return 0;
  }
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp

index 6b8018ee2843225c87efce08e1e38d6ed56a8e37..9cea472dedb82ca95488227acdd0181f50cb3713 100644 (file)
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
      fprintf(stderr, "Loading model\n");
  
      const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
      llama_context * ctx;
  
      {
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
          lparams.f16_kv     = false;
          lparams.use_mlock  = false;
  
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
  
-        if (ctx == NULL) {
+        if (model == NULL) {
              fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
              return 1;
          }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
      }
  
      const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
              fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                  "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
              llama_free(ctx);
+            llama_free_model(model);
              return 1;
          }
          included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
  
  
      llama_free(ctx);
+    llama_free_model(model);
      // report timing
      {
          const int64_t t_main_end_us = ggml_time_us();
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp

index da4d37ad03de757fb1970d4447687d491ad3e79c..4c868850317fe413a84f5c7216f56d7e952787a0 100644 (file)
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
      auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
  
      // init
-    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == nullptr) {
+        return 1;
+    }
+    auto ctx = llama_new_context_with_model(model, lparams);
+    if (ctx == nullptr) {
+        llama_free_model(model);
+        return 1;
+    }
      auto tokens = std::vector<llama_token>(params.n_ctx);
      auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
  
      if (n_prompt_tokens < 1) {
          fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        llama_free(ctx);
+        llama_free_model(model);
          return 1;
      }
  
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
          printf("%s", next_token_str);
          if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
              fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx);
+            llama_free_model(model);
              return 1;
          }
          n_past += 1;
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
  
      printf("\n\n");
  
-    // free old model
+    // free old context
      llama_free(ctx);
  
-    // load new model
-    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+    // make new context
+    auto ctx2 = llama_new_context_with_model(model, lparams);
  
      // Load state (rng, logits, embedding and kv_cache) from file
      {
          FILE *fp_read = fopen("dump_state.bin", "rb");
          if (state_size != llama_get_state_size(ctx2)) {
              fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
              return 1;
          }
  
          const size_t ret = fread(state_mem, 1, state_size, fp_read);
          if (ret != state_size) {
              fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
              return 1;
          }
  
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
          printf("%s", next_token_str);
          if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
              fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
              return 1;
          }
          n_past += 1;
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
  
      printf("\n\n");
  
+    llama_free(ctx2);
+    llama_free_model(model);
+
      return 0;
  }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index c0984aadb92ba733dcc47170bba255c7238e86c3..de22d301342d6bd4dcb2fceec93e645809fc0c19 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -115,6 +115,7 @@ struct llama_server_context {
      std::vector<llama_token> embd;
      std::vector<llama_token> last_n_tokens;
  
+    llama_model * model = nullptr;
      llama_context * ctx = nullptr;
      gpt_params params;
  
@@ -130,6 +131,10 @@ struct llama_server_context {
              llama_free(ctx);
              ctx = nullptr;
          }
+        if (model) {
+            llama_free_model(model);
+            model = nullptr;
+        }
      }
  
      void rewind() {
@@ -150,8 +155,8 @@ struct llama_server_context {
  
      bool loadModel(const gpt_params & params_) {
          params = params_;
-        ctx = llama_init_from_gpt_params(params);
-        if (ctx == nullptr) {
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        if (model == nullptr) {
              LOG_ERROR("unable to load model", { { "model", params_.model } });
              return false;
          }
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp

index 76f991cdc028f3b3ffa3784bc09203f06831caa0..fc45c93406bc422d3987befeb57a17707a594ef8 100644 (file)
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
  
      llama_init_backend();
  
-    llama_context * ctx ;
+    llama_model * model;
+    llama_context * ctx;
  
-    ctx = llama_init_from_gpt_params( params );
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
  
-    if ( ctx == NULL )
+    if ( model == NULL )
      {
          fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
          return 1;
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
      } // wend of main loop
  
      llama_free( ctx );
+    llama_free_model( model );
  
      return 0;
  }
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp

index 7ec85951adc5788295697a111aef192109eb91a5..61c829e5c0f8a3ec3c9ceb61dfafa07b04f7fcfd 100644 (file)
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
      struct llama_context_params llama_params = llama_context_default_params();
      llama_params.vocab_only = true;
  
-    struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
+    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
  
      struct llama_vocab vocab;
      {
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
      delete[] compute_addr;
      delete[] compute_buf_0;
      delete[] compute_buf_1;
+    llama_free(lctx);
+    llama_free_model(lmodel);
      ggml_free(model.ctx);
  
      return 0;
diff --git a/llama.cpp b/llama.cpp

index e597f5048234b40de2e7cea8ad5768522795a7d1..a528eef4a902036552a1ac247cd9b2b1f9011f2a 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -182,6 +182,19 @@ struct llama_kv_cache {
      }
  };
  
+struct llama_vocab { // vocabulary tables mapping between token text and token ids
+    using id    = int32_t; // numeric token identifier
+    using token = std::string; // token text
+
+    struct token_score { // one vocabulary entry: token text plus its score
+        token tok;
+        float score; // NOTE(review): presumably the tokenizer's score for this token - confirm against the loader
+    };
+
+    std::unordered_map<token, id> token_to_id; // lookup from token text to id
+    std::vector<token_score> id_to_token; // token text + score per entry; presumably indexed by id - confirm
+};
+
  struct llama_model {
      e_model type = MODEL_UNKNOWN;
  
@@ -198,10 +211,6 @@ struct llama_model {
      // context
      struct ggml_context * ctx = NULL;
  
-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
      // the model memory buffer
      llama_ctx_buffer buf;
  
@@ -215,6 +224,11 @@ struct llama_model {
      // for quantize-stats only
      std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
  
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
      ~llama_model() {
          if (ctx) {
              ggml_free(ctx);
@@ -233,24 +247,11 @@ struct llama_model {
      }
  };
  
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
  struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
      std::mt19937 rng;
  
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
      bool has_evaluated_once = false;
  
      int64_t t_sample_us = 0;
@@ -261,8 +262,16 @@ struct llama_context {
      int32_t n_eval   = 0; // number of eval calls
      int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
  
-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
  
      size_t mem_per_token = 0;
  
@@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) {
  
  static void llama_model_load_internal(
          const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
          int n_ctx,
          int n_batch,
          int n_gpu_layers,
@@ -1047,12 +1057,11 @@ static void llama_model_load_internal(
          llama_progress_callback progress_callback,
          void * progress_callback_user_data) {
  
-    lctx.t_start_us = ggml_time_us();
+    model.t_start_us = ggml_time_us();
  
      std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
  
-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
      model.hparams = ml->file_loaders.at(0)->hparams;
      model.n_gpu_layers = n_gpu_layers;
      llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1131,15 @@ static void llama_model_load_internal(
  
      // create the ggml context
      {
-        lctx.model.buf.resize(ctx_size);
+        model.buf.resize(ctx_size);
          if (use_mlock) {
-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
          }
  
          struct ggml_init_params params = {
-            /*.mem_size   =*/ lctx.model.buf.size,
-            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.mem_size   =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
              /*.no_alloc   =*/ ml->use_mmap,
          };
  
@@ -1311,7 +1320,7 @@ static void llama_model_load_internal(
      }
  #endif
  
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
  
      if (progress_callback) {
          progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1330,13 @@ static void llama_model_load_internal(
  
      // loading time will be recalculate after the first eval, so
      // we take page faults deferred by mmap() into consideration
-    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+    model.t_load_us = ggml_time_us() - model.t_start_us;
  }
  
  static bool llama_model_load(
          const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
          int n_ctx,
          int n_batch,
          int n_gpu_layers,
@@ -1340,7 +1350,7 @@ static bool llama_model_load(
          llama_progress_callback progress_callback,
          void *progress_callback_user_data) {
      try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                    use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
          return true;
      } catch (const std::exception & err) {
@@ -1378,7 +1388,7 @@ static bool llama_eval_internal(
      const auto & model   = lctx.model;
      const auto & hparams = model.hparams;
  
-    const auto & kv_self = model.kv_self;
+    const auto & kv_self = lctx.kv_self;
  
      LLAMA_ASSERT(!!kv_self.ctx);
  
@@ -1726,7 +1736,7 @@ static bool llama_eval_internal(
      //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
  
      // update kv token count
-    lctx.model.kv_self.n = n_past + N;
+    lctx.kv_self.n = n_past + N;
  
      // extract logits
      {
@@ -2634,12 +2644,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // interface implementation
  //
  
-struct llama_context * llama_init_from_file(
+struct llama_model * llama_load_model_from_file(
                               const char * path_model,
              struct llama_context_params   params) {
      ggml_time_init();
  
-    llama_context * ctx = new llama_context;
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) { // destroys a model returned by llama_load_model_from_file
+    delete model; // deleting a null pointer is a no-op, so a NULL model is accepted
+}
+
+struct llama_context * llama_new_context_with_model(
+                             struct llama_model * model,
+            struct llama_context_params   params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
  
      if (params.seed < 0) {
          params.seed = time(NULL);
@@ -2667,24 +2704,16 @@ struct llama_context * llama_init_from_file(
  
      ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
  
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
      // reserve memory for context buffers
      if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
              fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
              llama_free(ctx);
              return nullptr;
          }
  
          {
-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
              fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
          }
  
@@ -2736,8 +2765,8 @@ struct llama_context * llama_init_from_file(
  
          LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
  
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr,       ctx->buf_compute.size,       0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
  
          LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
          LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2777,23 @@ struct llama_context * llama_init_from_file(
      return ctx;
  }
  
+// Deprecated convenience wrapper: loads the model and creates a context that owns it, so that
+// llama_free() on the returned context also deletes the model. Returns nullptr on any failure.
+struct llama_context * llama_init_from_file(const char * path_model, struct llama_context_params params) {
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    struct llama_context * ctx = llama_new_context_with_model(model, params); // nullptr for a null model too
+    if (!ctx) {
+        llama_free_model(model); // nothing owns the model yet; a no-op when the load itself failed
+        return nullptr;
+    }
+    ctx->model_owner = true; // only set once the context exists, never through a null pointer
+    return ctx;
+}
+
  void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
      delete ctx;
  }
  
@@ -2765,11 +2810,9 @@ int llama_model_quantize(
      }
  }
  
-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
      fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
  
-    auto & model = ctx->model;
-
      const int64_t t_start_lora_us = ggml_time_us();
  
      auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -3012,7 +3055,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
      try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
      } catch (const std::exception & err) {
          fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
          return 1;
@@ -3020,7 +3072,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->model.kv_self.n;
+    return ctx->kv_self.n;
  }
  
  #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3097,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
      const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
      const size_t s_kv_size         = sizeof(size_t);
      const size_t s_kv_ntok         = sizeof(int);
-    const size_t s_kv              = ctx->model.kv_self.buf.size;
+    const size_t s_kv              = ctx->kv_self.buf.size;
  
      const size_t s_total = (
          + s_rng_size
@@ -3111,7 +3163,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  
      // copy kv cache
      {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
          const auto & hparams = ctx->model.hparams;
          const int    n_layer = hparams.n_layer;
          const int    n_embd  = hparams.n_embd;
@@ -3215,7 +3267,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  
      // set kv cache
      {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
          const auto & hparams = ctx->model.hparams;
          const int    n_layer = hparams.n_layer;
          const int    n_embd  = hparams.n_embd;
@@ -3259,7 +3311,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
              ggml_free(cpy_ctx);
          }
  
-        ctx->model.kv_self.n = kv_ntok;
+        ctx->kv_self.n = kv_ntok;
      }
  
      const size_t nread    = inp - src;
@@ -3506,6 +3558,6 @@ const char * llama_print_system_info(void) {
  }
  
  // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
      return ctx->model.tensors_by_name;
  }
diff --git a/llama.h b/llama.h

index 0de530d456932ccb265cc1f51aece6468646f904..a833a7f4d66cc5d865f57bbdd7e131ddf5baaf88 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -26,6 +26,14 @@
  #    define LLAMA_API
  #endif
  
+#ifdef __GNUC__ // GCC-compatible compilers: the attribute follows the declaration
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER) // MSVC: the __declspec precedes the declaration
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else // any other compiler: expand to the bare declaration, no deprecation warning
+#    define DEPRECATED(func, hint) func
+#endif
+
  #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
  #define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
  #define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
@@ -53,6 +61,7 @@ extern "C" {
      // TODO: show sample usage
      //
  
+    struct llama_model;
      struct llama_context;
  
      typedef int llama_token;
@@ -136,12 +145,23 @@ extern "C" {
  
      LLAMA_API int64_t llama_time_us();
  
+    LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+            struct llama_context_params   params);
+
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+                     struct llama_model * model,
+            struct llama_context_params   params);
+
      // Various functions for loading a ggml llama model.
      // Allocate (almost) all memory needed for the model.
      // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
+    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
                               const char * path_model,
-            struct llama_context_params   params);
+            struct llama_context_params   params),
+            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
  
      // Frees all allocated memory
      LLAMA_API void llama_free(struct llama_context * ctx);
@@ -158,8 +178,15 @@ extern "C" {
      // The model needs to be reloaded before applying a new adapter, otherwise the adapter
      // will be applied on top of the previous one
      // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
              struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads),
+            "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
                        const char * path_lora,
                        const char * path_base_model,
                               int   n_threads);
@@ -310,7 +337,7 @@ extern "C" {
  #include <string>
  struct ggml_tensor;
  
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
  
  #endif
  
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp

index ab1538a0cf304363820ee76361a797395158654c..20abe710018ee559525b192b065dab8309d47325 100644 (file)
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -28,6 +28,7 @@ int main(int argc, char **argv) {
  
      fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
  
+    llama_model * model;
      llama_context * ctx;
  
      // load the vocab
@@ -36,10 +37,18 @@ int main(int argc, char **argv) {
  
          lparams.vocab_only = true;
  
-        ctx = llama_init_from_file(fname.c_str(), lparams);
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
  
          if (ctx == NULL) {
              fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
              return 1;
          }
      }
@@ -48,6 +57,8 @@ int main(int argc, char **argv) {
  
      if (n_vocab != 32000) {
          fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+        llama_free_model(model);
+        llama_free(ctx);
          return 2;
      }
  
@@ -77,10 +88,13 @@ int main(int argc, char **argv) {
              }
              fprintf(stderr, "\n");
  
+            llama_free_model(model);
+            llama_free(ctx);
              return 3;
          }
      }
  
+    llama_free_model(model);
      llama_free(ctx);
  
      return 0;
author	Didzis Gosko <redacted>
	Sat, 24 Jun 2023 08:47:58 +0000 (11:47 +0300)
committer	GitHub <redacted>
	Sat, 24 Jun 2023 08:47:58 +0000 (11:47 +0300)
examples/common.cpp		patch \| blob \| history
examples/common.h		patch \| blob \| history
examples/embedding/embedding.cpp		patch \| blob \| history
examples/main/main.cpp		patch \| blob \| history
examples/perplexity/perplexity.cpp		patch \| blob \| history
examples/quantize-stats/quantize-stats.cpp		patch \| blob \| history
examples/save-load-state/save-load-state.cpp		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history
examples/simple/simple.cpp		patch \| blob \| history
examples/train-text-from-scratch/train-text-from-scratch.cpp		patch \| blob \| history
llama.cpp		patch \| blob \| history
llama.h		patch \| blob \| history
tests/test-tokenizer-0.cpp		patch \| blob \| history