llama : use smart pointers for ggml resources (#10117)

author Diego Devesa <redacted>

Fri, 1 Nov 2024 22:48:26 +0000 (23:48 +0100)

committer GitHub <redacted>

Fri, 1 Nov 2024 22:48:26 +0000 (23:48 +0100)
author Diego Devesa <redacted>
Fri, 1 Nov 2024 22:48:26 +0000 (23:48 +0100)
committer GitHub <redacted>
Fri, 1 Nov 2024 22:48:26 +0000 (23:48 +0100)
diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h

new file mode 100644 (file)

index 0000000..219361a
--- /dev/null
+++ b/ggml/include/ggml-cpp.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include <memory>
+
+// Smart pointers for ggml types
+
+// ggml
+
+struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
+struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
+
+typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
+typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
+
+// ggml-alloc
+
+struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
+
+typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
+
+// ggml-backend
+
+struct ggml_backend_deleter        { void operator()(ggml_backend_t backend)       { ggml_backend_free(backend); } };
+struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
+struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t event)   { ggml_backend_event_free(event); } };
+struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t sched)   { ggml_backend_sched_free(sched); } };
+
+typedef std::unique_ptr<ggml_backend,        ggml_backend_deleter>        ggml_backend_ptr;
+typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
+typedef std::unique_ptr<ggml_backend_event,  ggml_backend_event_deleter>  ggml_backend_event_ptr;
+typedef std::unique_ptr<ggml_backend_sched,  ggml_backend_sched_deleter>  ggml_backend_sched_ptr;
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt

index 7365ac91bf5f20b562489bc9e4ca76b0686c9b9b..0764a8d9023a63e7aec9eaa6ca7befadced2e75b 100644 (file)
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1368,6 +1368,7 @@ add_library(ggml
              ../include/ggml.h
              ../include/ggml-alloc.h
              ../include/ggml-backend.h
+            ../include/ggml-cpp.h
              ggml.c
              ggml-alloc.c
              ggml-backend.cpp
diff --git a/spm-headers/ggml-cpp.h b/spm-headers/ggml-cpp.h

new file mode 120000 (symlink)

index 0000000..8a8604c
--- /dev/null
+++ b/spm-headers/ggml-cpp.h
@@ -0,0 +1 @@
+../ggml/include/ggml-cpp.h
+\ No newline at end of file
diff --git a/src/llama.cpp b/src/llama.cpp

index ca0d259b21fb9802daa1e0cd255477c157782e57..0991c40893ceb0553fe360311944382741dba806 100644 (file)
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7,6 +7,7 @@
  #include "ggml.h"
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
+#include "ggml-cpp.h"
  
  // TODO: replace with ggml API call
  #define QK_K 256
@@ -2797,31 +2798,22 @@ struct llama_kv_cache {
      std::vector<struct ggml_tensor *> k_l; // per layer
      std::vector<struct ggml_tensor *> v_l;
  
-    std::vector<struct ggml_context *> ctxs;
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
  
-    size_t total_size() const {
+    size_t total_size() {
          size_t size = 0;
-        for (ggml_backend_buffer_t buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf);
+        for (auto & buf : bufs) {
+            size += ggml_backend_buffer_get_size(buf.get());
          }
          return size;
      }
-
-    ~llama_kv_cache() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-    }
  };
  
  struct llama_control_vector {
      std::vector<struct ggml_tensor *> tensors; // per layer
-    std::vector<struct ggml_context *> ctxs;
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
  
      int32_t layer_start = -1;
      int32_t layer_end   = -1;
@@ -2840,15 +2832,6 @@ struct llama_control_vector {
          }
          return cur;
      }
-
-    ~llama_control_vector() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-    }
  };
  
  struct llama_model {
@@ -2908,10 +2891,10 @@ struct llama_model {
      std::vector<layer_dev> dev_layer;
  
      // contexts where the model tensors metadata is stored
-    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_context_ptr> ctxs;
  
      // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
  
      // model memory mapped files
      llama_mmaps mappings;
@@ -2930,13 +2913,7 @@ struct llama_model {
      std::set<struct llama_lora_adapter *> lora_adapters;
  
      ~llama_model() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-        while (!lora_adapters.empty()) {
+       while (!lora_adapters.empty()) {
              llama_lora_adapter_free(*lora_adapters.begin());
          }
      }
@@ -3253,16 +3230,6 @@ struct llama_context {
          , t_start_us(model.t_start_us)
          , t_load_us(model.t_load_us) {}
  
-    ~llama_context() {
-        ggml_backend_sched_free(sched);
-
-        for (ggml_backend_t backend : backends) {
-            ggml_backend_free(backend);
-        }
-
-        ggml_backend_buffer_free(buf_output);
-    }
-
      const struct llama_model & model;
  
      struct llama_cparams        cparams;
@@ -3272,7 +3239,7 @@ struct llama_context {
  
      std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
  
-    std::vector<ggml_backend_t> backends;
+    std::vector<ggml_backend_ptr> backends;
      std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
  
      ggml_backend_t backend_cpu = nullptr;
@@ -3294,7 +3261,7 @@ struct llama_context {
      mutable int32_t n_eval   = 0; // number of eval calls
  
      // host buffer for the model output (logits and embeddings)
-    ggml_backend_buffer_t buf_output = nullptr;
+    ggml_backend_buffer_ptr buf_output;
  
      // decode output (2-dimensional array: [n_outputs][n_vocab])
      size_t  logits_size = 0; // capacity (of floats) for logits
@@ -3324,7 +3291,7 @@ struct llama_context {
  
      // memory buffers used to evaluate the model
      std::vector<uint8_t> buf_compute_meta;
-    ggml_backend_sched_t sched = nullptr;
+    ggml_backend_sched_ptr sched;
  
      ggml_abort_callback abort_callback      = nullptr;
      void *              abort_callback_data = nullptr;
@@ -3358,8 +3325,8 @@ struct llama_lora_adapter {
      struct llama_model * base_model;
      // map tensor name to lora_a_b
      std::unordered_map<std::string, struct llama_lora_weight> ab_map;
-    std::vector<struct ggml_context *> ctxs;
-    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
  
      float alpha;
  
@@ -3377,12 +3344,6 @@ struct llama_lora_adapter {
      }
  
      ~llama_lora_adapter() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
          auto pos = base_model->lora_adapters.find(this);
          if (pos != base_model->lora_adapters.end()) {
              base_model->lora_adapters.erase(pos);
@@ -3401,24 +3362,21 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
          /*.mem_buffer =*/ NULL,
          /*.no_alloc   =*/ true,
      };
-    ggml_context * ctx = ggml_init(params);
+    ggml_context_ptr ctx { ggml_init(params) };
      if (!ctx) {
          throw std::runtime_error(format("failed to create ggml context"));
      }
  
-    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0);
-    ggml_tensor * op_tensor = fn(ctx);
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
      for (int i = 0; i < GGML_MAX_SRC; i++) {
          if (op_tensor->src[i] != nullptr) {
              assert(op_tensor->src[i]->buffer == nullptr);
-            op_tensor->src[i]->buffer = buf;
+            op_tensor->src[i]->buffer = buf.get();
          }
      }
      bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buf);
-
      return op_supported;
  }
  
@@ -3470,7 +3428,8 @@ static bool llama_kv_cache_init(
      // create a context for each buffer type
      std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
      auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        if (ctx_map.count(buft) == 0) {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
              struct ggml_init_params params = {
                  /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
                  /*.mem_buffer =*/ NULL,
@@ -3481,9 +3440,10 @@ static bool llama_kv_cache_init(
                  return nullptr;
              }
              ctx_map[buft] = ctx;
-            cache.ctxs.push_back(ctx);
+            cache.ctxs.emplace_back(ctx);
+            return ctx;
          }
-        return ctx_map.at(buft);
+        return it->second;
      };
  
      cache.k_l.reserve(n_layer);
@@ -3535,7 +3495,7 @@ static bool llama_kv_cache_init(
          }
          ggml_backend_buffer_clear(buf, 0);
          LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-        cache.bufs.push_back(buf);
+        cache.bufs.emplace_back(buf);
      }
  
      return true;
@@ -3788,7 +3748,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
      cache.used = 0;
  
      for (auto & buf : cache.bufs) {
-        ggml_backend_buffer_clear(buf, 0);
+        ggml_backend_buffer_clear(buf.get(), 0);
      }
  }
  
@@ -4301,8 +4261,8 @@ struct llama_model_loader {
      std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
      std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
  
-    struct gguf_context * meta = NULL;
-    std::vector<ggml_context *> contexts;
+    gguf_context_ptr meta;
+    std::vector<ggml_context_ptr> contexts;
  
      std::string arch_name;
      LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
@@ -4325,7 +4285,7 @@ struct llama_model_loader {
              /*.ctx      = */ &ctx,
          };
  
-        meta = gguf_init_from_file(fname.c_str(), params);
+        meta.reset(gguf_init_from_file(fname.c_str(), params));
          if (!meta) {
              throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
          }
@@ -4347,7 +4307,7 @@ struct llama_model_loader {
              }
              n_elements += ggml_nelements(cur);
              n_bytes    += ggml_nbytes(cur);
-            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
          }
          uint16_t n_split = 0;
          get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@@ -4377,7 +4337,7 @@ struct llama_model_loader {
                      /*.no_alloc = */ true,
                      /*.ctx      = */ &ctx,
                  };
-                struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+                gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path, split_params) };
                  if (!ctx_gguf) {
                      throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
                  }
@@ -4394,10 +4354,8 @@ struct llama_model_loader {
                      }
                      n_elements += ggml_nelements(cur);
                      n_bytes    += ggml_nbytes(cur);
-                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
+                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
                  }
-
-                gguf_free(ctx_gguf);
              }
  
              get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
@@ -4413,10 +4371,10 @@ struct llama_model_loader {
              LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n",  __func__, n_split - 1);
          }
  
-        n_kv      = gguf_get_n_kv(meta);
+        n_kv      = gguf_get_n_kv(meta.get());
          n_tensors = weights_map.size();
  
-        fver = (enum llama_fver) gguf_get_version(meta);
+        fver = (enum llama_fver) gguf_get_version(meta.get());
  
          LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                  __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
@@ -4487,23 +4445,23 @@ struct llama_model_loader {
              ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
  
              {
-                const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
+                const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV
                  if (kid >= 0) {
-                    ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
+                    ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid);
                  }
              }
  
              LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
  
              for (int i = 0; i < n_kv; i++) {
-                const char * name           = gguf_get_key(meta, i);
-                const enum gguf_type type   = gguf_get_kv_type(meta, i);
+                const char * name           = gguf_get_key(meta.get(), i);
+                const enum gguf_type type   = gguf_get_kv_type(meta.get(), i);
                  const std::string type_name =
                      type == GGUF_TYPE_ARRAY
-                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
+                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
                      : gguf_type_name(type);
  
-                std::string value          = gguf_kv_to_str(meta, i);
+                std::string value          = gguf_kv_to_str(meta.get(), i);
                  const size_t MAX_VALUE_LEN = 40;
                  if (value.size() > MAX_VALUE_LEN) {
                      value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -4532,19 +4490,10 @@ struct llama_model_loader {
          this->check_tensors = check_tensors;
      }
  
-    ~llama_model_loader() {
-        if (meta) {
-            gguf_free(meta);
-        }
-        for (auto * ctx : contexts) {
-            ggml_free(ctx);
-        }
-    }
-
      template<typename T>
      typename std::enable_if<std::is_integral<T>::value, bool>::type
      get_arr_n(const std::string & key, T & result, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
  
          if (kid < 0) {
              if (required) {
@@ -4554,7 +4503,7 @@ struct llama_model_loader {
          }
  
          struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
  
  
          result = arr_info.length;
@@ -4569,9 +4518,9 @@ struct llama_model_loader {
  
      template<typename T>
      bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
  
-        if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
+        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
              if (required) {
                  throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
              }
@@ -4579,7 +4528,7 @@ struct llama_model_loader {
          }
  
          struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
  
          switch (arr_info.gt) {
              case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
@@ -4598,9 +4547,9 @@ struct llama_model_loader {
  
      template<typename T, size_t N_MAX>
      bool get_arr(const std::string & key, std::array<T, N_MAX> & result, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
  
-        if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
+        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
              if (required) {
                  throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
              }
@@ -4608,7 +4557,7 @@ struct llama_model_loader {
          }
  
          struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
  
          switch (arr_info.gt) {
              case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
@@ -4640,7 +4589,7 @@ struct llama_model_loader {
          const struct llama_model_kv_override * override =
              it != kv_overrides.end() ? &it->second : nullptr;
  
-        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+        const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
  
          if (required && !found) {
              throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -4657,7 +4606,7 @@ struct llama_model_loader {
      // get array of n <= N_MAX elements, or a single element repeated n times
      template<typename T, size_t N_MAX>
      bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
-        const int kid = gguf_find_key(meta, key.c_str());
+        const int kid = gguf_find_key(meta.get(), key.c_str());
  
          if (kid < 0) {
              if (required) {
@@ -4670,9 +4619,9 @@ struct llama_model_loader {
              throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
          }
  
-        if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
+        if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
              struct GGUFMeta::ArrayInfo arr_info =
-                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
  
              if (n != arr_info.length) {
                  throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
@@ -5342,7 +5291,7 @@ static void llm_load_hparams(
          llama_model_loader & ml,
          llama_model & model) {
      auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.meta;
+    const gguf_context * ctx = ml.meta.get();
  
      // get metadata as string
      for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -6109,7 +6058,7 @@ static void llm_load_vocab(
          llama_model & model) {
      auto & vocab = model.vocab;
  
-    struct gguf_context * ctx = ml.meta;
+    struct gguf_context * ctx = ml.meta.get();
  
      const auto kv = LLM_KV(model.arch);
  
@@ -7104,10 +7053,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
          /*.mem_buffer =*/ NULL,
          /*.no_alloc   =*/ true,
      };
-    ggml_context * ctx = ggml_init(params);
-    if (!ctx) {
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    if (!ctx_ptr) {
          throw std::runtime_error(format("failed to create ggml context"));
      }
+    ggml_context * ctx = ctx_ptr.get();
  
      ggml_tensor * op_tensor = nullptr;
  
@@ -7203,8 +7153,6 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
      ggml_backend_buffer_free(w->buffer);
      w->buffer = nullptr;
  
-    ggml_free(ctx);
-
      return op_supported;
  }
  
@@ -7395,7 +7343,8 @@ static bool llm_load_tensors(
  
      std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
      auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        if (ctx_map.count(buft) == 0) {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
              ggml_init_params params = {
                  /*.mem_size   =*/ ctx_size,
                  /*.mem_buffer =*/ NULL,
@@ -7406,9 +7355,10 @@ static bool llm_load_tensors(
                  throw std::runtime_error(format("failed to create ggml context"));
              }
              ctx_map[buft] = ctx;
-            model.ctxs.push_back(ctx);
+            model.ctxs.emplace_back(ctx);
+            return ctx;
          }
-        return ctx_map.at(buft);
+        return it->second;
      };
  
      // create tensors for the weights
@@ -9134,7 +9084,7 @@ static bool llm_load_tensors(
                  if (buf == nullptr) {
                      throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                  }
-                model.bufs.push_back(buf);
+                model.bufs.emplace_back(buf);
                  bufs.emplace(idx, buf);
              }
          }
@@ -9143,7 +9093,7 @@ static bool llm_load_tensors(
              if (buf == nullptr) {
                  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
              }
-            model.bufs.push_back(buf);
+            model.bufs.emplace_back(buf);
              if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                  model.mlock_bufs.emplace_back(new llama_mlock);
                  auto & mlock_buf = model.mlock_bufs.back();
@@ -9183,13 +9133,13 @@ static bool llm_load_tensors(
      }
  
      // print memory requirements per buffer type
-    for (ggml_backend_buffer_t buf : model.bufs) {
-        LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+    for (auto & buf : model.bufs) {
+        LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
      }
  
      // populate tensors_by_name
-    for (ggml_context * ctx : model.ctxs) {
-        for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+    for (auto & ctx : model.ctxs) {
+        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
              model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
          }
      }
@@ -10294,10 +10244,8 @@ struct llm_build_context {
      }
  
      void free() {
-        if (ctx0) {
-            ggml_free(ctx0);
-            ctx0 = nullptr;
-        }
+        ggml_free(ctx0);
+        ctx0 = nullptr;
      }
  
      struct ggml_cgraph * build_k_shift() {
@@ -10325,10 +10273,10 @@ struct llm_build_context {
                  // dequantize to f32 -> RoPE -> quantize back
                  tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
                  cb(tmp, "K_f32", il);
-                for (auto * backend : lctx.backends) {
+                for (auto & backend : lctx.backends) {
                      // Figure out which backend KV cache belongs to
-                    if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
-                        ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend);
+                    if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get());
                          break;
                      }
                  }
@@ -16443,7 +16391,7 @@ static struct ggml_cgraph * llama_build_graph(
          if (!lctx.cparams.offload_kqv) {
              if (strcmp(name, "kqv_merged_cont") == 0) {
                  // all nodes between the KV store and the attention output are run on the CPU
-                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu);
+                ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu);
              }
          }
  
@@ -16453,10 +16401,10 @@ static struct ggml_cgraph * llama_build_graph(
          if (ubatch.n_tokens < 32 || full_offload) {
              if (il != -1 && strcmp(name, "norm") == 0) {
                  const auto & dev_layer = lctx.model.dev_layer.at(il);
-                for (auto * backend : lctx.backends) {
-                    if (ggml_backend_get_device(backend) == dev_layer.dev) {
-                        if (ggml_backend_supports_op(backend, cur)) {
-                            ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                for (auto & backend : lctx.backends) {
+                    if (ggml_backend_get_device(backend.get()) == dev_layer.dev) {
+                        if (ggml_backend_supports_op(backend.get(), cur)) {
+                            ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
                          }
                      }
                  }
@@ -17143,7 +17091,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
          lctx.output_ids.resize(n_batch);
      }
  
-    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
+    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
      const size_t new_size  = (logits_size + embd_size) * sizeof(float);
  
      // alloc only when more than the current capacity is required
@@ -17154,7 +17102,6 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
              // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
              LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif
-            ggml_backend_buffer_free(lctx.buf_output);
              lctx.buf_output = nullptr;
              lctx.logits = nullptr;
              lctx.embd = nullptr;
@@ -17167,14 +17114,14 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
          if (output_dev_host_buft) {
              buft = output_dev_host_buft;
          }
-        lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
+        lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
          if (lctx.buf_output == nullptr) {
              LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
              return 0;
          }
      }
  
-    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
+    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get());
  
      lctx.logits = has_logits ? output_base               : nullptr;
      lctx.embd   = has_embd   ? output_base + logits_size : nullptr;
@@ -17186,7 +17133,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
      // set all ids as invalid (negative)
      std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
  
-    ggml_backend_buffer_clear(lctx.buf_output, 0);
+    ggml_backend_buffer_clear(lctx.buf_output.get(), 0);
  
      lctx.n_outputs = 0;
  
@@ -17246,7 +17193,7 @@ static void llama_graph_compute(
          set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
      }
  
-    auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
      if (err != GGML_STATUS_SUCCESS) {
          LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
      }
@@ -17404,8 +17351,8 @@ static int llama_decode_internal(
  
          //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
  
-        ggml_backend_sched_reset(lctx.sched);
-        ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+        ggml_backend_sched_reset(lctx.sched.get());
+        ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
  
          ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
  
@@ -17433,7 +17380,7 @@ static int llama_decode_internal(
          }
          // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
  
-        ggml_backend_sched_alloc_graph(lctx.sched, gf);
+        ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
  
          llama_set_inputs(lctx, ubatch);
  
@@ -17456,7 +17403,7 @@ static int llama_decode_internal(
  
          // extract logits
          if (res) {
-            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
+            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res);
              GGML_ASSERT(backend_res != nullptr);
              GGML_ASSERT(lctx.logits != nullptr);
  
@@ -17472,7 +17419,7 @@ static int llama_decode_internal(
  
          // extract embeddings
          if (embd) {
-            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
              GGML_ASSERT(backend_embd != nullptr);
  
              switch (cparams.pooling_type) {
@@ -17567,7 +17514,7 @@ static int llama_decode_internal(
  
      // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
      // overlap with device computation.
-    ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_reset(lctx.sched.get());
  
      return 0;
  }
@@ -17645,8 +17592,8 @@ static int llama_encode_internal(
  
      GGML_ASSERT(n_threads > 0);
  
-    ggml_backend_sched_reset(lctx.sched);
-    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+    ggml_backend_sched_reset(lctx.sched.get());
+    ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
  
      ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
  
@@ -17670,7 +17617,7 @@ static int llama_encode_internal(
          }
      }
  
-    ggml_backend_sched_alloc_graph(lctx.sched, gf);
+    ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
  
      llama_set_inputs(lctx, ubatch);
  
@@ -17678,7 +17625,7 @@ static int llama_encode_internal(
  
      // extract embeddings
      if (embd) {
-        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
          GGML_ASSERT(backend_embd != nullptr);
  
          if (llama_model_has_decoder(&lctx.model)) {
@@ -17745,7 +17692,7 @@ static int llama_encode_internal(
  
      // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
      // overlap with device computation.
-    ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_reset(lctx.sched.get());
  
      return 0;
  }
@@ -17959,7 +17906,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
  #else
      // ggml_graph defrag
  
-    ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_reset(lctx.sched.get());
  
      ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
  
@@ -17981,11 +17928,11 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
          }
  
          {
-            ggml_backend_sched_reset(lctx.sched);
+            ggml_backend_sched_reset(lctx.sched.get());
  
              ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
  
-            ggml_backend_sched_alloc_graph(lctx.sched, gf);
+            ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
  
              llama_set_k_shift(lctx);
  
@@ -18025,8 +17972,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
          ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
  
          // initialize scheduler with the worst-case graph
-        ggml_backend_sched_reset(lctx.sched);
-        if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
+        ggml_backend_sched_reset(lctx.sched.get());
+        if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) {
              LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
          }
      }
@@ -18577,30 +18524,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      }
  
      const size_t align = GGUF_DEFAULT_ALIGNMENT;
-    struct gguf_context * ctx_out = gguf_init_empty();
+    gguf_context_ptr ctx_out { gguf_init_empty() };
  
      // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml.meta);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
+    gguf_set_kv     (ctx_out.get(), ml.meta.get());
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
  
      // Remove split metadata
-    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
-    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
-    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
  
      if (params->kv_overrides) {
          const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
-        for (auto & o : overrides) {
+        for (const auto & o : overrides) {
              if (o.key[0] == 0) break;
              if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-                gguf_set_val_f32(ctx_out, o.key, o.val_f64);
+                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
              } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-                gguf_set_val_i32(ctx_out, o.key, o.val_i64);
+                gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
              } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-                gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
              } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
-                gguf_set_val_str(ctx_out, o.key, o.val_str);
+                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
              } else {
                  LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
              }
@@ -18672,25 +18619,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
              n_split = std::max(uint16_t(it->idx + 1), n_split);
          }
      }
-    std::vector<gguf_context*> ctx_outs(n_split, NULL);
-    ctx_outs[0] = ctx_out;
+    std::vector<gguf_context_ptr> ctx_outs(n_split);
+    ctx_outs[0] = std::move(ctx_out);
  
      // populate the original tensors so we get an initial meta data
      for (const auto * it : tensors) {
          uint16_t i_split = params->keep_split ? it->idx : 0;
          struct ggml_tensor * tensor = it->tensor;
-        if (ctx_outs[i_split] == NULL) {
-            ctx_outs[i_split] = gguf_init_empty();
+        if (!ctx_outs[i_split]) {
+            ctx_outs[i_split].reset(gguf_init_empty());
          }
-        gguf_add_tensor(ctx_outs[i_split], tensor);
+        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
      }
  
      // Set split info if needed
      if (n_split > 1) {
          for (size_t i = 0; i < ctx_outs.size(); ++i) {
-            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
-            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
-            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
          }
      }
  
@@ -18700,8 +18647,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          // Write metadata and close file handler
          if (fout.is_open()) {
              fout.seekp(0);
-            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
-            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
+            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
              fout.write((const char *) data.data(), data.size());
              fout.close();
          }
@@ -18718,7 +18665,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  
          fout = std::ofstream(fname, std::ios::binary);
          fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
          // placeholder for the meta data
          ::zeros(fout, meta_size);
      };
@@ -18903,17 +18850,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          total_size_new += new_size;
  
          // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
  
          // write tensor data + padding
          fout.write((const char *) new_data, new_size);
          zeros(fout, GGML_PAD(new_size, align) - new_size);
      }
      close_ofstream();
-    for (auto & c:ctx_outs) {
-        gguf_free(c);
-    }
  
      LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
      LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
@@ -18927,51 +18871,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
      LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
  
-    ggml_context * ctx = nullptr;
+    ggml_context * ctx_init;
      struct gguf_init_params meta_gguf_params = {
          /* .no_alloc = */ true,
-        /* .ctx      = */ &ctx,
+        /* .ctx      = */ &ctx_init,
      };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
+
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
      if (!ctx_gguf) {
          throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
      }
  
+    ggml_context_ptr ctx { ctx_init };
+
      // check metadata
      {
          auto get_kv_str = [&](const std::string & key) -> std::string {
-            int id = gguf_find_key(ctx_gguf, key.c_str());
-            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
          };
          auto get_kv_f32 = [&](const std::string & key) -> float {
-            int id = gguf_find_key(ctx_gguf, key.c_str());
-            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
          };
          LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
  
          auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
          if (general_type != "adapter") {
-            gguf_free(ctx_gguf);
              throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
          }
  
          auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
          auto general_arch = llm_arch_from_string(general_arch_str);
          if (general_arch != model->arch) {
-            gguf_free(ctx_gguf);
              throw std::runtime_error("model arch and LoRA arch mismatch");
          }
  
          auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
          if (adapter_type != "lora") {
-            gguf_free(ctx_gguf);
              throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
          }
  
          adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
      }
  
-    int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
  
      // contexts for each buffer type
      std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -18985,7 +18929,11 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                  /*.no_alloc   =*/ true,
              };
              ggml_context * buft_ctx = ggml_init(params);
+            if (!buft_ctx) {
+                return nullptr;
+            }
              ctx_map[buft] = buft_ctx;
+            adapter.ctxs.emplace_back(buft_ctx);
              return buft_ctx;
          };
          return it->second;
@@ -18996,7 +18944,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
      auto str_endswith = [](const std::string & str, const std::string & suffix) {
          return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
      };
-    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
          std::string name(cur->name);
          if (str_endswith(name, ".lora_a")) {
              replace_all(name, ".lora_a", "");
@@ -19013,8 +18961,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                  ab_map[name].b = cur;
              }
          } else {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
              throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
          }
      }
@@ -19025,28 +18971,20 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
          llama_lora_weight & w = it.second;
  
          if (!w.a || !w.b) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
              throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
          }
  
          // device buft and device ctx
          auto * model_tensor = llama_get_model_tensor(model, name.c_str());
          if (!model_tensor) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
              throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
          }
          struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
          // validate tensor shape
          if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
              throw std::runtime_error("tensor '" + name + "' has incorrect shape");
          }
          if (w.a->ne[1] != w.b->ne[0]) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
              throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
          }
          // save tensor to adapter
@@ -19061,18 +18999,15 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
      {
          adapter.ctxs.reserve(ctx_map.size());
          adapter.bufs.reserve(ctx_map.size());
-        for (auto it : ctx_map) {
+        for (auto & it : ctx_map) {
              ggml_backend_buffer_type_t buft = it.first;
              ggml_context * ctx_dev = it.second;
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
+            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
              if (!buf) {
-                gguf_free(ctx_gguf);
-                ggml_free(ctx);
                  throw std::runtime_error("failed to allocate buffer for lora adapter\n");
              }
-            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-            adapter.ctxs.push_back(ctx_dev);
-            adapter.bufs.push_back(buf);
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
          }
      }
  
@@ -19081,7 +19016,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
          llama_file gguf_file(path_lora, "rb");
          std::vector<uint8_t> read_buf;
          auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
-            size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
+            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
              size_t size = ggml_nbytes(orig);
              read_buf.resize(size);
              gguf_file.seek(offs, SEEK_SET);
@@ -19097,10 +19032,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
      }
  
      LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
-
-    // free ctx for reading gguf
-    gguf_free(ctx_gguf);
-    ggml_free(ctx);
  }
  
  int32_t llama_lora_adapter_set(
@@ -19549,7 +19480,7 @@ struct llama_context * llama_new_context_with_model(
                  llama_free(ctx);
                  return nullptr;
              }
-            ctx->backends.push_back(backend);
+            ctx->backends.emplace_back(backend);
          }
  
          // add ACCEL backends (such as BLAS)
@@ -19562,7 +19493,7 @@ struct llama_context * llama_new_context_with_model(
                      llama_free(ctx);
                      return nullptr;
                  }
-                ctx->backends.push_back(backend);
+                ctx->backends.emplace_back(backend);
              }
          }
  
@@ -19573,16 +19504,16 @@ struct llama_context * llama_new_context_with_model(
              llama_free(ctx);
              return nullptr;
          }
-        ctx->backends.push_back(ctx->backend_cpu);
+        ctx->backends.emplace_back(ctx->backend_cpu);
  
          // create a list of the set_n_threads functions in the backends
-        for (auto * backend : ctx->backends) {
-            ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        for (auto & backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
              ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
              if (reg) {
                  auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
                  if (ggml_backend_set_n_threads_fn) {
-                    ctx->set_n_threads_fns.emplace_back(backend, ggml_backend_set_n_threads_fn);
+                    ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
                  }
              }
          }
@@ -19621,17 +19552,18 @@ struct llama_context * llama_new_context_with_model(
              }
  
              LLAMA_LOG_INFO("%s: %10s  output buffer size = %8.2f MiB\n", __func__,
-                    ggml_backend_buffer_name(ctx->buf_output),
-                    ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
+                    ggml_backend_buffer_name(ctx->buf_output.get()),
+                    ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0);
          }
  
          // scheduler and compute buffers
          {
              // buffer types used for the compute buffer of each backend
              std::vector<ggml_backend_buffer_type_t> backend_buft;
-            for (auto * backend : ctx->backends) {
-                auto * buft = ggml_backend_get_default_buffer_type(backend);
-                if (ggml_backend_is_cpu(backend) && !model->devices.empty()) {
+            std::vector<ggml_backend_t> backend_ptrs;
+            for (auto & backend : ctx->backends) {
+                auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+                if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
                      // use the host buffer of the first device CPU for faster transfer of the intermediate state
                      auto * dev = model->devices[0];
                      auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19640,6 +19572,7 @@ struct llama_context * llama_new_context_with_model(
                      }
                  }
                  backend_buft.push_back(buft);
+                backend_ptrs.push_back(backend.get());
              }
  
              const size_t max_nodes = llama_model_max_nodes(*model);
@@ -19657,12 +19590,12 @@ struct llama_context * llama_new_context_with_model(
  
              // pipeline parallelism requires support for async compute and events in all devices
              if (pipeline_parallel) {
-                for (auto * backend : ctx->backends) {
-                    if (ggml_backend_is_cpu(backend)) {
+                for (auto & backend : ctx->backends) {
+                    if (ggml_backend_is_cpu(backend.get())) {
                          // ignore CPU backend
                          continue;
                      }
-                    auto * dev = ggml_backend_get_device(backend);
+                    auto * dev = ggml_backend_get_device(backend.get());
                      ggml_backend_dev_props props;
                      ggml_backend_dev_get_props(dev, &props);
                      if (!props.caps.async || !props.caps.events) {
@@ -19673,10 +19606,10 @@ struct llama_context * llama_new_context_with_model(
                  }
              }
  
-            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
+            ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
  
              if (pipeline_parallel) {
-                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
+                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
              }
  
              // initialize scheduler with the worst-case graph
@@ -19688,29 +19621,29 @@ struct llama_context * llama_new_context_with_model(
              ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
  
              // reserve pp graph first so that buffers are only allocated once
-            ggml_backend_sched_reserve(ctx->sched, gf_pp);
-            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched);
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_pp);
+            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get());
              int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
  
              // reserve with tg graph to get the number of splits and nodes
              llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
              ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
-            ggml_backend_sched_reserve(ctx->sched, gf_tg);
-            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched);
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
+            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());
              int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
  
              // reserve again with pp graph to avoid ggml-alloc reallocations during inference
              gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
-            if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
+            if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) {
                  LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                  llama_free(ctx);
                  return nullptr;
              }
  
-            for (size_t i = 0; i < ctx->backends.size(); i++) {
-                ggml_backend_t backend = ctx->backends[i];
+            for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+                ggml_backend_t backend = backend_ptrs[i];
                  ggml_backend_buffer_type_t buft = backend_buft[i];
-                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
+                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend);
                  if (size > 1) {
                      LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                              ggml_backend_buft_name(buft),
@@ -19990,7 +19923,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
      // create a context for each buffer type
      std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
      auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        if (ctx_map.count(buft) == 0) {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
              struct ggml_init_params params = {
                  /*.mem_size   =*/ model.hparams.n_layer*ggml_tensor_overhead(),
                  /*.mem_buffer =*/ NULL,
@@ -20001,12 +19935,12 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
                  return nullptr;
              }
              ctx_map[buft] = ctx;
-            cvec.ctxs.push_back(ctx);
+            cvec.ctxs.emplace_back(ctx);
+            return ctx;
          }
-        return ctx_map.at(buft);
+        return it->second;
      };
  
-
      // make tensors
      cvec.tensors.reserve(model.hparams.n_layer);
      cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
@@ -20037,7 +19971,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
              return false;
          }
          ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.push_back(buf);
+        cvec.bufs.emplace_back(buf);
      }
  
      return true;
@@ -21305,7 +21239,7 @@ int32_t llama_decode(
  }
  
  void llama_synchronize(struct llama_context * ctx) {
-    ggml_backend_sched_synchronize(ctx->sched);
+    ggml_backend_sched_synchronize(ctx->sched.get());
  
      // FIXME: if multiple single tokens are evaluated without a synchronization,
      // the stats will be added to the prompt evaluation stats
author	Diego Devesa <redacted>
	Fri, 1 Nov 2024 22:48:26 +0000 (23:48 +0100)
committer	GitHub <redacted>
	Fri, 1 Nov 2024 22:48:26 +0000 (23:48 +0100)
ggml/include/ggml-cpp.h	[new file with mode: 0644]	patch \| blob
ggml/src/CMakeLists.txt		patch \| blob \| history
spm-headers/ggml-cpp.h	[new symlink]	patch \| blob
src/llama.cpp		patch \| blob \| history