llama : remove deprecated API (#5770)

author Georgi Gerganov <redacted>

Wed, 28 Feb 2024 16:43:38 +0000 (18:43 +0200)

committer GitHub <redacted>

Wed, 28 Feb 2024 16:43:38 +0000 (18:43 +0200)
author Georgi Gerganov <redacted>
Wed, 28 Feb 2024 16:43:38 +0000 (18:43 +0200)
committer GitHub <redacted>
Wed, 28 Feb 2024 16:43:38 +0000 (18:43 +0200)
diff --git a/llama.cpp b/llama.cpp

index 893bcdbc0147dd8f2313174ed2297e15798c2d6d..30d5eb32dab83561a7434831a0721ce7b3264ae4 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -7894,9 +7894,9 @@ static int llama_decode_internal(
      const auto n_batch = cparams.n_batch;
  
      GGML_ASSERT(n_tokens <= n_batch);
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  
      int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  
      const int64_t t_start_us = ggml_time_us();
  
@@ -10062,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
      }
  }
  
-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp(ctx, candidates_p, temp);
-}
-
  void llama_sample_repetition_penalties(
              struct llama_context * ctx,
            llama_token_data_array * candidates,
@@ -10192,38 +10188,6 @@ void llama_sample_apply_guidance(
      ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
  
-void llama_sample_classifier_free_guidance(
-          struct llama_context * ctx,
-        llama_token_data_array * candidates,
-          struct llama_context * guidance_ctx,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
      GGML_ASSERT(ctx);
  
@@ -11724,15 +11688,6 @@ bool llama_supports_gpu_offload(void) {
  #endif
  }
  
-// deprecated:
-bool llama_mmap_supported(void) {
-    return llama_supports_mmap();
-}
-
-bool llama_mlock_supported(void) {
-    return llama_supports_mlock();
-}
-
  void llama_backend_init(void) {
      ggml_time_init();
  
@@ -12244,15 +12199,6 @@ uint32_t llama_model_quantize(
      }
  }
  
-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
  int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
      try {
          return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12802,38 +12748,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
      return true;
  }
  
-int llama_eval(
-        struct llama_context * ctx,
-                 llama_token * tokens,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-int llama_eval_embd(
-            struct llama_context * ctx,
-                           float * embd,
-                         int32_t   n_tokens,
-                         int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
      ctx->cparams.n_threads       = n_threads;
      ctx->cparams.n_threads_batch = n_threads_batch;
diff --git a/llama.h b/llama.h

index 16e28e91deb549253e59390233313f8bff45bd50..a6823bb2bd5b9e70773fdc95fcc4d4b74a10f112 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -364,9 +364,6 @@ extern "C" {
      LLAMA_API bool llama_supports_mlock      (void);
      LLAMA_API bool llama_supports_gpu_offload(void);
  
-    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-
      LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
  
      LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
@@ -423,14 +420,6 @@ extern "C" {
      // The model needs to be reloaded before applying a new adapter, otherwise the adapter
      // will be applied on top of the previous one
      // Returns 0 on success
-    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-            struct llama_context * ctx,
-                      const char * path_lora,
-                           float   scale,
-                      const char * path_base_model,
-                         int32_t   n_threads),
-            "use llama_model_apply_lora_from_file instead");
-
      LLAMA_API int32_t llama_model_apply_lora_from_file(
              const struct llama_model * model,
                        const char * path_lora,
@@ -606,27 +595,6 @@ extern "C" {
      // Decoding
      //
  
-    // Run the llama inference to obtain the logits and probabilities for the next token(s).
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval(
-            struct llama_context * ctx,
-                     llama_token * tokens,
-                         int32_t   n_tokens,
-                         int32_t   n_past),
-            "use llama_decode() instead");
-
-    // Same as llama_eval, but use float matrix input directly.
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval_embd(
-            struct llama_context * ctx,
-                           float * embd,
-                         int32_t   n_tokens,
-                         int32_t   n_past),
-            "use llama_decode() instead");
-
      // Return batch for single sequence of tokens starting at pos_0
      //
      // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
@@ -800,13 +768,6 @@ extern "C" {
                               float * logits_guidance,
                               float   scale);
  
-    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
-              struct llama_context * ctx,
-            llama_token_data_array * candidates,
-              struct llama_context * guidance_ctx,
-                             float   scale),
-              "use llama_sample_apply_guidance() instead");
-
      /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
      LLAMA_API void llama_sample_softmax(
              struct llama_context * ctx,
@@ -860,12 +821,6 @@ extern "C" {
            llama_token_data_array * candidates,
                             float   temp);
  
-    LLAMA_API DEPRECATED(void llama_sample_temperature(
-                struct llama_context * ctx,
-              llama_token_data_array * candidates,
-                               float   temp),
-            "use llama_sample_temp instead");
-
      /// @details Apply constraints from grammar
      LLAMA_API void llama_sample_grammar(
              struct llama_context * ctx,
author	Georgi Gerganov <redacted>
	Wed, 28 Feb 2024 16:43:38 +0000 (18:43 +0200)
committer	GitHub <redacted>
	Wed, 28 Feb 2024 16:43:38 +0000 (18:43 +0200)
llama.cpp		patch \| blob \| history
llama.h		patch \| blob \| history