llama : use _impl suffix instead of _internal (#11060)

author Georgi Gerganov <redacted>

Mon, 6 Jan 2025 08:52:01 +0000 (10:52 +0200)

committer GitHub <redacted>

Mon, 6 Jan 2025 08:52:01 +0000 (10:52 +0200)
author Georgi Gerganov <redacted>
Mon, 6 Jan 2025 08:52:01 +0000 (10:52 +0200)
committer GitHub <redacted>
Mon, 6 Jan 2025 08:52:01 +0000 (10:52 +0200)
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp

index 42974f8f13a362289a926936d887e63d2bc4e1c5..104f90343a402edde2e5afccfcc9ceaa2dbf2a54 100644 (file)
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -22,7 +22,7 @@ static void zeros(std::ofstream & file, size_t n) {
      }
  }
  
-struct quantize_state_internal {
+struct quantize_state_impl {
      const llama_model                 & model;
      const llama_model_quantize_params * params;
  
@@ -43,13 +43,13 @@ struct quantize_state_internal {
      // used to figure out if a model shares tok_embd with the output weight
      bool has_output = false;
  
-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
          : model(model)
          , params(params)
          {}
  };
  
-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
      struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
      const size_t nelements, const int nthread
  ) {
@@ -121,7 +121,7 @@ static void llama_tensor_dequantize_internal(
      workers.clear();
  }
  
-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
      const std::string name = ggml_get_name(tensor);
  
      // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -410,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
      return new_type;
  }
  
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
      if (nthread < 2) {
          // single-thread
          size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
      return new_size;
  }
  
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
      ggml_type default_type;
      llama_ftype ftype = params->ftype;
  
@@ -534,7 +534,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      llm_load_hparams(ml, model);
      llm_load_stats  (ml, model);
  
-    struct quantize_state_internal qs(model, params);
+    struct quantize_state_impl qs(model, params);
  
      if (params->only_copy) {
          ftype = model.ftype;
@@ -837,7 +837,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
              } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
              } else {
-                llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
                  f32_data = (float *) f32_conv_buf.data();
              }
  
@@ -866,7 +866,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                  void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                  const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
  
-                new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
              }
              LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
          }
@@ -919,7 +919,7 @@ uint32_t llama_model_quantize(
          const char * fname_out,
          const llama_model_quantize_params * params) {
      try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
+        llama_model_quantize_impl(fname_inp, fname_out, params);
      } catch (const std::exception & err) {
          LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
          return 1;
diff --git a/src/llama.cpp b/src/llama.cpp

index ea78ea487de588011147ccaa1e75f6a34d779e06..4a6798f416fe9029ff7af5f0f53c378eedbdc994 100644 (file)
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -10717,7 +10717,7 @@ static enum ggml_status llama_graph_compute(
  // return positive int on warning
  // return negative int on error
  //
-static int llama_decode_internal(
+static int llama_decode_impl(
           llama_context & lctx,
             llama_batch   inp_batch) {
  
@@ -11052,7 +11052,7 @@ static int llama_decode_internal(
  // return positive int on warning
  // return negative int on error
  //
-static int llama_encode_internal(
+static int llama_encode_impl(
           llama_context & lctx,
             llama_batch   inp_batch) {
  
@@ -11234,7 +11234,7 @@ static int llama_encode_internal(
  }
  
  // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
      auto & kv_self = lctx.kv_self;
  
      const auto & hparams = lctx.model.hparams;
@@ -11454,7 +11454,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
      //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
  }
  
-static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+static void llama_kv_cache_update_impl(struct llama_context & lctx) {
      bool need_reserve = false;
  
      if (lctx.kv_self.has_shift) {
@@ -11490,7 +11490,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
  
      // defragment the KV cache if needed
      if (lctx.kv_self.do_defrag) {
-        llama_kv_cache_defrag_internal(lctx);
+        llama_kv_cache_defrag_impl(lctx);
  
          need_reserve = true;
  
@@ -12191,7 +12191,7 @@ void llama_kv_cache_defrag(struct llama_context * ctx) {
  }
  
  void llama_kv_cache_update(struct llama_context * ctx) {
-    llama_kv_cache_update_internal(*ctx);
+    llama_kv_cache_update_impl(*ctx);
  }
  
  bool llama_kv_cache_can_shift(struct llama_context * ctx) {
@@ -12203,7 +12203,7 @@ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
  int32_t llama_encode(
          struct llama_context * ctx,
            struct llama_batch   batch) {
-    const int ret = llama_encode_internal(*ctx, batch);
+    const int ret = llama_encode_impl(*ctx, batch);
      if (ret != 0) {
          LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
      }
@@ -12214,7 +12214,7 @@ int32_t llama_encode(
  int32_t llama_decode(
          struct llama_context * ctx,
            struct llama_batch   batch) {
-    const int ret = llama_decode_internal(*ctx, batch);
+    const int ret = llama_decode_impl(*ctx, batch);
      if (ret != 0) {
          LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
      }
author	Georgi Gerganov <redacted>
	Mon, 6 Jan 2025 08:52:01 +0000 (10:52 +0200)
committer	GitHub <redacted>
	Mon, 6 Jan 2025 08:52:01 +0000 (10:52 +0200)
src/llama-quant.cpp		patch | blob | history
src/llama.cpp		patch | blob | history