}
}
// Per-run state shared by the quantization helpers in this file.
// It stores a reference to the model and a pointer to the caller's quantize
// parameters; neither is copied or owned, so both must outlive this object.
-struct quantize_state_internal {
+struct quantize_state_impl {
// model this state refers to (held by reference, not owned)
const llama_model & model;
// quantization parameters supplied by the caller (held by pointer, not owned)
const llama_model_quantize_params * params;
// used to figure out if a model shares tok_embd with the output weight
bool has_output = false;
// Binds `model` and `params`; `has_output` keeps its in-class default of false.
- quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+ quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};
// Dequantization helper used by the model quantization path.
//   tensor    - source tensor
//   output    - buffer the caller reads back as float data after the call
//   workers   - caller-provided thread vector; cleared by the last visible statement
//   nelements - element count (presumably of `tensor` - TODO confirm against caller)
//   nthread   - thread count (presumably the number of workers to use - TODO confirm)
// NOTE(review): only the signature and the final statement are visible here;
// the conversion logic itself is not shown in this view.
-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
// remove every entry from the caller's thread vector
workers.clear();
}
// Returns the ggml type to use for `tensor`, given the caller's candidate
// `new_type`, the quantization state `qs` and the requested `ftype`.
// NOTE(review): in this view only the tensor-name lookup and the final
// `return new_type` are visible; any adjustment of `new_type` in between is
// not shown here - confirm against the full function.
-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
// copy of the tensor's name as a std::string
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
return new_type;
}
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
return new_size;
}
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
llm_load_hparams(ml, model);
llm_load_stats (ml, model);
- struct quantize_state_internal qs(model, params);
+ struct quantize_state_impl qs(model, params);
if (params->only_copy) {
ftype = model.ftype;
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
- llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+ llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
- new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+ new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
}
const char * fname_out,
const llama_model_quantize_params * params) {
try {
- llama_model_quantize_internal(fname_inp, fname_out, params);
+ llama_model_quantize_impl(fname_inp, fname_out, params);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
return 1;
// return positive int on warning
// return negative int on error
//
-static int llama_decode_internal(
+static int llama_decode_impl(
llama_context & lctx,
llama_batch inp_batch) {
// return positive int on warning
// return negative int on error
//
-static int llama_encode_internal(
+static int llama_encode_impl(
llama_context & lctx,
llama_batch inp_batch) {
}
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
// Works on the KV cache stored inside `lctx` (taken by non-const reference).
// The KV cache update routine calls this when `kv_self.do_defrag` is set.
-static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// mutable alias for the context's KV cache
auto & kv_self = lctx.kv_self;
// read-only alias for the hyperparameters of the context's model
const auto & hparams = lctx.model.hparams;
// disabled temporary timing log
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
}
-static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+static void llama_kv_cache_update_impl(struct llama_context & lctx) {
bool need_reserve = false;
if (lctx.kv_self.has_shift) {
// defragment the KV cache if needed
if (lctx.kv_self.do_defrag) {
- llama_kv_cache_defrag_internal(lctx);
+ llama_kv_cache_defrag_impl(lctx);
need_reserve = true;
}
// Non-static wrapper that forwards to the file-local KV cache update routine.
// `ctx` is dereferenced unconditionally, so callers must pass a non-null pointer.
void llama_kv_cache_update(struct llama_context * ctx) {
- llama_kv_cache_update_internal(*ctx);
+ llama_kv_cache_update_impl(*ctx);
}
bool llama_kv_cache_can_shift(struct llama_context * ctx) {
int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch) {
- const int ret = llama_encode_internal(*ctx, batch);
+ const int ret = llama_encode_impl(*ctx, batch);
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
}
int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
- const int ret = llama_decode_internal(*ctx, batch);
+ const int ret = llama_decode_impl(*ctx, batch);
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
}