#include "llama.h"
#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
#include <array>
#include <ctime>
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ static std::map<e_model, size_t> k_sizes = {
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
{ MODEL_65B, 1024ull * MB },
};
- return _MEM_REQ_SCRATCH0;
+ return k_sizes;
}
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ static std::map<e_model, size_t> k_sizes = {
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
{ MODEL_65B, 1024ull * MB },
};
- return _MEM_REQ_SCRATCH1;
+ return k_sizes;
}
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
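// for example, assuming these tables were sized for a 2048-token context:
// LLaMA 7B has n_embd = 4096 and n_layer = 32, so
//   2 * 4096 * 2048 * 32 * sizeof(float16) = 1024 MB,
// which lines up with the 1026 MB MODEL_7B entry below (1024 MB plus a small margin)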
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
{
- static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ static std::map<e_model, size_t> k_sizes = {
{ MODEL_7B, 1026ull * MB },
{ MODEL_13B, 1608ull * MB },
{ MODEL_30B, 3124ull * MB },
{ MODEL_65B, 5120ull * MB },
};
- return _MEM_REQ_KV_SELF;
+ return k_sizes;
}
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{
- static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ static std::map<e_model, size_t> k_sizes = {
{ MODEL_7B, 768ull * MB },
{ MODEL_13B, 1024ull * MB },
{ MODEL_30B, 1280ull * MB },
{ MODEL_65B, 1536ull * MB },
};
- return _MEM_REQ_EVAL;
+ return k_sizes;
}
// default hparams (LLaMA 7B)
LLAMA_FILE_VERSION_GGML,
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
LLAMA_FILE_VERSION_GGJT_V1, // added padding
+ LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
};
struct llama_file_loader {
file_version = LLAMA_FILE_VERSION_GGMF_V1;
} else if (magic == 'ggjt' && version == 1) {
file_version = LLAMA_FILE_VERSION_GGJT_V1;
+ } else if (magic == 'ggjt' && version == 2) {
+ file_version = LLAMA_FILE_VERSION_GGJT_V2;
} else {
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version);
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
write_vocab();
}
void write_magic() {
- file.write_u32('ggjt'); // magic
- file.write_u32(1); // version
+ file.write_u32(LLAMA_FILE_MAGIC); // magic
+ file.write_u32(LLAMA_FILE_VERSION); // version
}
void write_hparams(enum llama_ftype new_ftype) {
const llama_hparams & hparams = any_file_loader->hparams;
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
std::unique_ptr<llama_mmap> mapping;
llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+ auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
file_loaders.emplace_back(first_file);
uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
for (uint32_t i = 1; i < n_parts; i++) {
std::string fname = fname_base + "." + std::to_string(i);
- auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+ auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
file_loaders.emplace_back(ith_file);
if (ith_file->hparams != first_file->hparams) {
throw format("llama.cpp: hparams inconsistent between files");
}
}
- struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
auto it = tensors_map.name_to_idx.find(name);
if (it == tensors_map.name_to_idx.end()) {
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
LLAMA_ASSERT(lt.ne.size() == 1);
tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
}
+ ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;
return tensor;
}
- void done_getting_tensors() {
+ void done_getting_tensors() const {
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
throw std::string("llama.cpp: file contained more tensors than expected");
}
LLAMA_ASSERT(offset == lt.size);
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
// Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs;
- tmp_bufs.resize(lt.shards.size());
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
for (size_t i = 0; i < lt.shards.size(); i++) {
llama_load_tensor_shard & shard = lt.shards.at(i);
llama_file & file = file_loaders.at(shard.file_idx)->file;
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+ ggml_set_name(cache.k, "cache_k");
+ ggml_set_name(cache.v, "cache_v");
return true;
}
struct llama_context_params result = {
/*.n_ctx =*/ 512,
/*.n_parts =*/ -1,
- /*.seed =*/ 0,
+ /*.n_gpu_layers =*/ 0,
+ /*.seed =*/ -1,
/*.f16_kv =*/ false,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
switch (version) {
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
- case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
- default: LLAMA_ASSERT(false);
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
}
+
+ return "unknown";
}
static const char *llama_ftype_name(enum llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16";
- case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
}
+ if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ }
+ }
+
if (vocab_only) {
return;
}
auto & ctx = model.ctx;
- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
ml->calc_sizes(&ctx_size, &mmapped_size);
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
// prepare memory for the weights
{
- const auto & hparams = model.hparams;
-
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ }
+#else
+ (void) n_gpu_layers;
+#endif
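+
+    // Illustrative caller-side sketch for the new n_gpu_layers parameter
+    // (assumes the public llama.h API of this version; the model path is hypothetical):
+    //
+    //   llama_context_params params = llama_context_default_params();
+    //   params.n_gpu_layers = 20; // offload the first 20 layers via cuBLAS
+    //   llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);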
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::string & err) {
const int n_tokens,
const int n_past,
const int n_threads) {
+
+ // enforce that the first token is BOS
+ if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ fprintf(stderr, "%s: first token must be BOS\n", __func__);
+ return false;
+ }
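+
+    // callers normally satisfy this by tokenizing with add_bos = true; a sketch
+    // assuming a caller-side `ctx` and `prompt` and the usual llama.h helpers:
+    //
+    //   std::vector<llama_token> tokens(llama_n_ctx(ctx));
+    //   const int n = llama_tokenize(ctx, prompt, tokens.data(), (int) tokens.size(), /*add_bos*/ true);
+    //
+    // which places llama_token_bos() at tokens[0]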
+
const int64_t t_start_us = ggml_time_us();
const int N = n_tokens;
const auto & model = lctx.model;
const auto & hparams = model.hparams;
- auto & kv_self = model.kv_self;
+ const auto & kv_self = model.kv_self;
LLAMA_ASSERT(!!kv_self.ctx);
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
// self-attention
{
// compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ ggml_set_name(Qcur, "Qcur");
+ ggml_set_name(Kcur, "Kcur");
// store key and value to memory
{
ggml_permute(ctx0,
Qcur,
0, 2, 1, 3);
+ ggml_set_name(Q, "Q");
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
+ ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd/n_head)
- struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
// split cached V into n_head heads
struct ggml_tensor * V =
n_ctx*ggml_element_size(kv_self.v),
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ ggml_set_name(V, "V");
#if 1
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ ggml_set_name(KQV, "KQV");
#else
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ ggml_set_name(cur, "KQV_merged_contiguous");
// projection (no bias)
cur = ggml_mul_mat(ctx0,
lctx.use_buf(ctx0, -1);
// logits -> probs
- //inpL = ggml_soft_max(ctx0, inpL);
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
}
// extract embeddings
- if (lctx.embedding.size()) {
+ if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
embedding_out.resize(n_embd);
size_t n;
};
+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
struct llama_sp_bigram {
struct comparator {
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
sym.prev = index - 1;
sym.next = offs == text.size() ? -1 : index + 1;
index++;
- symbols_.emplace_back(std::move(sym));
+ symbols_.emplace_back(sym);
}
// seed the work queue with all possible 2-character tokens.
llama_tokenizer tokenizer(vocab);
std::vector<llama_vocab::id> output;
- if (text.size() == 0) {
+ if (text.empty()) {
return output;
}
if (bos) {
- output.push_back(1);
+ output.push_back(llama_token_bos());
}
tokenizer.tokenize(text, output);
}
}
-void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
if (last_tokens_size == 0 || penalty == 1.0f) {
return;
}
const int64_t t_start_sample_us = ggml_time_us();
for (size_t i = 0; i < candidates->size; ++i) {
- auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+ const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
if (token_iter == last_tokens + last_tokens_size) {
continue;
}
}
}
-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
return;
}
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
// Sample the next word X using top-k sampling
- llama_sample_top_k(nullptr, candidates, int(k));
+ llama_sample_top_k(nullptr, candidates, int(k), 1);
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
const int64_t t_start_sample_us = ggml_time_us();
// Find max element
- auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit < b.logit;
});
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
- case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
nthread = std::thread::hardware_concurrency();
}
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
/*vocab_only*/ false));
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
} else if (tensor.type == GGML_TYPE_F16) {
f32_conv_buf.resize(nelements * sizeof(float));
f32_data = (float *) f32_conv_buf.addr;
- auto f16_data = (const ggml_fp16_t *) tensor.data;
+ const auto * f16_data = (const ggml_fp16_t *) tensor.data;
for (size_t i = 0; i < nelements; i++) {
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
}
size_t first = counter; counter += chunk_size;
if (first >= nelements) {
if (!local_hist.empty()) {
- for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ for (int j=0; j<int(local_hist.size()); ++j) {
+ hist_cur[j] += local_hist[j];
+ }
new_size += local_size;
}
break;
}
lock.unlock();
size_t last = std::min(nelements, first + chunk_size);
- if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ if (local_hist.empty()) {
+ local_hist.resize(hist_cur.size(), 0);
+ }
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
};
- if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
- for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ if ((int) workers.size() < nthread_use - 1) {
+ workers.resize(nthread_use - 1);
+ }
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it] = std::thread(compute);
+ }
compute();
- for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it].join();
+ }
}
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
llama_context * ctx = new llama_context;
- if (params.seed <= 0) {
+ if (params.seed < 0) {
params.seed = time(NULL);
}
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
params.use_mmap, params.use_mlock, params.vocab_only,
params.progress_callback, params.progress_callback_user_data)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
model_loader->calc_sizes(&ctx_size, &mmapped_size);
base_buf.resize(ctx_size);
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
- std::string name(length, 0);
- fin.read(&name[0], length);
+ std::string name;
+ {
+ char buf[1024];
+ fin.read(buf, length);
+ name = std::string(buf, length);
+ }
// check for lora suffix and get the type of tensor
const std::string lora_suffix = ".lora";
base_name.erase(pos);
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
- if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+ if (model_tensors.find(base_name) == model_tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
return 1;
}
if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
}
ggml_tensor * r;
lora_tensors.clear();
n_tensors++;
- if (n_tensors % 4 == 0)
+ if (n_tensors % 4 == 0) {
fprintf(stderr, ".");
+ }
}
}
}
}
-int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
return ctx->model.kv_self.n;
}
-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
- if (seed <= 0) {
+ if (seed < 0) {
seed = time(NULL);
}
ctx->rng.seed(seed);
}
// Returns the *maximum* size of the state
-size_t llama_get_state_size(struct llama_context * ctx) {
+size_t llama_get_state_size(const struct llama_context * ctx) {
// we don't know the size of the rng until we actually serialize it, so reserve more than enough memory for its serialized state.
// for reference, std::mt19937(1337) serializes to 6701 bytes.
const size_t s_rng_size = sizeof(size_t);
}
// Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
- uint8_t * out = dest;
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ uint8_t * out = dst;
// copy rng
{
if (kv_size) {
const size_t elt_size = ggml_element_size(kv_self.k);
+
char buffer[4096];
+
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
ggml_graph_compute(cpy_ctx, &gf);
+
+ ggml_free(cpy_ctx);
}
}
- const size_t written = out - dest;
+ const size_t written = out - dst;
const size_t max_size = llama_get_state_size(ctx);
LLAMA_ASSERT(written <= max_size);
// Sets the state reading from the specified source address
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * in = src;
+ const uint8_t * inp = src;
// set rng
{
size_t rng_size;
char rng_buf[LLAMA_MAX_RNG_STATE];
- memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
- memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+ memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
std::stringstream rng_ss;
rng_ss.str(std::string(&rng_buf[0], rng_size));
size_t logits_cap;
size_t logits_size;
- memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
- memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+ memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+ memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
if (logits_size) {
ctx->logits.resize(logits_size);
- memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
}
- in += logits_cap * sizeof(float);
+ inp += logits_cap * sizeof(float);
}
// set embeddings
{
size_t embedding_size;
- memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+ memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
if (embedding_size) {
- memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
- in += embedding_size * sizeof(float);
+ memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+ inp += embedding_size * sizeof(float);
}
}
size_t kv_size;
int kv_ntok;
- memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
- memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
if (kv_size) {
LLAMA_ASSERT(kv_self.buf.size == kv_size);
const size_t elt_size = ggml_element_size(kv_self.k);
+
char buffer[4096];
+
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
- kin3d->data = (void *) in;
- in += ggml_nbytes(kin3d);
+ kin3d->data = (void *) inp;
+ inp += ggml_nbytes(kin3d);
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
- vin3d->data = (void *) in;
- in += ggml_nbytes(vin3d);
+ vin3d->data = (void *) inp;
+ inp += ggml_nbytes(vin3d);
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
n_embd, kv_ntok, n_layer,
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
ggml_graph_compute(cpy_ctx, &gf);
+ ggml_free(cpy_ctx);
}
ctx->model.kv_self.n = kv_ntok;
}
- const size_t nread = in - src;
+ const size_t nread = inp - src;
const size_t max_size = llama_get_state_size(ctx);
LLAMA_ASSERT(nread <= max_size);
return nread;
}
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(path_session, "rb");
+
+ // sanity checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ return false;
+ }
+
+ llama_hparams session_hparams;
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+ if (session_hparams != ctx->model.hparams) {
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ return false;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return false;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t n_state_size_cur = file.size - file.tell();
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ if (n_state_size_cur > n_state_size_max) {
+ fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ return false;
+ }
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ file.read_raw(state_data.data(), n_state_size_cur);
+
+ llama_set_state_data(ctx, state_data.data());
+ }
+
+ return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(path_session, "wb");
+
+ file.write_u32(LLAMA_SESSION_MAGIC);
+ file.write_u32(LLAMA_SESSION_VERSION);
+
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state
+ {
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+ file.write_raw(state_data.data(), n_state_size_cur);
+ }
+
+ return true;
+}
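+
+// Usage sketch for the two session functions above (the file name and token
+// buffer are illustrative; error handling omitted):
+//
+//   // after evaluating a prompt:
+//   llama_save_session_file(ctx, "session.bin", tokens.data(), tokens.size());
+//
+//   // later, to restore the prompt and KV cache:
+//   std::vector<llama_token> tokens(llama_n_ctx(ctx));
+//   size_t n_token_count = 0;
+//   if (llama_load_session_file(ctx, "session.bin", tokens.data(), tokens.size(), &n_token_count)) {
+//       // the first n_token_count entries of tokens hold the saved prompt
+//   }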
+
int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}
+
// get a more accurate load time, upon first eval
+ // TODO: fix this
if (!ctx->has_evaluated_once) {
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
ctx->has_evaluated_once = true;
}
+
return 0;
}
return res.size();
}
-int llama_n_vocab(struct llama_context * ctx) {
+int llama_n_vocab(const struct llama_context * ctx) {
return ctx->vocab.id_to_token.size();
}
-int llama_n_ctx(struct llama_context * ctx) {
+int llama_n_ctx(const struct llama_context * ctx) {
return ctx->model.hparams.n_ctx;
}
-int llama_n_embd(struct llama_context * ctx) {
+int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
return ctx->embedding.data();
}
-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
if (token >= llama_n_vocab(ctx)) {
return nullptr;
}
fprintf(stderr, "\n");
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
}
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
return ctx->model.tensors_by_name;
}
-
-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
- llama_file file(path_session, "rb");
-
- // sanity checks
- {
- const uint32_t magic = file.read_u32();
- const uint32_t version = file.read_u32();
-
- if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
- fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
- return false;
- }
-
- llama_hparams session_hparams;
- file.read_raw(&session_hparams, sizeof(llama_hparams));
-
- if (session_hparams != ctx->model.hparams) {
- fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
- return false;
- }
- }
-
- // load the prompt
- {
- const uint32_t n_token_count = file.read_u32();
-
- if (n_token_count > n_token_capacity) {
- fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
- return false;
- }
-
- file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
- *n_token_count_out = n_token_count;
- }
-
- // restore the context state
- {
- const size_t n_state_size_cur = file.size - file.tell();
- const size_t n_state_size_max = llama_get_state_size(ctx);
-
- if (n_state_size_cur > n_state_size_max) {
- fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
- return false;
- }
-
- std::vector<uint8_t> state_data(n_state_size_max);
- file.read_raw(state_data.data(), n_state_size_cur);
-
- llama_set_state_data(ctx, state_data.data());
- }
-
- return true;
-}
-
-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
- llama_file file(path_session, "wb");
-
- file.write_u32(LLAMA_SESSION_MAGIC);
- file.write_u32(LLAMA_SESSION_VERSION);
-
- file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
- // save the prompt
- file.write_u32((uint32_t) n_token_count);
- file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
- // save the context state
- {
- const size_t n_state_size_max = llama_get_state_size(ctx);
-
- std::vector<uint8_t> state_data(n_state_size_max);
- const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
- file.write_raw(state_data.data(), n_state_size_cur);
- }
-
- return true;
-}
\ No newline at end of file