#include <string>
#include <vector>
+#include <stdexcept>
#ifdef __has_include
#if __has_include(<unistd.h>)
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
- throw format("failed to open %s: %s", fname, std::strerror(errno));
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET); // rewind so subsequent reads start at the beginning
}
- void read_raw(void * ptr, size_t size) {
- if (size == 0) {
+ void read_raw(void * ptr, size_t len) const {
+ if (len == 0) {
return;
}
errno = 0;
- std::size_t ret = std::fread(ptr, size, 1, fp);
+ std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
- throw format("read error: %s", strerror(errno));
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
- throw std::string("unexpectedly reached end of file");
+ throw std::runtime_error(std::string("unexpectedly reached end of file"));
}
}
return std::string(chars.data(), len);
}
- void write_raw(const void * ptr, size_t size) {
- if (size == 0) {
+ void write_raw(const void * ptr, size_t len) const {
+ if (len == 0) {
return;
}
errno = 0;
- size_t ret = std::fwrite(ptr, size, 1, fp);
+ size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
- throw format("write error: %s", strerror(errno));
+ throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
- llama_mmap(struct llama_file * file, bool prefetch = true) {
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
- throw format("mmap failed: %s", strerror(errno));
+ throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
- if (prefetch) {
+ if (prefetch > 0) {
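+ // prefetch is a byte count: 0 disables readahead, and (size_t) -1 (the default) covers the whole file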
// Advise the kernel to preload the mapped memory
- if (madvise(addr, file->size, MADV_WILLNEED)) {
+ if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
DWORD error = GetLastError();
if (hMapping == NULL) {
- throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+ throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
CloseHandle(hMapping);
if (addr == NULL) {
- throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+ throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
#else
static constexpr bool SUPPORTED = false;
- llama_mmap(struct llama_file *) {
- throw std::string("mmap not supported");
+ llama_mmap(struct llama_file *, bool prefetch = true) {
+ (void)prefetch;
+ throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
}
}
- void init(void * addr) {
- LLAMA_ASSERT(this->addr == NULL && this->size == 0);
- this->addr = addr;
+ void init(void * ptr) {
+ LLAMA_ASSERT(addr == NULL && size == 0);
+ addr = ptr;
}
void grow_to(size_t target_size) {
return (size_t) si.dwPageSize;
}
- bool raw_lock(void * addr, size_t size) {
+ bool raw_lock(void * ptr, size_t len) {
for (int tries = 1; ; tries++) {
- if (VirtualLock(addr, size)) {
+ if (VirtualLock(ptr, len)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
- size, this->size, llama_format_win_err(GetLastError()).c_str());
+ len, size, llama_format_win_err(GetLastError()).c_str());
return false;
}
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
- size_t increment = size + 1048576;
+ size_t increment = len + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
}
}
- void raw_unlock(void * addr, size_t size) {
- if (!VirtualUnlock(addr, size)) {
+ void raw_unlock(void * ptr, size_t len) {
+ if (!VirtualUnlock(ptr, len)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
#else
static constexpr bool SUPPORTED = false;
- void raw_lock(const void * addr, size_t size) {
+ size_t lock_granularity() {
+ return (size_t) 65536;
+ }
+
+ bool raw_lock(const void * addr, size_t len) {
fprintf(stderr, "warning: mlock not supported on this system\n");
+ return false;
}
- void raw_unlock(const void * addr, size_t size) {}
+ void raw_unlock(const void * addr, size_t len) {}
#endif
};
uint8_t * addr = NULL;
size_t size = 0;
- void resize(size_t size) {
+ llama_buffer() = default;
+
+ void resize(size_t len) {
delete[] addr;
- addr = new uint8_t[size];
- this->size = size;
+ addr = new uint8_t[len];
+ size = len;
}
~llama_buffer() {
delete[] addr;
}
+
+ // disable copy and move
+ llama_buffer(const llama_buffer&) = delete;
+ llama_buffer(llama_buffer&&) = delete;
+ llama_buffer& operator=(const llama_buffer&) = delete;
+ llama_buffer& operator=(llama_buffer&&) = delete;
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
+ bool is_cuda;
size_t size = 0;
+ llama_ctx_buffer() = default;
+
void resize(size_t size) {
+ free();
+
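+ // try pinned (page-locked) host memory first; it makes host-to-device copies faster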
+ addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
- ggml_cuda_host_free(addr);
+ is_cuda = true;
+ }
+ else {
+ // fall back to pageable memory
+ addr = new uint8_t[size];
+ is_cuda = false;
}
- addr = (uint8_t *) ggml_cuda_host_malloc(size);
this->size = size;
}
- ~llama_ctx_buffer() {
+ void free() {
if (addr) {
- ggml_cuda_host_free(addr);
+ if (is_cuda) {
+ ggml_cuda_host_free(addr);
+ }
+ else {
+ delete[] addr;
+ }
}
+ addr = NULL;
}
+
+ ~llama_ctx_buffer() {
+ free();
+ }
+
+ // disable copy and move
+ llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+ llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
+#include <cstddef>
#include <cstdint>
#include <cstdio>
#endif
MODEL_65B,
};
+
static const size_t MB = 1024*1024;
// computed for n_ctx == 2048
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
bool operator!=(const llama_hparams & other) const {
- return memcmp(this, &other, sizeof(llama_hparams));
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
}
};
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
LLAMA_FILE_VERSION_GGJT_V1, // added padding
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+ LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
};
struct llama_file_loader {
}
void read_magic() {
uint32_t magic = file.read_u32();
- uint32_t version = 0;
- if (magic != 'ggml') {
- version = file.read_u32();
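+ // the unversioned 'ggml' magic is not followed by a version field, so return before reading one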
+ if (magic == LLAMA_FILE_MAGIC_GGML) {
+ file_version = LLAMA_FILE_VERSION_GGML;
+ return;
}
- if (magic == 'ggml' && version == 0) {
- file_version = LLAMA_FILE_VERSION_GGML;
- } else if (magic == 'ggmf' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGMF_V1;
- } else if (magic == 'ggjt' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGJT_V1;
- } else if (magic == 'ggjt' && version == 2) {
- file_version = LLAMA_FILE_VERSION_GGJT_V2;
- } else {
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
- magic, version);
+ uint32_t version = file.read_u32();
+
+ switch (magic) {
+ case LLAMA_FILE_MAGIC_GGMF:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+ }
+ break;
+ case LLAMA_FILE_MAGIC_GGJT:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+ case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+ case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+ }
}
+
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+ magic, version);
}
void read_hparams() {
hparams.n_vocab = file.read_u32();
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
// skip to the next multiple of 32 bytes
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
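+ // note: (-tell()) & 31 is the number of padding bytes needed to reach the next 32-byte boundary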
}
shard.file_idx = file_idx;
shard.file_off = file.tell();
file.write_u32(new_type);
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
file.write_raw(tensor.name.data(), tensor.name.size());
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
file.write_raw(new_data, new_size);
}
}
}
- struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
auto it = tensors_map.name_to_idx.find(name);
if (it == tensors_map.name_to_idx.end()) {
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
}
- return get_tensor_for(lt);
+ return get_tensor_for(lt, backend);
}
- struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor;
if (lt.ne.size() == 2) {
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
}
ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+ tensor->backend = backend;
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;
return tensor;
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t data_size = 0;
+ size_t prefetch_size = 0;
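+ // only tensors that stay on the CPU are read through the mmap, so only they count toward the prefetch hint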
for (const llama_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ prefetch_size += lt.size;
+ }
}
if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
if (!lmlock) {
// Don't call the callback since the actual loading will be lazy
// and we can't measure it.
size_t done_size = 0;
for (llama_load_tensor & lt : tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+ continue;
+ }
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
lmlock->grow_to(done_size);
}
}
- if (progress_callback) {
- progress_callback(1.0f, progress_callback_user_data);
- }
}
void load_data_for(llama_load_tensor & lt) {
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.n_ctx =*/ 512,
- /*.n_parts =*/ -1,
/*.n_gpu_layers =*/ 0,
/*.seed =*/ -1,
- /*.f16_kv =*/ false,
+ /*.f16_kv =*/ true,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
return llama_mlock::SUPPORTED;
}
+void llama_init_backend() {
+ ggml_time_init();
+
+ // needed to initialize f16 tables
+ {
+ struct ggml_init_params params = { 0, NULL, false };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_free(ctx);
+ }
+}
+
+int64_t llama_time_us() {
+ return ggml_time_us();
+}
+
//
// model loading
//
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
- case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+ case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
}
return "unknown";
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
}
- if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ }
+ }
+
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+ if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
}
}
size_t ctx_size;
size_t mmapped_size;
ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
- // print memory requirements
- {
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
- // this is the total memory required to run the inference
- const size_t mem_required =
- ctx_size +
- mmapped_size +
- MEM_REQ_SCRATCH0().at(model.type) +
- MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
-
- // this is the memory required by one llama_state
- const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF().at(model.type);
-
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
- }
+ fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
// create the ggml context
{
}
}
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
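+// without cuBLAS there is nothing to offload to, so layers marked for offload simply stay on the CPU backend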
+#endif
+
// prepare memory for the weights
+ size_t vram_total = 0;
{
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
ml->ggml_ctx = ctx;
- model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+ // "output" tensor
+ {
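+ // the output matrix is offloaded only when n_gpu_layers exceeds the number of repeating layers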
+ ggml_backend backend_output;
+ if (n_gpu_layers > int(n_layer)) { // NOLINT
+ backend_output = LLAMA_BACKEND_OFFLOAD;
+ } else {
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
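+ // layers with index >= i_gpu_start go to the offload backend; earlier layers stay on the CPU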
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
auto & layer = model.layers[i];
std::string layers_i = "layers." + std::to_string(i);
- layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
- layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+ if (backend == GGML_BACKEND_CUDA) {
+ vram_total +=
+ ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
}
}
ml->done_getting_tensors();
- // populate `tensors_by_name`
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
- }
+ // print memory requirements
+ {
+ const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ // this is the total memory required to run the inference
+ const size_t mem_required =
+ ctx_size +
+ mmapped_size - vram_total + // weights in VRAM not in memory
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);
+
+ // this is the memory required by one llama_state
+ const size_t mem_required_state =
+ scale*MEM_REQ_KV_SELF().at(model.type);
+
+ fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+ mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
- model.mapping = std::move(ml->mapping);
#ifdef GGML_USE_CUBLAS
- {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ }
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+ (void) n_gpu_layers;
+#endif
+ }
- size_t vram_total = 0;
+ // populate `tensors_by_name`
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+ }
- for (int i = 0; i < n_gpu; ++i) {
- const auto & layer = model.layers[i];
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
- ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
- ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
- ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
- ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
- ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
- ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
- ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+#ifdef GGML_USE_CUBLAS
+ {
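+ // CPU tensors were already loaded by load_all_data above, so start the progress counter from their combined size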
+ size_t done_size = 0;
+ size_t data_size = 0;
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ done_size += lt.size;
+ }
}
- if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
- ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ continue;
+ }
+ if (progress_callback) {
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
+ }
+ ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+ done_size += lt.size;
}
+ }
+#endif // GGML_USE_CUBLAS
- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ if (progress_callback) {
+ progress_callback(1.0f, progress_callback_user_data);
}
-#else
- (void) n_gpu_layers;
-#endif
+
+ model.mapping = std::move(ml->mapping);
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
{
cur = ggml_rms_norm(ctx0, inpL);
- // cur = attention_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
- cur);
+ // cur = cur*attention_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
}
// self-attention
{
cur = ggml_rms_norm(ctx0, inpFF);
- // cur = ffn_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
- cur);
+ // cur = cur*ffn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
}
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
inpL = ggml_rms_norm(ctx0, inpL);
- // inpL = norm*inpL
- inpL = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.norm, inpL),
- inpL);
+ // inpL = inpL*norm(broadcasted)
+ inpL = ggml_mul(ctx0, inpL, model.norm);
embeddings = inpL;
}
unsigned * cur_percentage_p = (unsigned *) ctx;
unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) {
- ++*cur_percentage_p;
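+ // jump straight to the new percentage: one dot per progress update instead of one dot per percentage point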
+ *cur_percentage_p = percentage;
fprintf(stderr, ".");
fflush(stderr);
if (percentage >= 100) {
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
- if (magic != 'ggla') {
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
fprintf(stderr, "%s: bad file magic\n", __func__);
return 1;
}
// maybe this should be in llama_model_loader
if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
}
}
}
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
- base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt);
lt.ggml_tensor->data = lt.data;
}
// Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+ uint8_t * inp = src;
// set rng
{
# define LLAMA_API
#endif
-#define LLAMA_FILE_VERSION 2
-#define LLAMA_FILE_MAGIC 'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION 3
+#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
#ifdef __cplusplus
typedef int llama_token;
typedef struct llama_token_data {
- llama_token id; // token id
- float logit; // log-odds of the token
- float p; // probability of the token
+ llama_token id; // token id
+ float logit; // log-odds of the token
+ float p; // probability of the token
} llama_token_data;
typedef struct llama_token_data_array {
struct llama_context_params {
int n_ctx; // text context
- int n_parts; // -1 for default
int n_gpu_layers; // number of layers to store in VRAM
int seed; // RNG seed, -1 for random
// model file types
enum llama_ftype {
- LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_ALL_F32 = 0,
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
+ // TODO: not great API - very likely to change
+ // Initialize the llama + ggml backend
+ // Call once at the start of the program
+ LLAMA_API void llama_init_backend();
+
+ LLAMA_API int64_t llama_time_us();
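+ // A minimal usage sketch (assuming the existing llama_context_default_params,
+ // llama_init_from_file and llama_free entry points declared elsewhere in this header):
+ //
+ //   llama_init_backend();                         // once, at program start
+ //   struct llama_context_params params = llama_context_default_params();
+ //   struct llama_context * ctx = llama_init_from_file("model.bin", params);
+ //   ... use ctx ...
+ //   llama_free(ctx);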
+
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
// Set the state reading from the specified address
// Returns the number of bytes read
- LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
// Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);