     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
     int64_t t_load_us  = 0;
     int64_t t_start_us = 0;
 
+    // total number of parameters in the model
+    uint64_t n_elements = 0;
+
+    // total size of all the tensors in the model in bytes
+    size_t   n_bytes    = 0;
+
     // keep track of loaded lora adapters
     std::set<struct llama_lora_adapter *> lora_adapters;
@@ ... @@
     int n_tensors = 0;
     int n_created = 0;
 
-    int64_t n_elements = 0;
-    size_t  n_bytes    = 0;
+    uint64_t n_elements = 0;
+    size_t   n_bytes    = 0;
 
     bool use_mmap = false;
     bool check_tensors;
@@ ... @@
 }
 
+static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+    model.n_elements = ml.n_elements;
+    model.n_bytes = ml.n_bytes;
+}
+
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
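
Aside: llm_load_stats only copies totals that the loader has already tallied while enumerating tensor metadata. A minimal sketch of that bookkeeping, assuming a helper invoked once per tensor; the helper and its call site are illustrative, not the loader's actual code, while ggml_nelements() and ggml_nbytes() are the real ggml helpers:

    #include <cstddef>
    #include <cstdint>

    #include "ggml.h"

    // Accumulate running totals for one tensor's metadata; called once
    // per tensor as the loader walks the model's tensor list.
    static void tally_tensor(const struct ggml_tensor * cur,
                             uint64_t & n_elements, size_t & n_bytes) {
        n_elements += ggml_nelements(cur); // element count of this tensor
        n_bytes    += ggml_nbytes(cur);    // size of this tensor in bytes
    }
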
@@ ... @@
             throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
         }
 
+        llm_load_stats(ml, model);
         llm_load_print_meta(ml, model);
 
         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ ... @@
     llama_model model;
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
+    llm_load_stats(ml, model);
 
     struct quantize_state_internal qs(model, params);
@@ ... @@
 }
 
 uint64_t llama_model_size(const struct llama_model * model) {
-    uint64_t size = 0;
-    for (const auto & it : model->tensors_by_name) {
-        size += ggml_nbytes(it.second);
-    }
-    return size;
+    return model->n_bytes;
 }
 
 uint64_t llama_model_n_params(const struct llama_model * model) {
-    uint64_t nparams = 0;
-    for (const auto & it : model->tensors_by_name) {
-        nparams += ggml_nelements(it.second);
-    }
-    return nparams;
+    return model->n_elements;
 }
 
 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
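
With the totals cached at load time, both accessors become O(1) reads instead of per-call loops over tensors_by_name. A minimal usage sketch against the public API ("model.gguf" is a placeholder path; error handling trimmed to the essentials):

    #include "llama.h"

    #include <cinttypes>
    #include <cstdio>

    int main() {
        llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        // Both calls now return the fields filled in once by llm_load_stats().
        fprintf(stderr, "n_params = %" PRIu64 ", size = %" PRIu64 " bytes\n",
                llama_model_n_params(model), llama_model_size(model));

        llama_free_model(model);
        return 0;
    }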