-#include "ggml.h"
-#include "llama.h"
#include "build-info.h"
+#include "llama.h"
+
#include <cstdio>
#include <map>
#include <string>
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
//
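// example (illustrative file names; "q4_0" resolves to LLAMA_FTYPE_MOSTLY_Q4_0
// through LLAMA_FTYPE_MAP):
//
//  ./quantize models/llama/ggml-model-f16.bin models/llama/ggml-model-q4_0.bin q4_0 8
//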
int main(int argc, char ** argv) {
- ggml_time_init();
-
if (argc < 3) {
fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
    for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
        fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
    }
    return 1;
}
- // needed to initialize f16 tables
- {
- struct ggml_init_params params = { 0, NULL, false };
- struct ggml_context * ctx = ggml_init(params);
- ggml_free(ctx);
- }
+ llama_init_backend();
// parse command line arguments
const std::string fname_inp = argv[1];
}
fprintf(stderr, "\n");
- const int64_t t_main_start_us = ggml_time_us();
+ const int64_t t_main_start_us = llama_time_us();
int64_t t_quantize_us = 0;
// load the model
{
- const int64_t t_start_us = ggml_time_us();
+ const int64_t t_start_us = llama_time_us();
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
- t_quantize_us = ggml_time_us() - t_start_us;
+ t_quantize_us = llama_time_us() - t_start_us;
}
// report timing
{
- const int64_t t_main_end_us = ggml_time_us();
+ const int64_t t_main_end_us = llama_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
typedef int llama_token;
typedef struct llama_token_data {
- llama_token id; // token id
- float logit; // log-odds of the token
- float p; // probability of the token
+ llama_token id; // token id
+ float logit; // log-odds of the token
+ float p; // probability of the token
} llama_token_data;
typedef struct llama_token_data_array {
    llama_token_data * data;
    size_t size;
    bool sorted;
} llama_token_data_array;
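// Illustrative sketch (not part of this change): building a llama_token_data_array
// from raw logits so it can be handed to the sampling functions. Here n_vocab and
// logits are assumed to come from llama_n_vocab()/llama_get_logits() on an
// already-evaluated context.
//
//     std::vector<llama_token_data> candidates;
//     candidates.reserve(n_vocab);
//     for (llama_token id = 0; id < n_vocab; id++) {
//         candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
//     }
//     llama_token_data_array candidates_a = { candidates.data(), candidates.size(), false };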
// model file types
enum llama_ftype {
- LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_ALL_F32 = 0,
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
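// Illustrative sketch (not part of this change): the quantize example resolves
// the `type` argument by name via a map along these lines (an assumption about
// quantize.cpp, not something defined in this header):
//
//     static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
//         {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
//         {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
//         {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
//         {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
//         {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
//     };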
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
+ // TODO: not great API - very likely to change
+ // Initialize the llama + ggml backend
+ // Call once at the start of the program
+ LLAMA_API void llama_init_backend();
+
+ LLAMA_API int64_t llama_time_us();
+
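// Illustrative sketch (not part of this change): the intended call pattern for
// the two declarations above, mirroring the quantize example. Only
// llama_init_backend() and llama_time_us() are from this header; the rest is a
// hypothetical caller.
//
//     int main() {
//         llama_init_backend();                  // once, at program start
//         const int64_t t0 = llama_time_us();
//         // ... load, quantize or evaluate a model ...
//         fprintf(stderr, "elapsed: %.2f ms\n", (llama_time_us() - t0) / 1000.0);
//         return 0;
//     }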
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure