$ ./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" -t 4 --top_k 0 --top_p 0.95 --temp 0.2
main: seed = 1683881276
-gpt2_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin'
-gpt2_model_load: n_vocab = 49280
-gpt2_model_load: n_ctx = 2048
-gpt2_model_load: n_embd = 2048
-gpt2_model_load: n_head = 16
-gpt2_model_load: n_layer = 24
-gpt2_model_load: ftype = 3
-gpt2_model_load: ggml ctx size = 1794.90 MB
-gpt2_model_load: memory size = 768.00 MB, n_mem = 49152
-gpt2_model_load: model size = 1026.83 MB
+starcoder_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin'
+starcoder_model_load: n_vocab = 49280
+starcoder_model_load: n_ctx = 2048
+starcoder_model_load: n_embd = 2048
+starcoder_model_load: n_head = 16
+starcoder_model_load: n_layer = 24
+starcoder_model_load: ftype = 3
+starcoder_model_load: ggml ctx size = 1794.90 MB
+starcoder_model_load: memory size = 768.00 MB, n_mem = 49152
+starcoder_model_load: model size = 1026.83 MB
main: prompt: 'def fibonnaci('
main: number of tokens in prompt = 7, first 8 tokens: 563 24240 78 2658 64 2819 7
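The `ftype = 3` reported by the loader is ggml's model-file type code for q4_1. A hedged sketch of the mapping, with the values being an assumption based on how ggml's examples encoded file types at the time:

```cpp
// Hedged sketch: ggml model-file type codes (values are assumptions).
enum model_file_type {
    FTYPE_F32  = 0, // all tensors fp32
    FTYPE_F16  = 1, // mostly fp16 (the hparams default below)
    FTYPE_Q4_0 = 2, // 4-bit quantization, scale only
    FTYPE_Q4_1 = 3, // 4-bit quantization, scale + offset -- "ftype = 3" above
};
```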
| Model | Original size | Quantized size | Quantization type |
| --- | --- | --- | --- |
| `bigcode/gpt_bigcode-santacoder` | 5396.45 MB | 1026.83 MB | 4-bit integer (q4_1) |
| `bigcode/starcoder` | 71628.23 MB | 13596.23 MB | 4-bit integer (q4_1) |
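The roughly 5x shrink follows from the q4_1 layout. A back-of-the-envelope check, assuming each q4_1 block packs 32 weights into 16 bytes of nibbles plus a float scale and a float offset (24 bytes per block):

```cpp
#include <cstdio>

// Hedged sketch: predicted q4_1 size vs. fp32, assuming 24-byte blocks
// of 32 weights; tensors kept in full precision explain the remainder.
int main() {
    const double ratio = (16 + 2 * 4.0) / (32 * 4.0); // 24 / 128 = 0.1875
    printf("santacoder: %.2f MB predicted vs. 1026.83 MB reported\n",
           5396.45 * ratio); // ~1011.8 MB
    return 0;
}
```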
// default hparams (SantaCoder 1.1B)
// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json
-struct gpt2_hparams {
+struct starcoder_hparams {
int32_t n_vocab = 49280;
int32_t n_ctx = 2048;
int32_t n_embd = 2048;
int32_t ftype = 1;
};
-struct gpt2_layer {
+struct starcoder_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
struct ggml_tensor * c_mlp_proj_b;
};
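During loading, each layer's tensors are typically registered in a name-to-tensor map so the weights can be filled in as they stream from the file; the naming scheme is fixed by the conversion script. A sketch, with both the `tensors` map and the `model/h<i>/...` names being assumptions modeled on ggml's GPT-2 example:

```cpp
// Hedged sketch: register per-layer tensors by name (the map and the
// naming convention are assumptions, not confirmed against the script).
for (int i = 0; i < model.hparams.n_layer; ++i) {
    auto & layer = model.layers[i];
    const std::string h = "model/h" + std::to_string(i);
    model.tensors[h + "/ln_1/g"]       = layer.ln_1_g;
    model.tensors[h + "/ln_1/b"]       = layer.ln_1_b;
    model.tensors[h + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
}
```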
-struct gpt2_model {
- gpt2_hparams hparams;
+struct starcoder_model {
+ starcoder_hparams hparams;
// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * wpe; // position embedding
struct ggml_tensor * lm_head; // language model head
- std::vector<gpt2_layer> layers;
+ std::vector<starcoder_layer> layers;
// key + value memory
struct ggml_tensor * memory_k;
};
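The `memory_k`/`memory_v` tensors account for the `memory size = 768.00 MB, n_mem = 49152` line in the load log. A quick check, assuming the KV cache is stored in fp32 as in ggml's GPT-2 example:

```cpp
#include <cstdio>
#include <cstdint>

// Hedged sketch: KV-cache sizing for SantaCoder, assuming fp32 storage.
int main() {
    const int64_t n_layer = 24, n_ctx = 2048, n_embd = 2048;
    const int64_t n_mem      = n_layer * n_ctx; // 49152, as logged
    const int64_t n_elements = n_embd  * n_mem; // per tensor (k or v)

    // two tensors (k and v), 4 bytes per fp32 element -> 768.00 MB
    printf("memory size = %.2f MB, n_mem = %lld\n",
           2.0 * n_elements * 4 / (1024.0 * 1024.0), (long long) n_mem);
    return 0;
}
```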
// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
+bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
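After opening the stream, the loader checks a magic number and reads the hyperparameters in the order the conversion script wrote them. A minimal sketch of that header parsing, assuming consecutive little-endian `int32_t` fields; the full loader also reads `n_head` and `n_layer`, which the struct excerpt above elides:

```cpp
// Hedged sketch: verify the ggml magic, then read the hparams (layout
// assumed; only the fields shown in the excerpted struct are read here).
{
    uint32_t magic = 0;
    fin.read((char *) &magic, sizeof(magic));
    if (magic != 0x67676d6c) { // "ggml"
        fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n",
                __func__, fname.c_str());
        return false;
    }
}
{
    auto & hparams = model.hparams;
    fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
    fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
    fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
    fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
    printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
}
```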
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
-bool gpt2_eval(
- const gpt2_model & model,
+bool starcoder_eval(
+ const starcoder_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
int64_t t_load_us = 0;
gpt_vocab vocab;
- gpt2_model model;
+ starcoder_model model;
// load the model
{
const int64_t t_start_us = ggml_time_us();
- if (!gpt2_model_load(params.model, model, vocab)) {
+ if (!starcoder_model_load(params.model, model, vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
// determine the required inference memory per token:
size_t mem_per_token = 0;
- gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+ starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
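The warm-up call above evaluates a dummy batch purely so that `mem_per_token` gets measured; later calls can then grow the scratch buffer before each real batch. A sketch of that growth pattern, with the function name and the 10% headroom being assumptions modeled on the GPT-2 example:

```cpp
#include <cstdlib>

// Hedged sketch: grow the eval scratch buffer once mem_per_token is known.
static void * ensure_eval_buffer(void * buf, size_t & buf_size,
                                 size_t mem_per_token, size_t n_tokens) {
    if (mem_per_token > 0 && mem_per_token * n_tokens > buf_size) {
        buf_size = 1.1 * (mem_per_token * n_tokens); // ~10% headroom
        buf = realloc(buf, buf_size);
    }
    return buf;
}
```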
for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
- if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+ if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
printf("Failed to predict\n");
return 1;
}
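Once `starcoder_eval` returns, the next token is drawn from the logits using the `--top_k`, `--top_p`, and `--temp` flags from the command line above. A sketch of the sampling step, assuming the `gpt_sample_top_k_top_p` helper from ggml's shared example code and an `std::mt19937` rng seeded with `params.seed`:

```cpp
// Hedged sketch: sample the next token from the last row of logits.
{
    const int n_vocab = model.hparams.n_vocab;

    const gpt_vocab::id id = gpt_sample_top_k_top_p(
            vocab,
            logits.data() + (logits.size() - n_vocab), // last token's logits
            params.top_k, params.top_p, params.temp, rng);

    embd.push_back(id); // feed the sampled token back in for the next step
}
```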
#include <regex>
// default hparams (SantaCoder 1.1B)
-struct gpt2_hparams {
+struct starcoder_hparams {
int32_t n_vocab = 49280;
int32_t n_ctx = 2048;
int32_t n_embd = 2048;
};
// quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool starcoder_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
fout.write((char *) &magic, sizeof(magic));
}
- gpt2_hparams hparams;
+ starcoder_hparams hparams;
// load hparams
{
{
const int64_t t_start_us = ggml_time_us();
- if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+ if (!starcoder_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
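The timing block above sits inside a small driver; in the other ggml quantizers the arguments follow an input/output/type convention, and passing type 3 produces the q4_1 files listed in the table. A hedged sketch of such a driver, assuming that convention carries over:

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

// Hedged sketch: quantize driver, assuming the usual ggml example
// convention of "<model-f32.bin> <model-quant.bin> <type>" arguments.
int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        return 1;
    }

    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];
    const int ftype = atoi(argv[3]); // 3 -> q4_1, as in the table above

    if (!starcoder_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
        fprintf(stderr, "failed to quantize model from '%s'\n", fname_inp.c_str());
        return 1;
    }
    return 0;
}
```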