#include "common.h"
#include "llama.h"
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <string>
-#include <vector>
-
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
}
}
+ const int n_embd = llama_n_embd(ctx);
const auto embeddings = llama_get_embeddings(ctx);
- // TODO: print / use the embeddings
+ for (int i = 0; i < n_embd; i++) {
+ printf("%f ", embeddings[i]);
+ }
+ printf("\n");
}
llama_print_timings(ctx);
double repeat_penalty) {
auto & rng = lctx.rng;
- const auto & vocab = lctx.vocab;
- const auto & logits = lctx.logits;
+ const int n_logits = lctx.model.hparams.n_vocab;
- int n_logits = vocab.id_to_token.size();
+ const auto & logits = lctx.logits;
+ const auto * plogits = logits.data() + logits.size() - n_logits;
std::vector<std::pair<double, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then the repetition penalty has to be multiplied to reduce the previous token's probability
- if (logits[i] < 0.0) {
- logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
+ if (plogits[i] < 0.0) {
+ logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
- logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
+ logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
}
} else {
- logits_id.push_back(std::make_pair(logits[i]*scale, i));
+ logits_id.push_back(std::make_pair(plogits[i]*scale, i));
}
}
}
}
const auto & hparams = ctx->model.hparams;
+
+ // resized during inference
if (params.logits_all) {
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
} else {
}
if (params.embedding){
- ctx->embedding.reserve(hparams.n_embd);
+ ctx->embedding.resize(hparams.n_embd);
}
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
return ctx->model.hparams.n_ctx;
}
+int llama_n_embd(struct llama_context * ctx) { // accessor: reports the n_embd hyperparameter of the model held by ctx
+ return ctx->model.hparams.n_embd; // read straight from the model's hparams; ctx is dereferenced without a null check
+}
+
// Returns ctx->logits.data(), i.e. a pointer to the first element of the
// context's logits buffer. ctx is dereferenced without a null check.
// NOTE(review): how many floats are valid behind this pointer is not visible
// here. The sampling code in this file reads the last n_vocab entries of
// ctx->logits, and the init code reserves n_ctx*n_vocab entries when
// logits_all is set -- confirm the layout before indexing into the result.
float * llama_get_logits(struct llama_context * ctx) {
return ctx->logits.data();
}