/main
/quantize
/result
+/perplexity
arm_neon.h
compile_commands.json
# Build libraries
#
# NOTE(review): the old top-level "utils" helper library is deleted here; its
# sources were moved to examples/ and rebuilt there as the "common" library.
-add_library(utils OBJECT
- utils.cpp
- utils.h)
-
-target_include_directories(utils PUBLIC .)
-target_compile_features(utils PUBLIC cxx_std_11) # don't bump
-target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
-if (BUILD_SHARED_LIBS)
- set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
add_library(ggml OBJECT
ggml.c
ggml.h)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
# llama no longer links utils (removed above); ggml and extra libs remain.
-target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()
# The main/quantize executables are removed from the top level; they now live
# in examples/<name>/ subdirectories (see examples/CMakeLists.txt).
-#
-# Executables
-#
-
-add_executable(main main.cpp)
-target_link_libraries(main PRIVATE llama ggml utils)
-
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE llama ggml utils)
-
#
# programs, examples and tests
#
add_subdirectory(tests)
endif ()
# Examples are now built for real (previously this guard was commented out).
-#if (LLAMA_BUILD_EXAMPLES)
-# add_subdirectory(examples)
-#endif()
+if (LLAMA_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+endif()
$(info I CXX: $(CXXV))
$(info )
# perplexity joins the default build set.
-default: main quantize
+default: main quantize perplexity
#
# Build library
llama.o: llama.cpp llama.h
	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
# utils.o is replaced by common.o, built from the relocated examples/common.* sources.
-utils.o: utils.cpp utils.h
-	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
+common.o: examples/common.cpp examples/common.h
+	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
clean:
-	rm -f *.o main quantize
+	rm -vf *.o main quantize perplexity
# Example sources moved under examples/<name>/.
-main: main.cpp ggml.o llama.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
	@echo
	@echo '==== Run ./main -h for help. ===='
	@echo
# NOTE(review): quantize intentionally drops common.o — it no longer uses the CLI helpers.
-quantize: quantize.cpp ggml.o llama.o utils.o
-	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
#
# Tests
--- /dev/null
+# dependencies
+
+find_package(Threads REQUIRED)
+
+# third-party
+
+# ...
+
+# common
+
+# "common" is an OBJECT library holding the CLI-argument / tokenization helpers
+# shared by the example binaries (main, perplexity, embedding).
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+ common.h
+ common.cpp
+ )
+
+if (BUILD_SHARED_LIBS)
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
+
+# examples
+
+# NOTE(review): directory-scoped include_directories leaks to every target below;
+# the PUBLIC include on "common" already covers this — consider removing.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Native-only examples; nothing is built for Emscripten (wasm) targets.
+if (EMSCRIPTEN)
+else()
+ add_subdirectory(main)
+ add_subdirectory(quantize)
+ add_subdirectory(perplexity)
+ add_subdirectory(embedding)
+endif()
--- /dev/null
+#include "common.h"
+
+#include "ggml.h"
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+ #include <alloca.h>
+ #endif
+
+// Parse the command line into `params`.
+//
+// On Linux the default thread count is derived from /proc/cpuinfo (count of
+// "processor" words); elsewhere it falls back to hardware_concurrency, clamped
+// to at least 1. For -h/--help, an unknown flag, or a flag missing its required
+// value, prints usage and exits; otherwise returns true.
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            // fix: guard back() — std::string::back() on an empty string is UB,
+            // and the file may be empty or unreadable (ifstream open is unchecked).
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();  // strip a single trailing newline
+            }
+        } else if (arg == "-n" || arg == "--n_predict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_predict = std::stoi(argv[i]);
+        } else if (arg == "--top_k") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_k = std::stoi(argv[i]);
+        } else if (arg == "-c" || arg == "--ctx_size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--memory_f32") {
+            params.memory_f16 = false;
+        } else if (arg == "--top_p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_p = std::stof(argv[i]);
+        } else if (arg == "--temp") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.temp = std::stof(argv[i]);
+        } else if (arg == "--repeat_last_n") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_last_n = std::stoi(argv[i]);
+        } else if (arg == "--repeat_penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_penalty = std::stof(argv[i]);
+        } else if (arg == "-b" || arg == "--batch_size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
+            params.n_batch = std::min(512, params.n_batch);  // hard cap on batch size
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-i" || arg == "--interactive") {
+            params.interactive = true;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
+        } else if (arg == "--interactive-start") {
+            // legacy alias: only enables interactive mode (does not set interactive_start)
+            params.interactive = true;
+        } else if (arg == "--interactive-first") {
+            params.interactive_start = true;
+        } else if (arg == "-ins" || arg == "--instruct") {
+            params.instruct = true;
+        } else if (arg == "--color") {
+            params.use_color = true;
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
+        } else if (arg == "--mtest") {
+            params.mem_test = true;
+        } else if (arg == "--verbose_prompt" || arg == "--verbose-prompt") {
+            // fix: also accept the hyphenated spelling, which is what the help
+            // text (gpt_print_usage) advertises; underscore form kept for
+            // backward compatibility.
+            params.verbose_prompt = true;
+        } else if (arg == "-r" || arg == "--reverse-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.antiprompt.push_back(argv[i]);
+        } else if (arg == "--perplexity") {
+            params.perplexity = true;
+        } else if (arg == "--ignore-eos") {
+            params.ignore_eos = true;
+        } else if (arg == "--n_parts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_parts = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else if (arg == "--random-prompt") {
+            params.random_prompt = true;
+        } else if (arg == "--in-prefix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_prefix = argv[i];
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(1);
+    }
+
+    return true;
+}
+
+// Print the CLI help text to stderr. Defaults shown are read from `params`,
+// so this reflects whatever was parsed/overridden before the call.
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, " -h, --help show this help message and exit\n");
+    fprintf(stderr, " -i, --interactive run in interactive mode\n");
+    fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
+    fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+    fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
+    fprintf(stderr, " specified more than once for multiple prompts).\n");
+    fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
+    fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, " prompt to start generation with (default: empty)\n");
+    fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
+    fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
+    fprintf(stderr, " -f FNAME, --file FNAME\n");
+    fprintf(stderr, " prompt file to start generation.\n");
+    fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
+    fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
+    fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
+    fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
+    fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
+    fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
+    // --mlock is only advertised on platforms where ggml supports mlock
+    if (ggml_mlock_supported()) {
+        fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+    }
+    fprintf(stderr, " --mtest compute maximum memory usage\n");
+    // NOTE(review): help says "--verbose-prompt" but gpt_params_parse matches
+    // "--verbose_prompt" (underscore) — confirm the two agree.
+    fprintf(stderr, " --verbose-prompt print prompt before generation\n");
+    fprintf(stderr, " -m FNAME, --model FNAME\n");
+    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+
+// Return a short random prompt opener, chosen uniformly from a fixed set of
+// ten strings (the "To" default is unreachable in practice since r is in [0, 9]).
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    const int r = rng() % 10;
+    switch (r) {
+        case 0: return "So";
+        case 1: return "Once upon a time";
+        case 2: return "When";
+        case 3: return "The";
+        case 4: return "After";
+        case 5: return "If";
+        case 6: return "import";
+        case 7: return "He";
+        case 8: return "She";
+        case 9: return "They";
+        default: return "To";
+    }
+    // fix: removed unreachable trailing `return "The";` — every path through
+    // the switch above already returns.
+}
+
+// Convenience wrapper over the C-style llama_tokenize: tokenizes `text`
+// (optionally prepending BOS) and returns the tokens as a vector.
+// TODO: not great allocating this every time
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+    // initialize to prompt number of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int)add_bos);
+    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    // n < 0 signals the buffer was too small / tokenization failed
+    assert(n >= 0);
+    res.resize(n);
+
+    return res;
+}
--- /dev/null
+// Various helper functions and utilities
+
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <random>
+#include <thread>
+
+//
+// CLI argument parsing
+//
+
+// Shared CLI parameters for the example programs (main / perplexity / embedding).
+// Defaults here may be overridden by gpt_params_parse() and by each example's main().
+struct gpt_params {
+    int32_t seed = -1; // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict = 128; // new tokens to predict
+    int32_t repeat_last_n = 64; // last n tokens to penalize
+    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx = 512; // context size
+    int32_t n_batch = 8; // batch size for prompt processing
+
+    // sampling parameters
+    int32_t top_k = 40;
+    float top_p = 0.95f;
+    float temp = 0.80f;
+    float repeat_penalty = 1.10f;
+
+    // fix: default path said "lamma-7B" (typo) — corrected to "llama-7B"
+    std::string model = "models/llama-7B/ggml-model.bin"; // model path
+    std::string prompt = "";
+    std::string input_prefix = ""; // string to prefix user inputs with
+
+
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+
+    bool memory_f16 = true; // use f16 instead of f32 for memory kv
+    bool random_prompt = false; // do not randomize prompt if none provided
+    bool use_color = false; // use color to distinguish generations and inputs
+    bool interactive = false; // interactive mode
+
+    bool embedding = false; // get only sentence embedding
+    bool interactive_start = false; // wait for user input immediately
+
+    bool instruct = false; // instruction mode (used for Alpaca models)
+    bool ignore_eos = false; // do not stop generating after eos
+    bool perplexity = false; // compute perplexity over the prompt
+    bool use_mlock = false; // use mlock to keep model in memory
+    bool mem_test = false; // compute maximum memory usage
+    bool verbose_prompt = false; // print prompt tokens before generation
+};
+
+// Parse argv into `params`; prints usage and exits on -h or bad arguments.
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+// Print the CLI help text (defaults taken from `params`) to stderr.
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+// Return a random short prompt opener from a fixed set.
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//
+
+// Tokenize `text` (optionally prepending BOS) into a vector of tokens.
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
--- /dev/null
+# embedding example binary; links the shared "common" CLI/tokenizer helpers.
+set(TARGET embedding)
+add_executable(${TARGET} embedding.cpp)
+# fix: link the Threads::Threads imported target (find_package(Threads) runs in
+# the parent examples/CMakeLists.txt) instead of the legacy CMAKE_THREAD_LIBS_INIT
+# variable — the target carries -pthread compile flags too.
+target_link_libraries(${TARGET} PRIVATE common llama Threads::Threads)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- /dev/null
+# embedding
+
+TODO
--- /dev/null
+#include "common.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+
+// Embedding example: load a model, tokenize the prompt, run one eval pass with
+// `embedding` enabled, and fetch the sentence embedding (printing is still TODO).
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    // this tool always runs in embedding mode, regardless of CLI flags
+    params.embedding = true;
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
+    // seed <= 0 means "pick a random seed" (wall clock)
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_context * ctx;
+
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx = params.n_ctx;
+        lparams.n_parts = params.n_parts;
+        lparams.seed = params.seed;
+        lparams.f16_kv = params.memory_f16;
+        lparams.logits_all = params.perplexity;
+        lparams.use_mlock = params.use_mlock;
+        lparams.embedding = params.embedding;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    int n_past = 0;
+
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+
+    // determine newline token
+    // NOTE(review): llama_token_newline is never used in this tool — likely
+    // copied from main.cpp; consider removing.
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        }
+        fprintf(stderr, "\n");
+    }
+
+    if (params.embedding){
+        if (embd_inp.size() > 0) {
+            // single eval over the whole prompt; embeddings are produced as a side effect
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return 1;
+            }
+        }
+
+        const auto embeddings = llama_get_embeddings(ctx);
+
+        // TODO: print / use the embeddings
+    }
+
+    llama_print_timings(ctx);
+    llama_free(ctx);
+
+    return 0;
+}
--- /dev/null
+# main example binary (interactive text generation); links the shared "common" helpers.
+set(TARGET main)
+add_executable(${TARGET} main.cpp)
+# fix: use the Threads::Threads imported target (find_package(Threads) runs in
+# the parent examples/CMakeLists.txt) rather than the legacy
+# CMAKE_THREAD_LIBS_INIT variable.
+target_link_libraries(${TARGET} PRIVATE common llama Threads::Threads)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- /dev/null
+# main
+
+TODO
--- /dev/null
+#include "common.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#include <signal.h>
+#endif
+
+#if defined (_WIN32)
+#pragma comment(lib,"kernel32.lib")
+extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
+extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
+#endif
+
+#define ANSI_COLOR_RED "\x1b[31m"
+#define ANSI_COLOR_GREEN "\x1b[32m"
+#define ANSI_COLOR_YELLOW "\x1b[33m"
+#define ANSI_COLOR_BLUE "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN "\x1b[36m"
+#define ANSI_COLOR_RESET "\x1b[0m"
+#define ANSI_BOLD "\x1b[1m"
+
+/* Keep track of current color of output, and emit ANSI code if it changes. */
+enum console_state {
+    CONSOLE_STATE_DEFAULT=0,   // normal output (generated text)
+    CONSOLE_STATE_PROMPT,      // echoing the initial prompt
+    CONSOLE_STATE_USER_INPUT   // user is typing in interactive mode
+};
+
+// current console color state; global so the SIGINT handler can reset it
+static console_state con_st = CONSOLE_STATE_DEFAULT;
+// mirrors params.use_color; when false this function is a no-op
+static bool con_use_color = false;
+
+// Switch the terminal color to match `new_st`, emitting an ANSI escape only
+// when the state actually changes (and only when color is enabled).
+void set_console_state(console_state new_st)
+{
+    if (!con_use_color) return;
+    // only emit color code if state changed
+    if (new_st != con_st) {
+        con_st = new_st;
+        switch(con_st) {
+            case CONSOLE_STATE_DEFAULT:
+                printf(ANSI_COLOR_RESET);
+                return;
+            case CONSOLE_STATE_PROMPT:
+                printf(ANSI_COLOR_YELLOW);
+                return;
+            case CONSOLE_STATE_USER_INPUT:
+                printf(ANSI_BOLD ANSI_COLOR_GREEN);
+                return;
+        }
+    }
+}
+
+// set by the SIGINT handler and by the main loop to hand control to the user
+static bool is_interacting = false;
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+// First Ctrl+C interrupts generation and drops to interactive input;
+// a second Ctrl+C (while already interacting) exits with status 130.
+void sigint_handler(int signo) {
+    set_console_state(CONSOLE_STATE_DEFAULT);
+    printf("\n"); // this also force flush stdout.
+    if (signo == SIGINT) {
+        if (!is_interacting) {
+            is_interacting=true;
+        } else {
+            // 130 = conventional exit code for death by SIGINT
+            _exit(130);
+        }
+    }
+}
+#endif
+
+// Interactive/one-shot text generation driver: loads the model, feeds the
+// prompt, then alternates between sampling tokens and (in interactive mode)
+// reading user input until the token budget is spent or EOS is hit.
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    // perplexity computation moved to the dedicated 'perplexity' example
+    if (params.perplexity) {
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
+    // seed <= 0 means "pick a random seed" (wall clock)
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    // save choice to use color for later
+    // (note for later: this is a slightly awkward choice)
+    con_use_color = params.use_color;
+
+//    params.prompt = R"(// this function checks if the number n is prime
+//bool is_prime(int n) {)";
+
+    llama_context * ctx;
+
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx = params.n_ctx;
+        lparams.n_parts = params.n_parts;
+        lparams.seed = params.seed;
+        lparams.f16_kv = params.memory_f16;
+        lparams.use_mlock = params.use_mlock;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+    // uncomment the "used_mem" line in llama.cpp to see the results
+    if (params.mem_test) {
+        {
+            // worst case for prompt processing: one full batch
+            const std::vector<llama_token> tmp(params.n_batch, 0);
+            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        }
+
+        {
+            // worst case for single-token eval at the deepest position
+            const std::vector<llama_token> tmp = { 0, };
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+        }
+
+        llama_print_timings(ctx);
+        llama_free(ctx);
+
+        return 0;
+    }
+
+    int n_past = 0;
+
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+
+    const int n_ctx = llama_n_ctx(ctx);
+
+    // never predict past the context window
+    params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
+
+    // prefix & suffix for instruct mode
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+
+    // in instruct mode, we inject a prefix and a suffix to each input by the user
+    if (params.instruct) {
+        params.interactive = true;
+        params.antiprompt.push_back("### Instruction:\n\n");
+    }
+
+    // enable interactive mode if reverse prompt is specified
+    if (params.antiprompt.size() != 0) {
+        params.interactive = true;
+    }
+
+    if (params.interactive_start) {
+        params.interactive = true;
+    }
+
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        }
+        fprintf(stderr, "\n");
+    }
+
+    if (params.interactive) {
+        // install Ctrl+C handler so the user can interject mid-generation
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+        signal(SIGINT, sigint_handler);
+#endif
+
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+
+        if(params.antiprompt.size()) {
+            for (auto antiprompt : params.antiprompt) {
+                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
+            }
+        }
+
+        if (!params.input_prefix.empty()) {
+            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
+        }
+    }
+    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "\n\n");
+
+    // tokens queued for the next llama_eval call
+    std::vector<llama_token> embd;
+
+
+    // ring buffer of the last repeat_last_n tokens, used for the repeat penalty
+    int last_n_size = params.repeat_last_n;
+    std::vector<llama_token> last_n_tokens(last_n_size);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+
+    if (params.interactive) {
+        fprintf(stderr, "== Running in interactive mode. ==\n"
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+               " - Press Ctrl+C to interject at any time.\n"
+#endif
+               " - Press Return to return control to LLaMa.\n"
+               " - If you want to submit another line, end your input in '\\'.\n\n");
+        is_interacting = params.interactive_start || params.instruct;
+    }
+
+    // how many prompt/user tokens have been fed to eval so far
+    int input_consumed = 0;
+    bool input_noecho = false;
+
+    int remaining_tokens = params.n_predict;
+
+#if defined (_WIN32)
+    if (params.use_color) {
+        // Enable ANSI colors on Windows 10+
+        unsigned long dwMode = 0;
+        void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+        if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
+            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
+        }
+    }
+#endif
+    // the first thing we will do is to output the prompt, so set color accordingly
+    set_console_state(CONSOLE_STATE_PROMPT);
+
+    while (remaining_tokens > 0 || params.interactive) {
+        // predict
+        if (embd.size() > 0) {
+            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return 1;
+            }
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if ((int) embd_inp.size() <= input_consumed && !is_interacting) {
+            // out of user input, sample next token
+            // NOTE(review): top_k is int32_t in gpt_params but declared float here;
+            // it is implicitly converted back when passed to the sampler — confirm
+            // the sampler's parameter types.
+            const float top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp = params.temp;
+            const float repeat_penalty = params.repeat_penalty;
+
+            llama_token id = 0;
+
+            {
+                auto logits = llama_get_logits(ctx);
+
+                if (params.ignore_eos) {
+                    // zeroing the EOS logit makes it (effectively) never sampled
+                    logits[llama_token_eos()] = 0;
+                }
+
+                id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty);
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+            }
+
+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive && !params.instruct) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+
+            // add it to the context
+            embd.push_back(id);
+
+            // echo this to console
+            input_noecho = false;
+
+            // decrement remaining sampling budget
+            --remaining_tokens;
+        } else {
+            // some user input remains from prompt or interaction, forward it to processing
+            while ((int) embd_inp.size() > input_consumed) {
+                embd.push_back(embd_inp[input_consumed]);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(embd_inp[input_consumed]);
+                ++input_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+
+        // display text
+        if (!input_noecho) {
+            for (auto id : embd) {
+                printf("%s", llama_token_to_str(ctx, id));
+            }
+            fflush(stdout);
+        }
+        // reset color to default if we there is no pending user input
+        if (!input_noecho && (int)embd_inp.size() == input_consumed) {
+            set_console_state(CONSOLE_STATE_DEFAULT);
+        }
+
+        // in interactive mode, and not currently processing queued inputs;
+        // check if we should prompt the user for more
+        if (params.interactive && (int) embd_inp.size() <= input_consumed) {
+            // check for reverse prompt
+            std::string last_output;
+            for (auto id : last_n_tokens) {
+                last_output += llama_token_to_str(ctx, id);
+            }
+
+            // Check if each of the reverse prompts appears at the end of the output.
+            // NOTE(review): if antiprompt is longer than last_output, the unsigned
+            // subtraction below wraps to a huge offset (find then returns npos, so
+            // it "works", but confirm this is intentional).
+            for (std::string & antiprompt : params.antiprompt) {
+                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
+                    is_interacting = true;
+                    set_console_state(CONSOLE_STATE_USER_INPUT);
+                    fflush(stdout);
+                    break;
+                }
+            }
+
+            if (n_past > 0 && is_interacting) {
+                // potentially set color to indicate we are taking user input
+                set_console_state(CONSOLE_STATE_USER_INPUT);
+
+                if (params.instruct) {
+                    input_consumed = embd_inp.size();
+                    embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+
+                    printf("\n> ");
+                }
+
+                std::string buffer;
+                if (!params.input_prefix.empty()) {
+                    buffer += params.input_prefix;
+                    printf("%s", buffer.c_str());
+                }
+
+                // read lines until one does not end in '\' (line continuation)
+                std::string line;
+                bool another_line = true;
+                do {
+                    std::getline(std::cin, line);
+                    if (line.empty() || line.back() != '\\') {
+                        another_line = false;
+                    } else {
+                        line.pop_back(); // Remove the continue character
+                    }
+                    buffer += line + '\n'; // Append the line to the result
+                } while (another_line);
+
+                // done taking input, reset color
+                set_console_state(CONSOLE_STATE_DEFAULT);
+
+                auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+
+                if (params.instruct) {
+                    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                }
+
+                remaining_tokens -= line_inp.size();
+
+                input_noecho = true; // do not echo this again
+            }
+
+            if (n_past > 0) {
+                is_interacting = false;
+            }
+        }
+
+        // end of text token
+        // NOTE(review): embd.back() is UB if embd is empty — possible when all
+        // queued input was consumed this iteration; confirm embd is non-empty here.
+        if (embd.back() == llama_token_eos()) {
+            if (params.instruct) {
+                is_interacting = true;
+            } else {
+                fprintf(stderr, " [end of text]\n");
+                break;
+            }
+        }
+
+        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
+        if (params.interactive && remaining_tokens <= 0) {
+            remaining_tokens = params.n_predict;
+            is_interacting = true;
+        }
+    }
+
+#if defined (_WIN32)
+    signal(SIGINT, SIG_DFL);
+#endif
+
+    llama_print_timings(ctx);
+    llama_free(ctx);
+
+    set_console_state(CONSOLE_STATE_DEFAULT);
+
+    return 0;
+}
--- /dev/null
+# perplexity example binary; links the shared "common" helpers.
+set(TARGET perplexity)
+add_executable(${TARGET} perplexity.cpp)
+# fix: use the Threads::Threads imported target (find_package(Threads) runs in
+# the parent examples/CMakeLists.txt) rather than the legacy
+# CMAKE_THREAD_LIBS_INIT variable.
+target_link_libraries(${TARGET} PRIVATE common llama Threads::Threads)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- /dev/null
+# perplexity
+
+TODO
--- /dev/null
+#include "common.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+// Numerically-stable softmax: subtracts the max logit before exponentiating so
+// the exponentials cannot overflow. Returns a probability distribution over
+// `logits` (same size); an empty input yields an empty vector.
+// NOTE(review): std::max needs <algorithm> — confirm it reaches this TU
+// (currently only via transitive includes).
+std::vector<double> softmax(const std::vector<float>& logits) {
+    std::vector<double> probs(logits.size());
+    // fix: guard the logits[0] read below — it was UB for an empty input
+    if (logits.empty()) {
+        return probs;
+    }
+    float max_logit = logits[0];
+    for (float v : logits) max_logit = std::max(max_logit, v);
+    double sum_exp = 0.0;
+    for (size_t i = 0; i < logits.size(); i++) {
+        // Subtract the maximum logit value from the current logit value for numerical stability
+        float logit = logits[i] - max_logit;
+        double exp_logit = std::exp(logit);
+        sum_exp += exp_logit;
+        probs[i] = exp_logit;
+    }
+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    return probs;
+}
+
+// Compute and print a running perplexity estimate of params.prompt, evaluated
+// in n_ctx-sized chunks; only the second half of each window is scored so the
+// model always has context. Requires the context to have been created with
+// logits_all enabled (see main below setting params.perplexity).
+// NOTE(review): std::chrono is used here — confirm <chrono> reaches this TU
+// (it is not included directly).
+void perplexity(llama_context * ctx, const gpt_params & params) {
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+
+    int count = 0;
+    double nll = 0.0;
+    // integer division: any trailing partial chunk of the prompt is ignored
+    int seq_count = tokens.size() / params.n_ctx;
+
+    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+
+    for (int i = 0; i < seq_count; ++i) {
+        int start = i * params.n_ctx;
+        int end = start + params.n_ctx - 1;
+        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+        auto start_t = std::chrono::high_resolution_clock::now();
+        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+        auto end_t = std::chrono::high_resolution_clock::now();
+        if (i == 0) {
+            // one-time ETA estimate based on the first chunk's wall time
+            double seconds = std::chrono::duration<double>(end_t - start_t).count();
+            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
+        }
+        // We get the logits for all the tokens in the context window (params.n_ctx)
+        // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
+        // calculate the perplexity over the last half the window (so the model always has
+        // some context to predict the token).
+        //
+        // We rely on the fact that attention in the forward pass only looks at previous
+        // tokens here, so the logits returned for each token are an accurate representation
+        // of what the model would have predicted at that point.
+        //
+        // Example, we have a context window of 512, we will compute perplexity for each of the
+        // last 256 tokens. Then, we split the input up into context window size chunks to
+        // process the entire prompt.
+
+        auto logits = llama_get_logits(ctx);
+        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+            // Calculate probability of next token, given the previous ones.
+            int n_vocab = llama_n_vocab(ctx);
+            std::vector<float> tok_logits(
+                logits + j * n_vocab,
+                logits + (j + 1) * n_vocab);
+            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+ gpt_params params;
+ params.model = "models/llama-7B/ggml-model.bin";
+
+ if (gpt_params_parse(argc, argv, params) == false) {
+ return 1;
+ }
+
+ params.perplexity = true;
+
+ if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+ }
+
+ if (params.seed <= 0) {
+ params.seed = time(NULL);
+ }
+
+ fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ llama_context * ctx;
+
+ // load the model
+ {
+ auto lparams = llama_context_default_params();
+
+ lparams.n_ctx = params.n_ctx;
+ lparams.n_parts = params.n_parts;
+ lparams.seed = params.seed;
+ lparams.f16_kv = params.memory_f16;
+ lparams.logits_all = params.perplexity;
+ lparams.use_mlock = params.use_mlock;
+ lparams.embedding = params.embedding;
+
+ ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+ if (ctx == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return 1;
+ }
+ }
+
+ // print system information
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+ params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+ }
+
+ perplexity(ctx, params);
+
+ llama_print_timings(ctx);
+ llama_free(ctx);
+
+ return 0;
+}
--- /dev/null
+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- /dev/null
+# quantize
+
+TODO
--- /dev/null
+#include "ggml.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <string>
+
+const int QK = 32;
+
+// usage:
+// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//
+int main(int argc, char ** argv) {
+ ggml_time_init();
+
+ if (argc != 4) {
+ fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+ fprintf(stderr, " type = 2 - q4_0\n");
+ fprintf(stderr, " type = 3 - q4_1\n");
+ return 1;
+ }
+
+ // needed to initialize f16 tables
+ {
+ struct ggml_init_params params = { 0, NULL };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_free(ctx);
+ }
+
+ const std::string fname_inp = argv[1];
+ const std::string fname_out = argv[2];
+
+ const int itype = atoi(argv[3]);
+
+ const int64_t t_main_start_us = ggml_time_us();
+
+ int64_t t_quantize_us = 0;
+
+ // load the model
+ {
+ const int64_t t_start_us = ggml_time_us();
+
+ if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
+ fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+ return 1;
+ }
+
+ t_quantize_us = ggml_time_us() - t_start_us;
+ }
+
+ // report timing
+ {
+ const int64_t t_main_end_us = ggml_time_us();
+
+ printf("\n");
+ printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+ }
+
+ return 0;
+}
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- const int ne00 = src0->ne[0];
- const int ne01 = src0->ne[1];
+ //const int ne00 = src0->ne[0];
+ //const int ne01 = src0->ne[1];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
- const int ne12 = src1->ne[2];
- const int ne13 = src1->ne[3];
+ //const int ne12 = src1->ne[2];
+ //const int ne13 = src1->ne[3];
- const int ne0 = dst->ne[0];
- const int ne1 = dst->ne[1];
- const int ne2 = dst->ne[2];
- const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne0 = dst->ne[0];
+ //const int ne1 = dst->ne[1];
+ //const int ne2 = dst->ne[2];
+ //const int ne3 = dst->ne[3];
+ //const int ne = ne0*ne1*ne2*ne3;
- const int nb00 = src0->nb[0];
+ //const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
const int nb02 = src0->nb[2];
const int nb03 = src0->nb[3];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne = ne0*ne1*ne2*ne3;
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne = ne0*ne1*ne2*ne3;
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne = ne0*ne1*ne2*ne3;
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
+++ /dev/null
-#include "utils.h"
-#include "ggml.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#include <signal.h>
-#endif
-
-#if defined (_WIN32)
-#pragma comment(lib,"kernel32.lib")
-extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
-extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
-#endif
-
-#define ANSI_COLOR_RED "\x1b[31m"
-#define ANSI_COLOR_GREEN "\x1b[32m"
-#define ANSI_COLOR_YELLOW "\x1b[33m"
-#define ANSI_COLOR_BLUE "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN "\x1b[36m"
-#define ANSI_COLOR_RESET "\x1b[0m"
-#define ANSI_BOLD "\x1b[1m"
-
-/* Keep track of current color of output, and emit ANSI code if it changes. */
-enum console_state {
- CONSOLE_STATE_DEFAULT=0,
- CONSOLE_STATE_PROMPT,
- CONSOLE_STATE_USER_INPUT
-};
-
-static console_state con_st = CONSOLE_STATE_DEFAULT;
-static bool con_use_color = false;
-
-void set_console_state(console_state new_st)
-{
- if (!con_use_color) return;
- // only emit color code if state changed
- if (new_st != con_st) {
- con_st = new_st;
- switch(con_st) {
- case CONSOLE_STATE_DEFAULT:
- printf(ANSI_COLOR_RESET);
- return;
- case CONSOLE_STATE_PROMPT:
- printf(ANSI_COLOR_YELLOW);
- return;
- case CONSOLE_STATE_USER_INPUT:
- printf(ANSI_BOLD ANSI_COLOR_GREEN);
- return;
- }
- }
-}
-
-std::vector<double> softmax(const std::vector<float>& logits) {
- std::vector<double> probs(logits.size());
- float max_logit = logits[0];
- for (float v : logits) max_logit = std::max(max_logit, v);
- double sum_exp = 0.0;
- for (size_t i = 0; i < logits.size(); i++) {
- // Subtract the maximum logit value from the current logit value for numerical stability
- float logit = logits[i] - max_logit;
- double exp_logit = std::exp(logit);
- sum_exp += exp_logit;
- probs[i] = exp_logit;
- }
- for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
- return probs;
-}
-
-void perplexity(llama_context * ctx, const gpt_params & params) {
- // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
- // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
- // Output: `perplexity: 13.5106 [114/114]`
- auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-
- int count = 0;
- double nll = 0.0;
- int seq_count = tokens.size() / params.n_ctx;
-
- fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
-
- for (int i = 0; i < seq_count; ++i) {
- int start = i * params.n_ctx;
- int end = start + params.n_ctx - 1;
- std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
- auto start_t = std::chrono::high_resolution_clock::now();
- if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return;
- }
- auto end_t = std::chrono::high_resolution_clock::now();
- if (i == 0) {
- double seconds = std::chrono::duration<double>(end_t - start_t).count();
- printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
- }
- // We get the logits for all the tokens in the context window (params.n_ctx)
- // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
- // calculate the perplexity over the last half the window (so the model always has
- // some context to predict the token).
- //
- // We rely on the fact that attention in the forward pass only looks at previous
- // tokens here, so the logits returned for each token are an accurate representation
- // of what the model would have predicted at that point.
- //
- // Example, we have a context window of 512, we will compute perplexity for each of the
- // last 256 tokens. Then, we split the input up into context window size chunks to
- // process the entire prompt.
-
- auto logits = llama_get_logits(ctx);
- for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
- // Calculate probability of next token, given the previous ones.
- int n_vocab = llama_n_vocab(ctx);
- std::vector<float> tok_logits(
- logits + j * n_vocab,
- logits + (j + 1) * n_vocab);
- double prob = softmax(tok_logits)[tokens[start + j + 1]];
- nll += -std::log(prob);
- ++count;
- }
- // perplexity is e^(average negative log-likelihood)
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
- fflush(stdout);
- }
- printf("\n");
-}
-
-static bool is_interacting = false;
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-void sigint_handler(int signo) {
- set_console_state(CONSOLE_STATE_DEFAULT);
- printf("\n"); // this also force flush stdout.
- if (signo == SIGINT) {
- if (!is_interacting) {
- is_interacting=true;
- } else {
- _exit(130);
- }
- }
-}
-#endif
-
-int main(int argc, char ** argv) {
- // has to be called once at the start of the program to init ggml stuff
- ggml_time_init();
-
- gpt_params params;
- params.model = "models/llama-7B/ggml-model.bin";
-
- if (gpt_params_parse(argc, argv, params) == false) {
- return 1;
- }
-
- if (params.n_ctx > 2048) {
- fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
- "expect poor results\n", __func__, params.n_ctx);
- }
-
- if (params.seed <= 0) {
- params.seed = time(NULL);
- }
-
- fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
-
- // save choice to use color for later
- // (note for later: this is a slightly awkward choice)
- con_use_color = params.use_color;
-
-// params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
-
- llama_context * ctx;
-
- // load the model
- {
- auto lparams = llama_context_default_params();
-
- lparams.n_ctx = params.n_ctx;
- lparams.n_parts = params.n_parts;
- lparams.seed = params.seed;
- lparams.f16_kv = params.memory_f16;
- lparams.logits_all = params.perplexity;
- lparams.use_mlock = params.use_mlock;
- lparams.embedding = params.embedding;
-
- ctx = llama_init_from_file(params.model.c_str(), lparams);
-
- if (ctx == NULL) {
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return 1;
- }
- }
-
- // print system information
- {
- fprintf(stderr, "\n");
- fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
- params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
- }
-
- // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
- // uncomment the "used_mem" line in llama.cpp to see the results
- if (params.mem_test) {
- {
- const std::vector<llama_token> tmp(params.n_batch, 0);
- llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
- }
-
- {
- const std::vector<llama_token> tmp = { 0, };
- llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
- }
-
- llama_print_timings(ctx);
- llama_free(ctx);
-
- return 0;
- }
-
- if (params.perplexity) {
- perplexity(ctx, params);
- exit(0);
- }
-
- int n_past = 0;
-
- // Add a space in front of the first character to match OG llama tokenizer behavior
- params.prompt.insert(0, 1, ' ');
-
- // tokenize the prompt
- auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-
- const int n_ctx = llama_n_ctx(ctx);
-
- params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
-
- // prefix & suffix for instruct mode
- const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
- const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
-
- // in instruct mode, we inject a prefix and a suffix to each input by the user
- if (params.instruct) {
- params.interactive = true;
- params.antiprompt.push_back("### Instruction:\n\n");
- }
-
- // enable interactive mode if reverse prompt is specified
- if (params.antiprompt.size() != 0) {
- params.interactive = true;
- }
-
- if (params.interactive_start) {
- params.interactive = true;
- }
-
- // determine newline token
- auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
- if (params.verbose_prompt) {
- fprintf(stderr, "\n");
- fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
- for (int i = 0; i < (int) embd_inp.size(); i++) {
- fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
- }
- fprintf(stderr, "\n");
- }
-
- if (params.interactive) {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
- struct sigaction sigint_action;
- sigint_action.sa_handler = sigint_handler;
- sigemptyset (&sigint_action.sa_mask);
- sigint_action.sa_flags = 0;
- sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
- signal(SIGINT, sigint_handler);
-#endif
-
- fprintf(stderr, "%s: interactive mode on.\n", __func__);
-
- if(params.antiprompt.size()) {
- for (auto antiprompt : params.antiprompt) {
- fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
- }
- }
-
- if (!params.input_prefix.empty()) {
- fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
- }
- }
- fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
- fprintf(stderr, "\n\n");
-
- std::vector<llama_token> embd;
-
-
- int last_n_size = params.repeat_last_n;
- std::vector<llama_token> last_n_tokens(last_n_size);
- std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
- if (params.interactive) {
- fprintf(stderr, "== Running in interactive mode. ==\n"
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- " - Press Ctrl+C to interject at any time.\n"
-#endif
- " - Press Return to return control to LLaMa.\n"
- " - If you want to submit another line, end your input in '\\'.\n\n");
- is_interacting = params.interactive_start || params.instruct;
- }
-
- int input_consumed = 0;
- bool input_noecho = false;
-
- int remaining_tokens = params.n_predict;
-
-#if defined (_WIN32)
- if (params.use_color) {
- // Enable ANSI colors on Windows 10+
- unsigned long dwMode = 0;
- void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
- if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
- SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
- }
- }
-#endif
- // the first thing we will do is to output the prompt, so set color accordingly
- set_console_state(CONSOLE_STATE_PROMPT);
-
- if (params.embedding){
- embd = embd_inp;
-
- if (embd.size() > 0) {
- if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return 1;
- }
- }
-
- const auto embeddings = llama_get_embeddings(ctx);
-
- // TODO: print / use the embeddings
-
- if (params.use_color) {
- printf(ANSI_COLOR_RESET);
- }
-
- return 0;
- }
-
- while (remaining_tokens > 0 || params.interactive) {
- // predict
- if (embd.size() > 0) {
- if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return 1;
- }
- }
-
- n_past += embd.size();
- embd.clear();
-
- if ((int) embd_inp.size() <= input_consumed && !is_interacting) {
- // out of user input, sample next token
- const float top_k = params.top_k;
- const float top_p = params.top_p;
- const float temp = params.temp;
- const float repeat_penalty = params.repeat_penalty;
-
- llama_token id = 0;
-
- {
- auto logits = llama_get_logits(ctx);
-
- if (params.ignore_eos) {
- // set the logit of the eos token to zero to avoid sampling it
- //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
- // TODO: this does not work of params.logits_all == true
- assert(params.perplexity == false);
- logits[llama_token_eos()] = 0;
- }
-
- id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty);
-
- last_n_tokens.erase(last_n_tokens.begin());
- last_n_tokens.push_back(id);
- }
-
- // replace end of text token with newline token when in interactive mode
- if (id == llama_token_eos() && params.interactive && !params.instruct) {
- id = llama_token_newline.front();
- if (params.antiprompt.size() != 0) {
- // tokenize and inject first reverse prompt
- const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
- embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
- }
- }
-
- // add it to the context
- embd.push_back(id);
-
- // echo this to console
- input_noecho = false;
-
- // decrement remaining sampling budget
- --remaining_tokens;
- } else {
- // some user input remains from prompt or interaction, forward it to processing
- while ((int) embd_inp.size() > input_consumed) {
- embd.push_back(embd_inp[input_consumed]);
- last_n_tokens.erase(last_n_tokens.begin());
- last_n_tokens.push_back(embd_inp[input_consumed]);
- ++input_consumed;
- if ((int) embd.size() >= params.n_batch) {
- break;
- }
- }
- }
-
- // display text
- if (!input_noecho) {
- for (auto id : embd) {
- printf("%s", llama_token_to_str(ctx, id));
- }
- fflush(stdout);
- }
- // reset color to default if we there is no pending user input
- if (!input_noecho && (int)embd_inp.size() == input_consumed) {
- set_console_state(CONSOLE_STATE_DEFAULT);
- }
-
- // in interactive mode, and not currently processing queued inputs;
- // check if we should prompt the user for more
- if (params.interactive && (int) embd_inp.size() <= input_consumed) {
- // check for reverse prompt
- std::string last_output;
- for (auto id : last_n_tokens) {
- last_output += llama_token_to_str(ctx, id);
- }
-
- // Check if each of the reverse prompts appears at the end of the output.
- for (std::string & antiprompt : params.antiprompt) {
- if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
- is_interacting = true;
- set_console_state(CONSOLE_STATE_USER_INPUT);
- fflush(stdout);
- break;
- }
- }
-
- if (n_past > 0 && is_interacting) {
- // potentially set color to indicate we are taking user input
- set_console_state(CONSOLE_STATE_USER_INPUT);
-
- if (params.instruct) {
- input_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
-
- printf("\n> ");
- }
-
- std::string buffer;
- if (!params.input_prefix.empty()) {
- buffer += params.input_prefix;
- printf("%s", buffer.c_str());
- }
-
- std::string line;
- bool another_line = true;
- do {
- std::getline(std::cin, line);
- if (line.empty() || line.back() != '\\') {
- another_line = false;
- } else {
- line.pop_back(); // Remove the continue character
- }
- buffer += line + '\n'; // Append the line to the result
- } while (another_line);
-
- // done taking input, reset color
- set_console_state(CONSOLE_STATE_DEFAULT);
-
- auto line_inp = ::llama_tokenize(ctx, buffer, false);
- embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-
- if (params.instruct) {
- embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
- }
-
- remaining_tokens -= line_inp.size();
-
- input_noecho = true; // do not echo this again
- }
-
- if (n_past > 0) {
- is_interacting = false;
- }
- }
-
- // end of text token
- if (embd.back() == llama_token_eos()) {
- if (params.instruct) {
- is_interacting = true;
- } else {
- fprintf(stderr, " [end of text]\n");
- break;
- }
- }
-
- // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
- if (params.interactive && remaining_tokens <= 0) {
- remaining_tokens = params.n_predict;
- is_interacting = true;
- }
- }
-
-#if defined (_WIN32)
- signal(SIGINT, SIG_DFL);
-#endif
-
- llama_print_timings(ctx);
- llama_free(ctx);
-
- set_console_state(CONSOLE_STATE_DEFAULT);
-
- return 0;
-}
+++ /dev/null
-#include "ggml.h"
-#include "llama.h"
-
-#include <cstdio>
-#include <string>
-
-const int QK = 32;
-
-// usage:
-// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
- ggml_time_init();
-
- if (argc != 4) {
- fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
- fprintf(stderr, " type = 2 - q4_0\n");
- fprintf(stderr, " type = 3 - q4_1\n");
- return 1;
- }
-
- // needed to initialize f16 tables
- {
- struct ggml_init_params params = { 0, NULL };
- struct ggml_context * ctx = ggml_init(params);
- ggml_free(ctx);
- }
-
- const std::string fname_inp = argv[1];
- const std::string fname_out = argv[2];
-
- const int itype = atoi(argv[3]);
-
- const int64_t t_main_start_us = ggml_time_us();
-
- int64_t t_quantize_us = 0;
-
- // load the model
- {
- const int64_t t_start_us = ggml_time_us();
-
- if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
- fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
- return 1;
- }
-
- t_quantize_us = ggml_time_us() - t_start_us;
- }
-
- // report timing
- {
- const int64_t t_main_end_us = ggml_time_us();
-
- printf("\n");
- printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
- printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
- }
-
- return 0;
-}
function(llama_add_test source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
- target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+ target_link_libraries(${TEST_TARGET} PRIVATE llama)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
-#include "utils.h"
#include "llama.h"
#include <cstdio>
#include <string>
#include <map>
+#include <vector>
static const std::map<std::string, std::vector<llama_token>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
}
for (const auto & test_kv : k_tests) {
- const auto res = ::llama_tokenize(ctx, test_kv.first, true);
+ std::vector<llama_token> res(test_kv.first.size());
+ const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+ res.resize(n);
bool correct = res.size() == test_kv.second.size();
+++ /dev/null
-#include "ggml.h"
-
-#include "utils.h"
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <string>
-#include <iterator>
-#include <algorithm>
-
- #if defined(_MSC_VER) || defined(__MINGW32__)
- #include <malloc.h> // using malloc.h with MSC/MINGW
- #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
- #include <alloca.h>
- #endif
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
- // determine sensible default number of threads.
- // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
-#ifdef __linux__
- std::ifstream cpuinfo("/proc/cpuinfo");
- params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
- std::istream_iterator<std::string>(),
- std::string("processor"));
-#endif
- if (params.n_threads == 0) {
- params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
- }
-
- bool invalid_param = false;
- std::string arg;
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
-
- if (arg == "-s" || arg == "--seed") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.seed = std::stoi(argv[i]);
- } else if (arg == "-t" || arg == "--threads") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads = std::stoi(argv[i]);
- } else if (arg == "-p" || arg == "--prompt") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.prompt = argv[i];
- } else if (arg == "-f" || arg == "--file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i]);
- std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
- if (params.prompt.back() == '\n') {
- params.prompt.pop_back();
- }
- } else if (arg == "-n" || arg == "--n_predict") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_predict = std::stoi(argv[i]);
- } else if (arg == "--top_k") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.top_k = std::stoi(argv[i]);
- } else if (arg == "-c" || arg == "--ctx_size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ctx = std::stoi(argv[i]);
- } else if (arg == "--memory_f32") {
- params.memory_f16 = false;
- } else if (arg == "--top_p") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.top_p = std::stof(argv[i]);
- } else if (arg == "--temp") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.temp = std::stof(argv[i]);
- } else if (arg == "--repeat_last_n") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.repeat_last_n = std::stoi(argv[i]);
- } else if (arg == "--repeat_penalty") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.repeat_penalty = std::stof(argv[i]);
- } else if (arg == "-b" || arg == "--batch_size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_batch = std::stoi(argv[i]);
- params.n_batch = std::min(512, params.n_batch);
- } else if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model = argv[i];
- } else if (arg == "-i" || arg == "--interactive") {
- params.interactive = true;
- } else if (arg == "--embedding") {
- params.embedding = true;
- } else if (arg == "--interactive-start") {
- params.interactive = true;
- } else if (arg == "--interactive-first") {
- params.interactive_start = true;
- } else if (arg == "-ins" || arg == "--instruct") {
- params.instruct = true;
- } else if (arg == "--color") {
- params.use_color = true;
- } else if (arg == "--mlock") {
- params.use_mlock = true;
- } else if (arg == "--mtest") {
- params.mem_test = true;
- } else if (arg == "--verbose_prompt") {
- params.verbose_prompt = true;
- } else if (arg == "-r" || arg == "--reverse-prompt") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.antiprompt.push_back(argv[i]);
- } else if (arg == "--perplexity") {
- params.perplexity = true;
- } else if (arg == "--ignore-eos") {
- params.ignore_eos = true;
- } else if (arg == "--n_parts") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_parts = std::stoi(argv[i]);
- } else if (arg == "-h" || arg == "--help") {
- gpt_print_usage(argc, argv, params);
- exit(0);
- } else if (arg == "--random-prompt") {
- params.random_prompt = true;
- } else if (arg == "--in-prefix") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.input_prefix = argv[i];
- } else {
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- gpt_print_usage(argc, argv, params);
- exit(1);
- }
- }
- if (invalid_param) {
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
- gpt_print_usage(argc, argv, params);
- exit(1);
- }
-
- return true;
-}
-
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
- fprintf(stderr, "usage: %s [options]\n", argv[0]);
- fprintf(stderr, "\n");
- fprintf(stderr, "options:\n");
- fprintf(stderr, " -h, --help show this help message and exit\n");
- fprintf(stderr, " -i, --interactive run in interactive mode\n");
- fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
- fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
- fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
- fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
- fprintf(stderr, " specified more than once for multiple prompts).\n");
- fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
- fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
- fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
- fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
- fprintf(stderr, " prompt to start generation with (default: empty)\n");
- fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
- fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
- fprintf(stderr, " -f FNAME, --file FNAME\n");
- fprintf(stderr, " prompt file to start generation.\n");
- fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
- fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
- fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
- fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
- fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
- fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
- fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
- fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
- fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
- fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
- fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
- fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
- if (ggml_mlock_supported()) {
- fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
- }
- fprintf(stderr, " --mtest compute maximum memory usage\n");
- fprintf(stderr, " --verbose-prompt print prompt before generation\n");
- fprintf(stderr, " -m FNAME, --model FNAME\n");
- fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
- fprintf(stderr, "\n");
-}
-
-std::string gpt_random_prompt(std::mt19937 & rng) {
- const int r = rng() % 10;
- switch (r) {
- case 0: return "So";
- case 1: return "Once upon a time";
- case 2: return "When";
- case 3: return "The";
- case 4: return "After";
- case 5: return "If";
- case 6: return "import";
- case 7: return "He";
- case 8: return "She";
- case 9: return "They";
- default: return "To";
- }
-
- return "The";
-}
-
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
- // initialize to prompt number of chars, since n_tokens <= n_prompt_chars
- std::vector<llama_token> res(text.size() + (int)add_bos);
- int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
- assert(n >= 0);
- res.resize(n);
-
- return res;
-}
+++ /dev/null
-// Various helper functions and utilities
-
-#pragma once
-
-#include "llama.h"
-
-#include <string>
-#include <vector>
-#include <random>
-#include <thread>
-
-//
-// CLI argument parsing
-//
-
-struct gpt_params {
- int32_t seed = -1; // RNG seed
- int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
- int32_t n_predict = 128; // new tokens to predict
- int32_t repeat_last_n = 64; // last n tokens to penalize
- int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
- int32_t n_ctx = 512; // context size
- int32_t n_batch = 8; // batch size for prompt processing
-
- // sampling parameters
- int32_t top_k = 40;
- float top_p = 0.95f;
- float temp = 0.80f;
- float repeat_penalty = 1.10f;
-
- std::string model = "models/lamma-7B/ggml-model.bin"; // model path
- std::string prompt = "";
- std::string input_prefix = ""; // string to prefix user inputs with
-
-
- std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-
- bool memory_f16 = true; // use f16 instead of f32 for memory kv
- bool random_prompt = false; // do not randomize prompt if none provided
- bool use_color = false; // use color to distinguish generations and inputs
- bool interactive = false; // interactive mode
-
- bool embedding = false; // get only sentence embedding
- bool interactive_start = false; // wait for user input immediately
-
- bool instruct = false; // instruction mode (used for Alpaca models)
- bool ignore_eos = false; // do not stop generating after eos
- bool perplexity = false; // compute perplexity over the prompt
- bool use_mlock = false; // use mlock to keep model in memory
- bool mem_test = false; // compute maximum memory usage
- bool verbose_prompt = false; // print prompt tokens before generation
-};
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);