From: Georgi Gerganov Date: Wed, 30 Aug 2023 09:52:46 +0000 (+0300) Subject: examples : fix underscore in beam-search + .gitignore (close #2900) X-Git-Tag: gguf-v0.4.0~179 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c90d135eb433cf0d40fb95e46a48d1391d2352b5;p=pkg%2Fggml%2Fsources%2Fllama.cpp examples : fix underscore in beam-search + .gitignore (close #2900) --- diff --git a/.gitignore b/.gitignore index 54ea2b52..8b5f45a2 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,9 @@ models-mnt /gguf-llama-simple /libllama.so /llama-bench +/baby-llama +/beam-search +/save-load-state build-info.h arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index bd2d9286..b750540f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam_search tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1 @@ -446,7 +446,7 @@ llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o co baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -beam_search: examples/beam_search/beam_search.cpp build-info.h ggml.o llama.o common.o $(OBJS) +beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 94b78522..6e65eb08 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -25,7 +25,7 @@ else() add_subdirectory(simple) add_subdirectory(embd-input) add_subdirectory(llama-bench) - add_subdirectory(beam_search) + add_subdirectory(beam-search) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt new file mode 100644 index 00000000..e44a7497 --- /dev/null +++ b/examples/beam-search/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET beam-search) +add_executable(${TARGET} beam-search.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp new file mode 100644 index 00000000..42c7c725 --- /dev/null +++ b/examples/beam-search/beam-search.cpp @@ -0,0 +1,188 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "common.h" +#include "llama.h" +#include "build-info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + +// Used for debugging to print out beam tokens. +struct ostream_beam_view { + llama_context * ctx; + llama_beam_view beam_view; +}; +std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) { + os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; + for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { + os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); + } + return os << ')'; +} + +// Put here anything you want back in beam_search_callback(). +struct beam_search_callback_data { + llama_context * ctx; + std::vector response; +}; + +// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. +// For example, eob can be flagged due to maximum token length, stop words, etc. +bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) { + return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); +} + +// Function matching type llama_beam_search_callback_fn_t. +// Custom callback example is called each time the beams lengths increase: +// * Show progress by printing ',' following by number of convergent beam tokens if any. +// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. +// This is also called when the stop condition is met. +// Collect tokens into std::vector response which is pointed to by callback_data. +void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { + auto& callback_data = *static_cast(callback_data_ptr); + // Mark beams as EOS as needed. + for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { + llama_beam_view& beam_view = beams_state.beam_views[i]; + if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) { + beam_view.eob = true; + } + } + printf(","); // Show progress + if (const size_t n = beams_state.common_prefix_length) { + callback_data.response.resize(callback_data.response.size() + n); + assert(0u < beams_state.n_beams); + const llama_token * tokens = beams_state.beam_views[0].tokens; + std::copy(tokens, tokens + n, callback_data.response.end() - n); + printf("%lu", n); + } + fflush(stdout); +#if 1 // DEBUG: print current beams for this iteration + std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n"; + for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { + std::cout << "beams["< 3 ) + { + params.prompt = argv[3]; + } + + if ( params.prompt.empty() ) + { + params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n"; + } + + //--------------------------------- + // Init LLM : + //--------------------------------- + + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + + std::tie(model, ctx) = llama_init_from_gpt_params( params ); + + if ( model == NULL ) + { + fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); + return 1; + } + + //--------------------------------- + // Tokenize the prompt : + //--------------------------------- + + std::vector tokens_list = llama_tokenize(ctx, params.prompt, true); + + const size_t max_context_size = llama_n_ctx( ctx ); + const size_t max_tokens_list_size = max_context_size - 4 ; + + if (tokens_list.size() > max_tokens_list_size) + { + fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" , + __func__ , tokens_list.size() , max_tokens_list_size ); + return 1; + } + + fprintf( stderr, "\n\n" ); + + // Print the tokens from the prompt : + + for( auto id : tokens_list ) + { + std::cout << llama_token_to_piece(ctx, id); + } + std::cout << std::flush; + + int n_past = llama_get_kv_cache_token_count(ctx); + if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads)) + { + fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); + return 1; + } + n_past += tokens_list.size(); + + beam_search_callback_data callback_data{ctx, {}}; + size_t const beam_width = static_cast(params.n_beams); + int const n_predict = 256; + llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads); + + std::cout << "\n\n"; + for (llama_token const token_id : callback_data.response) { + std::cout << llama_token_to_piece(ctx,token_id); + } + std::cout << std::endl; + + llama_free( ctx ); + llama_free_model( model ); + + llama_backend_free(); + + return 0; +} diff --git a/examples/beam_search/CMakeLists.txt b/examples/beam_search/CMakeLists.txt deleted file mode 100644 index b29e0109..00000000 --- a/examples/beam_search/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(TARGET beam_search) -add_executable(${TARGET} beam_search.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/beam_search/beam_search.cpp b/examples/beam_search/beam_search.cpp deleted file mode 100644 index 42c7c725..00000000 --- a/examples/beam_search/beam_search.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include "common.h" -#include "llama.h" -#include "build-info.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include -#include -#endif - -// Used for debugging to print out beam tokens. -struct ostream_beam_view { - llama_context * ctx; - llama_beam_view beam_view; -}; -std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) { - os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; - for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); - } - return os << ')'; -} - -// Put here anything you want back in beam_search_callback(). -struct beam_search_callback_data { - llama_context * ctx; - std::vector response; -}; - -// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. -// For example, eob can be flagged due to maximum token length, stop words, etc. -bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) { - return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam tokens if any. -// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector response which is pointed to by callback_data. -void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { - auto& callback_data = *static_cast(callback_data_ptr); - // Mark beams as EOS as needed. - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - llama_beam_view& beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - callback_data.response.resize(callback_data.response.size() + n); - assert(0u < beams_state.n_beams); - const llama_token * tokens = beams_state.beam_views[0].tokens; - std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%lu", n); - } - fflush(stdout); -#if 1 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n"; - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["< 3 ) - { - params.prompt = argv[3]; - } - - if ( params.prompt.empty() ) - { - params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n"; - } - - //--------------------------------- - // Init LLM : - //--------------------------------- - - llama_backend_init(params.numa); - - llama_model * model; - llama_context * ctx; - - std::tie(model, ctx) = llama_init_from_gpt_params( params ); - - if ( model == NULL ) - { - fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); - return 1; - } - - //--------------------------------- - // Tokenize the prompt : - //--------------------------------- - - std::vector tokens_list = llama_tokenize(ctx, params.prompt, true); - - const size_t max_context_size = llama_n_ctx( ctx ); - const size_t max_tokens_list_size = max_context_size - 4 ; - - if (tokens_list.size() > max_tokens_list_size) - { - fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" , - __func__ , tokens_list.size() , max_tokens_list_size ); - return 1; - } - - fprintf( stderr, "\n\n" ); - - // Print the tokens from the prompt : - - for( auto id : tokens_list ) - { - std::cout << llama_token_to_piece(ctx, id); - } - std::cout << std::flush; - - int n_past = llama_get_kv_cache_token_count(ctx); - if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads)) - { - fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); - return 1; - } - n_past += tokens_list.size(); - - beam_search_callback_data callback_data{ctx, {}}; - size_t const beam_width = static_cast(params.n_beams); - int const n_predict = 256; - llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads); - - std::cout << "\n\n"; - for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_piece(ctx,token_id); - } - std::cout << std::endl; - - llama_free( ctx ); - llama_free_model( model ); - - llama_backend_free(); - - return 0; -}