From: maor-ps
Date: Sat, 4 May 2024 09:06:40 +0000 (+0300)
Subject: If first token generated from the server is the stop word the server will crash (#7038)
X-Git-Tag: upstream/0.0.4488~1701
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=03fb8a002df2e96104f9e06de9c78d2a8ed91e92;p=pkg%2Fggml%2Fsources%2Fllama.cpp

If first token generated from the server is the stop word the server will crash (#7038)

This request will reproduce the issue with llama13b:

    {
        'prompt': 'Q: hello world \nA: ',
        'stop': ['\n'],
        'temperature': 0.0,
        'n_predict': 10,
        'cache_prompt': True,
        'n_probs': 10
    }
---

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f60530cf..ff0814b2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1383,9 +1383,10 @@ struct server_context {
         if (!slot.params.stream && slot.stopped_word) {
             const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
 
+            size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
             probs = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin(),
-                    slot.generated_token_probs.end() - stop_word_toks.size());
+                    slot.generated_token_probs.end() - safe_offset);
         } else {
             probs = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin(),
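
For clarity, a minimal, self-contained sketch of the failure mode and the fix (this is not the server code itself; the vector name, the float element type, and stop_word_tok_count are illustrative stand-ins for slot.generated_token_probs, completion_token_output, and stop_word_toks.size()). When the stop word tokenizes to more tokens than were recorded in the probability list, as when the stop word is the very first generated token, end() minus the stop word's token count moves the iterator before begin(), which is undefined behavior:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        // Stand-in for slot.generated_token_probs: empty because the stop
        // word was the very first token the model produced.
        std::vector<float> generated_token_probs;

        // Stand-in for stop_word_toks.size(): the stop string "\n"
        // tokenizes to a single token.
        size_t stop_word_tok_count = 1;

        // Unsafe (pre-fix): end() - 1 on an empty vector steps before
        // begin(), which is undefined behavior and crashed the server.
        // std::vector<float> probs(generated_token_probs.begin(),
        //                          generated_token_probs.end() - stop_word_tok_count);

        // Safe (the fix): never step back further than the vector's size.
        size_t safe_offset = std::min(generated_token_probs.size(), stop_word_tok_count);
        std::vector<float> probs(generated_token_probs.begin(),
                                 generated_token_probs.end() - safe_offset);

        printf("kept %zu probability entries\n", probs.size()); // prints: kept 0 probability entries
        return 0;
    }

The clamped offset degrades gracefully rather than erroring out: when fewer probabilities were recorded than the stop word's token count, the slice simply becomes empty instead of crashing the server.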