llama : improve sep token handling (#14272)

author Sigbjørn Skjæret <redacted>

Fri, 20 Jun 2025 12:04:09 +0000 (14:04 +0200)

committer GitHub <redacted>

Fri, 20 Jun 2025 12:04:09 +0000 (14:04 +0200)
author Sigbjørn Skjæret <redacted>
Fri, 20 Jun 2025 12:04:09 +0000 (14:04 +0200)
committer GitHub <redacted>
Fri, 20 Jun 2025 12:04:09 +0000 (14:04 +0200)
diff --git a/ci/run.sh b/ci/run.sh

index 94005570511b6a3d2fe7ea3c95bff76112419c6f..e1b777c304eaf764f6bcf8d5e082b4dc7f650bb9 100755 (executable)
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -779,7 +779,7 @@ function gg_run_rerank_tiny {
      model_f16="${path_models}/ggml-model-f16.gguf"
  
      # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
  
      # sample output
      # rerank score 0:    0.029
diff --git a/common/arg.cpp b/common/arg.cpp

index 3dfaa71eff18806e396c8d3f8e3a9f1f4fb95a31..c4ad85c47b61b4ccd5f4624bf4fb6e46d9127b2e 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              params.embd_sep = value;
          }
      ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
      add_opt(common_arg(
          {"--host"}, "HOST",
          string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
diff --git a/common/common.h b/common/common.h

index 5710c4e9735fdb524eb02ce37188c013b8f51c61..e08a59eae75438818705ec8974a2918b3cf6f7ab 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -358,6 +358,7 @@ struct common_params {
      int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
      std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
      std::string embd_sep   = "\n";  // separator of embeddings
+    std::string cls_sep    = "\t";  // separator of classification sequences
  
      // server params
      int32_t port           = 8080;         // server listens on this network port
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py

index 2e08db3457b6044e4202dc46c0ad0b61454cdb33..2fe76589eb06205116e27887887edb9600e423cb 100755 (executable)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2145,7 +2145,6 @@ class Llama4Model(LlamaModel):
  
      def set_vocab(self):
          self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
  
      def set_gguf_parameters(self):
          super().set_gguf_parameters()
@@ -3918,9 +3917,6 @@ class BertModel(TextModel):
          special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
          special_vocab.add_to_gguf(self.gguf_writer)
  
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-
  
  @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
  class DistilBertModel(BertModel):
@@ -3962,8 +3958,6 @@ class RobertaModel(BertModel):
          bpe_tok_path = self.dir_model / "tokenizer.json"
          if bpe_tok_path.exists():
              self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
  
              # we need this to validate the size of the token_type embeddings
              # though currently we are passing all zeros to the token_type embeddings
@@ -4848,8 +4842,6 @@ class JinaBertV2Model(BertModel):
              self.gguf_writer.add_token_type_count(2)
          else:
              raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
  
  
  @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5443,6 @@ class T5Model(TextModel):
          special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
          special_vocab.add_to_gguf(self.gguf_writer)
  
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
      def set_gguf_parameters(self):
          if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
              logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5580,6 @@ class T5EncoderModel(TextModel):
          special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
          special_vocab.add_to_gguf(self.gguf_writer)
  
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
      def set_gguf_parameters(self):
          if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
              logger.warning("Couldn't find context length in config.json, assuming default value of 512")
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp

index 681929d27d617460eedbcdabaefc6be89633829c..0ec2999a0c8e90d6cfe5809e08f5de0e99496b0b 100644 (file)
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -133,10 +133,36 @@ int main(int argc, char ** argv) {
      // max batch size
      const uint64_t n_batch = params.n_batch;
  
+    // get added sep and eos token, if any
+    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
+    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
      // tokenize the prompts and trim
      std::vector<std::vector<int32_t>> inputs;
      for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<llama_token> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
          if (inp.size() > n_batch) {
              LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                      __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -145,11 +171,11 @@ int main(int argc, char ** argv) {
          inputs.push_back(inp);
      }
  
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
      // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
      for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
              LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
          }
      }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py

index 834a1d5e1a97ed98f01196e0a0542afa95739b8c..0429b0aaf135dddf0f13f5a9e43f9889e1f7c641 100644 (file)
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -198,6 +198,7 @@ class Keys:
          MASK_ID              = "tokenizer.ggml.mask_token_id"
          ADD_BOS              = "tokenizer.ggml.add_bos_token"
          ADD_EOS              = "tokenizer.ggml.add_eos_token"
+        ADD_SEP              = "tokenizer.ggml.add_sep_token"
          ADD_PREFIX           = "tokenizer.ggml.add_space_prefix"
          REMOVE_EXTRA_WS      = "tokenizer.ggml.remove_extra_whitespaces"
          PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py

index 54ca0c33fd3368daa6a19a15023b792da88951d2..b9b63d052624d81c53852381a86e26129efb5261 100644 (file)
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -891,6 +891,9 @@ class GGUFWriter:
      def add_add_eos_token(self, value: bool) -> None:
          self.add_bool(Keys.Tokenizer.ADD_EOS, value)
  
+    def add_add_sep_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_SEP, value)
+
      def add_add_space_prefix(self, value: bool) -> None:
          self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
  
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py

index 44d066ee75a7ece48dabb168c5054fb4053f5d48..6c4d3a422b99dd67408ffb8af287b1b7f95d4523 100644 (file)
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -119,6 +119,7 @@ class SpecialVocab:
          logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
  
      def _try_load_from_tokenizer_json(self, path: Path) -> bool:
+        tokenizer = None
          tokenizer_file = path / 'tokenizer.json'
          if tokenizer_file.is_file():
              with open(tokenizer_file, encoding = 'utf-8') as f:
@@ -152,11 +153,87 @@ class SpecialVocab:
              added_tokens = tokenizer.get('added_tokens', {})
          else:
              added_tokens = {}
+        tokenizer_config = None
          tokenizer_config_file = path / 'tokenizer_config.json'
-        if not tokenizer_config_file.is_file():
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, encoding = 'utf-8') as f:
+                tokenizer_config = json.load(f)
+        if tokenizer:
+            special_bos = (tokenizer_config or {}).get('bos_token')
+            special_cls = (tokenizer_config or {}).get('cls_token')
+            special_eos = (tokenizer_config or {}).get('eos_token')
+            special_sep = (tokenizer_config or {}).get('sep_token')
+            if not special_bos and special_cls and tokenizer_config:
+                tokenizer_config['bos_token'] = special_bos = special_cls
+            if not special_eos and special_sep and tokenizer_config:
+                tokenizer_config['eos_token'] = special_eos = special_sep
+            post_processor = tokenizer.get('post_processor', {})
+            for processor in post_processor.get('processors', [post_processor]):
+                if processor.get('type') == 'RobertaProcessing':
+                    self.add_special_token['bos'] = True
+                    self.add_special_token['eos'] = True
+                    self.add_special_token['sep'] = True
+                    if not special_cls and tokenizer_config:
+                        special_cls = processor.get('cls', [special_bos])[0]
+                        tokenizer_config['cls_token'] = special_cls
+                    if not special_sep and tokenizer_config:
+                        special_sep = processor.get('sep', [special_eos])[0]
+                        tokenizer_config['sep_token'] = special_sep
+                    continue
+                # Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
+                # Only works with simple templates, **will** get it wrong on unusual sequences
+                if processor.get('type') == 'TemplateProcessing':
+                    tmpl_single = processor.get('single', [])
+                    tmpl_pair = processor.get('pair', [])
+                    special_first = None
+                    special_last = None
+                    if len(tmpl_single) > 1:
+                        if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
+                            if not tokenizer_config:
+                                special_bos = special_first
+                            self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
+                            if special_first not in (special_bos, special_cls):
+                                logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
+                        if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
+                            if not tokenizer_config:
+                                special_eos = special_last
+                            self.add_special_token['eos'] = True if special_last == special_eos else False
+                            if special_last != special_eos:
+                                logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
+                    if tmpl_pair:
+                        seq_start = 1 if tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
+                        seq_stop = -1 if tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
+                        if seq_start == 0 or seq_stop is None:
+                            logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
+                        if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
+                            tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
+                            tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
+                            if tmpl_a != 'A' or tmpl_b != 'B':
+                                logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
+                            # A [sep] [eos] B
+                            if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
+                                add_sep = False
+                                if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
+                                    if special_entry in (special_sep, special_eos) and not special_last:
+                                        add_sep = True
+                                    if special_entry not in (special_sep, special_eos):
+                                        logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
+                                else:
+                                    logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
+                                if len(tmpl_pair) == 2:
+                                    if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
+                                        if special_entry in (special_sep, special_eos):
+                                            add_sep = True
+                                        if special_entry not in (special_sep, special_eos):
+                                            logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
+                                    else:
+                                        logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
+                                self.add_special_token['sep'] = add_sep
+                                if add_sep and not special_sep and tokenizer_config:
+                                    tokenizer_config['sep_token'] = special_eos
+                    continue
+        if not tokenizer_config:
              return True
-        with open(tokenizer_config_file, encoding = 'utf-8') as f:
-            tokenizer_config = json.load(f)
          chat_template_alt = None
          chat_template_file = path / 'chat_template.json'
          if chat_template_file.is_file():
diff --git a/include/llama.h b/include/llama.h

index 635508b10f2ff1a2820ca98b15d26992b539f495..3475d596502c654ee301cf29d9a7c4eb50d31c47 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -1044,6 +1044,7 @@ extern "C" {
  
      LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
      LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
  
      LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
      LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp

index 0bc60565df12ca77249d6449698f53836f53cfc0..8dadef204f9d71f45039c4402aedcb5e923683ad 100644 (file)
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -198,6 +198,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_TOKENIZER_MASK_ID,              "tokenizer.ggml.mask_token_id"            },
      { LLM_KV_TOKENIZER_ADD_BOS,              "tokenizer.ggml.add_bos_token"            },
      { LLM_KV_TOKENIZER_ADD_EOS,              "tokenizer.ggml.add_eos_token"            },
+    { LLM_KV_TOKENIZER_ADD_SEP,              "tokenizer.ggml.add_sep_token"            },
      { LLM_KV_TOKENIZER_ADD_PREFIX,           "tokenizer.ggml.add_space_prefix"         },
      { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      "tokenizer.ggml.remove_extra_whitespaces" },
      { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap"     },
diff --git a/src/llama-arch.h b/src/llama-arch.h

index 51b242c66b824a93498c420156d751144317b019..5b0230c15067817a4500c903259f7e97a1213db3 100644 (file)
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -194,6 +194,7 @@ enum llm_kv {
      LLM_KV_TOKENIZER_MASK_ID,
      LLM_KV_TOKENIZER_ADD_BOS,
      LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_SEP,
      LLM_KV_TOKENIZER_ADD_PREFIX,
      LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
      LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp

index a70b9892347cb0a0ea1204e6adf4416597e4f66e..563823dc35d8eef29b5e7b62589c6500face3a97 100644 (file)
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -228,6 +228,7 @@ void llama_model_saver::add_kv_from_model() {
      // add_kv(LLM_KV_TOKENIZER_MASK_ID,                 ???);
      add_kv(LLM_KV_TOKENIZER_ADD_BOS,                 vocab.get_add_bos());
      add_kv(LLM_KV_TOKENIZER_ADD_EOS,                 vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_SEP,                 vocab.get_add_sep());
      add_kv(LLM_KV_TOKENIZER_ADD_PREFIX,              vocab.get_add_space_prefix());
      add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,         vocab.get_remove_extra_whitespaces());
      add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,    vocab.get_precompiled_charsmap());
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index d90f1d6b1ea63eb5456b1fe31376065a587b27e1..4ab120d9ba818ffdbc201594f5c8af418dc18a38 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1269,6 +1269,7 @@ struct llama_vocab::impl {
      bool add_space_prefix           = false;
      bool add_bos                    = false;
      bool add_eos                    = false;
+    bool add_sep                    = false;
      bool ignore_merges              = false;
      bool clean_spaces               = false;  // clean_up_tokenization_spaces
      bool remove_extra_whitespaces   = false;
@@ -1421,6 +1422,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              special_sep_id  = 102;
              special_pad_id  = 0;
              special_mask_id = 103;
+
+            add_sep = true;
          } else if (tokenizer_model == "gpt2") {
              type = LLAMA_VOCAB_TYPE_BPE;
  
@@ -1550,12 +1553,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                      tokenizer_pre == "jina-es" ||
                      tokenizer_pre == "jina-de" ||
                      tokenizer_pre == "gigachat"   ||
-                    tokenizer_pre == "jina-v1-en" ||
                      tokenizer_pre == "jina-v2-es" ||
-                    tokenizer_pre == "jina-v2-de" ||
+                    tokenizer_pre == "jina-v2-de") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "jina-v1-en" ||
                      tokenizer_pre == "jina-v2-code" ||
                      tokenizer_pre == "roberta-bpe") {
                  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+                add_sep = true;
              } else if (
                      tokenizer_pre == "refact") {
                  pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1665,6 +1671,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              clean_spaces = true;
              add_bos = true;
              add_eos = false;
+            add_sep = true;
          } else if (type == LLAMA_VOCAB_TYPE_UGM) {
              pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              add_bos = false;
@@ -1801,7 +1808,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              }
          }
  
-        // Handle add_bos and add_eos
+        // Handle add_bos, add_eos and add_sep
          {
              bool temp = true;
  
@@ -1811,6 +1818,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
                  add_eos = temp;
              }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
+                add_sep = temp;
+            }
          }
  
          // auto-detect special tokens by text
@@ -3000,6 +3010,10 @@ bool llama_vocab::get_add_eos() const {
      return pimpl->add_eos;
  }
  
+bool llama_vocab::get_add_sep() const {
+    return pimpl->add_sep;
+}
+
  bool llama_vocab::get_ignore_merges() const {
      return pimpl->ignore_merges;
  }
@@ -3191,6 +3205,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
      return vocab->get_add_eos();
  }
  
+bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
+    return vocab->get_add_sep();
+}
+
  llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
      return vocab->token_fim_pre();
  }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h

index daa6cf3082f90a3dc1ace5fff4b379bd3220c51e..40e4d1c05b18ea8542c72779bf408a9a3a7f11ea 100644 (file)
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -74,6 +74,7 @@ struct llama_vocab {
      bool get_add_space_prefix          () const;
      bool get_add_bos                   () const;
      bool get_add_eos                   () const;
+    bool get_add_sep                   () const;
      bool get_ignore_merges             () const;
      bool get_clean_spaces              () const;
      bool get_remove_extra_whitespaces  () const;
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp

index f3e0392a4e9d19ee33b254bec425237845f59186..f8fab2c86664e16ac89ebb96761dd4560fbc39a0 100644 (file)
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -271,12 +271,20 @@ static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_
      }
  
      result.reserve(doc.size() + query.size() + 4);
-    result.push_back(llama_vocab_bos(vocab));
+    if (llama_vocab_get_add_bos(vocab)) {
+        result.push_back(llama_vocab_bos(vocab));
+    }
      result.insert(result.end(), query.begin(), query.end());
-    result.push_back(eos_token);
-    result.push_back(llama_vocab_sep(vocab));
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
+    if (llama_vocab_get_add_sep(vocab)) {
+        result.push_back(llama_vocab_sep(vocab));
+    }
      result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(eos_token);
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
  
      return result;
  }
author	Sigbjørn Skjæret <redacted>
	Fri, 20 Jun 2025 12:04:09 +0000 (14:04 +0200)
committer	GitHub <redacted>
	Fri, 20 Jun 2025 12:04:09 +0000 (14:04 +0200)
ci/run.sh		patch | blob | history
common/arg.cpp		patch | blob | history
common/common.h		patch | blob | history
convert_hf_to_gguf.py		patch | blob | history
examples/embedding/embedding.cpp		patch | blob | history
gguf-py/gguf/constants.py		patch | blob | history
gguf-py/gguf/gguf_writer.py		patch | blob | history
gguf-py/gguf/vocab.py		patch | blob | history
include/llama.h		patch | blob | history
src/llama-arch.cpp		patch | blob | history
src/llama-arch.h		patch | blob | history
src/llama-model-saver.cpp		patch | blob | history
src/llama-vocab.cpp		patch | blob | history
src/llama-vocab.h		patch | blob | history
tools/server/utils.hpp		patch | blob | history