llama : add option to render special/control tokens (#6807)

author Georgi Gerganov <redacted>

Sun, 21 Apr 2024 15:36:45 +0000 (18:36 +0300)

committer GitHub <redacted>

Sun, 21 Apr 2024 15:36:45 +0000 (18:36 +0300)
author Georgi Gerganov <redacted>
Sun, 21 Apr 2024 15:36:45 +0000 (18:36 +0300)
committer GitHub <redacted>
Sun, 21 Apr 2024 15:36:45 +0000 (18:36 +0300)
diff --git a/Makefile b/Makefile

index 760015f2947294047892e796d0fe993b4a198183..72fdc6ba46bc71eb9b733a500393b801f65dbd50 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -699,7 +699,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
         $(CXX) $(CXXFLAGS) -c $< -o $@
  
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
  COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
  
  common.o: common/common.cpp $(COMMON_H_DEPS)
diff --git a/README.md b/README.md

index a5ba710d7b106e0d087ef3e25bcdd106ec777897..1d4e9d41742b7e034773e94a1ddedf2e45d1d291 100644 (file)
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
  
  ### Recent API changes
  
+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
  - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
  - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
  - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
diff --git a/common/common.cpp b/common/common.cpp

index b6143e41c02cb356402849b0f5e1bc4327324eab..06f252ea6914b9779d032401c4c08e2e7b6316a9 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,10 +2328,10 @@ std::vector<llama_token> llama_tokenize(
  
  std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
      std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
      if (n_tokens < 0) {
          result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
          GGML_ASSERT(check == -n_tokens);
      } else {
          result.resize(n_tokens);
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift

index 5764acb6d5825c06a8f7e5bb9416aa34d0030783..dbbd06da5818364178a2fcc2c986100a5c2a2f72 100644 (file)
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
  
  private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
      var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
      if nTokens < 0 {
          let actualTokensCount = -Int(nTokens)
          result = .init(repeating: 0, count: actualTokensCount)
@@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
              model,
              token,
              &result,
-            Int32(result.count)
+            Int32(result.count),
+            false
          )
          assert(check == actualTokensCount)
      } else {
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

index 70c43a3852732e90adc8cab6520932c0bbc91053..737f882fb2d2eeb2b7c6ea3b3639e0f8cd0f2f0d 100644 (file)
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -322,7 +322,7 @@ actor LlamaContext {
          defer {
              result.deallocate()
          }
-        let nTokens = llama_token_to_piece(model, token, result, 8)
+        let nTokens = llama_token_to_piece(model, token, result, 8, false)
  
          if nTokens < 0 {
              let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +330,7 @@ actor LlamaContext {
              defer {
                  newResult.deallocate()
              }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
              let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
              return Array(bufferPointer)
          } else {
diff --git a/llama.cpp b/llama.cpp

index ec4c1242b20bc098219db33c4879bfacfd88f0c0..7440c740fefbc65b3c073ddbba6c4b66c9e64bb0 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -1600,12 +1600,12 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
  
-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
      std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
      if (n_tokens < 0) {
          result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
          GGML_ASSERT(check == -n_tokens);
      }
      else {
@@ -13312,7 +13312,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  
      for (size_t i = 0; i < candidates->size; ++i) {
          const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id, false);
+
          if (llama_token_is_eog(&ctx->model, id)) {
              if (!allow_eog) {
                  candidates->data[i].logit = -INFINITY;
@@ -13512,7 +13513,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
          GGML_ASSERT(false);
      }
  
-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token, false);
  
      // Note terminating 0 in decoded string
      const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
@@ -16991,7 +16992,7 @@ static std::string llama_decode_text(const std::string & text) {
  }
  
  // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
      if (0 <= token && token < llama_n_vocab(model)) {
          switch (llama_vocab_get_type(model->vocab)) {
          case LLAMA_VOCAB_TYPE_WPM:
@@ -17006,7 +17007,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                  }
                  memcpy(buf, result.c_str(), result.length());
                  return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                    (llama_is_user_defined_token(model->vocab, token)) ||
+                    (llama_is_control_token     (model->vocab, token) && special)) {
                  std::string result = model->vocab.id_to_token[token].text;
                  if (length < (int) result.length()) {
                      return -(int) result.length();
@@ -17019,8 +17022,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                  }
                  memcpy(buf, "\xe2\x96\x85", 3);
                  return 3;
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
              } else if (llama_is_byte_token(model->vocab, token)) {
                  if (length < 1) {
                      return -1;
@@ -17041,15 +17042,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                  }
                  memcpy(buf, result.c_str(), result.length());
                  return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                    (llama_is_user_defined_token(model->vocab, token)) ||
+                    (llama_is_control_token     (model->vocab, token) && special)) {
                  std::string result = model->vocab.id_to_token[token].text;
                  if (length < (int) result.length()) {
                      return -(int) result.length();
                  }
                  memcpy(buf, result.c_str(), result.length());
                  return result.length();
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
              }
              break;
          }
diff --git a/llama.h b/llama.h

index 5bed97ad1ef9f70efda2043dc5b6948209b1f55d..4effca42cc65de9176110abc04428d28a6da68da 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -828,11 +828,13 @@ extern "C" {
      // Uses the vocabulary in the provided context.
      // Does not write null terminator to the buffer.
      // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
      LLAMA_API int32_t llama_token_to_piece(
                const struct llama_model * model,
                             llama_token   token,
                                    char * buf,
-                               int32_t   length);
+                               int32_t   length,
+                                  bool   special);
  
      /// Apply chat template. Inspired by hf apply_chat_template() on python.
      /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
author	Georgi Gerganov <redacted>
	Sun, 21 Apr 2024 15:36:45 +0000 (18:36 +0300)
committer	GitHub <redacted>
	Sun, 21 Apr 2024 15:36:45 +0000 (18:36 +0300)
Makefile		patch | blob | history
README.md		patch | blob | history
common/common.cpp		patch | blob | history
examples/batched.swift/Sources/main.swift		patch | blob | history
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift		patch | blob | history
llama.cpp		patch | blob | history
llama.h		patch | blob | history