llama : remove Tail-Free sampling (#10071)

author Georgi Gerganov <redacted>

Tue, 29 Oct 2024 08:42:05 +0000 (10:42 +0200)

committer GitHub <redacted>

Tue, 29 Oct 2024 08:42:05 +0000 (10:42 +0200)
author Georgi Gerganov <redacted>
Tue, 29 Oct 2024 08:42:05 +0000 (10:42 +0200)
committer GitHub <redacted>
Tue, 29 Oct 2024 08:42:05 +0000 (10:42 +0200)
diff --git a/common/arg.cpp b/common/arg.cpp

index e1e933934f0ef036dbc4f78bc512d022feb10de1..7c5c5e5cd5b887f71549e734c816bd573d2003d8 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -943,13 +943,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              params.sparams.min_p = std::stof(value);
          }
      ).set_sparam());
-    add_opt(common_arg(
-        {"--tfs"}, "N",
-        string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
-        [](common_params & params, const std::string & value) {
-            params.sparams.tfs_z = std::stof(value);
-        }
-    ).set_sparam());
      add_opt(common_arg(
          {"--xtc-probability"}, "N",
          string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
@@ -1074,7 +1067,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      ).set_sparam());
      add_opt(common_arg(
          {"--mirostat"}, "N",
-        string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+        string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
          "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
          [](common_params & params, int value) {
              params.sparams.mirostat = value;
diff --git a/common/common.cpp b/common/common.cpp

index ff8cc4076e95d76d03a1d1f454b12dc864870494..7656843b116dde1b9cbca7c1b9de039a6f933f41 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2090,7 +2090,6 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
      const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
      yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
  
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
      fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
      fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
      fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
diff --git a/common/common.h b/common/common.h

index 18b2121ed89b093db8644840e8787cef639b72f1..cd5a8e051d33a8a8ee25620c70e7a72d5f884087 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -88,7 +88,7 @@ enum common_sampler_type {
      COMMON_SAMPLER_TYPE_TOP_K       = 2,
      COMMON_SAMPLER_TYPE_TOP_P       = 3,
      COMMON_SAMPLER_TYPE_MIN_P       = 4,
-    COMMON_SAMPLER_TYPE_TFS_Z       = 5,
+  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
      COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
      COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
      COMMON_SAMPLER_TYPE_XTC         = 8,
@@ -113,7 +113,6 @@ struct common_sampler_params {
      float   min_p              = 0.05f; // 0.0 = disabled
      float   xtc_probability    = 0.00f; // 0.0 = disabled
      float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
-    float   tfs_z              = 1.00f; // 1.0 = disabled
      float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
      float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
      float   dynatemp_range     = 0.00f; // 0.0 = disabled
@@ -139,7 +138,6 @@ struct common_sampler_params {
      std::vector<enum common_sampler_type> samplers = {
          COMMON_SAMPLER_TYPE_DRY,
          COMMON_SAMPLER_TYPE_TOP_K,
-        COMMON_SAMPLER_TYPE_TFS_Z,
          COMMON_SAMPLER_TYPE_TYPICAL_P,
          COMMON_SAMPLER_TYPE_TOP_P,
          COMMON_SAMPLER_TYPE_MIN_P,
diff --git a/common/sampling.cpp b/common/sampling.cpp

index 48a9df8ba5b88cf1623d7725697a06c64872bf0a..7922fde47d3693bfe9fc02c642cb6c37f784b084 100644 (file)
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -131,11 +131,11 @@ std::string common_sampler_params::print() const {
      snprintf(result, sizeof(result),
              "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
              "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
              "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
              penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
              dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
              mirostat, mirostat_eta, mirostat_tau);
  
      return std::string(result);
@@ -199,9 +199,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                  case COMMON_SAMPLER_TYPE_XTC:
                      llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                      break;
-                case COMMON_SAMPLER_TYPE_TFS_Z:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                    break;
                  case COMMON_SAMPLER_TYPE_TYPICAL_P:
                      llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                      break;
@@ -373,7 +370,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
      switch (cnstr) {
          case COMMON_SAMPLER_TYPE_DRY:         return 'd';
          case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
-        case COMMON_SAMPLER_TYPE_TFS_Z:       return 'f';
          case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
          case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
          case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
@@ -388,7 +384,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
      switch (cnstr) {
          case COMMON_SAMPLER_TYPE_DRY:         return "dry";
          case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case COMMON_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
          case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
          case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
          case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
@@ -406,7 +401,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
          { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
          { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
          { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       COMMON_SAMPLER_TYPE_TFS_Z },
          { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
          { "xtc",         COMMON_SAMPLER_TYPE_XTC },
          { "infill",      COMMON_SAMPLER_TYPE_INFILL },
@@ -423,8 +417,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
          { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
          { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
          { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       COMMON_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         COMMON_SAMPLER_TYPE_TFS_Z },
          { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
      };
  
@@ -452,7 +444,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
      std::unordered_map<char, common_sampler_type> sampler_name_map = {
          { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
          { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z),       COMMON_SAMPLER_TYPE_TFS_Z },
          { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
          { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
          { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
diff --git a/examples/main/README.md b/examples/main/README.md

index c7c82317179808b863668dbd4d27da5cde34e2b3..5357ac2e2fa8fe89423d975fc57b1a99e2ca18ac 100644 (file)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -235,14 +235,6 @@ The Min-P sampling method was designed as an alternative to Top-P, and aims to e
  
  Example usage: `--min-p 0.05`
  
-### Tail-Free Sampling (TFS)
-
--   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
-
-Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
-
-Example usage: `--tfs 0.95`
-
  ### Locally Typical Sampling
  
  -   `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
diff --git a/examples/server/README.md b/examples/server/README.md

index bc737237eb0181b68a51cb4e18691f52002985d5..1629e456b68360cc607fa34dce1ebb539ae3a801 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -99,7 +99,7 @@ The project is under active development, and we are [looking for feedback and co
  
  | Argument | Explanation |
  | -------- | ----------- |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
  | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
  | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
  | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
@@ -108,7 +108,6 @@ The project is under active development, and we are [looking for feedback and co
  | `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
  | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
  | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
-| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) |
  | `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
  | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
  | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
@@ -121,7 +120,7 @@ The project is under active development, and we are [looking for feedback and co
  | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
  | `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
  | `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
-| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
+| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
  | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
  | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
  | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
@@ -360,8 +359,6 @@ node index.js
      `stop`: Specify a JSON array of stopping strings.
      These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
  
-    `tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled.
-
      `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
  
      `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
@@ -412,7 +409,7 @@ node index.js
  
      `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
  
-    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
+    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
  
  **Response format**
  
@@ -738,7 +735,6 @@ Example:
          "repeat_penalty": 1.100000023841858,
          "samplers": [
              "top_k",
-            "tfs_z",
              "typical_p",
              "top_p",
              "min_p",
@@ -752,7 +748,6 @@ Example:
          "stream": false,
          "task_id": 0,
          "temperature": 0.0,
-        "tfs_z": 1.0,
          "top_k": 40,
          "top_p": 0.949999988079071,
          "typical_p": 1.0
diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html

index cb3995abee0efbeb73d1aa15d5db810d601fd8b3..8bfa380e57388c82981be36dfa2ed4ede0f0fdec 100644 (file)
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@@ -49,7 +49,6 @@
        min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
        xtc_probability: 0.0, // 0 = disabled;
        xtc_threshold: 0.1, // > 0.5 disables XTC;
-      tfs_z: 1.0, // 1.0 = disabled
        typical_p: 1.0, // 1.0 = disabled
        presence_penalty: 0.0, // 0.0 = disabled
        frequency_penalty: 0.0, // 0.0 = disabled
@@ -847,7 +846,6 @@ return html`
            ${FloatField({ label: "DRY Base", title: "Set the DRY repetition penalty base value. Default is 1.75", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
            ${IntField({ label: "DRY Allowed Length", title: "Tokens that extend repetition beyond this receive exponentially increasing penalty. Default is 2", max: 10, min: 1, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
            ${IntField({ label: "DRY Penalty Last N", title: "How many tokens to scan for repetitions. Default is -1, where 0 is disabled and -1 is context size", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
-          ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
            ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
          </fieldset>
  
@@ -1147,7 +1145,6 @@ document.addEventListener('DOMContentLoaded', (event) => {
      xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
      xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
      top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
-    tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
      typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
      repeat_penalty: { snapValue: 1.0, snapRangeMultiplier: 4 },
      presence_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 },
diff --git a/examples/server/public/index.html b/examples/server/public/index.html

index 7f9b02bfbf83b6b792d0b28a923cc1b4fe04c3ef..a95f5c6df877577d0db606e19696aebd63e45177 100644 (file)
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -313,7 +313,6 @@
        min_p: 0.05, // 0 = disabled
        xtc_probability: 0.0, // 0 = disabled;
        xtc_threshold: 0.1, // > 0.5 disables XTC;
-      tfs_z: 1.0, // 1.0 = disabled
        typical_p: 1.0, // 1.0 = disabled
        presence_penalty: 0.0, // 0.0 = disabled
        frequency_penalty: 0.0, // 0.0 = disabled
@@ -1015,7 +1014,6 @@
            <details>
              <summary>More options</summary>
              <fieldset class="two">
-              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
                ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
                ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
                ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 077c7ad1adffbe935173d739846d4bae1e7c1b91..7953b5065904939c432f270fcf8d9fd79246e292 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -809,7 +809,6 @@ struct server_context {
          slot.sparams.min_p              = json_value(data, "min_p",              default_sparams.min_p);
          slot.sparams.xtc_probability    = json_value(data, "xtc_probability",    default_sparams.xtc_probability);
          slot.sparams.xtc_threshold      = json_value(data, "xtc_threshold",      default_sparams.xtc_threshold);
-        slot.sparams.tfs_z              = json_value(data, "tfs_z",              default_sparams.tfs_z);
          slot.sparams.typ_p              = json_value(data, "typical_p",          default_sparams.typ_p);
          slot.sparams.temp               = json_value(data, "temperature",        default_sparams.temp);
          slot.sparams.dynatemp_range     = json_value(data, "dynatemp_range",     default_sparams.dynatemp_range);
@@ -1149,7 +1148,6 @@ struct server_context {
              {"min_p",                     slot.sparams.min_p},
              {"xtc_probability",           slot.sparams.xtc_probability},
              {"xtc_threshold",             slot.sparams.xtc_threshold},
-            {"tfs_z",                     slot.sparams.tfs_z},
              {"typical_p",                 slot.sparams.typ_p},
              {"repeat_last_n",             slot.sparams.penalty_last_n},
              {"repeat_penalty",            slot.sparams.penalty_repeat},
diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html

index 8334bcde5049cce2889f47c99a34e68beaa43e81..2797c37c96456239906caf6b01d8f0129ac9c867 100644 (file)
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
@@ -226,7 +226,6 @@
        top_k: 40, // <= 0 to use vocab size
        top_p: 0.95, // 1.0 = disabled
        min_p: 0.05, // 0 = disabled
-      tfs_z: 1.0, // 1.0 = disabled
        typical_p: 1.0, // 1.0 = disabled
        presence_penalty: 0.0, // 0.0 = disabled
        frequency_penalty: 0.0, // 0.0 = disabled
@@ -788,7 +787,6 @@
            <details>
              <summary>More options</summary>
              <fieldset class="two">
-              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
                ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
                ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
                ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html

index 8361c577494d72338ddc5d3e5ff356788ebe12fc..dbe23c40241556f069258c3ec046bcd37d89cbb6 100644 (file)
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
@@ -229,7 +229,6 @@
        top_k: 40, // <= 0 to use vocab size
        top_p: 0.95, // 1.0 = disabled
        min_p: 0.05, // 0 = disabled
-      tfs_z: 1.0, // 1.0 = disabled
        typical_p: 1.0, // 1.0 = disabled
        presence_penalty: 0.0, // 0.0 = disabled
        frequency_penalty: 0.0, // 0.0 = disabled
@@ -791,7 +790,6 @@
            <details>
              <summary>More options</summary>
              <fieldset class="two">
-              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
                ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
                ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
                ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp

index 562635555e0abaf5fc2f304332ed0f7f66249cd8..58f5a5684ac115fb6300d6a88a954bc69f8ac085 100644 (file)
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -607,7 +607,7 @@ static json oaicompat_completion_params_parse(
      }
  
      // Copy remaining properties to llama_params
-    // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
+    // This allows the user to pass llama.cpp-specific params (e.g. "mirostat") via the OAI endpoint.
      // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
      for (const auto & item : body.items()) {
          // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
diff --git a/include/llama.h b/include/llama.h

index b2d1e7d5ae16b68cc49c0eb864b24153b97d385a..4076d34a7ae5a018788ca8af5431abfaccd86e1e 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -1087,9 +1087,6 @@ extern "C" {
      /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
      LLAMA_API struct llama_sampler * llama_sampler_init_min_p      (float   p, size_t min_keep);
  
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free  (float   z, size_t min_keep);
-
      /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
      LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, size_t min_keep);
  
diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py

index 47cacb432158907f2b886c4c26016b2caff6af84..8f0bf8ca8aa555db35467096e08ef5b82e321200 100755 (executable)
--- a/scripts/run-with-preset.py
+++ b/scripts/run-with-preset.py
@@ -20,7 +20,7 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
      "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
      "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
      "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
-    "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
+    "simple-io", "tensor-split", "threads", "temp", "top-k", "top-p", "typical",
      "verbose-prompt"
  ]
  
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp

index 25536eb6c5a063f7e003f8fed9cba4a936e7e8c4..c2cfe0a77ad842bd2f303a860ff10a56aa811706 100644 (file)
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -113,7 +113,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
  }
  
  static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
-    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
      // if (k >= (int32_t)cur_p->size) {
      //     return;
      // }
@@ -733,101 +733,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
      };
  }
  
-// tail-free
-
-struct llama_sampler_tail_free {
-    const float  z;
-    const size_t min_keep;
-};
-
-static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
-    return "tail-free";
-}
-
-static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
-
-    if (ctx->z >= 1.0f || cur_p->size <= 2) {
-        return;
-    }
-
-    llama_sampler_softmax_impl(cur_p);
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(cur_p->size - 1);
-    std::vector<float> second_derivatives(cur_p->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    {
-        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-
-        if (second_derivatives_sum > 1e-6f) {
-            for (float & value : second_derivatives) {
-                value /= second_derivatives_sum;
-            }
-        } else {
-            for (float & value : second_derivatives) {
-                value = 1.0f / second_derivatives.size();
-            }
-        }
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = cur_p->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->z && i >= ctx->min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    cur_p->size = last_idx;
-}
-
-static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
-    return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
-}
-
-static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
-    delete (llama_sampler_tail_free *) smpl->ctx;
-}
-
-static struct llama_sampler_i llama_sampler_tail_free_i = {
-    /* .name   = */ llama_sampler_tail_free_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_tail_free_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_tail_free_clone,
-    /* .free   = */ llama_sampler_tail_free_free,
-};
-
-struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
-    return new llama_sampler {
-        /* .iface = */ &llama_sampler_tail_free_i,
-        /* .ctx   = */ new llama_sampler_tail_free {
-            /* .z        = */ z,
-            /*. min_keep = */ min_keep,
-        },
-    };
-}
-
  // typical
  
  struct llama_sampler_typical {
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp

index eb39661c3698fe607786e120029a77e3fa403dcf..be370044d6f58671d8f59cb4b8b0e7442c46f51d 100644 (file)
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -105,16 +105,6 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
      tester.check();
  }
  
-static void test_tfs(const std::vector<float> & probs, const std::vector<float> & probs_expected, float z) {
-    sampler_tester tester(probs, probs_expected);
-
-    DUMP(&tester.cur_p);
-    tester.apply(llama_sampler_init_tail_free(z, 1));
-    DUMP(&tester.cur_p);
-
-    tester.check();
-}
-
  static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
      sampler_tester tester(probs, probs_expected);
  
@@ -202,7 +192,6 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
      for (auto s : samplers_sequence) {
          switch (s){
              case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
-            case 'f': GGML_ABORT("tail_free test not implemented");
              case 'y': GGML_ABORT("typical test not implemented");
              case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
              case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
@@ -299,12 +288,11 @@ static void test_perf() {
          data.emplace_back(llama_token_data{i, logit, 0.0f});
      }
  
-    BENCH(llama_sampler_init_top_k    (40),                     data, 32);
-    BENCH(llama_sampler_init_top_p    (0.8f, 1),                data, 32);
-    BENCH(llama_sampler_init_min_p    (0.2f, 1),                data, 32);
-    BENCH(llama_sampler_init_tail_free(0.5f, 1),                data, 32);
-    BENCH(llama_sampler_init_typical  (0.5f, 1),                data, 32);
-    BENCH(llama_sampler_init_xtc      (1.0f, 0.1f, 1, 1),       data, 32);
+    BENCH(llama_sampler_init_top_k  (40),                     data, 32);
+    BENCH(llama_sampler_init_top_p  (0.8f, 1),                data, 32);
+    BENCH(llama_sampler_init_min_p  (0.2f, 1),                data, 32);
+    BENCH(llama_sampler_init_typical(0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_xtc    (1.0f, 0.1f, 1, 1),       data, 32);
  }
  
  int main(void) {
@@ -343,10 +331,6 @@ int main(void) {
      printf("XTC should not:\n");
      test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
  
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
-
      test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
      test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
author	Georgi Gerganov <redacted>
	Tue, 29 Oct 2024 08:42:05 +0000 (10:42 +0200)
committer	GitHub <redacted>
	Tue, 29 Oct 2024 08:42:05 +0000 (10:42 +0200)
common/arg.cpp		patch \| blob \| history
common/common.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
common/sampling.cpp		patch \| blob \| history
examples/main/README.md		patch \| blob \| history
examples/server/README.md		patch \| blob \| history
examples/server/public/index-new.html		patch \| blob \| history
examples/server/public/index.html		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history
examples/server/themes/buttons-top/index.html		patch \| blob \| history
examples/server/themes/wild/index.html		patch \| blob \| history
examples/server/utils.hpp		patch \| blob \| history
include/llama.h		patch \| blob \| history
scripts/run-with-preset.py		patch \| blob \| history
src/llama-sampling.cpp		patch \| blob \| history
tests/test-sampling.cpp		patch \| blob \| history