git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
server : use different seeds for child completions (#18700)
author    Georgi Gerganov <redacted>
          Fri, 9 Jan 2026 07:33:50 +0000 (09:33 +0200)
committer GitHub <redacted>
          Fri, 9 Jan 2026 07:33:50 +0000 (09:33 +0200)
* server : use different seeds for child completions

* cont : handle default seed

* cont : note
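
For reference, a minimal standalone sketch of the per-child seed assignment this commit introduces (not part of the commit itself; it mirrors the server-context.cpp hunk below, with the LLAMA_DEFAULT_SEED sentinel value taken from llama.h):

    // sketch: derive per-child sampling seeds from the parent request
    #include <cstdint>
    #include <vector>

    static constexpr uint32_t LLAMA_DEFAULT_SEED = 0xFFFFFFFF; // "random seed" sentinel from llama.h

    std::vector<uint32_t> child_seeds(uint32_t parent_seed, size_t n_children) {
        std::vector<uint32_t> seeds;
        for (size_t j = 0; j < n_children; j++) {
            // an explicit seed is offset per child so the completions differ;
            // the default (random) seed is left untouched for every child
            seeds.push_back(parent_seed != LLAMA_DEFAULT_SEED ? parent_seed + (uint32_t) (j + 1) : parent_seed);
        }
        return seeds;
    }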

include/llama.h
src/llama-sampling.cpp
tools/server/server-context.cpp
tools/server/tests/unit/test_chat_completion.py

diff --git a/include/llama.h b/include/llama.h
index 12e4e57d0e5857283afaaf5c2578d7492704333c..1c17efb9fa1c9f2e2b45cd073704cd35de80e5c8 100644
@@ -1292,7 +1292,9 @@ extern "C" {
     // available samplers:
 
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
+
+    /// seed == LLAMA_DEFAULT_SEED to use a random seed.
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
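
The new doc comment above covers the seed semantics of llama_sampler_init_dist. A hedged usage sketch (built only from the existing llama.cpp sampler-chain C API; error handling omitted):

    // sketch: seed == LLAMA_DEFAULT_SEED requests a random seed, any other value is reproducible
    #include "llama.h"

    static llama_sampler * make_dist_sampler(uint32_t seed) {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
        return chain;
    }

    // make_dist_sampler(42)                 -> deterministic, repeatable sampling
    // make_dist_sampler(LLAMA_DEFAULT_SEED) -> a random seed is chosen internally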
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 48291a3a7c5e14aa62d6c19b4ccf1a4b5cd18e16..11f0394c4ceb15daf5ab21c6c92af33a94b5575e 100644
@@ -2142,7 +2142,7 @@ struct llama_sampler_xtc {
     const uint32_t seed;
     uint32_t       seed_cur;
 
-    std::mt19937    rng;
+    std::mt19937   rng;
 };
 
 static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 33635a158664ce8d0237a3144a98a29f4fb895bd..e1f65dfcceccac91f9caf083381f698c1c4e630d 100644
@@ -4,7 +4,6 @@
 #include "server-task.h"
 #include "server-queue.h"
 
-#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@@ -16,7 +15,6 @@
 #include <cstddef>
 #include <cinttypes>
 #include <memory>
-#include <unordered_set>
 #include <filesystem>
 
 // fix problem with std::min and std::max
@@ -2927,9 +2925,14 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
             if (task.params.n_cmpl > 1) {
                 task.n_children = task.params.n_cmpl - 1;
                 for (size_t j = 0; j < task.n_children; j++) {
-                    server_task child = task.create_child(
-                        task.id,
-                        rd.get_new_id());
+                    server_task child = task.create_child(task.id, rd.get_new_id());
+
+                    // use different sampling seed for each child
+                    // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
+                    if (child.params.sampling.seed != LLAMA_DEFAULT_SEED) {
+                        child.params.sampling.seed += j + 1;
+                    }
+
                     tasks.push_back(std::move(child));
                 }
             }
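
To make the effect of the hunk above concrete, a hypothetical example (values are illustrative, not from the commit): a request asking for three completions with an explicit seed keeps that seed for the parent task and offsets each child by j + 1, while the default seed is left alone so every task still samples with an independent random seed.

    // parent seed 1000, n_cmpl = 3          -> task seeds 1000, 1001, 1002
    // parent seed LLAMA_DEFAULT_SEED, any n -> all tasks keep LLAMA_DEFAULT_SEED (independent random seeds)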
diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
index 5f5de415cf8f3a70654a172de3912f2c71cbaca5..d0ce01bc6eca95f2571c26202475b1d88410b214 100644
@@ -503,5 +503,4 @@ def test_chat_completions_multiple_choices():
     assert len(res.body["choices"]) == 2
     for choice in res.body["choices"]:
         assert "assistant" == choice["message"]["role"]
-        assert match_regex("Suddenly", choice["message"]["content"])
         assert choice["finish_reason"] == "length"