git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
server : use different seeds for child completions (#18700)
author    Georgi Gerganov <redacted>
          Fri, 9 Jan 2026 07:33:50 +0000 (09:33 +0200)
committer GitHub <redacted>
          Fri, 9 Jan 2026 07:33:50 +0000 (09:33 +0200)
* server : use different seeds for child completions

* cont : handle default seed

* cont : note
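
For reference, a minimal standalone sketch of the per-child seed assignment this commit introduces (not part of the commit itself; it mirrors the server-context.cpp hunk below, with the LLAMA_DEFAULT_SEED sentinel value taken from llama.h):

    // sketch: derive per-child sampling seeds from the parent request
    #include <cstdint>
    #include <vector>

    static constexpr uint32_t LLAMA_DEFAULT_SEED = 0xFFFFFFFF; // "random seed" sentinel from llama.h

    std::vector<uint32_t> child_seeds(uint32_t parent_seed, size_t n_children) {
        std::vector<uint32_t> seeds;
        for (size_t j = 0; j < n_children; j++) {
            // an explicit seed is offset per child so the completions differ;
            // the default (random) seed is left untouched for every child
            seeds.push_back(parent_seed != LLAMA_DEFAULT_SEED ? parent_seed + (uint32_t) (j + 1) : parent_seed);
        }
        return seeds;
    }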

include/llama.h
src/llama-sampling.cpp
tools/server/server-context.cpp
tools/server/tests/unit/test_chat_completion.py

diff --git a/include/llama.h b/include/llama.h
index 12e4e57d0e5857283afaaf5c2578d7492704333c..1c17efb9fa1c9f2e2b45cd073704cd35de80e5c8 100644
@@ -1292,7 +1292,9 @@ extern "C" {
     // available samplers:
 
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
+
+    /// seed == LLAMA_DEFAULT_SEED to use a random seed.
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
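
The new doc comment above covers the seed semantics of llama_sampler_init_dist. A hedged usage sketch (built only from the existing llama.cpp sampler-chain C API; error handling omitted):

    // sketch: seed == LLAMA_DEFAULT_SEED requests a random seed, any other value is reproducible
    #include "llama.h"

    static llama_sampler * make_dist_sampler(uint32_t seed) {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
        return chain;
    }

    // make_dist_sampler(42)                 -> deterministic, repeatable sampling
    // make_dist_sampler(LLAMA_DEFAULT_SEED) -> a random seed is chosen internally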
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 48291a3a7c5e14aa62d6c19b4ccf1a4b5cd18e16..11f0394c4ceb15daf5ab21c6c92af33a94b5575e 100644
@@ -2142,7 +2142,7 @@ struct llama_sampler_xtc {
     const uint32_t seed;
     uint32_t       seed_cur;
 
-    std::mt19937    rng;
+    std::mt19937   rng;
 };
 
 static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 33635a158664ce8d0237a3144a98a29f4fb895bd..e1f65dfcceccac91f9caf083381f698c1c4e630d 100644
@@ -4,7 +4,6 @@
 #include "server-task.h"
 #include "server-queue.h"
 
-#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@@ -16,7 +15,6 @@
 #include <cstddef>
 #include <cinttypes>
 #include <memory>
-#include <unordered_set>
 #include <filesystem>
 
 // fix problem with std::min and std::max
@@ -2927,9 +2925,14 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
             if (task.params.n_cmpl > 1) {
                 task.n_children = task.params.n_cmpl - 1;
                 for (size_t j = 0; j < task.n_children; j++) {
-                    server_task child = task.create_child(
-                        task.id,
-                        rd.get_new_id());
+                    server_task child = task.create_child(task.id, rd.get_new_id());
+
+                    // use different sampling seed for each child
+                    // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
+                    if (child.params.sampling.seed != LLAMA_DEFAULT_SEED) {
+                        child.params.sampling.seed += j + 1;
+                    }
+
                     tasks.push_back(std::move(child));
                 }
             }
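
To make the effect of the hunk above concrete, a hypothetical example (values are illustrative, not from the commit): a request asking for three completions with an explicit seed keeps that seed for the parent task and offsets each child by j + 1, while the default seed is left alone so every task still samples with an independent random seed.

    // parent seed 1000, n_cmpl = 3          -> task seeds 1000, 1001, 1002
    // parent seed LLAMA_DEFAULT_SEED, any n -> all tasks keep LLAMA_DEFAULT_SEED (independent random seeds)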
diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
index 5f5de415cf8f3a70654a172de3912f2c71cbaca5..d0ce01bc6eca95f2571c26202475b1d88410b214 100644
@@ -503,5 +503,4 @@ def test_chat_completions_multiple_choices():
     assert len(res.body["choices"]) == 2
     for choice in res.body["choices"]:
         assert "assistant" == choice["message"]["role"]
-        assert match_regex("Suddenly", choice["message"]["content"])
         assert choice["finish_reason"] == "length"