server : add support for `embd_normalize` parameter (#14964)

author Daniel Bevenius <redacted>

Wed, 30 Jul 2025 16:07:11 +0000 (18:07 +0200)

committer GitHub <redacted>

Wed, 30 Jul 2025 16:07:11 +0000 (18:07 +0200)
author Daniel Bevenius <redacted>
Wed, 30 Jul 2025 16:07:11 +0000 (18:07 +0200)
committer GitHub <redacted>
Wed, 30 Jul 2025 16:07:11 +0000 (18:07 +0200)
diff --git a/tools/server/README.md b/tools/server/README.md

index aa07f1ef5b1776e654224579a19ed0531fd7b1f8..f3f4caed85cf5a0928d0fcac95a828043d9be0c5 100644 (file)
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -644,6 +644,15 @@ The same as [the embedding example](../embedding) does.
  
  `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
  
+`embd_normalize`: Normalization applied to pooled embeddings (default: `2`). It is ignored when the pooling type is `none`. Can be one of the following values:
+```
+  -1: No normalization
+   0: Max absolute
+   1: Taxicab
+   2: Euclidean/L2
+  >2: P-Norm
+```
+
  ### POST `/reranking`: Rerank documents according to a given query
  
  Similar to https://jina.ai/reranker/ but might change in the future.
diff --git a/tools/server/server.cpp b/tools/server/server.cpp

index 022b5d0b310340044bfa3e3233d7700cfa83246f..2e4c40af7839a6f1936af678b4023cc096fa4488 100644 (file)
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -138,6 +138,9 @@ struct slot_params {
      std::string                  oaicompat_cmpl_id;
      common_chat_syntax           oaicompat_chat_syntax;
  
+    // Embeddings
+    int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
+
      json to_json() const {
          std::vector<std::string> samplers;
          samplers.reserve(sampling.samplers.size());
@@ -2601,7 +2604,7 @@ struct server_context {
  
              // normalize only when there is pooling
              if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
-                common_embd_normalize(embd, embd_res.data(), n_embd, 2);
+                common_embd_normalize(embd, embd_res.data(), n_embd, slot.params.embd_normalize);
                  res->embedding.push_back(embd_res);
                  break;
              } else {
@@ -4614,6 +4617,14 @@ int main(int argc, char ** argv) {
              }
          }
  
+        int embd_normalize = 2; // default to Euclidean/L2 norm
+        if (body.count("embd_normalize") != 0) {
+            embd_normalize = body.at("embd_normalize");
+            if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+                SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx));
+            }
+        }
+
          // create and queue the task
          json responses = json::array();
          bool error = false;
@@ -4629,6 +4640,7 @@ int main(int argc, char ** argv) {
  
                  // OAI-compat
                  task.params.oaicompat = oaicompat;
+                task.params.embd_normalize = embd_normalize;
  
                  tasks.push_back(std::move(task));
              }
author	Daniel Bevenius <redacted>
	Wed, 30 Jul 2025 16:07:11 +0000 (18:07 +0200)
committer	GitHub <redacted>
	Wed, 30 Jul 2025 16:07:11 +0000 (18:07 +0200)
tools/server/README.md		patch \| blob \| history
tools/server/server.cpp		patch \| blob \| history