printf("\n=== Done dumping\n");
}
+
+// Scales the n-element vector `inp` to unit Euclidean (L2) length and writes the
+// result to `out`.
+//
+//   inp : source vector, at least n floats
+//   out : destination, at least n floats; may be the same buffer as `inp`, since
+//         the norm is fully computed before any element is written
+//   n   : number of elements; nothing is written when n <= 0
+//
+// If the norm is not strictly positive (all-zero input, or a NaN sum) the scale
+// factor is 0 instead of 1/norm, so no division by zero takes place.
+void llama_embd_normalize(const float * inp, float * out, int n) {
+    double sum = 0.0;
+    for (int i = 0; i < n; i++) {
+        // widen before squaring: a float product overflows to inf once |x| exceeds
+        // ~1.8e19 and is rounded to float precision before reaching the accumulator
+        const double v = inp[i];
+        sum += v * v;
+    }
+    sum = sqrt(sum);
+
+    const float norm = sum > 0.0 ? (float) (1.0 / sum) : 0.0f;
+
+    for (int i = 0; i < n; i++) {
+        out[i] = inp[i] * norm;
+    }
+}
+
// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+//
+// Embedding utils
+//
+
+// Scales the n floats at `inp` to unit Euclidean (L2) length and stores them in `out`.
+// When the computed norm is not greater than zero, a scale factor of 0 is used instead
+// of 1/norm, so an all-zero input produces an all-zero output.
+void llama_embd_normalize(const float * inp, float * out, int n);
+
}
}
-static void normalize(const float * vec, float * out, int n) {
- float norm = 0;
- for (int i = 0; i < n; i++) {
- norm += vec[i] * vec[i];
- }
- norm = sqrt(norm);
- for (int i = 0; i < n; i++) {
- out[i] = vec[i] / norm;
- }
-}
-
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
fprintf(stderr, "%s : failed to decode\n", __func__);
}
- // normalize on copy
for (int i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
float * out = output + batch.seq_id[i][0] * n_embd;
- normalize(embd, out, n_embd);
+ llama_embd_normalize(embd, out, n_embd);
}
}
const int n_embd = llama_n_embd(model);
+ std::vector<float> embd_res(n_embd, 0.0f);
+
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
continue;
continue;
}
+ llama_embd_normalize(embd, embd_res.data(), n_embd);
+
res.data = json {
- {"embedding", std::vector<float>(embd, embd + n_embd)},
+ {"embedding", embd_res},
};
}
// get the result
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+ // append to the responses
responses.push_back(result.data);
}