params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+ add_opt(common_arg(
+ {"--webui-config"}, "JSON",
+ "JSON that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+ add_opt(common_arg(
+ {"--webui-config-file"}, "PATH",
+ "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = read_file(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
std::map<std::string, std::string> default_template_kwargs;
+ // webui configs
+ bool webui = true;
+ std::string webui_config_json;
+
// "advanced" endpoints are disabled by default for better security
- bool webui = true;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
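For readers skimming the diff, a hedged sketch of what a `--webui-config` payload could look like; the key names are taken from the syncable WebUI settings added further down (SYNCABLE_PARAMETERS), and `--webui-config-file` simply reads the same JSON from a file into `params.webui_config_json`:

```ts
// Illustrative payload for --webui-config / LLAMA_ARG_WEBUI_CONFIG (or saved to a
// file and passed via --webui-config-file). Key names come from the WebUI's
// SYNCABLE_PARAMETERS list in this change; the values are example choices only.
const webuiConfig = {
  pasteLongTextToFileLen: 0,
  pdfAsImage: true,
  renderUserContentAsMarkdown: false,
  showMessageStats: true
};

// The server keeps the string verbatim in params.webui_config_json and parses it
// with json::parse() at startup, so it must be valid JSON.
console.log(JSON.stringify(webuiConfig));
```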
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
+| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
+| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
+| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
+| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
+| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
-| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
-| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
-| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
-| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
+| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
+| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
+| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
+| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--temp N` | temperature (default: 0.8) |
-| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
+| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
+| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
+| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
server_metrics metrics;
+ json webui_settings = json::object();
+
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
params_base = params;
+ webui_settings = json::object();
+ if (!params_base.webui_config_json.empty()) {
+ try {
+ webui_settings = json::parse(params_base.webui_config_json);
+ } catch (const std::exception & e) {
+ SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
+ return false;
+ }
+ }
+
llama_init = common_init_from_params(params_base);
model = llama_init->model();
};
}
- // this endpoint is publicly available, please only return what is safe to be exposed
json data = {
{ "default_generation_settings", default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "endpoint_props", params.endpoint_props },
{ "endpoint_metrics", params.endpoint_metrics },
{ "webui", params.webui },
+ { "webui_settings", ctx_server.webui_settings },
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
{"params", json{}},
{"n_ctx", 0},
}},
+ {"webui_settings", webui_settings},
});
return res;
}
#include "common.h"
#include "preset.h"
+#include "server-common.h"
#include "server-http.h"
#include <mutex>
struct server_models_routes {
common_params params;
+ json webui_settings = json::object();
server_models models;
server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
: params(params), models(params, argc, argv, envp) {
+ if (!this->params.webui_config_json.empty()) {
+ try {
+ webui_settings = json::parse(this->params.webui_config_json);
+ } catch (const std::exception & e) {
+ LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
+ throw;
+ }
+ }
init_routes();
}
#include "log.h"
#include <atomic>
+#include <exception>
#include <signal.h>
#include <thread> // for std::thread::hardware_concurrency
std::optional<server_models_routes> models_routes{};
if (is_router_server) {
// setup server instances manager
- models_routes.emplace(params, argc, argv, envp);
+ try {
+ models_routes.emplace(params, argc, argv, envp);
+ } catch (const std::exception & e) {
+ LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
+ return 1;
+ }
// proxy handlers
// note: routes.get_health stays the same
<script lang="ts">
+ import { base } from '$app/paths';
import { AlertTriangle, RefreshCw, Key, CheckCircle, XCircle } from '@lucide/svelte';
import { goto } from '$app/navigation';
import { Button } from '$lib/components/ui/button';
settingsStore.updateConfig('apiKey', apiKeyInput.trim());
// Test the API key by making a real request to the server
- const response = await fetch('./props', {
+ const response = await fetch(`${base}/props`, {
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${apiKeyInput.trim()}`
expect(result.max_tokens).toBe(-1);
expect(result.temperature).toBe(0.7);
});
+
+ it('should merge webui settings from props when provided', () => {
+ const result = ParameterSyncService.extractServerDefaults(null, {
+ pasteLongTextToFileLen: 0,
+ pdfAsImage: true,
+ renderUserContentAsMarkdown: false,
+ theme: 'dark'
+ });
+
+ expect(result.pasteLongTextToFileLen).toBe(0);
+ expect(result.pdfAsImage).toBe(true);
+ expect(result.renderUserContentAsMarkdown).toBe(false);
+ expect(result.theme).toBeUndefined();
+ });
});
});
{ key: 'dry_allowed_length', serverKey: 'dry_allowed_length', type: 'number', canSync: true },
{ key: 'dry_penalty_last_n', serverKey: 'dry_penalty_last_n', type: 'number', canSync: true },
{ key: 'max_tokens', serverKey: 'max_tokens', type: 'number', canSync: true },
- { key: 'samplers', serverKey: 'samplers', type: 'string', canSync: true }
+ { key: 'samplers', serverKey: 'samplers', type: 'string', canSync: true },
+ {
+ key: 'pasteLongTextToFileLen',
+ serverKey: 'pasteLongTextToFileLen',
+ type: 'number',
+ canSync: true
+ },
+ { key: 'pdfAsImage', serverKey: 'pdfAsImage', type: 'boolean', canSync: true },
+ {
+ key: 'showThoughtInProgress',
+ serverKey: 'showThoughtInProgress',
+ type: 'boolean',
+ canSync: true
+ },
+ { key: 'showToolCalls', serverKey: 'showToolCalls', type: 'boolean', canSync: true },
+ {
+ key: 'disableReasoningFormat',
+ serverKey: 'disableReasoningFormat',
+ type: 'boolean',
+ canSync: true
+ },
+ { key: 'keepStatsVisible', serverKey: 'keepStatsVisible', type: 'boolean', canSync: true },
+ { key: 'showMessageStats', serverKey: 'showMessageStats', type: 'boolean', canSync: true },
+ {
+ key: 'askForTitleConfirmation',
+ serverKey: 'askForTitleConfirmation',
+ type: 'boolean',
+ canSync: true
+ },
+ { key: 'disableAutoScroll', serverKey: 'disableAutoScroll', type: 'boolean', canSync: true },
+ {
+ key: 'renderUserContentAsMarkdown',
+ serverKey: 'renderUserContentAsMarkdown',
+ type: 'boolean',
+ canSync: true
+ },
+ { key: 'autoMicOnEmpty', serverKey: 'autoMicOnEmpty', type: 'boolean', canSync: true },
+ {
+ key: 'pyInterpreterEnabled',
+ serverKey: 'pyInterpreterEnabled',
+ type: 'boolean',
+ canSync: true
+ },
+ {
+ key: 'enableContinueGeneration',
+ serverKey: 'enableContinueGeneration',
+ type: 'boolean',
+ canSync: true
+ }
];
export class ParameterSyncService {
* Extract server default parameters that can be synced
*/
static extractServerDefaults(
- serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null
+ serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null,
+ webuiSettings?: Record<string, string | number | boolean>
): ParameterRecord {
- if (!serverParams) return {};
-
const extracted: ParameterRecord = {};
- for (const param of SYNCABLE_PARAMETERS) {
- if (param.canSync && param.serverKey in serverParams) {
- const value = (serverParams as unknown as Record<string, ParameterValue>)[param.serverKey];
- if (value !== undefined) {
- // Apply precision rounding to avoid JavaScript floating-point issues
- extracted[param.key] = this.roundFloatingPoint(value);
+ if (serverParams) {
+ for (const param of SYNCABLE_PARAMETERS) {
+ if (param.canSync && param.serverKey in serverParams) {
+ const value = (serverParams as unknown as Record<string, ParameterValue>)[
+ param.serverKey
+ ];
+ if (value !== undefined) {
+ // Apply precision rounding to avoid JavaScript floating-point issues
+ extracted[param.key] = this.roundFloatingPoint(value);
+ }
}
}
+
+ // Handle samplers array conversion to string
+ if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
+ extracted.samplers = serverParams.samplers.join(';');
+ }
}
- // Handle samplers array conversion to string
- if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
- extracted.samplers = serverParams.samplers.join(';');
+ if (webuiSettings) {
+ for (const param of SYNCABLE_PARAMETERS) {
+ if (param.canSync && param.serverKey in webuiSettings) {
+ const value = webuiSettings[param.serverKey];
+ if (value !== undefined) {
+ extracted[param.key] = this.roundFloatingPoint(value);
+ }
+ }
+ }
}
return extracted;
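A hedged sketch of the merge order implemented above: generation defaults from `/props` are applied first, then matching keys from `webui_settings` overwrite them. Import paths are assumed and the partial server-params object is cast purely for illustration.

```ts
import { ParameterSyncService } from '$lib/services/parameter-sync'; // import path assumed
import type { ApiLlamaCppServerProps } from '$lib/types/api';        // import path assumed

// Generation defaults as they would arrive in /props (partial object, cast for illustration).
const serverParams = {
  max_tokens: -1,
  samplers: ['top_k', 'top_p']
} as ApiLlamaCppServerProps['default_generation_settings']['params'];

// webui_settings as provided via --webui-config.
const webuiSettings = { pdfAsImage: true, renderUserContentAsMarkdown: false };

const merged = ParameterSyncService.extractServerDefaults(serverParams, webuiSettings);
// merged.max_tokens === -1
// merged.samplers === 'top_k;top_p'   (samplers array is joined with ';')
// merged.pdfAsImage === true, merged.renderUserContentAsMarkdown === false
```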
return this.props?.default_generation_settings?.n_ctx ?? null;
}
+ get webuiSettings(): Record<string, string | number | boolean> | undefined {
+ return this.props?.webui_settings;
+ }
+
get isRouterMode(): boolean {
return this.role === ServerRole.ROUTER;
}
*/
private getServerDefaults(): Record<string, string | number | boolean> {
const serverParams = serverStore.defaultParams;
- return serverParams ? ParameterSyncService.extractServerDefaults(serverParams) : {};
+ const webuiSettings = serverStore.webuiSettings;
+ return ParameterSyncService.extractServerDefaults(serverParams, webuiSettings);
}
constructor() {
bos_token: string;
eos_token: string;
build_info: string;
+ webui_settings?: Record<string, string | number | boolean>;
}
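A minimal consumption sketch for the new optional field, assuming the type is importable from the WebUI's API types module (import path illustrative):

```ts
import type { ApiLlamaCppServerProps } from '$lib/types/api'; // import path assumed

// webui_settings is optional in the type: servers built without this change never
// send it, so consumers fall back to an empty object.
function readWebuiSettings(
  props: ApiLlamaCppServerProps
): Record<string, string | number | boolean> {
  return props.webui_settings ?? {};
}
```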
export interface ApiChatCompletionRequest {
+import { base } from '$app/paths';
import { error } from '@sveltejs/kit';
import { browser } from '$app/environment';
import { config } from '$lib/stores/settings.svelte';
headers.Authorization = `Bearer ${apiKey}`;
}
- const response = await fetch(`./props`, { headers });
+ const response = await fetch(`${base}/props`, { headers });
if (!response.ok) {
if (response.status === 401 || response.status === 403) {
<script lang="ts">
import '../app.css';
+ import { base } from '$app/paths';
import { page } from '$app/state';
import { untrack } from 'svelte';
import { ChatSidebar, DialogConversationTitleUpdate } from '$lib/components/app';
headers.Authorization = `Bearer ${apiKey.trim()}`;
}
- fetch(`./props`, { headers })
+ fetch(`${base}/props`, { headers })
.then((response) => {
if (response.status === 401 || response.status === 403) {
window.location.reload();