));
add_opt(common_arg(
{"-a", "--alias"}, "STRING",
- "set alias for model name (to be used by REST API)",
+ "set model name aliases, comma-separated (to be used by API)",
[](common_params & params, const std::string & value) {
- params.model_alias = value;
+ for (auto & alias : string_split<std::string>(value, ',')) {
+ alias = string_strip(alias);
+ if (!alias.empty()) {
+ params.model_alias.insert(alias);
+ }
+ }
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
+ add_opt(common_arg(
+ {"--tags"}, "STRING",
+ "set model tags, comma-separated (informational, not used for routing)",
+ [](common_params & params, const std::string & value) {
+ for (auto & tag : string_split<std::string>(value, ',')) {
+ tag = string_strip(tag);
+ if (!tag.empty()) {
+ params.model_tags.insert(tag);
+ }
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
struct common_params_model model;
- std::string model_alias = ""; // model alias // NOLINT
+ std::set<std::string> model_alias; // model aliases // NOLINT
+ std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
+| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--temp N` | temperature (default: 0.80) |
+| `--temp, --temperature N` | temperature (default: 0.80) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
-| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
+| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
+| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
+| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--temp N` | temperature (default: 0.80) |
+| `--temp, --temperature N` | temperature (default: 0.80) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
-| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
+| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
+| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
+| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--temp N` | temperature (default: 0.80) |
+| `--temp, --temperature N` | temperature (default: 0.80) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
-| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
+| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
+| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
| Argument | Explanation |
| -------- | ----------- |
+| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
+| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
-| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
+| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
+| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
+| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
+| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) |
+| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) |
| `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
| `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
| `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) |
float slot_prompt_similarity = 0.0f;
std::string model_name; // name of the loaded model, to be used by API
+ std::set<std::string> model_aliases; // additional names for the model
+ std::set<std::string> model_tags; // informational tags
bool sleeping = false;
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
if (!params_base.model_alias.empty()) {
- // user explicitly specified model name
- model_name = params_base.model_alias;
+ // backward compat: use the lexicographically-first alias as the model name (std::set iterates in sorted order, not the order the user passed)
+ model_name = *params_base.model_alias.begin();
} else if (!params_base.model.name.empty()) {
- // use model name in registry format (for models in cache)
model_name = params_base.model.name;
} else {
// fallback: derive model name from file name
model_name = model_path.filename().string();
}
+ model_aliases = params_base.model_alias;
+ model_tags = params_base.model_tags;
+
if (!is_resume) {
return init();
}
return server_context_meta {
/* build_info */ build_info,
/* model_name */ impl->model_name,
+ /* model_aliases */ impl->model_aliases,
+ /* model_tags */ impl->model_tags,
/* model_path */ impl->params_base.model.path,
/* has_mtmd */ impl->mctx != nullptr,
/* has_inp_image */ impl->chat_params.allow_image,
{"data", {
{
{"id", meta->model_name},
+ {"aliases", meta->model_aliases},
+ {"tags", meta->model_tags},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
#include <cstddef>
#include <memory>
+#include <set>
struct server_context_impl; // private implementation
struct server_context_meta {
std::string build_info;
std::string model_name;
+ std::set<std::string> model_aliases;
+ std::set<std::string> model_tags;
std::string model_path;
bool has_mtmd;
bool has_inp_image;
if (mapping.find(meta.name) != mapping.end()) {
throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
}
+
+ // reject this model if its name collides with an alias of an already-registered model
+ for (const auto & [key, inst] : mapping) {
+ if (inst.meta.aliases.count(meta.name)) {
+ throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'",
+ meta.name.c_str(), key.c_str()));
+ }
+ }
+
+ // parse aliases from preset's --alias option (comma-separated)
+ std::string alias_str;
+ if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
+ for (auto & alias : string_split<std::string>(alias_str, ',')) {
+ alias = string_strip(alias);
+ if (!alias.empty()) {
+ meta.aliases.insert(alias);
+ }
+ }
+ }
+
+ // parse tags from preset's --tags option (comma-separated)
+ std::string tags_str;
+ if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) {
+ for (auto & tag : string_split<std::string>(tags_str, ',')) {
+ tag = string_strip(tag);
+ if (!tag.empty()) {
+ meta.tags.insert(tag);
+ }
+ }
+ }
+
+ // validate aliases do not conflict with existing names or aliases
+ for (const auto & alias : meta.aliases) {
+ if (mapping.find(alias) != mapping.end()) {
+ throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name",
+ alias.c_str(), meta.name.c_str()));
+ }
+ for (const auto & [key, inst] : mapping) {
+ if (inst.meta.aliases.count(alias)) {
+ throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'",
+ alias.c_str(), meta.name.c_str(), key.c_str()));
+ }
+ }
+ }
+
meta.update_args(ctx_preset, bin_path); // render args
std::string name = meta.name;
mapping[name] = instance_t{
server_model_meta meta{
/* preset */ preset.second,
/* name */ preset.first,
+ /* aliases */ {},
+ /* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
for (const auto & [name, preset] : custom_presets) {
custom_names.insert(name);
}
+ auto join_set = [](const std::set<std::string> & s) {
+ std::string result;
+ for (const auto & v : s) {
+ if (!result.empty()) {
+ result += ", ";
+ }
+ result += v;
+ }
+ return result;
+ };
+
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
for (const auto & [name, inst] : mapping) {
bool has_custom = custom_names.find(name) != custom_names.end();
- SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str());
+ std::string info;
+ if (!inst.meta.aliases.empty()) {
+ info += " (aliases: " + join_set(inst.meta.aliases) + ")";
+ }
+ if (!inst.meta.tags.empty()) {
+ info += " [tags: " + join_set(inst.meta.tags) + "]";
+ }
+ SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
}
}
bool server_models::has_model(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
- return mapping.find(name) != mapping.end();
+ if (mapping.find(name) != mapping.end()) {
+ return true;
+ }
+ for (const auto & [key, inst] : mapping) {
+ if (inst.meta.aliases.count(name)) {
+ return true;
+ }
+ }
+ return false;
}
std::optional<server_model_meta> server_models::get_meta(const std::string & name) {
if (it != mapping.end()) {
return it->second.meta;
}
+ for (const auto & [key, inst] : mapping) {
+ if (inst.meta.aliases.count(name)) {
+ return inst.meta;
+ }
+ }
return std::nullopt;
}
res->data = safe_json_to_str({{ "error", error_data }});
}
-static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
+static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
if (name.empty()) {
res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
return false;
res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST));
return false;
}
+ // resolve alias to canonical model name
+ name = meta->name;
if (models_autoload) {
models.ensure_model_loaded(name);
} else {
auto res = std::make_unique<server_http_res>();
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
- auto model = models.get_meta(name);
- if (!model.has_value()) {
+ auto meta = models.get_meta(name);
+ if (!meta.has_value()) {
res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
return res;
}
- if (model->status == SERVER_MODEL_STATUS_LOADED) {
+ if (meta->status == SERVER_MODEL_STATUS_LOADED) {
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
- models.load(name);
+ models.load(meta->name);
res_ok(res, {{"success", true}});
return res;
};
preset_copy.unset_option("LLAMA_ARG_HOST");
preset_copy.unset_option("LLAMA_ARG_PORT");
preset_copy.unset_option("LLAMA_ARG_ALIAS");
+ preset_copy.unset_option("LLAMA_ARG_TAGS");
status["preset"] = preset_copy.to_ini();
}
if (meta.is_failed()) {
}
models_json.push_back(json {
{"id", meta.name},
+ {"aliases", meta.aliases},
+ {"tags", meta.tags},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
- models.unload(name);
+ models.unload(model->name);
res_ok(res, {{"success", true}});
return res;
};
struct server_model_meta {
common_preset preset;
std::string name;
+ std::set<std::string> aliases; // additional names that resolve to this model
+ std::set<std::string> tags; // informational tags, not used for routing
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
// for consistency between server router mode and single-model mode, we set the same model name as alias
if (params.model_alias.empty() && !params.model.name.empty()) {
- params.model_alias = params.model.name;
+ params.model_alias.insert(params.model.name);
}
common_init();
server.start()
res = requests.get(url)
assert res.status_code == 404
+
+
+def test_server_model_aliases_and_tags():
+ global server
+ server.model_alias = "tinyllama-2,fim,code"
+ server.model_tags = "chat,fim,small"
+ server.start()
+ res = server.make_request("GET", "/models")
+ assert res.status_code == 200
+ assert len(res.body["data"]) == 1
+ model = res.body["data"][0]
+ # aliases field must contain all aliases
+ assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"}
+ # tags field must contain all tags
+ assert set(model["tags"]) == {"chat", "fim", "small"}
+ # id is derived from first alias (alphabetical order from std::set)
+ assert model["id"] == "code"
# custom options
model_alias: str | None = None
+ model_tags: str | None = None
model_url: str | None = None
model_file: str | None = None
model_draft: str | None = None
server_args.extend(["--pooling", self.pooling])
if self.model_alias:
server_args.extend(["--alias", self.model_alias])
+ if self.model_tags:
+ server_args.extend(["--tags", self.model_tags])
if self.n_ctx:
server_args.extend(["--ctx-size", self.n_ctx])
if self.n_slots: