+#include "gguf.h" // for reading GGUF splits
#include "arg.h"
#include "log.h"
#include "sampling.h"
#include "chat.h"
+// prevent <windows.h> from defining min/max macros, which break std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
#include <algorithm>
#include <climits>
#include <cstdarg>
#include <thread>
#include <vector>
+
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
+
#include "json-schema-to-grammar.h"
using json = nlohmann::ordered_json;
}
//
-// utils
+// downloader
+//
+
+struct common_hf_file_res {
+ std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;   // GGUF model file reported by the HF API
+    std::string mmprojFile; // multimodal projector file, if any (may be empty)
+};
+
+#ifdef LLAMA_USE_CURL
+
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+# if !defined(PATH_MAX)
+# define PATH_MAX MAX_PATH
+# endif
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
//
-static void common_params_handle_model_default(
- std::string & model,
- const std::string & model_url,
- std::string & hf_repo,
- std::string & hf_file,
- const std::string & hf_token,
- const std::string & model_default) {
- if (!hf_repo.empty()) {
- // short-hand to avoid specifying --hf-file -> default it to --model
- if (hf_file.empty()) {
- if (model.empty()) {
- auto auto_detected = common_get_hf_file(hf_repo, hf_token);
- if (auto_detected.first.empty() || auto_detected.second.empty()) {
- exit(1); // built without CURL, error message already printed
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+ struct curl_slist * ptr = nullptr;
+ ~curl_slist_ptr() {
+ if (ptr) {
+ curl_slist_free_all(ptr);
+ }
+ }
+};
+
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+ int remaining_attempts = max_attempts;
+
+ while (remaining_attempts > 0) {
+ LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+ CURLcode res = curl_easy_perform(curl);
+ if (res == CURLE_OK) {
+ return true;
+ }
+
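+        // exponential backoff: with retry_delay_seconds = 2, the delays are 1 s, 2 s, 4 s, ...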
+ int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+ LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+ remaining_attempts--;
+ std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+ }
+
+ LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
+ return false;
+}
+
+// download one single file from remote URL to local path
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+ // Initialize libcurl
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_slist_ptr http_headers;
+ if (!curl) {
+ LOG_ERR("%s: error initializing libcurl\n", __func__);
+ return false;
+ }
+
+ bool force_download = false;
+
+ // Set the URL, allow to follow http redirection
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+
+ // Check if hf-token or bearer-token was specified
+ if (!bearer_token.empty()) {
+ std::string auth_header = "Authorization: Bearer " + bearer_token;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+ }
+
+#if defined(_WIN32)
+ // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+ // operating system. Currently implemented under MS-Windows.
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+ // Check if the file already exists locally
+ auto file_exists = std::filesystem::exists(path);
+
+ // If the file exists, check its JSON metadata companion file.
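+    // the companion file stores the "url", "etag" and "lastModified" values from the previous download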
+ std::string metadata_path = path + ".json";
+ nlohmann::json metadata;
+ std::string etag;
+ std::string last_modified;
+
+ if (file_exists) {
+ // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+ std::ifstream metadata_in(metadata_path);
+ if (metadata_in.good()) {
+ try {
+ metadata_in >> metadata;
+ LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+ if (metadata.contains("url") && metadata.at("url").is_string()) {
+ auto previous_url = metadata.at("url").get<std::string>();
+ if (previous_url != url) {
+ LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+ return false;
+ }
}
- hf_repo = auto_detected.first;
- hf_file = auto_detected.second;
- } else {
- hf_file = model;
+ if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+ etag = metadata.at("etag");
+ }
+ if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+ last_modified = metadata.at("lastModified");
+ }
+ } catch (const nlohmann::json::exception & e) {
+ LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+ return false;
}
}
- // make sure model path is present (for caching purposes)
- if (model.empty()) {
- // this is to avoid different repo having same file name, or same file name in different subdirs
- std::string filename = hf_repo + "_" + hf_file;
- // to make sure we don't have any slashes in the filename
- string_replace_all(filename, "/", "_");
- model = fs_get_cache_file(filename);
+ } else {
+ LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+ }
+
+ // Send a HEAD request to retrieve the etag and last-modified headers
+ struct common_load_model_from_url_headers {
+ std::string etag;
+ std::string last_modified;
+ };
+
+ common_load_model_from_url_headers headers;
+
+ {
+ typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+ auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+ common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+
+ static std::regex header_regex("([^:]+): (.*)\r\n");
+ static std::regex etag_regex("ETag", std::regex_constants::icase);
+ static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+ std::string header(buffer, n_items);
+ std::smatch match;
+ if (std::regex_match(header, match, header_regex)) {
+ const std::string & key = match[1];
+ const std::string & value = match[2];
+ if (std::regex_match(key, match, etag_regex)) {
+ headers->etag = value;
+ } else if (std::regex_match(key, match, last_modified_regex)) {
+ headers->last_modified = value;
+ }
+ }
+ return n_items;
+ };
+
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+ bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+ if (!was_perform_successful) {
+ return false;
+ }
+
+ long http_code = 0;
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+ if (http_code != 200) {
+ // HEAD not supported, we don't know if the file has changed
+ // force trigger downloading
+ force_download = true;
+ LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
- } else if (!model_url.empty()) {
- if (model.empty()) {
- auto f = string_split<std::string>(model_url, '#').front();
- f = string_split<std::string>(f, '?').front();
- model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+ }
+
+ bool should_download = !file_exists || force_download;
+ if (!should_download) {
+ if (!etag.empty() && etag != headers.etag) {
+ LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+ should_download = true;
+ } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+ LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+ should_download = true;
+ }
+ }
+ if (should_download) {
+ std::string path_temporary = path + ".downloadInProgress";
+ if (file_exists) {
+ LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+ if (remove(path.c_str()) != 0) {
+ LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+ return false;
+ }
+ }
+
+ // Set the output file
+
+ struct FILE_deleter {
+ void operator()(FILE * f) const {
+ fclose(f);
+ }
+ };
+
+ std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
+ if (!outfile) {
+            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
+ return false;
+ }
+
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+ auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+ return fwrite(data, size, nmemb, (FILE *)fd);
+ };
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
+
+ // display download progress
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
+
+ // helper function to hide password in URL
+ auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+ std::size_t protocol_pos = url.find("://");
+ if (protocol_pos == std::string::npos) {
+ return url; // Malformed URL
+ }
+
+ std::size_t at_pos = url.find('@', protocol_pos + 3);
+ if (at_pos == std::string::npos) {
+ return url; // No password in URL
+ }
+
+ return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+ };
+
+ // start the download
+ LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+ llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+ bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+ if (!was_perform_successful) {
+ return false;
+ }
+
+ long http_code = 0;
+ curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+ if (http_code < 200 || http_code >= 400) {
+ LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+ return false;
+ }
+
+ // Causes file to be closed explicitly here before we rename it.
+ outfile.reset();
+
+ // Write the updated JSON metadata file.
+ metadata.update({
+ {"url", url},
+ {"etag", headers.etag},
+ {"lastModified", headers.last_modified}
+ });
+ std::ofstream(metadata_path) << metadata.dump(4);
+ LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+
+ if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+ LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// download multiple files from remote URLs to local paths
+// the input is a vector of pairs <url, path>
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+ // Prepare download in parallel
+ std::vector<std::future<bool>> futures_download;
+ for (auto const & item : urls) {
+ futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
+ return common_download_file_single(it.first, it.second, bearer_token);
+ }, item));
+ }
+
+ // Wait for all downloads to complete
+ for (auto & f : futures_download) {
+ if (!f.get()) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool common_download_model(
+ const common_params_model & model,
+ const std::string & bearer_token) {
+ // Basic validation of the model.url
+ if (model.url.empty()) {
+ LOG_ERR("%s: invalid model url\n", __func__);
+ return false;
+ }
+
+ if (!common_download_file_single(model.url, model.path, bearer_token)) {
+ return false;
+ }
+
+ // check for additional GGUFs split to download
+ int n_split = 0;
+ {
+ struct gguf_init_params gguf_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ NULL,
+ };
+ auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+ if (!ctx_gguf) {
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
+ return false;
+ }
+
+ auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+ if (key_n_split >= 0) {
+ n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+ }
+
+ gguf_free(ctx_gguf);
+ }
+
+ if (n_split > 1) {
+ char split_prefix[PATH_MAX] = {0};
+ char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+ // Verify the first split file format
+ // and extract split URL and PATH prefixes
+ {
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
+ LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+ return false;
+ }
+
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
+ LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
+ return false;
+ }
+ }
+
+ std::vector<std::pair<std::string, std::string>> urls;
+ for (int idx = 1; idx < n_split; idx++) {
+ char split_path[PATH_MAX] = {0};
+ llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+ char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+ llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
+
+ if (std::string(split_path) == model.path) {
+ continue; // skip the already downloaded file
+ }
+
+ urls.push_back({split_url, split_path});
+ }
+
+        // Download in parallel; fail if any split could not be downloaded
+        if (!common_download_file_multiple(urls, bearer_token)) {
+            return false;
+        }
+ }
+
+ return true;
+}
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, defaults to "latest" (meaning it checks for Q4_K_M first, then Q4, and if neither is found, returns the first GGUF file in the repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but we do not use the blobId. Instead, we use the special "ggufFile" field, which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+ auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+ std::string tag = parts.size() > 1 ? parts.back() : "latest";
+ std::string hf_repo = parts[0];
+ if (string_split<std::string>(hf_repo, '/').size() != 2) {
+ throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+ }
+
+ // fetch model info from Hugging Face Hub API
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_slist_ptr http_headers;
+ std::string res_str;
+ std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+ auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+ static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+ return size * nmemb;
+ };
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+ if (!bearer_token.empty()) {
+ std::string auth_header = "Authorization: Bearer " + bearer_token;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+ }
+ // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+ CURLcode res = curl_easy_perform(curl.get());
+
+ if (res != CURLE_OK) {
+ throw std::runtime_error("error: cannot make GET request to HF API");
+ }
+
+    long res_code = 0;
+ std::string ggufFile = "";
+ std::string mmprojFile = "";
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+ if (res_code == 200) {
+ // extract ggufFile.rfilename in json, using regex
+ {
+ std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+ std::smatch match;
+ if (std::regex_search(res_str, match, pattern)) {
+ ggufFile = match[1].str();
+ }
+ }
+ // extract mmprojFile.rfilename in json, using regex
+ {
+ std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+ std::smatch match;
+ if (std::regex_search(res_str, match, pattern)) {
+ mmprojFile = match[1].str();
+ }
+ }
+ } else if (res_code == 401) {
+ throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+ } else {
+ throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+ }
+
+ // check response
+ if (ggufFile.empty()) {
+ throw std::runtime_error("error: model does not have ggufFile");
+ }
+
+ return { hf_repo, ggufFile, mmprojFile };
+}
+
+#else
+
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+ return false;
+}
+
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+ LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+ return false;
+}
+
+static bool common_download_model(
+ const common_params_model &,
+ const std::string &) {
+ LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+ return false;
+}
+
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+ LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+ return {};
+}
+
+#endif // LLAMA_USE_CURL
+
+//
+// utils
+//
+
+static void common_params_handle_model(
+ struct common_params_model & model,
+ const std::string & bearer_token,
+ const std::string & model_path_default,
+ bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
+ // handle pre-fill default model path and url based on hf_repo and hf_file
+ {
+ if (!model.hf_repo.empty()) {
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (model.hf_file.empty()) {
+ if (model.path.empty()) {
+ auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+ if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
+ exit(1); // built without CURL, error message already printed
+ }
+ model.hf_repo = auto_detected.repo;
+ model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
+ } else {
+ model.hf_file = model.path;
+ }
+ }
+
+ // TODO: allow custom host
+ model.url = "https://huggingface.co/" + model.hf_repo + "/resolve/main/" + model.hf_file;
+
+ // make sure model path is present (for caching purposes)
+ if (model.path.empty()) {
+            // this is to avoid different repos having the same file name, or the same file name in different subdirs
+ std::string filename = model.hf_repo + "_" + model.hf_file;
+ // to make sure we don't have any slashes in the filename
+ string_replace_all(filename, "/", "_");
+ model.path = fs_get_cache_file(filename);
+ }
+
+ } else if (!model.url.empty()) {
+ if (model.path.empty()) {
+ auto f = string_split<std::string>(model.url, '#').front();
+ f = string_split<std::string>(f, '?').front();
+ model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+ }
+
+ } else if (model.path.empty()) {
+ model.path = model_path_default;
+ }
+ }
+
+ // then, download it if needed
+ if (!model.url.empty()) {
+ bool ok = common_download_model(model, bearer_token);
+ if (!ok) {
+ LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
+ exit(1);
}
- } else if (model.empty()) {
- model = model_default;
}
}
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
- // TODO: refactor model params in a common struct
- common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
- common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
- common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
+ common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+ common_params_handle_model(params.speculative.model, params.hf_token, "");
+ common_params_handle_model(params.vocoder.model, params.hf_token, "");
+
+    // allow --mmproj to be set from -hf
+    // assuming that the mmproj file is always in the same repo as the text model
+ if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
+ params.mmproj.hf_repo = params.model.hf_repo;
+ }
+ common_params_handle_model(params.mmproj, params.hf_token, "", true);
if (params.escape) {
string_process_escapes(params.prompt);
{"--mmproj"}, "FILE",
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
- params.mmproj = value;
+ params.mmproj.path = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+ add_opt(common_arg(
+ {"--mmproj-url"}, "URL",
+ "URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
+ [](common_params & params, const std::string & value) {
+ params.mmproj.url = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
add_opt(common_arg(
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
),
[](common_params & params, const std::string & value) {
- params.model = value;
+ params.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
add_opt(common_arg(
{"-mu", "--model-url"}, "MODEL_URL",
"model download url (default: unused)",
[](common_params & params, const std::string & value) {
- params.model_url = value;
+ params.model.url = value;
}
).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg(
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
- params.hf_repo = value;
+ params.model.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
"Same as --hf-repo, but for the draft model (default: unused)",
[](common_params & params, const std::string & value) {
- params.speculative.hf_repo = value;
+ params.speculative.model.hf_repo = value;
}
).set_env("LLAMA_ARG_HFD_REPO"));
add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
[](common_params & params, const std::string & value) {
- params.hf_file = value;
+ params.model.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg(
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
"Hugging Face model repository for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
- params.vocoder.hf_repo = value;
+ params.vocoder.model.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO_V"));
add_opt(common_arg(
{"-hffv", "--hf-file-v"}, "FILE",
"Hugging Face model file for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
- params.vocoder.hf_file = value;
+ params.vocoder.model.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE_V"));
add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
- params.speculative.model = value;
+ params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
{"-mv", "--model-vocoder"}, "FNAME",
"vocoder model for audio generation (default: unused)",
[](common_params & params, const std::string & value) {
- params.vocoder.model = value;
+ params.vocoder.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--tts-oute-default"},
string_format("use default OuteTTS models (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
- params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
- params.vocoder.hf_repo = "ggml-org/WavTokenizer";
- params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+ params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+ params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+ params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+ params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
}
).set_examples({LLAMA_EXAMPLE_TTS}));
{"--embd-bge-small-en-default"},
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
- params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+ params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
{"--embd-e5-small-en-default"},
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
- params.hf_file = "e5-small-v2-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+ params.model.hf_file = "e5-small-v2-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
{"--embd-gte-small-default"},
string_format("use default gte-small model (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
- params.hf_file = "gte-small-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+ params.model.hf_file = "gte-small-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
{"--fim-qwen-1.5b-default"},
string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
{"--fim-qwen-3b-default"},
string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
{"--fim-qwen-7b-default"},
string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
{"--fim-qwen-7b-spec"},
string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
- params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
- params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+ params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+ params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;
{"--fim-qwen-14b-spec"},
string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
[](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
- params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
- params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+ params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+ params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;
#include <sys/stat.h>
#include <unistd.h>
#endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-# if !defined(PATH_MAX)
-# define PATH_MAX MAX_PATH
-# endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
- struct curl_slist * ptr = nullptr;
- ~curl_slist_ptr() {
- if (ptr) {
- curl_slist_free_all(ptr);
- }
- }
-};
-#endif // LLAMA_USE_CURL
-
using json = nlohmann::ordered_json;
//
//
// Model utils
//
+
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
- llama_model * model = nullptr;
-
- if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
- } else if (!params.model_url.empty()) {
- model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
- } else {
- model = llama_model_load_from_file(params.model.c_str(), mparams);
- }
-
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
- LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+ LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return iparams;
}
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
- LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
llama_model_free(model);
return iparams;
}
return tpp;
}
-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
- int remaining_attempts = max_attempts;
-
- while (remaining_attempts > 0) {
- LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
- CURLcode res = curl_easy_perform(curl);
- if (res == CURLE_OK) {
- return true;
- }
-
- int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
- LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
- remaining_attempts--;
- std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
- }
-
- LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
- return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
- // Initialize libcurl
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
- curl_slist_ptr http_headers;
- if (!curl) {
- LOG_ERR("%s: error initializing libcurl\n", __func__);
- return false;
- }
-
- bool force_download = false;
-
- // Set the URL, allow to follow http redirection
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
- curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
- // Check if hf-token or bearer-token was specified
- if (!hf_token.empty()) {
- std::string auth_header = "Authorization: Bearer " + hf_token;
- http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
- }
-
-#if defined(_WIN32)
- // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
- // operating system. Currently implemented under MS-Windows.
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
- // Check if the file already exists locally
- auto file_exists = std::filesystem::exists(path);
-
- // If the file exists, check its JSON metadata companion file.
- std::string metadata_path = path + ".json";
- nlohmann::json metadata;
- std::string etag;
- std::string last_modified;
-
- if (file_exists) {
- // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
- std::ifstream metadata_in(metadata_path);
- if (metadata_in.good()) {
- try {
- metadata_in >> metadata;
- LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
- if (metadata.contains("url") && metadata.at("url").is_string()) {
- auto previous_url = metadata.at("url").get<std::string>();
- if (previous_url != url) {
- LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
- return false;
- }
- }
- if (metadata.contains("etag") && metadata.at("etag").is_string()) {
- etag = metadata.at("etag");
- }
- if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
- last_modified = metadata.at("lastModified");
- }
- } catch (const nlohmann::json::exception & e) {
- LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
- return false;
- }
- }
- } else {
- LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
- }
-
- // Send a HEAD request to retrieve the etag and last-modified headers
- struct common_load_model_from_url_headers {
- std::string etag;
- std::string last_modified;
- };
-
- common_load_model_from_url_headers headers;
-
- {
- typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
- auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
- common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
- static std::regex header_regex("([^:]+): (.*)\r\n");
- static std::regex etag_regex("ETag", std::regex_constants::icase);
- static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
- std::string header(buffer, n_items);
- std::smatch match;
- if (std::regex_match(header, match, header_regex)) {
- const std::string & key = match[1];
- const std::string & value = match[2];
- if (std::regex_match(key, match, etag_regex)) {
- headers->etag = value;
- } else if (std::regex_match(key, match, last_modified_regex)) {
- headers->last_modified = value;
- }
- }
- return n_items;
- };
-
- curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
- curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
- curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
- bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
- if (!was_perform_successful) {
- return false;
- }
-
- long http_code = 0;
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
- if (http_code != 200) {
- // HEAD not supported, we don't know if the file has changed
- // force trigger downloading
- force_download = true;
- LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
- }
- }
-
- bool should_download = !file_exists || force_download;
- if (!should_download) {
- if (!etag.empty() && etag != headers.etag) {
- LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
- should_download = true;
- } else if (!last_modified.empty() && last_modified != headers.last_modified) {
- LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
- should_download = true;
- }
- }
- if (should_download) {
- std::string path_temporary = path + ".downloadInProgress";
- if (file_exists) {
- LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
- if (remove(path.c_str()) != 0) {
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
- return false;
- }
- }
-
- // Set the output file
-
- struct FILE_deleter {
- void operator()(FILE * f) const {
- fclose(f);
- }
- };
-
- std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
- if (!outfile) {
- LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
- return false;
- }
-
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
- auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
- return fwrite(data, size, nmemb, (FILE *)fd);
- };
- curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
- // display download progress
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
- // helper function to hide password in URL
- auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
- std::size_t protocol_pos = url.find("://");
- if (protocol_pos == std::string::npos) {
- return url; // Malformed URL
- }
-
- std::size_t at_pos = url.find('@', protocol_pos + 3);
- if (at_pos == std::string::npos) {
- return url; // No password in URL
- }
-
- return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
- };
-
- // start the download
- LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
- llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
- bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
- if (!was_perform_successful) {
- return false;
- }
-
- long http_code = 0;
- curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
- if (http_code < 200 || http_code >= 400) {
- LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
- return false;
- }
-
- // Causes file to be closed explicitly here before we rename it.
- outfile.reset();
-
- // Write the updated JSON metadata file.
- metadata.update({
- {"url", url},
- {"etag", headers.etag},
- {"lastModified", headers.last_modified}
- });
- std::ofstream(metadata_path) << metadata.dump(4);
- LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
- if (rename(path_temporary.c_str(), path.c_str()) != 0) {
- LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
- return false;
- }
- }
-
- return true;
-}
-
-struct llama_model * common_load_model_from_url(
- const std::string & model_url,
- const std::string & local_path,
- const std::string & hf_token,
- const struct llama_model_params & params) {
- // Basic validation of the model_url
- if (model_url.empty()) {
- LOG_ERR("%s: invalid model_url\n", __func__);
- return NULL;
- }
-
- if (!common_download_file(model_url, local_path, hf_token)) {
- return NULL;
- }
-
- // check for additional GGUFs split to download
- int n_split = 0;
- {
- struct gguf_init_params gguf_params = {
- /*.no_alloc = */ true,
- /*.ctx = */ NULL,
- };
- auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
- if (!ctx_gguf) {
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
- return NULL;
- }
-
- auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
- if (key_n_split >= 0) {
- n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
- }
-
- gguf_free(ctx_gguf);
- }
-
- if (n_split > 1) {
- char split_prefix[PATH_MAX] = {0};
- char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
- // Verify the first split file format
- // and extract split URL and PATH prefixes
- {
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
- LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
- return NULL;
- }
-
- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
- LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
- return NULL;
- }
- }
-
- // Prepare download in parallel
- std::vector<std::future<bool>> futures_download;
- for (int idx = 1; idx < n_split; idx++) {
- futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
- char split_path[PATH_MAX] = {0};
- llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
- char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
- llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
- return common_download_file(split_url, split_path, hf_token);
- }, idx));
- }
-
- // Wait for all downloads to complete
- for (auto & f : futures_download) {
- if (!f.get()) {
- return NULL;
- }
- }
- }
-
- return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
- const std::string & repo,
- const std::string & remote_path,
- const std::string & local_path,
- const std::string & hf_token,
- const struct llama_model_params & params) {
- // construct hugging face model url:
- //
- // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
- // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
- //
- // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
- // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
- //
-
- std::string model_url = "https://huggingface.co/";
- model_url += repo;
- model_url += "/resolve/main/";
- model_url += remote_path;
-
- return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
- auto parts = string_split<std::string>(hf_repo_with_tag, ':');
- std::string tag = parts.size() > 1 ? parts.back() : "latest";
- std::string hf_repo = parts[0];
- if (string_split<std::string>(hf_repo, '/').size() != 2) {
- throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
- }
-
- // fetch model info from Hugging Face Hub API
- json model_info;
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
- curl_slist_ptr http_headers;
- std::string res_str;
- std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
- auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
- static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
- return size * nmemb;
- };
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
- if (!hf_token.empty()) {
- std::string auth_header = "Authorization: Bearer " + hf_token;
- http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
- }
- // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
- http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
- CURLcode res = curl_easy_perform(curl.get());
-
- if (res != CURLE_OK) {
- throw std::runtime_error("error: cannot make GET request to HF API");
- }
-
- long res_code;
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
- if (res_code == 200) {
- model_info = json::parse(res_str);
- } else if (res_code == 401) {
- throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
- } else {
- throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
- }
-
- // check response
- if (!model_info.contains("ggufFile")) {
- throw std::runtime_error("error: model does not have ggufFile");
- }
- json & gguf_file = model_info.at("ggufFile");
- if (!gguf_file.contains("rfilename")) {
- throw std::runtime_error("error: ggufFile does not have rfilename");
- }
-
- return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
- const std::string & /*model_url*/,
- const std::string & /*local_path*/,
- const std::string & /*hf_token*/,
- const struct llama_model_params & /*params*/) {
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
- return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
- const std::string & /*repo*/,
- const std::string & /*remote_path*/,
- const std::string & /*local_path*/,
- const std::string & /*hf_token*/,
- const struct llama_model_params & /*params*/) {
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
- return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
- return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
//
// Batch utils
//
std::string print() const;
};
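+
+// describes where a model comes from: a local path, a direct download url, and/or a HF repo/file pair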
+struct common_params_model {
+ std::string path = ""; // model local path // NOLINT
+ std::string url = ""; // model url to download // NOLINT
+ std::string hf_repo = ""; // HF repo // NOLINT
+ std::string hf_file = ""; // HF file // NOLINT
+};
+
struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
- std::string hf_repo = ""; // HF repo // NOLINT
- std::string hf_file = ""; // HF file // NOLINT
-
- std::string model = ""; // draft model for speculative decoding // NOLINT
- std::string model_url = ""; // model url to download // NOLINT
+ struct common_params_model model;
};
struct common_params_vocoder {
- std::string hf_repo = ""; // HF repo // NOLINT
- std::string hf_file = ""; // HF file // NOLINT
-
- std::string model = ""; // model path // NOLINT
- std::string model_url = ""; // model url to download // NOLINT
+ struct common_params_model model;
std::string speaker_file = ""; // speaker file path // NOLINT
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
- std::string model = ""; // model path // NOLINT
+ struct common_params_model model;
+
std::string model_alias = ""; // model alias // NOLINT
- std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
- std::string hf_repo = ""; // HF repo // NOLINT
- std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava)
- std::string mmproj = ""; // path to multimodal projector // NOLINT
+ struct common_params_model mmproj;
std::vector<std::string> image; // path to image file(s)
// embedding
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
-struct llama_model * common_load_model_from_url(
- const std::string & model_url,
- const std::string & local_path,
- const std::string & hf_token,
- const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
- const std::string & repo,
- const std::string & remote_path,
- const std::string & local_path,
- const std::string & hf_token,
- const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
- const std::string & hf_repo_with_tag,
- const std::string & hf_token);
-
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
llama_model_params model_params = common_model_params_to_llama(params);
- llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
llama_model_params model_params = common_model_params_to_llama(params);
- llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__);
g_verbose = (params.verbosity > 1);
try {
- lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+ lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
llama_backend_init();
- llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);
>
> This is very experimental, only used for demo purpose.
+## Quick start
+
+You can use pre-quantized models from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account:
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-gemma3-cli
+
+# alternatively, install from brew (macOS)
+brew install llama.cpp
+
+# run it
+llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
+
+# note: 1B model does not support vision
+```
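+
+The text after `:` in `-hf <user>/<model>[:quant]` is an optional quantization tag; if omitted, it defaults to `latest`. For example (hypothetical, assuming the repo publishes a `Q4_K_M` file):
+
+```bash
+llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF:Q4_K_M
+```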
+
## How to get mmproj.gguf?
```bash
}
void init_clip_model(common_params & params) {
- const char * clip_path = params.mmproj.c_str();
+ const char * clip_path = params.mmproj.path.c_str();
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
}
common_init();
- if (params.mmproj.empty()) {
+ if (params.mmproj.path.empty()) {
show_additional_info(argc, argv);
return 1;
}
gemma3_context ctx(params);
- printf("%s: %s\n", __func__, params.model.c_str());
+ printf("%s: %s\n", __func__, params.model.path.c_str());
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
llama_model_params model_params = common_model_params_to_llama(*params);
- llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
- const char * clip_path = params->mmproj.c_str();
+ const char * clip_path = params->mmproj.path.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
common_init();
- if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+ if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
llama_model_params model_params = common_model_params_to_llama(*params);
- llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
static struct clip_ctx * clip_init_context(common_params * params) {
- const char * clip_path = params->mmproj.c_str();
+ const char * clip_path = params->mmproj.path.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
common_init();
- if (params.mmproj.empty() || (params.image.empty())) {
+ if (params.mmproj.path.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
return 1;
}
llama_model_params model_params = common_model_params_to_llama(*params);
- llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
- const char * clip_path = params->mmproj.c_str();
+ const char * clip_path = params->mmproj.path.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
common_init();
- if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+ if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
params.prompt_file = "used built-in defaults";
}
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
- LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+ LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
llama_model_params model_params = common_model_params_to_llama(params);
- llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
}
bool load_model(const common_params & params) {
- SRV_INF("loading model '%s'\n", params.model.c_str());
+ SRV_INF("loading model '%s'\n", params.model.path.c_str());
params_base = params;
ctx = llama_init.context.get();
if (model == nullptr) {
- SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
+ SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
return false;
}
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
- if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
- SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+ if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
+ SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
auto params_dft = params_base;
params_dft.devices = params_base.speculative.devices;
- params_dft.hf_file = params_base.speculative.hf_file;
- params_dft.hf_repo = params_base.speculative.hf_repo;
params_dft.model = params_base.speculative.model;
- params_dft.model_url = params_base.speculative.model_url;
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
model_dft = llama_init_dft.model.get();
if (model_dft == nullptr) {
- SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+ SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
return false;
}
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
- SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+ SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
return false;
}
json data = {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
- { "model_path", ctx_server.params_base.model },
+ { "model_path", ctx_server.params_base.model.path },
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
{"object", "list"},
{"data", {
{
- {"id", params.model_alias.empty() ? params.model : params.model_alias},
+ {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
common_init();
- if (params.speculative.model.empty()) {
+ if (params.speculative.model.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
common_init();
- if (params.speculative.model.empty()) {
+ if (params.speculative.model.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
- // TODO: refactor in a common struct
- params.model = params.vocoder.model;
- params.model_url = params.vocoder.model_url;
- params.hf_repo = params.vocoder.hf_repo;
- params.hf_file = params.vocoder.hf_file;
-
+ params.model = params.vocoder.model;
params.embedding = true;
common_init_result llama_init_cts = common_init_from_params(params);
argv = {"binary_name", "-m", "model_file.gguf"};
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
- assert(params.model == "model_file.gguf");
+ assert(params.model.path == "model_file.gguf");
argv = {"binary_name", "-t", "1234"};
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
- assert(params.model == "abc.gguf");
+ assert(params.model.path == "abc.gguf");
assert(params.n_predict == 6789);
assert(params.n_batch == 9090);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name"};
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
- assert(params.model == "blah.gguf");
+ assert(params.model.path == "blah.gguf");
assert(params.cpuparams.n_threads == 1010);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name", "-m", "overwritten.gguf"};
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
- assert(params.model == "overwritten.gguf");
+ assert(params.model.path == "overwritten.gguf");
assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32