Improve usability of --model-url & related flags (#6930)

author Olivier Chafik <redacted>

Mon, 29 Apr 2024 23:52:50 +0000 (00:52 +0100)

committer GitHub <redacted>

Mon, 29 Apr 2024 23:52:50 +0000 (00:52 +0100)
author Olivier Chafik <redacted>
Mon, 29 Apr 2024 23:52:50 +0000 (00:52 +0100)
committer GitHub <redacted>
Mon, 29 Apr 2024 23:52:50 +0000 (00:52 +0100)
diff --git a/.gitignore b/.gitignore

index 60f9d1f8d04b9267f8ed7586366c43bf1995e952..50ae0973ae3b306be0f817a30bdc63c8254186e6 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
  *.a
  *.so
  *.gguf
+*.gguf.json
  *.bin
  *.exe
  *.dll
diff --git a/common/common.cpp b/common/common.cpp

index fe84039f76e551c2aff8d7f01c6528e140c5cd8d..099d0356fa4b8895c4460e4e17e273cc4a9482a9 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -67,7 +67,6 @@
  #include <sys/syslimits.h>
  #endif
  #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-#define LLAMA_CURL_MAX_HEADER_LENGTH 256
  #endif // LLAMA_USE_CURL
  
  using json = nlohmann::ordered_json;
@@ -1324,6 +1323,29 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
      return false;
  }
  
+void gpt_params_handle_model_default(gpt_params & params) {
+    if (!params.hf_repo.empty()) {
+        // short-hand to avoid specifying --hf-file -> default it to --model
+        if (params.hf_file.empty()) {
+            if (params.model.empty()) {
+                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+            }
+            params.hf_file = params.model;
+        } else if (params.model.empty()) {
+            params.model = "models/" + string_split(params.hf_file, '/').back();
+        }
+    } else if (!params.model_url.empty()) {
+        if (params.model.empty()) {
+            auto f = string_split(params.model_url, '#').front();
+            f = string_split(f, '?').front();
+            f = string_split(f, '/').back();
+            params.model =  "models/" + f;
+        }
+    } else if (params.model.empty()) {
+        params.model = DEFAULT_MODEL_PATH;
+    }
+}
+
  bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
      bool invalid_param = false;
      std::string arg;
@@ -1352,10 +1374,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
          throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
      }
  
-    // short-hand to avoid specifying --hf-file -> default it to --model
-    if (!params.hf_repo.empty() && params.hf_file.empty()) {
-        params.hf_file = params.model;
-    }
+    gpt_params_handle_model_default(params);
  
      if (params.escape) {
          process_escapes(params.prompt);
@@ -1548,7 +1567,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
      printf("  --control-vector-layer-range START END\n");
      printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
      printf("  -m FNAME, --model FNAME\n");
-    printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("                        model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
      printf("  -md FNAME, --model-draft FNAME\n");
      printf("                        draft model for speculative decoding (default: unused)\n");
      printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
@@ -1896,59 +1915,75 @@ void llama_batch_add(
  
  #ifdef LLAMA_USE_CURL
  
-static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+static bool starts_with(const std::string & str, const std::string & prefix) {
+    // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
+static bool llama_download_file(const std::string & url, const std::string & path) {
+
+    // Initialize libcurl
+    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
      bool force_download = false;
  
      // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl, CURLOPT_URL, url);
-    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
  
  #if defined(_WIN32)
      // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
      //   operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
  #endif
  
      // Check if the file already exists locally
      struct stat model_file_info;
-    auto file_exists = (stat(path, &model_file_info) == 0);
-
-    // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
-    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char etag_path[PATH_MAX] = {0};
-    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
+    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
  
-    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char last_modified_path[PATH_MAX] = {0};
-    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata;
+    std::string etag;
+    std::string last_modified;
  
      if (file_exists) {
-        auto * f_etag = fopen(etag_path, "r");
-        if (f_etag) {
-            if (!fgets(etag, sizeof(etag), f_etag)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
-            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
-            }
-            fclose(f_etag);
-        }
-
-        auto * f_last_modified = fopen(last_modified_path, "r");
-        if (f_last_modified) {
-            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
-            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
-                        last_modified);
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("url") && metadata["url"].is_string()) {
+                    auto previous_url = metadata["url"].get<std::string>();
+                    if (previous_url != url) {
+                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        return false;
+                    }
+                }
+                if (metadata.contains("etag") && metadata["etag"].is_string()) {
+                    etag = metadata["etag"];
+                }
+                if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
+                    last_modified = metadata["lastModified"];
+                }
+            } catch (const nlohmann::json::exception & e) {
+                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                return false;
              }
-            fclose(f_last_modified);
          }
+    } else {
+        fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
      }
  
      // Send a HEAD request to retrieve the etag and last-modified headers
      struct llama_load_model_from_url_headers {
-        char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-        char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+        std::string etag;
+        std::string last_modified;
      };
      llama_load_model_from_url_headers headers;
      {
@@ -1956,38 +1991,37 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
          auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
              llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
  
-            // Convert header field name to lowercase
-            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
-                buffer[i] = tolower(buffer[i]);
-            }
-
-            const char * etag_prefix = "etag: ";
-            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
-                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
-            }
-
-            const char * last_modified_prefix = "last-modified: ";
-            if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
-                strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
-                        n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
              }
              return n_items;
          };
  
-        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
  
-        CURLcode res = curl_easy_perform(curl);
+        CURLcode res = curl_easy_perform(curl.get());
          if (res != CURLE_OK) {
-            curl_easy_cleanup(curl);
              fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
              return false;
          }
  
          long http_code = 0;
-        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
          if (http_code != 200) {
              // HEAD not supported, we don't know if the file has changed
              // force trigger downloading
@@ -1996,28 +2030,30 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
          }
      }
  
-    // If the ETag or the Last-Modified headers are different: trigger a new download
-    bool should_download = !file_exists
-        || force_download
-        || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
-        || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+    bool should_download = !file_exists || force_download;
+    if (!should_download) {
+        if (!etag.empty() && etag != headers.etag) {
+            fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            should_download = true;
+        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+            fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            should_download = true;
+        }
+    }
      if (should_download) {
-        char path_temporary[PATH_MAX] = {0};
-        snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
+        std::string path_temporary = path + ".downloadInProgress";
          if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
-            if (remove(path) != 0) {
-                curl_easy_cleanup(curl);
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
                  return false;
              }
          }
  
          // Set the output file
-        auto * outfile = fopen(path_temporary, "wb");
+        std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
          if (!outfile) {
-            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
              return false;
          }
  
@@ -2025,12 +2061,12 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
          auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
              return fwrite(data, size, nmemb, (FILE *)fd);
          };
-        curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
  
          //  display download progress
-        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
  
          // helper function to hide password in URL
          auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
@@ -2049,51 +2085,34 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
  
          // start the download
          fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-                llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
-        auto res = curl_easy_perform(curl);
+                llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        auto res = curl_easy_perform(curl.get());
          if (res != CURLE_OK) {
-            fclose(outfile);
-            curl_easy_cleanup(curl);
              fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
              return false;
          }
  
          long http_code = 0;
-        curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
          if (http_code < 200 || http_code >= 400) {
-            fclose(outfile);
-            curl_easy_cleanup(curl);
              fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
              return false;
          }
  
-        // Clean up
-        fclose(outfile);
+        // Causes file to be closed explicitly here before we rename it.
+        outfile.reset();
  
-        // Write the new ETag to the .etag file
-        if (strlen(headers.etag) > 0) {
-            auto * etag_file = fopen(etag_path, "w");
-            if (etag_file) {
-                fputs(headers.etag, etag_file);
-                fclose(etag_file);
-                fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
-            }
-        }
+        // Write the updated JSON metadata file.
+        metadata.update({
+            {"url", url},
+            {"etag", headers.etag},
+            {"lastModified", headers.last_modified}
+        });
+        std::ofstream(metadata_path) << metadata.dump(4);
+        fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
  
-        // Write the new lastModified to the .etag file
-        if (strlen(headers.last_modified) > 0) {
-            auto * last_modified_file = fopen(last_modified_path, "w");
-            if (last_modified_file) {
-                fputs(headers.last_modified, last_modified_file);
-                fclose(last_modified_file);
-                fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
-                        headers.last_modified);
-            }
-        }
-
-        if (rename(path_temporary, path) != 0) {
-            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
              return false;
          }
      }
@@ -2111,15 +2130,7 @@ struct llama_model * llama_load_model_from_url(
          return NULL;
      }
  
-    // Initialize libcurl
-    auto * curl = curl_easy_init();
-
-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return NULL;
-    }
-
-    if (!llama_download_file(curl, model_url, path_model)) {
+    if (!llama_download_file(model_url, path_model)) {
          return NULL;
      }
  
@@ -2133,7 +2144,6 @@ struct llama_model * llama_load_model_from_url(
          auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
          if (!ctx_gguf) {
              fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
-            curl_easy_cleanup(curl);
              return NULL;
          }
  
@@ -2145,8 +2155,6 @@ struct llama_model * llama_load_model_from_url(
          gguf_free(ctx_gguf);
      }
  
-    curl_easy_cleanup(curl);
-
      if (n_split > 1) {
          char split_prefix[PATH_MAX] = {0};
          char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
@@ -2177,11 +2185,7 @@ struct llama_model * llama_load_model_from_url(
                  char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                  llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
  
-                auto * curl = curl_easy_init();
-                bool res = llama_download_file(curl, split_url, split_path);
-                curl_easy_cleanup(curl);
-
-                return res;
+                return llama_download_file(split_url, split_path);
              }, idx));
          }
  
@@ -2668,7 +2672,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
      fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
      fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
      fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
      fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
      fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
      fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
diff --git a/common/common.h b/common/common.h

index 3233d90e69eb5fc2abe79caea15613e1fc62b748..8afdf2bdf189b41855b2210c83683fd2cb8fb266 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -31,6 +31,8 @@
      fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
  } while(0)
  
+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
  // build info
  extern int LLAMA_BUILD_NUMBER;
  extern char const *LLAMA_COMMIT;
@@ -92,7 +94,7 @@ struct gpt_params {
      // // sampling parameters
      struct llama_sampling_params sparams;
  
-    std::string model                = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model                = "";  // model path
      std::string model_draft          = "";  // draft model for speculative decoding
      std::string model_alias          = "unknown"; // model alias
      std::string model_url            = "";  // model url to download
@@ -171,6 +173,8 @@ struct gpt_params {
      std::vector<std::string> image; // path to image file(s)
  };
  
+void gpt_params_handle_model_default(gpt_params & params);
+
  bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  
  bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
diff --git a/examples/main/README.md b/examples/main/README.md

index 649f4e0f35820e75a01917c907211141b7eb7f67..e7a38743c240a102e17bc78787c2968cd35838b8 100644 (file)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
  
  In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
  
--   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
  -   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
  -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
  -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp

index 1d05f13911fc2af186cd294528d695128a54d852..746df8446b85e8a69191920104569dfe168e7b2e 100644 (file)
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -23,7 +23,7 @@
  #endif
  
  struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.gguf";
+    std::string model = DEFAULT_MODEL_PATH;
      bool verbose = false;
      bool per_layer_stats = false;
      bool print_histogram = false;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 2760aea8fd3e9bbb5c4c0d8d3449d6340453f6dd..01453af2c97e3337c9aa099c175ee2c0edc1d90b 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2353,7 +2353,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
          printf("                            disable KV offload\n");
      }
      printf("  -m FNAME, --model FNAME\n");
-    printf("                            model path (default: %s)\n", params.model.c_str());
+    printf("                            model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
      printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
      printf("                            model download url (default: unused)\n");
      printf("  -hfr REPO, --hf-repo REPO\n");
@@ -2835,6 +2835,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
          }
      }
  
+    gpt_params_handle_model_default(params);
+
      if (!params.kv_overrides.empty()) {
          params.kv_overrides.emplace_back();
          params.kv_overrides.back().key[0] = 0;
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature

index dcf1434f97124121ba8751f6bda97be600ae28ba..6f163ce04b3f6b51c73f6b9d946a1a80e3159b8e 100644 (file)
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -5,7 +5,7 @@ Feature: llama.cpp server
    Background: Server startup
      Given a server listening on localhost:8080
      And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
-    And   a model file ggml-model-f16.gguf
+    And   a model file bert-bge-small.gguf
      And   a model alias bert-bge-small
      And   42 as server seed
      And   2 slots
author	Olivier Chafik <redacted>
	Mon, 29 Apr 2024 23:52:50 +0000 (00:52 +0100)
committer	GitHub <redacted>
	Mon, 29 Apr 2024 23:52:50 +0000 (00:52 +0100)
.gitignore		patch | blob | history
common/common.cpp		patch | blob | history
common/common.h		patch | blob | history
examples/main/README.md		patch | blob | history
examples/quantize-stats/quantize-stats.cpp		patch | blob | history
examples/server/server.cpp		patch | blob | history
examples/server/tests/features/embeddings.feature		patch | blob | history