echo "Unknown command: $arg1"
echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml"
- echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --convert (-c): Convert a llama model into ggml"
echo " ex: \"/models/7B/\" 1"
echo " --quantize (-q): Optimize with quantization process ggml"
Here is a typical run using LLaMA-7B:
```
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
./quantize.sh 7B
# run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
Here is an example few-shot interaction, invoked with the command
```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
-p \
"Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
Once complete, you are ready to play!
```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
or with the light image:
```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
## Limitations
}
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
- if (cgraph->n_threads <= 0) {
- cgraph->n_threads = 8;
- }
-
const int n_threads = cgraph->n_threads;
struct ggml_compute_state_shared state_shared = {
#endif
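With the hard-coded `n_threads = 8` fallback removed, `ggml_graph_compute` now trusts whatever `cgraph->n_threads` the caller set, so every call site must fill it in explicitly. A minimal caller sketch, assuming the usual pattern of building a graph and then computing it (`ctx0` and `params` are illustrative names, not part of this diff):

```cpp
// Sketch only: after this change, the caller owns the thread count.
struct ggml_cgraph gf = {};
gf.n_threads = params.n_threads; // e.g. the value detected in gpt_params_parse

// ... add tensors/ops to the graph via the ggml API ...

ggml_graph_compute(ctx0, &gf);   // ctx0: the ggml_context owning the tensors
```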
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+ // determine sensible default number of threads.
+ // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+ std::ifstream cpuinfo("/proc/cpuinfo");
+ params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+ std::istream_iterator<std::string>(),
+ std::string("processor"));
+#endif
+ if (params.n_threads == 0) {
+ params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+ }
+
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
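For reference, the detection code above needs `<fstream>`, `<iterator>`, `<algorithm>`, `<string>`, and `<thread>` to compile; the hunk itself does not show the include section. A self-contained sketch of the same logic, using a hypothetical helper name `default_n_threads`:

```cpp
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <thread>

// On Linux, count whitespace-delimited "processor" tokens in /proc/cpuinfo
// (one per logical CPU); otherwise fall back to hardware_concurrency(),
// which is allowed to return 0 when the count is unknown.
static int32_t default_n_threads() {
    int32_t n = 0;
#ifdef __linux__
    std::ifstream cpuinfo("/proc/cpuinfo");
    n = (int32_t) std::count(std::istream_iterator<std::string>(cpuinfo),
                             std::istream_iterator<std::string>(),
                             std::string("processor"));
#endif
    if (n == 0) {
        n = std::max(1, (int32_t) std::thread::hardware_concurrency());
    }
    return n;
}

int main() {
    std::cout << "default threads: " << default_n_threads() << "\n";
}
```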
struct gpt_params {
int32_t seed = -1; // RNG seed
- int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+ int32_t n_threads = 0; // 0 means auto-detect in gpt_params_parse
int32_t n_predict = 128; // new tokens to predict
int32_t repeat_last_n = 64; // last n tokens to penalize
int32_t n_ctx = 512; // context size
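End to end, the new default flows from `gpt_params_parse` into the rest of the program. A hedged usage sketch (assuming `utils.h` declares `gpt_params` and `gpt_params_parse` as in the hunks above, and that `-t` still overrides the detected value as before):

```cpp
#include <cstdio>
#include "utils.h" // hypothetical include path for the declarations above

int main(int argc, char ** argv) {
    gpt_params params; // n_threads starts at 0, meaning "auto-detect"
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    // Detected core count, unless the user passed -t explicitly.
    printf("using %d threads\n", params.n_threads);
    return 0;
}
```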