echo "Unknown command: $arg1"
echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml"
- echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --convert (-c): Convert a llama model into ggml"
echo " ex: \"/models/7B/\" 1"
echo " --quantize (-q): Optimize with quantization process ggml"
Here is a typical run using LLaMA-7B:
```
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
./quantize.sh 7B
# run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
Here is an example few-shot interaction, invoked with the command
```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
-p \
"Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
Once complete, you are ready to play!
```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
or with the light image:
```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
## Limitations
}
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
- if (cgraph->n_threads <= 0) {
- cgraph->n_threads = 8;
- }
-
const int n_threads = cgraph->n_threads;
struct ggml_compute_state_shared state_shared = {
#endif
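With the hard-coded `n_threads = 8` fallback removed, `ggml_graph_compute` now trusts whatever `cgraph->n_threads` the caller set, so every call site must fill it in explicitly. A minimal caller sketch, assuming the usual pattern of building a graph and then computing it (`ctx0` and `params` are illustrative names, not part of this diff):

```cpp
// Sketch only: after this change, the caller owns the thread count.
struct ggml_cgraph gf = {};
gf.n_threads = params.n_threads; // e.g. the value detected in gpt_params_parse

// ... add tensors/ops to the graph via the ggml API ...

ggml_graph_compute(ctx0, &gf);   // ctx0: the ggml_context owning the tensors
```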
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+ // determine sensible default number of threads.
+ // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+ std::ifstream cpuinfo("/proc/cpuinfo");
+ params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+ std::istream_iterator<std::string>(),
+ std::string("processor"));
+#endif
+ if (params.n_threads == 0) {
+ params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+ }
+
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
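For reference, the detection code above needs `<fstream>`, `<iterator>`, `<algorithm>`, `<string>`, and `<thread>` to compile; the hunk itself does not show the include section. A self-contained sketch of the same logic, using a hypothetical helper name `default_n_threads`:

```cpp
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <thread>

// On Linux, count whitespace-delimited "processor" tokens in /proc/cpuinfo
// (one per logical CPU); otherwise fall back to hardware_concurrency(),
// which is allowed to return 0 when the count is unknown.
static int32_t default_n_threads() {
    int32_t n = 0;
#ifdef __linux__
    std::ifstream cpuinfo("/proc/cpuinfo");
    n = (int32_t) std::count(std::istream_iterator<std::string>(cpuinfo),
                             std::istream_iterator<std::string>(),
                             std::string("processor"));
#endif
    if (n == 0) {
        n = std::max(1, (int32_t) std::thread::hardware_concurrency());
    }
    return n;
}

int main() {
    std::cout << "default threads: " << default_n_threads() << "\n";
}
```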
struct gpt_params {
int32_t seed = -1; // RNG seed
- int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+ int32_t n_threads = 0; // 0 means auto-detect in gpt_params_parse
int32_t n_predict = 128; // new tokens to predict
int32_t repeat_last_n = 64; // last n tokens to penalize
int32_t n_ctx = 512; // context size
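End to end, the new default flows from `gpt_params_parse` into the rest of the program. A hedged usage sketch (assuming `utils.h` declares `gpt_params` and `gpt_params_parse` as in the hunks above, and that `-t` still overrides the detected value as before):

```cpp
#include <cstdio>
#include "utils.h" // hypothetical include path for the declarations above

int main(int argc, char ** argv) {
    gpt_params params; // n_threads starts at 0, meaning "auto-detect"
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    // Detected core count, unless the user passed -t explicitly.
    printf("using %d threads\n", params.n_threads);
    return 0;
}
```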