CUDA: use mul_mat_q kernels by default (#2683)

author Johannes Gäßler <redacted>

Tue, 22 Aug 2023 20:47:05 +0000 (22:47 +0200)

committer GitHub <redacted>

Tue, 22 Aug 2023 20:47:05 +0000 (22:47 +0200)
author Johannes Gäßler <redacted>
Tue, 22 Aug 2023 20:47:05 +0000 (22:47 +0200)
committer GitHub <redacted>
Tue, 22 Aug 2023 20:47:05 +0000 (22:47 +0200)
diff --git a/common/common.cpp b/common/common.cpp

index 1623ba21f461ae4939e64bfb9ac1fe876a33d6ff..2a83b379ec4f56a117a950a34518f8a277f63ccb 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -387,11 +387,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  #else
              fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
  #endif // GGML_USE_CUBLAS
-        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
  #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
  #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
  #endif // GGML_USE_CUBLAS
          } else if (arg == "--low-vram" || arg == "-lv") {
  #ifdef GGML_USE_CUBLAS
@@ -599,11 +599,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
      fprintf(stdout, "                        number of layers to store in VRAM\n");
      fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
      fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
  #endif
      fprintf(stdout, "  --mtest               compute maximum memory usage\n");
      fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
diff --git a/common/common.h b/common/common.h

index c50a6edfc4124728e7facde01ef91221aba05dec..18fd951ead9df6581ce4a3fd993b4ed52eff9cf1 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -68,7 +68,7 @@ struct gpt_params {
      size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
  
      bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
-    bool mul_mat_q         = false; // if true, use experimental mul_mat_q kernels
+    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
      bool memory_f16        = true;  // use f16 instead of f32 for memory kv
      bool random_prompt     = false; // do not randomize prompt if none provided
      bool use_color         = false; // use color to distinguish generations and inputs
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 39fdf3307c93d8bd8ee97f407ffb04f26e9a71f4..e5bc52cd00624b12f55092f1972b1b3c45472e41 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -671,12 +671,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
      fprintf(stdout, "                        number of layers to store in VRAM\n");
      fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
      fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
      fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
      fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
  #endif
      fprintf(stdout, "  -m FNAME, --model FNAME\n");
      fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
@@ -867,12 +866,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
              LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
  #endif // GGML_USE_CUBLAS
          }
-        else if (arg == "--mul-mat-q" || arg == "-mmq")
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
          {
  #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
  #else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
  #endif // GGML_USE_CUBLAS
          }
          else if (arg == "--main-gpu" || arg == "-mg")
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index 4fe378c210030fcbbcd8834777a6ac05e9bcbbdb..70a950bb58b9b0b09356d8e9a226a9f339b91067 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -287,7 +287,7 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
+static bool g_mul_mat_q = true;
  
  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
author	Johannes Gäßler <redacted>
	Tue, 22 Aug 2023 20:47:05 +0000 (22:47 +0200)
committer	GitHub <redacted>
	Tue, 22 Aug 2023 20:47:05 +0000 (22:47 +0200)
common/common.cpp		patch | blob | history
common/common.h		patch | blob | history
examples/server/server.cpp		patch | blob | history
ggml-cuda.cu		patch | blob | history