batched-bench : print params at start

author Georgi Gerganov <redacted>

Wed, 25 Oct 2023 07:26:27 +0000 (10:26 +0300)

committer Georgi Gerganov <redacted>

Wed, 25 Oct 2023 07:26:27 +0000 (10:26 +0300)
author Georgi Gerganov <redacted>
Wed, 25 Oct 2023 07:26:27 +0000 (10:26 +0300)
committer Georgi Gerganov <redacted>
Wed, 25 Oct 2023 07:26:27 +0000 (10:26 +0300)
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp

index c552eaa738becffad88093af8a54ae5c3810ed02..43f9c971d18465fa7274b6bb40566eef8e63623e 100644 (file)
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
          }
      }
  
+    LOG_TEE("\n");
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
+    LOG_TEE("\n");
+
      LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
      LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
  
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index d1e874b6c778af32d887cc5cf5e8970ceb92310f..ba0cd5a7d3f1eb7274887a33722ec4007cd9d94f 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6254,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
      const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
      const int64_t src1_padded_row_size, const cudaStream_t & stream) {
  
-    GGML_ASSERT(src0_dd_i != nullptr);
+    GGML_ASSERT(src0_dd_i  != nullptr);
      GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i != nullptr);
-
+    GGML_ASSERT(dst_dd_i   != nullptr);
  
      const int64_t ne00 = src0->ne[0];
-
      const int64_t ne10 = src1->ne[0];
  
      const int64_t ne0 = dst->ne[0];
+
      const int64_t row_diff = row_high - row_low;
  
      int id;
@@ -7223,12 +7222,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
      //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
  
      if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ
+        // KQ single-batch
          ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
      } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV
+        // KQV single-batch
          ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
      } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch
          ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
      } else if (src0->type == GGML_TYPE_F32) {
          ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
author	Georgi Gerganov <redacted>
	Wed, 25 Oct 2023 07:26:27 +0000 (10:26 +0300)
committer	Georgi Gerganov <redacted>
	Wed, 25 Oct 2023 07:26:27 +0000 (10:26 +0300)
examples/batched-bench/batched-bench.cpp		patch | blob | history
ggml-cuda.cu		patch | blob | history