vulkan: optimize coopmat2 q2_k dequant function (llama/11130)

author Jeff Bolz <redacted>

Thu, 16 Jan 2025 21:16:39 +0000 (15:16 -0600)

committer Georgi Gerganov <redacted>

Mon, 3 Feb 2025 20:00:57 +0000 (22:00 +0200)
author Jeff Bolz <redacted>
Thu, 16 Jan 2025 21:16:39 +0000 (15:16 -0600)
committer Georgi Gerganov <redacted>
Mon, 3 Feb 2025 20:00:57 +0000 (22:00 +0200)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp

index 94b78598ea21510531329b5f4ecb63e04af7c97e..e768b8930c43674bc7b0445af68ffa64905473ba 100644 (file)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
@@ -101,19 +101,25 @@ layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_
     block_q2_K block;
  };
  
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2_K_packed16 {
+   block_q2_K_packed16 block;
+};
+
  float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
  {
+    decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
      const f16vec2 d = bl.block.d;
      const uint idx = coordInBlock[1];
-    const uint iqs = idx;
  
-    const uint qsi = (iqs / 128) * 32 + (iqs % 32);     // 0..31
-    const uint scalesi = iqs / 16;                      // 0..15
-    const uint qsshift = ((iqs % 128) / 32) * 2;        // 0,2,4,6
+    const uint scalesi = (idx & 0xF0) >> 4;             // 0..15
+    const uint qsshift = (idx & 0x60) >> 4;             // 0,2,4,6
+
+    uint qs = uint32_t(bl16.block.qs[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> qsshift) & 0x0303;
+    qs = unpack8(qs)[idx & 1];
  
-    uint32_t qs = bl.block.qs[qsi];
      const uint scales = bl.block.scales[scalesi];
-    float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4);
+    float16_t ret = d.x * float16_t(scales & 0xF) * float16_t(qs) - d.y * float16_t(scales >> 4);
      return ret;
  }
author	Jeff Bolz <redacted>
	Thu, 16 Jan 2025 21:16:39 +0000 (15:16 -0600)
committer	Georgi Gerganov <redacted>
	Mon, 3 Feb 2025 20:00:57 +0000 (22:00 +0200)