ggml webgpu: add support for emscripten builds (llama/17184)

author Reese Levine <redacted>

Wed, 3 Dec 2025 09:25:34 +0000 (01:25 -0800)

committer Georgi Gerganov <redacted>

Fri, 12 Dec 2025 15:53:16 +0000 (17:53 +0200)
author Reese Levine <redacted>
Wed, 3 Dec 2025 09:25:34 +0000 (01:25 -0800)
committer Georgi Gerganov <redacted>
Fri, 12 Dec 2025 15:53:16 +0000 (17:53 +0200)
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt

index 689f0327c499f16887a8b24ef3837f443ce4d978..db47ae8dff2f963be379abd7c5d3da57eefe8d96 100644 (file)
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -226,7 +226,7 @@ option(GGML_WEBGPU                          "ggml: use WebGPU"
  option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
  option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)"             OFF)
  option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
-
+option(GGML_WEBGPU_JSPI                     "ggml: use JSPI for WebGPU"                       ON)
  option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
  option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
  option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c

index e19e82ac9ce1add5fd6cbed1b16dd0a90c01ab3f..8507557267a30fc07df912a4d95cb58a8fdba17c 100644 (file)
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2698,6 +2698,11 @@ struct ggml_cplan ggml_graph_plan(
          n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
      }
  
+#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+    // Emscripten without pthreads support can only use a single thread
+    n_threads = 1;
+#endif
+
      size_t work_size = 0;
  
      struct ggml_cplan cplan;
diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt

index c6a95d51512454807c2668ca3377e6073e30bc83..3ccce58aa39ec8dc9ed6965016ffd7f9dc70e8f7 100644 (file)
--- a/ggml/src/ggml-webgpu/CMakeLists.txt
+++ b/ggml/src/ggml-webgpu/CMakeLists.txt
@@ -39,8 +39,23 @@ add_dependencies(ggml-webgpu generate_shaders)
  if(EMSCRIPTEN)
      set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg")
  
-    target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
-    target_link_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+    if(NOT EMDAWNWEBGPU_DIR)
+        # default built-in port
+        target_compile_options(ggml-webgpu PRIVATE "--use-port=emdawnwebgpu")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=emdawnwebgpu")
+    else()
+        # custom port
+        target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+    endif()
+
+    if (GGML_WEBGPU_JSPI)  # JSPI build: pass -fwasm-exceptions at both compile and link time
+        target_compile_options(ggml-webgpu PRIVATE "-fwasm-exceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sJSPI" "-fwasm-exceptions")
+    else()  # Asyncify build: pass -fexceptions at both compile and link time
+        target_compile_options(ggml-webgpu PRIVATE "-fexceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sASYNCIFY" "-fexceptions")  # was "-exceptions"; link flag must match the compile flag
+    endif()
  else()
      find_package(Dawn REQUIRED)
      set(DawnWebGPU_TARGET dawn::webgpu_dawn)
@@ -48,6 +63,9 @@ endif()
  
  if (GGML_WEBGPU_DEBUG)
      target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1)
+    if(EMSCRIPTEN)
+        target_link_options(ggml-webgpu INTERFACE "-sASSERTIONS=2")
+    endif()
  endif()
  
  if (GGML_WEBGPU_CPU_PROFILE)
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp

index 9e8cbc477ed18ba67d5831b611ecef7b360cd28e..a7476b109df4c8052db1392f838764ecd7b43975 100644 (file)
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -9,6 +9,10 @@
  #include "ggml-impl.h"
  #include "ggml-wgsl-shaders.hpp"
  
+#ifdef __EMSCRIPTEN__
+#    include <emscripten/emscripten.h>
+#endif
+
  #include <webgpu/webgpu_cpp.h>
  
  #include <atomic>
@@ -261,9 +265,12 @@ struct webgpu_context_struct {
      wgpu::Queue    queue;
      wgpu::Limits   limits;
  
+    uint32_t subgroup_size;
+
+#ifndef __EMSCRIPTEN__
      bool                       supports_subgroup_matrix = false;
-    uint32_t                   subgroup_size;
      wgpu::SubgroupMatrixConfig subgroup_matrix_config;
+#endif
  
      // Separate this out from limits since on some Metal systems, the limit returned by
      // querying the limits is higher than the actual allowed maximum.
@@ -449,8 +456,8 @@ static void ggml_backend_webgpu_wait(webgpu_context &                         ct
      // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
      // inflight_max may be 0, meaning that we must wait on all futures.
      uint64_t timeout_ms       = block ? UINT64_MAX : 0;
-    uint     inflight_threads = ctx->inflight_threads;
-    uint     inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
+    uint32_t inflight_threads = ctx->inflight_threads;
+    uint32_t inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
      while (futures.size() >= inflight_max && futures.size() > 0) {
          ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
          futures.erase(futures.begin());
@@ -986,6 +993,7 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
              pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
              uint32_t wg_m;
              uint32_t wg_n;
+#ifndef __EMSCRIPTEN__
              if (ctx->supports_subgroup_matrix) {
                  // The total number of subgroups/workgroups needed per matrix.
                  uint32_t wg_m_sg_tile =
@@ -995,11 +1003,15 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
                      WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
                  wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
              } else {
+#endif
                  uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
                  uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
                  wg_m              = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
                  wg_n              = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
+#ifndef __EMSCRIPTEN__
              }
+#endif
+
              wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
          }
      }
@@ -1419,9 +1431,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
              commands.push_back(*cmd);
          }
          // compute the batch size based on the number of inflight threads
-        uint inflight_threads = ctx->inflight_threads;
-        uint batch_size       = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
-                                         WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
+        uint32_t inflight_threads = ctx->inflight_threads;
+        uint32_t batch_size       = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
+                                             WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
          if (commands.size() >= batch_size) {
              futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
              // Process events and check for completed submissions
@@ -1758,6 +1770,17 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
      ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
                                  wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
  
+    std::string proc_mul_mat_f32_f32;
+    std::string proc_mul_mat_f32_f32_vec;
+    std::string proc_mul_mat_f16_f32;
+    std::string proc_mul_mat_f16_f32_vec;
+    std::string proc_mul_mat_f16_f16;
+    std::string proc_mul_mat_f16_f16_vec;
+    std::string proc_mul_mat_q4_0_f32;
+    std::string proc_mul_mat_q4_0_f32_vec;
+
+    std::vector<wgpu::ConstantEntry> mul_mat_constants;
+#ifndef __EMSCRIPTEN__
      if (webgpu_ctx->supports_subgroup_matrix) {
          std::map<std::string, std::string> sg_matrix_repls;
          sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
@@ -1770,100 +1793,57 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
          sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
          sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
  
-        std::string proc_mul_mat_subgroup_matrix_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
+        proc_mul_mat_f32_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
+        proc_mul_mat_f32_f32_vec =
              ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
+        proc_mul_mat_f16_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
+        proc_mul_mat_f16_f32_vec =
              ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
+        proc_mul_mat_f16_f16 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
+        proc_mul_mat_f16_f16_vec =
              ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
+        proc_mul_mat_q4_0_f32 =
              ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
+        proc_mul_mat_q4_0_f32_vec =
              ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f32_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f16_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_q4_0_f32_vec");
      } else {
-        std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
-        mul_mat_reg_tile_constants[0].key   = "TILE_K";
-        mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
-        mul_mat_reg_tile_constants[1].key   = "WORKGROUP_SIZE_M";
-        mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
-        mul_mat_reg_tile_constants[2].key   = "WORKGROUP_SIZE_N";
-        mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;
+#endif
+        mul_mat_constants.push_back({ .key = "TILE_K", .value = WEBGPU_MUL_MAT_TILE_K });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_M", .value = WEBGPU_MUL_MAT_WG_SIZE_M });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_N", .value = WEBGPU_MUL_MAT_WG_SIZE_N });
  
          std::map<std::string, std::string> reg_repls;
          reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
          reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);
  
-        // Process each reg-tile shader with tile replacements.
-        // Keep the processed strings in-scope so .c_str() remains valid.
-        std::string proc_mul_mat_reg_tile_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f32_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
-                                         "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
-                                         "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
-                                         "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
+        proc_mul_mat_f32_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
+        proc_mul_mat_f32_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
+        proc_mul_mat_f16_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
+        proc_mul_mat_f16_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
+        proc_mul_mat_f16_f16      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
+        proc_mul_mat_f16_f16_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
+        proc_mul_mat_q4_0_f32     = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
+        proc_mul_mat_q4_0_f32_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
+#ifndef __EMSCRIPTEN__
      }
+#endif
+
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32.c_str(), "mul_mat_f32_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32_vec.c_str(), "mul_mat_f32_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32.c_str(), "mul_mat_f16_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32_vec.c_str(), "mul_mat_f16_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16.c_str(), "mul_mat_f16_f16", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16_vec.c_str(), "mul_mat_f16_f16_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32.c_str(), "mul_mat_q4_0_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32_vec.c_str(), "mul_mat_q4_0_f32_vec", mul_mat_constants);
  
      std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
      mul_mat_vec_constants[0].key   = "WORKGROUP_SIZE";
@@ -2384,13 +2364,17 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
  
      webgpu_context ctx = reg_ctx->webgpu_ctx;
  
+    wgpu::RequestAdapterOptions options = {};
+
+#ifndef __EMSCRIPTEN__
      // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
      const char * const          adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
      wgpu::DawnTogglesDescriptor adapterTogglesDesc;
      adapterTogglesDesc.enabledToggles     = adapterEnabledToggles;
      adapterTogglesDesc.enabledToggleCount = 2;
-    wgpu::RequestAdapterOptions options   = {};
      options.nextInChain                   = &adapterTogglesDesc;
+#endif
+
      ctx->instance.WaitAny(ctx->instance.RequestAdapter(
                                &options, wgpu::CallbackMode::AllowSpontaneous,
                                [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
@@ -2406,11 +2390,13 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
      ctx->adapter.GetLimits(&ctx->limits);
      ctx->max_wg_size_x = 288;  // default value
  
-    wgpu::AdapterInfo                            info{};
+    wgpu::AdapterInfo info{};
+#ifndef __EMSCRIPTEN__
      wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
      if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
          info.nextInChain = &subgroup_matrix_configs;
      }
+#endif
      ctx->adapter.GetInfo(&info);
  
      wgpu::SupportedFeatures features;
@@ -2418,6 +2404,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
      // we require f16 support
      GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
  
+#ifndef __EMSCRIPTEN__
      // Only support square f16 matrices of size 8 or 16 for now
      bool valid_subgroup_matrix_config = false;
      if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
@@ -2433,36 +2420,27 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
          }
      }
  
+    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
+#endif
      // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
      // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
-    ctx->subgroup_size            = info.subgroupMaxSize;
-    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
+    ctx->subgroup_size = info.subgroupMaxSize;
  
      // Initialize device
-    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
-                                                         wgpu::FeatureName::ImplicitDeviceSynchronization };
+    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
+
+#ifndef __EMSCRIPTEN__
+    required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
      if (ctx->supports_subgroup_matrix) {
          required_features.push_back(wgpu::FeatureName::Subgroups);
          required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
      }
+#endif
  
  #ifdef GGML_WEBGPU_GPU_PROFILE
      required_features.push_back(wgpu::FeatureName::TimestampQuery);
  #endif
  
-    // Enable Dawn-specific toggles to increase native performance
-    // TODO: Don't enable for WASM builds, they won't have an effect anyways
-    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
-    //       only for native performance?
-    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
-                                                   "disable_polyfills_on_integer_div_and_mod" };
-    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
-    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
-    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
-    deviceTogglesDesc.enabledToggleCount  = 4;
-    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
-    deviceTogglesDesc.disabledToggleCount = 1;
-
      wgpu::DeviceDescriptor dev_desc;
      dev_desc.requiredLimits       = &ctx->limits;
      dev_desc.requiredFeatures     = required_features.data();
@@ -2480,7 +2458,23 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
              GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
                         std::string(message).c_str());
          });
+
+#ifndef __EMSCRIPTEN__
+    // Enable Dawn-specific toggles to increase native performance
+    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
+    //       only for native performance?
+    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
+                                                   "disable_polyfills_on_integer_div_and_mod" };
+    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
+    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
+    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
+    deviceTogglesDesc.enabledToggleCount  = 4;
+    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
+    deviceTogglesDesc.disabledToggleCount = 1;
+
      dev_desc.nextInChain = &deviceTogglesDesc;
+#endif
+
      ctx->instance.WaitAny(ctx->adapter.RequestDevice(
                                &dev_desc, wgpu::CallbackMode::AllowSpontaneous,
                                [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
@@ -2578,18 +2572,27 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
      ctx.name         = GGML_WEBGPU_NAME;
      ctx.device_count = 1;
  
-    const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
-
-    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
-    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
-    instanceTogglesDesc.enabledToggleCount = 1;
      wgpu::InstanceDescriptor               instance_descriptor{};
      std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
      instance_descriptor.requiredFeatures                     = instance_features.data();
      instance_descriptor.requiredFeatureCount                 = instance_features.size();
-    instance_descriptor.nextInChain                          = &instanceTogglesDesc;
+
+#ifndef __EMSCRIPTEN__
+    const char * const          instanceEnabledToggles[] = { "allow_unsafe_apis" };
+    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
+    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
+    instanceTogglesDesc.enabledToggleCount = 1;
+    instance_descriptor.nextInChain        = &instanceTogglesDesc;
+#endif
  
      webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
+
+#ifdef __EMSCRIPTEN__
+    if (webgpu_ctx->instance == nullptr) {
+        GGML_LOG_ERROR("ggml_webgpu: Failed to create WebGPU instance. Make sure either -sASYNCIFY or -sJSPI is set\n");
+        return nullptr;
+    }
+#endif
      GGML_ASSERT(webgpu_ctx->instance != nullptr);
  
      static ggml_backend_reg reg = {
author	Reese Levine <redacted>
	Wed, 3 Dec 2025 09:25:34 +0000 (01:25 -0800)
committer	Georgi Gerganov <redacted>
	Fri, 12 Dec 2025 15:53:16 +0000 (17:53 +0200)
ggml/CMakeLists.txt		patch | blob | history
ggml/src/ggml-cpu/ggml-cpu.c		patch | blob | history
ggml/src/ggml-webgpu/CMakeLists.txt		patch | blob | history
ggml/src/ggml-webgpu/ggml-webgpu.cpp		patch | blob | history