From: Georgi Gerganov Date: Sat, 14 Feb 2026 10:57:36 +0000 (+0200) Subject: models : optimize qwen3next graph (llama/19375) X-Git-Tag: v0.9.7~3 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f61050d0c0771749179486f1672d4b0b43f97637;p=pkg%2Fggml%2Fsources%2Fggml models : optimize qwen3next graph (llama/19375) * models : optimizing qwen3next graph * cont * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * cont : remove redundant q, g chunking * minor * minor * avoid passing masks around * avoid concats during chunking * naming + shapes * update names and use prefix to disable CUDA graphs --- diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu index 85ce9695..bed5c71a 100644 --- a/src/ggml-cuda/ggml-cuda.cu +++ b/src/ggml-cuda/ggml-cuda.cu @@ -2872,6 +2872,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; + const std::string delta_net_prefix = "dnet_add"; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2902,7 +2903,8 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && - strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { + strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 && + strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) { // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation // by means of matching node names. See // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and @@ -4544,6 +4546,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_CEIL: case GGML_UNARY_OP_ROUND: case GGML_UNARY_OP_TRUNC: + // TODO: should become: + //return ggml_is_contiguous_rows(op->src[0]); return ggml_is_contiguous(op->src[0]); default: return false; diff --git a/src/ggml-metal/ggml-metal-common.cpp b/src/ggml-metal/ggml-metal-common.cpp index 87e13786..2eb9820b 100644 --- a/src/ggml-metal/ggml-metal-common.cpp +++ b/src/ggml-metal/ggml-metal-common.cpp @@ -273,6 +273,7 @@ static std::vector ggml_metal_graph_optimize_reorder(const std::vector