vk_queue transfer_queue;
bool single_queue;
uint32_t subgroup_size;
+ uint32_t shader_core_count;
bool uma;
size_t idx;
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
bool maintenance4_support = false;
+ bool sm_builtins = false;
// Check if maintenance4 is supported
for (const auto& properties : ext_props) {
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
maintenance4_support = true;
+ } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) {
+ sm_builtins = true;
}
}
vk::PhysicalDeviceMaintenance3Properties props3;
vk::PhysicalDeviceMaintenance4Properties props4;
vk::PhysicalDeviceSubgroupProperties subgroup_props;
+ vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
props2.pNext = &props3;
props3.pNext = &subgroup_props;
+
+ VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&subgroup_props;
+
if (maintenance4_support) {
- subgroup_props.pNext = &props4;
+ last_struct->pNext = (VkBaseOutStructure *)&props4;
+ last_struct = (VkBaseOutStructure *)&props4;
+ }
+ if (sm_builtins) {
+ last_struct->pNext = (VkBaseOutStructure *)&sm_props;
+ last_struct = (VkBaseOutStructure *)&sm_props;
}
+
device->physical_device.getProperties2(&props2);
device->properties = props2.properties;
device->vendor_id = device->properties.vendorID;
device->subgroup_size = subgroup_props.subgroupSize;
device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
+ if (sm_builtins) {
+ device->shader_core_count = sm_props.shaderSMCount;
+ } else {
+ device->shader_core_count = 0;
+ }
bool fp16_storage = false;
bool fp16_compute = false;
dst->device->device.resetFences({ dst->device->fence });
}
-static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
+static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
- // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
- // return 4;
- // }
- return 1;
+ uint32_t split_k = 1;
+ if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
+ // If k is 'large' and the SMs will fill less than halfway, use split_k.
+ uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
+ uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
+ if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
+ split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+ // Clamp to 2 or 4
+ split_k = std::min(split_k, 4u);
+ if (split_k == 3) {
+ split_k = 2;
+ }
+ }
+ }
- GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k);
+ return split_k;
}
static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
- const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
-
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
+ const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
+
const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
if (dryrun) {
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
- const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * 4 : 0;
+ const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
if (
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
(qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {float data_a[];};
+layout (binding = 0) readonly buffer A4 {vec4 data_a4[];};
layout (binding = 1) writeonly buffer D {float data_d[];};
+layout (binding = 1) writeonly buffer D4 {vec4 data_d4[];};
layout (push_constant) uniform parameter {
uint ne;
} p;
void main() {
- const uint idx = gl_GlobalInvocationID.x;
+ // Each invocation handles four consecutive components
+ const uint idx = gl_GlobalInvocationID.x * 4;
if (idx >= p.ne) {
return;
}
- float result = 0.0f;
+ // Check if all four components are in bounds and aligned,
+ // then use vector loads
+ if (idx + 3 < p.ne && (p.ne % 4) == 0) {
+ vec4 result = vec4(0.0f);
- [[unroll]] for (uint i = 0; i < p.k_num; i++) {
- result += data_a[i * p.ne + idx];
- }
+ [[unroll]] for (uint i = 0; i < p.k_num; i++) {
+ result += data_a4[(i * p.ne + idx) / 4];
+ }
+
+ data_d4[idx / 4] = result;
+ } else {
+ [[unroll]] for (uint j = 0; j < 4; ++j) {
+ if (idx + j < p.ne) {
+ float result = 0.0f;
- data_d[idx] = result;
+ [[unroll]] for (uint i = 0; i < p.k_num; i++) {
+ result += data_a[i * p.ne + idx + j];
+ }
+
+ data_d[idx + j] = result;
+ }
+ }
+ }
}