From: Georgi Gerganov
Date: Mon, 18 Aug 2025 19:01:00 +0000 (+0300)
Subject: cuda : remove obsolete sources (#1332)
X-Git-Tag: upstream/0.0.2471^0
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=323951f;p=pkg%2Fggml%2Fsources%2Fggml

cuda : remove obsolete sources (#1332)

ggml-ci
---

diff --git a/src/ggml-cuda/mmv.cu b/src/ggml-cuda/mmv.cu
deleted file mode 100644
index e14c9351..00000000
--- a/src/ggml-cuda/mmv.cu
+++ /dev/null
@@ -1,506 +0,0 @@
-#include "ggml.h"
-#include "common.cuh"
-#include "mmv.cuh"
-
-template <typename T, typename type_acc, int ncols_dst, int block_size>
-static __global__ void mul_mat_vec(
-        const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
-        const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
-        const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
-        const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
-    const int row         = blockIdx.x;
-    const int channel_dst = blockIdx.y;
-    const int channel_x   = ids ? ids[channel_dst] : channel_dst / channel_ratio;
-    const int channel_y   = ids ? channel_dst % nchannels_y : channel_dst;
-    const int sample_dst  = blockIdx.z;
-    const int sample_x    = sample_dst / sample_ratio;
-    const int sample_y    = sample_dst;
-    const int tid         = threadIdx.x;
-
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
-
-    x   += int64_t(sample_x) *stride_sample_x   + channel_x  *stride_channel_x   + row*stride_row;
-    y   += int64_t(sample_y) *stride_sample_y   + channel_y  *stride_channel_y;
-    dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
-
-    const float2 * y2 = (const float2 *) y;
-
-    extern __shared__ char data_mmv[];
-    float * buf_iw = (float *) data_mmv;
-
-    if (block_size > warp_size) {
-        if (tid < warp_size) {
-            buf_iw[tid] = 0.0f;
-        }
-        __syncthreads();
-    }
-
-    float sumf[ncols_dst] = {0.0f};
-
-    if constexpr (std::is_same<T, float>::value) {
-        const float2 * x2 = (const float2 *) x;
-
-        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
-            const float2 tmpx = x2[col2];
-
-#pragma unroll
-            for (int j = 0; j < ncols_dst; ++j) {
-                const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += tmpx.x*tmpy.x;
-                sumf[j] += tmpx.y*tmpy.y;
-            }
-        }
-    } else if constexpr (std::is_same<T, half>::value) {
-        const half2 * x2 = (const half2 *) x;
-
-        if (std::is_same<type_acc, float>::value) {
-            for (int col2 = tid; col2 < ncols2; col2 += block_size) {
-                const float2 tmpx = __half22float2(x2[col2]);
-
-#pragma unroll
-                for (int j = 0; j < ncols_dst; ++j) {
-                    const float2 tmpy = y2[j*stride_col_y2 + col2];
-                    sumf[j] += tmpx.x * tmpy.x;
-                    sumf[j] += tmpx.y * tmpy.y;
-                }
-            }
-        } else {
-#ifdef FP16_AVAILABLE
-            half2 sumh2[ncols_dst] = {{0.0f, 0.0f}};
-
-            for (int col2 = tid; col2 < ncols2; col2 += block_size) {
-                const half2 tmpx = x2[col2];
-
-#pragma unroll
-                for (int j = 0; j < ncols_dst; ++j) {
-                    const float2 tmpy = y2[j*stride_col_y2 + col2];
-                    sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y);
-                }
-            }
-
-#pragma unroll
-            for (int j = 0; j < ncols_dst; ++j) {
-                sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]);
-            }
-#else
-            NO_DEVICE_CODE;
-#endif // FP16_AVAILABLE
-        }
-    } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
-        const int * x2 = (const int *) x;
-        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
-            const int tmpx = x2[col2];
-#pragma unroll
-            for (int j = 0; j < ncols_dst; ++j) {
-                const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
-            }
-        }
-    } else {
-        static_assert(std::is_same<T, void>::value, "unsupported type");
-    }
-
-#pragma unroll
-    for (int j = 0; j < ncols_dst; ++j) {
-        sumf[j] = warp_reduce_sum(sumf[j]);
-
-        if (block_size > warp_size) {
-            buf_iw[tid/warp_size] = sumf[j];
-            __syncthreads();
-            if (tid < warp_size) {
-                sumf[j] = buf_iw[tid];
-                sumf[j] = warp_reduce_sum(sumf[j]);
-            }
-            if (j < ncols_dst) {
-                __syncthreads();
-            }
-        }
-    }
-
-    if (tid >= ncols_dst) {
-        return;
-    }
-
-    dst[tid*stride_col_dst + row] = sumf[tid];
-}
-
-template <typename T, typename type_acc, int ncols_dst>
-static void launch_mul_mat_vec_cuda(
-        const T * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols, const int64_t nrows,
-        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
-        const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
-        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
-        cudaStream_t stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    GGML_ASSERT(stride_row % 2 == 0);
-    GGML_ASSERT(stride_col_y % 2 == 0);
-    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
-    GGML_ASSERT(       nsamples_dst % nsamples_x == 0);
-    const int64_t channel_ratio = nchannels_dst / nchannels_x;
-    const int64_t sample_ratio  = nsamples_dst / nsamples_x;
-    int device;
-    int warp_size;
-
-    CUDA_CHECK(cudaGetDevice(&device));
-    warp_size = ggml_cuda_info().devices[device].warp_size;
-
-    int64_t block_size_best = warp_size;
-    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
-    int64_t max_block_size  = 256;
-    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
-        max_block_size = 128;
-    }
-    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
-        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
-        if (niter < niter_best) {
-            niter_best      = niter;
-            block_size_best = block_size;
-        }
-    }
-
-    const int smem = warp_size*sizeof(float);
-    const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
-    const dim3 block_dims(block_size_best, 1, 1);
-    switch (block_size_best) {
-        case 32: {
-            mul_mat_vec<T, type_acc, ncols_dst, 32><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 64: {
-            mul_mat_vec<T, type_acc, ncols_dst, 64><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 96: {
-            mul_mat_vec<T, type_acc, ncols_dst, 96><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 128: {
-            mul_mat_vec<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 160: {
-            mul_mat_vec<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 192: {
-            mul_mat_vec<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 224: {
-            mul_mat_vec<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case 256: {
-            mul_mat_vec<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
-                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
-                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        default: {
-            GGML_ABORT("fatal error");
-        } break;
-    }
-}
-
-template <typename T, typename type_acc>
-static void mul_mat_vec_cuda_switch_ncols_dst(
-        const T * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
-        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
-        const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
-        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
-        cudaStream_t stream) {
-    switch (ncols_dst) {
-        case 1:
-            launch_mul_mat_vec_cuda<T, type_acc, 1>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        case 2:
-            launch_mul_mat_vec_cuda<T, type_acc, 2>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        case 3:
-            launch_mul_mat_vec_cuda<T, type_acc, 3>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        case 4:
-            launch_mul_mat_vec_cuda<T, type_acc, 4>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        case 5:
-            launch_mul_mat_vec_cuda<T, type_acc, 5>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        case 6:
-            launch_mul_mat_vec_cuda<T, type_acc, 6>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        case 7:
-            launch_mul_mat_vec_cuda<T, type_acc, 7>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        case 8:
-            launch_mul_mat_vec_cuda<T, type_acc, 8>
-                (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            break;
-        default:
-            GGML_ABORT("fatal error");
-            break;
-    }
-}
-
-template <typename T>
-static void mul_mat_vec_cuda(
-        const T * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
-        const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst,
-        const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
-        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
-        enum ggml_prec prec, cudaStream_t stream) {
-    if constexpr(std::is_same<T, half>::value) {
-        if (prec == GGML_PREC_DEFAULT) {
-            mul_mat_vec_cuda_switch_ncols_dst<T, half>
-                (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
-                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-            return;
-        }
-    }
-    mul_mat_vec_cuda_switch_ncols_dst<T, float>
-        (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
-         nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-         stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-}
-
-void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-    GGML_ASSERT( src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const size_t ts_src0 = ggml_type_size(src0->type);
-    const size_t ts_src1 = ggml_type_size(src1->type);
-    const size_t ts_dst  = ggml_type_size(dst->type);
-
-    GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
-    GGML_ASSERT(ne13 == ne3);
-
-    GGML_ASSERT( nb00 == ts_src0);
-    GGML_ASSERT( nb10 == ts_src1);
-    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
-    GGML_ASSERT( nb0 == ts_dst);
-
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
-
-    const float * src1_d = (const float *) src1->data;
-    const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
-    float * dst_d = (float *) dst->data;
-
-    const int64_t s01 = src0->nb[1] / ts_src0;
-    const int64_t s11 = src1->nb[1] / ts_src1;
-    const int64_t s1  = dst->nb[1] / ts_dst;
-    const int64_t s02 = src0->nb[2] / ts_src0;
-    const int64_t s12 = src1->nb[2] / ts_src1;
-    const int64_t s2  = dst->nb[2] / ts_dst;
-    const int64_t s03 = src0->nb[3] / ts_src0;
-    const int64_t s13 = src1->nb[3] / ts_src1;
-    const int64_t s3  = dst->nb[3] / ts_dst;
-
-    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
-    const int64_t ncols_dst          = ids ? ne2  : ne1;
-    const int64_t nchannels_y        = ids ? ne11 : ne12;
-    const int64_t nchannels_dst      = ids ? ne1  : ne2;
-    const int64_t stride_channel_dst = ids ? s1   : s2;
-    const int64_t stride_channel_y   = ids ? s11  : s12;
-
-    GGML_ASSERT(!ids || ncols_dst == 1);
-
-    switch (src0->type) {
-        case GGML_TYPE_F32: {
-            const float * src0_d = (const float *) src0->data;
-            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
-                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
-                ne03, ne3, s03, s13, s3, prec, ctx.stream());
-        } break;
-        case GGML_TYPE_F16: {
-            const half * src0_d = (const half *) src0->data;
-            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
-                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
-                ne03, ne3, s03, s13, s3, prec, ctx.stream());
-        } break;
-        case GGML_TYPE_BF16: {
-            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
-            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
-                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
-                ne03, ne3, s03, s13, s3, prec, ctx.stream());
-        } break;
-        default:
-            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
-    }
-}
-
-void ggml_cuda_op_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne0  = dst->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    const int id = ggml_cuda_get_device();
-    const int cc = ggml_cuda_info().devices[id].cc;
-    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
-
-
-    // ggml_cuda_op provides single, contiguous matrices
-    const int64_t stride_row   = ne00;
-    const int64_t stride_col_y = ne10;
-    const int64_t stride_col_dst = id == ctx.device ? ne0 : row_diff; // main device has larger memory buffer
-    const int64_t nchannels_x   = 1;
-    const int64_t nchannels_y   = 1;
-    const int64_t nchannels_dst = 1;
-    const int64_t stride_channel_x   = 0;
-    const int64_t stride_channel_y   = 0;
-    const int64_t stride_channel_dst = 0;
-    const int64_t nsamples_x   = 1;
-    const int64_t nsamples_dst = 1;
-    const int64_t stride_sample_x   = 0;
-    const int64_t stride_sample_y   = 0;
-    const int64_t stride_sample_dst = 0;
-
-    switch (src0->type) {
-        case GGML_TYPE_F32: {
-            const float * src0_d = (const float *) src0_dd_i;
-            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
-        } break;
-        case GGML_TYPE_F16: {
-            const half * src0_d = (const half *) src0_dd_i;
-            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
-        } break;
-        case GGML_TYPE_BF16: {
-            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
-            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
-                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
-        } break;
-        default:
-            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
-    }
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_ddq_i);
-    GGML_UNUSED(src1_ncols);
-    GGML_UNUSED(src1_padded_row_size);
-}
-
-bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
-    if (src0_ne[0] % 2 != 0) {
-        return false;
-    }
-    switch (type) {
-        case GGML_TYPE_F32:
-            if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
-                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
-                    return ne11 <= 8;
-                }
-                if (cc >= GGML_CUDA_CC_TURING) {
-                    return ne11 <= 4;
-                }
-                return ne11 <= 3;
-            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
-                if (fp32_mma_hardware_available(cc)) {
-                    return ne11 <= 3;
-                }
-                return ne11 <= 8;
-            }
-            return ne11 <= 8;
-        case GGML_TYPE_F16:
-            if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
-                const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
-                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
-                    return src0_small && ne11 <= 4;
-                }
-                if (fp16_mma_hardware_available(cc)) {
-                    return src0_small && ne11 <= 3;
-                }
-                return ne11 <= 8;
-            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
-                if (fp16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
-                        return ne11 <= 5;
-                    }
-                    return ne11 <= 2;
-                }
-                return ne11 <= 8;
-            }
-            return ne11 <= 8;
-        case GGML_TYPE_BF16:
-            if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
-                const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
-                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
-                    return src0_small && ne11 <= 4;
-                }
-                if (bf16_mma_hardware_available(cc)) {
-                    return src0_small && ne11 <= 3;
-                }
-                return ne11 <= 8;
-            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
-                if (bf16_mma_hardware_available(cc)) {
-                    return ne11 <= 3;
-                }
-                return ne11 <= 8;
-            }
-            return ne11 <= 8;
-        default:
-            return false;
-    }
-}
diff --git a/src/ggml-cuda/mmv.cuh b/src/ggml-cuda/mmv.cuh
deleted file mode 100644
index 1330bcb6..00000000
--- a/src/ggml-cuda/mmv.cuh
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
-
-void ggml_cuda_op_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
-
-bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);