}
}
-static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK4_0 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
- vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK4_1 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
- vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK5_0 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
- vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK5_1 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
- vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK8_0 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
- vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
- vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
- vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
- vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
- vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
-}
-
-static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
- vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
- item_ct1);
- });
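+// Generic launcher for the templated mul_mat_vec_q kernel: each work-group
+// covers GGML_SYCL_MMV_Y rows, with WARP_SIZE work-items (sub-group size 32)
+// reducing one row each. Replaces the per-quantization launch wrappers removed above.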
+template <int qk, int qi, typename block_q_t, int vdr,
+ vec_dot_q_sycl_t vec_dot_q_sycl>
+static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
+ float *dst, const int ncols,
+ const int nrows,
+ dpct::queue_ptr stream) {
+ GGML_ASSERT(ncols % qk == 0);
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+ const sycl::range<3> block_nums(1, 1, block_num_y);
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+ stream->parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+ mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
+ vx, vy, dst, ncols, nrows, item_ct1);
+ });
}
int get_device_index_by_id(int id){
const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low;
+ // TODO: support these quantization types
+ GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
+ src0->type == GGML_TYPE_IQ2_XS ||
+ src0->type == GGML_TYPE_IQ3_XXS ||
+ src0->type == GGML_TYPE_IQ1_S));
+
switch (src0->type) {
case GGML_TYPE_Q4_0:
- mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
+ VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q4_1:
- mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
+ VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q5_0:
- mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
+ VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q5_1:
- mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
+ VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q8_0:
- mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
+ VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q2_K:
- mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
+ VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q3_K:
- mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
+ VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q4_K:
- mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
+ VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q5_K:
- mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
+ VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
case GGML_TYPE_Q6_K:
- mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
- break;
+ mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
+ VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
default:
GGML_ASSERT(false);
break;
const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream) {
- GGML_TENSOR_BINARY_OP_LOCALS
+ GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t row_diff = row_high - row_low;
return false;
}
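+ // TODO: support these quantization types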
+ if (a->type == GGML_TYPE_IQ1_S) {
+ return false;
+ }
+ if (a->type == GGML_TYPE_IQ3_XXS) {
+ return false;
+ }
if (a->type == GGML_TYPE_IQ2_XXS) {
return false;
}