const int nthreads = item_ct1.get_local_range(2);
const int nwarps = nthreads / WARP_SIZE;
- assert(nwarps % WARP_SIZE == 0);
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
for (int col = tid; col < ncols; col += block_size) {
int end = start + group_size;
const int nthreads = item_ct1.get_local_range(2);
const int nwarps = nthreads / WARP_SIZE;
- assert(nwarps % WARP_SIZE == 0);
start += item_ct1.get_local_id(2);
int nreduce = nwarps / WARP_SIZE;
const int tid = item_ct1.get_local_id(2);
const int nthreads = item_ct1.get_local_range(2);
const int nwarps = nthreads / WARP_SIZE;
- assert(nwarps % WARP_SIZE == 0);
float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += block_size) {
}
else {
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+ assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
const sycl::range<3> block_dims(1, 1, work_group_size);
/*
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
}
else {
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+ assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
const sycl::range<3> block_dims(1, 1, work_group_size);
/*
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
}
else {
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+ assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
const sycl::range<3> block_dims(1, 1, work_group_size);
/*
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
#include "outprod.hpp"
try {
// Perform matrix multiplication using oneMKL GEMM
- oneapi::mkl::blas::gemm(*stream,
+ oneapi::mkl::blas::column_major::gemm(*stream,
oneapi::mkl::transpose::nontrans, src1_op,
ne0, ne1, ne01,
alpha,