#include <cblas.h>
#elif defined(GGML_USE_CUBLAS)
#include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
#endif
#undef MIN
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+ // Initialize accumulator with zeros
+ __m256 acc = _mm256_setzero_ps();
+
+ // Main loop
+ for (int i = 0; i < nb; ++i) {
+ // Compute combined scale for the block
+ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+ __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+ __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+ const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+ // Multiply q with scale and accumulate
+ acc = _mm256_fmadd_ps( d, q, acc );
+ }
+
+ *s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;
// initialize cuBLAS
#if defined(GGML_USE_CUBLAS)
ggml_init_cublas();
+ #elif defined(GGML_USE_CLBLAST)
+ ggml_cl_init();
#endif
is_first_call = false;
// ggml_compute_forward_mul_mat
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(
return false;
}
+
#endif
static void ggml_compute_forward_mul_mat_f32(
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
const int64_t ne10 = src1->ne[0];
#endif
const int64_t ne11 = src1->ne[1];
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#else
+#elif defined(GGML_USE_CLBLAST)
// zT = y * xT
+ ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+ ne11, ne01, ne10,
+ 1.0f, y, ne10,
+ x, ne10,
+ 0.0f, d, ne01,
+ GGML_TYPE_F32);
+#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
+#elif defined(GGML_USE_CLBLAST)
+ const float * x = wdata;
+ const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+
+ // zT = y * xT
+ ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+ ne11, ne01, ne10,
+ 1.0f, y, ne10,
+ x, ne10,
+ 0.0f, d, ne01,
+ GGML_TYPE_F32);
#else
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
else {
GGML_ASSERT(false);
}
-#else
+#elif !defined(GGML_USE_CLBLAST)
float * const wdata = params->wdata;
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
#endif
dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
CUDA_CHECK(cudaGetLastError());
+#elif defined(GGML_USE_CLBLAST)
+ const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
#else
{
size_t id = 0;
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#else
+#elif defined(GGML_USE_CLBLAST)
// zT = y * xT
+ ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+ ne11, ne01, ne10,
+ 1.0f, y, ne10,
+ x, ne10,
+ 0.0f, d, ne01,
+ type);
+#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
size_t cur = 0;
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
cur = 0;
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
}
int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
return 1;
#else
return 0;
#endif
}
+int ggml_cpu_has_clblast(void) {
+#if defined(GGML_USE_CLBLAST)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+int ggml_cpu_has_gpublas(void) {
+ return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
+}
+
int ggml_cpu_has_sse3(void) {
#if defined(__SSE3__)
return 1;