} \
} while (0)
+#if CUDART_VERSION >= 12000
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
- fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+ fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
+ err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
exit(1); \
} \
} while (0)
+#else
+#define CUBLAS_CHECK(err) \
+ do { \
+ cublasStatus_t err_ = (err); \
+ if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+ exit(1); \
+ } \
+ } while (0)
+#endif // CUDART_VERSION >= 11
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_cuda_op_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
+ float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main);
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
#define WARP_SIZE 32
+#define CUDA_ADD_BLOCK_SIZE 256
#define CUDA_MUL_BLOCK_SIZE 256
-
+#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_ROPE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
// dmmv = dequantize_mul_mat_vec
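+// GGML_CUDA_DMMV_Y = rows processed per dmmv thread block (block y-dimension); device row splits below are rounded to a multiple of it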
#define GGML_CUDA_DMMV_Y 1
#endif
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] + y[i];
+}
+
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= kx) {
return;
}
dst[i] = x[i] * y[i%ky];
}
+static __global__ void silu_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] / (1.0f + expf(-x[i]));
+}
+
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
+ const int tid = threadIdx.x;
+
+ const float eps = 1e-6;
+
+ float tmp = 0.0f; // partial sum for thread in warp
+
+ for (int i = 0; i < ncols; i += WARP_SIZE) {
+ const int col = i + tid;
+ const float xi = x[row*ncols + col];
+ tmp += xi * xi;
+ }
+
+ // sum up partial sums
+ __syncthreads();
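+ // butterfly reduction over the warp: XOR shuffle halves the stride each step, leaving the full sum in every lane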
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ const float mean = tmp / ncols;
+ const float scale = 1.0f / sqrtf(mean + eps);
+
+ for (int i = 0; i < ncols; i += WARP_SIZE) {
+ const int col = i + tid;
+ dst[row*ncols + col] = scale * x[row*ncols + col];
+ }
+}
+
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q4_0 * x = (const block_q4_0 *) vx;
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const half * x = (const half *) vx;
- v0 = __half2float(x[ib + 0]);
- v1 = __half2float(x[ib + 1]);
+ v0 = __half2float(x[ib + iqs + 0]);
+ v1 = __half2float(x[ib + iqs + 1]);
}
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
const int y_offset = qr == 1 ? 1 : qk/2;
- float tmp = 0; // partial sum for thread in warp
+ float tmp = 0.0f; // partial sum for thread in warp
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
}
}
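+// RoPE: rotates each consecutive pair of values in a row by an angle that depends on the column index and the position p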
+static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
+ const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const float theta = p*powf(theta_scale, col/2);
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + 1];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
}
+static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+ silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
}
}
-static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_mul_mat_vec_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_mul_mat_vec_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_mul_mat_vec_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_mul_mat_vec_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_mul_mat_vec_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_mul_mat_vec_q2_k_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_mul_mat_vec_q3_k_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_mul_mat_vec_q4_k_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_mul_mat_vec_q5_k_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_mul_mat_vec_q6_k_cuda;
- case GGML_TYPE_F16:
- return convert_mul_mat_vec_f16_cuda;
- default:
- return nullptr;
- }
+static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
+ GGML_ASSERT(ncols % 2 == 0); // rope operates on pairs of values within a row
+ const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(num_blocks_x, nrows, 1);
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
}
// buffer pool for cuda
size_t size = 0;
};
-static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
- cuda_buffer& b = g_cuda_buffer_pool[i];
+ cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.size >= size && b.ptr != nullptr) {
void * ptr = b.ptr;
*actual_size = b.size;
static void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
- cuda_buffer& b = g_cuda_buffer_pool[i];
+ cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
CUDA_CHECK(cudaFree(ptr));
}
+
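+// VRAM scratch buffer, reused for the outputs of tensors assigned to the GPU via ggml_cuda_assign_buffers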
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_offset = 0;
+
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
#define GGML_CUDA_MAX_EVENTS 64
-static cublasHandle_t g_cublasH = nullptr;
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };
+
+static int g_device_count = -1;
+static int g_main_device = 0;
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+
+static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+
+static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
void ggml_init_cublas() {
- if (g_cublasH == nullptr) {
- // create streams
- for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking));
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking));
+ static bool initialized = false;
+
+ if (!initialized) {
+ CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+ GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
+ int64_t total_vram = 0;
+ fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ for (int id = 0; id < g_device_count; ++id) {
+ cudaDeviceProp prop;
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
+ fprintf(stderr, " Device %d: %s\n", id, prop.name);
+ g_tensor_split[id] = total_vram;
+ total_vram += prop.totalGlobalMem;
}
- // create events
- for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
- CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming));
+ for (int id = 0; id < g_device_count; ++id) {
+ g_tensor_split[id] /= total_vram;
}
- // create cublas handle
- CUBLAS_CHECK(cublasCreate(&g_cublasH));
- CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(cudaSetDevice(id));
+
+ // create streams
+ for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
+ }
+ // create events
+ for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
+ }
+
+ // create cublas handle
+ CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
+ CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
+ }
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+ initialized = true;
+ }
+}
+
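+// convert the user-supplied per-device proportions into cumulative, normalized boundaries:
+// g_tensor_split[i] is the fraction of total rows at which device i's slice begins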
+void ggml_cuda_set_tensor_split(const float * tensor_split) {
+ bool all_zero = true;
+ for (int i = 0; i < g_device_count; ++i) {
+ if (tensor_split[i] != 0.0f) {
+ all_zero = false;
+ break;
+ }
+ }
+ if (all_zero) {
+ return;
+ }
+ float split_sum = 0.0f;
+ for (int i = 0; i < g_device_count; ++i) {
+ g_tensor_split[i] = split_sum;
+ split_sum += tensor_split[i];
+ }
+ for (int i = 0; i < g_device_count; ++i) {
+ g_tensor_split[i] /= split_sum;
}
}
CUDA_CHECK(cudaFreeHost(ptr));
}
-static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
- const uint64_t ne0 = src->ne[0];
- const uint64_t ne1 = src->ne[1];
- const uint64_t nb0 = src->nb[0];
- const uint64_t nb1 = src->nb[1];
- const uint64_t nb2 = src->nb[2];
- const uint64_t nb3 = src->nb[3];
+static cudaError_t ggml_cuda_h2d_tensor_2d(
+ void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
+
+ char * dst_char = (char *) dst;
+ const int64_t ne0 = src->ne[0];
+ const int64_t nb0 = src->nb[0];
+ const int64_t nb1 = src->nb[1];
+ const int64_t nb2 = src->nb[2];
+ const int64_t nb3 = src->nb[3];
const enum ggml_type type = src->type;
- const size_t ts = ggml_type_size(type);
- const size_t bs = ggml_blck_size(type);
+ const int64_t ts = ggml_type_size(type);
+ const int64_t bs = ggml_blck_size(type);
+ int64_t i1_diff = i1_high - i1_low;
- const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+ const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3);
if (nb0 == ts && nb1 == ts*ne0/bs) {
- return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
+ return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream);
} else if (nb0 == ts) {
- return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
+ return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream);
} else {
- for (uint64_t i1 = 0; i1 < ne1; i1++) {
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
+ void * rd = (void *) (dst_char + i1*ts*ne0/bs);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
if (r != cudaSuccess) return r;
}
}
-static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
+inline void ggml_cuda_op_add(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne0 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) i02;
+ (void) i1;
+}
+
+inline void ggml_cuda_op_mul(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[2];
- const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
- size_t x_size, d_size;
-
- float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
- float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
- float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- const int i0 = i03*ne02 + i02;
- float * c_X2 = d_X + i0*ne01*ne00;
- float * c_D2 = d_D + i0*ne01*ne00;
-
- cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
- cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
- cudaEvent_t cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
-
- // copy src0 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
- CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
-
- // wait for data
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
-
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int64_t i11 = i01%ne11;
- const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
- float * c_X1 = c_X2 + i01*ne00;
- float * c_Y = d_Y + i1*ne10;
- float * c_D1 = c_D2 + i01*ne00;
-
- // compute
- mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
- CUDA_CHECK(cudaGetLastError());
- }
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
- }
+ for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
+ const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
+
+ float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
+ float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
+ float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
+
+ // compute
+ mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
}
- CUDA_CHECK(cudaDeviceSynchronize());
- ggml_cuda_pool_free(d_X, x_size);
- ggml_cuda_pool_free(d_D, d_size);
+
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) i02;
}
-static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+inline void ggml_cuda_op_silu(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i02;
+ (void) i1;
+}
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
+inline void ggml_cuda_op_rms_norm(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
- const float alpha = 1.0f;
- const float beta = 0.0f;
- const int x_ne = ne01 * ne00;
- const int y_ne = ne11 * ne10;
- const int d_ne = ne11 * ne01;
- const int n_mm = ne03 * ne02;
-
- size_t x_size, y_size, d_size;
- float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
- float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
- float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- int i = i03*ne02 + i02;
- cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
-
- float * c_X = d_X + i * x_ne;
- float * c_Y = d_Y + i * y_ne;
- float * c_D = d_D + i * d_ne;
-
- // copy data to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
-
- // compute
- CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
- CUBLAS_CHECK(
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
- ne01, ne11, ne10,
- &alpha, c_X, ne00,
- c_Y, ne10,
- &beta, c_D, ne01));
-
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
- }
+ const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i02;
+ (void) i1;
+}
+
+inline void ggml_cuda_op_dequantize_mul_mat_vec(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddq_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t nrows = i01_high - i01_low;
+
+ switch (src0->type) {
+ case GGML_TYPE_Q4_0:
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q4_1:
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_0:
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_1:
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q8_0:
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q2_K:
+ dequantize_mul_mat_vec_q2_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q3_K:
+ dequantize_mul_mat_vec_q3_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q4_K:
+ dequantize_mul_mat_vec_q4_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_K:
+ dequantize_mul_mat_vec_q5_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q6_K:
+ dequantize_mul_mat_vec_q6_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_F16:
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ default:
+ GGML_ASSERT(false);
+ break;
}
+ CUDA_CHECK(cudaGetLastError());
- CUDA_CHECK(cudaDeviceSynchronize());
- ggml_cuda_pool_free(d_X, x_size);
- ggml_cuda_pool_free(d_Y, y_size);
- ggml_cuda_pool_free(d_D, d_size);
+ (void) src1;
+ (void) dst;
+ (void) src0_ddf_i;
+ (void) i02;
+ (void) i1;
}
-static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+inline void ggml_cuda_op_mul_mat_cublas(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+
const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
- const int nb10 = src1->nb[0];
- const int nb11 = src1->nb[1];
- const int nb12 = src1->nb[2];
- const int nb13 = src1->nb[3];
+ const int64_t ne0 = dst->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+
+ // the main device has a larger memory buffer to hold the results from all GPUs
+ // ldc == nrows of the matrix that cuBLAS writes into
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
+
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
+ CUBLAS_CHECK(
+ cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ i01_diff, ne11, ne10,
+ &alpha, src0_ddf_i, ne00,
+ src1_ddf_i, ne10,
+ &beta, dst_ddf_i, ldc));
+
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) i02;
+ (void) i1;
+}
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
+inline void ggml_cuda_op_rope(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
- const float alpha = 1.0f;
- const float beta = 0.0f;
- const int x_ne = ne01 * ne00;
- const int y_ne = ne11 * ne10;
- const int d_ne = ne11 * ne01;
- const int n_mm = ne03 * ne02;
-
- size_t x_size, y_size, d_size;
- half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size);
- half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size);
- float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
-
- bool src1_cont_rows = nb10 == sizeof(float);
- bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- int i = i03*ne02 + i02;
- cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
-
- half * c_X = d_X + i * x_ne;
- half * c_Y = d_Y + i * y_ne;
- float * c_D = d_D + i * d_ne;
-
- // copy src0 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
-
- // convert src1 to fp16
- // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
- char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
- if (src1_cont_rows) {
- if (src1_cont_cols) {
- ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
- }
- else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
- }
- }
- }
- else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- for (int64_t i00 = 0; i00 < ne10; i00++) {
- // very slow due to no inlining
- tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
- }
- }
- }
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
- // copy src1 to device
- CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
-
- // compute
- CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
- CUBLAS_CHECK(
- cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
- ne01, ne11, ne10,
- &alpha, c_X, CUDA_R_16F, ne00,
- c_Y, CUDA_R_16F, ne10,
- &beta, c_D, CUDA_R_32F, ne01,
- CUBLAS_COMPUTE_32F_FAST_16F,
- CUBLAS_GEMM_DEFAULT));
-
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
- }
- }
+ const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ const int n_past = ((int32_t *) src1->data)[0];
+ const int n_dims = ((int32_t *) src1->data)[1];
+ const int mode = ((int32_t *) src1->data)[2];
+ GGML_ASSERT(mode == 0);
+
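+ // theta_scale = 10000^(-2/n_dims); p is the position used for this slice (n_past + i02)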
+ const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
- CUDA_CHECK(cudaDeviceSynchronize());
- ggml_cuda_pool_free(d_X, x_size);
- ggml_cuda_pool_free(d_Y, y_size);
- ggml_cuda_pool_free(d_D, d_size);
+ // compute
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i1;
}
-static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
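+// common driver for all CUDA ops: for each device it picks a row range from g_tensor_split,
+// copies the needed src0/src1 slices to the device, optionally dequantizes src0 to f32,
+// runs the op on the main stream and copies the result back to the host or main device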
+static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ ggml_cuda_op_t op, bool src0_needs_f32) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
+ const int64_t nrows0 = ggml_nrows(src0);
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
+ const bool use_src1 = src1 != nullptr;
+ const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
+ const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
+ const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
+ const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
- const ggml_type type = src0->type;
- const bool mul_mat_vec = ne11 == 1;
- const float alpha = 1.0f;
- const float beta = 0.0f;
- const int x_ne = ne01 * ne00;
- const int y_ne = ne11 * ne10;
- const int d_ne = ne11 * ne01;
- const int n_mm = ne03 * ne02;
- const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
-
- size_t x_size, y_size, d_size, q_size;
- float * d_X = nullptr;
- if (!mul_mat_vec) {
- d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
- }
- float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
- float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
- char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
-
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
- dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
- GGML_ASSERT(to_fp32_cuda != nullptr);
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- int i = i03*ne02 + i02;
- cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
- cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
- cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
-
- float * c_Y = d_Y + i * y_ne;
- float * c_D = d_D + i * d_ne;
- char * c_Q = d_Q + i * q_sz;
-
- // copy src0 to device if necessary
- if (src0->backend == GGML_BACKEND_CPU) {
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
- } else if (src0->backend == GGML_BACKEND_CUDA) {
- c_Q = ((char *) src0->data) + i * q_sz;
+ GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+ // strides for iteration over dims 3 and 2
+ const int64_t src0_stride = ne00 * ne01;
+ const int64_t src1_stride = ne10 * ne11;
+ const int64_t dst_stride = ne0 * ne1;
+ const int64_t num_iters = ne02 * ne03;
+
+ const size_t src0_ts = ggml_type_size(src0->type);
+ const size_t src0_bs = ggml_blck_size(src0->type);
+
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+ const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
+
+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+
+ // dd = data device
+ char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
+ float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+ float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+ // asq = actual size quantized, asf = actual size float
+ size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
+ size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
+ size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+ size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
+
+ for (int id = 0; id < g_device_count; ++id) {
+ if (!split && id != g_main_device) {
+ continue;
+ }
+
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+ int64_t row_low, row_high;
+ if (split) {
+ row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
+ row_low -= row_low % GGML_CUDA_DMMV_Y;
+ row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
+ row_high -= row_high % GGML_CUDA_DMMV_Y;
+ } else {
+ row_low = 0;
+ row_high = nrows0;
+ }
+ if (row_low == row_high) {
+ continue;
+ }
+
+ int64_t row_diff = row_high - row_low;
+
+ cudaSetDevice(id);
+
+ if (src0_on_device) {
+ if (src0_is_f32) {
+ src0_ddf[id] = (float *) src0_extra->data_device[id];
+ } else {
+ src0_ddq[id] = (char *) src0_extra->data_device[id];
+ }
+ } else {
+ if (src0_is_f32) {
+ src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
} else {
- GGML_ASSERT(false);
+ src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
}
- if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
- CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
-
- // copy src1 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
-
- // wait for data
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
-
- // compute
- //printf("Calling dmmv\n");
- dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
- CUDA_CHECK(cudaGetLastError());
-
- } else { // general dequantization kernel + cuBLAS matrix matrix multiplication
- float * c_X = d_X + i * x_ne;
-
-//typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
- // convert src0 to fp32 on device
- to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
- CUDA_CHECK(cudaGetLastError());
- CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
-
- // copy src1 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
-
- // wait for conversion
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
-
- // compute
- CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
- CUBLAS_CHECK(
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
- ne01, ne11, ne10,
- &alpha, c_X, ne00,
- c_Y, ne10,
- &beta, c_D, ne01));
+ }
+
+ if (src0_needs_f32 && !src0_is_f32) {
+ src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+ }
+
+ if (use_src1) {
+ if (src1_on_device) {
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
+ } else {
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
}
+ }
+ if (dst_on_device) {
+ dst_ddf[id] = (float *) dst_extra->data_device[id];
+ } else {
+ size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
+ dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
+ }
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ const int64_t i13 = i03 % ne13;
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ const int64_t i12 = i02 % ne12;
+
+ const int64_t i0 = i03*ne02 + i02;
+ const int64_t i0_offset_low = row_low/ne01;
+ const int64_t i0_offset_high = row_high/ne01;
+
+ int64_t i01_low = 0;
+ int64_t i01_high = ne01;
+ if (split) {
+ if (i0 < i0_offset_low || i0 > i0_offset_high) {
+ continue;
+ }
+ if (i0 == i0_offset_low) {
+ i01_low = row_low % ne01;
+ }
+ if (i0 == i0_offset_high) {
+ i01_high = row_high % ne01;
+ }
+ }
+ const int64_t i01_diff = i01_high - i01_low;
+ if (i01_diff == 0) {
+ continue;
+ }
+ const int64_t i11 = i13*ne12 + i12;
+
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
+ cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
+ cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+
+ // for split tensors the data begins at i0 == i0_offset_low
+ char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+ float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+ float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+
+ // for split tensors the data pointer needs to be rounded down
+ // to the bin edge for i03, i02 bins beyond the first
+ if (i0 - i0_offset_low > 0) {
+ src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
+ src0_ddf_i -= (row_low % ne01)*ne00;
+ }
+ if (i0 - i0_offset_low > 0) {
+ dst_ddf_i -= (row_low % ne0)*ne1;
+ }
+
+ // the main device memory buffer can be on VRAM scratch, with space for all partial results
+ // in that case an offset on dst_ddf_i is needed
+ if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+ dst_ddf_i += i01_low; // offset is 0 if no tensor split
+ }
+
+ // copy src0, src1 to device if necessary
+ if (use_src1) {
+ if (src1->backend == GGML_BACKEND_CPU) {
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1));
+ } else if (src1->backend == GGML_BACKEND_GPU) {
+ if (id != g_main_device) {
+ float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+ src1_ddf_i_source += i11*src1_stride;
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
+ cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+ CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
+ if (!src0_on_device) {
+ if (src0_is_f32) {
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ } else {
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ }
+ }
+
+ // convert src0 to f32 if it's necessary for the ggml_cuda_op
+ if (src0_needs_f32 && !src0_is_f32) {
+ to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+ }
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+ // wait with main stream until src1 memcpy is done
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
+
+ // do the computation
+ op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+
+ // copy dst to host or other device if necessary
+ if (!dst_on_device) {
+ void * dst_off_device;
+ cudaMemcpyKind kind;
+ if (dst->backend == GGML_BACKEND_CPU) {
+ dst_off_device = dst->data;
+ kind = cudaMemcpyDeviceToHost;
+ } else if (dst->backend == GGML_BACKEND_GPU) {
+ dst_off_device = dst_extra->data_device[g_main_device];
+ kind = cudaMemcpyDeviceToDevice;
+ } else {
+ GGML_ASSERT(false);
+ }
+ if (split) {
+ // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+ // dst is NOT transposed.
+ // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+ // Instead they need to be copied to the correct slice in ne0 = dst row index.
+ // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+ for (int64_t j = 0; j < ne1; ++j) {
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
+ }
+ } else {
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+ }
+ }
+ }
}
}
- CUDA_CHECK(cudaDeviceSynchronize());
- if (!mul_mat_vec) {
- ggml_cuda_pool_free(d_X, x_size);
+ // wait until each device is finished, then free their buffers
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(cudaDeviceSynchronize());
+ if (src0_asq[id] > 0) {
+ ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
+ }
+ if (src0_asf[id] > 0) {
+ ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+ }
+ if (src1_asf[id] > 0) {
+ ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+ }
+ if (dst_asf[id] > 0) {
+ ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+ }
}
- ggml_cuda_pool_free(d_Y, y_size);
- ggml_cuda_pool_free(d_D, d_size);
- ggml_cuda_pool_free(d_Q, q_size);
}
-void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_mul_f32(src0, src1, dst);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
+}
+
+void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
+}
+
+void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
+}
+
+void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
}
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
+ // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
+ // fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
+ // src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ // src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+ // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
+ // return false;
+ // }
+
// TODO: find the optimal values for these
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
return true;
}
return false;
}
-bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
- size_t src0_sz = ggml_nbytes(src0);
- size_t src1_sz = ggml_nbytes(src1);
-
- // mul_mat_q: src0 is converted to fp32 on device
- size_t mul_mat_q_transfer = src0_sz + src1_sz;
+void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ if (src0->type == GGML_TYPE_F32) {
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
+ } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
+ if (src1->ne[1] == 1) {
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+ } else {
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+}
- // mul_mat_f16: src1 is converted to fp16 on cpu
- size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1);
+void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
+}
- // choose the smaller one to transfer to the device
- // TODO: this is not always the best choice due to the overhead of converting to fp16
- return mul_mat_f16_transfer < mul_mat_q_transfer;
+void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ (void) src0;
+ (void) src1;
+ (void) dst;
}
-void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
- GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+ FILE * fp = fopen(fname, "rb");
+ int nrows = ggml_nrows(tensor);
+ const size_t nb1 = tensor->nb[1];
+ ggml_backend backend = tensor->backend;
+ struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
- if (src0->type == GGML_TYPE_F32) {
- ggml_cuda_mul_mat_f32(src0, src1, dst);
- }
- else if (src0->type == GGML_TYPE_F16) {
- if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
- ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
+ for (int id = 0; id < g_device_count; ++id) {
+ extra->data_device[id] = nullptr;
+
+ if (backend == GGML_BACKEND_GPU && id != g_main_device) {
+ continue;
}
- else {
- ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+
+ cudaSetDevice(id);
+
+ int row_low, row_high;
+ if (backend == GGML_BACKEND_GPU) {
+ row_low = 0;
+ row_high = nrows;
+ } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+ row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
+ row_low -= row_low % GGML_CUDA_DMMV_Y;
+ row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
+ row_high -= row_high % GGML_CUDA_DMMV_Y;
+ } else {
+ GGML_ASSERT(false);
}
+ if (row_low == row_high) {
+ continue;
+ }
+
+ int64_t nrows_split = row_high - row_low;
+
+ const size_t offset_split = offset + row_low*nb1;
+ const size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+ void * buf;
+ CUDA_CHECK(cudaMalloc(&buf, size));
+ void * buf_host = malloc(size);
+
+#ifdef _WIN32
+ int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
+#else
+ int ret = fseek(fp, (long) offset_split, SEEK_SET);
+#endif
+ GGML_ASSERT(ret == 0); // same
+
+ size_t ret2 = fread(buf_host, size, 1, fp);
+ if (ret2 != 1) {
+ fprintf(stderr, "unexpectedly reached end of file");
+ exit(1);
+ }
+
+ cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+ cudaDeviceSynchronize();
+
+ free(buf_host);
+ extra->data_device[id] = buf;
}
- else if (ggml_is_quantized(src0->type)) {
- ggml_cuda_mul_mat_q_f32(src0, src1, dst);
- }
- else {
- GGML_ASSERT(false);
- }
+
+ tensor->extra = extra;
+ fclose(fp);
}
-size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
- return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+void ggml_cuda_free_data(struct ggml_tensor * tensor) {
+ if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+ return;
}
- else {
- return 0;
+
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+
+ for (int id = 0; id < g_device_count; ++id) {
+ if (extra->data_device[id] == nullptr) {
+ continue;
+ }
+
+ CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
}
+
+ delete extra;
}
-void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
- const int64_t ne0 = tensor->ne[0];
- const int64_t ne1 = tensor->ne[1];
- const int64_t ne2 = tensor->ne[2];
- const int64_t ne3 = tensor->ne[3];
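+// place the tensor's data in the main device's VRAM scratch buffer, wrapping around to offset 0 when the buffer is full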
+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+ if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
+ ggml_cuda_assign_buffers(tensor->src0);
+ }
- const ggml_type type = tensor->type;
- const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+ const size_t size = ggml_nbytes(tensor);
+ GGML_ASSERT(size <= g_scratch_size);
+ if (g_scratch_offset + size > g_scratch_size) {
+ g_scratch_offset = 0;
+ }
- size_t q_size;
- char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+ tensor->backend = GGML_BACKEND_GPU;
+ struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- cudaStream_t cudaStream2 = g_cudaStreams2[0];
+ bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data;
- // copy tensor to device
- for (int64_t i3 = 0; i3 < ne3; i3++) {
- for (int64_t i2 = 0; i2 < ne2; i2++) {
- int i = i3*ne2 + i2;
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
+ CUDA_CHECK(cudaSetDevice(g_main_device));
+ if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+ extra->data_device[g_main_device] = src0_extra->data_device[g_main_device];
+ GGML_ASSERT(false);
+ } else {
+ char * data = (char *) g_scratch_buffer;
+ if (data == nullptr) {
+ CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
+ g_scratch_buffer = data;
}
+ extra->data_device[g_main_device] = data + g_scratch_offset;
}
- tensor->data = dst;
- tensor->backend = GGML_BACKEND_CUDA;
-}
+ // fprintf(stderr, "data=%p offset=%ld data_device=%p\n", data, g_scratch_offset, extra->data_device[0]);
+ g_scratch_offset += size;
+ // fprintf(stderr, "%s: scratch %d, %p - %p\n",
+ // tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size);
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
- FILE * fp = fopen(fname, "rb");
+ GGML_ASSERT(g_scratch_offset <= g_scratch_size);
+ tensor->extra = extra;
+}
- const size_t size = ggml_nbytes(tensor);
+void ggml_cuda_set_main_device(int main_device) {
+ if (main_device >= g_device_count) {
+ fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+ main_device, g_device_count, g_main_device);
+ return;
+ }
+ g_main_device = main_device;
+ if (g_device_count > 1) {
+ cudaDeviceProp prop;
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
+ fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
+ }
+}
- void * buf;
- CUDA_CHECK(cudaMalloc(&buf, size));
- void * buf_host = malloc(size);
+void ggml_cuda_set_scratch_size(size_t scratch_size) {
+ g_scratch_size = scratch_size;
+}
-#ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
-#else
- int ret = fseek(fp, (long) offset, SEEK_SET);
-#endif
- GGML_ASSERT(ret == 0); // same
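+// dispatch a graph node to its CUDA implementation; returns false if the op is not handled on the GPU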
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+ ggml_cuda_func_t func;
+ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+ || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+ || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
- size_t ret2 = fread(buf_host, size, 1, fp);
- if (ret2 != 1) {
- fprintf(stderr, "unexpectedly reached end of file");
- exit(1);
+ switch (tensor->op) {
+ case GGML_OP_ADD:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_add;
+ break;
+ case GGML_OP_MUL:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_mul;
+ break;
+ case GGML_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_silu;
+ break;
+ case GGML_OP_RMS_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_rms_norm;
+ break;
+ case GGML_OP_MUL_MAT:
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+ return false;
+ }
+ func = ggml_cuda_mul_mat;
+ break;
+ case GGML_OP_RESHAPE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_nop;
+ break;
+ case GGML_OP_ROPE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_rope;
+ break;
+ default:
+ return false;
}
- cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
- cudaDeviceSynchronize();
-
- tensor->data = buf;
- free(buf_host);
- fclose(fp);
+ if (params->ith != 0) {
+ return true;
+ }
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return true;
+ }
+ func(tensor->src0, tensor->src1, tensor);
+ return true;
}
// TODO: dynamically determine these sizes
// needs modifications in ggml
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+ (void) tensor;
+}
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
static std::map<e_model, size_t> k_sizes = {
struct ggml_tensor * output;
std::vector<llama_layer> layers;
+ int n_gpu_layers;
// context
struct ggml_context * ctx = NULL;
if (ctx) {
ggml_free(ctx);
}
+
+#ifdef GGML_USE_CUBLAS
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ ggml_cuda_free_data(tensors_by_name[i].second);
+ }
+#endif // GGML_USE_CUBLAS
}
};
}
ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
tensor->backend = backend;
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.n_ctx =*/ 512,
+ /*.n_batch =*/ 512,
/*.gpu_layers =*/ 0,
+ /*.main_gpu =*/ 0,
+ /*.tensor_split =*/ {0},
/*.seed =*/ -1,
/*.f16_kv =*/ true,
/*.logits_all =*/ false,
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_batch,
int n_gpu_layers,
+ int main_gpu,
+ const float * tensor_split,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
auto & model = lctx.model;
model.hparams = ml->file_loaders.at(0)->hparams;
+ model.n_gpu_layers = n_gpu_layers;
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
auto & hparams = model.hparams;
}
#if defined(GGML_USE_CUBLAS)
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+ ggml_cuda_set_main_device(main_gpu);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
#elif defined(GGML_USE_CLBLAST)
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
#else
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
#endif
// prepare memory for the weights
- size_t vram_total = 0;
+ size_t vram_weights = 0;
+ size_t vram_scratch = 0;
{
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
{
ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) { // NOLINT
- backend_output = LLAMA_BACKEND_OFFLOAD;
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_output = GGML_BACKEND_CPU;
}
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
auto & layer = model.layers[i];
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
- if (backend == LLAMA_BACKEND_OFFLOAD) {
- vram_total +=
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
- mmapped_size - vram_total + // weights in VRAM not in memory
+ mmapped_size - vram_weights + // weights in VRAM not in memory
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at (model.type);
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+#ifdef GGML_USE_CUBLAS
+ vram_scratch = n_batch * MB;
+ ggml_cuda_set_scratch_size(vram_scratch);
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch / MB);
+ }
+#endif // GGML_USE_CUBLAS
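+ // The scratch buffer presumably backs the intermediate tensors that
+ // ggml_cuda_assign_buffers later places in VRAM; sizing it as n_batch * 1 MB
+ // is a heuristic, not a hard requirement.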
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
}
- fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+ __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
#else
(void) n_gpu_layers;
#endif
#if defined(GGML_USE_CUBLAS)
{
+ ggml_cuda_set_tensor_split(tensor_split);
+
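+ // tensor_split comes straight from llama_context_params. A hypothetical caller
+ // splitting the row-split weights 60/40 between two GPUs might do (illustrative
+ // only, assuming the field is the fixed-size float array its {0} default
+ // initializer suggests):
+ //   struct llama_context_params p = llama_context_default_params();
+ //   p.n_gpu_layers    = 40;
+ //   p.main_gpu        = 0; // main device for non-split tensors (see ggml_cuda_set_main_device above)
+ //   p.tensor_split[0] = 0.6f;
+ //   p.tensor_split[1] = 0.4f;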
size_t done_size = 0;
size_t data_size = 0;
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
}
}
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ ggml_backend backend = lt.ggml_tensor->backend;
+ if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
continue;
}
if (progress_callback) {
}
}
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CL) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
continue;
}
if (progress_callback) {
done_size += lt.size;
}
}
+#else
+ (void) n_batch;
+ (void) tensor_split;
#endif
if (progress_callback) {
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_batch,
int n_gpu_layers,
+ int main_gpu,
+ float * tensor_split,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
- llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
- vocab_only, progress_callback, progress_callback_user_data);
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+ use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::exception & err) {
fprintf(stderr, "error loading model: %s\n", err.what());
LLAMA_ASSERT(!!kv_self.ctx);
- const int n_embd = hparams.n_embd;
- const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_head = hparams.n_head;
- const int n_vocab = hparams.n_vocab;
- const int n_rot = hparams.n_embd/hparams.n_head;
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_ctx = hparams.n_ctx;
+ const int n_head = hparams.n_head;
+ const int n_vocab = hparams.n_vocab;
+ const int n_rot = hparams.n_embd/hparams.n_head;
+ const int n_gpu_layers = model.n_gpu_layers;
auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute;
struct ggml_tensor * cur;
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
for (int il = 0; il < n_layer; ++il) {
+ offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+ if (il >= i_gpu_start) {
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ }
+#endif // GGML_USE_CUBLAS
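+ // offload_func is applied to every intermediate tensor of this layer; for CPU
+ // layers it must be a no-op. A minimal sketch of the expected signatures
+ // (offload_func_t and llama_nop are defined earlier in this file):
+ //   typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+ //   static void llama_nop(struct ggml_tensor * tensor) { (void) tensor; }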
+
struct ggml_tensor * inpSA = inpL;
lctx.use_buf(ctx0, 0);
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_0");
// cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "attention_norm_0");
}
// self-attention
{
// compute Q and K and RoPE them
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ // offload_func(tmpq);
+ ggml_set_name(tmpq, "tmpq");
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- ggml_set_name(Qcur, "Qcur");
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ // offload_func(tmpk);
+ ggml_set_name(tmpk, "tmpk");
+
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Kcur, "Kcur");
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ ggml_set_name(Qcur, "Qcur");
+
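+ // tmpq/tmpk are kept as separate graph nodes rather than nesting the matmuls
+ // inside ggml_rope_inplace, so they can be named and, if desired, offloaded
+ // independently of the reshape/RoPE that follows.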
// store key and value to memory
{
// compute the transposed [N, n_embd] V matrix
ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ ggml_set_name(v, "v");
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
cur = ggml_mul_mat(ctx0,
model.layers[il].wo,
cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_wo");
}
lctx.use_buf(ctx0, 1);
+ //ggml_cuda_set_scratch(1);
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+ offload_func(inpFF);
+ ggml_set_name(inpFF, "inpFF");
// feed-forward network
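+ // SwiGLU feed-forward: out = w2 @ (silu(w1 @ x) * (w3 @ x)), where * is the
+ // elementwise ggml_mul of "silu" and "result_w3" below.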
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_1");
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "ffn_norm");
}
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
model.layers[il].w3,
cur);
+ offload_func(tmp);
+ ggml_set_name(tmp, "result_w3");
cur = ggml_mul_mat(ctx0,
model.layers[il].w1,
cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");
// SILU activation
cur = ggml_silu(ctx0, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "silu");
cur = ggml_mul(ctx0, cur, tmp);
+ offload_func(cur);
+ ggml_set_name(cur, "silu_x_result_w3");
cur = ggml_mul_mat(ctx0,
model.layers[il].w2,
cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");
}
cur = ggml_add(ctx0, cur, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "inpFF_+_result_w2");
// input for next layer
inpL = cur;
+
}
lctx.use_buf(ctx0, 0);
+ //ggml_cuda_set_scratch(0);
// used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL;
+ offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ }
+#endif // GGML_USE_CUBLAS
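+ // The final norm is offloaded only when the output layer itself is on the GPU
+ // (n_gpu_layers > n_layer), matching the backend_output choice made at load time.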
+
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_inpL");
// cur = cur*norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.norm);
+ offload_func(cur);
+ ggml_set_name(cur, "result_norm");
embeddings = cur;
}
+
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
+ ggml_set_name(cur, "result_output");
lctx.use_buf(ctx0, -1);
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
- params.use_mmap, params.use_mlock, params.vocab_only,
- params.progress_callback, params.progress_callback_user_data)) {
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
llama_free(ctx);
return nullptr;