#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceGetMemPool hipDeviceGetMemPool
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
-#define cudaFreeAsync hipFreeAsync
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
-#define cudaMallocFromPoolAsync hipMallocFromPoolAsync
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemPool_t hipMemPool_t
-#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
-#define cudaMemPoolSetAttribute hipMemPoolSetAttribute
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
do { \
cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \
- int dev_id; \
- cudaGetDevice(&dev_id); \
+ int id; \
+ cudaGetDevice(&id); \
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \
- fprintf(stderr, "current device: %d\n", dev_id); \
+ fprintf(stderr, "current device: %d\n", id); \
exit(1); \
} \
} while (0)
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
- int dev_id; \
- cudaGetDevice(&dev_id); \
+ int id; \
+ cudaGetDevice(&id); \
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
- fprintf(stderr, "current device: %d\n", dev_id); \
+ fprintf(stderr, "current device: %d\n", id); \
exit(1); \
} \
} while (0)
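+// threads per block used when launching the corresponding element-wise kernels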
#define CUDA_MUL_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256
#define CUDA_CLAMP_BLOCK_SIZE 256
#define MAX_STREAMS 8
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
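+// per-device CUDA streams; the split mul_mat path cycles through them to overlap work on chunks of src1 columns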
-static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
dst[i] = x[i] / (1.0f + expf(-x[i]));
}
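+// element-wise ReLU and square kernels: one thread per element, with an early-out for
+// threads past k because the grid is rounded up to a whole number of blocks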
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] * x[i];
+}
+
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
// qk = quantized weights per x block
// qr = number of quantized weights per data value in x block
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
int ofs0, int ofs1, int IW, int IH, int CHW,
int s0, int s1, int p0, int p1, int d0, int d1) {
const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
- const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+ const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
const int offset_dst =
(threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
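+// launch helpers: num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up to whole blocks,
+// e.g. k = 1000 with 256 threads per block gives 4 blocks (1024 threads, the last 24 exit early)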
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+ relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+ sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
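+ // (on CUDA the y and z grid dimensions are capped at 65535 blocks, while x allows up to 2^31 - 1)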
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK4_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK4_1 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK5_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK5_1 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<1, 1, convert_f16>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
return ptr;
}
-static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
- if (g_cudaMemPools[id] == nullptr) {
- return ggml_cuda_pool_malloc(size, actual_size);
- }
- void *ptr;
- CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
- *actual_size = size;
- return ptr;
-}
-
static void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaFree(ptr));
}
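+// set to true by ggml_init_cublas() once a device has been found and the cuBLAS handles have been created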
+static bool g_cublas_loaded = false;
-static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
- if (g_cudaMemPools[id] == nullptr) {
- return ggml_cuda_pool_free(ptr, actual_size);
- }
- CUDA_CHECK(cudaFreeAsync(ptr, stream));
+bool ggml_cublas_loaded(void) {
+ return g_cublas_loaded;
}
void ggml_init_cublas() {
CUDA_CHECK(cudaDeviceSynchronize());
#endif
- CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
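+ // do not abort when no CUDA device is found; just record that cuBLAS is unavailable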
+ if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+ initialized = true;
+ g_cublas_loaded = false;
+ return;
+ }
+
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ)
// create cublas handle
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-
- // configure memory pool
- cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
- if (err == cudaSuccess) {
- size_t treshold = UINT64_MAX;
- CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
- }
}
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
initialized = true;
+ g_cublas_loaded = true;
}
}
(void) src1_dd;
}
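+// unary element-wise ops: run the kernels above over every element of src0
+// (dispatched through ggml_cuda_op_flatten further below)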
+inline void ggml_cuda_op_relu(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+}
+
inline void ggml_cuda_op_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = row_diff*ne00;
- src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream);
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
}
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = src1_ncols*ne10;
- src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
+ src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
}
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
- size_t dst_f16_as = 0;
- half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);
+ size_t dst_as = 0;
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
- if (dst_f16_as != 0) {
- ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
- }
+ ggml_cuda_pool_free(dst_f16, dst_as);
if (src0_as != 0) {
- ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream);
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
}
+
if (src1_as != 0) {
- ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream);
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
}
}
else {
if (src0->type != GGML_TYPE_F32) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
GGML_ASSERT(to_fp32_cuda != nullptr);
- src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
}
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
&beta, dst_dd_i, ldc));
if (src0_as != 0) {
- ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream);
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
}
}
int64_t row_low[GGML_CUDA_MAX_DEVICES];
int64_t row_high[GGML_CUDA_MAX_DEVICES];
+ int used_devices = 0;
+
for (int64_t id = 0; id < g_device_count; ++id) {
// by default, use all rows
row_low[id] = 0;
continue;
}
+ used_devices++;
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
src0_dd[id] = (char *) src0_extra->data_device[id];
} else {
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
- src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream);
+ src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
}
if (src1_on_device && src1_is_contiguous) {
src1_ddf[id] = (float *) src1_extra->data_device[id];
} else {
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream);
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
}
if (convert_src1_to_q8_1) {
- const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
- src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
+ src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
if (src1_on_device && src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
- // CUDA_CHECK(cudaGetLastError());
+ CUDA_CHECK(cudaGetLastError());
}
}
dst_dd[id] = (float *) dst_extra->data_device[id];
} else {
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
- dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream);
+ dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
}
}
// if multiple devices are used they need to wait for the main device
// here an event is recorded that signals that the main device has finished calculating the input data
- if (split && g_device_count > 1) {
+ if (split && used_devices > 1) {
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
}
- const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
}
}
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+ continue;
+ }
+ CUDA_CHECK(ggml_cuda_set_device(id));
+
+ // free buffers again when done
+ if (src0_as[id] > 0) {
+ ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+ }
+ if (src1_asf[id] > 0) {
+ ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+ }
+ if (src1_asq[id] > 0) {
+ ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+ }
+ if (dst_as[id] > 0) {
+ ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+ }
+ }
+
// main device waits for all other devices to be finished
if (split && g_device_count > 1) {
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
for (int64_t id = 0; id < g_device_count; ++id) {
+ if (row_low[id] == row_high[id]) {
+ continue;
+ }
for (int64_t is = 0; is < is_max; ++is) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
}
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
}
-
- for (int64_t id = 0; id < g_device_count; ++id) {
- if (src0_as[id] > 0) {
- ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
- }
- if (src1_asf[id] > 0) {
- ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
- }
- if (src1_asq[id] > 0) {
- ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
- }
- if (dst_as[id] > 0) {
- ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
- }
- }
}
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
}
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
}
}
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ if (!g_cublas_loaded) return false;
+
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t src1_as = 0;
- half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream);
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0;
- half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream);
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
size_t ptrs_src_s = 0;
size_t ptrs_dst_s = 0;
- ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream);
- ptrs_dst = ( void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream);
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
dst->nb[2], dst->nb[3],
r2, r3);
CUDA_CHECK(cudaGetLastError());
+
CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (ptrs_src_s != 0) {
- ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream);
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
}
if (ptrs_dst_s != 0) {
- ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream);
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
}
}
#endif
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
- if (src1_as != 0) {
- ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream);
- }
- if (dst_as != 0) {
- ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
- }
+
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
+ ggml_cuda_pool_free(dst_f16, dst_as);
}
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device =
- (src0->backend == GGML_BACKEND_GPU) &&
+ (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
(src1->backend == GGML_BACKEND_GPU) &&
( dst->backend == GGML_BACKEND_GPU);
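+ // split tensors are distributed row-wise across devices, so they cannot take the single-GPU fast paths below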
+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
int64_t min_compute_capability = INT_MAX;
for (int64_t id = 0; id < g_device_count; ++id) {
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
- if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) {
}
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+ if (!g_cublas_loaded) return false;
+
ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
case GGML_UNARY_OP_SILU:
func = ggml_cuda_silu;
break;
+ case GGML_UNARY_OP_RELU:
+ func = ggml_cuda_relu;
+ break;
default:
return false;
} break;
case GGML_OP_SCALE:
func = ggml_cuda_scale;
break;
+ case GGML_OP_SQR:
+ func = ggml_cuda_sqr;
+ break;
case GGML_OP_CLAMP:
if (!any_on_device) {
return false;
int n_dims,
int mode,
int n_ctx,
+ int n_orig_ctx,
float freq_base,
float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
float xpos_base,
bool xpos_down) {
GGML_ASSERT(ggml_is_vector(b));
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
- int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
- memcpy(params + 4, &freq_base, sizeof(float));
- memcpy(params + 5, &freq_scale, sizeof(float));
- memcpy(params + 6, &xpos_base, sizeof(float));
- memcpy(params + 7, &xpos_down, sizeof(bool));
+ int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+ memcpy(params + 5, &freq_base, sizeof(float));
+ memcpy(params + 6, &freq_scale, sizeof(float));
+ memcpy(params + 7, &ext_factor, sizeof(float));
+ memcpy(params + 8, &attn_factor, sizeof(float));
+ memcpy(params + 9, &beta_fast, sizeof(float));
+ memcpy(params + 10, &beta_slow, sizeof(float));
+ memcpy(params + 11, &xpos_base, sizeof(float));
+ memcpy(params + 12, &xpos_down, sizeof(bool));
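+ // op_params layout: [0..4] = n_past, n_dims, mode, n_ctx, n_orig_ctx;
+ // [5..12] = freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down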
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE_BACK;
}
#endif
-
static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
+ struct ggml_tensor * dst,
+ const bool forward) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
+ // backward process uses inverse rotation by cos and sin.
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
+ // this essentially just switches the sign of sin.
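+ // in 2D: R(theta) = [[cos, -sin], [sin, cos]] and R(theta)^-1 = R(theta)^T = R(-theta)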
+ const float sin_sign = forward ? 1.0f : -1.0f;
+
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base);
+ const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta);
- const float sin_block_theta = sinf(block_theta);
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale;
block_theta *= theta_scale;
rope_yarn(
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
);
+ sin_theta *= sin_sign;
// zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta
);
+ sin_theta *= sin_sign;
theta_base *= theta_scale;
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
+ struct ggml_tensor * dst,
+ const bool forward) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
+ // backward process uses inverse rotation by cos and sin.
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
+ // this essentially just switches the sign of sin.
+ const float sin_sign = forward ? 1.0f : -1.0f;
+
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base);
+ const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta);
- const float sin_block_theta = sinf(block_theta);
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale;
block_theta *= theta_scale;
rope_yarn(
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
);
+ sin_theta *= sin_sign;
theta_base *= theta_scale;
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta
);
+ sin_theta *= sin_sign;
theta_base *= theta_scale;
switch (src0->type) {
case GGML_TYPE_F16:
{
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
} break;
case GGML_TYPE_F32:
{
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
} break;
default:
{
// ggml_compute_forward_rope_back
-static void ggml_compute_forward_rope_back_f32(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
-
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- // y = rope(x, src1)
- // dx = rope_back(dy, src1)
- // src0 is dy, src1 contains options
-
- float freq_base;
- float freq_scale;
-
- // these two only relevant for xPos RoPE:
- float xpos_base;
- bool xpos_down;
-
- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_dims = ((int32_t *) dst->op_params)[1];
- const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
- memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
- memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
-
- GGML_TENSOR_UNARY_OP_LOCALS
-
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
- assert(nb0 == sizeof(float));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(dst);
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- // row index used to determine which thread to use
- int ir = 0;
-
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
- const bool is_neox = mode & 2;
-
- const int32_t * pos = (const int32_t *) src1->data;
-
- for (int64_t i3 = 0; i3 < ne3; i3++) {
- for (int64_t i2 = 0; i2 < ne2; i2++) {
- const int64_t p = pos[i2];
- for (int64_t i1 = 0; i1 < ne1; i1++) {
- if (ir++ < ir0) continue;
- if (ir > ir1) break;
-
- float theta_base = freq_scale * (float)p;
-
- if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base);
-
- // zeta scaling for xPos only:
- float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
- if (xpos_down) zeta = 1.0f / zeta;
-
- theta_base *= theta_scale;
-
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float dy0 = dy[0];
- const float dy1 = dy[1];
-
- dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
- dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
- }
- } else {
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base);
-
- theta_base *= theta_scale;
-
- const int64_t i0 = ib*n_dims + ic/2;
-
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float dy0 = dy[0];
- const float dy1 = dy[n_dims/2];
-
- dx[0] = dy0*cos_theta + dy1*sin_theta;
- dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
- }
- }
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_rope_back_f16(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
-
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- // y = rope(x, src1)
- // dx = rope_back(dy, src1)
- // src0 is dy, src1 contains options
-
- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_dims = ((int32_t *) dst->op_params)[1];
- const int mode = ((int32_t *) dst->op_params)[2];
-
- GGML_TENSOR_UNARY_OP_LOCALS
-
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
- assert(nb0 == sizeof(ggml_fp16_t));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(dst);
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- // row index used to determine which thread to use
- int ir = 0;
-
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
- const bool is_neox = mode & 2;
-
- const int32_t * pos = (const int32_t *) src1->data;
-
- for (int64_t i3 = 0; i3 < ne3; i3++) {
- for (int64_t i2 = 0; i2 < ne2; i2++) {
- const int64_t p = pos[i2];
- for (int64_t i1 = 0; i1 < ne1; i1++) {
- if (ir++ < ir0) continue;
- if (ir > ir1) break;
-
- float theta_base = (float)p;
-
- if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base);
-
- theta_base *= theta_scale;
-
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
- const float dy1 = GGML_FP16_TO_FP32(dy[1]);
-
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
- dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
- }
- } else {
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base);
-
- theta_base *= theta_scale;
-
- const int64_t i0 = ib*n_dims + ic/2;
-
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
- const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
-
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
- dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
- }
- }
- }
- }
- }
- }
-}
-
static void ggml_compute_forward_rope_back(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
switch (src0->type) {
case GGML_TYPE_F16:
{
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
} break;
case GGML_TYPE_F32:
{
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
} break;
default:
{
// necessary for llama
if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0];
- const int n_dims = ((int32_t *) tensor->op_params)[1];
- const int mode = ((int32_t *) tensor->op_params)[2];
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
- float freq_base;
- float freq_scale;
- float xpos_base;
- bool xpos_down;
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
+ const int mode = ((int32_t *) tensor->op_params)[2];
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base;
+ bool xpos_down;
+
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx,
src0->grad,
n_dims,
mode,
n_ctx,
+ n_orig_ctx,
freq_base,
freq_scale,
+ ext_factor,
+ attn_factor,
+ beta_fast,
+ beta_slow,
xpos_base,
xpos_down),
zero_table);
{
if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0];
- const int n_dims = ((int32_t *) tensor->op_params)[1];
- const int mode = ((int32_t *) tensor->op_params)[2];
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
- float freq_base;
- float freq_scale;
- float xpos_base;
- bool xpos_down;
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
+ const int mode = ((int32_t *) tensor->op_params)[2];
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base;
+ bool xpos_down;
+
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx,
src0->grad,
src1,
n_dims,
mode,
- 0,
n_ctx,
+ n_orig_ctx,
freq_base,
freq_scale,
- 0.0f,
- 1.0f,
- 0.0f,
- 0.0f,
+ ext_factor,
+ attn_factor,
+ beta_fast,
+ beta_slow,
xpos_base,
xpos_down,
false),