#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#endif
#define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
- return reinterpret_cast<const int&>(c);
+ return reinterpret_cast<const int &>(c);
#else
int8x4_t c;
int16_t tmp;
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
c[i] = tmp;
}
- return reinterpret_cast<int&>(c);
+ return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+[[noreturn]]
+static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
+ int id = -1; // in case cudaGetDevice fails
+ cudaGetDevice(&id);
+
+ fprintf(stderr, "CUDA error: %s\n", msg);
+ fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
+ fprintf(stderr, " %s\n", stmt);
+ // abort with GGML_ASSERT to get a stack trace
+ GGML_ASSERT(!"CUDA error");
+}
+
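+// generic error-checking macro: evaluates err once and reports through ggml_cuda_error() if it is not the expected success value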
+#define CUDA_CHECK_GEN(err, success, error_fn) \
+ do { \
+ auto err_ = (err); \
+ if (err_ != (success)) { \
+ ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
+ } \
+ } while (0)
+
+#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
+
#if CUDART_VERSION >= 12000
static const char * cublas_get_error_str(const cublasStatus_t err) {
return cublasGetStatusString(err);
}
#endif // CUDART_VERSION >= 12000
-[[noreturn]]
-static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
- fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
- fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
- GGML_ASSERT(!"CUDA error");
-}
-
-#define CUDA_CHECK(err) do { auto err_ = (err); if (err_ != cudaSuccess) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cudaGetErrorString(err_)); } while (0)
-#define CUBLAS_CHECK(err) do { auto err_ = (err); if (err_ != CUBLAS_STATUS_SUCCESS) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cublas_get_error_str(err_)); } while (0)
+#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#if !defined(GGML_USE_HIPBLAS)
static const char * cu_get_error_str(CUresult err) {
cuGetErrorString(err, &err_str);
return err_str;
}
-#define CU_CHECK(err) do { auto err_ = (err); if (err_ != CUDA_SUCCESS) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cu_get_error_str(err_)); } while (0)
+#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif
#if CUDART_VERSION >= 11100
typedef void (*ggml_cuda_op_mul_mat_t)(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
- const int64_t src1_padded_row_size, const cudaStream_t & stream);
+ const int64_t src1_padded_row_size, cudaStream_t stream);
typedef void (*ggml_cuda_op_flatten_t)(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
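+// e.g. for Q4_0: QK = 32 dequantized values per 16-byte block, so QR = 2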
// this is faster on Windows
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-inline cudaError_t ggml_cuda_set_device(const int device) {
+static void ggml_cuda_set_device(const int device) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (device == current_device) {
- return cudaSuccess;
+ return;
}
- return cudaSetDevice(device);
+ CUDA_CHECK(cudaSetDevice(device));
}
static int g_device_count = -1;
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
-
static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
return b;
+ GGML_UNUSED(a);
}
static __device__ __forceinline__ float op_add(const float a, const float b) {
dst[i] = x[i] / (1.0f + expf(-x[i]));
}
-static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
+static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
const float GELU_QUICK_COEF = -1.702f;
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
}
-static __global__ void tanh_f32(const float *x, float *dst, int k) {
+static __global__ void tanh_f32(const float * x, float * dst, int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
dst[i] = fmaxf(x[i], 0);
}
-static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
+static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
}
-static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
+static __global__ void concat_f32(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
}
-static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
+static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
int ne0 = ne00 * scale_factor;
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
dst[offset_dst] = x[offset_src];
}
-static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
const int row_y = col_x;
-
// y is not transposed but permuted
const int iy = channel*nrows_y + row_y;
cne[3] = 1;
};
- auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+ auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
cnb[1] *= cne[1];
cnb[2] *= cne[2];
cnb[3] *= cne[3];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
// #define DEBUG_CUDA_MALLOC
-struct cuda_buffer {
+struct ggml_cuda_buffer {
void * ptr = nullptr;
size_t size = 0;
};
-static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
+static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
-static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
- int id;
- CUDA_CHECK(cudaGetDevice(&id));
#ifdef DEBUG_CUDA_MALLOC
int nnz = 0;
size_t max_size = 0;
size_t best_diff = 1ull << 36;
int ibest = -1;
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
- cuda_buffer& b = g_cuda_buffer_pool[id][i];
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
if (b.ptr != nullptr) {
#ifdef DEBUG_CUDA_MALLOC
++nnz;
}
}
if (ibest >= 0) {
- cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
void * ptr;
size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
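+ // the pool is tracked per device, so make sure the allocation happens on the requested device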
+ ggml_cuda_set_device(device);
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
*actual_size = look_ahead_size;
- g_cuda_pool_size[id] += look_ahead_size;
+ g_cuda_pool_size[device] += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
- (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+ fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+ (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
return ptr;
}
-static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
- int id;
- CUDA_CHECK(cudaGetDevice(&id));
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
- cuda_buffer& b = g_cuda_buffer_pool[id][i];
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
}
}
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
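+ // pool is full: free the buffer directly, on the device that owns it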
+ ggml_cuda_set_device(device);
CUDA_CHECK(cudaFree(ptr));
- g_cuda_pool_size[id] -= size;
+ g_cuda_pool_size[device] -= size;
}
#if !defined(GGML_USE_HIPBLAS)
// pool with virtual memory
-static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
-static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
- int id;
- CUDA_CHECK(cudaGetDevice(&id));
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
const size_t alignment = 128;
size = alignment * ((size + alignment - 1) / alignment);
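+ // e.g. a request of 100 bytes is rounded up to 128, and 129 bytes to 256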
- size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
+ size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
if (size > avail) {
// round up to the next multiple of the granularity
size_t reserve_size = size - avail;
- const size_t granularity = g_device_caps[id].vmm_granularity;
+ const size_t granularity = g_device_caps[device].vmm_granularity;
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
- GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+ GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
// allocate more physical memory
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
- prop.location.id = id;
+ prop.location.id = device;
CUmemGenericAllocationHandle handle;
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
// reserve virtual address space (if not already reserved)
- if (g_cuda_pool_addr[id] == 0) {
- CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+ if (g_cuda_pool_addr[device] == 0) {
+ CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
}
// map at the end of the pool
- CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
+ CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
+
+ // the memory allocation handle is no longer needed after mapping
+ CU_CHECK(cuMemRelease(handle));
// set access
CUmemAccessDesc access = {};
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
- access.location.id = id;
+ access.location.id = device;
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
- CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
+ CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
// add to the pool
- g_cuda_pool_handles[id].push_back(handle);
- g_cuda_pool_size[id] += reserve_size;
+ g_cuda_pool_size[device] += reserve_size;
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
// id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
// (unsigned long long) (reserve_size/1024/1024));
}
- GGML_ASSERT(g_cuda_pool_addr[id] != 0);
+ GGML_ASSERT(g_cuda_pool_addr[device] != 0);
- void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
+ void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
*actual_size = size;
- g_cuda_pool_used[id] += size;
+ g_cuda_pool_used[device] += size;
#ifdef DEBUG_CUDA_MALLOC
- printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
+ printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
return ptr;
}
-static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
- int id;
- CUDA_CHECK(cudaGetDevice(&id));
#ifdef DEBUG_CUDA_MALLOC
- printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
+ printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
#endif
- g_cuda_pool_used[id] -= size;
+ g_cuda_pool_used[device] -= size;
// all deallocations must be in reverse order of the allocations
- GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+ GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
}
-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
- int id;
- CUDA_CHECK(cudaGetDevice(&id));
- if (g_device_caps[id].vmm) {
- return ggml_cuda_pool_malloc_vmm(size, actual_size);
+static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
+ if (g_device_caps[device].vmm) {
+ return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
} else {
- return ggml_cuda_pool_malloc_leg(size, actual_size);
+ return ggml_cuda_pool_malloc_leg(device, size, actual_size);
}
}
-static void ggml_cuda_pool_free(void * ptr, size_t size) {
- int id;
- CUDA_CHECK(cudaGetDevice(&id));
- if (g_device_caps[id].vmm) {
- ggml_cuda_pool_free_vmm(ptr, size);
+static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
+ if (g_device_caps[device].vmm) {
+ ggml_cuda_pool_free_vmm(device, ptr, size);
} else {
- ggml_cuda_pool_free_leg(ptr, size);
+ ggml_cuda_pool_free_leg(device, ptr, size);
}
}
#else
template<typename T>
struct cuda_pool_alloc {
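+ // remember which device the buffer was allocated on so the destructor returns it to the right per-device pool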
+ int device = -1;
T * ptr = nullptr;
size_t actual_size = 0;
// size is in number of elements
T * alloc(size_t size) {
GGML_ASSERT(ptr == nullptr);
- ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
+ CUDA_CHECK(cudaGetDevice(&device));
+ ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
return ptr;
}
~cuda_pool_alloc() {
if (ptr != nullptr) {
- ggml_cuda_pool_free(ptr, actual_size);
+ ggml_cuda_pool_free(device, ptr, actual_size);
}
}
alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
alloc_prop.location.id = id;
- CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+ CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
}
#endif // !defined(GGML_USE_HIPBLAS)
g_device_caps[id].vmm = !!device_vmm;
}
for (int id = 0; id < g_device_count; ++id) {
- CUDA_CHECK(ggml_cuda_set_device(id));
+ ggml_cuda_set_device(id);
// create cuda streams
for (int is = 0; is < MAX_STREAMS; ++is) {
static void ggml_cuda_op_get_rows(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+ const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
}
template<class op>
-inline void ggml_cuda_op_bin_bcast(
+static void ggml_cuda_op_bin_bcast(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
static void ggml_cuda_op_repeat(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
+ const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
(void) src1_d;
}
-inline void ggml_cuda_op_add(
+static void ggml_cuda_op_add(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}
-inline void ggml_cuda_op_acc(
+static void ggml_cuda_op_acc(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
(void) dst;
}
-inline void ggml_cuda_op_mul(
+static void ggml_cuda_op_mul(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}
-inline void ggml_cuda_op_div(
+static void ggml_cuda_op_div(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}
-inline void ggml_cuda_op_gelu(
+static void ggml_cuda_op_gelu(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_silu(
+static void ggml_cuda_op_silu(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_gelu_quick(
+static void ggml_cuda_op_gelu_quick(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_tanh(
+static void ggml_cuda_op_tanh(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_relu(
+static void ggml_cuda_op_relu(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_leaky_relu(
+static void ggml_cuda_op_leaky_relu(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_sqr(
+static void ggml_cuda_op_sqr(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_norm(
+static void ggml_cuda_op_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-
-inline void ggml_cuda_op_group_norm(
+static void ggml_cuda_op_group_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_concat(
+static void ggml_cuda_op_concat(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
(void) dst;
}
-inline void ggml_cuda_op_upscale(
+static void ggml_cuda_op_upscale(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_pad(
+static void ggml_cuda_op_pad(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_rms_norm(
+static void ggml_cuda_op_rms_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_mul_mat_q(
+static void ggml_cuda_op_mul_mat_q(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
const int64_t ne00 = src0->ne[0];
static int64_t get_row_rounding(ggml_type type) {
int64_t min_compute_capability = INT_MAX;
int64_t max_compute_capability = INT_MIN;
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
if (min_compute_capability > g_device_caps[id].cc) {
min_compute_capability = g_device_caps[id].cc;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
-inline void ggml_cuda_op_mul_mat_vec_q(
+static void ggml_cuda_op_mul_mat_vec_q(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
GGML_ASSERT(ggml_nrows(src1) == 1);
(void) src1_padded_row_size;
}
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+static void ggml_cuda_op_dequantize_mul_mat_vec(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low;
(void) src1_padded_row_size;
}
-inline void ggml_cuda_op_mul_mat_cublas(
+static void ggml_cuda_op_mul_mat_cublas(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
GGML_ASSERT(src0_dd_i != nullptr);
GGML_ASSERT(src1_ddf_i != nullptr);
(void) src1_padded_row_size;
}
-inline void ggml_cuda_op_rope(
+static void ggml_cuda_op_rope(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
(void) src1_dd;
}
-inline void ggml_cuda_op_alibi(
+static void ggml_cuda_op_alibi(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_im2col(
+static void ggml_cuda_op_im2col(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
(void) src0_dd;
}
-
-inline void ggml_cuda_op_sum_rows(
+static void ggml_cuda_op_sum_rows(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_argsort(
+static void ggml_cuda_op_argsort(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_I32);
(void) src1_dd;
}
-inline void ggml_cuda_op_diag_mask_inf(
+static void ggml_cuda_op_diag_mask_inf(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_soft_max(
+static void ggml_cuda_op_soft_max(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) dst;
}
-inline void ggml_cuda_op_scale(
+static void ggml_cuda_op_scale(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
(void) src1_dd;
}
-inline void ggml_cuda_op_clamp(
+static void ggml_cuda_op_clamp(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
#ifdef NDEBUG
for (int id = 0; id < g_device_count; ++id) {
- CUDA_CHECK(ggml_cuda_set_device(id));
+ ggml_cuda_set_device(id);
CUDA_CHECK(cudaDeviceSynchronize());
}
for (int id = 0; id < g_device_count; ++id) {
- CUDA_CHECK(ggml_cuda_set_device(id));
+ ggml_cuda_set_device(id);
for (int id_other = 0; id_other < g_device_count; ++id_other) {
if (id == id_other) {
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
- const int64_t nrows0 = ggml_nrows(src0);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12));
- // dd = data device
- char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
- char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
- float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
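+ // per-device working data; buffers owned by the cuda_pool_alloc members are returned to the pool automatically when this goes out of scope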
+ struct dev_data {
+ cuda_pool_alloc<char> src0_dd_alloc;
+ cuda_pool_alloc<float> src1_ddf_alloc;
+ cuda_pool_alloc<char> src1_ddq_alloc;
+ cuda_pool_alloc<float> dst_dd_alloc;
- // as = actual size
- size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
- size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
- size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
- size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
+ char * src0_dd = nullptr;
+ float * src1_ddf = nullptr; // float
+ char * src1_ddq = nullptr; // q8_1
+ float * dst_dd = nullptr;
- int64_t row_low[GGML_CUDA_MAX_DEVICES];
- int64_t row_high[GGML_CUDA_MAX_DEVICES];
+ int64_t row_low;
+ int64_t row_high;
+ };
+
+ dev_data dev[GGML_CUDA_MAX_DEVICES];
int used_devices = 0;
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
// by default, use all rows
- row_low[id] = 0;
- row_high[id] = ne01;
+ dev[id].row_low = 0;
+ dev[id].row_high = ne01;
// for multi GPU, get the row boundaries from tensor split
// and round to mul_mat_q tile sizes
const int64_t rounding = get_row_rounding(src0->type);
if (id != 0) {
- row_low[id] = ne01*g_tensor_split[id];
- if (row_low[id] < ne01) {
- row_low[id] -= row_low[id] % rounding;
+ dev[id].row_low = ne01*g_tensor_split[id];
+ if (dev[id].row_low < ne01) {
+ dev[id].row_low -= dev[id].row_low % rounding;
}
}
if (id != g_device_count - 1) {
- row_high[id] = ne01*g_tensor_split[id + 1];
- if (row_high[id] < ne01) {
- row_high[id] -= row_high[id] % rounding;
+ dev[id].row_high = ne01*g_tensor_split[id + 1];
+ if (dev[id].row_high < ne01) {
+ dev[id].row_high -= dev[id].row_high % rounding;
}
}
}
}
- for (int64_t id = 0; id < g_device_count; ++id) {
- if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+ for (int id = 0; id < g_device_count; ++id) {
+ if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
continue;
}
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
ggml_cuda_set_device(id);
- const cudaStream_t stream = g_cudaStreams[id][0];
+ cudaStream_t stream = g_cudaStreams[id][0];
if (src0_on_device && src0_is_contiguous) {
- src0_dd[id] = (char *) src0_extra->data_device[id];
+ dev[id].src0_dd = (char *) src0_extra->data_device[id];
} else {
- // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
- src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
+ dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
}
if (src1_on_device && src1_is_contiguous) {
- src1_ddf[id] = (float *) src1_extra->data_device[id];
+ dev[id].src1_ddf = (float *) src1_extra->data_device[id];
} else {
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
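+ // cuda_pool_alloc sizes are given in elements, not bytes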
+ dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
}
if (convert_src1_to_q8_1) {
- src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
+ dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
if (src1_on_device && src1_is_contiguous) {
- quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
+ quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
CUDA_CHECK(cudaGetLastError());
}
}
if (dst_on_device) {
- dst_dd[id] = (float *) dst_extra->data_device[id];
+ dev[id].dst_dd = (float *) dst_extra->data_device[id];
} else {
- const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
- dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
+ const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
+ dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
}
}
// if multiple devices are used they need to wait for the main device
// here an event is recorded that signals that the main device has finished calculating the input data
if (split && used_devices > 1) {
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
}
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
- for (int64_t id = 0; id < g_device_count; ++id) {
- if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+ for (int id = 0; id < g_device_count; ++id) {
+ if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
continue;
}
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
- const int64_t row_diff = row_high[id] - row_low[id];
+ const int64_t row_diff = dev[id].row_high - dev[id].row_low;
ggml_cuda_set_device(id);
- const cudaStream_t stream = g_cudaStreams[id][is];
+ cudaStream_t stream = g_cudaStreams[id][is];
// wait for main GPU data if necessary
if (split && (id != g_main_device || is != 0)) {
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
// for split tensors the data begins at i0 == i0_offset_low
- char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
- float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
- char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
- float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+ char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+ float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
+ char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
+ float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
// the main device memory buffer can be on VRAM scratch, with space for all partial results
// in that case an offset on dst_ddf_i is needed
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
- dst_dd_i += row_low[id]; // offset is 0 if no tensor split
+ dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
}
// copy src0, src1 to device if necessary
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
if (id != g_main_device) {
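+ // fetch the input from the main device with explicit peer copies; as with the 2D copy further down, a plain device-to-device copy between vmm pools of different devices may fail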
if (convert_src1_to_q8_1) {
- char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
- CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
- cudaMemcpyDeviceToDevice, stream));
+ char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
+ src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
} else {
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
- cudaMemcpyDeviceToDevice, stream));
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
+ src1_ncols*ne10*sizeof(float), stream));
}
}
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
- src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
}
}
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
}
// do the computation
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
- row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
+ dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
CUDA_CHECK(cudaGetLastError());
// copy dst to host or other device if necessary
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
- dhf_dst_i += src1_col_0*ne0 + row_low[id];
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
- row_diff*sizeof(float), src1_ncols, kind, stream));
+ dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
+#if !defined(GGML_USE_HIPBLAS)
+ if (kind == cudaMemcpyDeviceToDevice) {
+ // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+ cudaMemcpy3DPeerParms p = {};
+ p.dstDevice = g_main_device;
+ p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
+ p.srcDevice = id;
+ p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
+ p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
+ CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
+ } else
+#endif
+ {
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
+ dst_dd_i, row_diff*sizeof(float),
+ row_diff*sizeof(float), src1_ncols,
+ kind, stream));
+ }
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
}
}
- for (int64_t id = 0; id < g_device_count; ++id) {
- if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
- continue;
- }
- CUDA_CHECK(ggml_cuda_set_device(id));
-
- // free buffers again when done
- if (dst_as[id] > 0) {
- ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
- }
- if (src1_asq[id] > 0) {
- ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
- }
- if (src1_asf[id] > 0) {
- ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
- }
- if (src0_as[id] > 0) {
- ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
- }
- }
-
// main device waits for all other devices to be finished
if (split && g_device_count > 1) {
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
- for (int64_t id = 0; id < g_device_count; ++id) {
- if (row_low[id] == row_high[id]) {
+ ggml_cuda_set_device(g_main_device);
+ for (int id = 0; id < g_device_count; ++id) {
+ if (dev[id].row_low == dev[id].row_high) {
continue;
}
for (int64_t is = 0; is < is_max; ++is) {
}
if (dst->backend == GGML_BACKEND_CPU) {
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaDeviceSynchronize());
}
}
const int64_t ne12 = src1->ne[2];
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
const int64_t ne12 = src1->ne[2];
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst);
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
int64_t min_compute_capability = INT_MAX;
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
min_compute_capability = g_device_caps[id].cc;
}
const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst);
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
std::vector<char> ids_host(ggml_nbytes(ids));
- const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+ cudaStream_t stream = g_cudaStreams[g_main_device][0];
if (ids->backend == GGML_BACKEND_GPU) {
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2];
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
memset(extra, 0, sizeof(*extra));
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
continue;
}
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
+ ggml_cuda_set_device(id);
if (extra->data_device[id] != nullptr) {
- CUDA_CHECK(ggml_cuda_set_device(id));
CUDA_CHECK(cudaFree(extra->data_device[id]));
}
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
if (extra->events[id][is] != nullptr) {
- CUDA_CHECK(ggml_cuda_set_device(id));
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
}
}
force_inplace;
const size_t size = ggml_nbytes(tensor);
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
GGML_ASSERT(ggml_is_contiguous(tensor));
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
}