#define CC_OFFSET_AMD 1000000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+#define GGML_CUDA_MAX_NODES 8192
+
// define this if you want to always fall back to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that it requires some extra VRAM:
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
}
-void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
}
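// Round-robin pool of temporary per-tensor GPU extras: the backing array is allocated
// lazily on first use and entries are handed out cyclically. Sizing it with
// GGML_CUDA_MAX_NODES instead of GGML_DEFAULT_GRAPH_SIZE assumes at most one extra is
// needed per node of the largest graph evaluated, so entries are not recycled while
// still in use.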
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (g_temp_tensor_extras == nullptr) {
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
}
size_t alloc_index = g_temp_tensor_extra_index;
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra));
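// The same round-robin allocator again, this time operating on the non-global
// temp_tensor_extras / temp_tensor_extra_index pool (the CUDA backend context keeps
// its own copy); it gets the same GGML_CUDA_MAX_NODES bound.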
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (temp_tensor_extras == nullptr) {
- temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
}
size_t alloc_index = temp_tensor_extra_index;
- temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra));
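// Seed the running min/max and the weighted sums over the block; the loop below
// (optionally de-unrolled via 'volatile' as a linker workaround) accumulates them.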
float min = x[0];
float max = x[0];
float sum_w = weights[0];
float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+ // use 'volatile' to prevent loop unrolling and work around a bug in Apple ld64 1015.7
+ for (volatile int i = 1; i < n; ++i) {
+#else
for (int i = 1; i < n; ++i) {
+#endif
if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i];
float w = weights[i];
{
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
case GGUF_TYPE_STRING:
{
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
}
} break;
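// The 64-bit loop counters match the GGUF v2 on-disk format, where the header counts
// and string lengths are stored as uint64_t; iterating with uint32_t could silently
// truncate them for very large files. A rough sketch of the relevant layout (field
// names as used elsewhere in this file; magic/version fields elided):
//
//   struct gguf_str {
//       uint64_t n;          // string length, 64-bit since GGUF v2
//       char   * data;
//   };
//
//   struct gguf_header {
//       // ... magic and version ...
//       uint64_t n_tensors;  // 64-bit since GGUF v2
//       uint64_t n_kv;       // 64-bit since GGUF v2
//   };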
{
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
// compute the total size of the data section, taking into account the alignment
{
ctx->size = 0;
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
const int64_t ne =
ggml_set_no_alloc(ctx_data, true);
// create the tensors
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
const int64_t ne[GGML_MAX_DIMS] = {
ctx->infos[i].ne[0],
ctx->infos[i].ne[1],