(void) dst;
}
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
- FILE * fp = fopen(fname, "rb");
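+// takes a pointer to tensor data that is already resident in host memory (e.g. the
+// mmap'd model file) and uploads the per-device slice, instead of re-reading it from disk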
+void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
int nrows = ggml_nrows(tensor);
const size_t nb1 = tensor->nb[1];
ggml_backend backend = tensor->backend;
int64_t nrows_split = row_high - row_low;
- const size_t offset_split = offset + row_low*nb1;
+ const size_t offset_split = row_low*nb1;
const size_t size = ggml_nbytes_split(tensor, nrows_split);
void * buf;
CUDA_CHECK(cudaMalloc(&buf, size));
- void * buf_host = malloc(size);
-
-#ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
-#else
- int ret = fseek(fp, (long) offset_split, SEEK_SET);
-#endif
- GGML_ASSERT(ret == 0); // same
-
- size_t ret2 = fread(buf_host, size, 1, fp);
- if (ret2 != 1) {
- fprintf(stderr, "unexpectedly reached end of file");
- exit(1);
- }
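+ // the source of the copy is the matching slice of the host buffer;
+ // no staging buffer or file I/O is needed anymore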
+ void * buf_host = (char*)data + offset_split;
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
- cudaDeviceSynchronize();
- free(buf_host);
extra->data_device[id] = buf;
}
tensor->extra = extra;
- fclose(fp);
}
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
return 0;
}
-void ggml_cl_transform_tensor(ggml_tensor * tensor) {
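+// same idea as the CUDA path: `data` already points to the tensor contents in
+// host memory and is copied into a cl_mem buffer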
+void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
const int64_t ne0 = tensor->ne[0];
const int64_t ne1 = tensor->ne[1];
const int64_t ne2 = tensor->ne[2];
size_t q_size;
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
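+ // temporarily point tensor->data at the host buffer so the copy loop below can
+ // read from it; it is switched to the device buffer once the upload completes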
+ tensor->data = data;
// copy tensor to device
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
CL_CHECK(clFinish(queue));
tensor->data = dst;
- tensor->backend = GGML_BACKEND_GPU;
-}
-
-void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
- cl_int err;
- FILE * fp = fopen(fname, "rb");
-
- const size_t size = ggml_nbytes(tensor);
-
- cl_mem dst;
- CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
- void * buf_host = malloc(size);
-
-#ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
-#else
- int ret = fseek(fp, (long) offset, SEEK_SET);
-#endif
- GGML_ASSERT(ret == 0); // same
-
- size_t ret2 = fread(buf_host, size, 1, fp);
- if (ret2 != 1) {
- fprintf(stderr, "unexpectedly reached end of file");
- exit(1);
- }
-
- clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
-
- tensor->data = dst;
- free(buf_host);
- fclose(fp);
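+ // the loader now sets the backend before calling this, so only assert it here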
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor;
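+ // for GPU tensors, do not allocate data inside the ggml context: the weights are
+ // either read through mmap or uploaded to the device later by load_all_data()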
+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, true);
+ }
if (lt.ne.size() == 2) {
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
} else {
ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
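+ // restore the context's previous no_alloc state (it tracks use_mmap for this loader)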
+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, use_mmap);
+ }
tensor->backend = backend;
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t data_size = 0;
size_t prefetch_size = 0;
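+ // bytes locked with mlock so far; tracked separately from done_size, which now
+ // also advances for tensors that end up on the GPU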
+ size_t lock_size = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
if (use_mmap) {
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
- if (!lmlock) {
- // Don't call the callback since the actual loading will be lazy
- // and we can't measure it.
- progress_callback = NULL;
- }
if (lmlock) {
lmlock->init(mapping->addr);
}
size_t done_size = 0;
for (llama_load_tensor & lt : tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
- continue;
- }
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+ // allocate temp buffer if not using mmap
+ if (!use_mmap && lt.data == NULL) {
+ GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+ lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+ }
+
load_data_for(lt);
- lt.ggml_tensor->data = lt.data;
- done_size += lt.size;
- if (use_mmap && lmlock) {
- lmlock->grow_to(done_size);
+
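+ // CPU tensors keep pointing at the loaded (possibly mmap'd) data; GPU tensors are
+ // uploaded to the device and the temporary host buffer is freed afterwards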
+ switch(lt.ggml_tensor->backend) {
+ case GGML_BACKEND_CPU:
+ lt.ggml_tensor->data = lt.data;
+ if (use_mmap && lmlock) {
+ lock_size += lt.size;
+ lmlock->grow_to(lock_size);
+ }
+ break;
+#if defined(GGML_USE_CUBLAS)
+ case GGML_BACKEND_GPU:
+ case GGML_BACKEND_GPU_SPLIT:
+ ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+#elif defined(GGML_USE_CLBLAST)
+ case GGML_BACKEND_GPU:
+ ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+#endif
+ default:
+ continue;
}
+
+ done_size += lt.size;
}
}
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
}
}
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
#if defined(GGML_USE_CUBLAS)
{
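+ // only the tensor split ratios need to be set up here; the uploads themselves are
+ // now done by load_all_data() below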
ggml_cuda_set_tensor_split(tensor_split);
-
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- ggml_backend backend = lt.ggml_tensor->backend;
- if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
- }
-#elif defined(GGML_USE_CLBLAST)
- {
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
}
-#else
- (void) n_batch;
- (void) tensor_split;
#endif
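+// load_all_data() runs after the CUDA tensor split is configured, since it is now
+// the one that uploads the GPU tensors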
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
}