From: Georgi Gerganov Date: Mon, 10 Jul 2023 18:40:05 +0000 (+0300) Subject: ggml : sync llama.cpp (changes to ggml_graph_compute() API) (#368) X-Git-Tag: upstream/0.0.1642~1343 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=e41924818511e3e95ce1f94dbdccd206b4bd210c;p=pkg%2Fggml%2Fsources%2Fggml ggml : sync llama.cpp (changes to ggml_graph_compute() API) (#368) --- diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp index 4d497b01..ce634213 100644 --- a/examples/dolly-v2/main.cpp +++ b/examples/dolly-v2/main.cpp @@ -496,7 +496,6 @@ bool dollyv2_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = { }; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -659,7 +658,7 @@ bool dollyv2_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index c6f2c7b1..7e12eab5 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -429,7 +429,6 @@ bool gpt2_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -674,7 +673,7 @@ bool gpt2_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index ad99c80b..b42764ce 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -425,7 +425,6 @@ bool gptj_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -584,7 +583,7 @@ bool gptj_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp index 065af4da..4af771b8 100644 --- a/examples/gpt-neox/main.cpp +++ b/examples/gpt-neox/main.cpp @@ -476,7 +476,6 @@ bool gpt_neox_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -646,7 +645,7 @@ bool gpt_neox_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index b3cde658..2000c9aa 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -41,11 +41,10 @@ int mnist_eval( struct ggml_context * ctx_eval = NULL; struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - gfi.n_threads = n_threads; // allocate work context // needed during ggml_graph_compute() to allocate a work tensor - static size_t buf_size = 
gfi.work_size; // TODO + static size_t buf_size = 128ull*1024*1024; // TODO static void * buf = malloc(buf_size); struct ggml_init_params params = { @@ -59,7 +58,7 @@ int mnist_eval( struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input"); memcpy(input->data, digit.data(), ggml_nbytes(input)); - ggml_graph_compute(ctx_work, &gfi); + ggml_graph_compute_with_ctx(ctx_work, &gfi, n_threads); const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs")); diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 1f9475d8..a8d47ac9 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -36,16 +36,15 @@ int mnist_eval( struct ggml_context * ctx_eval = NULL; struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - gf.n_threads = 1; // allocate work context - static size_t buf_size = gf.work_size; // TODO + static size_t buf_size = 128ull*1024*1024; // TODO static void * buf = malloc(buf_size); struct ggml_init_params params = { - .mem_size = buf_size, - .mem_buffer = buf, - .no_alloc = false, + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, }; struct ggml_context * ctx_work = ggml_init(params); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index fac641e0..5ff4ac20 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -186,7 +186,6 @@ int mnist_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hparams.n_input); memcpy(input->data, digit.data(), ggml_nbytes(input)); @@ -202,7 +201,7 @@ int mnist_eval( // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //ggml_graph_print (&gf); ggml_graph_dump_dot(&gf, NULL, "mnist.dot"); diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp index 063d8566..457dc3d5 100644 --- a/examples/mpt/main.cpp +++ b/examples/mpt/main.cpp @@ -499,7 +499,6 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); @@ -651,7 +650,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp index 4b96ddfb..1ed265bf 100644 --- a/examples/replit/main.cpp +++ b/examples/replit/main.cpp @@ -475,7 +475,6 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); @@ -614,7 +613,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); diff 
--git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp index 8a694aa8..c5007304 100644 --- a/examples/starcoder/main.cpp +++ b/examples/starcoder/main.cpp @@ -463,7 +463,6 @@ bool starcoder_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -716,7 +715,7 @@ bool starcoder_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); @@ -791,7 +790,7 @@ int main(int argc, char ** argv) { printf("%s: top_p = %.3f\n", __func__, params.top_p); printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - + int n_past = 0; int64_t t_sample_us = 0; @@ -801,7 +800,7 @@ int main(int argc, char ** argv) { std::vector last_n_tokens(model.hparams.n_ctx); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - + // tokenize the prompt std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); @@ -868,14 +867,14 @@ int main(int argc, char ** argv) { embd.push_back(id); last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); + last_n_tokens.push_back(id); } else { // if here, it means we are still processing the input prompt for (int k = i; k < embd_inp.size(); k++) { embd.push_back(embd_inp[k]); last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[k]); + last_n_tokens.push_back(embd_inp[k]); if (embd.size() >= params.n_batch) { break; diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index 38187457..963e955d 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -1782,10 +1782,9 @@ static bool whisper_encode_internal( // run the computation { struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; ggml_build_forward_expand(&gf, cur); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //ggml_graph_print(&gf); } @@ -1828,7 +1827,6 @@ static bool whisper_encode_internal( // pre-compute cross-attention memory { struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; // TODO: hack to disconnect the encoded features from the previous graph cur->op = GGML_OP_NONE; @@ -1871,7 +1869,7 @@ static bool whisper_encode_internal( ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v)); } - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //ggml_graph_print(&gf); } @@ -1942,7 +1940,6 @@ static bool whisper_decode_internal( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, tokens, N*ggml_element_size(embd)); @@ -2286,7 +2283,7 @@ static bool whisper_decode_internal( // run the computation { ggml_build_forward_expand(&gf, logits); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); } // extract logits for all N tokens @@ -5098,17 +5095,15 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { struct ggml_cgraph gf = ggml_build_forward(c); - gf.n_threads = n_threads; - double tsum = 0.0; // heat-up - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); for (int i = 0; i < n_max; ++i) { const int64_t 
t0 = ggml_time_us(); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const int64_t t1 = ggml_time_us(); diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index d0710c55..ab84bef6 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -65,7 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -418,9 +418,6 @@ extern "C" { struct ggml_tensor * src1; struct ggml_tensor * opt[GGML_MAX_OPT]; - // thread scheduling - int n_tasks; - // performance int perf_runs; int64_t perf_cycles; @@ -432,19 +429,27 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -1290,15 +1295,22 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 7965ff74..fd36f179 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -59,8 +59,8 @@ typedef float2 dfloat2; #endif //GGML_CUDA_DMMV_F16 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v); +typedef void (*to_fp32_cuda_t)(const void * __restrict__ 
x, float * __restrict__ y, int k, cudaStream_t stream); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_cuda_op_t)( @@ -131,7 +131,7 @@ typedef struct { } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); -typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs); +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs); //================================= k-quants @@ -208,6 +208,7 @@ typedef struct { static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); #define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses #define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 @@ -407,7 +408,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in //================================== k-quants -static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q2_K * x = (const block_q2_K *) vx; @@ -440,7 +441,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { } -static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; @@ -504,7 +505,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } #endif -static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -544,7 +545,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -590,7 +591,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -634,7 +635,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -742,7 +743,7 @@ static __global__ void 
dequantize_mul_mat_vec_q2_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -846,7 +847,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -949,7 +950,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { const int row = blockIdx.x; const int num_blocks_per_row = ncols / QK_K; @@ -1053,7 +1054,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -1171,7 +1172,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __global__ void quantize_q8_1(const float * x, void * vy, const int k) { +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -1180,10 +1181,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) { block_q8_1 * y = (block_q8_1 *) vy; - const int ib = i / QK8_0; // block index - const int iqs = i % QK8_0; // quant index + const int ib = i / QK8_1; // block index + const int iqs = i % QK8_1; // quant index - const float xi = x[i]; + const float xi = i < ndata ? 
x[i] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -1207,7 +1208,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) { } template -static __global__ void dequantize_block(const void * vx, float * y, const int k) { +static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; if (i >= k) { @@ -1227,7 +1228,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) y[iybs + iqs + y_offset] = v.y; } -static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; @@ -1252,7 +1253,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; @@ -1277,7 +1278,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; @@ -1312,7 +1313,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; @@ -1346,7 +1347,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; @@ -1366,7 +1367,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons } template -static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) { +static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row >= nrows) 
{ @@ -1404,7 +1405,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d } template -static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) { +static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { // qk = quantized weights per x block // qr = number of quantized weights per data value in x block const int row = blockIdx.y*blockDim.y + threadIdx.y; @@ -1471,7 +1472,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, } } -static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { +static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) { const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; @@ -1518,7 +1519,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl } static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous - const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int row_stride_x, const int channel_stride_x) { const half * x = (const half *) vx; @@ -1714,9 +1715,9 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con rms_norm_f32<<>>(x, dst, ncols); } -static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) { +static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - quantize_q8_1<<>>(x, vy, k); + quantize_q8_1<<>>(x, vy, ndata, k); } static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -2355,16 +2356,15 @@ inline void ggml_cuda_op_mul_mat_vec( src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0; - // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6. - // However, they have bad performance with Pascal cards. - // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q. 
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented; + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented; #endif if (use_mul_mat_vec_q) { + int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1; + padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; size_t as; - void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main); + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main); switch (src0->type) { case GGML_TYPE_Q4_0: @@ -3108,7 +3108,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_backend backend = tensor->backend; struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); @@ -3137,11 +3141,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int64_t nrows_split = row_high - row_low; const size_t offset_split = row_low*nb1; - const size_t size = ggml_nbytes_split(tensor, nrows_split); + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } - void * buf; + char * buf; CUDA_CHECK(cudaMalloc(&buf, size)); - void * buf_host = (char*)data + offset_split; + char * buf_host = (char*)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); diff --git a/src/ggml-metal.h b/src/ggml-metal.h index b9e50ac7..928f1705 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -34,9 +34,13 @@ extern "C" { struct ggml_metal_context; -struct ggml_metal_context * ggml_metal_init(void); +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels diff --git a/src/ggml-metal.m b/src/ggml-metal.m index fd69c41f..6473644c 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -25,6 +25,8 @@ struct ggml_metal_buffer { }; struct ggml_metal_context { + int n_cb; + float * logits; id device; @@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal"; @implementation GGMLMetalClass @end -struct ggml_metal_context * ggml_metal_init(void) { +struct ggml_metal_context * ggml_metal_init(int n_cb) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->n_cb = n_cb; ctx->device = 
MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { free(ctx); } +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = n_cb; +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -354,7 +361,7 @@ void ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_cb = gf->n_threads; + const int n_cb = ctx->n_cb; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; @@ -443,6 +450,7 @@ void ggml_metal_graph_compute( //} switch (dst->op) { + case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: diff --git a/src/ggml-opencl.cpp b/src/ggml-opencl.cpp index fa0bdbef..eb214a83 100644 --- a/src/ggml-opencl.cpp +++ b/src/ggml-opencl.cpp @@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const int in = tid - step*im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 const int is = 0; -#else + +\n#else\n + const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; -#endif + +\n#endif\n + const int ql_offset = 64*im + l0; const int qh_offset = 32*im + l0; const int s_offset = 8*im + is; @@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const float d = vload_half(0, &x[i].d); -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) @@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); tmp[16 * ix + tid] += sum; -#else +\n#else\n float sum = 0; for (int l = 0; l < 4; ++l) { sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) @@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); } tmp[16 * ix + tid] += sum; -#endif +\n#endif\n } diff --git a/src/ggml.c b/src/ggml.c index d257c3d6..c10877a7 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -247,7 +247,11 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else #include +#endif #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -4583,14 +4587,13 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? 
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -10718,8 +10721,6 @@ static void ggml_compute_forward_mul_mat( float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - assert(ne00 % 32 == 0); - for (int64_t ic = 0; ic < ne11; ++ic) { vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); } @@ -15772,9 +15773,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15945,12 +15943,13 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -15974,9 +15973,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; - const int n_threads = state->shared->n_threads; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; @@ -15989,15 +15992,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = node->n_tasks; + params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -16008,11 +16011,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); - params.nth = node->n_tasks; + params.nth = n_tasks; /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { @@ -16020,7 +16024,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) 
params.type = GGML_TASK_COMPUTE; @@ -16042,7 +16046,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // wait for other threads to finish const int last = node_n; do { - sched_yield(); + //sched_yield(); node_n = atomic_load(&state->shared->node_n); } while (node_n == last); } @@ -16052,16 +16056,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } @@ -16069,349 +16074,372 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + size_t work_size = 0; - // initialize tasks + work buffer - { - size_t work_size = 0; + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - node->n_tasks = n_threads; + struct ggml_tensor * node = cgraph->nodes[i]; - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; - } + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - node->n_tasks = n_threads; + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } + size_t cur = 0; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - node->n_tasks = n_threads; + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; - } + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + } - work_size = 
MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_TANH: - case GGML_OP_ELU: - case GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - - size_t cur = 0; - const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src0); + //const int nr1 = ggml_nrows(node->src1); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = 
GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } - } else -#endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; - } else { - cur = 0; + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } + } else +#endif + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - node->n_tasks = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - node->n_tasks = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CONV_1D: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src0->ne[3] == 1); + GGML_ASSERT(node->src1->ne[2] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src0->ne[0]; + + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } 
break; - case GGML_OP_CONV_2D: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N + const int64_t ne00 = node->src0->ne[0]; // W + const int64_t ne01 = node->src0->ne[1]; // H + const int64_t ne02 = node->src0->ne[2]; // C + const int64_t ne03 = node->src0->ne[3]; // N - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne10 = node->src1->ne[0]; // W + const int64_t ne11 = node->src1->ne[1]; // H + const int64_t ne12 = node->src1->ne[2]; // C - const int64_t nk = ne00*ne01; + const int64_t nk = ne00*ne01; - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); + UNUSED(ne02); + UNUSED(ne03); + UNUSED(nk); - size_t cur = 0; + size_t cur = 0; - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } + cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = 
sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - node->n_tasks = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; } - if (work_size > 0 && 
cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); + cplan.n_tasks[i] = n_tasks; + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } } } + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -16473,6 +16501,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); + + cplan.work_data = buf->data; + + ggml_graph_compute(cgraph, &cplan); +} + struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -16511,14 +16550,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } @@ -17254,9 +17292,6 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -17303,7 +17338,8 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17383,7 +17419,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17505,7 +17542,8 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - 
diff --git a/tests/test-blas0.c b/tests/test-blas0.c
index aecccf02..0977d3ef
--- a/tests/test-blas0.c
+++ b/tests/test-blas0.c
@@ -46,6 +46,8 @@ int main(int argc, const char ** argv) {
         return 1;
     }
 
+    const int n_threads = 1;
+
     int M = atoi(argv[1]);
     int N = atoi(argv[2]);
     int K = atoi(argv[3]);
@@ -131,14 +133,14 @@ int main(int argc, const char ** argv) {
         dst2 = ggml_mul_mat(ctx0, s0_f32, s1_f32);
 
         struct ggml_cgraph gf = ggml_build_forward(dst2);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
     }
 
     {
         dst3 = ggml_mul_mat(ctx0, s0_f16, s1_f32);
 
         struct ggml_cgraph gf = ggml_build_forward(dst3);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
     }
 
     bool ok_blas = true;
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index a3e25214..da4001ce
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -10,6 +10,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
 #define MAX_NARGS 3
 
 #undef MIN
@@ -49,7 +51,7 @@ float frand(void) {
 int irand(int n) {
     if (n == 0) return 0;
-    else return rand()%n;
+    return rand()%n;
 }
 
 void get_random_dims(int64_t * dims, int ndims) {
@@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int(
 float get_element(const struct ggml_tensor * t, int idx) {
     if (t->type == GGML_TYPE_F32) {
         return ((float *)t->data)[idx];
-    } else if (t->type == GGML_TYPE_I32) {
+    }
+
+    if (t->type == GGML_TYPE_I32) {
         return ((int32_t *)t->data)[idx];
-    } else {
-        assert(false);
-        return INFINITY;
     }
+
+    assert(false);
+    return INFINITY;
 }
 
 void set_element(struct ggml_tensor * t, int idx, float value) {
@@ -215,15 +219,14 @@ bool check_gradient(
     }
 
     struct ggml_cgraph gf = ggml_build_forward (f);
-    gf.n_threads = n_threads;
-
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
-    gb.n_threads = n_threads;
 
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
     ggml_graph_reset  (&gf);
     ggml_set_f32      (f->grad, 1.0f);
-    ggml_graph_compute(ctx0, &gb);
+
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
     // ggml_graph_dump_dot(&gb, &gf,  "test-grad0-backward.dot");
@@ -236,15 +239,16 @@ bool check_gradient(
         const float xm = x0 - eps;
         const float xp = x0 + eps;
         set_element(x[i], k, xp);
-        ggml_graph_compute(ctx0, &gf);
+
+        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
         const float f0 = ggml_get_f32_1d(f, 0);
 
         set_element(x[i], k, xm);
-        ggml_graph_compute(ctx0, &gf);
 
-        const float f1 = ggml_get_f32_1d(f, 0);
+        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
+        const float f1 = ggml_get_f32_1d(f, 0);
         const float g0 = (f0 - f1)/(2.0f*eps);
 
         set_element(x[i], k, x0);
@@ -252,12 +256,13 @@ bool check_gradient(
         // compute gradient using backward graph
        ggml_graph_reset  (&gf);
         ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(ctx0, &gb);
+
+        ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
         const float g1 = get_element(x[i]->grad, k);
 
         const float error_abs = fabsf(g0 - g1);
-        const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
+        const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
 
         if (error_abs > max_error_abs || error_rel > max_error_rel) {
             printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
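The remaining test updates below are the same mechanical migration seen in test-blas0.c and test-grad0.c: the thread count moves from a field on the graph to an argument of the compute call. For reference, the before/after pattern at a typical call site (variable names as used in these tests):

    // old API: thread count stored on the graph itself
    //   gf.n_threads = n_threads;
    //   ggml_graph_compute(ctx0, &gf);

    // new API: thread count passed at compute time; the helper allocates
    // the work buffer inside ctx0
    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);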
diff --git a/tests/test-mul-mat0.c b/tests/test-mul-mat0.c
index 185df396..1bd6e140
--- a/tests/test-mul-mat0.c
+++ b/tests/test-mul-mat0.c
@@ -95,14 +95,15 @@ bool check_gradient(
         float eps,
         float max_error_abs,
         float max_error_rel) {
+    const int n_threads = 1;
 
     struct ggml_cgraph gf = ggml_build_forward (f);
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
 
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
     ggml_graph_reset  (&gf);
     ggml_set_f32      (f->grad, 1.0f);
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
     ggml_graph_dump_dot(&gb, &gf,  "test-grad0-backward.dot");
@@ -114,12 +115,12 @@ bool check_gradient(
         const float x0 = get_element(x[i], k);
 
         set_element(x[i], k, x0 + eps);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
         const float f0 = ggml_get_f32_1d(f, 0);
 
         set_element(x[i], k, x0 - eps);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
         const float f1 = ggml_get_f32_1d(f, 0);
@@ -130,7 +131,7 @@ bool check_gradient(
         // compute gradient using backward graph
         ggml_graph_reset  (&gf);
         ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(ctx0, &gb);
+        ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
         const float g1 = get_element(x[i]->grad, k);
@@ -247,6 +248,9 @@ int main(int argc, const char ** argv) {
     if (argc > 1) {
         niter = atoi(argv[1]);
     }
+
+    int n_threads = 1;
+
     for (int iter = 0; iter < niter; ++iter) {
         printf("test-mul-mat0: iter:%d/%d\n", iter, niter);
         struct ggml_context * ctx0 = ggml_init(params);
@@ -283,7 +287,7 @@ int main(int argc, const char ** argv) {
             check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
         } else {
             struct ggml_cgraph gf = ggml_build_forward(m);
-            ggml_graph_compute(ctx0, &gf);
+            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
         }
 
         check_mat_mul(m, x[1], x[0]);
@@ -319,7 +323,7 @@ int main(int argc, const char ** argv) {
             check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
         } else {
             struct ggml_cgraph gf = ggml_build_forward(m);
-            ggml_graph_compute(ctx0, &gf);
+            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
         }
 
         check_mat_mul(m, x[1], x[0]);
diff --git a/tests/test-opt.c b/tests/test-opt.c
index d001615e..e928a7df
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -7,6 +7,7 @@
 
 #define MAX_NARGS 2
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
 
 //
 // logging
@@ -33,7 +34,7 @@
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
-float frand() {
+float frand(void) {
     return (float)rand()/(float)RAND_MAX;
 }
@@ -114,7 +115,7 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
     ((float *)t->data)[idx] = value;
 }
 
-int main(int argc, const char ** argv) {
+int main(void) {
     struct ggml_init_params params = {
         .mem_size   = 1024*1024*1024,
         .mem_buffer = NULL,
@@ -137,10 +138,11 @@ int main(int argc, const char ** argv) {
     struct ggml_tensor * d = ggml_sub(ctx, c, ab);
     struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d));
 
-
     struct ggml_cgraph ge = ggml_build_forward(e);
-    ggml_graph_reset  (&ge);
-    ggml_graph_compute(ctx, &ge);
+    ggml_graph_reset(&ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
     const float fe = ggml_get_f32_1d(e, 0);
     printf("%s: e = %.4f\n", __func__, fe);
@@ -148,8 +150,10 @@
 
     ggml_opt(ctx, opt_params, e);
 
-    ggml_graph_reset  (&ge);
-    ggml_graph_compute(ctx, &ge);
+    ggml_graph_reset(&ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
     const float fe_opt = ggml_get_f32_1d(e, 0);
     printf("%s: original  e = %.4f\n", __func__, fe);
     printf("%s: optimized e = %.4f\n", __func__, fe_opt);
diff --git a/tests/test1.c b/tests/test1.c
index 8c1a352e..c313bf8e
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -4,6 +4,8 @@
 #include <stdio.h>
 
 int main(int argc, const char ** argv) {
+    const int n_threads = 2;
+
     struct ggml_init_params params = {
         .mem_size   = 128*1024*1024,
         .mem_buffer = NULL,
@@ -35,7 +37,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(f->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("f     = %f\n", ggml_get_f32_1d(f, 0));
     printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
@@ -48,7 +50,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(f->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("f     = %f\n", ggml_get_f32_1d(f, 0));
     printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
@@ -82,7 +84,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0));
@@ -101,7 +103,7 @@ int main(int argc, const char ** argv) {
     ggml_set_f32(g1->grad, 1.0f);
     ggml_set_f32(g2->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gbb);
+    ggml_graph_compute_with_ctx(ctx0, &gbb, n_threads);
 
     printf("H * [1, 1] = [ %f %f ]\n",
             ggml_get_f32_1d(x1->grad, 0),
             ggml_get_f32_1d(x2->grad, 0));
@@ -132,7 +134,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0));
@@ -169,7 +171,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0));
@@ -192,7 +194,7 @@ int main(int argc, const char ** argv) {
     ggml_set_f32(g2->grad, 1.0f);
     ggml_set_f32(g3->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gbb);
+    ggml_graph_compute_with_ctx(ctx0, &gbb, n_threads);
 
     printf("H * [1, 1, 1] = [ %f %f %f ]\n",
             ggml_get_f32_1d(x1->grad, 0),
@@ -227,7 +229,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f %f %f\n",
@@ -280,7 +282,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f %f %f\n",
@@ -333,7 +335,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f %f %f\n",
@@ -380,7 +382,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f %f %f\n",
@@ -406,7 +408,7 @@ int main(int argc, const char ** argv) {
     ggml_graph_reset(&gf);
     ggml_set_f32(y->grad, 1.0f);
 
-    ggml_graph_compute(ctx0, &gb);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     printf("y      = %f\n", ggml_get_f32_1d(y, 0));
     printf("df/dx1 = %f %f %f\n",
diff --git a/tests/test1.zig b/tests/test1.zig
index 60fed53d..f331acbd
--- a/tests/test1.zig
+++ b/tests/test1.zig
@@ -4,6 +4,8 @@ const c = @cImport({
 });
 
 pub fn main() !void {
+    const n_threads = 2;
+
     const params = .{
         .mem_size   = 128*1024*1024,
         .mem_buffer = null,
@@ -36,7 +38,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(f.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("f     = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)});
     std.debug.print("df/dx = {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)});
@@ -49,7 +51,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(f.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("f     = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)});
     std.debug.print("df/dx = {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)});
@@ -83,7 +85,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)});
@@ -102,7 +104,7 @@ pub fn main() !void {
     _ = c.ggml_set_f32(g1.*.grad, 1.0);
     _ = c.ggml_set_f32(g2.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gbb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads);
 
     std.debug.print("H * [1, 1] = [ {d:.6} {d:.6} ]\n",
         .{c.ggml_get_f32_1d(x1.*.grad, 0), c.ggml_get_f32_1d(x2.*.grad, 0)});
@@ -133,7 +135,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)});
@@ -170,7 +172,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)});
@@ -193,7 +195,7 @@ pub fn main() !void {
     _ = c.ggml_set_f32(g2.*.grad, 1.0);
     _ = c.ggml_set_f32(g3.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gbb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads);
 
     std.debug.print("H * [1, 1, 1] = [ {d:.6} {d:.6} {d:.6}]\n",
         .{
@@ -230,7 +232,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
@@ -287,7 +289,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
@@ -344,7 +346,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
@@ -395,7 +397,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
@@ -425,7 +427,7 @@ pub fn main() !void {
     c.ggml_graph_reset(@constCast(&gf));
     _ = c.ggml_set_f32(y.*.grad, 1.0);
 
-    c.ggml_graph_compute(ctx0, @constCast(&gb));
+    c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
 
     std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
     std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",