From: Georgi Gerganov
Date: Tue, 1 Oct 2024 15:08:31 +0000 (+0300)
Subject: metal : add perf-metal tool + fix build
X-Git-Tag: upstream/0.0.1642~321
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=6b30c17879e0a41375061958d70d443c8d34b381;p=pkg%2Fggml%2Fsources%2Fggml

metal : add perf-metal tool + fix build
---
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 582609a4..b273a1ad 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -25,3 +25,7 @@ add_subdirectory(sam)
 add_subdirectory(yolo)
 add_subdirectory(simple)
 add_subdirectory(magika)
+
+if (GGML_METAL)
+    add_subdirectory(perf-metal)
+endif()
diff --git a/examples/gpt-2/main-backend.cpp b/examples/gpt-2/main-backend.cpp
index db8e7f20..e35d5c67 100644
--- a/examples/gpt-2/main-backend.cpp
+++ b/examples/gpt-2/main-backend.cpp
@@ -758,12 +758,6 @@ bool gpt2_eval(
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
     // run the computation
     ggml_backend_graph_compute(model.backend, gf);
 
diff --git a/examples/gpt-2/main-batched.cpp b/examples/gpt-2/main-batched.cpp
index d504093e..f2646e88 100644
--- a/examples/gpt-2/main-batched.cpp
+++ b/examples/gpt-2/main-batched.cpp
@@ -942,11 +942,6 @@ int gpt2_decode(
     if (ggml_backend_is_cpu(model.backend)) {
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
     ggml_backend_graph_compute(model.backend, gf);
 
     //if (n_past%100 == 0) {
diff --git a/examples/gpt-2/main-sched.cpp b/examples/gpt-2/main-sched.cpp
index ccbceca7..e8864aed 100644
--- a/examples/gpt-2/main-sched.cpp
+++ b/examples/gpt-2/main-sched.cpp
@@ -126,8 +126,6 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
         gpu_backend = ggml_backend_metal_init();
         if (!gpu_backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        } else {
-            ggml_backend_metal_set_n_cb(gpu_backend, params.n_threads);
         }
     }
 #endif
diff --git a/examples/perf-metal/CMakeLists.txt b/examples/perf-metal/CMakeLists.txt
new file mode 100644
index 00000000..a9705517
--- /dev/null
+++ b/examples/perf-metal/CMakeLists.txt
@@ -0,0 +1,7 @@
+#
+# perf-metal
+
+set(TEST_TARGET perf-metal)
+add_executable(${TEST_TARGET} perf-metal.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+
diff --git a/examples/perf-metal/perf-metal.cpp b/examples/perf-metal/perf-metal.cpp
new file mode 100644
index 00000000..e004dc0e
--- /dev/null
+++ b/examples/perf-metal/perf-metal.cpp
@@ -0,0 +1,152 @@
+// basic tool to experiment with the Metal backend
+//
+// 1. Get GPU trace of a dummy graph:
+//
+//    rm -rf /tmp/perf-metal.gputrace
+//    make -j perf-metal && METAL_CAPTURE_ENABLED=1 ./bin/perf-metal
+//    open /tmp/perf-metal.gputrace
+//
+// https://github.com/ggerganov/llama.cpp/issues/9507
+//
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-metal.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    int n_op   = 1024;
+    int n_iter = 128;
+
+    if (argc > 1) {
+        n_op = std::atoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        n_iter = std::atoi(argv[2]);
+    }
+
+    printf("%s: n_op = %d, n_iter = %d\n", __func__, n_op, n_iter);
+
+    const int ne00 = 8;
+    const int ne01 = 8;
+    const int ne11 = 8;
+
+    std::vector<float> data0(ne00*ne01, 1.0f);
+    std::vector<float> data1(ne00*ne01, 1.0f/ne00);
+
+    ggml_backend_t backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        return 1;
+    }
+
+    const size_t ctx_size = 2 * ggml_tensor_overhead();
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * t0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne00, ne01);
+    struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne00, ne11);
+
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+    ggml_backend_tensor_set(t0, data0.data(), 0, ggml_nbytes(t0));
+    ggml_backend_tensor_set(t1, data1.data(), 0, ggml_nbytes(t1));
+
+    struct ggml_cgraph * gf = NULL;
+
+    struct ggml_context * ctx_cgraph = NULL;
+
+    // create a dummy compute graph:
+    //
+    //   x = mul_mat(t0, t1)
+    //   x = x * 1.0f
+    //   x = mul_mat(x, t1)
+    //   x = x * 1.0f
+    //   ... repeat n_op times ...
+    //
+    {
+        struct ggml_init_params params0 = {
+            /*.mem_size   =*/ 4*n_op*ggml_tensor_overhead() + ggml_graph_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_cgraph = ggml_init(params0);
+
+        gf = ggml_new_graph_custom(ctx_cgraph, 4*n_op, false);
+
+        struct ggml_tensor * cur = ggml_mul_mat(ctx_cgraph, t0, t1);
+        cur = ggml_scale(ctx_cgraph, cur, 1.0f);
+
+        for (int i = 0; i < n_op - 1; i++) {
+            cur = ggml_mul_mat(ctx_cgraph, cur, t1);
+            cur = ggml_scale(ctx_cgraph, cur, 1.0f);
+        }
+
+        cur = ggml_scale(ctx_cgraph, cur, 42.0f);
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    printf("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
+
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    {
+        // warm-up
+        ggml_backend_graph_compute(backend, gf);
+
+        const int64_t t_start = ggml_time_us();
+
+        for (int iter = 0; iter < n_iter; iter++) {
+            ggml_backend_graph_compute(backend, gf);
+        }
+
+        const int64_t t_end = ggml_time_us();
+
+        // actual trace
+        ggml_backend_metal_capture_next_compute(backend);
+        ggml_backend_graph_compute(backend, gf);
+        //std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // NOTE: these intervals do not appear in the XCode trace!
+        ggml_backend_metal_capture_next_compute(backend);
+        ggml_backend_graph_compute(backend, gf);
+        //std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // NOTE: these intervals do not appear in the XCode trace!
+        ggml_backend_metal_capture_next_compute(backend);
+        ggml_backend_graph_compute(backend, gf);
+
+        printf("%s: time = %f ms\n", __func__, (t_end - t_start) / 1000.0 / n_iter);
+    }
+
+    {
+        struct ggml_tensor * res = ggml_graph_node(gf, -1);
+
+        std::vector<float> data(res->ne[0] * res->ne[1], 0.0f);
+
+        ggml_backend_tensor_get(res, data.data(), 0, ggml_nbytes(res));
+
+        for (int i1 = 0; i1 < res->ne[1]; i1++) {
+            for (int i0 = 0; i0 < res->ne[0]; i0++) {
+                printf("%f ", data[i1*res->ne[0] + i0]);
+            }
+            printf("\n");
+        }
+    }
+
+    ggml_free(ctx_cgraph);
+    ggml_gallocr_free(allocr);
+    ggml_free(ctx);
+    ggml_backend_buffer_free(buffer);
+    ggml_backend_free(backend);
+
+    return 0;
+}
diff --git a/examples/simple/simple-backend.cpp b/examples/simple/simple-backend.cpp
index 203028c8..89594d02 100644
--- a/examples/simple/simple-backend.cpp
+++ b/examples/simple/simple-backend.cpp
@@ -131,12 +131,6 @@ struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr)
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(model.backend, gf);
 
     // in this case, the output tensor is the last one in the graph
diff --git a/src/ggml-metal.m b/src/ggml-metal.m
index 9da08fe2..c1e3a66d 100644
--- a/src/ggml-metal.m
+++ b/src/ggml-metal.m
@@ -3042,7 +3042,6 @@ static enum ggml_status ggml_metal_graph_compute(
         NSError * error = nil;
         if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
             GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
-            GGML_ABORT("capture failed");
         } else {
             [ctx->capture_scope beginScope];
             ctx->capture_started = true;
diff --git a/tests/test-conv-transpose-1d.cpp b/tests/test-conv-transpose-1d.cpp
index 36643ab9..f8236818 100644
--- a/tests/test-conv-transpose-1d.cpp
+++ b/tests/test-conv-transpose-1d.cpp
@@ -377,12 +377,6 @@ struct ggml_cgraph* compute_graph(const test_model & model, ggml_gallocr_t alloc
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(model.backend, gf);
 
     //ggml_graph_print(gf);
diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp
index e98a4ca9..95a9368f 100644
--- a/tests/test-conv1d.cpp
+++ b/tests/test-conv1d.cpp
@@ -179,12 +179,6 @@ struct ggml_cgraph* compute_graph(const test_model & model, ggml_gallocr_t alloc
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(model.backend, gf);
 
     //ggml_graph_print(gf);
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index 37127739..788c3352 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -182,12 +182,6 @@ struct ggml_cgraph * compute_graph(const test_model & model, ggml_gallocr_t allo
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(model.backend, gf);
 
     //ggml_graph_print(gf);
diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp
index 1df9d144..2b23a6ae 100644
--- a/tests/test-mul-mat.cpp
+++ b/tests/test-mul-mat.cpp
@@ -150,11 +150,6 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
     ggml_backend_graph_compute(model.backend, gf);