From: Georgi Gerganov Date: Fri, 8 Sep 2023 14:57:04 +0000 (+0300) Subject: sync : whisper (POSIX) (#511) X-Git-Tag: upstream/0.0.1642~1246 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c06cb61f665a5fa0f3f5ae9299ee3c2525df376c;p=pkg%2Fggml%2Fsources%2Fggml sync : whisper (POSIX) (#511) * sync : whisper (POSIX) ggml-ci * sync : llama (HBM + Metal + style) ggml-ci --- diff --git a/CMakeLists.txt b/CMakeLists.txt index d31be316..bf1491bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,6 +91,57 @@ if (NOT MSVC) ) endif() +# +# POSIX conformance +# + +# clock_gettime came in POSIX.1b (1993) +# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional +# posix_memalign came in POSIX.1-2001 / SUSv3 +# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) +add_compile_definitions(_XOPEN_SOURCE=600) + +# Somehow in OpenBSD whenever POSIX conformance is specified +# some string functions rely on locale_t availability, +# which was introduced in POSIX.1-2008, forcing us to go higher +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + remove_definitions(-D_XOPEN_SOURCE=600) + add_compile_definitions(_XOPEN_SOURCE=700) +endif() + +# Data types, macros and functions related to controlling CPU affinity +# are available on Linux through GNU extensions in libc +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions(_GNU_SOURCE) +endif() + +# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, +# and on macOS its availability depends on enabling Darwin extensions +# similarly on DragonFly, enabling BSD extensions is necessary +if (CMAKE_SYSTEM_NAME MATCHES "Darwin") + add_compile_definitions(_DARWIN_C_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "DragonFly") + add_compile_definitions(_DARWIN_C_SOURCE) +endif() + +# alloca is a non-standard interface that is not visible on BSDs when +# POSIX conformance is specified, but not all of them provide a clean way +# to enable it in such cases +if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + add_compile_definitions(__BSD_VISIBLE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") + add_compile_definitions(_NETBSD_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_BSD_SOURCE) +endif() + +if (WHISPER_PERF) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF) +endif() + # dependencies set(CMAKE_C_STANDARD 11) diff --git a/examples/common.cpp b/examples/common.cpp index 2b8da8f7..11064b8d 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) { return true; } -void sam_print_usage(int argc, char ** argv, const sam_params & params) { +void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); diff --git a/examples/whisper/main.cpp b/examples/whisper/main.cpp index fa399c6d..60c1cca7 100644 --- a/examples/whisper/main.cpp +++ b/examples/whisper/main.cpp @@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector> pcmf32s return speaker; } -void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) { +void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) { int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step; int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev); if (progress >= *progress_prev + progress_step) { @@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_ return true; } -bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector> pcmf32s) { +bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector> /*pcmf32s*/) { std::ofstream fout(fname); fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index c1939a4b..a1f6e7bf 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -1,8 +1,3 @@ -// defines MAP_ANONYMOUS -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "ggml-alloc.h" #include "ggml.h" #include @@ -138,7 +133,7 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { #ifdef GGML_ALLOCATOR_DEBUG - GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources + GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated #endif size_t size = ggml_allocr_get_alloc_size(alloc, tensor); @@ -165,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) if (best_fit_block == -1) { // the last block is our last resort struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = MAX(max_avail, block->size); if (block->size >= size) { best_fit_block = alloc->n_free_blocks - 1; - max_avail = MAX(max_avail, block->size); } else { fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", __func__, size, max_avail); GGML_ASSERT(!"not enough space in the buffer"); - return; + return; } } struct free_block * block = &alloc->free_blocks[best_fit_block]; @@ -316,7 +311,11 @@ static void * alloc_vmem(size_t size) { #if defined(_WIN32) return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); #elif defined(_POSIX_MAPPED_FILES) - return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + return NULL; + } + return ptr; #else // use a fixed address for other platforms uintptr_t base_addr = (uintptr_t)-size - 0x100; diff --git a/src/ggml-metal.m b/src/ggml-metal.m index d0d23442..7e2355ce 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -327,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { void * ggml_metal_host_malloc(size_t n) { void * data = NULL; - const int result = posix_memalign((void **) &data, getpagesize(), n); + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { metal_printf("%s: error: posix_memalign failed\n", __func__); return NULL; @@ -401,7 +401,7 @@ bool ggml_metal_add_buffer( } } - const size_t size_page = getpagesize(); + const size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; if ((size_aligned % size_page) != 0) { @@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_DUP: case GGML_OP_CPY: diff --git a/src/ggml-metal.metal b/src/ggml-metal.metal index 119fcbeb..5070561f 100644 --- a/src/ggml-metal.metal +++ b/src/ggml-metal.metal @@ -220,14 +220,10 @@ kernel void kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - //// broadcast - //if (tpitg == 0) { - // sum[0] /= ne00; - //} - //threadgroup_barrier(mem_flags::mem_threadgroup); - const float mean = sum[0]; + const float mean = sum[0] / ne00; // recenter and VARIANCE + threadgroup_barrier(mem_flags::mem_threadgroup); device float * y = dst + tgpig*ne00; sum[tpitg] = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { @@ -235,12 +231,6 @@ kernel void kernel_norm( sum[tpitg] += y[i00] * y[i00]; } - //// VARIANCE - //// parallel sum - //sum[tpitg] = 0.0f; - //for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - // sum[tpitg] += y[i00] * y[i00]; - //} // reduce threadgroup_barrier(mem_flags::mem_threadgroup); for (uint i = ntg/2; i > 0; i /= 2) { @@ -249,12 +239,7 @@ kernel void kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - //// broadcast - //if (tpitg == 0) { - // sum[0] /= ne00; - //} - //threadgroup_barrier(mem_flags::mem_threadgroup); - const float variance = sum[0]; + const float variance = sum[0] / ne00; const float scale = 1.0f/sqrt(variance + eps); for (int i00 = tpitg; i00 < ne00; i00 += ntg) { @@ -262,7 +247,6 @@ kernel void kernel_norm( } } - kernel void kernel_rms_norm( device const void * src0, device float * dst, @@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32( } } } - } kernel void kernel_alibi_f32( @@ -699,25 +682,27 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, - uint3 tpig[[thread_position_in_grid]]) { - const int64_t i3 = tpig[2]; - const int64_t i2 = tpig[1]; - const int64_t i1 = tpig[0]; + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int64_t i3 = tgpig[2]; + const int64_t i2 = tgpig[1]; + const int64_t i1 = tgpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); - float theta = freq_scale * (float)p; + const float theta_0 = freq_scale * (float)p; + const float inv_ndims = -1.f/n_dims; if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*i0); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -729,12 +714,12 @@ kernel void kernel_rope( } } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; - const int64_t i0 = ib*n_dims + ic/2; device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); diff --git a/src/ggml.c b/src/ggml.c index 63a22223..3f72379c 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -1,4 +1,3 @@ -#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #include "ggml.h" @@ -107,6 +106,9 @@ typedef void * thread_ret_t; #include #include +#endif +#ifdef GGML_USE_CPU_HBM +#include #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -196,9 +198,15 @@ typedef void * thread_ret_t; #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else inline static void * ggml_aligned_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + return NULL; + } void * aligned_memory = NULL; -#ifdef GGML_USE_METAL - int result = posix_memalign(&aligned_memory, getpagesize(), size); +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL + int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); #else int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); #endif @@ -219,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) { return aligned_memory; } #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#ifdef GGML_USE_CPU_HBM +#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif +#endif #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -4572,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { return NULL; } + // allow to call ggml_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_MEM_ALIGN; + } + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); *ctx = (struct ggml_context) { @@ -4774,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( size_t obj_alloc_size = 0; - if (view_src == NULL && ctx->no_alloc == false) { + if (view_src == NULL && !ctx->no_alloc) { if (ctx->scratch.data != NULL) { // allocate tensor data in the scratch buffer if (ctx->scratch.offs + data_size > ctx->scratch.size) { @@ -5475,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -5518,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -19962,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p struct ggml_tensor * data = NULL; - if (params.no_alloc == false) { + if (!params.no_alloc) { data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); ok = ok && data != NULL; @@ -20003,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } // point the data member to the appropriate location in the binary blob using the tensor infos - if (params.no_alloc == false) { + if (!params.no_alloc) { //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data } diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 0bb9537f..cbea7d45 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -76,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) { return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; } -void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function function) { +void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function & function) { int64_t min_time_us = INT64_MAX; int64_t total_time_us = 0; int64_t min_time_cycles = INT64_MAX;