)
endif()
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+ remove_definitions(-D_XOPEN_SOURCE=600)
+ add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity
+# are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+ add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
+ add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+ add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+ add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+ add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+ add_compile_definitions(_BSD_SOURCE)
+endif()
+
+if (WHISPER_PERF)
+ set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
+endif()
+
# dependencies
set(CMAKE_C_STANDARD 11)
return true;
}
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
return speaker;
}
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
return true;
}
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-// defines MAP_ANONYMOUS
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
#include "ggml-alloc.h"
#include "ggml.h"
#include <assert.h>
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG
- GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+ GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+ max_avail = MAX(max_avail, block->size);
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
- max_avail = MAX(max_avail, block->size);
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
- return;
+ return;
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
#if defined(_WIN32)
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES)
- return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (ptr == MAP_FAILED) {
+ return NULL;
+ }
+ return ptr;
#else
// use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100;
void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
- const int result = posix_memalign((void **) &data, getpagesize(), n);
+ const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__);
return NULL;
}
}
- const size_t size_page = getpagesize();
+ const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
} break;
case GGML_OP_DUP:
case GGML_OP_CPY:
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
- //// broadcast
- //if (tpitg == 0) {
- // sum[0] /= ne00;
- //}
- //threadgroup_barrier(mem_flags::mem_threadgroup);
- const float mean = sum[0];
+ const float mean = sum[0] / ne00;
// recenter and VARIANCE
+ threadgroup_barrier(mem_flags::mem_threadgroup);
device float * y = dst + tgpig*ne00;
sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
sum[tpitg] += y[i00] * y[i00];
}
- //// VARIANCE
- //// parallel sum
- //sum[tpitg] = 0.0f;
- //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- // sum[tpitg] += y[i00] * y[i00];
- //}
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) {
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
- //// broadcast
- //if (tpitg == 0) {
- // sum[0] /= ne00;
- //}
- //threadgroup_barrier(mem_flags::mem_threadgroup);
- const float variance = sum[0];
+ const float variance = sum[0] / ne00;
const float scale = 1.0f/sqrt(variance + eps);
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
}
}
-
kernel void kernel_rms_norm(
device const void * src0,
device float * dst,
}
}
}
-
}
kernel void kernel_alibi_f32(
constant int & mode,
constant float & freq_base,
constant float & freq_scale,
- uint3 tpig[[thread_position_in_grid]]) {
- const int64_t i3 = tpig[2];
- const int64_t i2 = tpig[1];
- const int64_t i1 = tpig[0];
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint3 tptg[[threads_per_threadgroup]],
+ uint3 tgpig[[threadgroup_position_in_grid]]) {
+ const int64_t i3 = tgpig[2];
+ const int64_t i2 = tgpig[1];
+ const int64_t i1 = tgpig[0];
const bool is_neox = mode & 2;
- const float theta_scale = pow(freq_base, -2.0f/n_dims);
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
- float theta = freq_scale * (float)p;
+ const float theta_0 = freq_scale * (float)p;
+ const float inv_ndims = -1.f/n_dims;
if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+ const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
- theta *= theta_scale;
-
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
+ for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+ const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
- theta *= theta_scale;
-
const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
#include "ggml.h"
#include <sys/stat.h>
#include <unistd.h>
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
inline static void * ggml_aligned_malloc(size_t size) {
+ if (size == 0) {
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+ return NULL;
+ }
void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
- int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+ int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+ int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
#endif
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
+#endif
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
return NULL;
}
+ // allow to call ggml_init with 0 size
+ if (params.mem_size == 0) {
+ params.mem_size = GGML_MEM_ALIGN;
+ }
+
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
size_t obj_alloc_size = 0;
- if (view_src == NULL && ctx->no_alloc == false) {
+ if (view_src == NULL && !ctx->no_alloc) {
if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
}
if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
}
if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * data = NULL;
- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != NULL;
}
// point the data member to the appropriate location in the binary blob using the tensor infos
- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
}
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
}
-void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
+void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
int64_t min_time_us = INT64_MAX;
int64_t total_time_us = 0;
int64_t min_time_cycles = INT64_MAX;