}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
- //// broadcast
- //if (tpitg == 0) {
- // sum[0] /= ne00;
- //}
- //threadgroup_barrier(mem_flags::mem_threadgroup);
- const float mean = sum[0];
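+ // sum[0] now holds the sum of the whole row; each thread derives the mean from it directly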
+ const float mean = sum[0] / ne00;
// recenter and VARIANCE
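+ // make sure every thread has read sum[0] before it is overwritten below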
+ threadgroup_barrier(mem_flags::mem_threadgroup);
device float * y = dst + tgpig*ne00;
sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
    y[i00] = x[i00] - mean;
sum[tpitg] += y[i00] * y[i00];
}
- //// VARIANCE
- //// parallel sum
- //sum[tpitg] = 0.0f;
- //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- // sum[tpitg] += y[i00] * y[i00];
- //}
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) {
    if (tpitg < i) {
        sum[tpitg] += sum[tpitg + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
- //// broadcast
- //if (tpitg == 0) {
- // sum[0] /= ne00;
- //}
- //threadgroup_barrier(mem_flags::mem_threadgroup);
- const float variance = sum[0];
+ const float variance = sum[0] / ne00;
const float scale = 1.0f/sqrt(variance + eps);
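// normalize: scale the recentered values by 1/sqrt(variance + eps)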
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
    y[i00] = y[i00] * scale;
}
}
-
kernel void kernel_rms_norm(
device const void * src0,
device float * dst,
}
}
}
-
}
kernel void kernel_alibi_f32(
constant int & mode,
constant float & freq_base,
constant float & freq_scale,
- uint3 tpig[[thread_position_in_grid]]) {
- const int64_t i3 = tpig[2];
- const int64_t i2 = tpig[1];
- const int64_t i1 = tpig[0];
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint3 tptg[[threads_per_threadgroup]],
+ uint3 tgpig[[threadgroup_position_in_grid]]) {
+ const int64_t i3 = tgpig[2];
+ const int64_t i2 = tgpig[1];
+ const int64_t i1 = tgpig[0];
const bool is_neox = mode & 2;
- const float theta_scale = pow(freq_base, -2.0f/n_dims);
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
- float theta = freq_scale * (float)p;
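+ // theta_0 is the base angle for this token position; inv_ndims is the (negative) per-element exponent scale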
+ const float theta_0 = freq_scale * (float)p;
+ const float inv_ndims = -1.f/n_dims;
if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
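+ // each thread in the threadgroup handles the pairs starting at 2*tiitg, striding by 2*tptg.x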
+ for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+ const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
- theta *= theta_scale;
-
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
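+ // same per-thread striding, applied within each block of n_dims elements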
+ for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+ const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
- theta *= theta_scale;
-
const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
#include <sys/stat.h>
#include <unistd.h>
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
inline static void * ggml_aligned_malloc(size_t size) {
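+ // handle size == 0 explicitly: posix_memalign may return either NULL or a unique pointer for zero-size requests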
+ if (size == 0) {
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+ return NULL;
+ }
void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
+#ifdef GGML_USE_CPU_HBM
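+ // allocate from high-bandwidth memory via the memkind hbwmalloc API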
+ int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
#endif
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
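+// buffers obtained from hbw_posix_memalign must be released with hbw_free, not free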
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
+#endif
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
return NULL;
}
+ // allow calling ggml_init with 0 size
+ if (params.mem_size == 0) {
+ params.mem_size = GGML_MEM_ALIGN;
+ }
+
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
size_t obj_alloc_size = 0;
- if (view_src == NULL && ctx->no_alloc == false) {
+ if (view_src == NULL && !ctx->no_alloc) {
if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
}
if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
}
if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * data = NULL;
- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != NULL;
}
// point the data member to the appropriate location in the binary blob using the tensor infos
- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
}