#endif
// the compute plan that needs to be prepared for ggml_graph_compute()
- // since https://github.com/ggerganov/ggml/issues/287
+ // since https://github.com/ggml-org/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
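// A minimal usage sketch, assuming the ggml_graph_plan()/ggml_graph_compute() signatures
// from ggml.h (they may differ between ggml versions); `graph` is a previously built
// ggml_cgraph and the work buffer is owned by the caller (requires <stdlib.h>):
//
//     struct ggml_cplan cplan = ggml_graph_plan(graph, /*n_threads=*/4, /*threadpool=*/NULL);
//     uint8_t * work = cplan.work_size > 0 ? malloc(cplan.work_size) : NULL;
//     cplan.work_data = work;                              // caller allocates the work buffer
//     enum ggml_status status = ggml_graph_compute(graph, &cplan);
//     free(work);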
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
- "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
+ "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
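// A minimal sketch of wiring up the abort callback, assuming the standard ggml_abort_callback
// contract (return true to abort the in-flight graph compute); `backend`, `app_state`, and
// `g_app_state` are placeholder names, not identifiers from this header:
//
//     static bool app_should_abort(void * data) {
//         const struct app_state * st = data;   // hypothetical application state
//         return st->stop_requested;            // true => abort the current compute
//     }
//     ...
//     ggml_backend_metal_set_abort_callback(backend, app_should_abort, &g_app_state);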
#if __FINITE_MATH_ONLY__
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif
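// -ffast-math implies -ffinite-math-only, which is what trips the check above; adding
// -fno-finite-math-only afterwards re-enables non-finite values. Build invocations are
// project-specific, but the idea is roughly:
//
//     cc -O3 -ffast-math -fno-finite-math-only -c ggml.c
//     cmake -B build -DCMAKE_C_FLAGS="-fno-finite-math-only"   # alternative via CMake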
#if defined(__ARM_NEON) && defined(__aarch64__)
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
// If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
- // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
+    // Also, chunking by thread was measured to perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
// In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
// distribute the thread work across the inner or outer loop based on which one is larger
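        // In sketch form, the fallback collapses to one chunk per thread along whichever
        // dimension is larger (nr0/nr1 being the src0/src1 row counts, cf. nchunk1 above):
        //
        //     nchunk0 = nr0 > nr1 ? nth : 1;   // parallelize by src0 rows
        //     nchunk1 = nr0 > nr1 ? 1 : nth;   // parallelize by src1 rows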
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// TODO: add ggml_metal_kargs struct
- // TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
+ // TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
if (id_src1) {
}
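// How such a dispatch typically continues, as a sketch (buffer indices, the `args`
// struct, and the threadgroup sizes are illustrative, not taken from this kernel):
//
//     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
//     [encoder setBytes:&args length:sizeof(args) atIndex:3];
//     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03)
//             threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];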
// This barrier fixes a failing test
- // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+ // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
threadgroup_barrier(mem_flags::mem_none);
float sum = simd_sum(lsum);
const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
// This barrier fixes a failing test
- // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+ // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
threadgroup_barrier(mem_flags::mem_none);
float sum = simd_sum(lsum);
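// Sketch of the surrounding per-threadgroup reduction, for context (names such as
// `psrc`, `tpitg`, and `ntg` are illustrative, not lifted from this kernel): each thread
// accumulates a partial sum, then simd_sum() combines the per-lane values within the
// SIMD group; the barrier before it is the fix referenced in the linked discussion.
//
//     float lsum = 0.0f;
//     for (int i = tpitg; i < n; i += ntg) {
//         lsum += psrc[i];
//     }
//     threadgroup_barrier(mem_flags::mem_none);
//     float sum = simd_sum(lsum);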