case GGML_FTYPE_MOSTLY_IQ4_NL:
case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M:
+ case GGML_FTYPE_MOSTLY_BF16:
    {
        fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
        return false;
    }

case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_BF16:
case GGML_TYPE_COUNT:
    {
        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
        return false;
    }
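
Both hunks route the new BF16 enum values to the existing error paths: bf16 is a 16-bit float format (fp32's sign bit and 8 exponent bits, with the mantissa cut down to 7 bits), not a block-quantized format, so the quantizer has nothing to do with it. A minimal conversion sketch, assuming plain truncation (the helper name is hypothetical, not part of the diff):

#include <stdint.h>
#include <string.h>

// fp32 -> bf16 by keeping the high 16 bits; production converters
// typically add round-to-nearest-even instead of truncating.
static uint16_t fp32_to_bf16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    return (uint16_t)(bits >> 16);
}
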
dst[tpig] = x / (1.0f + exp(-x));
}

+kernel void kernel_silu_4(
+        device const float4 * src0,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+    dst[tpig] = x / (1.0f + exp(-x));
+}
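
For reference, both kernels evaluate the same scalar function, silu(x) = x / (1 + e^(-x)); the new _4 variant simply processes four lanes per thread using Metal's component-wise float4 arithmetic. A CPU sketch of the same formula (the helper below is illustrative):

#include <math.h>

// Reference SiLU: what each Metal thread computes per element.
static inline float silu_ref(float x) {
    return x / (1.0f + expf(-x));
}
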
// ALiBi
if (max_bias > 0.0f) {
-    const short h = iq2;
+    const uint32_t h = iq2;

    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

// ALiBi
if (max_bias > 0.0f) {
-    const short h = iq2;
+    const uint32_t h = iq2;

    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
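
Widening h from short to uint32_t keeps the head index well-defined for large head counts, presumably so it cannot truncate and so the comparison against n_head_log2 is done in one unsigned type. The value these hunks feed into the softmax is the per-head ALiBi slope: early heads use base m0, the rest use m1 with an odd exponent. A standalone C sketch of that computation (base/exph mirror the kernel; the wrapper function is illustrative):

#include <math.h>
#include <stdint.h>

// Per-head ALiBi slope, mirroring the kernel's base/exponent selection.
static float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1) {
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
    return powf(base, exph);
}
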
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
-#include "sgemm.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
+#undef GGML_USE_LLAMAFILE
#endif

+#ifdef GGML_USE_LLAMAFILE
+#include "sgemm.h"
+#endif
+
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
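
Net effect: sgemm.h is pulled in only when GGML_USE_LLAMAFILE survives the MSVC/MinGW undef, so the llamafile SGEMM path never builds on those toolchains and gated call sites compile down to the generic fallback. A minimal sketch of the same compile-time fallback pattern (the call site is illustrative, not the actual ggml code):

#include <stdbool.h>

#if defined(_MSC_VER) || defined(__MINGW32__)
#undef GGML_USE_LLAMAFILE   // optional fast path unsupported here
#endif

#ifdef GGML_USE_LLAMAFILE
#include "sgemm.h"
#endif

// Hypothetical call site: report whether the fast path ran so the
// caller can fall back to the generic matmul when it did not.
static bool try_fast_sgemm(void) {
#ifdef GGML_USE_LLAMAFILE
    // llamafile_sgemm(...) would be invoked here.
    return true;
#else
    return false;
#endif
}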