const int nb = k / QK;
const size_t bs = sizeof(float) + QK/2;
- uint8_t * restrict pd = (uint8_t *) (y + 0*bs);
- uint8_t * restrict pb = (uint8_t *) (y + 0*bs + sizeof(float));
+ uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
+ uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
uint8_t pp[QK/2];
const int nb = k / QK;
const size_t bs = sizeof(float) + QK/2;
- const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
- const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
+ const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+ const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
// scalar
for (int i = 0; i < nb; i++) {
const size_t bs = sizeof(float) + QK/2;
- const uint8_t * restrict pd0 = (const uint8_t *) (x + 0*bs);
- const uint8_t * restrict pd1 = (const uint8_t *) (y + 0*bs);
+ const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
+ const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
- const uint8_t * restrict pb0 = (const uint8_t *) (x + 0*bs + sizeof(float));
- const uint8_t * restrict pb1 = (const uint8_t *) (y + 0*bs + sizeof(float));
+ const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + sizeof(float));
+ const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + sizeof(float));
float sumf = 0.0;
const int nb = n / QK;
const size_t bs = sizeof(float) + QK/2;
- const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
- const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
+ const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+ const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
#if __ARM_NEON
#if QK == 32
// create the ggml context
{
struct ggml_init_params params = {
- .mem_size = ctx_size,
- .mem_buffer = NULL,
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
};
model.ctx = ggml_init(params);
}
struct ggml_init_params params = {
- .mem_size = buf_size,
- .mem_buffer = buf,
+ /*.mem_size =*/ buf_size,
+ /*.mem_buffer =*/ buf,
};
struct ggml_context * ctx0 = ggml_init(params);
- struct ggml_cgraph gf = { .n_threads = n_threads };
+ ggml_cgraph gf = {};
+ gf.n_threads = n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
}
int main(int argc, char ** argv) {
+ ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
gpt_params params;
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
+ ggml_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
#include <fstream>
#include <regex>
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // MSVC/MinGW declare alloca() in <malloc.h>; there is no <alloca.h>
+ #elif !defined(__FreeBSD__)
+ #include <alloca.h>
+ #endif
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
assert(k % qk == 0);
- uint8_t pp[qk/2];
+ const size_t pp_size = qk / 2;
+ uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
char * pdst = (char *) dst;
pp[l/2] = vi0 | (vi1 << 4);
}
- memcpy(pb, pp, sizeof(pp));
+ memcpy(pb, pp, pp_size);
pb += bs;
}
}
assert(k % qk == 0);
- uint8_t pp[qk/2];
+ const size_t pp_size = qk / 2;
+ uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
char * pdst = (char *) dst;
pp[l/2] = vi0 | (vi1 << 4);
}
- memcpy(pb + i*qk/2, pp, sizeof(pp));
+ memcpy(pb + i*qk/2, pp, pp_size);
}
}
}