# warning flags
+if (GGML_ALL_WARNINGS)
+ if (NOT MSVC)
+ set(c_flags -Wall -Wpedantic -Wformat=2 -Wno-unused -Wstrict-prototypes)
+ set(cxx_flags -Wall -Wpedantic -Wformat=2)
+ else()
+ # TODO: add the equivalent warning flags for MSVC
+ endif()
+
+ add_compile_options(
+ "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+ "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+ )
+endif()
+
if (NOT MSVC)
add_compile_options(-Werror=vla)
endif()
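The $<COMPILE_LANGUAGE:C> and $<COMPILE_LANGUAGE:CXX> generator expressions keep the C-only flags (notably -Wstrict-prototypes) away from C++ translation units. Most of the source hunks that follow exist to satisfy these new flags, and the recurring offender is -Wsign-compare (implied by -Wall for C++): a signed loop index compared against an unsigned size(). A minimal sketch of the warning and of the two fixes the patch uses, widening the index to size_t (printed with %zu) or casting the unsigned side to the signed type it is compared against; the names below are illustrative, not from the patch:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> v = {1, 2, 3};
    const int32_t n_batch = 2;

    // for (int i = 0; i < v.size(); i++)   // warns: comparison of integer
    //                                      // expressions of different signedness

    // fix 1: make the index unsigned and print it with %zu
    for (size_t i = 0; i < v.size(); i++) {
        printf("v[%zu] = %d\n", i, v[i]);
    }

    // fix 2: when the other side has to stay signed, cast the size explicitly
    if (int32_t(v.size()) > n_batch) {
        printf("more than %d elements\n", n_batch);
    }
    return 0;
}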
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int)embd_inp.size());
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
- for (int i = 0; i < embd_inp.size(); i++) {
- printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+ for (size_t i = 0; i < embd_inp.size(); i++) {
+ printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
}
printf("\n");
const int32_t end_token = vocab.token_to_id["### End"];
- for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
- for (int k = i; k < embd_inp.size(); k++) {
+ for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
- if (embd.size() > params.n_batch) {
+ if (int32_t(embd.size()) > params.n_batch) {
break;
}
}
}
#if defined(DOLLY_INTERACTIVE_PORT)
- int sockfd;
+ int sockfd = -1;
if (params.interactive_port != -1) {
sockfd = setup_port(params.interactive_port);
if (sockfd == -1) {
while (true) {
std::string prompt_input;
#if defined(DOLLY_INTERACTIVE_PORT)
- int clientfd;
+ int clientfd = -1;
if (params.interactive_port != -1) {
sockaddr_in clientaddr;
socklen_t clientaddrlen = sizeof(clientaddr);
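Starting both descriptors at -1 fixes the may-be-used-uninitialized warning on the non-interactive path and doubles as a sentinel that later code can test. A sketch of the guard this enables (a hypothetical teardown helper, assuming POSIX close(); not code from the example):

#include <unistd.h>

// safe to call whether or not an interactive port was ever opened
void teardown_socket(int sockfd) {
    if (sockfd != -1) {
        close(sockfd);
    }
}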
// this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
std::vector<gpt_vocab::id> embd;
- for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
- for (int k = i; k < embd_inp.size(); k++) {
+ for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
- if (embd.size() >= params.n_batch) {
+ if (int32_t(embd.size()) >= params.n_batch) {
break;
}
}
size_t mem_per_token = 0;
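// determine the required inference memory per token (warm-up pass over a few dummy tokens)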
gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
- for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
- for (int k = i; k < embd_inp.size(); k++) {
+ for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
- if (embd.size() > params.n_batch) {
+ if (int32_t(embd.size()) > params.n_batch) {
break;
}
}
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
- for (int i = 0; i < embd_inp.size(); i++) {
- printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+ for (size_t i = 0; i < embd_inp.size(); i++) {
+ printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
}
printf("\n");
size_t mem_per_token = 0;
gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
- for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
- for (int k = i; k < embd_inp.size(); k++) {
+ for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
- if (embd.size() > params.n_batch) {
+ if (int32_t(embd.size()) > params.n_batch) {
break;
}
}
struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
// param export/import test
- GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == 0xdeadbeef);
+ GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == int(0xdeadbeef));
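+ // op_params holds int32_t values, while 0xdeadbeef is an unsigned literal; casting to int keeps the comparison signed on both sides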
// allocate work context
// needed during ggml_graph_compute() to allocate a work tensor
// Convert token from utf-8
std::wstring word_multibytes = convert_to_wstring(word);
word.resize(word_multibytes.size());
- for (int w = 0; w < word_multibytes.size(); w++) {
+ for (size_t w = 0; w < word_multibytes.size(); w++) {
word[w] = uint8_t(word_multibytes[w]);
}
std::vector<float> best_segmentations_scores(word.length() + 1, -std::numeric_limits<float>::infinity());
best_segmentations_scores[0] = 1.0;
- for (int start_idx = 0; start_idx < word.length(); ++start_idx) {
+ for (size_t start_idx = 0; start_idx < word.length(); ++start_idx) {
float best_score_at_start = best_segmentations_scores[start_idx];
- for (int end_idx = start_idx + 1; end_idx <= word.length(); ++end_idx) {
+ for (size_t end_idx = start_idx + 1; end_idx <= word.length(); ++end_idx) {
std::string token = word.substr(start_idx, end_idx - start_idx);
if (model.count(token) && best_score_at_start != -std::numeric_limits<float>::infinity()) {
float token_score = model.at(token).second;
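For context, the loops just fixed implement the forward pass of a Unigram-style segmentation: best_segmentations_scores[end_idx] tracks the best-scoring way to tokenize the prefix word[0:end_idx], extending it one candidate token at a time. A self-contained sketch of the same dynamic program over a toy vocabulary, assuming additive scores where higher is better (tokens and values are made up for illustration):

#include <cstdio>
#include <limits>
#include <map>
#include <string>
#include <vector>

int main() {
    // toy vocabulary: token -> score
    const std::map<std::string, float> model = {
        {"un", -1.0f}, {"believ", -2.5f}, {"able", -1.5f}, {"unbeliev", -3.5f},
    };
    const std::string word = "unbelievable";
    const float neg_inf = -std::numeric_limits<float>::infinity();

    std::vector<float> best(word.length() + 1, neg_inf);
    best[0] = 0.0f; // the empty prefix costs nothing

    for (size_t start_idx = 0; start_idx < word.length(); ++start_idx) {
        if (best[start_idx] == neg_inf) {
            continue; // no segmentation reaches this prefix
        }
        for (size_t end_idx = start_idx + 1; end_idx <= word.length(); ++end_idx) {
            const std::string token = word.substr(start_idx, end_idx - start_idx);
            const auto it = model.find(token);
            if (it != model.end() && best[start_idx] + it->second > best[end_idx]) {
                best[end_idx] = best[start_idx] + it->second;
            }
        }
    }

    printf("best score for '%s': %f\n", word.c_str(), best[word.length()]);
    return 0;
}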
std::string word;
std::vector<char> buf(128);
- for (std::size_t i = 0; i < max_vocab_size; i++) {
+ for (int i = 0; i < max_vocab_size; i++) {
uint32_t len;
fin.read((char *)&len, sizeof(len));
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
- for (int i = 0; i < embd_inp.size(); i++) {
- printf("%s: token[%d] = %6zu\n", __func__, i, embd_inp[i]);
+ for (size_t i = 0; i < embd_inp.size(); i++) {
+ printf("%s: token[%zu] = %6zu\n", __func__, i, embd_inp[i]);
// vocab.id_to_token.at(embd_inp[i]).c_str()
}
printf("\n");
size_t mem_per_token = 0;
replit_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
- for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
- for (int k = i; k < embd_inp.size(); k++) {
+ for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
- if (embd.size() > params.n_batch) {
+ if (int32_t(embd.size()) > params.n_batch) {
break;
}
}
#include <cassert>
#include <cmath>
+#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
}
}
- if (n_tensors != model.tensors.size()) {
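+ // n_tensors is signed; comparing through ptrdiff_t (hence the new <cstddef> include) keeps both sides of the comparison signed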
+ if (n_tensors != ptrdiff_t(model.tensors.size())) {
fprintf(stderr, "%s: model file has %d tensors, but %d tensors were expected\n", __func__, n_tensors, (int) model.tensors.size());
return false;
}
}
// Add StarChat special tokens.
- for (const std::string & token : {
+ for (std::string token : {
"<|system|>",
"<|user|>",
"<|assistant|>",
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
- for (int i = 0; i < embd_inp.size(); i++) {
- printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+ for (size_t i = 0; i < embd_inp.size(); i++) {
+ printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
}
printf("\n\n");
size_t mem_per_token = 0;
starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
- for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
last_n_tokens.push_back(id);
} else {
// if here, it means we are still processing the input prompt
- for (int k = i; k < embd_inp.size(); k++) {
+ for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[k]);
- if (embd.size() >= params.n_batch) {
+ if (int32_t(embd.size()) >= params.n_batch) {
break;
}
}
}
// Add StarChat special tokens.
- for (const std::string & token : {
+ for (std::string token : {
"<|system|>",
"<|user|>",
"<|assistant|>",
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
- for (int i = 0; i < embd_inp.size(); i++) {
- printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+ for (size_t i = 0; i < embd_inp.size(); i++) {
+ printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
}
printf("\n\n");
printf("Calling starcoder_eval\n");
starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
- for (int i = int(embd.size()); i < embd_inp.size() + params.n_predict; i++) {
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
- for (int k = i; k < embd_inp.size(); k++) {
+ for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
- if (embd.size() >= params.n_batch) {
+ if (int32_t(embd.size()) >= params.n_batch) {
break;
}
}
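The two StarChat loop-variable changes above drop the const std::string & in favor of a by-value std::string, presumably to quiet GCC's -Wrange-loop-construct: with string literals the deduced range is an initializer_list<const char*>, so the reference would bind to a temporary std::string constructed on every iteration, and taking the string by value costs the same construction without the diagnostic. A minimal sketch:

#include <cstdio>
#include <string>

int main() {
    // with a `const std::string &` loop variable, recent GCC warns under
    // -Wrange-loop-construct: each const char* element converts to a
    // temporary std::string that the reference then binds to
    for (std::string token : { "<|system|>", "<|user|>", "<|assistant|>" }) {
        printf("%s\n", token.c_str());
    }
    return 0;
}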
if (GGML_ALL_WARNINGS)
- if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
- #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra")
+ if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
- -Wall \
+ -Wunused \
-Wextra \
- -Wpedantic \
-Wshadow \
-Wcast-qual \
- -Wstrict-prototypes \
- -Wpointer-arith \
-Wdouble-promotion \
-Wno-unused-function \
-Wmissing-prototypes \
return result;
}
-static void print_elements(const char* label, const struct ggml_tensor * t) {
- if (!t) {
- printf("%s: %s = null\n", __func__, label);
- return;
- }
- const int nelements = ggml_nelements(t);
- printf("%s: %s = [", __func__, label);
- for (int k = 0; k < nelements; ++k) {
- if (k > 0) { printf(", "); }
- printf("%.5f", ggml_get_f32_1d(t, k));
- }
- printf("] shape: [");
- for (int k = 0; k < t->n_dims; ++k) {
- if (k > 0) { printf(", "); }
- printf("%d", (int)t->ne[k]);
- }
- printf("]\n");
-
-}
-
static bool check_gradient(
const char * op_name,
struct ggml_context * ctx0,
#define MAX_NARGS 2
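+// in C, "float frand()" declares a function with unspecified parameters; the explicit "(void)" forms a full prototype, which the newly added -Wstrict-prototypes flag requires (likewise for the quantize_*_per_block() and get_time_us() changes below)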
-float frand() {
+float frand(void) {
return (float)rand()/(float)RAND_MAX;
}
const struct ggml_tensor * y,
const struct ggml_tensor * x0,
const struct ggml_tensor * x1) {
- float * dst = (float *) y->data;
- float * src0 = (float *) x0->data;
- float * src1 = (float *) x1->data;
-
const int64_t n00 = x0->ne[0];
const int64_t n10 = x0->ne[1];
const int64_t n20 = x0->ne[2];
#define gq_t_bits 64
#define gq_quant_t uint64_t
-float frand() {
+float frand(void) {
return (float) rand() / (float) RAND_MAX;
}
return k/QK;
}
-static inline int quantize_1_quants_per_block() {
+static inline int quantize_1_quants_per_block(void) {
return QK/gq_t_bits;
}
return k/QK;
}
-static inline int quantize_2_quants_per_block() {
+static inline int quantize_2_quants_per_block(void) {
return QK/gq_t_bits;
}
int m, int n, int k) {
assert(k % QK == 0);
- const int nb = quantize_2_blocks_per_row(k);
- const int nq = quantize_2_quants_per_block();
-
for (int ir0 = 0; ir0 < m; ir0++) {
for (int ir1 = 0; ir1 < n; ir1++) {
vec_dot_gq_2(k, dst + ir1, src0, src1);
return k/QK;
}
-static inline int quantize_3_quants_per_block() {
+static inline int quantize_3_quants_per_block(void) {
return QK/gq_t_bits;
}
int m, int n, int k) {
assert(k % 32 == 0);
- const int nb = quantize_6_blocks_per_row(k);
-
for (int ir0 = 0; ir0 < m; ir0++) {
for (int ir1 = 0; ir1 < n; ir1++) {
vec_dot_gq_6(k, dst + ir1, src0, src1);
}
}
-uint64_t get_time_us() {
+uint64_t get_time_us(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;