#include <string>
#include <thread>
#include <vector>
+#include <regex>
#define USE_FLASH_ATTN
//#define USE_FLASH_FF
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
+ // used to avoid memory allocations during sampling
+ // TODO: move to whisper_context in the future
+ std::vector<std::pair<double, whisper_vocab::id>> probs_id;
+
id token_eot = 50256;
id token_sot = 50257;
id token_prev = 50360;
int32_t exp_n_audio_ctx; // 0 - use default
};
+template<typename T>
+static void read_safe(std::ifstream& fin, T& dest)
+{
+ fin.read((char*)& dest, sizeof(T));
+}
+
// load the model from a ggml file
//
// file format:
// verify magic
{
uint32_t magic;
- fin.read((char *) &magic, sizeof(magic));
+ read_safe(fin, magic);
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
{
auto & hparams = model.hparams;
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
- fin.read((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
- fin.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
- fin.read((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
- fin.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
- fin.read((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
- fin.read((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
- fin.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
- fin.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
- fin.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+ read_safe(fin, hparams.n_vocab);
+ read_safe(fin, hparams.n_audio_ctx);
+ read_safe(fin, hparams.n_audio_state);
+ read_safe(fin, hparams.n_audio_head);
+ read_safe(fin, hparams.n_audio_layer);
+ read_safe(fin, hparams.n_text_ctx);
+ read_safe(fin, hparams.n_text_state);
+ read_safe(fin, hparams.n_text_head);
+ read_safe(fin, hparams.n_text_layer);
+ read_safe(fin, hparams.n_mels);
+ read_safe(fin, hparams.f16);
assert(hparams.n_text_state == hparams.n_audio_state);
{
auto & filters = wctx.model.filters;
- fin.read((char *) &filters.n_mel, sizeof(filters.n_mel));
- fin.read((char *) &filters.n_fft, sizeof(filters.n_fft));
+ read_safe(fin, filters.n_mel);
+ read_safe(fin, filters.n_fft);
filters.data.resize(filters.n_mel * filters.n_fft);
fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
// load vocab
{
int32_t n_vocab = 0;
- fin.read((char *) &n_vocab, sizeof(n_vocab));
+ read_safe(fin, n_vocab);
//if (n_vocab != model.hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
//}
std::string word;
+ std::vector<char> tmp;
+
+ tmp.reserve(128);
+
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
- fin.read((char *) &len, sizeof(len));
+ read_safe(fin, len);
- word.resize(len);
- fin.read((char *) word.data(), len);
+ if (len > 0) {
+ tmp.resize(len);
+ fin.read(&tmp[0], tmp.size()); // read to buffer
+ word.assign(&tmp[0], tmp.size());
+ } else {
+ // seems like we have an empty-string token in multi-language models (i = 50256)
+ //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
+ word = "";
+ }
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
+
+ wctx.logits.reserve(vocab.n_vocab*model.hparams.n_text_ctx);
+ wctx.probs.reserve(vocab.n_vocab*model.hparams.n_text_ctx);
+
+ vocab.probs_id.reserve(n_vocab);
}
{
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
size_t ctx_size = 0;
- size_t ctx_mem_size = 0;
{
const auto & hparams = model.hparams;
ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
}
- ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_k
- ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_v
-
- ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_k
- ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_v
-
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
// create the ggml context
{
- struct ggml_init_params params = {
- .mem_size = wctx.buf_model->size(),
- .mem_buffer = wctx.buf_model->data(),
- };
+ struct ggml_init_params params;
+ params.mem_size = wctx.buf_model->size();
+ params.mem_buffer = wctx.buf_model->data();
model.ctx = ggml_init(params);
if (!model.ctx) {
// create the ggml memory context
{
- struct ggml_init_params params = {
- .mem_size = wctx.buf_memory.size(),
- .mem_buffer = wctx.buf_memory.data(),
- };
+ struct ggml_init_params params;
+ params.mem_size = wctx.buf_memory.size();
+ params.mem_buffer = wctx.buf_memory.data();
model.ctx_mem = ggml_init(params);
if (!model.ctx_mem) {
int32_t length;
int32_t ftype;
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ read_safe(fin, n_dims);
+ read_safe(fin, length);
+ read_safe(fin, ftype);
if (fin.eof()) {
break;
int32_t nelements = 1;
int32_t ne[3] = { 1, 1, 1 };
for (int i = 0; i < n_dims; ++i) {
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ read_safe(fin, ne[i]);
nelements *= ne[i];
}
- std::string name(length, 0);
- fin.read(&name[0], length);
+ std::string name;
+ std::vector<char> tmp(length); // create a buffer
+ fin.read(&tmp[0], tmp.size()); // read to buffer
+ name.assign(&tmp[0], tmp.size());
- if (model.tensors.find(name.data()) == model.tensors.end()) {
+ if (model.tensors.find(name) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
- struct ggml_init_params params = {
- .mem_size = wctx.buf_compute.size(),
- .mem_buffer = wctx.buf_compute.data(),
- };
+ struct ggml_init_params params;
+ params.mem_size = wctx.buf_compute.size();
+ params.mem_buffer = wctx.buf_compute.data();
struct ggml_context * ctx0 = ggml_init(params);
// create separate context for each layer to reduce memory usage
- struct ggml_init_params paramsL = {
- .mem_size = wctx.buf_compute_layer.size(),
- .mem_buffer = wctx.buf_compute_layer.data(),
- };
+ struct ggml_init_params paramsL;
+ paramsL.mem_size = wctx.buf_compute_layer.size();
+ paramsL.mem_buffer = wctx.buf_compute_layer.data();
struct ggml_context * ctxL = ggml_init(paramsL);
// input for next layer (inpO -> inpL)
memcpy(inpL->data, inpO->data, ggml_nbytes(inpL));
inpL->op = GGML_OP_NONE;
- inpL->src0 = NULL;
- inpL->src1 = NULL;
+ inpL->src0 = nullptr;
+ inpL->src1 = nullptr;
//printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0);
// TODO: hack to disconnect the encoded features from the previous graph
cur->op = GGML_OP_NONE;
- cur->src0 = NULL;
- cur->src1 = NULL;
+ cur->src0 = nullptr;
+ cur->src1 = nullptr;
for (int il = 0; il < model.hparams.n_text_layer; ++il) {
auto & layer = model.layers_decoder[il];
const int N = n_tokens;
const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
- struct ggml_init_params params = {
- .mem_size = wctx.buf_compute.size(),
- .mem_buffer = wctx.buf_compute.data(),
- };
+ struct ggml_init_params params;
+ params.mem_size = wctx.buf_compute.size();
+ params.mem_buffer = wctx.buf_compute.data();
struct ggml_context * ctx0 = ggml_init(params);
for (int il = 0; il < n_layer; ++il) {
const auto & layer = model.layers_decoder[il];
- struct ggml_init_params paramsL = {
- .mem_size = wctx.buf_compute_layer.size(),
- .mem_buffer = wctx.buf_compute_layer.data(),
- };
+ struct ggml_init_params paramsL;
+ paramsL.mem_size = wctx.buf_compute_layer.size();
+ paramsL.mem_buffer = wctx.buf_compute_layer.data();
struct ggml_context * ctxL = ggml_init(paramsL);
struct ggml_cgraph gf = {};
// input for next layer (inpO -> inpL)
memcpy(inpL->data, inpO->data, ggml_nbytes(inpL));
inpL->op = GGML_OP_NONE;
- inpL->src0 = NULL;
- inpL->src1 = NULL;
+ inpL->src0 = nullptr;
+ inpL->src1 = nullptr;
if (N > 1) {
//printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0);
// the most basic sampling scheme - select the top token
static whisper_token_data whisper_sample_best(
- const whisper_vocab & vocab,
+ whisper_vocab & vocab,
const float * probs,
bool force_timestamp,
bool is_initial) {
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
};
- int n_logits = vocab.id_to_token.size();
+ const int n_logits = vocab.n_vocab;
- std::vector<std::pair<double, whisper_vocab::id>> probs_id;
- probs_id.reserve(n_logits);
+ auto & probs_id = vocab.probs_id;
+ probs_id.clear();
for (int i = 0; i < n_logits; i++) {
- probs_id.push_back(std::make_pair(probs[i], i));
+ probs_id.emplace_back(probs[i], i);
}
{
std::vector<float> even;
std::vector<float> odd;
+ even.reserve(N/2);
+ odd.reserve(N/2);
+
for (int i = 0; i < N; i++) {
if (i % 2 == 0) {
even.push_back(in[i]);
static bool log_mel_spectrogram(
const float * samples,
const int n_samples,
- const int sample_rate,
+ const int /*sample_rate*/,
const int fft_size,
const int fft_step,
const int n_mel,
return true;
}
+// split text into tokens
+//
+// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+// Regex (Python):
+// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+// Regex (C++):
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, const std::string & text) {
+ std::vector<std::string> words;
+
+ // first split the text into words
+ {
+ std::string str = text;
+ std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+
+ std::regex re(pat);
+ std::smatch m;
+
+ while (std::regex_search(str, m, re)) {
+ for (auto x : m) {
+ words.push_back(x);
+ }
+ str = m.suffix();
+ }
+ }
+
+ // find the longest tokens that form the words:
+ std::vector<whisper_vocab::id> tokens;
+ for (const auto & word : words) {
+ if (word.empty()) continue;
+
+ int i = 0;
+ int n = word.size();
+ while (i < n) {
+ int j = n;
+ while (j > i) {
+ auto it = vocab.token_to_id.find(word.substr(i, j-i));
+ if (it != vocab.token_to_id.end()) {
+ tokens.push_back(it->second);
+ i = j;
+ break;
+ }
+ --j;
+ }
+ if (i == n) {
+ break;
+ }
+ if (j == i) {
+ auto sub = word.substr(i, 1);
+ if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
+ tokens.push_back(vocab.token_to_id.at(sub));
+ } else {
+ fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
+ }
+ ++i;
+ }
+ }
+ }
+
+ return tokens;
+}
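+
+// illustrative walk-through of the greedy longest-match loop above,
+// assuming a toy vocab that contains { " hello", "o" }:
+//
+//   word = " hello"  : i = 0: " hello" found                     -> tokens += id(" hello"), i = 6, done
+//   word = " helloo" : i = 0: " helloo" misses, " hello" found   -> i = 6
+//                      i = 6: "o" found                          -> tokens += id("o"), i = 7, done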
+
//
// interface implementation
//
if (!whisper_model_load(path_model, *ctx)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
- return NULL;
+ delete ctx;
+ return nullptr;
}
ctx->t_load_us = ggml_time_us() - t_start_us;
return res;
}
+int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) {
+ const auto res = tokenize(ctx->vocab, text);
+
+ if (n_max_tokens < (int) res.size()) {
+ fprintf(stderr, "%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
+ return -1;
+ }
+
+ for (int i = 0; i < (int) res.size(); i++) {
+ tokens[i] = res[i];
+ }
+
+ return res.size();
+}
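+
+// minimal usage sketch for whisper_tokenize(), assuming `ctx` is a successfully
+// initialized whisper_context; buffer size and input text are arbitrary:
+//
+//   std::vector<whisper_token> tokens(64);
+//   const int n = whisper_tokenize(ctx, "hello world", tokens.data(), (int) tokens.size());
+//   if (n >= 0) {
+//       tokens.resize(n); // n token ids were written
+//   }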
+
+int whisper_lang_max_id() {
+ auto max_id = 0;
+ for (const auto & kv : g_lang) {
+ max_id = std::max(max_id, kv.second.first);
+ }
+
+ return max_id;
+}
+
int whisper_lang_id(const char * lang) {
if (!g_lang.count(lang)) {
+ for (const auto & kv : g_lang) {
+ if (kv.second.second == lang) {
+ return kv.second.first;
+ }
+ }
+
fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
return -1;
}
return g_lang.at(lang).first;
}
+const char * whisper_lang_str(int id) {
+ for (const auto & kv : g_lang) {
+ if (kv.second.first == id) {
+ return kv.first.c_str();
+ }
+ }
+
+ fprintf(stderr, "%s: unknown language id %d\n", __func__, id);
+ return nullptr;
+}
+
+int whisper_lang_auto_detect(
+ struct whisper_context * ctx,
+ int offset_ms,
+ int n_threads,
+ float * lang_probs) {
+ const int seek = offset_ms/10;
+
+ if (seek < 0) {
+ fprintf(stderr, "%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
+ return -1;
+ }
+
+ if (seek >= ctx->mel.n_len) {
+ fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, ctx->mel.n_len*10);
+ return -2;
+ }
+
+ // run the encoder
+ if (whisper_encode(ctx, seek, n_threads) != 0) {
+ fprintf(stderr, "%s: failed to encode\n", __func__);
+ return -6;
+ }
+
+ const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
+
+ if (whisper_decode(ctx, prompt.data(), prompt.size(), 0, n_threads) != 0) {
+ fprintf(stderr, "%s: failed to decode\n", __func__);
+ return -7;
+ }
+
+ std::vector<std::pair<float, int>> probs_id;
+ for (const auto & kv : g_lang) {
+ const auto token_lang = whisper_token_lang(ctx, kv.second.first);
+ probs_id.emplace_back(ctx->probs[token_lang], kv.second.first);
+ }
+
+ // sort descending
+ {
+ using pair_type = decltype(probs_id)::value_type;
+ std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
+ return a.first > b.first;
+ });
+ }
+
+ // softmax
+ {
+ float sum = 0;
+ for (const auto & kv : probs_id) {
+ sum += exp(kv.first);
+ }
+
+ for (auto & kv : probs_id) {
+ kv.first = exp(kv.first) / sum;
+ }
+ }
+
+ {
+ for (int i = 0; i < (int) probs_id.size(); i++) {
+ if (lang_probs) {
+ lang_probs[probs_id[i].second] = probs_id[i].first;
+ }
+
+ //printf("%s: lang %2d (%3s): %f\n", __func__, probs_id[i].second, whisper_lang_str(probs_id[i].second), probs_id[i].first);
+ }
+ }
+
+ return probs_id[0].second;
+}
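+
+// standalone usage sketch, assuming the mel spectrogram has already been computed
+// (e.g. via whisper_pcm_to_mel); whisper_full() below performs the same call when
+// no language is specified; the thread count (4) is arbitrary:
+//
+//   std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);
+//   const int lang_id = whisper_lang_auto_detect(ctx, 0, 4, probs.data());
+//   if (lang_id >= 0) {
+//       printf("detected '%s' (p = %f)\n", whisper_lang_str(lang_id), probs[lang_id]);
+//   }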
+
int whisper_n_len(struct whisper_context * ctx) {
return ctx->mel.n_len;
}
return ctx->model.hparams.n_text_ctx;
}
+int whisper_n_audio_ctx(struct whisper_context * ctx) {
+ return ctx->model.hparams.n_audio_ctx;
+}
+
int whisper_is_multilingual(struct whisper_context * ctx) {
return ctx->vocab.is_multilingual() ? 1 : 0;
}
return ctx->vocab.token_beg;
}
+whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
+ return whisper_token_sot(ctx) + 1 + lang_id;
+}
+
whisper_token whisper_token_translate(void) {
return whisper_vocab::token_translate;
}
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
} else {
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
- return -1;
+ return -2;
}
}
+ // auto-detect language if not specified
+ if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0) {
+ std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);
+
+ const auto lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, probs.data());
+ if (lang_id < 0) {
+ fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
+ return -3;
+ }
+
+ params.language = whisper_lang_str(lang_id);
+
+ fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+ }
+
if (params.token_timestamps) {
ctx->t_beg = 0;
ctx->t_last = 0;
std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
}
- // overwrite audio_ctx
+ // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
+ if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
+ fprintf(stderr, "%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
+ return -4;
+ }
ctx->exp_n_audio_ctx = params.audio_ctx;
// these tokens determine the task that will be performed
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
if (whisper_is_multilingual(ctx)) {
- prompt_init.push_back(whisper_token_sot(ctx) + 1 + whisper_lang_id(params.language));
+ const int lang_id = whisper_lang_id(params.language);
+ prompt_init.push_back(whisper_token_lang(ctx, lang_id));
if (params.translate) {
prompt_init.push_back(whisper_token_translate());
} else {
}
}
+ // if only 1 second left, then stop
if (seek + 100 >= seek_end) {
break;
}
+ // if there is a very short audio segment left to process, we remove any past prompt since it tends
+ // to confuse the decoder and often makes it repeat or hallucinate stuff
+ if (seek > seek_start && seek + 500 >= seek_end) {
+ prompt_past.clear();
+ }
+
if (params.encoder_begin_callback) {
if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) {
fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
// encode audio features starting at offset seek
if (whisper_encode(ctx, seek, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to encode\n", __func__);
- return 7;
+ return -4;
}
int n_past = 0;
prompt.clear();
// if we have already generated some text, use it as a prompt to condition the next generation
- if (prompt_past.size() > 0) {
+ if (!prompt_past.empty()) {
int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
prompt = { whisper_token_prev(ctx) };
tokens_cur.clear();
bool failed = false;
+ bool has_ts = false; // have we already sampled a non-beg timestamp token for the current segment?
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to decode\n", __func__);
- return 8;
+ return -5;
}
n_past += prompt.size();
const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
// do not allow to go back in time
- if (seek_delta != 100*WHISPER_CHUNK_SIZE &&
- seek_delta > seek_delta_new && result_len < i) {
+ if (has_ts && seek_delta > seek_delta_new && result_len < i) {
break;
}
seek_delta = seek_delta_new;
result_len = i + 1;
+ has_ts = true;
}
// add it to the context
//{
// const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token[token.tid] : "[?]";
- // printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
+ // printf("%s: %3d %10s %6d %6.3f '%s'\n", __func__, i, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
//}
- // end of text token
- if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
+ // end of segment
+ if (token.id == whisper_token_eot(ctx) || // end of text token
+ (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
+ (has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached
+ ) {
if (result_len == 0) {
if (seek + seek_delta + 100 >= seek_end) {
result_len = i + 1;
}
if (failed) {
- fprintf(stderr, "\n%s: failed to generate timestamp token - using fallback strategy\n\n", __func__);
- seek += 100;
+ // when we fail to sample timestamp token, retry by clearing the past prompt
+ // if it fails again, then we advance the window by 1 second
+ if (!prompt_past.empty()) {
+ prompt_past.clear();
+ } else {
+ fprintf(stderr, "\n%s: failed to generate timestamp token - skipping one second\n\n", __func__);
+ seek += 100;
+ }
continue;
}
}
// store the text from this iteration
- if (tokens_cur.size() > 0) {
+ if (!tokens_cur.empty()) {
int i0 = 0;
auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
- std::string text = "";
+ std::string text;
for (int i = 0; i < (int) tokens_cur.size(); i++) {
//printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
// create the ggml memory context
{
- struct ggml_init_params params = {
- .mem_size = ctxs[i].buf_memory.size(),
- .mem_buffer = ctxs[i].buf_memory.data(),
- };
+ struct ggml_init_params params;
+ params.mem_size = ctxs[i].buf_memory.size();
+ params.mem_buffer = ctxs[i].buf_memory.data();
model.ctx_mem = ggml_init(params);
if (!model.ctx_mem) {
results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
// make sure that segments are not overlapping
- if (ctx->result_all.size() > 0) {
+ if (!ctx->result_all.empty()) {
results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1);
}
#include <stdint.h>
#include <stdio.h>
+// when compiling as C99, static_assert is a no-op
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef static_assert
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+
#if defined _MSC_VER || defined(__MINGW32__)
#if !defined(__MINGW32__)
typedef void* thread_ret_t;
#endif
+#ifdef __HAIKU__
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#endif
+
#define GGML_DEBUG 0
#define GGML_GELU_FP16
return x;
}
+#define GGML_FP16_TO_FP32(x) (x)
+#define GGML_FP32_TO_FP16(x) (x)
+
#else
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
#include <immintrin.h>
#endif
+#endif
+
+#ifdef __F16C__
+float ggml_fp16_to_fp32(ggml_fp16_t h) {
+ return _cvtsh_ss(h);
+}
+ggml_fp16_t ggml_fp32_to_fp16(float f) {
+ return _cvtss_sh(f, 0);
+}
+
+#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+
+#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
union {
uint32_t as_bits;
float as_value;
- } fp32 = { w };
+ } fp32;
+ fp32.as_bits = w;
return fp32.as_value;
}
union {
float as_value;
uint32_t as_bits;
- } fp32 = { f };
+ } fp32;
+ fp32.as_value = f;
return fp32.as_bits;
}
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
-#endif
+
+#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
//
// global data
#define CACHE_LINE_SIZE 64
#endif
-const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
+static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
//
-// fundamental operations
+// simd mappings
//
-inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
-
-inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
- ggml_float sumf = 0.0;
-#ifdef __ARM_NEON
- // NEON 128-bit
- const int n16 = (n & ~15);
-
- float32x4_t sum0 = vdupq_n_f32(0);
- float32x4_t sum1 = vdupq_n_f32(0);
- float32x4_t sum2 = vdupq_n_f32(0);
- float32x4_t sum3 = vdupq_n_f32(0);
-
- float32x4_t x0, x1, x2, x3;
- float32x4_t y0, y1, y2, y3;
-
- for (int i = 0; i < n16; i += 16) {
- x0 = vld1q_f32(x + i + 0);
- x1 = vld1q_f32(x + i + 4);
- x2 = vld1q_f32(x + i + 8);
- x3 = vld1q_f32(x + i + 12);
-
- y0 = vld1q_f32(y + i + 0);
- y1 = vld1q_f32(y + i + 4);
- y2 = vld1q_f32(y + i + 8);
- y3 = vld1q_f32(y + i + 12);
+// we define a common set of C macros which map to specific intrinsics based on the current architecture
+// we then implement the fundamental computation operations below using only these macros
+// adding support for new architectures requires defining the corresponding SIMD macros
+//
+// GGML_F32_STEP / GGML_F16_STEP
+// number of elements to process in a single step
+//
+// GGML_F32_EPR / GGML_F16_EPR
+// number of elements to fit in a single register
+//
- sum0 = vfmaq_f32(sum0, x0, y0);
- sum1 = vfmaq_f32(sum1, x1, y1);
- sum2 = vfmaq_f32(sum2, x2, y2);
- sum3 = vfmaq_f32(sum3, x3, y3);
- }
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
- // reduce sum0..sum3 to sum0
- sum0 = vaddq_f32(sum0, sum1);
- sum2 = vaddq_f32(sum2, sum3);
- sum0 = vaddq_f32(sum0, sum2);
+#define GGML_SIMD
- float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
- sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+// F32 NEON
- // leftovers
- for (int i = n16; i < n; ++i) {
- sumf += x[i]*y[i];
- }
-#elif defined(__AVX2__)
- // AVX 256-bit
- const int n32 = (n & ~31);
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR 4
- __m256 sum0 = _mm256_setzero_ps();
- __m256 sum1 = _mm256_setzero_ps();
- __m256 sum2 = _mm256_setzero_ps();
- __m256 sum3 = _mm256_setzero_ps();
+#define GGML_F32x4 float32x4_t
+#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
+#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
+#define GGML_F32x4_LOAD vld1q_f32
+#define GGML_F32x4_STORE vst1q_f32
+#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
+#define GGML_F32x4_ADD vaddq_f32
+#define GGML_F32x4_MUL vmulq_f32
+#if defined(__ARM_FEATURE_QRDMX)
+ #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
+#else
+ #define GGML_F32x4_REDUCE_ONE(x) \
+ (vgetq_lane_f32(x, 0) + \
+ vgetq_lane_f32(x, 1) + \
+ vgetq_lane_f32(x, 2) + \
+ vgetq_lane_f32(x, 3))
+#endif
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+ x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+ x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+ x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
+ } \
+ res = GGML_F32x4_REDUCE_ONE(x[0]); \
+}
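+
+// sketch of how the reduction above unrolls with the NEON mapping, where
+// GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR = 16/4 = 4 accumulators:
+//
+//   pass 1 (ARR/2 = 2 iters): x[0] += x[1]; x[2] += x[3];
+//   pass 2 (ARR/4 = 1 iter) : x[0] += x[2];
+//   pass 3 (ARR/8 = 0 iters): no-op
+//   res = horizontal sum of the 4 lanes of x[0]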
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 NEON
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ #define GGML_F16_STEP 32
+ #define GGML_F16_EPR 8
+
+ #define GGML_F16x8 float16x8_t
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
+ #define GGML_F16x8_LOAD vld1q_f16
+ #define GGML_F16x8_STORE vst1q_f16
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+ #define GGML_F16x8_ADD vaddq_f16
+ #define GGML_F16x8_MUL vmulq_f16
+ #define GGML_F16x8_REDUCE(res, x) \
+ { \
+ for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
+ x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
+ } \
+ for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
+ x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
+ } \
+ for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
+ x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
+ } \
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
+ res = vaddvq_f32(vaddq_f32(t0, t1)); \
+ }
+
+ #define GGML_F16_VEC GGML_F16x8
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
+ #define GGML_F16_VEC_LOAD GGML_F16x8_LOAD
+ #define GGML_F16_VEC_STORE GGML_F16x8_STORE
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
+#else
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
+ // and take advantage of the vcvt_ functions to convert to/from FP16
+
+ #define GGML_F16_STEP 16
+ #define GGML_F16_EPR 4
+
+ #define GGML_F32Cx4 float32x4_t
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+ #define GGML_F32Cx4_ADD vaddq_f32
+ #define GGML_F32Cx4_MUL vmulq_f32
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+ #define GGML_F16_VEC GGML_F32Cx4
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+ #define GGML_F16_VEC_LOAD GGML_F32Cx4_LOAD
+ #define GGML_F16_VEC_STORE GGML_F32Cx4_STORE
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+#endif
- for (int i = 0; i < n32; i += 32) {
- x0 = _mm256_loadu_ps(x + i + 0);
- x1 = _mm256_loadu_ps(x + i + 8);
- x2 = _mm256_loadu_ps(x + i + 16);
- x3 = _mm256_loadu_ps(x + i + 24);
+#elif defined(__AVX__)
- y0 = _mm256_loadu_ps(y + i + 0);
- y1 = _mm256_loadu_ps(y + i + 8);
- y2 = _mm256_loadu_ps(y + i + 16);
- y3 = _mm256_loadu_ps(y + i + 24);
+#define GGML_SIMD
- sum0 = _mm256_fmadd_ps(x0, y0, sum0);
- sum1 = _mm256_fmadd_ps(x1, y1, sum1);
- sum2 = _mm256_fmadd_ps(x2, y2, sum2);
- sum3 = _mm256_fmadd_ps(x3, y3, sum3);
- }
+// F32 AVX
- sum0 = _mm256_add_ps(sum0, sum1);
- sum2 = _mm256_add_ps(sum2, sum3);
- sum0 = _mm256_add_ps(sum0, sum2);
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 8
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1));
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
+#define GGML_F32x8 __m256
+#define GGML_F32x8_ZERO _mm256_setzero_ps()
+#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32x8_LOAD _mm256_loadu_ps
+#define GGML_F32x8_STORE _mm256_storeu_ps
+#if defined(__FMA__)
+ #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
+#else
+ #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
+#endif
+#define GGML_F32x8_ADD _mm256_add_ps
+#define GGML_F32x8_MUL _mm256_mul_ps
+#define GGML_F32x8_REDUCE(res, x) \
+{ \
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+ x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+ x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+ x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
+ } \
+ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
+ _mm256_extractf128_ps(x[0], 1)); \
+ const __m128 t1 = _mm_hadd_ps(t0, t0); \
+ res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC GGML_F32x8
+#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 AVX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR 8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32
+
+#define GGML_F32Cx8 __m256
+#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
+#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#define GGML_F32Cx8_FMA GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD _mm256_add_ps
+#define GGML_F32Cx8_MUL _mm256_mul_ps
+#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx8
+#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD GGML_F32Cx8_LOAD
+#define GGML_F16_VEC_STORE GGML_F32Cx8_STORE
+#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
+
+#elif defined(__POWER9_VECTOR__)
+
+// TODO: uncomment this when it works
+//#define GGML_SIMD
+
+// F32 POWER9
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 8
+
+// TODO: not tested !!
+#define GGML_F32x4 __vector float
+#define GGML_F32x4_ZERO (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
+#define GGML_F32x4_SET1(x) (__vector float){x, x, x, x}
+#define GGML_F32x4_LOAD vec_vsx_ld
+#define GGML_F32x4_STORE vec_vsx_st
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD vec_add
+#define GGML_F32x4_MUL vec_mul
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+ x[2*i] = vec_add(x[2*i], x[2*i+1]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+ x[4*i] = vec_add(x[4*i], x[4*i+2]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+ x[8*i] = vec_add(x[8*i], x[8*i+4]); \
+ } \
+ res = vec_extract(x[0], 0) + \
+ vec_extract(x[0], 1) + \
+ vec_extract(x[0], 2) + \
+ vec_extract(x[0], 3); \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 POWER9
+// TODO: implement here
+// ...
- sumf = _mm_cvtss_f32(r1);
+#elif defined(__wasm_simd128__)
- // leftovers
- for (int i = n32; i < n; ++i) {
- sumf += x[i]*y[i];
- }
-#elif defined(__AVX__)
- // AVX 256-bit
- const int n32 = (n & ~31);
+#define GGML_SIMD
+
+// F32 WASM
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 v128_t
+#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
+#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
+#define GGML_F32x4_LOAD wasm_v128_load
+#define GGML_F32x4_STORE wasm_v128_store
+#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
+#define GGML_F32x4_ADD wasm_f32x4_add
+#define GGML_F32x4_MUL wasm_f32x4_mul
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+ x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+ x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+ } \
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+ x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+ } \
+ res = wasm_f32x4_extract_lane(x[0], 0) + \
+ wasm_f32x4_extract_lane(x[0], 1) + \
+ wasm_f32x4_extract_lane(x[0], 2) + \
+ wasm_f32x4_extract_lane(x[0], 3); \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 WASM
+
+#define GGML_F16_STEP 16
+#define GGML_F16_EPR 4
+
+inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
+ float tmp[4];
+
+ tmp[0] = GGML_FP16_TO_FP32(p[0]);
+ tmp[1] = GGML_FP16_TO_FP32(p[1]);
+ tmp[2] = GGML_FP16_TO_FP32(p[2]);
+ tmp[3] = GGML_FP16_TO_FP32(p[3]);
+
+ return wasm_v128_load(tmp);
+}
+
+inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
+ float tmp[4];
+
+ wasm_v128_store(tmp, x);
+
+ p[0] = GGML_FP32_TO_FP16(tmp[0]);
+ p[1] = GGML_FP32_TO_FP16(tmp[1]);
+ p[2] = GGML_FP32_TO_FP16(tmp[2]);
+ p[3] = GGML_FP32_TO_FP16(tmp[3]);
+}
+
+#define GGML_F16x4 v128_t
+#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
+#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
+#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
+#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
+#define GGML_F16x4_FMA GGML_F32x4_FMA
+#define GGML_F16x4_ADD wasm_f32x4_add
+#define GGML_F16x4_MUL wasm_f32x4_mul
+#define GGML_F16x4_REDUCE(res, x) \
+{ \
+ for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
+ x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+ } \
+ for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
+ x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+ } \
+ for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
+ x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+ } \
+ res = wasm_f32x4_extract_lane(x[0], 0) + \
+ wasm_f32x4_extract_lane(x[0], 1) + \
+ wasm_f32x4_extract_lane(x[0], 2) + \
+ wasm_f32x4_extract_lane(x[0], 3); \
+}
+
+#define GGML_F16_VEC GGML_F16x4
+#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
+#define GGML_F16_VEC_LOAD GGML_F16x4_LOAD
+#define GGML_F16_VEC_STORE GGML_F16x4_STORE
+#define GGML_F16_VEC_FMA GGML_F16x4_FMA
+#define GGML_F16_VEC_ADD GGML_F16x4_ADD
+#define GGML_F16_VEC_MUL GGML_F16x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
- __m256 sum0 = _mm256_setzero_ps();
- __m256 sum1 = _mm256_setzero_ps();
- __m256 sum2 = _mm256_setzero_ps();
- __m256 sum3 = _mm256_setzero_ps();
+#endif
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
+// GGML_F32_ARR / GGML_F16_ARR
+// number of registers to use per step
+#ifdef GGML_SIMD
+#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
+#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
+#endif
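+
+// for example, with the AVX mappings above: GGML_F32_STEP = 32 and GGML_F32_EPR = 8,
+// so GGML_F32_ARR = 32/8 = 4 accumulators are kept live per step of the vector loops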
- for (int i = 0; i < n32; i += 32) {
- x0 = _mm256_loadu_ps(x + i + 0);
- x1 = _mm256_loadu_ps(x + i + 8);
- x2 = _mm256_loadu_ps(x + i + 16);
- x3 = _mm256_loadu_ps(x + i + 24);
+//
+// fundamental operations
+//
- y0 = _mm256_loadu_ps(y + i + 0);
- y1 = _mm256_loadu_ps(y + i + 8);
- y2 = _mm256_loadu_ps(y + i + 16);
- y3 = _mm256_loadu_ps(y + i + 24);
+inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
- sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
- sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
- sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
- sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
- }
+inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
- sum0 = _mm256_add_ps(sum0, sum1);
- sum2 = _mm256_add_ps(sum2, sum3);
- sum0 = _mm256_add_ps(sum0, sum2);
+inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1));
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
+inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
- sumf = _mm_cvtss_f32(r1);
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
+inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
+inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
+inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
- // leftovers
- for (int i = n32; i < n; ++i) {
- sumf += x[i]*y[i];
- }
-#elif defined(__wasm_simd128__)
- // WASM 128-bit
- const int n16 = (n & ~15);
+inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+ ggml_float sumf = 0.0;
- v128_t sum0 = wasm_f32x4_splat(0);
- v128_t sum1 = wasm_f32x4_splat(0);
- v128_t sum2 = wasm_f32x4_splat(0);
- v128_t sum3 = wasm_f32x4_splat(0);
+#ifdef GGML_SIMD
+ const int np = (n & ~(GGML_F32_STEP - 1));
- v128_t x0, x1, x2, x3;
- v128_t y0, y1, y2, y3;
+ GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
- for (int i = 0; i < n16; i += 16) {
- x0 = wasm_v128_load(x + i + 0);
- x1 = wasm_v128_load(x + i + 4);
- x2 = wasm_v128_load(x + i + 8);
- x3 = wasm_v128_load(x + i + 12);
+ GGML_F32_VEC ax[GGML_F32_ARR];
+ GGML_F32_VEC ay[GGML_F32_ARR];
- y0 = wasm_v128_load(y + i + 0);
- y1 = wasm_v128_load(y + i + 4);
- y2 = wasm_v128_load(y + i + 8);
- y3 = wasm_v128_load(y + i + 12);
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
+ for (int j = 0; j < GGML_F32_ARR; j++) {
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
- sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
- sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
- sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
- sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
+ sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+ }
}
- sum0 = wasm_f32x4_add(sum0, sum1);
- sum2 = wasm_f32x4_add(sum2, sum3);
- sum0 = wasm_f32x4_add(sum0, sum2);
-
- sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
+ // reduce the partial sums into sumf
+ GGML_F32_VEC_REDUCE(sumf, sum);
// leftovers
- for (int i = n16; i < n; ++i) {
+ for (int i = np; i < n; ++i) {
sumf += x[i]*y[i];
}
#else
inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
ggml_float sumf = 0.0;
-#ifdef __ARM_NEON
- const int n32 = (n & ~31);
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- float16x8_t sum0 = vdupq_n_f16(0);
- float16x8_t sum1 = vdupq_n_f16(0);
- float16x8_t sum2 = vdupq_n_f16(0);
- float16x8_t sum3 = vdupq_n_f16(0);
+#if defined(GGML_SIMD)
+ const int np = (n & ~(GGML_F16_STEP - 1));
- float16x8_t x0, x1, x2, x3;
- float16x8_t y0, y1, y2, y3;
+ GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
- for (int i = 0; i < n32; i += 32) {
- x0 = vld1q_f16(x + i + 0 );
- x1 = vld1q_f16(x + i + 8 );
- x2 = vld1q_f16(x + i + 16);
- x3 = vld1q_f16(x + i + 24);
+ GGML_F16_VEC ax[GGML_F16_ARR];
+ GGML_F16_VEC ay[GGML_F16_ARR];
- y0 = vld1q_f16(y + i + 0 );
- y1 = vld1q_f16(y + i + 8 );
- y2 = vld1q_f16(y + i + 16);
- y3 = vld1q_f16(y + i + 24);
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
+ for (int j = 0; j < GGML_F16_ARR; j++) {
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
- sum0 = vfmaq_f16(sum0, x0, y0);
- sum1 = vfmaq_f16(sum1, x1, y1);
- sum2 = vfmaq_f16(sum2, x2, y2);
- sum3 = vfmaq_f16(sum3, x3, y3);
+ sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+ }
}
// reduce the partial sums into sumf
- sum0 = vaddq_f16(sum0, sum1);
- sum2 = vaddq_f16(sum2, sum3);
- sum0 = vaddq_f16(sum0, sum2);
-
- // load sum0 into 2 float32x4_t
- float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
- float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
-
- // reduce sum0f32 and sum1f32 to sumf
- sum0f32 = vaddq_f32(sum0f32, sum1f32);
-
- float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
- sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
-#else
- float32x4_t sum0 = vdupq_n_f32(0);
- float32x4_t sum1 = vdupq_n_f32(0);
- float32x4_t sum2 = vdupq_n_f32(0);
- float32x4_t sum3 = vdupq_n_f32(0);
- float32x4_t sum4 = vdupq_n_f32(0);
- float32x4_t sum5 = vdupq_n_f32(0);
- float32x4_t sum6 = vdupq_n_f32(0);
- float32x4_t sum7 = vdupq_n_f32(0);
-
- float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
- float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
-
- for (int i = 0; i < n32; i += 32) {
- x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
- x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
- x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
- x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
- x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
- x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
- x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
- x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
-
- y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
- y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
- y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
- y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
- y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
- y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
- y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
- y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
-
- sum0 = vfmaq_f32(sum0, x0, y0);
- sum1 = vfmaq_f32(sum1, x1, y1);
- sum2 = vfmaq_f32(sum2, x2, y2);
- sum3 = vfmaq_f32(sum3, x3, y3);
- sum4 = vfmaq_f32(sum4, x4, y4);
- sum5 = vfmaq_f32(sum5, x5, y5);
- sum6 = vfmaq_f32(sum6, x6, y6);
- sum7 = vfmaq_f32(sum7, x7, y7);
- }
-
- // reduce sum0..sum7 to sum0
- sum0 = vaddq_f32(sum0, sum1);
- sum2 = vaddq_f32(sum2, sum3);
- sum4 = vaddq_f32(sum4, sum5);
- sum6 = vaddq_f32(sum6, sum7);
- sum0 = vaddq_f32(sum0, sum2);
- sum4 = vaddq_f32(sum4, sum6);
- sum0 = vaddq_f32(sum0, sum4);
-
- // reduce sum0 to sumf
- float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
- sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
-#endif
-
- // leftovers
- for (int i = n32; i < n; ++i) {
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
- }
-#elif defined(__AVX2__)
- // AVX 256-bit
- const int n32 = (n & ~31);
-
- __m256 sum0 = _mm256_setzero_ps();
- __m256 sum1 = _mm256_setzero_ps();
- __m256 sum2 = _mm256_setzero_ps();
- __m256 sum3 = _mm256_setzero_ps();
-
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
-
- for (int i = 0; i < n32; i += 32) {
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
-
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
-
- sum0 = _mm256_fmadd_ps(x0, y0, sum0);
- sum1 = _mm256_fmadd_ps(x1, y1, sum1);
- sum2 = _mm256_fmadd_ps(x2, y2, sum2);
- sum3 = _mm256_fmadd_ps(x3, y3, sum3);
- }
-
- const __m256 sum01 = _mm256_add_ps(sum0, sum1);
- const __m256 sum23 = _mm256_add_ps(sum2, sum3);
- const __m256 sum0123 = _mm256_add_ps(sum01, sum23);
-
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1));
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
-
- sumf = _mm_cvtss_f32(r1);
+ GGML_F16_VEC_REDUCE(sumf, sum);
// leftovers
- for (int i = n32; i < n; ++i) {
- //GGML_ASSERT(false);
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ for (int i = np; i < n; ++i) {
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
-#elif defined(__AVX__)
- // AVX 256-bit
+#elif defined(__POWER9_VECTOR__)
+ // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
+ // being able to test it. hoping someone with access to a POWER9 machine can help out here.
const int n32 = (n & ~31);
- __m256 sum0 = _mm256_setzero_ps();
- __m256 sum1 = _mm256_setzero_ps();
- __m256 sum2 = _mm256_setzero_ps();
- __m256 sum3 = _mm256_setzero_ps();
-
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
+ vector float sum0 = vec_splats (0.0f);
for (int i = 0; i < n32; i += 32) {
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
-
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
+ // Use vec_xl, not vec_ld, because x is sometimes unaligned.
+ vector unsigned short x0 = vec_xl(i * 2 + 0, x);
+ vector unsigned short x1 = vec_xl(i * 2 + 16, x);
+ vector unsigned short x2 = vec_xl(i * 2 + 32, x);
+ vector unsigned short x3 = vec_xl(i * 2 + 48, x);
+
+ vector unsigned short y0 = vec_xl(i * 2 + 0, y);
+ vector unsigned short y1 = vec_xl(i * 2 + 16, y);
+ vector unsigned short y2 = vec_xl(i * 2 + 32, y);
+ vector unsigned short y3 = vec_xl(i * 2 + 48, y);
+
+ vector float fx0l = vec_extract_fp32_from_shortl(x0);
+ vector float fx0h = vec_extract_fp32_from_shorth(x0);
+ vector float fx1l = vec_extract_fp32_from_shortl(x1);
+ vector float fx1h = vec_extract_fp32_from_shorth(x1);
+ vector float fx2l = vec_extract_fp32_from_shortl(x2);
+ vector float fx2h = vec_extract_fp32_from_shorth(x2);
+ vector float fx3l = vec_extract_fp32_from_shortl(x3);
+ vector float fx3h = vec_extract_fp32_from_shorth(x3);
+
+ vector float fy0l = vec_extract_fp32_from_shortl(y0);
+ vector float fy0h = vec_extract_fp32_from_shorth(y0);
+ vector float fy1l = vec_extract_fp32_from_shortl(y1);
+ vector float fy1h = vec_extract_fp32_from_shorth(y1);
+ vector float fy2l = vec_extract_fp32_from_shortl(y2);
+ vector float fy2h = vec_extract_fp32_from_shorth(y2);
+ vector float fy3l = vec_extract_fp32_from_shortl(y3);
+ vector float fy3h = vec_extract_fp32_from_shorth(y3);
+
+ sum0 = vec_add(sum0, vec_mul(fx0l, fy0l));
+ sum0 = vec_add(sum0, vec_mul(fx0h, fy0h));
+ sum0 = vec_add(sum0, vec_mul(fx1l, fy1l));
+ sum0 = vec_add(sum0, vec_mul(fx1h, fy1h));
+ sum0 = vec_add(sum0, vec_mul(fx2l, fy2l));
+ sum0 = vec_add(sum0, vec_mul(fx2h, fy2h));
+ sum0 = vec_add(sum0, vec_mul(fx3l, fy3l));
+ sum0 = vec_add(sum0, vec_mul(fx3h, fy3h));
+ }
+
+ sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
+ + vec_extract(sum0, 2) + vec_extract(sum0, 3);
- sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
- sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
- sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
- sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
- }
-
- const __m256 sum01 = _mm256_add_ps(sum0, sum1);
- const __m256 sum23 = _mm256_add_ps(sum2, sum3);
- const __m256 sum0123 = _mm256_add_ps(sum01, sum23);
-
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1));
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
-
- sumf = _mm_cvtss_f32(r1);
-
- // leftovers
for (int i = n32; i < n; ++i) {
- //GGML_ASSERT(false);
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
- }
-#elif defined(__wasm_simd128__)
- // WASM 128-bit
- const int n16 = (n & ~15);
-
- v128_t sum0 = wasm_f32x4_splat(0.0f);
- v128_t sum1 = wasm_f32x4_splat(0.0f);
- v128_t sum2 = wasm_f32x4_splat(0.0f);
- v128_t sum3 = wasm_f32x4_splat(0.0f);
-
- v128_t x0, x1, x2, x3;
- v128_t y0, y1, y2, y3;
-
- float tx[16];
- float ty[16];
-
- for (int i = 0; i < n16; i += 16) {
- for (int k = 0; k < 16; ++k) {
- tx[k] = ggml_fp16_to_fp32(x[i + k]);
- ty[k] = ggml_fp16_to_fp32(y[i + k]);
- }
-
- x0 = wasm_v128_load(tx + 0);
- x1 = wasm_v128_load(tx + 4);
- x2 = wasm_v128_load(tx + 8);
- x3 = wasm_v128_load(tx + 12);
-
- y0 = wasm_v128_load(ty + 0);
- y1 = wasm_v128_load(ty + 4);
- y2 = wasm_v128_load(ty + 8);
- y3 = wasm_v128_load(ty + 12);
-
- sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
- sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
- sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
- sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
- }
-
- sum0 = wasm_f32x4_add(sum0, sum1);
- sum2 = wasm_f32x4_add(sum2, sum3);
- sum0 = wasm_f32x4_add(sum0, sum2);
-
- sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
-
- // leftovers
- for (int i = n16; i < n; ++i) {
- //GGML_ASSERT(false);
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#else
for (int i = 0; i < n; ++i) {
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#endif
}
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
-#ifdef __ARM_NEON
- // NEON 128-bit
- const int n16 = (n & ~15);
-
- const float32x4_t v4 = vdupq_n_f32(v);
-
- float32x4_t x0, x1, x2, x3;
- float32x4_t y0, y1, y2, y3;
-
- for (int i = 0; i < n16; i += 16) {
- x0 = vld1q_f32(x + i + 0);
- x1 = vld1q_f32(x + i + 4);
- x2 = vld1q_f32(x + i + 8);
- x3 = vld1q_f32(x + i + 12);
-
- y0 = vld1q_f32(y + i + 0);
- y1 = vld1q_f32(y + i + 4);
- y2 = vld1q_f32(y + i + 8);
- y3 = vld1q_f32(y + i + 12);
-
- y0 = vfmaq_f32(y0, x0, v4);
- y1 = vfmaq_f32(y1, x1, v4);
- y2 = vfmaq_f32(y2, x2, v4);
- y3 = vfmaq_f32(y3, x3, v4);
-
- vst1q_f32(y + i + 0, y0);
- vst1q_f32(y + i + 4, y1);
- vst1q_f32(y + i + 8, y2);
- vst1q_f32(y + i + 12, y3);
- }
-
- // leftovers
- for (int i = n16; i < n; ++i) {
- y[i] += x[i]*v;
- }
-#elif defined(__AVX2__)
- // AVX 256-bit
- const int n32 = (n & ~31);
-
- const __m256 v4 = _mm256_set1_ps(v);
-
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
-
- for (int i = 0; i < n32; i += 32) {
- x0 = _mm256_loadu_ps(x + i + 0);
- x1 = _mm256_loadu_ps(x + i + 8);
- x2 = _mm256_loadu_ps(x + i + 16);
- x3 = _mm256_loadu_ps(x + i + 24);
-
- y0 = _mm256_loadu_ps(y + i + 0);
- y1 = _mm256_loadu_ps(y + i + 8);
- y2 = _mm256_loadu_ps(y + i + 16);
- y3 = _mm256_loadu_ps(y + i + 24);
-
- y0 = _mm256_fmadd_ps(x0, v4, y0);
- y1 = _mm256_fmadd_ps(x1, v4, y1);
- y2 = _mm256_fmadd_ps(x2, v4, y2);
- y3 = _mm256_fmadd_ps(x3, v4, y3);
-
- _mm256_storeu_ps(y + i + 0, y0);
- _mm256_storeu_ps(y + i + 8, y1);
- _mm256_storeu_ps(y + i + 16, y2);
- _mm256_storeu_ps(y + i + 24, y3);
- }
-
- // leftovers
- for (int i = n32; i < n; ++i) {
- y[i] += x[i]*v;
- }
-#elif defined(__AVX__)
- // AVX 256-bit
- const int n32 = (n & ~31);
+#if defined(GGML_SIMD)
+ const int np = (n & ~(GGML_F32_STEP - 1));
- const __m256 v4 = _mm256_set1_ps(v);
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
+ GGML_F32_VEC ax[GGML_F32_ARR];
+ GGML_F32_VEC ay[GGML_F32_ARR];
- for (int i = 0; i < n32; i += 32) {
- x0 = _mm256_loadu_ps(x + i + 0);
- x1 = _mm256_loadu_ps(x + i + 8);
- x2 = _mm256_loadu_ps(x + i + 16);
- x3 = _mm256_loadu_ps(x + i + 24);
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
+ for (int j = 0; j < GGML_F32_ARR; j++) {
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
- y0 = _mm256_loadu_ps(y + i + 0);
- y1 = _mm256_loadu_ps(y + i + 8);
- y2 = _mm256_loadu_ps(y + i + 16);
- y3 = _mm256_loadu_ps(y + i + 24);
-
- y0 = _mm256_add_ps(_mm256_mul_ps(x0, v4), y0);
- y1 = _mm256_add_ps(_mm256_mul_ps(x1, v4), y1);
- y2 = _mm256_add_ps(_mm256_mul_ps(x2, v4), y2);
- y3 = _mm256_add_ps(_mm256_mul_ps(x3, v4), y3);
-
- _mm256_storeu_ps(y + i + 0, y0);
- _mm256_storeu_ps(y + i + 8, y1);
- _mm256_storeu_ps(y + i + 16, y2);
- _mm256_storeu_ps(y + i + 24, y3);
- }
-
- // leftovers
- for (int i = n32; i < n; ++i) {
- y[i] += x[i]*v;
- }
-#elif defined(__wasm_simd128__)
- // WASM SIMD 128-bit
- const int n16 = (n & ~15);
-
- const v128_t v4 = wasm_f32x4_splat(v);
-
- v128_t x0, x1, x2, x3;
- v128_t y0, y1, y2, y3;
-
- for (int i = 0; i < n16; i += 16) {
- x0 = wasm_v128_load(x + i + 0);
- x1 = wasm_v128_load(x + i + 4);
- x2 = wasm_v128_load(x + i + 8);
- x3 = wasm_v128_load(x + i + 12);
-
- y0 = wasm_v128_load(y + i + 0);
- y1 = wasm_v128_load(y + i + 4);
- y2 = wasm_v128_load(y + i + 8);
- y3 = wasm_v128_load(y + i + 12);
-
- y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
- y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
- y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
- y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
-
- wasm_v128_store(y + i + 0, y0);
- wasm_v128_store(y + i + 4, y1);
- wasm_v128_store(y + i + 8, y2);
- wasm_v128_store(y + i + 12, y3);
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+ }
}
// leftovers
- for (int i = n16; i < n; ++i) {
+ for (int i = np; i < n; ++i) {
y[i] += x[i]*v;
}
#else
}
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
-#ifdef __ARM_NEON
- // NEON 128-bit
- const int n32 = (n & ~31);
+#if defined(GGML_SIMD)
+ const int np = (n & ~(GGML_F16_STEP - 1));
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- const float16x8_t v8 = vdupq_n_f16(v);
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
- float16x8_t x0, x1, x2, x3;
- float16x8_t y0, y1, y2, y3;
+ GGML_F16_VEC ax[GGML_F16_ARR];
+ GGML_F16_VEC ay[GGML_F16_ARR];
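+ // note: where native FP16 vector arithmetic is not available, the GGML_F16_VEC macros fall back to converting to/from F32 on load/store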
- for (int i = 0; i < n32; i += 32) {
- y0 = vld1q_f16(y + i + 0 );
- y1 = vld1q_f16(y + i + 8 );
- y2 = vld1q_f16(y + i + 16);
- y3 = vld1q_f16(y + i + 24);
-
- x0 = vld1q_f16(x + i + 0 );
- x1 = vld1q_f16(x + i + 8 );
- x2 = vld1q_f16(x + i + 16);
- x3 = vld1q_f16(x + i + 24);
-
- y0 = vfmaq_f16(y0, x0, v8);
- y1 = vfmaq_f16(y1, x1, v8);
- y2 = vfmaq_f16(y2, x2, v8);
- y3 = vfmaq_f16(y3, x3, v8);
-
- vst1q_f16(y + i + 0 , y0);
- vst1q_f16(y + i + 8 , y1);
- vst1q_f16(y + i + 16, y2);
- vst1q_f16(y + i + 24, y3);
- }
-#else
- const float32x4_t v40 = vdupq_n_f32(v);
- const float32x4_t v41 = vdupq_n_f32(v);
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
+ for (int j = 0; j < GGML_F16_ARR; j++) {
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
- float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
- float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
-
- for (int i = 0; i < n32; i += 32) {
- y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
- y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
- y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
- y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
- y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
- y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
- y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
- y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
-
- x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
- x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
- x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
- x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
- x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
- x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
- x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
- x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
-
- y0 = vfmaq_f32(y0, x0, v40);
- y1 = vfmaq_f32(y1, x1, v40);
- y2 = vfmaq_f32(y2, x2, v40);
- y3 = vfmaq_f32(y3, x3, v40);
- y4 = vfmaq_f32(y4, x4, v41);
- y5 = vfmaq_f32(y5, x5, v41);
- y6 = vfmaq_f32(y6, x6, v41);
- y7 = vfmaq_f32(y7, x7, v41);
-
- vst1_f16(y + i + 0 , vcvt_f16_f32(y0));
- vst1_f16(y + i + 4 , vcvt_f16_f32(y1));
- vst1_f16(y + i + 8 , vcvt_f16_f32(y2));
- vst1_f16(y + i + 12, vcvt_f16_f32(y3));
- vst1_f16(y + i + 16, vcvt_f16_f32(y4));
- vst1_f16(y + i + 20, vcvt_f16_f32(y5));
- vst1_f16(y + i + 24, vcvt_f16_f32(y6));
- vst1_f16(y + i + 28, vcvt_f16_f32(y7));
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay[j]);
+ }
}
-#endif
// leftovers
- for (int i = n32; i < n; ++i) {
+ for (int i = np; i < n; ++i) {
GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
-#elif defined(__AVX2__)
- // AVX 256-bit
+#elif defined(__POWER9_VECTOR__)
+ // TODO: this is temporary. I cannot fit it into the GGML_SIMD pattern like the other architectures without being
+ // able to test it; hoping someone with access to a POWER9 machine can help out here.
const int n32 = (n & ~31);
-
- const __m256 v8 = _mm256_set1_ps(v);
-
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
-
for (int i = 0; i < n32; i += 32) {
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
-
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
-
- y0 = _mm256_fmadd_ps(x0, v8, y0);
- y1 = _mm256_fmadd_ps(x1, v8, y1);
- y2 = _mm256_fmadd_ps(x2, v8, y2);
- y3 = _mm256_fmadd_ps(x3, v8, y3);
-
- _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
- _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
- _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0));
- _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0));
+ // Use vec_xl, not vec_ld, because x is sometimes unaligned!
+ vector unsigned short x0 = vec_xl(i * 2 + 0, x);
+ vector unsigned short x1 = vec_xl(i * 2 + 16, x);
+ vector unsigned short x2 = vec_xl(i * 2 + 32, x);
+ vector unsigned short x3 = vec_xl(i * 2 + 48, x);
+
+ vector unsigned short y0 = vec_xl(i * 2 + 0, y);
+ vector unsigned short y1 = vec_xl(i * 2 + 16, y);
+ vector unsigned short y2 = vec_xl(i * 2 + 32, y);
+ vector unsigned short y3 = vec_xl(i * 2 + 48, y);
+
+ vector float v4 = vec_splats(v);
+
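+ // expand the packed fp16 values into separate f32 vectors for the low and high halves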
+ vector float fx0l = vec_extract_fp32_from_shortl(x0);
+ vector float fx0h = vec_extract_fp32_from_shorth(x0);
+ vector float fx1l = vec_extract_fp32_from_shortl(x1);
+ vector float fx1h = vec_extract_fp32_from_shorth(x1);
+ vector float fx2l = vec_extract_fp32_from_shortl(x2);
+ vector float fx2h = vec_extract_fp32_from_shorth(x2);
+ vector float fx3l = vec_extract_fp32_from_shortl(x3);
+ vector float fx3h = vec_extract_fp32_from_shorth(x3);
+
+ vector float fy0l = vec_extract_fp32_from_shortl(y0);
+ vector float fy0h = vec_extract_fp32_from_shorth(y0);
+ vector float fy1l = vec_extract_fp32_from_shortl(y1);
+ vector float fy1h = vec_extract_fp32_from_shorth(y1);
+ vector float fy2l = vec_extract_fp32_from_shortl(y2);
+ vector float fy2h = vec_extract_fp32_from_shorth(y2);
+ vector float fy3l = vec_extract_fp32_from_shortl(y3);
+ vector float fy3h = vec_extract_fp32_from_shorth(y3);
+
+ fy0l = vec_madd(fx0l, v4, fy0l);
+ fy0h = vec_madd(fx0h, v4, fy0h);
+ fy1l = vec_madd(fx1l, v4, fy1l);
+ fy1h = vec_madd(fx1h, v4, fy1h);
+ fy2l = vec_madd(fx2l, v4, fy2l);
+ fy2h = vec_madd(fx2h, v4, fy2h);
+ fy3l = vec_madd(fx3l, v4, fy3l);
+ fy3h = vec_madd(fx3h, v4, fy3h);
+
+ y0 = vec_pack_to_short_fp32(fy0h, fy0l);
+ y1 = vec_pack_to_short_fp32(fy1h, fy1l);
+ y2 = vec_pack_to_short_fp32(fy2h, fy2l);
+ y3 = vec_pack_to_short_fp32(fy3h, fy3l);
+
+ vec_xst(y0, i * 2 + 0, y);
+ vec_xst(y1, i * 2 + 16, y);
+ vec_xst(y2, i * 2 + 32, y);
+ vec_xst(y3, i * 2 + 48, y);
}
- // leftovers
for (int i = n32; i < n; ++i) {
- GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
- }
-#elif defined(__AVX__)
- // AVX 256-bit
- const int n32 = (n & ~31);
-
- const __m256 v8 = _mm256_set1_ps(v);
-
- __m256 x0, x1, x2, x3;
- __m256 y0, y1, y2, y3;
-
- for (int i = 0; i < n32; i += 32) {
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
-
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
-
- y0 = _mm256_add_ps(_mm256_mul_ps(x0, v8), y0);
- y1 = _mm256_add_ps(_mm256_mul_ps(x1, v8), y1);
- y2 = _mm256_add_ps(_mm256_mul_ps(x2, v8), y2);
- y3 = _mm256_add_ps(_mm256_mul_ps(x3, v8), y3);
-
- _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
- _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
- _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0));
- _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0));
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
-
- // leftovers
- for (int i = n32; i < n; ++i) {
- GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+#else
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
-#elif defined(__wasm_simd128__)
- // WASM SIMD 128-bit
- const int n16 = (n & ~15);
-
- const v128_t v4 = wasm_f32x4_splat(v);
-
- v128_t x0, x1, x2, x3;
- v128_t y0, y1, y2, y3;
-
- float tx[16];
- float ty[16];
-
- for (int i = 0; i < n16; i += 16) {
- for (int k = 0; k < 16; ++k) {
- tx[k] = ggml_fp16_to_fp32(x[i + k]);
- ty[k] = ggml_fp16_to_fp32(y[i + k]);
- }
+#endif
+}
- x0 = wasm_v128_load(tx + 0);
- x1 = wasm_v128_load(tx + 4);
- x2 = wasm_v128_load(tx + 8);
- x3 = wasm_v128_load(tx + 12);
+//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
+inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
+#if defined(GGML_SIMD)
+ const int np = (n & ~(GGML_F32_STEP - 1));
- y0 = wasm_v128_load(ty + 0);
- y1 = wasm_v128_load(ty + 4);
- y2 = wasm_v128_load(ty + 8);
- y3 = wasm_v128_load(ty + 12);
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
- y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
- y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
- y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
- y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
+ GGML_F32_VEC ay[GGML_F32_ARR];
- wasm_v128_store(ty + 0, y0);
- wasm_v128_store(ty + 4, y1);
- wasm_v128_store(ty + 8, y2);
- wasm_v128_store(ty + 12, y3);
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
+ for (int j = 0; j < GGML_F32_ARR; j++) {
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+ ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
- for (int k = 0; k < 16; ++k) {
- y[i + k] = ggml_fp32_to_fp16(ty[k]);
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
- for (int i = n16; i < n; ++i) {
- GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ for (int i = np; i < n; ++i) {
+ y[i] *= v;
}
#else
+ // scalar
for (int i = 0; i < n; ++i) {
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ y[i] *= v;
}
#endif
}
-inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrt(*s); }
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrt(x[i]); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-const ggml_float GELU_COEF_A = 0.044715;
-const ggml_float SQRT_2_OVER_PI = 0.79788456080286535587989211986876;
+static const ggml_float GELU_COEF_A = 0.044715;
+static const ggml_float SQRT_2_OVER_PI = 0.79788456080286535587989211986876;
inline static float ggml_gelu_f32(float x) {
return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
- ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
- y[i] = ggml_fp16_to_fp32(table_gelu_f16[t]);
+ y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
}
}
#else
// data types
//
-const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
+static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
sizeof(int8_t ),
sizeof(int16_t),
sizeof(int32_t),
sizeof(float ),
};
-const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
"DUP",
"FLASH_FF",
};
-const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"x",
char padding[8];
};
-const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
};
// global state
-struct ggml_state g_state;
-atomic_int g_state_barrier = 0;
+static struct ggml_state g_state;
+static atomic_int g_state_barrier = 0;
+
+// barrier via spin lock
+inline static void ggml_critical_section_start() {
+ int processing = atomic_fetch_add(&g_state_barrier, 1);
+
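+ // atomic_fetch_add returns the previous value, so processing == 0 means no other thread is inside the critical section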
+ while (processing > 0) {
+ // wait for other threads to finish
+ atomic_fetch_sub(&g_state_barrier, 1);
+ sched_yield(); // TODO: reconsider this
+ processing = atomic_fetch_add(&g_state_barrier, 1);
+ }
+}
+
+// TODO: make this execute automatically, e.g. via some sort of "sentry" mechanism
+inline static void ggml_critical_section_end() {
+ atomic_fetch_sub(&g_state_barrier, 1);
+}
////////////////////////////////////////////////////////////////////////////////
return
tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];;
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
- {
- int processing = atomic_fetch_add(&g_state_barrier, 1);
- while (processing > 0) {
- // wait for other threads to finish
- atomic_fetch_sub(&g_state_barrier, 1);
- sched_yield();
- processing = atomic_fetch_add(&g_state_barrier, 1);
- }
- }
+ ggml_critical_section_start();
static bool is_first_call = true;
+
if (is_first_call) {
- const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
- ggml_fp16_t ii;
- for (int i = 0; i < (1 << 16); ++i) {
- uint16_t ui = i;
- memcpy(&ii, &ui, sizeof(ii));
- const float f = ggml_fp16_to_fp32(ii);
- table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
- table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
+ // initialize GELU and EXP tables
+ {
+ const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+ ggml_fp16_t ii;
+ for (int i = 0; i < (1 << 16); ++i) {
+ uint16_t ui = i;
+ memcpy(&ii, &ui, sizeof(ii));
+ const float f = GGML_FP16_TO_FP32(ii);
+ table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+ table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
+ }
+
+ const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+ GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
- const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+ // initialize g_state
+ {
+ const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
- GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+ g_state = (struct ggml_state) {
+ /*.contexts =*/ { 0 },
+ };
+
+ for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
+ g_state.contexts[i].used = false;
+ }
+
+ const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+ GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+ }
is_first_call = false;
}
// find non-used context in g_state
struct ggml_context * ctx = NULL;
- static bool first_time = true;
- if (first_time) {
- for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
- g_state.contexts[i].used = false;
- }
- first_time = false;
- }
-
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
g_state.contexts[i].used = true;
if (ctx == NULL) {
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
- atomic_fetch_sub(&g_state_barrier, 1);
+ ggml_critical_section_end();
return NULL;
}
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
- atomic_fetch_sub(&g_state_barrier, 1);
+ ggml_critical_section_end();
return ctx;
}
void ggml_free(struct ggml_context * ctx) {
// make this function thread safe
- {
- int processing = atomic_fetch_add(&g_state_barrier, 1);
- while (processing > 0) {
- // wait for other threads to finish
- atomic_fetch_sub(&g_state_barrier, 1);
- sched_yield();
- processing = atomic_fetch_add(&g_state_barrier, 1);
- }
- }
+ ggml_critical_section_start();
+
+ bool found = false;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (&g_state.contexts[i].context == ctx) {
free(ctx->mem_buffer);
}
- atomic_fetch_sub(&g_state_barrier, 1);
-
- return;
+ found = true;
+ break;
}
}
- GGML_PRINT_DEBUG("%s: context not found\n", __func__);
+ if (!found) {
+ GGML_PRINT_DEBUG("%s: context not found\n", __func__);
+ }
- atomic_fetch_sub(&g_state_barrier, 1);
+ ggml_critical_section_end();
}
size_t ggml_used_mem(const struct ggml_context * ctx) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
+ return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- ((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
+ return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- ((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
// ggml_compute_forward_dup
-void ggml_compute_forward_dup_f16(
+static void ggml_compute_forward_dup_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
return;
}
- //const int ne00 = src0->ne[0];
- //const int ne01 = src0->ne[1];
- //const int ne02 = src0->ne[2];
- //const int ne03 = src0->ne[3];
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
- //const size_t nb00 = src0->nb[0];
- //const size_t nb01 = src0->nb[1];
- //const size_t nb02 = src0->nb[2];
- //const size_t nb03 = src0->nb[3];
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
if (ggml_is_contiguous(src0) && src0->type == dst->type) {
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
return;
}
- GGML_ASSERT(false); // TODO: implement
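+ // unit stride along dim 0: rows of src0 are contiguous in memory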
+ if (src0->nb[0] == sizeof(ggml_fp16_t)) {
+ if (dst->type == GGML_TYPE_F16) {
+ int id = 0;
+ const size_t rs = ne00*nb00;
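+ // rs = size of one src0 row in bytes (ne00 elements of nb00 bytes each)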
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ for (int i01 = 0; i01 < ne01; i01++) {
+ const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+ char * dst_ptr = (char *) dst->data + id*rs;
+
+ memcpy(dst_ptr, src0_ptr, rs);
+
+ id++;
+ }
+ }
+ }
+ } else if (dst->type == GGML_TYPE_F32) {
+ int id = 0;
+ float * dst_ptr = (float *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ for (int i01 = 0; i01 < ne01; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+ dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+ id++;
+ }
+ }
+ }
+ }
+ } else {
+ GGML_ASSERT(false); // TODO: implement
+ }
+ } else {
+ //printf("%s: this is not optimal - fix me\n", __func__);
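+ // general case: gather element by element using the full byte strides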
+
+ if (dst->type == GGML_TYPE_F32) {
+ int id = 0;
+ float * dst_ptr = (float *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ for (int i01 = 0; i01 < ne01; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+ dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+ id++;
+ }
+ }
+ }
+ }
+ } else if (dst->type == GGML_TYPE_F16) {
+ int id = 0;
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ for (int i01 = 0; i01 < ne01; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+ dst_ptr[id] = *src0_ptr;
+ id++;
+ }
+ }
+ }
+ }
+ } else {
+ GGML_ASSERT(false); // TODO: implement
+ }
+ }
}
-void ggml_compute_forward_dup_f32(
+static void ggml_compute_forward_dup_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
- dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
+ dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
- dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
+ dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
}
}
-void ggml_compute_forward_dup(
+static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_add
-void ggml_compute_forward_add_f32(
+static void ggml_compute_forward_add_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_add(
+static void ggml_compute_forward_add(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_sub
-void ggml_compute_forward_sub_f32(
+static void ggml_compute_forward_sub_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_sub(
+static void ggml_compute_forward_sub(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_mul
-void ggml_compute_forward_mul_f32(
+static void ggml_compute_forward_mul_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_mul(
+static void ggml_compute_forward_mul(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_div
-void ggml_compute_forward_div_f32(
+static void ggml_compute_forward_div_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_div(
+static void ggml_compute_forward_div(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_sqr
-void ggml_compute_forward_sqr_f32(
+static void ggml_compute_forward_sqr_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_sqr(
+static void ggml_compute_forward_sqr(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_sqrt
-void ggml_compute_forward_sqrt_f32(
+static void ggml_compute_forward_sqrt_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_sqrt(
+static void ggml_compute_forward_sqrt(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_sum
-void ggml_compute_forward_sum_f32(
+static void ggml_compute_forward_sum_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_sum(
+static void ggml_compute_forward_sum(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_mean
-void ggml_compute_forward_mean_f32(
+static void ggml_compute_forward_mean_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_mean(
+static void ggml_compute_forward_mean(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_repeat
-void ggml_compute_forward_repeat_f32(
+static void ggml_compute_forward_repeat_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_repeat(
+static void ggml_compute_forward_repeat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_abs
-void ggml_compute_forward_abs_f32(
+static void ggml_compute_forward_abs_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_abs(
+static void ggml_compute_forward_abs(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_sgn
-void ggml_compute_forward_sgn_f32(
+static void ggml_compute_forward_sgn_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_sgn(
+static void ggml_compute_forward_sgn(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_neg
-void ggml_compute_forward_neg_f32(
+static void ggml_compute_forward_neg_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_neg(
+static void ggml_compute_forward_neg(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_step
-void ggml_compute_forward_step_f32(
+static void ggml_compute_forward_step_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_step(
+static void ggml_compute_forward_step(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_relu
-void ggml_compute_forward_relu_f32(
+static void ggml_compute_forward_relu_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_relu(
+static void ggml_compute_forward_relu(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_gelu
-void ggml_compute_forward_gelu_f32(
+static void ggml_compute_forward_gelu_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_gelu(
+static void ggml_compute_forward_gelu(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_norm
-void ggml_compute_forward_norm_f32(
+static void ggml_compute_forward_norm_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
}
}
-void ggml_compute_forward_norm(
+static void ggml_compute_forward_norm(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_mul_mat
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
-bool ggml_compute_forward_mul_mat_use_blas(
+static bool ggml_compute_forward_mul_mat_use_blas(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const int ne1 = dst->ne[1];
// TODO: find the optimal values for these
- if (ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
//printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
return true;
}
return false;
}
+#endif
-void ggml_compute_forward_mul_mat_f32(
+static void ggml_compute_forward_mul_mat_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
- GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(nb10 == sizeof(float));
if (params->ith != 0) return;
//}
}
-void ggml_compute_forward_mul_mat_f16_f32(
+static void ggml_compute_forward_mul_mat_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
int id = 0;
for (int i01 = 0; i01 < ne01; ++i01) {
for (int i00 = 0; i00 < ne00; ++i00) {
- wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
+ wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
}
}
}
// }
//}
- // zT = y * xT
{
+#if 1
+ // zT = y * xT
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
+ 1.0f, y, ne00,
+ x, ne00,
0.0f, d, ne01);
+#else
+ // zT = (xT * y)T
+ cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+ ne01, ne11, ne10,
+ 1.0f, x, ne00,
+ y, ne00,
+ 0.0f, d, ne01);
+#endif
}
}
}
for (int i12 = 0; i12 < ne12; ++i12) {
for (int i11 = 0; i11 < ne11; ++i11) {
for (int i10 = 0; i10 < ne10; ++i10) {
- wdata[id++] = ggml_fp32_to_fp16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
+ wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
}
}
}
const int ic1 = MIN(ic0 + dc, ne);
for (int i = ic0; i < ic1; ++i) {
- ((float *) dst->data)[i] = ggml_fp16_to_fp32(wdata[i]);
+ ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
}
for (int k = 1; k < nth; k++) {
for (int i = ic0; i < ic1; ++i) {
- ((float *) dst->data)[i] += ggml_fp16_to_fp32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
+ ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
}
}
//}
}
-void ggml_compute_forward_mul_mat(
+static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_scale
-void ggml_compute_forward_scale_f32(
+static void ggml_compute_forward_scale_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_scale(
+static void ggml_compute_forward_scale(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_cpy
-void ggml_compute_forward_cpy(
+static void ggml_compute_forward_cpy(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_reshape
-void ggml_compute_forward_reshape(
+static void ggml_compute_forward_reshape(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_view
-void ggml_compute_forward_view(
+static void ggml_compute_forward_view(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0) {
// NOP
// ggml_compute_forward_permute
-void ggml_compute_forward_permute(
+static void ggml_compute_forward_permute(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0) {
// NOP
// ggml_compute_forward_transpose
-void ggml_compute_forward_transpose(
+static void ggml_compute_forward_transpose(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0) {
// NOP
// ggml_compute_forward_get_rows
-void ggml_compute_forward_get_rows_f16(
+static void ggml_compute_forward_get_rows_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
for (int j = 0; j < nc; ++j) {
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = ggml_fp16_to_fp32(v);
+ ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
}
}
}
-void ggml_compute_forward_get_rows_f32(
+static void ggml_compute_forward_get_rows_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_get_rows(
+static void ggml_compute_forward_get_rows(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_diag_mask_inf
-void ggml_compute_forward_diag_mask_inf_f32(
+static void ggml_compute_forward_diag_mask_inf_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_diag_mask_inf(
+static void ggml_compute_forward_diag_mask_inf(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_soft_max
-void ggml_compute_forward_soft_max_f32(
+static void ggml_compute_forward_soft_max_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
p[i] = 0.0;
} else {
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
- ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
memcpy(&ss, &s, sizeof(ss));
- const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
p[i] = val;
}
}
}
-void ggml_compute_forward_soft_max(
+static void ggml_compute_forward_soft_max(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
// ggml_compute_forward_rope
-void ggml_compute_forward_rope_f32(
+static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_rope(
+static void ggml_compute_forward_rope(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_conv_1d_1s
-void ggml_compute_forward_conv_1d_1s_f16_f32(
+static void ggml_compute_forward_conv_1d_1s_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
+ dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
}
}
-void ggml_compute_forward_conv_1d_1s_f32(
+static void ggml_compute_forward_conv_1d_1s_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_conv_1d_1s(
+static void ggml_compute_forward_conv_1d_1s(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_conv_1d_2s
-void ggml_compute_forward_conv_1d_2s_f16_f32(
+static void ggml_compute_forward_conv_1d_2s_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
+ dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
}
}
-void ggml_compute_forward_conv_1d_2s_f32(
+static void ggml_compute_forward_conv_1d_2s_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
}
}
-void ggml_compute_forward_conv_1d_2s(
+static void ggml_compute_forward_conv_1d_2s(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
// ggml_compute_forward_flash_attn
-void ggml_compute_forward_flash_attn_f32(
+static void ggml_compute_forward_flash_attn_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * q,
const struct ggml_tensor * k,
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
- ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
- const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
}
}
-void ggml_compute_forward_flash_attn_f16(
+static void ggml_compute_forward_flash_attn_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * q,
const struct ggml_tensor * k,
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
- ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
- const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
- S16[i] = ggml_fp32_to_fp16(S[i]);
+ S16[i] = GGML_FP32_TO_FP16(S[i]);
}
for (int ic = 0; ic < nev1; ++ic) {
}
}
-void ggml_compute_forward_flash_attn(
+static void ggml_compute_forward_flash_attn(
const struct ggml_compute_params * params,
const struct ggml_tensor * q,
const struct ggml_tensor * k,
// ggml_compute_forward_flash_ff
-void ggml_compute_forward_flash_ff_f16(
+static void ggml_compute_forward_flash_ff_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * a, // F16
const struct ggml_tensor * b0, // F16 fc_w
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
- S16[i] = ggml_fp32_to_fp16(S[i]);
+ S16[i] = GGML_FP32_TO_FP16(S[i]);
}
ggml_vec_gelu_f16(neb01, S16, S16);
}
}
-void ggml_compute_forward_flash_ff(
+static void ggml_compute_forward_flash_ff(
const struct ggml_compute_params * params,
const struct ggml_tensor * a,
const struct ggml_tensor * b0,
/////////////////////////////////
-void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
assert(params);
switch (tensor->op) {
{
GGML_ASSERT(false);
} break;
- };
+ }
}
////////////////////////////////////////////////////////////////////////////////
-void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
+static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
struct ggml_tensor * src0 = tensor->src0;
struct ggml_tensor * src1 = tensor->src1;
{
GGML_ASSERT(false);
} break;
- };
+ }
}
-void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
+static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
if (node->grad == NULL) {
// this usually happens when we generate intermediate nodes from constants in the backward pass
// it can also happen during the forward pass if the user performs computations with constants
}
}
-void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
+static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
if (!expand) {
cgraph->n_nodes = 0;
cgraph->n_leafs = 0;
#define GGML_LOCK_INITIALIZER 0
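+// thread primitives are currently thin aliases over pthreads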
+typedef pthread_t ggml_thread_t;
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
#else
//typedef pthread_spinlock_t ggml_lock_t;
#define GGML_LOCK_INITIALIZER 0
+typedef pthread_t ggml_thread_t;
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
#endif
struct ggml_compute_state_shared {
};
struct ggml_compute_state {
- pthread_t thrd;
+ ggml_thread_t thrd;
struct ggml_compute_params params;
struct ggml_tensor * node;
struct ggml_compute_state_shared * shared;
};
-// function used by each compute thread
-void * ggml_graph_compute_one(void * data) {
- struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
- ggml_compute_forward(&state->params, state->node);
-
- return NULL;
-}
-
-thread_ret_t ggml_graph_compute_thread(void * data) {
+static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
const int n_threads = state->shared->n_threads;
.node = NULL,
.shared = &state_shared,
};
- int rc = pthread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+ int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
assert(rc == 0);
UNUSED(rc);
}
{
assert(false);
} break;
- };
+ }
}
if (cgraph->work != NULL && work_size > cgraph->work_size) {
atomic_store(&state_shared.has_work, true);
for (int j = 0; j < n_threads - 1; j++) {
- int rc = pthread_join(workers[j].thrd, NULL);
+ int rc = ggml_thread_join(workers[j].thrd, NULL);
assert(rc == 0);
UNUSED(rc);
}
}
// check if node is part of the graph
-bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
if (cgraph == NULL) {
return true;
}
return false;
}
-struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * parent = cgraph->nodes[i];
////////////////////////////////////////////////////////////////////////////////
-void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
+static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
int i = 0;
for (int p = 0; p < np; ++p) {
const int ne = ggml_nelements(ps[p]) ;
}
}
-void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
+static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
int i = 0;
for (int p = 0; p < np; ++p) {
const int ne = ggml_nelements(ps[p]) ;
}
}
-void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
+static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
int i = 0;
for (int p = 0; p < np; ++p) {
const int ne = ggml_nelements(ps[p]) ;
// ref: https://arxiv.org/pdf/1412.6980.pdf
//
-enum ggml_opt_result ggml_opt_adam(
+static enum ggml_opt_result ggml_opt_adam(
struct ggml_context * ctx,
struct ggml_opt_params params,
struct ggml_tensor * f,
return GGML_LINESEARCH_FAIL;
}
-enum ggml_opt_result ggml_opt_lbfgs(
+static enum ggml_opt_result ggml_opt_lbfgs(
struct ggml_context * ctx,
struct ggml_opt_params params,
struct ggml_tensor * f,
#endif
}
+int ggml_cpu_has_fma(void) {
+#if defined(__FMA__)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
int ggml_cpu_has_neon(void) {
#if defined(__ARM_NEON)
return 1;
#endif
}
+int ggml_cpu_has_arm_fma(void) {
+#if defined(__ARM_FEATURE_FMA)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+int ggml_cpu_has_f16c(void) {
+#if defined(__F16C__)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
int ggml_cpu_has_fp16_va(void) {
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
return 1;