cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
- cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
- cur = ggml_add(ctx0, cur, model.mm_1_b);
- cur = ggml_gelu(ctx0, cur);
- cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
- cur = ggml_add(ctx0, cur, model.mm_2_b);
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
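+ // a sketch of build_ffn's contract as used throughout this file: it
+ // computes down_w * act(up_w * x + up_b) + down_b, skipping any bias
+ // passed as nullptr; the nullptr pair is the unused gate projection,
+ // and il = -1 means the callback is not tied to a specific layer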
} else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
cur = build_ffn(cur,
    model.mm_0_w, model.mm_0_b,
    nullptr, nullptr,
    model.mm_1_w, model.mm_1_b,
    FFN_GELU,
    -1);
// LlavaMultiModalProjector (always using GELU activation)
{
- cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
- if (model.mm_1_b) {
- cur = ggml_add(ctx0, cur, model.mm_1_b);
- }
-
- cur = ggml_gelu(ctx0, cur);
- cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
- if (model.mm_2_b) {
- cur = ggml_add(ctx0, cur, model.mm_2_b);
- }
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
}
// arrangement of the [IMG_BREAK] token
// multimodal projection
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
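// the reshape folds each group of 4 consecutive patch embeddings into a
// single row of width n_embd * 4, so the projector below sees n_pos / 4
// merged positions per image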
-
- embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
- // GELU activation
- embeddings = ggml_gelu(ctx0, embeddings);
-
- // Second linear layer
- embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
- embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+ embeddings = build_ffn(embeddings,
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ FFN_GELU,
+ -1);
if (use_window_attn) {
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
// projector LayerNorm uses pytorch's default eps = 1e-5
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
- cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
- cur = ggml_add(ctx0, cur, model.mm_1_b);
- cur = ggml_gelu(ctx0, cur);
- cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
- cur = ggml_add(ctx0, cur, model.mm_3_b);
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_3_w, model.mm_3_b,
+ FFN_GELU,
+ -1);
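+ // note: the mm_* indices mirror InternVL's mlp1 Sequential
+ // (0 = LayerNorm, 1 = Linear, 2 = GELU, 3 = Linear), which is
+ // why there is no mm_2 tensor here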
}
// build the graph
cb(cur, "proj_inp_normed", -1);
// projection mlp
- cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
- cur = ggml_add(ctx0, cur, model.mm_1_b);
- cur = ggml_gelu(ctx0, cur);
- cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
- cur = ggml_add(ctx0, cur, model.mm_2_b);
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU,
+ -1);
cb(cur, "proj_out", -1);
}
} else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
// projector
- cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
- cur = ggml_gelu_erf(ctx0, cur);
- cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU_ERF,
+ -1);
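+ // note: the old voxtral path had no bias adds; passing the (possibly
+ // nullptr) mm_1_b / mm_2_b is harmless since build_ffn skips null
+ // biases, and FFN_GELU_ERF keeps the exact erf-based ggml_gelu_erf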
} else {
GGML_ABORT("%s: unknown projector type", __func__);
// self-attention
{
- ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
- if (layer.q_b) {
-     Qcur = ggml_add(ctx0, Qcur, layer.q_b);
- }
-
- ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
- if (layer.k_b) {
-     Kcur = ggml_add(ctx0, Kcur, layer.k_b);
- }
-
- ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
- if (layer.v_b) {
-     Vcur = ggml_add(ctx0, Vcur, layer.v_b);
- }
-
- if (layer.q_norm) {
-     Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-     cb(Qcur, "Qcur_norm", il);
- }
-
- if (layer.k_norm) {
-     Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-     cb(Kcur, "Kcur_norm", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
- Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
- Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+ if (layer.qkv_w != nullptr) {
+     // fused qkv: a single matmul yields rows of [Q | K | V], 3*n_embd wide
+     cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+     if (layer.qkv_b != nullptr) {
+         cur = ggml_add(ctx0, cur, layer.qkv_b);
+     }
+
+     // slice per-head 3D views out of the fused rows; nb2 spans the full
+     // fused row, and the byte offsets select the Q, K and V segments
+     Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+         /* nb1    */ ggml_row_size(cur->type, d_head),
+         /* nb2    */ cur->nb[1],
+         /* offset */ 0);
+     Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+         /* nb1    */ ggml_row_size(cur->type, d_head),
+         /* nb2    */ cur->nb[1],
+         /* offset */ ggml_row_size(cur->type, n_embd));
+     Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+         /* nb1    */ ggml_row_size(cur->type, d_head),
+         /* nb2    */ cur->nb[1],
+         /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+
+     // TODO: q/k norm requires row size == n_embd, while here it's d_head
+     // we can add support in the future if needed
+     GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
+ } else {
+     // separate q, k, v projections
+     Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+     if (layer.q_b) {
+         Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+     }
+
+     Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+     if (layer.k_b) {
+         Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+     }
+
+     Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+     if (layer.v_b) {
+         Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+     }
+
+     if (layer.q_norm) {
+         Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+         cb(Qcur, "Qcur_norm", il);
+     }
+
+     if (layer.k_norm) {
+         Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+         cb(Kcur, "Kcur_norm", il);
+     }
+
+     Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+     Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+     Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+ }
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
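+ // worked example of the fused slicing above (illustrative numbers, not
+ // tied to any specific model): with n_embd = 1024, n_head = 16 and
+ // d_head = 64, each fused row holds 3072 values; the Q view covers
+ // elements [0, 1024) of every row as 16 heads x 64 values, K starts at
+ // element 1024 and V at element 2048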
arr_prefix=()
arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
arr_file=()
add_test_vision() {
local hf=$1
- local tmpl=${2:-""} # default to empty string if not provided
+ shift
+ local extra_args=""
+ if [ $# -gt 0 ]; then
+ extra_args=$(printf " %q" "$@")
+ fi
arr_prefix+=("[vision]")
arr_hf+=("$hf")
- arr_tmpl+=("$tmpl")
+ arr_extra_args+=("$extra_args")
arr_file+=("test-1.jpeg")
}
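+# usage: add_test_vision <hf-repo[:quant]> [extra llama-mtmd-cli args...]
+#   e.g. add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+# extra args are %q-quoted here and re-expanded later via eval;
+# add_test_audio below follows the same convention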
add_test_audio() {
local hf=$1
+ shift
+ local extra_args=""
+ if [ $# -gt 0 ]; then
+ extra_args=$(printf " %q" "$@")
+ fi
arr_prefix+=("[audio] ")
arr_hf+=("$hf")
- arr_tmpl+=("") # no need for chat tmpl
+ arr_extra_args+=("$extra_args")
arr_file+=("test-2.mp3")
}
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
# to test the big models, run: ./tests.sh big
if [ "$RUN_BIG_TESTS" = true ]; then
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
- add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+ add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
- add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+ # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
bin="llama-mtmd-cli"
prefix="${arr_prefix[$i]}"
hf="${arr_hf[$i]}"
- tmpl="${arr_tmpl[$i]}"
+ extra_args="${arr_extra_args[$i]}"
inp_file="${arr_file[$i]}"
echo "Running test with binary: $bin and HF model: $hf"
echo ""
echo ""
- output=$(\
- "$PROJ_ROOT/build/bin/$bin" \
- -hf "$hf" \
- --image $SCRIPT_DIR/$inp_file \
- -p "what is the publisher name of the newspaper?" \
+ cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+ -hf $(printf %q "$hf") \
+ --image $(printf %q "$SCRIPT_DIR/$inp_file") \
--temp 0 -n 128 \
- ${tmpl:+--chat-template "$tmpl"} \
- 2>&1 | tee /dev/tty)
+ ${extra_args}"
+
+ # if extra_args does not contain -p, we add a default prompt
+ if ! [[ "$extra_args" =~ "-p" ]]; then
+ cmd+=" -p \"what is the publisher name of the newspaper?\""
+ fi
+
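+ # the command string was assembled with printf %q, so eval re-splits it
+ # into the original arguments without breaking on spaces inside values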
+ output=$(eval "$cmd" 2>&1 | tee /dev/tty)
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
if echo "$output" | grep -iq "new york" \
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
then
- result="$prefix \033[32mOK\033[0m: $bin $hf"
+ result="$prefix \033[32mOK\033[0m: $hf"
else
- result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+ result="$prefix \033[31mFAIL\033[0m: $hf"
fi
echo -e "$result"
arr_res+=("$result")