self._set_vocab_qwen()
-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2
self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
+@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
+class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
+
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
+ if ".conv" in name and ".weight" in name:
+ # Was trained in BF16, being safe, avoiding quantizing to FP16
+ return gguf.GGMLQuantizationType.F32
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
+
@ModelBase.register("FalconH1ForCausalLM")
class FalconH1Model(Mamba2Model):
model_arch = gguf.MODEL_ARCH.FALCON_H1
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
LFM2A = "lfm2a" # audio
+ MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
PROJECTOR_TYPE_GLMA,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL,
+ PROJECTOR_TYPE_MUSIC_FLAMINGO,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR,
{ PROJECTOR_TYPE_GLMA, "glma"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
+ { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
- || proj_type == PROJECTOR_TYPE_VOXTRAL;
+ || proj_type == PROJECTOR_TYPE_VOXTRAL
+ || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
}
bool audio_has_stack_frames() const {
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
} break;
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
} break;
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+ {
+ model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+ model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+ model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+ model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+ model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+ model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+ } break;
case PROJECTOR_TYPE_INTERNVL:
{
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
n_patches = img->nx;
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_COGVLM:
{
return ctx->model.projection->ne[1];
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL:
return ctx->model.mm_3_w->ne[1];
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
- || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
+ || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
+ || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
FFN_GELU_ERF,
-1);
+ } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+ // projector
+ cur = build_ffn(cur,
+ model.mm_1_w, model.mm_1_b,
+ nullptr, nullptr,
+ model.mm_2_w, model.mm_2_b,
+ FFN_GELU_ERF,
+ -1);
+
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
cur = ggml_norm(ctx0, cur, hparams.eps);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_GLMA:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
break;
case PROJECTOR_TYPE_LFM2A:
// [BEGIN_AUDIO] ... (embeddings) ...
aud_beg = "[BEGIN_AUDIO]";
+ } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+ // <sound> ... (embeddings) ...
+ aud_beg = "<sound>";
}
}