const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
const int64_t i03 = (ir0/(ne02));
- const int64_t i02 = (ir0 - i03*ne02);
+ // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
+ // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
+ // GG: this is likely the correct way to broadcast, though need some more thought
+ // therefore leaving the comments to remind us for now
+ const int64_t i02 = (i12 / (ne12 / ne02));
+ // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
+ // const int64_t i02 = (ir0 - i03*ne02);
const int64_t i1 = i11;
const int64_t i2 = i12;
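// --- illustrative sketch, not part of the PR ---------------------------------
// What the new i02 computation does: when src1 has more dim-2 slices than src0
// (ne12 a multiple of ne02), consecutive groups of ne12/ne02 values of i12 map
// to the same i02, i.e. each src0 slice is reused (broadcast) for several src1
// slices -- the behaviour needed for Falcon multi-query attention, where one
// KV head serves several query heads. Head counts below are hypothetical.
#include <stdio.h>
#include <stdint.h>

int main(void) {
    const int64_t ne02 = 2;   // hypothetical: src0 dim-2 slices (e.g. KV heads)
    const int64_t ne12 = 8;   // hypothetical: src1 dim-2 slices (e.g. query heads)

    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        // same formula as in the hunk above
        const int64_t i02 = i12 / (ne12 / ne02);
        printf("i12=%2lld -> i02=%lld\n", (long long) i12, (long long) i02);
        // prints i02 = 0 for i12 = 0..3 and i02 = 1 for i12 = 4..7
    }
    return 0;
}
// ------------------------------------------------------------------------------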
if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
- }
- else {
+ } else {
GGML_ASSERT(false); // only stride equal to kernel size is supported
- };
+ }
}
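// --- illustrative sketch, not part of the PR ---------------------------------
// Why the dispatcher only accepts stride == kernel size: with that constraint
// and zero padding the kernel covers disjoint patches, so the output size is a
// plain integer division of the input size -- the special case that
// ggml_compute_forward_conv_2d_sk_p0 handles (per the assert message, "sk" is
// read here as stride == kernel, "p0" as no padding). Sizes and the helper
// below are hypothetical, not ggml API.
#include <stdio.h>
#include <stdint.h>

// hypothetical helper: standard 1-D convolution output-size formula
static int64_t conv_out_size(int64_t in, int64_t kernel, int64_t stride, int64_t pad) {
    return (in + 2*pad - kernel) / stride + 1;
}

int main(void) {
    const int64_t in = 224, kernel = 16;  // hypothetical spatial size / kernel
    // stride == kernel, pad == 0: non-overlapping patches, out == in / kernel
    printf("sk_p0: %lld\n", (long long) conv_out_size(in, kernel, kernel, 0)); // 14 == 224/16
    // any other stride would need the general path, hence GGML_ASSERT(false)
    printf("s=1:   %lld\n", (long long) conv_out_size(in, kernel, 1, 0));      // 209
    return 0;
}
// ------------------------------------------------------------------------------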
// ggml_compute_forward_pool_1d_sk_p0