int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
+ // buffer large enough for the max interleave block size (8 bytes)
uint64_t elems;
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+ memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave);
+ memcpy(&out.qs[dst_offset], &elems, blck_size_interleave);
}
// The logic below unpacks and rearranges the scales and mins values in Q4_K