    return
        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-       tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+       (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]);
}
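
// A minimal standalone sketch, not the ggml API: a hypothetical toy_tensor struct with the
// same ne[] (extent) / nb[] (byte stride) convention, and with the ggml_blck_size() division
// for quantized types left out. It shows why nb[3] no longer needs to match when ne[3] == 1:
// an element's address is i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3], and with ne[3] == 1 the
// only valid i3 is 0, so nb[3] never contributes.
#include <cstdint>
#include <cstdio>

struct toy_tensor {
    int64_t ne[4]; // elements per dimension
    size_t  nb[4]; // stride in bytes per dimension
};

static bool dims01_contiguous(const toy_tensor & t, size_t type_size) {
    return t.nb[0] == type_size &&
           t.nb[1] == t.nb[0] * t.ne[0] &&
           (t.ne[3] == 1 || t.nb[3] == t.nb[2] * t.ne[2]);
}

int main() {
    // 4x3x2x1 tensor with an arbitrary nb[3]: accepted, because i3 can only ever be 0.
    toy_tensor single  = { {4, 3, 2, 1}, {4, 16, 48, 9999} };
    // ne[3] == 2 with the same strides: now nb[3] must equal nb[2]*ne[2] (= 96), so this fails.
    toy_tensor batched = { {4, 3, 2, 2}, {4, 16, 48, 9999} };
    printf("%d %d\n", dims01_contiguous(single, sizeof(float)), dims01_contiguous(batched, sizeof(float)));
    return 0;
}
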
static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
        // If there's not enough shared memory for row_ids and the result tile, fall back to CPU
        return false;
    }
-   // Check against size of shared memory variable
-   if (op->src[2]->ne[0] > 4096) {
-       return false;
-   }
}
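// Note: presumably the removed cap existed because op->src[2]->ne[0] (the row_ids count) had
// to fit a statically sized shared-memory array ("Check against size of shared memory
// variable"); with the shared-memory check above that falls back to CPU when row_ids and the
// result tile don't fit, a fixed 4096 limit is no longer needed.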
switch (src0_type) {
    case GGML_TYPE_F32:
void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = gl_WorkGroupID.x * 256 + wgy;
-       if (i >= p.M * p.K / QUANT_K) {
+       if (i >= p.nel / QUANT_K) {
            return;
        }
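
// Host-side sketch of the guard above, with hypothetical names (nblocks, nwg) and an example
// QUANT_K of 256; it assumes p.nel is the tensor's total element count. Each workgroup walks
// 256 quant blocks and the workgroup count is rounded up, so the last workgroup can index
// past the real block count; comparing against nel / QUANT_K (rather than M * K / QUANT_K)
// keeps the bound correct for tensors with more than two dimensions.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t QUANT_K = 256;                   // elements per quant block (example value)
    const uint64_t ne[4]   = {512, 8, 4, 2};        // 4-D tensor: M*K would only cover 512*8
    const uint64_t nel     = ne[0]*ne[1]*ne[2]*ne[3];
    const uint64_t nblocks = nel / QUANT_K;         // quant blocks to dequantize
    const uint64_t nwg     = (nblocks + 255) / 256; // workgroups, 256 blocks handled by each

    uint64_t processed = 0;
    for (uint64_t wg = 0; wg < nwg; ++wg) {
        for (uint32_t wgy = 0; wgy < 256; ++wgy) {
            const uint64_t i = wg * 256 + wgy;
            if (i >= nblocks) {
                break; // mirrors the shader's early return
            }
            processed++;
        }
    }
    printf("blocks: %llu, processed: %llu\n", (unsigned long long) nblocks, (unsigned long long) processed);
    return 0;
}
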
void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = uint(gl_WorkGroupID.x * 256 + wgy);
-       if (i >= p.M * p.K / QUANT_K) {
+       if (i >= p.nel / QUANT_K) {
            return;
        }
void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint ib = gl_WorkGroupID.x * 256 + wgy;
-       if (ib >= p.M * p.K / QUANT_K) {
+       if (ib >= p.nel / QUANT_K) {
            return;
        }
void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint ib = gl_WorkGroupID.x * 256 + wgy;
-       if (ib >= p.M * p.K / QUANT_K) {
+       if (ib >= p.nel / QUANT_K) {
            return;
        }
void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = gl_WorkGroupID.x * 256 + wgy;
-       if (i >= p.M * p.K / QUANT_K) {
+       if (i >= p.nel / QUANT_K) {
            return;
        }

        const uint tid = gl_LocalInvocationID.x;