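Fix the incorrect calculation of the end of the range: it was computed
as `start + group_size`, but `start` already includes the thread's local
index, so the end overshot the workgroup's range. Compute it from the
workgroup index instead. Add a backend test that covers the failing case
(taken from stable diffusion).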
Fixes https://github.com/leejet/stable-diffusion.cpp/issues/439.
const int32_t max_period = tensor->op_params[1];
tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
} else if (tensor->op == GGML_OP_POOL_2D) {
- enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
const int32_t k0 = tensor->op_params[1];
const int32_t k1 = tensor->op_params[2];
const int32_t s0 = tensor->op_params[3];
const uint tid = gl_LocalInvocationID.x;
const uint start = gl_WorkGroupID.x * group_size + tid;
- const uint end = start + group_size;
+ const uint end = (gl_WorkGroupID.x + 1) * group_size;
tmp[tid] = 0.0f;