         ids_dst_shared[j] = j;
     }
+    __syncthreads();
     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
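
Every thread in the block writes part of `ids_dst_shared` in the loop above, and other threads later read slots they did not write themselves, so without the added `__syncthreads()` a thread could read an index that has not been initialized yet. The same reasoning drives the barriers added below, first in the conventional-tiling path (used on AMD and on NVIDIA GPUs older than Volta) and then in the stream-k path.
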
                 return;
             }
+            // __syncthreads(); // There is no previous tile that could cause a race condition.
 #pragma unroll
             for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                 const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
+            __syncthreads();
         }
         offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
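
In the conventional-tiling path each block handles exactly one tile, so when the MoE indices are loaded from `ids_dst` there is no earlier reader of `ids_dst_shared` to race with; the leading barrier is therefore left as a comment documenting why it can be omitted. The trailing `__syncthreads()` is still needed so that all loaded indices are visible before any thread uses them for the writeback.
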
                 continue;
             }
+            __syncthreads();
 #pragma unroll
             for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                 const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
+            __syncthreads();
         }
         offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
     }
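
The stream-k path, by contrast, refills `ids_dst_shared` once per tile inside a loop, so the load needs a barrier on both sides: one before it, so that no thread is still reading the previous tile's indices when they are overwritten (a write-after-read hazard), and one after it, so that the new indices are visible to every thread before use (a read-after-write hazard). A minimal, self-contained sketch of that double-barrier pattern, with hypothetical names and sizes rather than the actual llama.cpp code:

```cpp
#include <cstdio>

#define TILE 256 // hypothetical tile width, also used as the block size

__global__ void tile_loop(const int * __restrict__ ids, int * __restrict__ out, int ntiles) {
    __shared__ int ids_shared[TILE];

    for (int tile = 0; tile < ntiles; ++tile) {
        __syncthreads(); // WAR: no thread may still be reading the previous tile's ids

        ids_shared[threadIdx.x] = ids[tile*TILE + threadIdx.x];

        __syncthreads(); // RAW: the new ids must be visible to all threads before use

        // Each thread reads a slot that a *different* thread wrote, which is
        // exactly why both barriers are needed:
        const int j = (threadIdx.x + 1) % TILE;
        out[tile*TILE + threadIdx.x] = ids_shared[j];
    }
}

int main() {
    const int ntiles = 4;
    int *ids, *out;
    cudaMallocManaged(&ids, ntiles*TILE*sizeof(int));
    cudaMallocManaged(&out, ntiles*TILE*sizeof(int));
    for (int i = 0; i < ntiles*TILE; ++i) {
        ids[i] = i;
    }
    tile_loop<<<1, TILE>>>(ids, out, ntiles);
    cudaDeviceSynchronize();
    printf("out[0] = %d\n", out[0]); // 1: thread 0 reads the slot thread 1 wrote
    cudaFree(ids);
    cudaFree(out);
    return 0;
}
```
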
     // The memory layout for the fixup buffer is always contiguous, therefore reset ids:
+    __syncthreads();
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
         const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
         ids_dst_shared[j] = j;
     }
+    __syncthreads();
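
The last hunk resets the indices to the identity before partial results go to the fixup buffer: the same writeback routine serves both destinations, and with `ids_dst_shared[j] == j` its scattered stores degenerate into a dense, contiguous copy matching the fixup buffer's layout. A toy illustration of that degenerate case, again with hypothetical names rather than the actual llama.cpp code:

```cpp
#include <cstdio>

#define MMQ_X 8 // hypothetical tile width, one thread per tile row

// One store routine serves both writeback targets; only the ids differ.
__global__ void store_rows(const float * __restrict__ acc, const int * __restrict__ ids,
                           float * __restrict__ dst) {
    const int j = threadIdx.x;
    dst[ids[j]] = acc[j]; // identity ids => contiguous stores
}

int main() {
    float *acc, *dst;
    int   *ids;
    cudaMallocManaged(&acc, MMQ_X*sizeof(float));
    cudaMallocManaged(&dst, MMQ_X*sizeof(float));
    cudaMallocManaged(&ids, MMQ_X*sizeof(int));
    for (int j = 0; j < MMQ_X; ++j) {
        acc[j] = 10.0f*j;
        ids[j] = j; // identity, as after the reset above
    }
    store_rows<<<1, MMQ_X>>>(acc, ids, dst);
    cudaDeviceSynchronize();
    for (int j = 0; j < MMQ_X; ++j) {
        printf("dst[%d] = %.0f\n", j, dst[j]); // 0, 10, 20, ... in order
    }
    cudaFree(acc);
    cudaFree(dst);
    cudaFree(ids);
    return 0;
}
```
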