// Convert two consecutive f16 values starting at x[ib + iqs] into the
// dfloat2 output v.
//
// NOTE: this block contained unresolved diff markers ('+'/'-' prefixed
// lines); the hunk is resolved here by applying the '+' side.
//
// Params:
//   vx  - opaque pointer to the f16 data block
//   ib  - block index into the data
//   iqs - quant index within the block
//   v   - output pair of dequantized values
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const half * x = (const half *) vx;

    // load 2 halfs into register in a single instruction
    // NOTE(review): assumes (ib + iqs) is even so the half2 load is
    // 4-byte aligned — confirm callers guarantee this.
    const half2 x_reg = *((half2 *) &(x[ib + iqs]));

    // automatic half -> float type cast if dfloat == float
    v.x = __low2float(x_reg);
    v.y = __high2float(x_reg);
}
// NOTE(review): this block contained unresolved diff markers ('+'/'-'
// prefixed lines); the hunk is resolved here by applying the '+' side.
// The signature below does not match the visible body, which references
// names (y, iybs, iqs, j, qr, v, tmp, y_offset) declared outside this
// chunk and reads like the interior of a dot-product loop from a
// dequantize-mul-mat-vec kernel — confirm against the upstream source
// that the surrounding scope supplies these declarations.
static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
    // matrix multiplication
    // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
    // half-precision accumulation path
    if ( y_offset == 1 ) {
        // the two y values are adjacent: load 2 dfloats into register in a
        // single instruction
        const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
        tmp += __hmul2(v, y_reg);
    }
    else {
        // strided y values: gather the pair element-wise
        tmp += __hmul2(v, {
            y[iybs + iqs + j/qr + 0],
            y[iybs + iqs + j/qr + y_offset]
        });
    }
#else
    // single-precision accumulation path
    if ( y_offset == 1 ) {
        // the two y values are adjacent: load 2 dfloats into register in a
        // single instruction
        const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
        tmp += v.x * y_reg.x;
        tmp += v.y * y_reg.y;
    }
    else {
        // strided y values: accumulate each element separately
        tmp += v.x * y[iybs + iqs + j/qr + 0];
        tmp += v.y * y[iybs + iqs + j/qr + y_offset];
    }
#endif // GGML_CUDA_F16
    }
}