device const half * dh = &x[ib].d;
for (int row = 0; row < N_DST; row++) {
-
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
if (tiisg == 0) {
- for (int row = 0; row < 2; ++row) {
+ for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
dst_f32[first_row + row] = sumf1[row];
}
}
device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < 2; ++row) {
+ for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = tot;
const int row = 2*r0 + sgitg;
+ if (row >= args.ne0) {
+ return;
+ }
+
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.5f;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < N_DST; ++row) {
+ for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
+ for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
- for (int row = 0; row < 2; ++row) {
+ for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;