Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
- // preferrably to run on the same backend as the buffer
+ // preferably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
- // set gradients to zero, initilize loss, and optionally reset the optimizer
+ // set gradients to zero, initialize loss, and optionally reset the optimizer
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
- GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
+ GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated statically
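For orientation, a minimal sketch of the lifecycle these declarations imply; `sched` (a ggml_backend_sched_t) and the loss type are illustrative, and `ggml_opt_default_params` is assumed from the same header:

    // sketch: create, use, reset, and free an optimizer context
    struct ggml_opt_params params = ggml_opt_default_params(sched, GGML_OPT_LOSS_TYPE_MEAN);
    ggml_opt_context_t opt_ctx = ggml_opt_init(params);
    // ... allocate graphs and run optimization steps ...
    ggml_opt_reset(opt_ctx, /*optimizer =*/ true); // zero grads, init loss, reset optimizer state
    ggml_opt_free(opt_ctx);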
struct ggml_tensor * grad,
struct ggml_tensor * sgd_params); // alpha, weight decay
- // build forward mutiple tensors and select one of them for computing
+ // build a forward graph from multiple tensors and select one of them for computing
// this is useful for creating graphs that have constant topology but compute different things based on the input
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
//
// will be needed.
//
// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <= 16;
-// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
+// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`.
//
// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
// advanced-matrix-extensions-intrinsics-functions.html
// sum of offsets, shared across COLS
//
// avx512-vnni does not have `_mm512_dpbssd_epi32`,
- // need to transfrom ss to us:
- // a * (b - 8) is equavilent to b * a - 8 * a
+ // need to transform ss to us:
+ // a * (b - 8) is equivalent to b * a - 8 * a
// s u u u s u s
//
__m512i vcomp;
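To make the identity above concrete, a self-contained sketch of the emulation; `va`/`vb` are illustrative names, only standard AVX512F/BW/VNNI intrinsics are used, and the fragment's `vcomp` presumably plays the role of `asum` below:

    // va: 64 signed int8 activations, vb: 64 unsigned nibble values in [0, 15]
    __m512i acc  = _mm512_setzero_si512();
    __m512i asum = _mm512_setzero_si512();
    const __m512i ones = _mm512_set1_epi8(1);
    acc  = _mm512_dpbusd_epi32(acc,  vb,   va); // per 32-bit lane: sum of b * a (unsigned x signed)
    asum = _mm512_dpbusd_epi32(asum, ones, va); // per 32-bit lane: sum of a
    // sum of a * (b - 8) == sum of b * a - 8 * sum of a
    const __m512i dot = _mm512_sub_epi32(acc, _mm512_slli_epi32(asum, 3));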
const int vector_length = ggml_cpu_get_sve_cnt()*8;
- //VLA Implemenation for SVE
+ // VLA implementation for SVE
switch (vector_length) {
case 128:
{
const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
- // Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns
+ // Load the 64 quants from q8_K duplicated to use vecdots with the interleaved columns
// but still need the qs to use the low and hi bits from q4
const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
int8x16_t q8_qs[8];
for (int b = 0; b < nb; b++) {
- // bsums pairs belongs to the same q8_k subblock
+ // bsums pairs belong to the same q8_k subblock
- // 64 elemnts loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
+ // 64 elements loaded and summed: 0-7 and 8-15 || 16-23 and 24-31
const int16x8_t bsums[4]{
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
quants_interleaved[j] = i0;
}
- // Masks to shuffle the quants of corresonding sub blocks for rearraning quants for vectorized bsums computation
+ // Masks to shuffle the quants of the corresponding sub blocks, rearranging them for vectorized bsums computation
__m256i shuffle_mask_sb2 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 0, 1, 4, 5, 6, 7, 8, 9, 8, 9, 12, 13, 14, 15));
shuffle_mask_sb2 = _mm256_permute2f128_si256(shuffle_mask_sb2, shuffle_mask_sb2, 0);
__m256i shuffle_mask_sb3 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 8, 9, 14, 15));
iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170));
iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255));
- // Accumulated values multipled with appropriate scales
+ // Accumulated values multiplied with appropriate scales
acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
}
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
- // Multiply with appropiate scales and accumulate
+ // Multiply with appropriate scales and accumulate
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
- // Multiply with appropiate scales and accumulate
+ // Multiply with appropriate scales and accumulate
acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
- // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+ // Load the scale (d) values for all 4 Q8_0 blocks and repeat them across lanes
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
- // Multiply with appropiate scales and accumulate
+ // Multiply with appropriate scales and accumulate
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
- // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+ // Load the scale (d) values for all 4 Q8_0 blocks and repeat them across lanes
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
- // Multiply with appropiate scales and accumulate
+ // Multiply with appropriate scales and accumulate
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
lhs_vec_11 = _mm256_permute2f128_si256(lhs_vec_11, lhs_vec_11, 0);
// Dot product done within 32 bit lanes and accumulated in the same vector
- // First done for first sub block and thenn for second sub block in each sb
+ // First done for the first sub block and then for the second sub block in each sb
// B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
// B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
// ...........................................................................
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
- // Multiply with appropiate scales and accumulate (for both d and dmin) below
+ // Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
- // Multiply with appropiate scales and accumulate (for both d and dmin) below
+ // Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
}
}
- // Store accumlated values
+ // Store accumulated values
for (int i = 0; i < 4; i++) {
_mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
}
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);//GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
- // Multiply with appropiate scales and accumulate (for both d and dmin) below
+ // Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); //GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
- // Multiply with appropiate scales and accumulate (for both d and dmin) below
+ // Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
- // Multiply with appropiate scales and accumulate (for both d and dmin) below
+ // Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
}
}
- // Store accumlated values
+ // Store accumulated values
for (int i = 0; i < 4; i++) {
_mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
}
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
- // Multiply with appropiate scales and accumulate (for both d and dmin) below
+ // Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
- // Multiply with appropiate scales and accumulate (for both d and dmin) below
+ // Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
if (prio != GGML_SCHED_PRIO_LOW) {
// Tell Windows that this thread should not be throttled (needs its own CPU core).
- // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
+ // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4
#if _WIN32_WINNT >= 0x0602
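For context, one plausible shape of the call this guard protects; `THREAD_POWER_THROTTLING_STATE` and `SetThreadInformation` are documented Win32 APIs, but the exact flags used here are an assumption:

    THREAD_POWER_THROTTLING_STATE tps;
    ZeroMemory(&tps, sizeof(tps));
    tps.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
    tps.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; // take manual control of throttling...
    tps.StateMask   = 0;                                       // ...and turn it off for this thread
    SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &tps, sizeof(tps));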
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
- GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
+ GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
- GGML_ASSERT(false); // we have miss something.
+ GGML_ASSERT(false); // we have missed something.
}
}
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
- GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
+ GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
- GGML_ASSERT(false); // we have miss something.
+ GGML_ASSERT(false); // we have missed something.
}
}
const size_t rs = ne00 * type_size;
if (nb00 == type_size) {
- // src0 is contigous on first dimension, copy by rows
+ // src0 is contiguous on first dimension, copy by rows
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
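A sketch of the row-wise copy this branch performs; the `id`/`rs` bookkeeping follows the fragment above, and `dst_ptr`, the nb0x strides, and the `ir0`/`ir1` thread bounds are the usual ggml locals:

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            id += rs * ir0; // skip the rows assigned to lower-numbered threads
            for (int64_t i01 = ir0; i01 < ir1; i01++) {
                const char * src_row = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
                memcpy(dst_ptr + id, src_row, rs); // rs == ne00 * type_size bytes per row
                id += rs;
            }
            id += rs * (ne01 - ir1); // skip the remaining rows of this plane
        }
    }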
{
ggml_compute_forward_repeat_f32(params, dst);
} break;
- // TODO: templateify the implemenation and support for I64
+ // TODO: templateify the implementation and add support for I64
// ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
//case GGML_TYPE_I64:
// {
case GGML_OP_MUL_MAT_ID:
{
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
- size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
+ size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
auto * wdata = (char *)params->wdata;
auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
- // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
+ // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
}
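Putting the fragments together, the scratch (wdata) layout being described is roughly:

    // conceptual wdata layout for MUL_MAT_ID (a sketch, not the literal source):
    //   [ quantized src1 rows                ]  nbw3 bytes, padded to sizeof(int64_t)
    //   [ matrix_row_counts : n_as x int64_t ]  per-expert row counts
    //   [ matrix_rows : n_as x ne12 mappings ]  mmid_row_mapping = 2 x int32_t = one int64_t slot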
// If attention sinks are used, potentially re-scale if KQ_max is small.
- // Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum
+ // Also add the sink as a value to KQ_rowsum; this is done after synchronization of KQ_rowsum
// so it's being done unconditionally for every thread.
if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) {
float KQ_max_scale[cols_per_thread];
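In equation form, the adjustment being described (standard online-softmax bookkeeping, with names following the comment):

    // with a sink value s, the softmax denominator per row becomes
    //   KQ_rowsum = sum_j exp(KQ_j - KQ_max) + exp(s - KQ_max)
    // and if s > KQ_max, the running maximum is first raised to s, with the
    // accumulated KQ_rowsum (and VKQ partials) rescaled by exp(KQ_max_old - s).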
return 128;
}
-// Currenlty llvm with the amdgcn target does not support unrolling loops
+// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#if defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
#define GGML_USE_WMMA_FATTN
#elif defined(RDNA4)
-#warning "rocwmma fattn is not suported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
+#warning "rocwmma fattn is not supported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
#endif // defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
return false;
}
- //rms_norm kernel assumes contigous rows
+ // rms_norm kernel assumes contiguous rows
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
return false;
}
q.z = roundf(xi.z*d_inv);
q.w = roundf(xi.w*d_inv);
- // Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
+ // Write back 4 int8 values as a single 32 bit value for better memory bandwidth:
char4 * yqs4 = (char4 *) y[ib].qs;
yqs4[iqs/4] = q;
};
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
-// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
+// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
// ======================
// When ncols_template == 0 the bounds for the loops in this function are not
// known and can't be unrolled. As we want to keep pragma unroll for all other
-// cases we supress the clang transformation warning here.
+// cases we suppress the clang transformation warning here.
#ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wpass-failed"
};
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
- // Bump pending flag (cleared in the session::flush once we get the responce)
+ // Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
int err = dspqueue_write(this->queue,
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Repack the scales
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
// Init the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
x[i * 8 + 0].d = 0;
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Repack the scales
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
// Init the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
x[i * 8 + 0].d = 0;
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Repack the scales
uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
// Init the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
- // the last block is truncated and overriden by the scales.
+ // the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
x[i * 8 + 0].e = 0;
// The main goal here is to stack the MUL_MAT ops with the same src1 input.
- // This allows use to reuse dynamically quantized src1 in VTCM.
+ // This allows us to reuse dynamically quantized src1 in VTCM.
- // TODO: the current version might do incorrect reodering in cases where quantized src0
+ // TODO: the current version might do incorrect reordering in cases where quantized src0
// input is an output of another Op.
for (int i0 = 0; i0 < n; i0++) {
// Replace \SystemRoot with an absolute path from system ENV windir
const std::wstring systemRootEnv = L"windir";
- // Query the number of wide charactors this variable requires
+ // Query the number of wide characters this variable requires
DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
if (numWords == 0) {
GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");
HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros
- // Get mantissa for 16-bit represenation
+ // Get mantissa for 16-bit representation
HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));
//Compute Reciprocal Exponent
#include "htp-msg.h"
#include "htp-ops.h"
-// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h
+// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h
#define HTP_ROPE_TYPE_NORMAL 0
#define HTP_ROPE_TYPE_NEOX 2
unsigned int n = atomic_load(&pool->n_jobs);
unsigned int i = atomic_fetch_add(&pool->next_job, 1);
if (i >= n) {
- // Spurios wakeup
+ // Spurious wakeup
continue;
}
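For orientation, the enclosing worker loop presumably looks something like this; `worker_wait` and `worker_run_job` are hypothetical names standing in for the pool's wait/dispatch helpers:

    for (;;) {
        worker_wait(pool); // hypothetical: block until work may be available
        unsigned int n = atomic_load(&pool->n_jobs);
        unsigned int i = atomic_fetch_add(&pool->next_job, 1);
        if (i >= n) {
            continue; // spurious wakeup: another worker already claimed the last job
        }
        worker_run_job(pool, i); // hypothetical
    }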
bool use_residency_sets;
// optional MTLResidencySet
- // note: cannot use explicity "id<MTLResidencySet>" here because it is not available on certain OSes
+ // note: cannot use an explicit "id<MTLResidencySet>" here because it is not available on certain OSes
id rset;
// pointers to global device
const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
if (!inplace) {
- // run a separete kernel to cpy src->dst
+ // run a separate kernel to cpy src->dst
// not sure how to avoid this
// TODO: make a simpler cpy_bytes kernel
const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
if (!inplace) {
- // run a separete kernel to cpy src->dst
+ // run a separate kernel to cpy src->dst
// not sure how to avoid this
// TODO: make a simpler cpy_bytes kernel
const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup
int16_t r1ptg = 4; // num src1 rows per threadgroup
- // note: not sure how optimal are those across all different hardware. there might be someting cleverer
+ // note: not sure how optimal these are across all different hardware. there might be something cleverer
switch (ne11) {
case 2:
r1ptg = 2; break;
#define GGML_METAL_MAX_DEVICES 16
// number of Metal devices
-// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices
+// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
static int g_devices = 1;
////////////////////////////////////////////////////////////////////////////////
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
-// TODO: obolete -- remove
+// TODO: obsolete -- remove
//typedef void (im2col_ext_t)(
// constant ggml_metal_kargs_im2col & args,
// device const float * x,
cl_ulong cmd_duration_ns;
// The time for the kernel to complete - COMPLETE - END
cl_ulong cmd_complete_duration_ns;
- // Total time to finish the kernel - COMPELTE - QUEUED
+ // Total time to finish the kernel - COMPLETE - QUEUED
cl_ulong cmd_total_duration_ns;
// Global and local work sizes.
size_t global_size[3];
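These three durations map directly onto the standard OpenCL event profiling counters; a sketch of how they would be computed (CL_PROFILING_COMMAND_COMPLETE requires OpenCL 2.0+):

    cl_ulong queued, start, end, complete;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED,   sizeof(queued),   &queued,   NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,    sizeof(start),    &start,    NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,      sizeof(end),      &end,      NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(complete), &complete, NULL);
    cmd_duration_ns          = end      - start;   // kernel execution time
    cmd_complete_duration_ns = complete - end;     // COMPLETE - END
    cmd_total_duration_ns    = complete - queued;  // COMPLETE - QUEUED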
cl_platform_id platform_ids[NPLAT];
if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
- GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
+ GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
return found_devices;
}
CL_CHECK(clReleaseEvent(evt));
}
-// Syncronizes the 'backend_ctx's device with others so that commands
+// Synchronizes the 'backend_ctx's device with others so that commands
// enqueued to it won't start until commands in the other devices have
// completed.
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
// before any tensor is initialized (at the beginning of alloc_tensor_range).
- // Hence, there is alway a buffer object in this vector. When each tensor is
+ // Hence, there is always a buffer object in this vector. When each tensor is
// being initialized, this original buffer object will be released if both
// flattening and small allocation are enabled, and additional buffer
// objects will be created in init_tensor to represent flattened quantized
//GGML_ASSERT(offset == 0);
// We create subbuffers from the original tensor buffer for scales and
- // quants - i.e., scales and quants are aliases into the buffer obejct
+ // quants - i.e., scales and quants are aliases into the buffer object
// that backs the original tensor. This is a cleaner way to adapt to the
// new memory management.
// In the old code, we allocate new buffers for scales and quants
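A sketch of the aliasing sub-buffer creation being described; the region origin/size are illustrative, and the origin must respect the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN:

    cl_buffer_region region = { /*origin =*/ scales_offset, /*size =*/ scales_size }; // illustrative values
    cl_int err = CL_SUCCESS;
    cl_mem scales = clCreateSubBuffer(parent_buffer, CL_MEM_READ_WRITE,
                                      CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
    // `scales` now aliases the tensor's backing cl_mem; no separate allocation or copy is made.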
#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
-#define VER_4VEC 610 // todo for hardward optimize.
-#define VER_GEN9 700 // todo for hardward optimize.
-#define VER_GEN12 1000000 // todo for hardward optimize.
-#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize.
+#define VER_4VEC 610 // TODO: optimize for hardware.
+#define VER_GEN9 700 // TODO: optimize for hardware.
+#define VER_GEN12 1000000 // TODO: optimize for hardware.
+#define VER_GEN13 (VER_GEN12 + 1030) // TODO: optimize for hardware.
-#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares
+#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardware
// [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN]
//
// Notes: out-of-bounds qs will run into d values
-// Aligment relies on the allocated size of qs
+// Alignment relies on the allocated size of qs
template <ggml_type type> struct block_q_t;
};
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
-// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
+// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
- # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
+ # Workaround for the "can't dereference invalidated vector iterator" bug in clang-cl debug build
- # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+ # Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
endif()