GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+ GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+ GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+ GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
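+ // (e.g. ggml_is_contiguous_1() tolerates an arbitrary row stride nb[1], and
+ //  ggml_is_contiguous_2() arbitrary nb[1]/nb[2], as long as the higher dims stay packed)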
+
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
}
}
#else
- if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+ if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
CUBLAS_CHECK(
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
+ return true;
case GGML_OP_ROPE:
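+ // the rope kernels index src0 as flat, contiguous rows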
+ return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
template<typename T, bool has_pos, bool has_freq_facs>
static __global__ void rope_neox(
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
- float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors
) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
const int i = row*ncols + ib*n_dims + ic/2;
const int i2 = row/p_delta_rows;
- float cur_rot = inv_ndims * ic - ib;
-
const int p = has_pos ? pos[i2] : 0;
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
- const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;
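+ // freq_scale is now applied inside rope_yarn(), so it is no longer folded into theta_base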
+ const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor;
float cos_theta, sin_theta;
- rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+ rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
const dim3 block_nums(nrows, num_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
- const float inv_ndims = -1.0f / n_dims;
if (pos == nullptr) {
if (freq_factors == nullptr) {
rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
- theta_scale, inv_ndims, freq_factors
+ theta_scale, freq_factors
);
} else {
rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
- theta_scale, inv_ndims, freq_factors
+ theta_scale, freq_factors
);
}
} else {
if (freq_factors == nullptr) {
rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
- theta_scale, inv_ndims, freq_factors
+ theta_scale, freq_factors
);
} else {
rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
- theta_scale, inv_ndims, freq_factors
+ theta_scale, freq_factors
);
}
}
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
+ GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type);
{
GGML_ASSERT(ne00 == ne10);
- // TODO: assert that dim2 and dim3 are contiguous
+ GGML_ASSERT(ggml_is_contiguous_2(src0));
+ GGML_ASSERT(ggml_is_contiguous_2(src1));
+
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
{
GGML_ASSERT(ne00 == ne10);
- // TODO: assert that dim2 and dim3 are contiguous
+ GGML_ASSERT(ggml_is_contiguous_2(src0));
+ GGML_ASSERT(ggml_is_contiguous_2(src1));
+
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
case GGML_OP_RMS_NORM:
{
GGML_ASSERT(ne00 % 4 == 0);
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
case GGML_OP_GROUP_NORM:
{
GGML_ASSERT(ne00 % 4 == 0);
+ GGML_ASSERT(ggml_is_contiguous(src0));
//float eps;
//memcpy(&eps, dst->op_params, sizeof(float));
} break;
case GGML_OP_NORM:
{
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
const int64_t p = pos[i2];
- const float theta_0 = (float)p;
+ const float theta_base = (float)p;
const float inv_ndims = -1.f/n_dims;
if (!is_neox) {
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
- const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
+ const float theta = theta_base * pow(freq_base, inv_ndims*i0);
float cos_theta, sin_theta;
rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
} else {
for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
if (ic < n_dims) {
- const int64_t ib = 0;
+ const int64_t i0 = ic/2;
- // simplified from `(ib * n_dims + ic) * inv_ndims`
- const float cur_rot = inv_ndims*ic - ib;
- const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f;
+ const float freq_factor = src2 != src0 ? src2[i0] : 1.0f;
- const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor;
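+ // freq_factor is applied when calling rope_yarn() below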
+ const float theta = theta_base * pow(freq_base, inv_ndims*ic);
float cos_theta, sin_theta;
- rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
- const int64_t i0 = ib*n_dims + ic/2;
+ rope_yarn(theta/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
- if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+ if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+ return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
+
+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+ return
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * grad = dst->src[1];
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+ GGML_ASSERT(ggml_is_contiguous_1(grad));
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, grad));
int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims);
- const float inv_ndims = -1.f/n_dims;
+
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta) * sin_sign;
- theta_base *= theta_scale;
+ theta_base *= theta_scale;
block_theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
}
} else {
- // TODO: this might be wrong for ne0 != n_dims - need double check
- // it seems we have to rope just the first n_dims elements and do nothing with the rest
- // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
- theta_base *= freq_scale;
+ // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
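+ // only the first n_dims elements are rotated; the rest is passed through unchanged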
for (int64_t ic = 0; ic < ne0; ic += 2) {
if (ic < n_dims) {
- const int64_t ib = 0;
+ const int64_t i0 = ic/2;
- // simplified from `(ib * n_dims + ic) * inv_ndims`
- float cur_rot = inv_ndims * ic - ib;
- float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+ const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(
- theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+ theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
&cos_theta, &sin_theta
);
- sin_theta *= sin_sign;
+ sin_theta *= sin_sign;
theta_base *= theta_scale;
- const int64_t i0 = ib*n_dims + ic/2;
-
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims);
- const float inv_ndims = -1.f/n_dims;
+
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta) * sin_sign;
- theta_base *= theta_scale;
+ theta_base *= theta_scale;
block_theta *= theta_scale;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
} else {
- // TODO: this might be wrong for ne0 != n_dims - need double check
- // it seems we have to rope just the first n_dims elements and do nothing with the rest
- // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
- theta_base *= freq_scale;
+ // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
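+ // only the first n_dims elements are rotated; the rest is passed through unchanged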
for (int64_t ic = 0; ic < ne0; ic += 2) {
if (ic < n_dims) {
- const int64_t ib = 0;
+ const int64_t i0 = ic/2;
- // simplified from `(ib * n_dims + ic) * inv_ndims`
- float cur_rot = inv_ndims * ic - ib;
- float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+ const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(
- theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+ theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
&cos_theta, &sin_theta
);
- sin_theta *= sin_sign;
+ sin_theta *= sin_sign;
theta_base *= theta_scale;
- const int64_t i0 = ib*n_dims + ic/2;
-
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
// GGML_OP_ROPE
struct test_rope : public test_case {
const ggml_type type;
- const std::array<int64_t, 4> ne;
+ const std::array<int64_t, 4> ne_a;
int n_dims;
int mode;
int n_ctx;
+ float fs; // freq_scale
+ float ef; // ext_factor
+ float af; // attn_factor
bool ff;
+ int v; // view (1 : non-contiguous a)
std::string vars() override {
- return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff);
+ return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
}
test_rope(ggml_type type = GGML_TYPE_F32,
- std::array<int64_t, 4> ne = {10, 10, 10, 1},
- int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
- : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {}
+ std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
+ int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
+ : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
- ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+ ggml_tensor * a;
+ if (v & 1) {
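+ // create a non-contiguous input: allocate a larger tensor and take a strided view of shape ne_a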
+ auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+ a = ggml_new_tensor(ctx, type, 4, ne.data());
+ a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+ } else {
+ a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+ }
+ ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
- ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+ ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
return out;
}
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) {
// pos
- std::vector<int> data(ne[2]);
- for (int i = 0; i < ne[2]; i++) {
+ std::vector<int> data(ne_a[2]);
+ for (int i = 0; i < ne_a[2]; i++) {
data[i] = rand() % n_ctx;
}
- ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
+ ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
} else {
if (t->ne[0] == n_dims/2) {
// frequency factors in the range [0.9f, 1.1f]
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
- for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
- // TODO: ff not supported yet for !neox
- test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, false)); // llama 7B
- test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, false)); // llama 13B
- test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, false)); // llama 30B
- test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, false)); // llama 65B
-
- for (bool ff : {false, true}) { // freq_factors
- test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
- test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
- test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
- test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, ff)); // neox (stablelm)
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, ff)); // neox (phi-2)
+ {
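+ // run the full set of shapes only for the first fs/ef/af combination to keep the number of cases manageable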
+ bool all = true;
+
+ for (int v : { 0, 1 }) {
+ for (float fs : { 1.0f, 1.4245f }) {
+ for (float ef : { 0.0f, 0.7465f }) {
+ for (float af : { 1.0f, 1.4245f }) {
+ for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+ // TODO: ff not supported yet for !neox
+ test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 7B
+ if (all) {
+ test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 13B
+ test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 30B
+ test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 65B
+ }
+
+ for (bool ff : {false, true}) { // freq_factors
+ if (all) {
+ test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+ test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+ test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
+ }
+
+ test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+ }
+ }
+ all = false;
+ }
+ }
+ }
}
}