// op_params layout for FLASH_ATTN_EXT: [0] scale, [1] max_bias, [2] logit_softcap.
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
- const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-
if (Q->ne[1] == 1) {
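// Batch size 1 (single-token decode): one column per block is enough.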
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
It is intended as a fusion of the softmax->top-k->get_rows pipeline for MoE models.
*/
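/*
   A minimal sketch of the unfused ggml graph this kernel replaces (assuming a
   context `ctx`, per-token expert `logits`, and `n_expert_used`; the exact
   node layout varies per model):

       ggml_tensor * probs    = ggml_soft_max(ctx, logits);            // softmax over n_experts
       ggml_tensor * selected = ggml_top_k(ctx, probs, n_expert_used); // ids of the k best experts
       ggml_tensor * weights  = ggml_get_rows(ctx,
           ggml_reshape_3d(ctx, probs, 1, n_experts, probs->ne[1]), selected); // gather their weights
*/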
-template <size_t n_experts, bool with_norm>
+template <int n_experts, bool with_norm>
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
float * weights,
int32_t * ids,
GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
- cudaStream_t stream = ctx.stream();
-
const int n_expert_used = weights->ne[1];
if (with_norm) {
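    // Renormalize the selected top-k weights so they sum to 1 per token.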
set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
foreach(SOURCE ${GGML_SOURCES_MUSA})
- set(COMPILE_FLAGS "-fsigned-char -x musa -mtgpu")
+ set(COMPILE_FLAGS "-Od3 -fno-strict-aliasing -ffast-math -fsigned-char -x musa -mtgpu -fmusa-flush-denormals-to-zero")
foreach(ARCH ${MUSA_ARCHITECTURES})
set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
endforeach()
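# Illustrative expansion: with MUSA_ARCHITECTURES "21;22" this appends
# "--cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22" to COMPILE_FLAGS.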