        } else {
            compute_ctx = ctx->compute_ctx.lock();
        }
+    } else {
+        switch (node->op) {
+        case GGML_OP_REPEAT:
+        case GGML_OP_ACC:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_CONCAT:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_CLAMP:
+        case GGML_OP_PAD:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+        case GGML_OP_DUP:
+        case GGML_OP_NORM:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_UNARY:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_IM2COL:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_LEAKY_RELU:
+            {
+                // These operations all go through ggml_vk_op_f32, so short-circuit and
+                // do the only thing needed for the dryrun.
+                vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
+                ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+                return false;
+            }
+        default:
+            break;
+        }
    }

    switch (node->op) {
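When it is not a dryrun, the listed ops simply fall through to the existing per-op dispatch above. As a rough stand-alone sketch of the idea (hypothetical names such as FakeNode, build_node and descriptor_requests; this is not the ggml-vulkan API): a dryrun pass over the graph only tallies how many descriptor sets each pipeline will need, so they can be allocated in one go before the real pass records any commands.

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct FakeNode { std::string pipeline_name; };  // stand-in for a graph node and its pipeline

    static void build_node(const FakeNode &n, bool dryrun,
                           std::unordered_map<std::string, int> &descriptor_requests) {
        if (dryrun) {
            // Analogue of requesting one descriptor set for the pipeline: count, do no GPU work.
            descriptor_requests[n.pipeline_name] += 1;
            return;
        }
        // Real pass: this is where dispatches would be recorded into a command buffer.
        std::printf("recording %s\n", n.pipeline_name.c_str());
    }

    int main() {
        std::vector<FakeNode> graph = {{"add"}, {"mul_mat"}, {"add"}, {"soft_max"}};
        std::unordered_map<std::string, int> descriptor_requests;

        for (const FakeNode &n : graph) build_node(n, /*dryrun=*/true, descriptor_requests);   // count only
        // ...allocate descriptor sets per pipeline from descriptor_requests here...
        for (const FakeNode &n : graph) build_node(n, /*dryrun=*/false, descriptor_requests);  // record for real
    }

The second hunk, below, reworks the submission loop so that command buffers are handed to the GPU in progressively larger batches: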
    bool first_node_in_batch = true; // true if next node will be first node in a batch
    int submit_node_idx = 0; // index to first node in a batch
-    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
-    constexpr int submit_count = 100;
+    // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
+    // Start with a smaller count to get work submitted right away, and increase it after each submit.
+    int nodes_per_submit = 20;
    int submitted_nodes = 0;
+    int submit_count = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        if (first_node_in_batch) {
            submit_node_idx = i;
        }
-        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
-
+        bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
        if (submit) {
            first_node_in_batch = true;
            submitted_nodes = 0;
+            switch (submit_count) {
+            case 0:
+                nodes_per_submit = 50;
+                break;
+            default:
+                nodes_per_submit = 100;
+                break;
+            }
+            submit_count++;
        }
    }
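To see the batching ramp in isolation (a hedged sketch with made-up node counts and names, not the actual backend code): the first, small batch gets work onto the GPU quickly, and the later, larger batches amortize per-submit overhead while the CPU keeps recording ahead of the GPU.

    #include <cstdio>

    int main() {
        const int n_nodes = 230;        // pretend graph size
        int nodes_per_submit = 20;      // small first batch so the GPU starts early
        int submitted_nodes  = 0;
        int submit_count     = 0;

        for (int i = 0; i < n_nodes; i++) {
            submitted_nodes++;
            bool submit = (submitted_nodes >= nodes_per_submit) || (i == n_nodes - 1);
            if (submit) {
                std::printf("submit #%d ends at node %d\n", submit_count, i);
                submitted_nodes = 0;
                // Mirror the 20 -> 50 -> 100 ramp from the diff above.
                nodes_per_submit = (submit_count == 0) ? 50 : 100;
                submit_count++;
            }
        }
    }

The switch on submit_count in the patch is functionally equivalent to the ternary used in this sketch, and presumably leaves room to add further ramp steps later.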