#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
-#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
+#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
#include <chrono>
#include "ggml-cpu.h"
#endif
#ifdef GGML_VULKAN_MEMORY_DEBUG
class vk_memory_logger;
#endif
-#ifdef GGML_VULKAN_PERF
class vk_perf_logger;
-#endif
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
#ifdef GGML_VULKAN_MEMORY_DEBUG
std::unique_ptr<vk_memory_logger> memory_logger;
#endif
-#ifdef GGML_VULKAN_PERF
+
+ // for GGML_VK_PERF_LOGGER
std::unique_ptr<vk_perf_logger> perf_logger;
-#endif
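+    // query_pool holds one timestamp query per graph node, plus one for the
+    // start of the graph; num_queries tracks the pool's capacity so it can grow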
+    vk::QueryPool query_pool;
+    int32_t num_queries;
~vk_device_struct() {
VK_LOG_DEBUG("destroy device " << name);
#define VK_LOG_MEMORY(msg) ((void) 0)
#endif // GGML_VULKAN_MEMORY_DEBUG
-#if defined(GGML_VULKAN_PERF)
-
class vk_perf_logger {
public:
void print_timings() {
for (const auto& time : t.second) {
total += time;
}
- std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
+ std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
}
timings.clear();
private:
std::map<std::string, std::vector<uint64_t>> timings;
};
-#endif // GGML_VULKAN_PERF
struct ggml_backend_vk_context {
std::string name;
static bool vk_instance_initialized = false;
static vk_instance_t vk_instance;
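+// Runtime toggle for the Vulkan perf logger, read from the GGML_VK_PERF_LOGGER
+// environment variable when the instance is initialized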
+static bool vk_perf_logger_enabled = false;
+
#ifdef GGML_VULKAN_CHECK_RESULTS
static size_t vk_skip_checks;
static size_t vk_output_tensor;
#ifdef GGML_VULKAN_MEMORY_DEBUG
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
#endif
-#ifdef GGML_VULKAN_PERF
- device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
-#endif
+ if (vk_perf_logger_enabled) {
+ device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
+ }
size_t dev_num = vk_instance.device_indices[idx];
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
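+    // Setting GGML_VK_PERF_LOGGER to any value enables per-node GPU timings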
+ vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
ctx->tensor_ctxs[node_idx] = compute_ctx;
-#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
+#if defined(GGML_VULKAN_CHECK_RESULTS)
// Force context reset on each node so that each tensor ends up in its own context
// and can be run and compared to its CPU equivalent separately
last_node = true;
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch
+ vk_context compute_ctx;
+ if (vk_perf_logger_enabled) {
+        // Allocate (or grow) the query pool: one timestamp query per node, plus one for the start of the graph
+ if (ctx->device->num_queries < cgraph->n_nodes + 1) {
+ if (ctx->device->query_pool) {
+ ctx->device->device.destroyQueryPool(ctx->device->query_pool);
+ }
+ VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
+ query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+ query_create_info.queryCount = cgraph->n_nodes + 100;
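+            // Over-allocate so modest graph growth doesn't force a pool reallocation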
+ ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
+ ctx->device->num_queries = query_create_info.queryCount;
+ }
+
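+        // Timestamp queries must be reset before they can be written again;
+        // query 0 marks the start of the graph, queries 1..n_nodes cover the nodes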
+        ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes + 1);
+
+ GGML_ASSERT(ctx->compute_ctx.expired());
+ compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ ctx->compute_ctx = compute_ctx;
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
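+        // Write the baseline timestamp (query 0) before any nodes run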
+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
+ }
+
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
+ if (vk_perf_logger_enabled) {
+ if (ctx->compute_ctx.expired()) {
+ compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ ctx->compute_ctx = compute_ctx;
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
+ } else {
+ compute_ctx = ctx->compute_ctx.lock();
+ }
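+            // Node i's GPU time is the delta between queries i and i+1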
+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
+ }
+
if (enqueued) {
++submitted_nodes;
}
}
-#ifdef GGML_VULKAN_PERF
- ctx->device->perf_logger->print_timings();
-#endif
+ if (vk_perf_logger_enabled) {
+ // End the command buffer and submit/wait
+ GGML_ASSERT(!ctx->compute_ctx.expired());
+ compute_ctx = ctx->compute_ctx.lock();
+ ggml_vk_ctx_end(compute_ctx);
+
+ ggml_vk_submit(compute_ctx, ctx->device->fence);
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VK_PERF_LOGGER waitForFences");
+ ctx->device->device.resetFences({ ctx->device->fence });
+
+ // Get the results and pass them to the logger
+ std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
+ ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
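+        // Timestamps are in device ticks; timestampPeriod converts ticks to nanoseconds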
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ if (!ggml_vk_is_empty(cgraph->nodes[i])) {
+ ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
+ }
+ }
+
+ ctx->device->perf_logger->print_timings();
+ }
ggml_vk_graph_cleanup(ctx);