/ggml/src/ggml-rpc/ @rgerganov
/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @0cc4m
+/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-webgpu/ @reeselevine
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml.c @ggerganov
option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
option(GGML_WEBGPU_JSPI "ggml: use JSPI for WebGPU" ON)
option(GGML_ZDNN "ggml: use zDNN" OFF)
+option(GGML_VIRTGPU "ggml: use the VirtGPU/Virglrenderer API Remoting frontend" OFF)
+# GGML_VIRTGPU_BACKEND is a tri-state, not a boolean: the build scripts compare it
+# against "ONLY" (build the backend but not the frontend) and "OFF", so declare it
+# as a cached string with the valid choices instead of option(), which can only
+# model ON/OFF.
+set(GGML_VIRTGPU_BACKEND "OFF" CACHE STRING "ggml: build the VirtGPU/Virglrenderer API Remoting backend (OFF|ON|ONLY)")
+set_property(CACHE GGML_VIRTGPU_BACKEND PROPERTY STRINGS OFF ON ONLY)
option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
+ include/ggml-virtgpu.h
include/ggml-sycl.h
include/ggml-vulkan.h
include/ggml-webgpu.h
--- /dev/null
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Name reported by the VirtGPU/Virglrenderer API Remoting frontend.
+#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend"
+
+// Returns the backend registry entry for the VirtGPU frontend.
+// Use (void): inside extern "C", an empty parameter list would declare an
+// unspecified-arguments function in C, not a zero-argument one.
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_virtgpu_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
ggml_add_backend(METAL)
ggml_add_backend(MUSA)
ggml_add_backend(RPC)
+ggml_add_backend(VirtGPU)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
#include "ggml-rpc.h"
#endif
+#ifdef GGML_USE_VIRTGPU_FRONTEND
+#include "ggml-virtgpu.h"
+#endif
+
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN
    register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_WEBGPU
register_backend(ggml_backend_webgpu_reg());
#ifdef GGML_USE_ZDNN
register_backend(ggml_backend_zdnn_reg());
#endif
+#ifdef GGML_USE_VIRTGPU_FRONTEND
+ register_backend(ggml_backend_virtgpu_reg());
+#endif
+
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
+ ggml_backend_load_best("virtgpu", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("hexagon", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
--- /dev/null
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+include(ExternalProject)
+
+message(STATUS "Including the VirtGPU/Virglrenderer API Remoting")
+
+# Download venus_hw.h from the virglrenderer repository.
+# The file is placed in the *binary* tree: configure/build steps must never
+# write into the source tree.
+ExternalProject_Add(
+    venus_hw_header
+    URL https://gitlab.freedesktop.org/virgl/virglrenderer/-/raw/virglrenderer-1.2.0/src/venus_hw.h
+    DOWNLOAD_NO_EXTRACT YES
+    DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/include
+    DOWNLOAD_NAME venus_hw.h
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    INSTALL_COMMAND ""
+    LOG_DOWNLOAD ON
+)
+
+if (NOT "${GGML_VIRTGPU_BACKEND}" STREQUAL "ONLY")
+    message(STATUS "Enable the VirtGPU/Virglrenderer API Remoting frontend library")
+
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(DRM REQUIRED libdrm)
+    if (NOT GGML_BACKEND_DL)
+        # cannot simply use USE_VIRTGPU, as in the 'else()' case the
+        # frontend isn't compiled
+        target_compile_definitions(ggml PUBLIC "GGML_USE_VIRTGPU_FRONTEND")
+    endif()
+
+    ggml_add_backend_library(ggml-virtgpu
+        ggml-backend-buffer.cpp
+        ggml-backend.cpp
+        ggml-backend-device.cpp
+        ggml-backend-reg.cpp
+        ggml-backend-buffer-type.cpp
+        virtgpu-apir.h
+        virtgpu-forward.gen.h
+        virtgpu.cpp
+        virtgpu-shm.cpp
+        virtgpu-utils.cpp
+        virtgpu-forward-device.cpp
+        virtgpu-forward-buffer-type.cpp
+        virtgpu-forward-buffer.cpp
+        virtgpu-forward-backend.cpp
+        virtgpu-forward-impl.h
+        apir_cs_ggml-rpc-front.cpp
+        ../../include/ggml-virtgpu.h)
+
+    # libdrm usage requirements come entirely from pkg-config; do not hardcode
+    # /usr/include/libdrm, which breaks cross-compilation and non-FHS layouts.
+    target_link_libraries(ggml-virtgpu PUBLIC ${DRM_LIBRARIES})
+    target_include_directories(ggml-virtgpu PUBLIC ${DRM_INCLUDE_DIRS})
+    target_compile_options(ggml-virtgpu PUBLIC ${DRM_CFLAGS_OTHER})
+
+    target_include_directories(ggml-virtgpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+    # venus_hw.h is downloaded into the binary tree (see ExternalProject above)
+    target_include_directories(ggml-virtgpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/include)
+
+    # Ensure venus_hw.h is downloaded before building ggml-virtgpu
+    add_dependencies(ggml-virtgpu venus_hw_header)
+
+    # Portable request for C++20; a raw -std=c++20 flag bypasses CMake's
+    # standard handling and is compiler-specific.
+    target_compile_features(ggml-virtgpu PRIVATE cxx_std_20)
+else()
+    message(STATUS "Not building the VirtGPU/Virglrenderer API Remoting frontend library")
+endif()
+
+if (NOT "${GGML_VIRTGPU_BACKEND}" STREQUAL "OFF")
+    add_subdirectory("backend")
+endif()
--- /dev/null
+#include "backend/shared/apir_cs_rpc.h"
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+#include "ggml-remoting.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstring>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+// Convert a live ggml_tensor into the wire representation used by the API
+// Remoting protocol. Pointers (the tensor itself, its sources and view_src)
+// are serialized as opaque 64-bit ids; tensor->data is rebased to an offset
+// from its buffer's base address so the host side can relocate it.
+apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor) {
+    apir_rpc_tensor result;
+    result.id = reinterpret_cast<uint64_t>(tensor);
+    result.type = tensor->type;
+    if (tensor->buffer) {
+        ggml_backend_buffer_t buffer = tensor->buffer;
+
+        result.buffer = BUFFER_TO_HOST_HANDLE(buffer);
+    } else {
+        // 0 marks "no buffer" on the wire
+        result.buffer = 0;
+    }
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result.ne[i] = tensor->ne[i];
+        result.nb[i] = tensor->nb[i];
+    }
+    result.op = tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result.op_params[i] = tensor->op_params[i];
+    }
+    result.flags = tensor->flags;
+    for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+        result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+    }
+    result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+    result.view_offs = tensor->view_offs;
+    result.data = reinterpret_cast<uint64_t>(tensor->data);
+    if (tensor->data) {
+        // a tensor with data but no owning buffer cannot be rebased
+        if (!tensor->buffer) {
+            GGML_ABORT("tensor has data but not buffer");
+        }
+        // tensor->data is serialized as an offset to the buffer base address
+        result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
+    }
+    // snprintf guarantees NUL-termination even when the name is truncated
+    snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+    return result;
+}
+
+// Depth-first, post-order walk over a tensor's dependency graph: every source
+// and the view_src are serialized before the tensor itself, so a deserializer
+// can process the resulting list front-to-back. `visited` de-duplicates shared
+// sub-graphs.
+void apir_add_tensor(ggml_tensor * tensor,
+                     std::vector<apir_rpc_tensor> & tensors,
+                     std::unordered_set<ggml_tensor *> & visited) {
+    if (tensor == nullptr || !visited.insert(tensor).second) {
+        return;  // null edge, or already emitted
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        apir_add_tensor(tensor->src[i], tensors, visited);
+    }
+    apir_add_tensor(tensor->view_src, tensors, visited);
+    tensors.push_back(apir_serialize_tensor(tensor));
+}
+
+// Serialize a compute graph into `output`.
+// serialization format:
+// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(apir_rpc_tensor)) |
+void apir_serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+    uint32_t n_nodes = cgraph->n_nodes;
+    std::vector<apir_rpc_tensor> tensors;
+    std::unordered_set<ggml_tensor *> visited;
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        apir_add_tensor(cgraph->nodes[i], tensors, visited);
+    }
+    uint32_t n_tensors = (uint32_t) tensors.size();
+    // size_t, not int: a large graph could overflow a 32-bit signed size
+    size_t output_size =
+        sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(apir_rpc_tensor);
+    output.resize(output_size, 0);
+    // use memcpy for all stores: casting the byte buffer to uint32_t* /
+    // apir_rpc_tensor* and storing through it is unaligned-access UB
+    uint8_t * out = output.data();
+    memcpy(out, &n_nodes, sizeof(n_nodes));
+    out += sizeof(n_nodes);
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        uint64_t id = reinterpret_cast<uint64_t>(cgraph->nodes[i]);
+        memcpy(out, &id, sizeof(id));
+        out += sizeof(id);
+    }
+    memcpy(out, &n_tensors, sizeof(n_tensors));
+    out += sizeof(n_tensors);
+    memcpy(out, tensors.data(), n_tensors * sizeof(apir_rpc_tensor));
+}
--- /dev/null
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+message(STATUS "Enable the VirtGPU/Virglrenderer backend library")
+
+ggml_add_backend_library(ggml-virtgpu-backend
+    backend.cpp
+    backend-dispatched.cpp
+    backend-dispatched-backend.cpp
+    backend-dispatched-device.cpp
+    backend-dispatched-buffer.cpp
+    backend-dispatched-buffer-type.cpp
+    shared/api_remoting.h
+    shared/apir_backend.h
+    shared/apir_cs.h
+    apir_cs_ggml-rpc-back.cpp)
+
+# Portable request for C++20; a raw -std=c++20 flag bypasses CMake's standard
+# handling and is compiler-specific.
+target_compile_features(ggml-virtgpu-backend PRIVATE cxx_std_20)
+
+# Add include directory for ggml-backend-impl.h and other core headers
+target_include_directories(ggml-virtgpu-backend PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../..)
--- /dev/null
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+#include "shared/apir_cs_rpc.h"
+
+#include <cinttypes>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+// Set of buffers allocated through this backend; used to validate buffer
+// handles coming from the guest before dereferencing them.
+std::unordered_set<ggml_backend_buffer_t> backend_buffers;
+
+// Record a buffer so its handle can later be validated.
+void apir_track_backend_buffer(ggml_backend_buffer_t buffer) {
+    backend_buffers.insert(buffer);
+}
+
+// Forget a tracked buffer; returns false if it was never tracked.
+bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer) {
+    return backend_buffers.erase(buffer) > 0;
+}
+
+// Return a snapshot (copy) of the currently tracked buffers.
+std::unordered_set<ggml_backend_buffer_t> apir_get_track_backend_buffers() {
+    return backend_buffers;
+}
+
+// Rebuild a ggml_tensor from its wire representation. The buffer handle is
+// validated against the tracked-buffer set; tensor->data, serialized as an
+// offset, is rebased onto the buffer base and bounds-checked.
+ggml_tensor * apir_deserialize_tensor(ggml_context * ctx, const apir_rpc_tensor * tensor) {
+    ggml_tensor * result =
+        ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = tensor->nb[i];
+    }
+    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+    if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
+        // use the ggml logger, not raw printf, for consistency with the rest
+        // of the backend
+        GGML_LOG_WARN("%s: host buffer not found %p\n", __func__, (void *) result->buffer);
+        result->buffer = nullptr;
+    }
+
+    uint64_t tensor_data = tensor->data;
+    if (result->buffer) {
+        // require that the tensor data does not go beyond the buffer end
+        uint64_t tensor_size  = (uint64_t) ggml_nbytes(result);
+        uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+        uint64_t buffer_size  = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+
+        // tensor->data is serialized as an offset to the buffer base address
+        tensor_data += buffer_start;
+
+        GGML_ASSERT(tensor_data + tensor_size >= tensor_data);  // check for overflow
+        GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size);
+    }
+
+    result->op = (ggml_op) tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result->op_params[i] = tensor->op_params[i];
+    }
+    result->flags = tensor->flags;
+    result->data = reinterpret_cast<void *>(tensor_data);
+    ggml_set_name(result, tensor->name);
+    return result;
+}
+
+// Materialize the tensor with the given wire id (plus its transitive sources)
+// inside `ctx`. `tensor_map` memoizes already-created tensors so shared
+// sub-graphs are built once.
+ggml_tensor * apir_create_node(uint64_t id,
+                               ggml_context * ctx,
+                               const std::unordered_map<uint64_t, const apir_rpc_tensor *> & tensor_ptrs,
+                               std::unordered_map<uint64_t, ggml_tensor *> & tensor_map) {
+    if (id == 0) {
+        // id 0 encodes a null pointer on the wire
+        return nullptr;
+    }
+    if (tensor_map.find(id) != tensor_map.end()) {
+        return tensor_map[id];
+    }
+    // throws std::out_of_range if the guest sent a node id with no matching
+    // tensor record
+    const apir_rpc_tensor * tensor = tensor_ptrs.at(id);
+    ggml_tensor *           result = apir_deserialize_tensor(ctx, tensor);
+    if (result == nullptr) {
+        return nullptr;
+    }
+    // register before recursing so self-referencing graphs terminate
+    tensor_map[id] = result;
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        result->src[i] = apir_create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
+    }
+    result->view_src = apir_create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
+    result->view_offs = tensor->view_offs;
+    return result;
+}
+
+// Rebuild a compute graph from the wire format produced by the frontend's
+// apir_serialize_graph. The returned graph and all its tensors live in a
+// ggml_context allocated here.
+// NOTE(review): the context is never ggml_free'd — confirm the caller's
+// intended lifetime for the returned graph.
+ggml_cgraph * apir_deserialize_graph(uint32_t n_nodes,
+                                     uint32_t n_tensors,
+                                     const apir_rpc_tensor * tensors,
+                                     const uint64_t * nodes) {
+    size_t buf_size = ggml_tensor_overhead() * (n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+    ggml_init_params params = {
+        /*.mem_size   =*/buf_size,
+        /*.mem_buffer =*/NULL,
+        /*.no_alloc   =*/true,
+    };
+    ggml_context * ctx = ggml_init(params);
+    ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+    graph->n_nodes = n_nodes;
+    std::unordered_map<uint64_t, const apir_rpc_tensor *> tensor_ptrs;
+    for (uint32_t i = 0; i < n_tensors; i++) {
+        tensor_ptrs[tensors[i].id] = &tensors[i];
+    }
+    std::unordered_map<uint64_t, ggml_tensor *> tensor_map;
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        // ids are serialized as uint64_t (see apir_serialize_graph); memcpy
+        // avoids any alignment assumption on `nodes`
+        uint64_t id;
+        memcpy(&id, &nodes[i], sizeof(id));
+        graph->nodes[i] = apir_create_node(id, ctx, tensor_ptrs, tensor_map);
+    }
+
+    return graph;
+}
--- /dev/null
+#pragma once
+
+#include "shared/apir_backend.h"
+
+// Convenience wrapper used by the dispatch code.
+#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name)
+
+static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+    // in the backend, the buffer handle is the buffer pointer
+    return (apir_buffer_host_handle_t) buffer;
+}
+
+static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+    // in the backend, the buffer-type handle is the buffer-type pointer
+    return (apir_buffer_type_host_handle_t) buft;
+}
--- /dev/null
+#include "backend-dispatched.h"
+#include "backend-virgl-apir.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "shared/apir_backend.h"
+
+#include <cstdint>
+
+// Decode a serialized compute graph from guest shared memory, run it on the
+// proxied backend, and encode the resulting ggml_status.
+// Note: both `enc` and `ctx` ARE used here — do not mark them GGML_UNUSED.
+uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    // cache whether the backend is asynchronous; queried once from the device
+    static bool async_backend_initialized = false;
+    static bool async_backend;
+
+    if (!async_backend_initialized) {
+        ggml_backend_dev_props props;
+
+        dev->iface.get_props(dev, &props);
+        async_backend = props.caps.async;
+        async_backend_initialized = true;
+    }
+
+    uint32_t shmem_res_id;
+    apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+    const void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
+    if (!shmem_data) {
+        GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
+        apir_decoder_set_fatal(dec);
+        return 1;
+    }
+    size_t cgraph_size;
+    apir_decode_size_t(dec, &cgraph_size);
+
+    // the graph itself is decoded out of the shared-memory region, not the
+    // command stream
+    apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size);
+
+    ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size);
+
+    ggml_status status;
+#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
+    // optionally pre-validate every node; abort the whole graph on the first
+    // unsupported op
+    for (int idx = 0; idx < cgraph->n_nodes; idx++) {
+        ggml_tensor * op = ggml_graph_node(cgraph, idx);
+        if (dev->iface.supports_op(dev, op)) {
+            continue;
+        }
+        GGML_LOG_ERROR("Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
+
+        status = GGML_STATUS_ABORTED;
+        apir_encode_ggml_status(enc, &status);
+
+        return 0;
+    }
+#endif
+    status = bck->iface.graph_compute(bck, cgraph);
+
+    if (async_backend) {
+        // the reply implies completion, so async backends must be drained
+        bck->iface.synchronize(bck);
+    }
+
+    apir_encode_ggml_status(enc, &status);
+
+    return 0;
+}
--- /dev/null
+#include "backend-dispatched.h"
+#include "backend-virgl-apir.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+
+#include <cstdint>
+
+// Each dispatcher below decodes a buffer-type handle from the guest command
+// stream, forwards the call to the proxied backend's buffer-type interface,
+// and encodes the result. Return value 0 means the reply was encoded.
+
+// Reply: NUL-terminated name string (length-prefixed).
+uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    ggml_backend_buffer_type_t buft;
+    buft = apir_decode_ggml_buffer_type(dec);
+
+    const char * string = buft->iface.get_name(buft);
+
+    const size_t string_size = strlen(string) + 1;
+    apir_encode_array_size(enc, string_size);
+    apir_encode_char_array(enc, string, string_size);
+
+    return 0;
+}
+
+// Reply: required allocation alignment (size_t).
+uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    ggml_backend_buffer_type_t buft;
+    buft = apir_decode_ggml_buffer_type(dec);
+
+    size_t value = buft->iface.get_alignment(buft);
+    apir_encode_size_t(enc, &value);
+
+    return 0;
+}
+
+// Reply: maximum allocatable buffer size (size_t).
+uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    ggml_backend_buffer_type_t buft;
+    buft = apir_decode_ggml_buffer_type(dec);
+
+    size_t value = buft->iface.get_max_size(buft);
+    apir_encode_size_t(enc, &value);
+
+    return 0;
+}
+
+// Reply: whether buffers of this type live in host memory (bool).
+uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    ggml_backend_buffer_type_t buft;
+    buft = apir_decode_ggml_buffer_type(dec);
+
+    bool is_host = buft->iface.is_host(buft);
+    apir_encode_bool_t(enc, &is_host);
+
+    return 0;
+}
+
+// Decodes a size, allocates a buffer of that type, replies with its handle,
+// and tracks the buffer (if allocation succeeded) for later handle validation.
+uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    ggml_backend_buffer_type_t buft;
+    buft = apir_decode_ggml_buffer_type(dec);
+
+    size_t size;
+    apir_decode_size_t(dec, &size);
+
+    ggml_backend_buffer_t buffer;
+
+    buffer = buft->iface.alloc_buffer(buft, size);
+
+    apir_encode_ggml_buffer(enc, buffer);
+
+    if (buffer) {
+        apir_track_backend_buffer(buffer);
+    }
+
+    return 0;
+}
+
+// Decodes a tensor (in place) and replies with the allocation size it would
+// need in this buffer type.
+uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    ggml_backend_buffer_type_t buft;
+    buft = apir_decode_ggml_buffer_type(dec);
+
+    const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);
+
+    size_t value = buft->iface.get_alloc_size(buft, op);
+
+    apir_encode_size_t(enc, &value);
+
+    return 0;
+}
--- /dev/null
+#include "backend-dispatched.h"
+#include "backend-virgl-apir.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+
+#include <cstdint>
+
+// Decodes a buffer handle and replies with its base address (as an integer
+// the frontend uses to rebase tensor-data offsets).
+uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    ggml_backend_buffer_t buffer;
+    buffer = apir_decode_ggml_buffer(dec);
+
+    uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer);
+    apir_encode_uintptr_t(enc, &base);
+
+    return 0;
+}
+
+// Copy tensor data from guest shared memory into a backend buffer.
+// `ctx` IS used (shmem lookup) — only `enc` is genuinely unused: this command
+// sends no reply payload.
+uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(enc);
+
+    ggml_backend_buffer_t buffer;
+    buffer = apir_decode_ggml_buffer(dec);
+
+    ggml_tensor * tensor;
+    // safe to remove the const qualifier here
+    tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
+
+    uint32_t shmem_res_id;
+    apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+    size_t offset;
+    apir_decode_size_t(dec, &offset);
+
+    size_t size;
+    apir_decode_size_t(dec, &size);
+
+    void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
+
+    if (!shmem_data) {
+        GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
+        return 1;
+    }
+
+    buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size);
+
+    return 0;
+}
+
+// Copy tensor data from a backend buffer into guest shared memory.
+// `ctx` IS used (shmem lookup) — only `enc` is genuinely unused: the data
+// travels back through shared memory, not the reply stream.
+uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(enc);
+
+    ggml_backend_buffer_t buffer;
+    buffer = apir_decode_ggml_buffer(dec);
+
+    const ggml_tensor * tensor;
+    tensor = apir_decode_ggml_tensor(dec);
+
+    uint32_t shmem_res_id;
+    apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+    size_t offset;
+    apir_decode_size_t(dec, &offset);
+
+    size_t size;
+    apir_decode_size_t(dec, &size);
+
+    void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
+    if (!shmem_data) {
+        GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
+        return 1;
+    }
+
+    buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size);
+
+    return 0;
+}
+
+// Copy a tensor between two tensors of the same buffer; replies with the
+// backend's success flag.
+uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+
+    ggml_backend_buffer_t buffer;
+    buffer = apir_decode_ggml_buffer(dec);
+
+    const ggml_tensor * src;
+    // safe to remove the const qualifier here
+    src = apir_decode_ggml_tensor(dec);
+    ggml_tensor * dst = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
+
+    bool ret = buffer->iface.cpy_tensor(buffer, src, (ggml_tensor *) dst);
+
+    apir_encode_bool_t(enc, &ret);
+
+    return 0;
+}
+
+// Fill an entire buffer with a byte value; no reply payload.
+uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(enc);
+
+    ggml_backend_buffer_t buffer;
+    buffer = apir_decode_ggml_buffer(dec);
+
+    uint8_t value;
+    apir_decode_uint8_t(dec, &value);
+
+    buffer->iface.clear(buffer, value);
+
+    return 0;
+}
+
+// Free a tracked buffer. A handle that was never tracked is rejected (return 1)
+// without being dereferenced — guards against stale/forged guest handles.
+uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(enc);
+
+    ggml_backend_buffer_t buffer;
+    buffer = apir_decode_ggml_buffer(dec);
+
+    if (!apir_untrack_backend_buffer(buffer)) {
+        GGML_LOG_WARN("%s: unknown buffer %p\n", __func__, (void *) buffer);
+        return 1;
+    }
+
+    buffer->iface.free_buffer(buffer);
+
+    return 0;
+}
--- /dev/null
+#include "backend-dispatched.h"
+#include "backend-virgl-apir.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+
+#include <cstdint>
+
+// Reply: number of devices exposed by the proxied registry (int32).
+uint32_t backend_device_get_device_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dec);
+
+    int32_t dev_count = reg->iface.get_device_count(reg);
+    apir_encode_int32_t(enc, &dev_count);
+
+    return 0;
+}
+
+// Kept as a separate dispatch entry for protocol compatibility; identical to
+// backend_device_get_device_count, so delegate instead of duplicating the body.
+uint32_t backend_device_get_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    return backend_device_get_device_count(enc, dec, ctx);
+}
+
+// Reply: NUL-terminated device name (length-prefixed).
+uint32_t backend_device_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dec);
+
+    const char * string = dev->iface.get_name(dev);
+
+    const size_t string_size = strlen(string) + 1;
+    apir_encode_array_size(enc, string_size);
+    apir_encode_char_array(enc, string, string_size);
+
+    return 0;
+}
+
+// Reply: NUL-terminated device description (length-prefixed).
+uint32_t backend_device_get_description(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dec);
+
+    const char * string = dev->iface.get_description(dev);
+
+    const size_t string_size = strlen(string) + 1;
+    apir_encode_array_size(enc, string_size);
+    apir_encode_char_array(enc, string, string_size);
+
+    return 0;
+}
+
+// Reply: the device's ggml_backend_dev_type as a uint32.
+uint32_t backend_device_get_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dec);
+
+    uint32_t type = dev->iface.get_type(dev);
+    apir_encode_uint32_t(enc, &type);
+
+    return 0;
+}
+
+// Reply: free then total device memory (two size_t values, in that order).
+uint32_t backend_device_get_memory(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dec);
+
+    size_t free, total;
+    dev->iface.get_memory(dev, &free, &total);
+
+    apir_encode_size_t(enc, &free);
+    apir_encode_size_t(enc, &total);
+
+    return 0;
+}
+
+// Decodes an op tensor (in place) and replies whether the device supports it.
+uint32_t backend_device_supports_op(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+
+    const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);
+
+    bool supports_op = dev->iface.supports_op(dev, op);
+
+    apir_encode_bool_t(enc, &supports_op);
+
+    return 0;
+}
+
+// Reply: handle of the device's default buffer type.
+uint32_t backend_device_get_buffer_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dec);
+
+    ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev);
+
+    apir_encode_ggml_buffer_type(enc, bufft);
+
+    return 0;
+}
+
+// Reply: the four capability flags of ggml_backend_dev_props, encoded in the
+// fixed order async, host_buffer, buffer_from_host_ptr, events.
+uint32_t backend_device_get_props(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dec);
+
+    ggml_backend_dev_props props;
+    dev->iface.get_props(dev, &props);
+
+    apir_encode_bool_t(enc, &props.caps.async);
+    apir_encode_bool_t(enc, &props.caps.host_buffer);
+    apir_encode_bool_t(enc, &props.caps.buffer_from_host_ptr);
+    apir_encode_bool_t(enc, &props.caps.events);
+
+    return 0;
+}
+
+// Wrap a guest shared-memory region in a backend buffer and reply with the
+// buffer handle plus its buffer-type handle.
+// Both `ctx` and `dec` are used — do not mark them GGML_UNUSED.
+uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
+    uint32_t shmem_res_id;
+    apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+    void * shmem_ptr = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
+    if (!shmem_ptr) {
+        GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
+        apir_decoder_set_fatal(dec);
+        return 1;
+    }
+
+    size_t size;
+    apir_decode_size_t(dec, &size);
+    size_t max_tensor_size;
+    apir_decode_size_t(dec, &max_tensor_size);
+
+    ggml_backend_buffer_t buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size);
+
+    // keep the reply shape stable even on failure, but never dereference a
+    // NULL buffer (the original read buffer->buft before the NULL check)
+    apir_encode_ggml_buffer(enc, buffer);
+    apir_encode_ggml_buffer_type(enc, buffer ? buffer->buft : nullptr);
+
+    if (buffer) {
+        apir_track_backend_buffer(buffer);
+    }
+
+    return 0;
+}
--- /dev/null
+#include "backend-dispatched.h"
+#include "backend-virgl-apir.h"
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+
+#include <cstdint>
+
+// Global handles: the remoting backend proxies exactly one registry, one
+// device (index 0) and one backend instance.
+ggml_backend_reg_t reg = NULL;
+ggml_backend_dev_t dev = NULL;
+ggml_backend_t bck = NULL;
+
+uint64_t timer_start = 0;
+uint64_t timer_total = 0;
+uint64_t timer_count = 0;
+
+// Resolve the registry/device/backend triple from the registration function
+// passed in by the loader. Idempotent: a second call is a no-op warning.
+uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
+    if (reg != NULL) {
+        GGML_LOG_WARN("%s: already initialized\n", __func__);
+        return APIR_BACKEND_INITIALIZE_ALREADY_INITED;
+    }
+    ggml_backend_reg_t (*ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p;
+
+    reg = ggml_backend_reg_fct();
+    if (reg == NULL) {
+        GGML_LOG_ERROR("%s: backend registration failed\n", __func__);
+        return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
+    }
+
+    if (!reg->iface.get_device_count(reg)) {
+        GGML_LOG_ERROR("%s: backend initialization failed: no device found\n", __func__);
+        return APIR_BACKEND_INITIALIZE_NO_DEVICE;
+    }
+
+    dev = reg->iface.get_device(reg, 0);
+
+    if (!dev) {
+        GGML_LOG_ERROR("%s: backend initialization failed: no device received\n", __func__);
+        return APIR_BACKEND_INITIALIZE_NO_DEVICE;
+    }
+
+    bck = dev->iface.init_backend(dev, NULL);
+    // a NULL backend would crash later in graph_compute; fail fast instead
+    if (bck == NULL) {
+        GGML_LOG_ERROR("%s: backend initialization failed: init_backend returned NULL\n", __func__);
+        return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
+    }
+
+    return APIR_BACKEND_INITIALIZE_SUCCESS;
+}
--- /dev/null
+#pragma once
+
+/* device */
+uint32_t backend_device_get_device_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_get_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_get_description(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_get_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_get_memory(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_supports_op(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_get_buffer_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_get_props(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+
+/* buffer-type */
+uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+
+/* buffer */
+uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+
+/* backend */
+uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+
+// Human-readable name for a dispatch command, for logging/tracing.
+// Keep the cases in sync with apir_backend_dispatch_table below.
+static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) {
+    switch (type) {
+        /* device */
+        case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
+            return "backend_device_get_device_count";
+        case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
+            return "backend_device_get_count";
+        case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
+            return "backend_device_get_name";
+        case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
+            return "backend_device_get_description";
+        case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
+            return "backend_device_get_type";
+        case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
+            return "backend_device_get_memory";
+        case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
+            return "backend_device_supports_op";
+        case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
+            return "backend_device_get_buffer_type";
+        case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
+            return "backend_device_get_props";
+        case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
+            return "backend_device_buffer_from_ptr";
+        /* buffer-type */
+        case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
+            return "backend_buffer_type_get_name";
+        case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
+            return "backend_buffer_type_get_alignment";
+        case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
+            return "backend_buffer_type_get_max_size";
+        case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
+            return "backend_buffer_type_is_host";
+        case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
+            return "backend_buffer_type_alloc_buffer";
+        case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
+            return "backend_buffer_type_get_alloc_size";
+        /* buffer */
+        case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
+            return "backend_buffer_get_base";
+        case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
+            return "backend_buffer_set_tensor";
+        case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
+            return "backend_buffer_get_tensor";
+        case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
+            return "backend_buffer_cpy_tensor";
+        case APIR_COMMAND_TYPE_BUFFER_CLEAR:
+            return "backend_buffer_clear";
+        case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
+            return "backend_buffer_free_buffer";
+        /* backend */
+        case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
+            return "backend_backend_graph_compute";
+
+        default:
+            return "unknown";
+    }
+}
+
+// Dispatch table indexed by ApirBackendCommandType: entry order MUST match the
+// enum declaration order (the comment on each line records the expected slot).
+extern "C" {
+static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {
+
+    /* device */
+
+    /* APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT = */ backend_device_get_device_count,
+    /* APIR_COMMAND_TYPE_DEVICE_GET_COUNT = */ backend_device_get_count,
+    /* APIR_COMMAND_TYPE_DEVICE_GET_NAME = */ backend_device_get_name,
+    /* APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = */ backend_device_get_description,
+    /* APIR_COMMAND_TYPE_DEVICE_GET_TYPE = */ backend_device_get_type,
+    /* APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = */ backend_device_get_memory,
+    /* APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = */ backend_device_supports_op,
+    /* APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = */ backend_device_get_buffer_type,
+    /* APIR_COMMAND_TYPE_DEVICE_GET_PROPS = */ backend_device_get_props,
+    /* APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = */ backend_device_buffer_from_ptr,
+
+    /* buffer-type */
+
+    /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = */ backend_buffer_type_get_name,
+    /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = */ backend_buffer_type_get_alignment,
+    /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = */ backend_buffer_type_get_max_size,
+    /* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host,
+    /* APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = */ backend_buffer_type_alloc_buffer,
+    /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = */ backend_buffer_type_get_alloc_size,
+
+    /* buffer */
+
+    /* APIR_COMMAND_TYPE_BUFFER_GET_BASE = */ backend_buffer_get_base,
+    /* APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = */ backend_buffer_set_tensor,
+    /* APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = */ backend_buffer_get_tensor,
+    /* APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR = */ backend_buffer_cpy_tensor,
+    /* APIR_COMMAND_TYPE_BUFFER_CLEAR = */ backend_buffer_clear,
+    /* APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = */ backend_buffer_free_buffer,
+
+    /* backend */
+
+    /* APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = */ backend_backend_graph_compute,
+};
+}
--- /dev/null
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+
+#include <ggml-backend.h>
+
+#include "backend-convert.h"
+#include "backend-virgl-apir.h"
+#include "shared/apir_backend.h"
+#include "shared/apir_cs.h"
+#include "shared/apir_cs_ggml.h"
+
+// Per-call context handed to every dispatch handler: the virglrenderer
+// context id plus the callback interface provided by the host/VMM side.
+struct virgl_apir_context {
+    uint32_t ctx_id;
+    virgl_apir_callbacks * iface;
+};
+
+// Signature of a dispatch handler: decode arguments from `dec`, encode the
+// reply into `enc`, return an APIR forward-style status code.
+typedef uint32_t (*backend_dispatch_t)(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+
+#include "backend-dispatched.gen.h"
+
+// Initializes the dispatch layer from the ggml backend registration symbol
+// (obtained with dlsym); returns an APIR_BACKEND_INITIALIZE_* code
+// (see shared/apir_backend.h).
+uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p);
--- /dev/null
+#pragma once
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "shared/api_remoting.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+
+// Globals populated during backend initialization (defined elsewhere):
+// the ggml backend registry, the selected device and the backend instance.
+extern ggml_backend_reg_t reg;
+extern ggml_backend_dev_t dev;
+extern ggml_backend_t bck;
+
+// Callback interface that virglrenderer exposes to this backend library.
+struct virgl_apir_callbacks {
+    // Returns the value of a configuration key for the given virgl context,
+    // or NULL when the key is not set (see apir_backend_initialize).
+    const char * (*get_config)(uint32_t virgl_ctx_id, const char * key);
+    // presumably maps a virgl shared-memory resource id to a host pointer —
+    // TODO confirm against the virglrenderer side of the protocol.
+    void * (*get_shmem_ptr)(uint32_t virgl_ctx_id, uint32_t res_id);
+};
+
+// Entry points exported to virglrenderer (C linkage).
+extern "C" {
+ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs);
+void apir_backend_deinit(uint32_t virgl_ctx_id);
+// Decodes one command from [dec_cur, dec_end), encodes its reply into
+// [enc_cur, enc_end) and stores the final write position in *enc_cur_after.
+uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
+                                 virgl_apir_callbacks * virgl_cbs,
+                                 uint32_t cmd_type,
+                                 char * dec_cur,
+                                 const char * dec_end,
+                                 char * enc_cur,
+                                 const char * enc_end,
+                                 char ** enc_cur_after);
+}
--- /dev/null
+#include "backend-dispatched.h"
+#include "backend-virgl-apir.h"
+
+#include "shared/api_remoting.h"
+#include "shared/apir_backend.h"
+#include "shared/apir_cs.h"
+
+#include <dlfcn.h>
+#include <ggml-backend.h>
+
+#include <iostream>
+
+#define APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH"
+#define APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG"
+#define APIR_LLAMA_CPP_LOG_TO_FILE_ENV "APIR_LLAMA_CPP_LOG_TO_FILE"
+
+#define GGML_DEFAULT_BACKEND_REG "ggml_backend_init"
+
+// Handle of the dlopen()ed ggml backend library; NULL while not loaded.
+static void * backend_library_handle = NULL;
+// Log sink when APIR_LLAMA_CPP_LOG_TO_FILE is set; NULL otherwise.
+static FILE * apir_logfile = NULL;
+
+// ggml log callback: appends "[level] text" to the FILE passed as user_data
+// and flushes immediately so logs survive a crash.
+static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) {
+    FILE * logfile = (FILE *)user_data;
+    fprintf(logfile, "[%d] %s", level, text);
+    fflush(logfile);
+}
+
+extern "C" {
+void apir_backend_deinit(uint32_t virgl_ctx_id) {
+    GGML_UNUSED(virgl_ctx_id);
+
+    // Release every backend buffer still tracked so the backend library can be
+    // unloaded cleanly (iterates over a copy of the tracking set).
+    auto buffers = apir_get_track_backend_buffers();
+    for (const auto & buffer : buffers) {
+        apir_untrack_backend_buffer(buffer);
+        buffer->iface.free_buffer(buffer);
+    }
+
+    if (dev) {
+        size_t free, total;
+        dev->iface.get_memory(dev, &free, &total);
+        // %zu: `free` is a size_t; %ld is wrong on LLP64 platforms (Windows)
+        GGML_LOG_INFO("%s: free memory: %zu MB\n", __func__, free / 1024 / 1024);
+    }
+
+    if (backend_library_handle) {
+        GGML_LOG_INFO("%s: The GGML backend library was loaded. Unloading it.\n", __func__);
+        dlclose(backend_library_handle);
+        backend_library_handle = NULL;
+    }
+
+    if (apir_logfile) {
+        fclose(apir_logfile);
+        apir_logfile = NULL;
+    }
+}
+
+// Configuration keys looked up through virgl_cbs->get_config(); the VMM is
+// expected to forward the APIR_LLAMA_CPP_GGML_LIBRARY_* env vars under these keys.
+#define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path"
+#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
+
+// Loads the ggml backend library named by the VMM configuration, resolves its
+// registration symbol and initializes the dispatch layer.
+ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) {
+    // Optional: redirect ggml logging to a file for debugging inside the VMM.
+    const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV);
+    if (apir_log_to_file) {
+        apir_logfile = fopen(apir_log_to_file, "w");
+        if (apir_logfile) {
+            ggml_log_set(log_to_file_callback, apir_logfile);
+        } else {
+            GGML_LOG_INFO("Could not open the log file at '%s'\n", apir_log_to_file);
+        }
+    }
+
+    const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
+    const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY);
+    // library_reg always falls back to the default symbol name, so it is never
+    // NULL (the previous NULL check on it was dead code).
+    const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
+
+    if (!library_name) {
+        GGML_LOG_ERROR("cannot open the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
+
+        return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
+    }
+
+    backend_library_handle = dlopen(library_name, RTLD_LAZY);
+
+    if (!backend_library_handle) {
+        GGML_LOG_ERROR("cannot open the GGML library: %s\n", dlerror());
+
+        return APIR_LOAD_LIBRARY_CANNOT_OPEN;
+    }
+
+    // Clear any stale error state before dlsym(), per the POSIX dlerror()
+    // error-checking protocol.
+    dlerror();
+    void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
+    const char * dlsym_error = dlerror();
+    if (dlsym_error) {
+        GGML_LOG_ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s\n", library_reg,
+                       APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
+
+        // do not keep a handle we will never use
+        dlclose(backend_library_handle);
+        backend_library_handle = NULL;
+
+        return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
+    }
+
+    uint32_t ret = backend_dispatch_initialize(ggml_backend_reg_fct);
+
+    // backend init codes are offset past the loader's own error codes
+    return (ApirLoadLibraryReturnCode) (APIR_LOAD_LIBRARY_INIT_BASE_INDEX + ret);
+}
+
+// Decodes and executes one forwarded command. The decoder reads the request
+// from [dec_cur, dec_end); the reply is encoded into [enc_cur, enc_end) and
+// *enc_cur_after receives the final write position.
+uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
+                                 virgl_apir_callbacks * virgl_cbs,
+                                 uint32_t cmd_type,
+                                 char * dec_cur,
+                                 const char * dec_end,
+                                 char * enc_cur,
+                                 const char * enc_end,
+                                 char ** enc_cur_after) {
+    // Validate the guest-supplied command id before doing any other work.
+    // %u: cmd_type is unsigned.
+    if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
+        GGML_LOG_ERROR("Received an invalid dispatch index (%u >= %u)\n",
+                       cmd_type, (uint32_t) APIR_BACKEND_DISPATCH_TABLE_COUNT);
+        return APIR_BACKEND_FORWARD_INDEX_INVALID;
+    }
+
+    apir_encoder enc = {
+        .cur = enc_cur,
+        .start = enc_cur,
+        .end = enc_end,
+        .fatal = false,
+    };
+
+    apir_decoder dec = {
+        .cur = dec_cur,
+        .end = dec_end,
+        .fatal = false,
+    };
+
+    virgl_apir_context ctx = {
+        .ctx_id = virgl_ctx_id,
+        .iface = virgl_cbs,
+    };
+
+    backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type];
+    uint32_t ret = forward_fct(&enc, &dec, &ctx);
+
+    // report how far the reply encoder advanced
+    *enc_cur_after = enc.cur;
+
+    return ret;
+}
+}
--- /dev/null
+#pragma once
+
+/* the rest of this file must match virglrenderer/src/apir-protocol.h */
+
+#include <unistd.h>
+
+#include <cstdint>
+
+#define APIR_PROTOCOL_MAJOR 0
+#define APIR_PROTOCOL_MINOR 1
+
+#define APIR_HANDSHAKE_MAGIC 0xab1e
+
+enum ApirCommandType {
+ APIR_COMMAND_TYPE_HANDSHAKE = 0,
+ APIR_COMMAND_TYPE_LOADLIBRARY = 1,
+ APIR_COMMAND_TYPE_FORWARD = 2,
+
+ APIR_COMMAND_TYPE_LENGTH = 3,
+};
+
+typedef uint64_t ApirCommandFlags;
+
+enum ApirLoadLibraryReturnCode {
+ APIR_LOAD_LIBRARY_SUCCESS = 0,
+ APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1,
+ APIR_LOAD_LIBRARY_ALREADY_LOADED = 2,
+ APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3,
+ APIR_LOAD_LIBRARY_CANNOT_OPEN = 4,
+ APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5,
+ APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code
+};
+
+enum ApirForwardReturnCode {
+ APIR_FORWARD_SUCCESS = 0,
+ APIR_FORWARD_NO_DISPATCH_FCT = 1,
+ APIR_FORWARD_TIMEOUT = 2,
+
+ APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code
+} ;
+
+// Human-readable name of a top-level APIR command, for logging.
+__attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) {
+    if (type == APIR_COMMAND_TYPE_HANDSHAKE) {
+        return "HandShake";
+    }
+    if (type == APIR_COMMAND_TYPE_LOADLIBRARY) {
+        return "LoadLibrary";
+    }
+    if (type == APIR_COMMAND_TYPE_FORWARD) {
+        return "Forward";
+    }
+    return "unknown";
+}
+
+// Maps a LoadLibrary return code to its enumerator name, for logging.
+__attribute__((unused)) static const char * apir_load_library_error(ApirLoadLibraryReturnCode code) {
+    switch (code) {
+    case APIR_LOAD_LIBRARY_SUCCESS:
+        return "APIR_LOAD_LIBRARY_SUCCESS";
+    case APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR:
+        return "APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR";
+    case APIR_LOAD_LIBRARY_ALREADY_LOADED:
+        return "APIR_LOAD_LIBRARY_ALREADY_LOADED";
+    case APIR_LOAD_LIBRARY_ENV_VAR_MISSING:
+        return "APIR_LOAD_LIBRARY_ENV_VAR_MISSING";
+    case APIR_LOAD_LIBRARY_CANNOT_OPEN:
+        return "APIR_LOAD_LIBRARY_CANNOT_OPEN";
+    case APIR_LOAD_LIBRARY_SYMBOL_MISSING:
+        return "APIR_LOAD_LIBRARY_SYMBOL_MISSING";
+    case APIR_LOAD_LIBRARY_INIT_BASE_INDEX:
+        return "APIR_LOAD_LIBRARY_INIT_BASE_INDEX";
+    default:
+        return "Unknown APIR_COMMAND_TYPE_LoadLibrary error";
+    }
+}
+
+// Maps a Forward return code to its enumerator name, for logging.
+__attribute__((unused)) static const char * apir_forward_error(ApirForwardReturnCode code) {
+    switch (code) {
+    case APIR_FORWARD_SUCCESS:
+        return "APIR_FORWARD_SUCCESS";
+    case APIR_FORWARD_NO_DISPATCH_FCT:
+        return "APIR_FORWARD_NO_DISPATCH_FCT";
+    case APIR_FORWARD_TIMEOUT:
+        return "APIR_FORWARD_TIMEOUT";
+    case APIR_FORWARD_BASE_INDEX:
+        return "APIR_FORWARD_BASE_INDEX";
+    default:
+        return "Unknown APIR_COMMAND_TYPE_FORWARD error";
+    }
+}
--- /dev/null
+// Command ids understood by apir_backend_dispatcher(). The numeric values
+// double as indices into apir_backend_dispatch_table, so they must stay
+// contiguous, zero-based, and in the same order as that table.
+typedef enum ApirBackendCommandType {
+
+    /* device */
+    APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT = 0,
+    APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 1,
+    APIR_COMMAND_TYPE_DEVICE_GET_NAME = 2,
+    APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 3,
+    APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 4,
+    APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 5,
+    APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 6,
+    APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 7,
+    APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 8,
+    APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 9,
+
+    /* buffer-type */
+    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 10,
+    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 11,
+    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 12,
+    APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 13,
+    APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 14,
+    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = 15,
+
+    /* buffer */
+    APIR_COMMAND_TYPE_BUFFER_GET_BASE = 16,
+    APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 17,
+    APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 18,
+    APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR = 19,
+    APIR_COMMAND_TYPE_BUFFER_CLEAR = 20,
+    APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 21,
+
+    /* backend */
+    APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 22,
+
+    // last command_type index + 1
+    APIR_BACKEND_DISPATCH_TABLE_COUNT = 23,
+} ApirBackendCommandType;
--- /dev/null
+#pragma once
+
+#include "apir_backend.gen.h"
+
+#include <stdint.h> // for uintptr_t
+#include <time.h> // for timespec, clock_gettime
+
+// Return codes of the APIR backend library initialization.
+#define APIR_BACKEND_INITIALIZE_SUCCESS 0
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2
+#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3
+#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4
+#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5
+#define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6
+#define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7
+#define APIR_BACKEND_INITIALIZE_NO_DEVICE 8
+
+
+// new entries here need to be added to the apir_backend_initialize_error function below
+
+// NOTE(review): numerically collides with APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED (6),
+// but the two constants belong to different return-code spaces (forward vs initialize).
+#define APIR_BACKEND_FORWARD_INDEX_INVALID 6
+
+// 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received
+#define APIR_BACKEND_CHECK_SUPPORTS_OP 0
+
+// Host pointers are transported between guest and host as integer handles.
+typedef uintptr_t apir_buffer_type_host_handle_t;
+typedef uintptr_t apir_buffer_host_handle_t;
+
+// Maps an APIR_BACKEND_INITIALIZE_* code to its macro name, for logging.
+static const char * apir_backend_initialize_error(int code) {
+#define APIR_BACKEND_INITIALIZE_ERROR(code_name) \
+    do { \
+        if (code == code_name) \
+            return #code_name; \
+    } while (0)
+
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_SUCCESS);
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY);
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY);
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS);
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS);
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED);
+    // these three codes were defined above but missing here
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED);
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED);
+    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE);
+
+    return "Unknown APIR_BACKEND_INITIALIZE error:/";
+
+#undef APIR_BACKEND_INITIALIZE_ERROR
+}
--- /dev/null
+#pragma once
+
+#include "ggml-impl.h"
+
+#include <cassert>
+#include <cstring>
+
+// Branch-prediction hints (GCC/Clang builtins).
+// NOTE(review): bare `likely`/`unlikely` macros in a shared header can collide
+// with other headers or the C++20 [[likely]] attributes — consider prefixing.
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+// Bump-pointer writer over a caller-provided buffer [start, end).
+// `fatal` latches any error; callers check it after a batch of operations.
+struct apir_encoder {
+    char * cur;
+    const char * start;
+    const char * end;
+    bool fatal;
+
+};
+
+// Bump-pointer reader over a caller-provided buffer [cur, end).
+struct apir_decoder {
+    const char * cur;
+    const char * end;
+    bool fatal;
+};
+
+/*
+ * new encoder and decoder
+ */
+
+// Builds a decoder spanning [ptr, ptr + size).
+static apir_decoder apir_new_decoder(const char * ptr, size_t size) {
+    apir_decoder dec;
+    dec.cur = ptr;
+    dec.end = ptr + size;
+    dec.fatal = false;
+    return dec;
+}
+
+// Builds an encoder spanning [ptr, ptr + size); `start` records the origin so
+// the number of encoded bytes can be computed later.
+static apir_encoder apir_new_encoder(char * ptr, size_t size) {
+    apir_encoder enc;
+    enc.cur = ptr;
+    enc.start = ptr;
+    enc.end = ptr + size;
+    enc.fatal = false;
+    return enc;
+}
+
+/*
+ * fatal flag handling
+ */
+
+// Fatal-flag accessors: the flag is sticky until explicitly reset, so a whole
+// sequence of encode/decode calls can be checked once at the end.
+static inline void apir_encoder_reset_fatal(apir_encoder * enc) {
+    enc->fatal = false;
+}
+
+static inline void apir_encoder_set_fatal(apir_encoder * enc) {
+    enc->fatal = true;
+}
+
+static inline bool apir_encoder_get_fatal(const apir_encoder * enc) {
+    return enc->fatal;
+}
+
+static inline void apir_decoder_reset_fatal(apir_decoder * dec) {
+    dec->fatal = false;
+}
+
+static inline void apir_decoder_set_fatal(apir_decoder * dec) {
+    dec->fatal = true;
+}
+
+static inline bool apir_decoder_get_fatal(const apir_decoder * dec) {
+    return dec->fatal;
+}
+
+/*
+ * decode peek
+ */
+
+// Copies val_size bytes from the decoder into `val` WITHOUT advancing the
+// cursor. `size` is the full (padded) field size that must be available.
+// On underflow: sets the fatal flag, zeroes `val`, returns false.
+static inline bool apir_decoder_peek_internal(apir_decoder * dec,
+                                              size_t size,
+                                              void * val,
+                                              size_t val_size) {
+    assert(val_size <= size);
+
+    if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+        GGML_LOG_ERROR("reading too much from the decoder ...\n");
+        apir_decoder_set_fatal(dec);
+        memset(val, 0, val_size);
+        return false;
+    }
+
+    /* we should not rely on the compiler to optimize away memcpy... */
+    memcpy(val, dec->cur, val_size);
+    return true;
+}
+
+// Peek without advancing; errors are only reported through the fatal flag.
+static inline void apir_decoder_peek(apir_decoder * dec, size_t size, void * val, size_t val_size) {
+    apir_decoder_peek_internal(dec, size, val, val_size);
+}
+
+// Returns a pointer to `size` bytes inside the decoder buffer and advances the
+// cursor past them (zero-copy read). Returns NULL and sets the fatal flag on
+// underflow — callers must handle NULL.
+static inline const void * apir_decoder_use_inplace(apir_decoder * dec, size_t size) {
+    if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+        GGML_LOG_ERROR("reading too much from the decoder ...\n");
+        apir_decoder_set_fatal(dec);
+        return NULL;
+    }
+    const void * addr = dec->cur;
+    dec->cur += size;
+
+    return addr;
+}
+
+/*
+ * read/write
+ */
+
+// Reads val_size bytes into `val` and advances the cursor by `size` (the
+// padded field size). On underflow the cursor is left unchanged and the
+// decoder's fatal flag is set.
+static inline void apir_decoder_read(apir_decoder * dec, size_t size, void * val, size_t val_size) {
+    if (apir_decoder_peek_internal(dec, size, val, val_size)) {
+        dec->cur += size;
+    }
+}
+
+// Writes val_size bytes and advances the cursor by `size` (the padded field
+// size). Returns the write address, or NULL on overflow.
+static inline char * apir_encoder_write(apir_encoder * enc, size_t size, const void * val, size_t val_size) {
+    assert(val_size <= size);
+    assert(size <= ((size_t) (enc->end - enc->cur)));
+
+    // The asserts vanish under NDEBUG; keep a runtime bounds check so an
+    // undersized destination marks the encoder fatal instead of overflowing
+    // the buffer, mirroring apir_decoder_peek_internal().
+    if (unlikely(size > (size_t) (enc->end - enc->cur))) {
+        GGML_LOG_ERROR("writing too much into the encoder ...\n");
+        apir_encoder_set_fatal(enc);
+        return NULL;
+    }
+
+    char * write_addr = enc->cur;
+    /* we should not rely on the compiler to optimize away memcpy... */
+    memcpy(write_addr, val, val_size);
+    enc->cur += size;
+
+    return write_addr;
+}
+
+/*
+ * encode/decode
+ */
+
+// Generic decode: `size` must be 4-byte aligned to keep the stream aligned.
+static inline void apir_decode(apir_decoder * dec, size_t size, void * data, size_t data_size) {
+    assert(size % 4 == 0);
+    apir_decoder_read(dec, size, data, data_size);
+}
+
+// Generic encode: `size` must be 4-byte aligned to keep the stream aligned.
+static inline void apir_encode(apir_encoder * enc, size_t size, const void * data, size_t data_size) {
+    assert(size % 4 == 0);
+    apir_encoder_write(enc, size, data, data_size);
+}
+
+/*
+ * typed encode/decode
+ */
+
+/* uint8_t */
+
+// uint8_t occupies a full int-sized slot on the wire to keep alignment.
+static inline void apir_encode_uint8_t(apir_encoder * enc, const uint8_t * val) {
+    apir_encode(enc, sizeof(int), val, sizeof(*val));
+}
+
+static inline void apir_decode_uint8_t(apir_decoder * dec, uint8_t * val) {
+    apir_decode(dec, sizeof(int), val, sizeof(*val));
+}
+
+/* uint64_t */
+
+static inline void apir_encode_uint64_t(apir_encoder * enc, const uint64_t * val) {
+    apir_encode(enc, 8, val, sizeof(*val));
+}
+
+static inline void apir_decode_uint64_t(apir_decoder * dec, uint64_t * val) {
+    apir_decode(dec, 8, val, sizeof(*val));
+}
+
+// arrays are encoded back-to-back, no length prefix (callers encode the count
+// separately); the assert is a cheap size_t-overflow guard on the multiply
+static inline void apir_encode_uint64_t_array(apir_encoder * enc, const uint64_t * val, uint32_t count) {
+    const size_t size = sizeof(*val) * count;
+    assert(size >= count);
+    apir_encode(enc, size, val, size);
+}
+
+static inline void apir_decode_uint64_t_array(apir_decoder * dec, uint64_t * val, uint32_t count) {
+    const size_t size = sizeof(*val) * count;
+    assert(size >= count);
+    apir_decode(dec, size, val, size);
+}
+
+// zero-copy variant; returns NULL (with fatal set) on underflow
+static inline const uint64_t * apir_decode_uint64_t_array_inplace(apir_decoder * dec, uint32_t count) {
+    return (uint64_t *) (uintptr_t) apir_decoder_use_inplace(dec, count * sizeof(uint64_t));
+}
+
+/* int32_t */
+
+static inline void apir_encode_int32_t(apir_encoder * enc, const int32_t * val) {
+    apir_encode(enc, 4, val, sizeof(*val));
+}
+
+static inline void apir_decode_int32_t(apir_decoder * dec, int32_t * val) {
+    apir_decode(dec, 4, val, sizeof(*val));
+}
+
+static inline void apir_encode_int32_t_array(apir_encoder * enc, const int32_t * val, uint32_t count) {
+    const size_t size = sizeof(*val) * count;
+    assert(size >= count);
+    apir_encode(enc, size, val, size);
+}
+
+static inline void apir_decode_int32_t_array(apir_decoder * dec, int32_t * val, uint32_t count) {
+    const size_t size = sizeof(*val) * count;
+    assert(size >= count);
+    apir_decode(dec, size, val, size);
+}
+
+/* array size (uint64_t) */
+
+static inline void apir_encode_array_size(apir_encoder * enc, uint64_t size) {
+    apir_encode_uint64_t(enc, &size);
+}
+
+// Decodes an array-size prefix and checks it against the expected value;
+// a mismatch sets the fatal flag and returns 0.
+static inline uint64_t apir_decode_array_size(apir_decoder * dec, uint64_t expected_size) {
+    uint64_t size;
+    apir_decode_uint64_t(dec, &size);
+    if (size != expected_size) {
+        GGML_LOG_ERROR("Couldn't decode array from the decoder\n");
+        apir_decoder_set_fatal(dec);
+        size = 0;
+    }
+    return size;
+}
+
+static inline uint64_t apir_decode_array_size_unchecked(apir_decoder * dec) {
+    uint64_t size;
+    apir_decode_uint64_t(dec, &size);
+    return size;
+}
+
+/* non-array pointer */
+
+// Encodes pointer presence (1/0) only, never the pointer value itself.
+static inline bool apir_encode_simple_pointer(apir_encoder * enc, const void * val) {
+    apir_encode_array_size(enc, val ? 1 : 0);
+    return val;
+}
+
+static inline bool apir_decode_simple_pointer(apir_decoder * dec) {
+    return apir_decode_array_size_unchecked(dec);
+}
+
+/* uint32_t */
+
+static inline void apir_encode_uint32_t(apir_encoder * enc, const uint32_t * val) {
+    apir_encode(enc, 4, val, sizeof(*val));
+}
+
+static inline void apir_decode_uint32_t(apir_decoder * dec, uint32_t * val) {
+    apir_decode(dec, 4, val, sizeof(*val));
+}
+
+static inline void apir_encode_uint32_t_array(apir_encoder * enc, const uint32_t * val, uint32_t count) {
+    const size_t size = sizeof(*val) * count;
+    assert(size >= count);
+    apir_encode(enc, size, val, size);
+}
+
+static inline void apir_decode_uint32_t_array(apir_decoder * dec, uint32_t * val, uint32_t count) {
+    const size_t size = sizeof(*val) * count;
+    assert(size >= count);
+    apir_decode(dec, size, val, size);
+}
+
+/* size_t */
+
+// size_t always travels as uint64_t so 32- and 64-bit peers interoperate.
+static inline void apir_encode_size_t(apir_encoder * enc, const size_t * val) {
+    const uint64_t tmp = *val;
+    apir_encode_uint64_t(enc, &tmp);
+}
+
+static inline void apir_decode_size_t(apir_decoder * dec, size_t * val) {
+    uint64_t tmp;
+    apir_decode_uint64_t(dec, &tmp);
+    *val = tmp;
+}
+
+// fast path when size_t is 64-bit (constant condition, folded at compile time)
+static inline void apir_encode_size_t_array(apir_encoder * enc, const size_t * val, uint32_t count) {
+    if (sizeof(size_t) == sizeof(uint64_t)) {
+        apir_encode_uint64_t_array(enc, (const uint64_t *) val, count);
+    } else {
+        for (uint32_t i = 0; i < count; i++) {
+            apir_encode_size_t(enc, &val[i]);
+        }
+    }
+}
+
+static inline void apir_decode_size_t_array(apir_decoder * dec, size_t * val, uint32_t count) {
+    if (sizeof(size_t) == sizeof(uint64_t)) {
+        apir_decode_uint64_t_array(dec, (uint64_t *) val, count);
+    } else {
+        for (uint32_t i = 0; i < count; i++) {
+            apir_decode_size_t(dec, &val[i]);
+        }
+    }
+}
+
+/* opaque blob */
+
+// Blobs are padded up to a 4-byte boundary to keep the stream aligned.
+static inline void apir_encode_blob_array(apir_encoder * enc, const void * val, size_t size) {
+    apir_encode(enc, (size + 3) & ~3, val, size);
+}
+
+static inline void apir_decode_blob_array(apir_decoder * dec, void * val, size_t size) {
+    apir_decode(dec, (size + 3) & ~3, val, size);
+}
+
+/* string */
+
+// `size` is the buffer capacity; the string must be NUL-terminated within it.
+static inline void apir_encode_char_array(apir_encoder * enc, const char * val, size_t size) {
+    assert(size && strlen(val) < size);
+    apir_encode_blob_array(enc, val, size);
+}
+
+static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t size) {
+    apir_decode_blob_array(dec, val, size);
+    if (size) {
+        // force NUL termination regardless of what was received
+        val[size - 1] = '\0';
+    } else {
+        // fixed typo in the log message: "blog" -> "blob"
+        GGML_LOG_ERROR("Couldn't decode the blob array\n");
+        apir_decoder_set_fatal(dec);
+    }
+}
+
+/* (temp) buffer allocation */
+
+// Overflow-checked malloc(size * count); returns NULL on overflow or OOM.
+// Ownership transfers to the caller, who must free() it.
+static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
+    size_t alloc_size;
+    if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
+        GGML_LOG_ERROR("overflow in array allocation of %zu * %zu bytes\n", size, count);
+        return NULL;
+    }
+
+    return malloc(alloc_size);
+}
+
+/* bool */
+
+// bool occupies an int-sized slot on the wire to keep alignment.
+static inline void apir_encode_bool_t(apir_encoder * enc, const bool * val) {
+    apir_encode(enc, sizeof(int), val, sizeof(bool));
+}
+
+static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) {
+    apir_decode(dec, sizeof(int), val, sizeof(bool));
+}
+
+/* apir_buffer_type_host_handle_t */
+
+// NOTE(review): these handles are uintptr_t, so their wire size follows the
+// local pointer width — assumes guest and host have the same pointer size;
+// confirm for mixed 32/64-bit setups.
+static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
+                                                              const apir_buffer_type_host_handle_t * val) {
+    apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
+                                                              apir_buffer_type_host_handle_t * val) {
+    apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+/* apir_buffer_host_handle_t */
+
+static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc,
+                                                         const apir_buffer_host_handle_t * val) {
+    apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+static inline void apir_decode_apir_buffer_host_handle_t(apir_decoder * dec, apir_buffer_host_handle_t * val) {
+    apir_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+/* uintptr_t */
+
+static inline void apir_encode_uintptr_t(apir_encoder * enc, const uintptr_t * val) {
+    apir_encode(enc, sizeof(*val), val, sizeof(*val));
+}
+
+static inline void apir_decode_uintptr_t(apir_decoder * dec, uintptr_t * val) {
+    apir_decode(dec, sizeof(*val), val, sizeof(*val));
+}
--- /dev/null
+#include "ggml-impl.h"
+#include "apir_cs.h"
+#include "apir_cs_rpc.h"
+
+// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
+
+static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc,
+                                                       const apir_buffer_host_handle_t * handle);
+
+static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
+
+/* apir_rpc_tensor */
+
+// Encodes a serialized tensor record verbatim.
+// NOTE(review): "rcp" in the name looks like a typo for "rpc"; renaming would
+// require updating all callers (e.g. apir_encode_ggml_tensor below).
+static inline void apir_encode_rcp_tensor(apir_encoder * enc, const apir_rpc_tensor * apir_rpc_tensor) {
+    size_t apir_rpc_tensor_size = sizeof(*apir_rpc_tensor);
+    apir_encode(enc, apir_rpc_tensor_size, apir_rpc_tensor, apir_rpc_tensor_size);
+}
+
+// Zero-copy decode of one tensor record; returns NULL (fatal set) on underflow.
+static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder * dec) {
+    size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor);
+
+    return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
+}
+
+// Zero-copy decode of n_tensors consecutive records.
+static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec,
+                                                                          uint32_t n_tensors) {
+    size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors;
+
+    return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
+}
+
+/* ggml_tensor */
+
+// Serializes a ggml_tensor into the RPC wire form, then encodes it.
+static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor * tensor) {
+    apir_rpc_tensor serialized = apir_serialize_tensor(tensor);
+
+    apir_encode_rcp_tensor(enc, &serialized);
+}
+
+// Decodes one tensor into a freshly created ggml context.
+// NOTE(review): the ggml_context allocated here is never freed and cannot be,
+// since the returned tensor lives inside it — the context (and its
+// tensor-overhead allocation) leaks on every call. Confirm intended lifetime.
+static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
+    const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec);
+    ggml_init_params params{
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context * ctx = ggml_init(params);
+
+    const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor);
+
+    return tensor;
+}
+
+/* *** ggml_backend_buffer_type_t *** */
+
+// ggml_backend_buffer_type_t is a POINTER (to a struct).
+// Only the host pointer is shared between the host and guest.
+// The guest stores it in `buft->context`.
+// The host simply writes the pointer address in the buffer variable.
+
+// Encodes a buffer type as its host-side handle (see comment block above).
+static inline void apir_encode_ggml_buffer_type(apir_encoder * enc, ggml_backend_buffer_type_t buft) {
+    apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);
+    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+// Host side: the handle IS the host pointer, so it converts back directly.
+static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decoder * dec) {
+    apir_buffer_type_host_handle_t handle;
+
+    apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+    return (ggml_backend_buffer_type_t) handle;
+}
+
+// Guest side: keep the opaque handle without converting it to a pointer.
+static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) {
+    apir_buffer_type_host_handle_t handle;
+
+    apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+    return handle;
+}
+
+/* *** ggml_backend_type_t *** */
+
+// ggml_backend_buffer_t is a POINTER.
+// same logic as for ggml_backend_buffer_type_t
+
+static inline void apir_encode_ggml_buffer(apir_encoder * enc, const ggml_backend_buffer_t buffer) {
+    apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer);
+    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+// Decodes a buffer handle back into the host-side buffer pointer.
+static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) {
+    ggml_backend_buffer_t buffer;
+    size_t buffer_ptr_size = sizeof(buffer);
+
+    apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
+
+    return buffer;
+}
+
+/* enum ggml_status */
+
+static inline void apir_encode_ggml_status(apir_encoder * enc, const ggml_status * status) {
+    apir_encoder_write(enc, sizeof(*status), status, sizeof(*status));
+}
+
+static inline void apir_decode_ggml_status(apir_decoder * dec, ggml_status * status) {
+    apir_decoder_read(dec, sizeof(*status), status, sizeof(*status));
+}
+
+/* virtgpu_shmem */
+
+// Shared-memory regions travel as their virtgpu resource id only; the peer
+// resolves the id to a mapping via get_shmem_ptr.
+static inline void apir_encode_virtgpu_shmem_res_id(apir_encoder * enc, uint32_t shmem_res_id) {
+    apir_encode_uint32_t(enc, &shmem_res_id);
+}
+
+static inline void apir_decode_virtgpu_shmem_res_id(apir_decoder * dec, uint32_t * shmem_res_id) {
+    apir_decode_uint32_t(dec, shmem_res_id);
+}
+
+/* ggml_cgraph */
+
+// Serializes a compute graph into `cgraph_data` and returns the byte count.
+static inline size_t apir_serialize_ggml_cgraph(ggml_cgraph * cgraph, std::vector<uint8_t> & cgraph_data) {
+    apir_serialize_graph(cgraph, cgraph_data);
+
+    return cgraph_data.size();
+}
+
+// Copies the pre-serialized graph bytes into the encoder stream.
+// NOTE(review): apir_encode asserts size % 4 == 0 — assumes the serializer
+// always produces a 4-byte-aligned blob; confirm in apir_serialize_graph.
+static inline void apir_encode_cgraph_data(apir_encoder * enc, std::vector<uint8_t> & cgraph_data) {
+    size_t cgraph_size = cgraph_data.size();
+
+    apir_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size);
+}
+
+// Wire layout (must match apir_serialize_graph):
+//   u32 n_nodes, u64 node_ids[n_nodes], u32 n_tensors, apir_rpc_tensor[n_tensors]
+static inline ggml_cgraph * apir_decode_ggml_cgraph(apir_decoder * dec, size_t cgraph_size) {
+    GGML_UNUSED(cgraph_size);
+
+    uint32_t n_nodes;
+    apir_decode_uint32_t(dec, &n_nodes);
+    const uint64_t * nodes = apir_decode_uint64_t_array_inplace(dec, n_nodes);
+
+    uint32_t n_tensors;
+    apir_decode_uint32_t(dec, &n_tensors);
+    const apir_rpc_tensor * tensors = apir_decode_apir_rpc_tensor_array_inplace(dec, n_tensors);
+
+    return apir_deserialize_graph(n_nodes, n_tensors, tensors, nodes);
+}
+
+// Encodes the handle VALUE pointed to by `handle`.
+// BUG FIX: the previous code passed `&handle` — the address of the parameter
+// itself — so it serialized a transient stack address instead of the handle
+// (compare with apir_encode_ggml_buffer above, which correctly takes the
+// address of a by-value handle).
+static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle) {
+    apir_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle));
+}
+
+// Encodes a tensor by copying the raw ggml_tensor struct, followed (in this
+// exact order) by: the buffer handle (if any), the view_src tensor struct
+// (if any), then each src tensor struct until the first NULL src.
+// apir_decode_ggml_tensor_inplace() must consume fields in the same order.
+static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml_tensor * tensor) {
+    size_t tensor_size = sizeof(*tensor);
+
+    // `extra` is backend-private and cannot be transported
+    if (tensor->extra) {
+        GGML_ABORT("Cannot pass tensors with extra");
+    }
+
+    if (tensor->src[0] && tensor->buffer) {
+        // warn only once to avoid flooding the log
+        static int first = 1;
+        if (first) {
+            GGML_LOG_WARN("Cannot pass tensors with src and buffer\n");
+            first = 0;
+        }
+    }
+
+    apir_encoder_write(enc, tensor_size, tensor, tensor_size);
+
+    // tensor->data is a pointer inside the device buffer. No need to touch it
+    // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence.
+    // (could also make a copy of the tensor, and update locally.)
+
+    if (tensor->buffer) {
+        apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
+        apir_encode_ggml_buffer_handle(enc, &buffer_handle);
+    }
+
+    if (tensor->view_src) {
+        apir_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
+    }
+
+    // srcs are encoded up to (excluding) the first NULL entry; the decoder
+    // relies on the same convention
+    for (int i = 0; tensor->src[i]; i++) {
+        const ggml_tensor * tensor_src = tensor->src[i];
+        apir_encoder_write(enc, tensor_size, tensor_src, tensor_size);
+    }
+}
+
+// Decodes a tensor encoded by apir_encode_ggml_tensor_inline(), consuming the
+// same field sequence: tensor struct, optional buffer handle, optional
+// view_src struct, then src structs until the first NULL src pointer.
+// Returns NULL (decoder fatal flag set) if the stream is exhausted.
+static inline const ggml_tensor * apir_decode_ggml_tensor_inplace(apir_decoder * dec) {
+    // it safe to remove the `const` qualifier here, we *do* want to
+    // modify the shared memory data to fix the `src` pointers.
+    ggml_tensor * tensor = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+    if (unlikely(!tensor)) {
+        // underflow: apir_decoder_use_inplace already set the fatal flag;
+        // previously this fell through and dereferenced NULL below
+        return NULL;
+    }
+
+    // tensor->data is a pointer inside the device buffer. No need to touch it
+    // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
+    if (tensor->buffer) {
+        tensor->buffer = apir_decode_ggml_buffer(dec);
+    }
+
+    if (tensor->view_src) {
+        ggml_tensor * tensor_view_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+        tensor->view_src = tensor_view_src;
+    }
+
+    for (int i = 0; tensor->src[i]; i++) {
+        ggml_tensor * tensor_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+        tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor
+    }
+
+    return tensor;
+}
--- /dev/null
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <cstdint>
+
+// ggml_tensor is serialized into apir_rpc_tensor
+// ggml_tensor is serialized into apir_rpc_tensor
+// Fixed-layout wire representation: pointers become 64-bit ids/handles and all
+// fields are explicitly sized, so the record can be copied verbatim on and off
+// the wire by the apir_cs helpers.
+struct apir_rpc_tensor {
+    uint64_t id;
+    uint32_t type;
+    uint64_t buffer;
+    uint32_t ne[GGML_MAX_DIMS];
+    uint32_t nb[GGML_MAX_DIMS];
+    uint32_t op;
+    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+    int32_t flags;
+    uint64_t src[GGML_MAX_SRC];
+    uint64_t view_src;
+    uint64_t view_offs;
+    uint64_t data;
+    char name[GGML_MAX_NAME];
+
+    // keeps the struct size a multiple of 8 for stable cross-compiler layout
+    char padding[4];
+};
+
+/* frontend */
+
+// Converts a live ggml_tensor into its wire representation.
+apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor);
+
+// Serializes a whole compute graph into `output`.
+void apir_serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output);
+
+/* backend */
+
+// Registry of backend buffers still alive, used for cleanup at deinit time
+// (see apir_backend_deinit).
+void apir_track_backend_buffer(ggml_backend_buffer_t buffer);
+bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer);
+std::unordered_set<ggml_backend_buffer_t> apir_get_track_backend_buffers();
+
+// Appends `tensor` (and, presumably, its dependencies — confirm in the
+// implementation) to `tensors`, using `visited` to avoid duplicates.
+void apir_add_tensor(ggml_tensor * tensor,
+                     std::vector<apir_rpc_tensor> & tensors,
+                     std::unordered_set<ggml_tensor *> & visited);
+
+// Rebuilds a ggml_tensor inside `ctx` from its wire representation.
+ggml_tensor * apir_deserialize_tensor(ggml_context * ctx, const apir_rpc_tensor * tensor);
+
+// Recreates the graph node with the given id, memoizing results in tensor_map.
+ggml_tensor * apir_create_node(uint64_t id,
+                               ggml_context * ctx,
+                               const std::unordered_map<uint64_t, const apir_rpc_tensor *> & tensor_ptrs,
+                               std::unordered_map<uint64_t, ggml_tensor *> & tensor_map);
+
+// Rebuilds a full compute graph from the decoded node-id and tensor arrays.
+ggml_cgraph * apir_deserialize_graph(uint32_t n_nodes,
+                                     uint32_t n_tensors,
+                                     const apir_rpc_tensor * tensors,
+                                     const uint64_t * nodes);
--- /dev/null
+#include "ggml-remoting.h"
+
+// Allocates a remoting buffer. When the host device supports
+// buffer_from_host_ptr, the buffer is backed by guest shared memory (mmap'ed
+// locally, so `base` is known immediately); otherwise it is allocated on the
+// host and `base` stays NULL until queried remotely.
+static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                            size_t size) {
+    virtgpu * gpu = BUFT_TO_GPU(buft);
+
+    // freed when the buffer is released — TODO confirm against the buffer
+    // interface's free_buffer implementation
+    ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
+    if (!context) {
+        GGML_ABORT("Couldn't allocate the buffer context ...");
+    }
+
+    context->gpu = gpu;
+
+    // only the buffer_from_host_ptr capability matters here
+    bool async__unused, host_buffer__unused, events__unused;
+    bool buffer_from_host_ptr;
+    apir_device_get_props(gpu, &async__unused, &host_buffer__unused, &buffer_from_host_ptr, &events__unused);
+
+    if (buffer_from_host_ptr) {
+        context->apir_context = apir_device_buffer_from_ptr(gpu, size, size);
+        context->base = context->apir_context.shmem.mmap_ptr;
+        context->is_from_ptr = true;
+    } else {
+        context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size);
+        context->is_from_ptr = false;
+        context->base = NULL;
+    }
+
+    ggml_backend_buffer_t buffer =
+        ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size);
+
+    return buffer;
+}
+
+// Buffer-type name, as reported by the host device.
+static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return apir_buffer_type_get_name(BUFT_TO_GPU(buft), buft);
+}
+
+// Required allocation alignment. The host round-trip is done once and cached.
+// NOTE(review): the cache is a function-local static shared by every buft and
+// its zero-check/write is not synchronized — fine if all devices report the
+// same alignment and calls are single-threaded at init; confirm.
+static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    virtgpu * gpu = BUFT_TO_GPU(buft);
+
+    static size_t align = 0;
+
+    if (align == 0) {
+        align = apir_buffer_type_get_alignment(gpu, buft);
+    }
+
+    return align;
+}
+
+// Maximum single-buffer size supported by the host, cached after the first
+// query (same shared-static caveat as get_alignment above).
+static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    virtgpu * gpu = BUFT_TO_GPU(buft);
+
+    static size_t max_size = 0;
+    if (max_size == 0) {
+        max_size = apir_buffer_type_get_max_size(gpu, buft);
+    }
+
+    return max_size;
+}
+
+// Forwarded is_host query.
+// NOTE(review): currently unreferenced — both interface tables below set
+// .is_host to NULL, so this triggers an unused-function warning; either wire
+// it into the tables or drop it.
+static bool ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    virtgpu * gpu = BUFT_TO_GPU(buft);
+
+    return apir_buffer_type_is_host(gpu, buft);
+}
+
+// Bytes needed to store `tensor` in a buffer of this type. Falls back to
+// ggml_nbytes() when the tensor is not (yet) tied to a buffer this device
+// supports; otherwise lets the host account for padding/layout.
+static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
+                                                               const ggml_tensor * tensor) {
+    virtgpu * gpu = BUFT_TO_GPU(buft);
+
+    if (tensor->buffer == NULL
+        || !tensor->buffer->context
+        || !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
+        return ggml_nbytes(tensor);
+    }
+
+    return apir_buffer_type_get_alloc_size(gpu, buft, tensor);
+}
+
+// vtable for regular (host-allocated) remoting buffers.
+// NOTE(review): .is_host is NULL even though an implementation exists above —
+// ggml then treats the type as non-host; confirm this is intentional.
+const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_remoting_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_remoting_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_remoting_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_remoting_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ ggml_backend_remoting_buffer_type_get_alloc_size,
+    /* .is_host          = */ NULL,
+};
+
+// vtable for buffers wrapped around a caller-provided host pointer; such
+// buffers are created via buffer_from_host_ptr, so .alloc_buffer stays NULL.
+const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = {
+    /* .get_name         = */ ggml_backend_remoting_buffer_type_get_name,
+    /* .alloc_buffer     = */ NULL,
+    /* .get_alignment    = */ ggml_backend_remoting_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_remoting_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ ggml_backend_remoting_buffer_type_get_alloc_size,
+    /* .is_host          = */ NULL,
+};
--- /dev/null
+#include "ggml-remoting.h"
+
+// Shortcut: virtgpu handle owning this buffer's context.
+#define BUFFER_TO_GPU(name) ((ggml_backend_remoting_buffer_context *) (name)->context)->gpu
+
+// Base address of the buffer. For shared-memory buffers `base` was set at
+// creation; for remote buffers it is fetched from the host once and cached.
+static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) buffer->context;
+    if (context->base) {
+        return context->base;
+    }
+
+    context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), BUFFER_TO_APIR_CONTEXT(buffer));
+
+    return context->base;
+}
+
+// Write `size` bytes of `data` into `tensor` at `offset`. Shared-memory
+// buffers are written directly; remote buffers go through the APIR transport.
+static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                                    ggml_tensor * tensor,
+                                                    const void * data,
+                                                    size_t offset,
+                                                    size_t size) {
+    ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer);
+
+    if (!context->is_from_ptr) {
+        apir_buffer_set_tensor(BUFFER_TO_GPU(buffer), BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size);
+        return;
+    }
+
+    memcpy((char *) tensor->data + offset, data, size);
+}
+
+// Read `size` bytes from `tensor` at `offset` into `data`; direct memcpy for
+// shared-memory buffers, host round-trip otherwise.
+static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                                    const ggml_tensor * tensor,
+                                                    void * data,
+                                                    size_t offset,
+                                                    size_t size) {
+    virtgpu * gpu = BUFFER_TO_GPU(buffer);
+    ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer);
+    if (context->is_from_ptr) {
+        memcpy(data, (const char *) tensor->data + offset, size);
+    } else {
+        apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size);
+    }
+}
+
+// set_tensor for from-ptr buffers: memory is host-shared, plain memcpy suffices.
+static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer,
+                                                             ggml_tensor * tensor,
+                                                             const void * data,
+                                                             size_t offset,
+                                                             size_t size) {
+    UNUSED(buffer);
+    memcpy((char *) tensor->data + offset, data, size);
+}
+
+// get_tensor for from-ptr buffers: memory is host-shared, plain memcpy suffices.
+static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer,
+                                                             const ggml_tensor * tensor,
+                                                             void * data,
+                                                             size_t offset,
+                                                             size_t size) {
+    UNUSED(buffer);
+
+    memcpy(data, (const char *) tensor->data + offset, size);
+}
+
+// Delegate a tensor copy to the host; returns false when the host cannot
+// perform it (the caller then falls back to get/set).
+static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
+                                                    const ggml_tensor * src,
+                                                    ggml_tensor * dst) {
+    return apir_buffer_cpy_tensor(BUFFER_TO_GPU(buffer), BUFFER_TO_APIR_CONTEXT(buffer), src, dst);
+}
+
+// Ask the host to fill the whole buffer with `value`.
+static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    apir_buffer_clear(BUFFER_TO_GPU(buffer), BUFFER_TO_APIR_CONTEXT(buffer), value);
+}
+
+// Release the host-side buffer, then the guest-side context (malloc'ed in
+// alloc_buffer / buffer_from_ptr). The gpu handle must be read before the
+// context is freed.
+static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    virtgpu * gpu = BUFFER_TO_GPU(buffer);
+
+    apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer));
+
+    ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer);
+    free(context);
+    buffer->context = NULL; // guard against use-after-free through the buffer
+}
+
+// vtable for regular remoting buffers (host round-trips for tensor I/O when
+// the buffer is not host-visible).
+const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = {
+    /* .free_buffer   = */ ggml_backend_remoting_buffer_free_buffer,
+    /* .get_base      = */ ggml_backend_remoting_buffer_get_base,
+    /* .init_tensor   = */ NULL,
+    /* .memset_tensor = */ NULL,
+    /* .set_tensor    = */ ggml_backend_remoting_buffer_set_tensor,
+    /* .get_tensor    = */ ggml_backend_remoting_buffer_get_tensor,
+    /* .cpy_tensor    = */ ggml_backend_remoting_buffer_cpy_tensor,
+    /* .clear         = */ ggml_backend_remoting_buffer_clear,
+    /* .reset         = */ NULL,
+};
+
+// vtable for from-ptr (host-shared) buffers: tensor I/O is a direct memcpy.
+// NOTE(review): free_buffer still issues an apir free for the host side —
+// confirm that is valid for buffers created from a caller pointer.
+const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = {
+    /* .free_buffer   = */ ggml_backend_remoting_buffer_free_buffer,
+    /* .get_base      = */ ggml_backend_remoting_buffer_get_base,
+    /* .init_tensor   = */ NULL,
+    /* .memset_tensor = */ NULL,
+    /* .set_tensor    = */ ggml_backend_remoting_buffer_set_tensor_from_ptr,
+    /* .get_tensor    = */ ggml_backend_remoting_buffer_get_tensor_from_ptr,
+    /* .cpy_tensor    = */ ggml_backend_remoting_buffer_cpy_tensor,
+    /* .clear         = */ ggml_backend_remoting_buffer_clear,
+    /* .reset         = */ NULL,
+};
--- /dev/null
+#include "ggml-remoting.h"
+
+// Device name, as reported by the host.
+static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
+    return apir_device_get_name(DEV_TO_GPU(dev));
+}
+
+// Human-readable device description, as reported by the host.
+static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) {
+    return apir_device_get_description(DEV_TO_GPU(dev));
+}
+
+// Device type (GPU/CPU/...), fetched once from the host and cached.
+// NOTE(review): the cache is a single static shared by all devices — wrong if
+// devices of different types coexist; also unsynchronized. Confirm.
+static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) {
+    virtgpu * gpu = DEV_TO_GPU(dev);
+
+    static enum ggml_backend_dev_type type;
+    static bool has_type = false;
+    if (!has_type) {
+        has_type = true;
+        type = (enum ggml_backend_dev_type) apir_device_get_type(gpu);
+    }
+
+    return type;
+}
+
+// Query free/total device memory (bytes) from the host.
+static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    apir_device_get_memory(DEV_TO_GPU(dev), free, total);
+}
+
+// Whether the host device can execute `op`. With USE_ALWAYS_TRUE_SUPPORTS_OP
+// the answer is unconditionally true (avoids a host round-trip per op),
+// mirroring what ggml-rpc does; the serialization caveat below explains why.
+static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+#if USE_ALWAYS_TRUE_SUPPORTS_OP == 1
+    /* ggml-rpc cheats it like this */
+    /* with the current implementation of serialize_tensor, the src/view aren't properly passed */
+    UNUSED(dev);
+    UNUSED(op);
+
+    return true;
+#else
+    virtgpu * gpu = DEV_TO_GPU(dev);
+
+    return apir_device_supports_op(gpu, op);
+#endif
+}
+
+// Only buffer types created by this very device are usable with it.
+static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return buft->device == dev;
+}
+
+// Never volunteer to offload ops from other backends to this device.
+static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    UNUSED(dev);
+    UNUSED(op);
+
+    return false;
+}
+
+// Fill ggml device properties. The host's capability flags are queried, then
+// async/events/buffer_from_host_ptr are force-disabled so ggml only uses the
+// synchronous code paths implemented by this frontend.
+static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name = ggml_backend_remoting_device_get_name(dev);
+    props->description = ggml_backend_remoting_device_get_description(dev);
+    props->type = ggml_backend_remoting_device_get_type(dev);
+    ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    virtgpu * gpu = DEV_TO_GPU(dev);
+    apir_device_get_props(gpu, &props->caps.async, &props->caps.host_buffer, &props->caps.buffer_from_host_ptr,
+                          &props->caps.events);
+
+    // override what the host advertised: these paths are not supported here
+    props->caps.buffer_from_host_ptr = false;
+    props->caps.async = false;
+    props->caps.events = false;
+}
+
+// Default buffer type for this device.
+// NOTE(review): the static is initialized from the first caller's dev/handle
+// and reused for every device — incorrect with multiple devices; confirm a
+// single-device setup is assumed.
+ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
+    virtgpu * gpu = DEV_TO_GPU(dev);
+
+    apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
+
+    static ggml_backend_buffer_type buft{
+        /* .iface   = */ ggml_backend_remoting_buffer_type_interface,
+        /* .device  = */ dev,
+        /* .context = */ (void *) ctx,
+    };
+
+    return &buft;
+}
+
+// Buffer type for from-ptr (host-shared) buffers; same first-caller static
+// singleton caveat as ggml_backend_remoting_device_get_buffer_type above.
+static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
+    virtgpu * gpu = DEV_TO_GPU(dev);
+
+    apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
+
+    static ggml_backend_buffer_type buft{
+        /* .iface   = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
+        /* .device  = */ dev,
+        /* .context = */ (void *) ctx,
+    };
+
+    return &buft;
+}
+
+// Wrap an existing guest pointer `ptr` into a backend buffer backed by host
+// shared memory; tensor I/O then bypasses the transport (see the from_ptr
+// buffer interface).
+static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev,
+                                                                          void * ptr,
+                                                                          size_t size,
+                                                                          size_t max_tensor_size) {
+    virtgpu * gpu = DEV_TO_GPU(dev);
+
+    // malloc (not new): released with free() in ggml_backend_remoting_buffer_free_buffer
+    ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
+    if (!context) {
+        GGML_ABORT("Couldn't allocate the buffer context ...");
+    }
+
+    context->gpu = gpu;
+    context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size);
+    context->base = ptr;
+    context->is_from_ptr = true;
+
+    ggml_backend_buffer_t buffer =
+        ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev),
+                                 ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size);
+
+    return buffer;
+}
+
+// Device vtable. Events and host buffer types are unimplemented (NULL), in
+// line with the capabilities forced off in get_props.
+const ggml_backend_device_i ggml_backend_remoting_device_interface = {
+    /* .get_name             = */ ggml_backend_remoting_device_get_name,
+    /* .get_description      = */ ggml_backend_remoting_device_get_description,
+    /* .get_memory           = */ ggml_backend_remoting_device_get_memory,
+    /* .get_type             = */ ggml_backend_remoting_device_get_type,
+    /* .get_props            = */ ggml_backend_remoting_device_get_props,
+    /* .init_backend         = */ ggml_backend_remoting_device_init,
+    /* .get_buffer_type      = */ ggml_backend_remoting_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_remoting_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_remoting_device_supports_buft,
+    /* .offload_op           = */ ggml_backend_remoting_device_offload_op,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
--- /dev/null
+#include "ggml-remoting.h"
+#include "ggml-virtgpu.h"
+
+#include <iostream>
+#include <mutex>
+
+// Create the process-wide virtgpu connection exactly once (mutex-guarded);
+// subsequent calls return the cached instance. Aborts the process if the
+// virtgpu cannot be created.
+static virtgpu * apir_initialize() {
+    static virtgpu * apir_gpu_instance = NULL;
+    static bool apir_initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (apir_initialized) {
+            return apir_gpu_instance;
+        }
+
+        apir_gpu_instance = create_virtgpu();
+        if (!apir_gpu_instance) {
+            GGML_ABORT("failed to initialize the virtgpu");
+        }
+
+        apir_initialized = true;
+    }
+
+    return apir_gpu_instance;
+}
+
+// Number of devices exposed by the host; 0 when the virtgpu is unavailable.
+// (apir_initialize currently aborts rather than returning NULL, so the
+// fallback path is defensive.)
+static int ggml_backend_remoting_get_device_count() {
+    virtgpu * gpu = apir_initialize();
+    if (!gpu) {
+        GGML_LOG_WARN("apir_initialize failed\n");
+        return 0;
+    }
+
+    return apir_device_get_count(gpu);
+}
+
+// Registry adapter around ggml_backend_remoting_get_device_count().
+static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+    return ggml_backend_remoting_get_device_count();
+}
+
+// Devices discovered by ggml_backend_remoting_reg_init_devices; entries are
+// heap-allocated and live for the process lifetime.
+static std::vector<ggml_backend_dev_t> devices;
+
+// Look up an already-initialized device by index; asserts on out-of-range.
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+// Populate the global `devices` vector, once. Guarded twice: a fast unlocked
+// size check, then a mutex + `initialized` flag for the actual fill.
+// NOTE(review): the unlocked devices.size() probe can race with a concurrent
+// first call — confirm registration is single-threaded.
+static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
+    if (devices.size() > 0) {
+        GGML_LOG_INFO("%s: already initialized\n", __func__);
+        return;
+    }
+
+    virtgpu * gpu = apir_initialize();
+    if (!gpu) {
+        GGML_LOG_ERROR("apir_initialize failed\n");
+        return;
+    }
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) {
+                ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context;
+                char desc[256] = "API Remoting device";
+
+                ctx->device = i;
+                ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
+                ctx->description = desc; // copied into the std::string member
+                ctx->gpu = gpu;
+
+                // leaked on purpose: devices live for the process lifetime
+                ggml_backend_dev_t dev = new ggml_backend_device{
+                    /* .iface   = */ ggml_backend_remoting_device_interface,
+                    /* .reg     = */ reg,
+                    /* .context = */ ctx,
+                };
+                devices.push_back(dev);
+            }
+            initialized = true;
+        }
+    }
+}
+
+// Registry adapter: index into the initialized device list.
+static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    UNUSED(reg);
+
+    return ggml_backend_remoting_get_device(device);
+}
+
+// Registry name shown to users (e.g. in backend listings).
+static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+
+    return GGML_REMOTING_FRONTEND_NAME;
+}
+
+// Registry vtable.
+// NOTE(review): get_proc_address is NULL — confirm every ggml call site
+// null-checks it before invoking.
+static const ggml_backend_reg_i ggml_backend_remoting_reg_i = {
+    /* .get_name         = */ ggml_backend_remoting_reg_get_name,
+    /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_remoting_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+// Entry point of the VirtGPU frontend: create (once) and return the backend
+// registry. Returns NULL if the virtgpu transport cannot be initialized.
+//
+// Fix: the original text contained the mojibake character '®' where the code
+// must read '&reg' — an HTML-entity decoding artifact ("&reg;" -> "®") that
+// does not compile. Restored `return &reg;` and `(&reg)`.
+ggml_backend_reg_t ggml_backend_virtgpu_reg() {
+    virtgpu * gpu = apir_initialize();
+    if (!gpu) {
+        GGML_LOG_ERROR("virtgpu_apir_initialize failed\n");
+        return NULL;
+    }
+
+    static ggml_backend_reg reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_remoting_reg_i,
+        /* .context     = */ gpu,
+    };
+
+    // initialize the device list only on the first call
+    static bool initialized = false;
+    if (initialized) {
+        return &reg;
+    }
+    initialized = true;
+
+    ggml_backend_remoting_reg_init_devices(&reg);
+
+    GGML_LOG_INFO("%s: initialized\n", __func__);
+
+    return &reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)
--- /dev/null
+#include "ggml-remoting.h"
+#include "../../include/ggml-virtgpu.h"
+
+// Backend instance name (distinct from the registry/device names).
+static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
+    UNUSED(backend);
+
+    return "API Remoting backend";
+}
+
+// Destroy the backend object allocated in ggml_backend_remoting_device_init.
+// The device context it points to is owned by the registry, not freed here.
+static void ggml_backend_remoting_free(ggml_backend_t backend) {
+    delete backend;
+}
+
+// Serialize the compute graph and execute it on the host, returning the
+// host-side ggml_status.
+static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    virtgpu * gpu = DEV_TO_GPU(backend->device);
+
+    return apir_backend_graph_compute(gpu, cgraph);
+}
+
+// Host-side graph optimization hook. Intentionally a no-op for now: the
+// remote path is disabled via `#if true` until it works (see #else branch).
+static void ggml_backend_remoting_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    virtgpu * gpu = DEV_TO_GPU(backend->device);
+#if true
+    UNUSED(gpu);
+    UNUSED(cgraph);
+#else
+    // not working yet
+
+    apir_backend_graph_optimize(gpu, cgraph);
+#endif
+}
+
+// Backend vtable: only synchronous graph_compute is implemented; async
+// tensor I/O, graph plans and events are left NULL.
+static ggml_backend_i ggml_backend_remoting_interface = {
+    /* .get_name           = */ ggml_backend_remoting_get_name,
+    /* .free               = */ ggml_backend_remoting_free,
+    /* .set_tensor_async   = */ NULL,  // ggml_backend_remoting_set_tensor_async,
+    /* .get_tensor_async   = */ NULL,  // ggml_backend_remoting_get_tensor_async,
+    /* .cpy_tensor_async   = */ NULL,  // ggml_backend_remoting_cpy_tensor_async,
+    /* .synchronize        = */ NULL,  // ggml_backend_remoting_synchronize,
+    /* .graph_plan_create  = */ NULL,
+    /* .graph_plan_free    = */ NULL,
+    /* .graph_plan_update  = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute      = */ ggml_backend_remoting_graph_compute,
+    /* .event_record       = */ NULL,
+    /* .event_wait         = */ NULL,
+    /* .graph_optimize     = */ ggml_backend_remoting_graph_optimize,
+};
+
+// Stable GUID identifying this backend implementation to ggml.
+static ggml_guid_t ggml_backend_remoting_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02,
+                              0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+
+    return &guid;
+}
+
+// Create a backend instance for `dev`. `params` is unused. The returned
+// object is freed by ggml_backend_remoting_free.
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) {
+    UNUSED(params);
+
+    ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *) dev->context;
+
+    ggml_backend_t remoting_backend = new ggml_backend{
+        /* .guid      = */ ggml_backend_remoting_guid(),
+        /* .interface = */ ggml_backend_remoting_interface,
+        // re-resolve the device through the registry to get the canonical handle
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_virtgpu_reg(), ctx->device),
+        /* .context   = */ ctx,
+    };
+
+    return remoting_backend;
+}
--- /dev/null
+#pragma once
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "virtgpu.h"
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes
+
+#define USE_ALWAYS_TRUE_SUPPORTS_OP 1
+#define USE_METAL_GUEST_SUPPORTS_OP 0
+
+// Accessors from ggml handles to this frontend's context structs. All of them
+// assume `->context` holds the corresponding remoting context.
+
+// ggml_backend_dev_t -> virtgpu handle
+#define DEV_TO_GPU(name) ((ggml_backend_remoting_device_context *) (name)->context)->gpu
+
+// ggml_backend_buffer_t -> guest-side buffer context
+#define BUFFER_TO_GGML_CONTEXT(name) ((ggml_backend_remoting_buffer_context *) (name)->context)
+
+// ggml_backend_buffer_t -> pointer to the embedded apir buffer context
+#define BUFFER_TO_APIR_CONTEXT(name) &((ggml_backend_remoting_buffer_context *) (name)->context)->apir_context
+
+// ggml_backend_buffer_t -> host-side buffer handle
+#define BUFFER_TO_HOST_HANDLE(name) ((ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle
+
+// device 0's context (assumes the device list is initialized)
+#define GET_DEVICE_CONTEXT() (ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context
+
+// ggml_backend_buffer_type_t -> virtgpu handle (via its owning device)
+#define BUFT_TO_GPU(name) ((ggml_backend_remoting_device_context *) (name)->device->context)->gpu
+
+// Per-device frontend state, stored in ggml_backend_device::context.
+struct ggml_backend_remoting_device_context {
+    size_t device;            // device index within the registry
+    std::string name;         // e.g. "RemotingFrontend0"
+    std::string description;
+
+    // (ptr, size, shmem) triples of shared-memory regions owned by the device
+    std::vector<std::tuple<void *, size_t, virtgpu_shmem *>> shared_memory;
+
+    virtgpu * gpu;            // transport handle (shared, not owned)
+};
+
+// Per-buffer frontend state, stored in ggml_backend_buffer::context.
+struct ggml_backend_remoting_buffer_context {
+    apir_buffer_context_t apir_context; // host-side handle + shmem mapping
+
+    virtgpu * gpu;                      // transport handle (shared, not owned)
+
+    void * base;                        // guest-visible base; NULL until resolved for remote buffers
+
+    bool is_from_ptr;                   // true: host-shared memory, tensor I/O is memcpy
+};
+
+// vtables and entry points shared between the frontend translation units
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface;
+extern const ggml_backend_device_i ggml_backend_remoting_device_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface;
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface;
+
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device);
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params);
+ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev);
+
+// Extract the host-side buffer-type handle stored in buft->context.
+static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+    // in the backend, the buffer handle is the buffer pointer
+    return (apir_buffer_type_host_handle_t) buft->context;
+}
+
+// Extract the host-side buffer handle; aborts if the buffer context was
+// already released (see ggml_backend_remoting_buffer_free_buffer).
+static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+    if (!buffer->context) {
+        GGML_ABORT("%s: no context available :/", __func__);
+    }
+    return BUFFER_TO_HOST_HANDLE(buffer);
+}
--- /dev/null
+# YAML schema for GGML remoting API functions
+# This defines the structure for generating the remoting layer code
+
+# Configuration for the generated files
+config:
+ # Base path for the generated files
+ base_path: "ggml/src"
+
+ # Header files to update
+ files:
+ apir_backend_header: "ggml-virtgpu-apir/backend/shared/apir_backend.gen.h"
+ backend_dispatched_header: "ggml-virtgpu-apir/backend/backend-dispatched.gen.h"
+ virtgpu_forward_header: "ggml-virtgpu-apir/virtgpu-forward.gen.h"
+
+# Simplified function definitions with grouping and metadata combined
+functions:
+ device:
+ group_description: "device"
+ functions:
+ get_device_count:
+ # No specific metadata - uses default void return and base params
+
+ get_count:
+ frontend_return: "int"
+
+ get_name:
+ frontend_return: "const char *"
+
+ get_description:
+ frontend_return: "const char *"
+
+ get_type:
+ frontend_return: "uint32_t"
+
+ get_memory:
+ frontend_return: "void"
+ frontend_extra_params:
+ - "size_t *free"
+ - "size_t *total"
+
+ supports_op:
+ frontend_return: "bool"
+ frontend_extra_params:
+ - "const ggml_tensor *op"
+
+ get_buffer_type:
+ frontend_return: "apir_buffer_type_host_handle_t"
+
+ get_props:
+ frontend_return: "void"
+ frontend_extra_params:
+ - "bool *async"
+ - "bool *host_buffer"
+ - "bool *buffer_from_host_ptr"
+ - "bool *events"
+
+ buffer_from_ptr:
+ frontend_return: "apir_buffer_context_t"
+ frontend_extra_params:
+ - "size_t size"
+ - "size_t max_tensor_size"
+
+ buffer_type:
+ group_description: "buffer-type"
+ functions:
+ get_name:
+ frontend_return: "const char *"
+ frontend_extra_params:
+ - "ggml_backend_buffer_type_t buft"
+
+ get_alignment:
+ frontend_return: "size_t"
+ frontend_extra_params:
+ - "ggml_backend_buffer_type_t buft"
+
+ get_max_size:
+ frontend_return: "size_t"
+ frontend_extra_params:
+ - "ggml_backend_buffer_type_t buft"
+
+ is_host:
+ frontend_return: "bool"
+ frontend_extra_params:
+ - "ggml_backend_buffer_type_t buft"
+
+ alloc_buffer:
+ frontend_return: "apir_buffer_context_t"
+ frontend_extra_params:
+ - "ggml_backend_buffer_type_t buffer_buft"
+ - "size_t size"
+
+ get_alloc_size:
+ frontend_return: "size_t"
+ frontend_extra_params:
+ - "ggml_backend_buffer_type_t buft"
+ - "const ggml_tensor *op"
+
+ buffer:
+ group_description: "buffer"
+ functions:
+ get_base:
+ frontend_return: "void *"
+ frontend_extra_params:
+ - "apir_buffer_context_t *buffer_context"
+
+ set_tensor:
+ frontend_return: "void"
+ frontend_extra_params:
+ - "apir_buffer_context_t *buffer_context"
+ - "ggml_tensor *tensor"
+ - "const void *data"
+ - "size_t offset"
+ - "size_t size"
+
+ get_tensor:
+ frontend_return: "void"
+ frontend_extra_params:
+ - "apir_buffer_context_t *buffer_context"
+ - "const ggml_tensor *tensor"
+ - "void *data"
+ - "size_t offset"
+ - "size_t size"
+
+ cpy_tensor:
+ frontend_return: "bool"
+ frontend_extra_params:
+ - "apir_buffer_context_t *buffer_context"
+ - "const ggml_tensor *src"
+ - "const ggml_tensor *dst"
+
+ clear:
+ frontend_return: "void"
+ frontend_extra_params:
+ - "apir_buffer_context_t *buffer_context"
+ - "uint8_t value"
+
+ free_buffer:
+ frontend_return: "void"
+ frontend_extra_params:
+ - "apir_buffer_context_t *buffer_context"
+
+ backend:
+ group_description: "backend"
+ functions:
+ graph_compute:
+ frontend_return: "ggml_status"
+ frontend_extra_params:
+ - "ggml_cgraph *cgraph"
+
+ graph_optimize:
+ frontend_return: "ggml_cgraph *"
+ frontend_extra_params:
+ - "ggml_cgraph *cgraph"
+ enabled: false
+
+# Naming patterns used for code generation
+naming_patterns:
+ # How to generate enum names
+ enum_prefix: "APIR_COMMAND_TYPE_"
+
+ # How to generate backend function names
+ backend_function_prefix: "backend_"
+
+ # How to generate frontend function names
+ frontend_function_prefix: "apir_"
+
+ # Standard frontend first parameter
+ frontend_base_param: "struct virtgpu *gpu"
--- /dev/null
+#pragma once
+
+#include <stdint.h>
+
+// Capability set advertised by the virglrenderer APIR host to the guest;
+// layout must stay in sync on both sides of the transport.
+struct virgl_renderer_capset_apir {
+    uint32_t apir_version;            // protocol version implemented by the host
+    uint32_t supports_blob_resources; // non-zero if blob resources are usable
+    uint32_t reserved[4];  // For future expansion
+};
--- /dev/null
+#!/usr/bin/env python3
+"""
+# Generated by Claude AI
+
+Script to completely regenerate the GGML remoting codebase from YAML configuration.
+
+This script reads api_functions.yaml and regenerates all the header files and
+implementation templates for the GGML remoting layer.
+
+Usage:
+ python regenerate_remoting.py
+
+The script will:
+1. Read ggmlremoting_functions.yaml configuration
+2. Generate updated header files
+3. Generate implementation templates in dedicated files
+4. Show a summary of what was generated
+"""
+
+import yaml
+from typing import Dict, List, Any
+from pathlib import Path
+import os
+import subprocess
+import shutil
+import logging
+
+NL = '\n' # can't have f"{'\n'}" in f-strings
+
+
+class RemotingCodebaseGenerator:
+    def __init__(self, yaml_path: str = "ggmlremoting_functions.yaml"):
+        """Initialize the generator with the YAML configuration.
+
+        Raises FileNotFoundError when `yaml_path` does not exist; KeyError if
+        the YAML lacks the 'functions'/'naming_patterns'/'config' sections.
+        """
+        self.yaml_path = yaml_path
+
+        if not Path(yaml_path).exists():
+            raise FileNotFoundError(f"Configuration file {yaml_path} not found")
+
+        with open(yaml_path, 'r') as f:
+            self.config = yaml.safe_load(f)
+
+        # the three mandatory top-level sections of the YAML file
+        self.functions = self.config['functions']
+        self.naming_patterns = self.config['naming_patterns']
+        self.config_data = self.config['config']
+
+        # Check if clang-format is available (checked once, reused per file)
+        self.clang_format_available = self._check_clang_format_available()
+
+ def _check_clang_format_available(self) -> bool:
+ """Check if clang-format is available in the system PATH."""
+ return shutil.which("clang-format") is not None
+
+    def _format_file_with_clang_format(self, file_path: Path) -> bool:
+        """Format a file with clang-format -i. Returns True if successful, False otherwise."""
+        # tool missing entirely: silently report failure (caller logs a summary)
+        if not self.clang_format_available:
+            return False
+
+        try:
+            subprocess.run(
+                ["clang-format", "-i", str(file_path)],  # -i: edit in place
+                check=True,
+                capture_output=True,
+                text=True
+            )
+            return True
+        except subprocess.CalledProcessError:
+            # clang-format ran but exited non-zero (e.g. parse failure)
+            logging.exception(f"  ⚠️  clang-format failed for {file_path}")
+            return False
+        except Exception as e:
+            # anything else (permissions, missing file, ...): log and continue
+            logging.exception(f"  ⚠️  Unexpected error formatting {file_path}: {e}")
+            return False
+
+ def generate_enum_name(self, group_name: str, function_name: str) -> str:
+ """Generate the APIR_COMMAND_TYPE enum name for a function."""
+ prefix = self.naming_patterns['enum_prefix']
+ return f"{prefix}{group_name.upper()}_{function_name.upper()}"
+
+    def generate_backend_function_name(self, group_name: str, function_name: str) -> str:
+        """Generate the backend function name.
+
+        An explicit per-function override in
+        naming_patterns.backend_function_overrides wins over the default
+        "<prefix><group>_<function>" scheme.
+        """
+        function_key = f"{group_name}_{function_name}"
+        overrides = self.naming_patterns.get('backend_function_overrides', {})
+
+        if function_key in overrides:
+            return overrides[function_key]
+
+        prefix = self.naming_patterns['backend_function_prefix']
+        return f"{prefix}{group_name}_{function_name}"
+
+ def generate_frontend_function_name(self, group_name: str, function_name: str) -> str:
+ """Generate the frontend function name."""
+ prefix = self.naming_patterns['frontend_function_prefix']
+ return f"{prefix}{group_name}_{function_name}"
+
+    def get_enabled_functions(self) -> List[Dict[str, Any]]:
+        """Get all enabled functions with their metadata.
+
+        Enum values are assigned sequentially over *enabled* functions only.
+        NOTE(review): disabling a function therefore renumbers every later
+        command id — a wire-protocol break if guest and host are regenerated
+        from different YAML revisions; confirm both sides always regenerate
+        together.
+        """
+        functions = []
+        enum_value = 0
+
+        for group_name, group_data in self.functions.items():
+            group_description = group_data['group_description']
+
+            for function_name, func_metadata in group_data['functions'].items():
+                # Handle case where func_metadata is None or empty (functions with only comments)
+                if func_metadata is None:
+                    func_metadata = {}
+
+                # Functions are enabled by default unless explicitly disabled
+                if func_metadata.get('enabled', True):
+                    functions.append({
+                        'group_name': group_name,
+                        'function_name': function_name,
+                        'enum_name': self.generate_enum_name(group_name, function_name),
+                        'enum_value': enum_value,
+                        'backend_function': self.generate_backend_function_name(group_name, function_name),
+                        'frontend_function': self.generate_frontend_function_name(group_name, function_name),
+                        'frontend_return': func_metadata.get('frontend_return', 'void'),
+                        'frontend_extra_params': func_metadata.get('frontend_extra_params', []),
+                        'group_description': group_description,
+                        'newly_added': func_metadata.get('newly_added', False)
+                    })
+                    enum_value += 1
+
+        return functions
+
+    def generate_apir_backend_header(self) -> str:
+        """Render apir_backend.gen.h: the ApirBackendCommandType enum plus the
+        APIR_BACKEND_DISPATCH_TABLE_COUNT sentinel, grouped with comments."""
+        functions = self.get_enabled_functions()
+
+        # Generate the enum section
+        enum_lines = ["typedef enum ApirBackendCommandType {"]
+        current_group = None
+
+        for func in functions:
+            # Add comment for new group
+            if func['group_name'] != current_group:
+                enum_lines.append("")
+                enum_lines.append(f"    /* {func['group_description']} */")
+                current_group = func['group_name']
+
+            enum_lines.append(f"    {func['enum_name']} = {func['enum_value']},")
+
+        # Add the count (== last command id + 1, used to size the dispatch table)
+        total_count = len(functions)
+        enum_lines.append("\n    // last command_type index + 1")
+        enum_lines.append(f"    APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},")
+        enum_lines.append("} ApirBackendCommandType;")
+
+        # Full header template
+        header_content = NL.join(enum_lines) + "\n"
+
+        return header_content
+
+    def generate_backend_dispatched_header(self) -> str:
+        """Render backend-dispatched.gen.h: backend function prototypes, a
+        command-name lookup helper, and the dispatch table indexed by
+        ApirBackendCommandType."""
+        functions = self.get_enabled_functions()
+
+        # Function declarations: all backend handlers share the same signature
+        decl_lines = []
+        current_group = None
+
+        for func in functions:
+            if func['group_name'] != current_group:
+                decl_lines.append(f"\n/* {func['group_description']} */")
+                current_group = func['group_name']
+
+            signature = "uint32_t"
+            params = "apir_encoder *enc, apir_decoder *dec, virgl_apir_context *ctx"
+            decl_lines.append(f"{signature} {func['backend_function']}({params});")
+
+        # Switch cases for backend_dispatch_command_name (debug/logging helper)
+        switch_lines = []
+        current_group = None
+
+        for func in functions:
+            if func['group_name'] != current_group:
+                switch_lines.append(f"    /* {func['group_description']} */")
+                current_group = func['group_name']
+
+            switch_lines.append(f"    case {func['enum_name']}: return \"{func['backend_function']}\";")
+
+        # Dispatch table: entry order must match the enum values exactly
+        table_lines = []
+        current_group = None
+
+        for func in functions:
+            if func['group_name'] != current_group:
+                table_lines.append(f"\n    /* {func['group_description']} */")
+                table_lines.append("")
+                current_group = func['group_name']
+
+            table_lines.append(f"    /* {func['enum_name']} = */ {func['backend_function']},")
+
+        header_content = f'''\
+#pragma once
+
+{NL.join(decl_lines)}
+
+static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
+{{
+    switch (type) {{
+{NL.join(switch_lines)}
+
+    default: return "unknown";
+    }}
+}}
+
+extern "C" {{
+static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{
+    {NL.join(table_lines)}
+}};
+}}
+'''
+        return header_content
+
+    def generate_virtgpu_forward_header(self) -> str:
+        """Render virtgpu-forward.gen.h: frontend (guest-side) prototypes,
+        each taking the base virtgpu parameter plus its declared extras."""
+        functions = self.get_enabled_functions()
+
+        decl_lines = []
+        current_group = None
+
+        for func in functions:
+            if func['group_name'] != current_group:
+                decl_lines.append("")
+                decl_lines.append(f"/* {func['group_description']} */")
+                current_group = func['group_name']
+
+            # Build parameter list: base param first, then per-function extras
+            params = [self.naming_patterns['frontend_base_param']]
+            params.extend(func['frontend_extra_params'])
+            param_str = ', '.join(params)
+
+            decl_lines.append(f"{func['frontend_return']} {func['frontend_function']}({param_str});")
+
+        header_content = f'''\
+#pragma once
+{NL.join(decl_lines)}
+'''
+        return header_content
+
+ def regenerate_codebase(self) -> None:
+ """Regenerate the entire remoting codebase."""
+ logging.info("🔄 Regenerating GGML Remoting Codebase...")
+ logging.info("=" * 50)
+
+ # Detect if we're running from frontend directory
+ current_dir = os.getcwd()
+ is_frontend_dir = current_dir.endswith('ggml-virtgpu')
+
+ if is_frontend_dir:
+ # Running from ggml/src/ggml-virtgpu-apir
+ logging.info("📍 Detected frontend directory execution")
+ frontend_base = Path(".")
+ else:
+ # Running from project root (fallback to original behavior)
+ logging.info("📍 Detected project root execution")
+ base_path = self.config_data.get('base_path', 'ggml/src')
+ frontend_base = Path(base_path) / "ggml-virtgpu"
+
+ # Compute final file paths
+ backend_base = frontend_base / "backend"
+ apir_backend_path = backend_base / "shared" / "apir_backend.gen.h"
+ backend_dispatched_path = backend_base / "backend-dispatched.gen.h"
+ virtgpu_forward_path = frontend_base / "virtgpu-forward.gen.h"
+
+ # Create output directories for each file
+ apir_backend_path.parent.mkdir(parents=True, exist_ok=True)
+ backend_dispatched_path.parent.mkdir(parents=True, exist_ok=True)
+ virtgpu_forward_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Generate header files
+ logging.info("📁 Generating header files...")
+
+ apir_backend_content = self.generate_apir_backend_header()
+ apir_backend_path.write_text(apir_backend_content)
+ logging.info(f" ✅ {apir_backend_path.resolve()}")
+
+ backend_dispatched_content = self.generate_backend_dispatched_header()
+ backend_dispatched_path.write_text(backend_dispatched_content)
+ logging.info(f" ✅ {backend_dispatched_path.resolve()}")
+
+ virtgpu_forward_content = self.generate_virtgpu_forward_header()
+ virtgpu_forward_path.write_text(virtgpu_forward_content)
+ logging.info(f" ✅ {virtgpu_forward_path.resolve()}")
+
+ # Format generated files with clang-format
+ generated_files = [apir_backend_path, backend_dispatched_path, virtgpu_forward_path]
+
+ if not self.clang_format_available:
+ logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted."
+ " Install clang-format to enable automatic code formatting.")
+ else:
+ logging.info("\n🎨 Formatting files with clang-format...")
+ for file_path in generated_files:
+ if self._format_file_with_clang_format(file_path):
+ logging.info(f" ✅ Formatted {file_path.name}")
+ else:
+ logging.warning(f" ❌ Failed to format {file_path.name}")
+
+ # Generate summary
+ functions = self.get_enabled_functions()
+ total_functions = len(functions)
+
+ logging.info("\n📊 Generation Summary:")
+ logging.info("=" * 50)
+ logging.info(f" Total functions: {total_functions}")
+ logging.info(f" Function groups: {len(self.functions)}")
+ logging.info(" Header files: 3")
+ logging.info(f" Working directory: {current_dir}")
+
+
+def main():
+ try:
+ generator = RemotingCodebaseGenerator()
+ generator.regenerate_codebase()
+ except Exception as e:
+ logging.exception(f"❌ Error: {e}")
+ exit(1)
+
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+#include "backend/shared/apir_backend.h"
+#include "ggml-alloc.h"
+#include "ggml-impl.h"
+#include "ggml.h"
+#include "virtgpu-shm.h"
+#include "virtgpu-utils.h"
+
+// Frontend-side state for one remote (host-allocated) ggml buffer.
+struct apir_buffer_context_t {
+ apir_buffer_host_handle_t host_handle; // opaque handle identifying the buffer on the host
+
+ // NOTE(review): shmem appears to be populated only by apir_device_buffer_from_ptr();
+ // buffers created via apir_buffer_type_alloc_buffer() leave it untouched — confirm.
+ struct virtgpu_shmem shmem;
+ apir_buffer_type_host_handle_t buft_host_handle; // host handle of the owning buffer type
+};
+
+#include "virtgpu-forward.gen.h"
--- /dev/null
+#include "virtgpu-forward-impl.h"
+
+// Wall-clock timestamp in milliseconds, as the "_ms" suffix advertises.
+// Fix: the previous computation (tv_sec * 1e9 + tv_nsec) returned nanoseconds,
+// contradicting the function name.
+static long long current_time_ms() {
+ timespec ts;
+ clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time
+ return (long long) ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
+}
+
+// Serialize `cgraph` into guest-host shared memory and forward its execution
+// to the host backend. Returns the status reported by the host, or
+// GGML_STATUS_ABORTED if the reply cannot be decoded.
+ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE);
+
+ std::vector<uint8_t> cgraph_data;
+ size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data);
+
+ virtgpu_shmem temp_shmem; // Local storage for large buffers
+ virtgpu_shmem * shmem = &temp_shmem;
+
+ if (cgraph_size <= gpu->data_shmem.mmap_size) {
+ // prefer the init-time allocated page, if large enough
+ shmem = &gpu->data_shmem;
+ } else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
+ GGML_ABORT("Couldn't allocate the guest-host shared buffer");
+ }
+
+ // Tell the host which shared-memory resource carries the graph, and its size.
+ apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
+
+ apir_encode_size_t(encoder, &cgraph_size);
+
+ // The serialized graph itself travels through the shared page,
+ // not through the command encoder.
+ char * shmem_data = (char *) shmem->mmap_ptr;
+ apir_encoder secondary_enc = apir_new_encoder(shmem_data, cgraph_size);
+
+ apir_encode_cgraph_data(&secondary_enc, cgraph_data);
+
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ // Default to ABORTED so a failed decode is not reported as success.
+ ggml_status status = GGML_STATUS_ABORTED;
+ apir_decode_ggml_status(decoder, &status);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ // Only tear down the shared page if it was allocated for this call.
+ if (shmem != &gpu->data_shmem) {
+ virtgpu_shmem_destroy(gpu, shmem);
+ }
+
+ return status;
+}
--- /dev/null
+#include "virtgpu-forward-impl.h"
+
+// Query the buffer-type name from the host backend.
+// Returns a freshly allocated string, or NULL on allocation failure.
+const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME);
+
+ apir_encode_ggml_buffer_type(encoder, buft);
+
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ const size_t string_size = apir_decode_array_size_unchecked(decoder);
+ char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
+ if (!string) {
+ GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
+ apir_decoder_set_fatal(decoder);
+ // Fix: don't decode into a NULL destination; release the in-flight call
+ // and bail out, mirroring the error path of apir_device_get_name().
+ remote_call_finish(gpu, encoder, decoder);
+ return NULL;
+ }
+ apir_decode_char_array(decoder, string, string_size);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ return string;
+}
+
+// Query the required buffer alignment from the host backend.
+size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT);
+ apir_encode_ggml_buffer_type(enc, buft);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ size_t alignment;
+ apir_decode_size_t(dec, &alignment);
+ remote_call_finish(gpu, enc, dec);
+
+ return alignment;
+}
+
+// Query the maximum buffer size supported by this buffer type on the host.
+size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE);
+ apir_encode_ggml_buffer_type(enc, buft);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ size_t max_size;
+ apir_decode_size_t(dec, &max_size);
+ remote_call_finish(gpu, enc, dec);
+
+ return max_size;
+}
+
+// Ask the host backend whether this buffer type is host-resident memory.
+bool apir_buffer_type_is_host(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST);
+ apir_encode_ggml_buffer_type(enc, buft);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ bool host_resident;
+ apir_decode_bool_t(dec, &host_resident);
+ remote_call_finish(gpu, enc, dec);
+
+ return host_resident;
+}
+
+// Allocate a host-side buffer of `size` bytes for the given buffer type.
+// Only host_handle is decoded from the host; the other fields of the
+// returned context are left zero-initialized.
+apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) {
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ // Fix: value-initialize so shmem/buft_host_handle are not returned as garbage.
+ apir_buffer_context_t buffer_context = {};
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER);
+
+ apir_encode_ggml_buffer_type(encoder, buft);
+
+ apir_encode_size_t(encoder, &size);
+
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ apir_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ return buffer_context;
+}
+
+// Ask the host how many bytes a buffer must reserve for tensor `op`.
+size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE);
+ apir_encode_ggml_buffer_type(enc, buft);
+ apir_encode_ggml_tensor_inline(enc, op);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ size_t alloc_size;
+ apir_decode_size_t(dec, &alloc_size);
+ remote_call_finish(gpu, enc, dec);
+
+ return alloc_size;
+}
--- /dev/null
+#include "virtgpu-forward-impl.h"
+
+// Fetch the buffer base address from the host backend.
+// NOTE(review): the decoded address comes from the host side — confirm that
+// callers only use it as an opaque cookie and never dereference it.
+void * apir_buffer_get_base(virtgpu * gpu, apir_buffer_context_t * buffer_context) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_GET_BASE);
+ apir_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ uintptr_t base_addr;
+ apir_decode_uintptr_t(dec, &base_addr);
+ remote_call_finish(gpu, enc, dec);
+
+ return (void *) base_addr;
+}
+
+// Upload `size` bytes of tensor data to the host buffer at `offset`.
+// The payload travels through a guest-host shared page: either the
+// init-time page or a temporary one allocated for this call.
+void apir_buffer_set_tensor(virtgpu * gpu,
+ apir_buffer_context_t * buffer_context,
+ ggml_tensor * tensor,
+ const void * data,
+ size_t offset,
+ size_t size) {
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR);
+
+ apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
+ apir_encode_ggml_tensor(encoder, tensor);
+
+ virtgpu_shmem temp_shmem; // Local storage for large buffers
+ virtgpu_shmem * shmem = &temp_shmem;
+
+ if (size <= gpu->data_shmem.mmap_size) {
+ // prefer the init-time allocated page, if large enough
+ shmem = &gpu->data_shmem;
+
+ } else if (virtgpu_shmem_create(gpu, size, shmem)) {
+ GGML_ABORT("Couldn't allocate the guest-host shared buffer");
+ }
+
+ // Copy the payload into the shared page BEFORE kicking the call,
+ // then tell the host which resource carries it.
+ memcpy(shmem->mmap_ptr, data, size);
+ apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
+
+ apir_encode_size_t(encoder, &offset);
+ apir_encode_size_t(encoder, &size);
+
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ // Only tear down the shared page if it was allocated for this call.
+ if (shmem != &gpu->data_shmem) {
+ virtgpu_shmem_destroy(gpu, shmem);
+ }
+
+ return;
+}
+
+// Download `size` bytes of tensor data from the host buffer at `offset`.
+// The host writes the data into a guest-host shared page, which is copied
+// out after the call completes.
+void apir_buffer_get_tensor(virtgpu * gpu,
+ apir_buffer_context_t * buffer_context,
+ const ggml_tensor * tensor,
+ void * data,
+ size_t offset,
+ size_t size) {
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR);
+
+ apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
+ apir_encode_ggml_tensor(encoder, tensor);
+
+ virtgpu_shmem temp_shmem; // Local storage for large buffers
+ virtgpu_shmem * shmem = &temp_shmem;
+
+ if (size <= gpu->data_shmem.mmap_size) {
+ // prefer the init-time allocated page, if large enough
+ shmem = &gpu->data_shmem;
+
+ } else if (virtgpu_shmem_create(gpu, size, shmem)) {
+ GGML_ABORT("Couldn't allocate the guest-host shared buffer");
+ }
+
+ apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
+ apir_encode_size_t(encoder, &offset);
+ apir_encode_size_t(encoder, &size);
+
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ // The remote call has completed, so the shared page now holds the data.
+ memcpy(data, shmem->mmap_ptr, size);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ // Only tear down the shared page if it was allocated for this call.
+ if (shmem != &gpu->data_shmem) {
+ virtgpu_shmem_destroy(gpu, shmem);
+ }
+}
+
+// Forward a tensor copy (src -> dst) to the host backend.
+// Returns whether the host performed the copy.
+bool apir_buffer_cpy_tensor(virtgpu * gpu,
+ apir_buffer_context_t * buffer_context,
+ const ggml_tensor * src,
+ const ggml_tensor * dst) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR);
+ apir_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+ apir_encode_ggml_tensor(enc, src);
+ apir_encode_ggml_tensor(enc, dst);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ bool copied;
+ apir_decode_bool_t(dec, &copied);
+ remote_call_finish(gpu, enc, dec);
+
+ return copied;
+}
+
+// Ask the host to fill the whole buffer with `value`.
+void apir_buffer_clear(virtgpu * gpu, apir_buffer_context_t * buffer_context, uint8_t value) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_CLEAR);
+ apir_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+ apir_encode_uint8_t(enc, &value);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ remote_call_finish(gpu, enc, dec);
+}
+
+// Release the host-side buffer identified by this context.
+void apir_buffer_free_buffer(virtgpu * gpu, apir_buffer_context_t * buffer_context) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER);
+ apir_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ remote_call_finish(gpu, enc, dec);
+}
--- /dev/null
+#include "virtgpu-forward-impl.h"
+#include "virtgpu-shm.h"
+
+// Number of devices exposed by the host backend; fetched once, then cached.
+// NOTE(review): the static cache is not thread-safe — confirm this is only
+// reached from single-threaded initialization.
+int apir_device_get_count(virtgpu * gpu) {
+ static int32_t dev_count = -1;
+ if (dev_count != -1) {
+ return dev_count; // cached from a previous call
+ }
+
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_COUNT);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ apir_decode_int32_t(dec, &dev_count);
+ remote_call_finish(gpu, enc, dec);
+
+ return dev_count;
+}
+
+// Device name, fetched from the host once and cached in a static.
+// Returns NULL if the name buffer cannot be allocated.
+const char * apir_device_get_name(virtgpu * gpu) {
+ static char * string = nullptr;
+ if (string) {
+ return string; // cached from a previous call
+ }
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME);
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ const size_t string_size = apir_decode_array_size_unchecked(decoder);
+ string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
+ if (!string) {
+ GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
+ // Fix: release the in-flight remote call before bailing out
+ // (it was previously leaked on this path).
+ remote_call_finish(gpu, encoder, decoder);
+ return NULL;
+ }
+ apir_decode_char_array(decoder, string, string_size);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ return string;
+}
+
+// Device description string, freshly allocated on every call.
+// Returns NULL if the buffer cannot be allocated.
+const char * apir_device_get_description(virtgpu * gpu) {
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION);
+
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ const size_t string_size = apir_decode_array_size_unchecked(decoder);
+ char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
+ if (!string) {
+ GGML_LOG_ERROR("%s: Could not allocate the device description buffer\n", __func__);
+ // Fix: release the in-flight remote call before bailing out
+ // (it was previously leaked on this path).
+ remote_call_finish(gpu, encoder, decoder);
+ return NULL;
+ }
+ apir_decode_char_array(decoder, string, string_size);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ return string;
+}
+
+// Device type reported by the host; fetched once, then cached.
+// 255 is used as the "not fetched yet" sentinel.
+uint32_t apir_device_get_type(virtgpu * gpu) {
+ static uint32_t dev_type = 255;
+ if (dev_type != 255) {
+ return dev_type; // cached from a previous call
+ }
+
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_TYPE);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ apir_decode_uint32_t(dec, &dev_type);
+ remote_call_finish(gpu, enc, dec);
+
+ return dev_type;
+}
+
+// Query free/total device memory from the host backend.
+// NOTE(review): the statics look like leftovers of an unfinished caching
+// scheme — every call still performs the remote query.
+void apir_device_get_memory(virtgpu * gpu, size_t * free, size_t * total) {
+ static size_t dev_free = 0;
+ static size_t dev_total = 0;
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ apir_decode_size_t(dec, &dev_free);
+ apir_decode_size_t(dec, &dev_total);
+
+ *free = dev_free;
+ *total = dev_total;
+
+ remote_call_finish(gpu, enc, dec);
+}
+
+// Ask the host backend whether it can execute the given ggml operation.
+bool apir_device_supports_op(virtgpu * gpu, const ggml_tensor * op) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP);
+ apir_encode_ggml_tensor_inline(enc, op);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ bool supported;
+ apir_decode_bool_t(dec, &supported);
+ remote_call_finish(gpu, enc, dec);
+
+ return supported;
+}
+
+// Fetch the host handle of the device's default buffer type.
+apir_buffer_type_host_handle_t apir_device_get_buffer_type(virtgpu * gpu) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ apir_buffer_type_host_handle_t buft_handle;
+ apir_decode_apir_buffer_type_host_handle_t(dec, &buft_handle);
+ remote_call_finish(gpu, enc, dec);
+
+ return buft_handle;
+}
+
+// Fetch the device capability flags from the host backend.
+// The four booleans are decoded in the exact order the host encodes them.
+void apir_device_get_props(virtgpu * gpu,
+ bool * async,
+ bool * host_buffer,
+ bool * buffer_from_host_ptr,
+ bool * events) {
+ apir_encoder * enc;
+ apir_decoder * dec;
+ ApirForwardReturnCode fwd_ret;
+
+ REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_PROPS);
+ REMOTE_CALL(gpu, enc, dec, fwd_ret);
+
+ apir_decode_bool_t(dec, async);
+ apir_decode_bool_t(dec, host_buffer);
+ apir_decode_bool_t(dec, buffer_from_host_ptr);
+ apir_decode_bool_t(dec, events);
+
+ remote_call_finish(gpu, enc, dec);
+}
+
+// Create a buffer backed by freshly allocated guest-host shared memory and
+// register it with the host backend. Fills every field of the returned
+// context (shmem, host_handle, buft_host_handle).
+apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, size_t max_tensor_size) {
+ apir_encoder * encoder;
+ apir_decoder * decoder;
+ ApirForwardReturnCode ret;
+
+ apir_buffer_context_t buffer_context;
+
+ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
+
+ // The shared page is stored in the buffer context.
+ // NOTE(review): presumably released when the buffer is freed — confirm.
+ if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) {
+ GGML_ABORT("Couldn't allocate the guest-host shared buffer");
+ }
+
+ apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);
+
+ apir_encode_size_t(encoder, &size);
+ apir_encode_size_t(encoder, &max_tensor_size);
+
+ REMOTE_CALL(gpu, encoder, decoder, ret);
+
+ apir_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle);
+ buffer_context.buft_host_handle = apir_decode_apir_buffer_type_host_handle(decoder);
+
+ remote_call_finish(gpu, encoder, decoder);
+
+ return buffer_context;
+}
--- /dev/null
+#include "virtgpu.h"
+
+#include "ggml-remoting.h"
+#include "backend/shared/apir_backend.h"
+#include "backend/shared/apir_cs_ggml.h"
+
+#include "ggml-backend-impl.h"
+
+// Allocate an encoder for a forwarded API call, tagging it with the
+// forwarded command type. Aborts the process if the encoder cannot be
+// prepared; on success, `encoder_name` points to a ready-to-use encoder.
+#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
+ do { \
+ int32_t forward_flag = (int32_t) apir_command_type__; \
+ encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \
+ if (!encoder_name) { \
+ GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); \
+ } \
+ } while (0)
+
+// Kick the prepared remote call and check the forward status:
+// - aborts if no decoder comes back or if the host reports a forwarding
+// error (any code below APIR_FORWARD_BASE_INDEX);
+// - otherwise rebases `ret_name` so it holds the host function's own code.
+#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
+ do { \
+ ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
+ if (!decoder_name) { \
+ GGML_ABORT("%s: failed to kick the remote call", __func__); \
+ } \
+ if (ret_name < APIR_FORWARD_BASE_INDEX) { \
+ GGML_ABORT("%s: failed to forward the API call: %s: code %d", __func__, \
+ apir_forward_error(ret_name), ret_name); \
+ } \
+ ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
+ } while (0)
--- /dev/null
+#pragma once
+
+/* Frontend forwarding declarations (generated — see generate_virtgpu_forward_header). */
+
+/* device */
+int apir_device_get_count(struct virtgpu * gpu);
+const char * apir_device_get_name(struct virtgpu * gpu);
+const char * apir_device_get_description(struct virtgpu * gpu);
+uint32_t apir_device_get_type(struct virtgpu * gpu);
+void apir_device_get_memory(struct virtgpu * gpu, size_t * free, size_t * total);
+bool apir_device_supports_op(struct virtgpu * gpu, const ggml_tensor * op);
+apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu * gpu);
+void apir_device_get_props(struct virtgpu * gpu,
+ bool * async,
+ bool * host_buffer,
+ bool * buffer_from_host_ptr,
+ bool * events);
+apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, size_t size, size_t max_tensor_size);
+
+/* buffer-type */
+const char * apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
+size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
+size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
+bool apir_buffer_type_is_host(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
+apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
+ ggml_backend_buffer_type_t buffer_buft,
+ size_t size);
+size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op);
+
+/* buffer */
+void * apir_buffer_get_base(struct virtgpu * gpu, apir_buffer_context_t * buffer_context);
+void apir_buffer_set_tensor(struct virtgpu * gpu,
+ apir_buffer_context_t * buffer_context,
+ ggml_tensor * tensor,
+ const void * data,
+ size_t offset,
+ size_t size);
+void apir_buffer_get_tensor(struct virtgpu * gpu,
+ apir_buffer_context_t * buffer_context,
+ const ggml_tensor * tensor,
+ void * data,
+ size_t offset,
+ size_t size);
+bool apir_buffer_cpy_tensor(struct virtgpu * gpu,
+ apir_buffer_context_t * buffer_context,
+ const ggml_tensor * src,
+ const ggml_tensor * dst);
+void apir_buffer_clear(struct virtgpu * gpu, apir_buffer_context_t * buffer_context, uint8_t value);
+void apir_buffer_free_buffer(struct virtgpu * gpu, apir_buffer_context_t * buffer_context);
+
+/* backend */
+ggml_status apir_backend_graph_compute(struct virtgpu * gpu, ggml_cgraph * cgraph);
--- /dev/null
+#include "virtgpu-shm.h"
+
+#include "virtgpu.h"
+
+#include <assert.h>
+
+// Create a virtio-gpu blob resource of `blob_size` bytes.
+// On success, returns the GEM bo handle and stores the resource id in
+// *res_id; returns 0 on failure.
+static uint32_t virtgpu_ioctl_resource_create_blob(virtgpu * gpu,
+ uint32_t blob_mem,
+ uint32_t blob_flags,
+ size_t blob_size,
+ uint64_t blob_id,
+ uint32_t * res_id) {
+#ifdef SIMULATE_BO_SIZE_FIX
+ blob_size = align64(blob_size, 4096);
+#endif
+
+ drm_virtgpu_resource_create_blob args = {
+ .blob_mem = blob_mem,
+ .blob_flags = blob_flags,
+ .bo_handle = 0,
+ .res_handle = 0,
+ .size = blob_size,
+ .pad = 0,
+ .cmd_size = 0,
+ .cmd = 0,
+ .blob_id = blob_id,
+ };
+
+ if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) {
+ return 0;
+ }
+
+ *res_id = args.res_handle;
+ return args.bo_handle;
+}
+
+// Release a GEM handle; failure is unexpected and only checked in debug builds.
+static void virtgpu_ioctl_gem_close(virtgpu * gpu, uint32_t gem_handle) {
+ drm_gem_close close_args = {
+ .handle = gem_handle,
+ .pad = 0,
+ };
+
+ const int rc = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &close_args);
+ assert(!rc);
+#ifdef NDEBUG
+ UNUSED(rc); // assert() compiles away; keep -Wunused-variable quiet
+#endif
+}
+
+// Map a GEM object into the guest address space; returns NULL on failure.
+static void * virtgpu_ioctl_map(virtgpu * gpu, uint32_t gem_handle, size_t size) {
+ drm_virtgpu_map map_args = {
+ .offset = 0,
+ .handle = gem_handle,
+ .pad = 0,
+ };
+
+ // The ioctl fills in the fake mmap offset for this bo.
+ if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &map_args)) {
+ return NULL;
+ }
+
+ void * addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, map_args.offset);
+ return addr == MAP_FAILED ? NULL : addr;
+}
+
+// Unmap the shared page, then drop the GEM handle backing it.
+void virtgpu_shmem_destroy(virtgpu * gpu, virtgpu_shmem * shmem) {
+ munmap(shmem->mmap_ptr, shmem->mmap_size);
+ virtgpu_ioctl_gem_close(gpu, shmem->gem_handle);
+}
+
+// Allocate a guest-host shared memory region of at least `size` bytes
+// (rounded up to 16 KiB) and map it into the guest.
+// Returns 0 on success, 1 on failure.
+int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem) {
+ size = align64(size, 16384);
+
+ uint32_t res_id;
+ uint32_t gem_handle = virtgpu_ioctl_resource_create_blob(gpu, VIRTGPU_BLOB_MEM_HOST3D,
+ VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, &res_id);
+
+ if (!gem_handle) {
+ return 1;
+ }
+
+ void * ptr = virtgpu_ioctl_map(gpu, gem_handle, size);
+ if (!ptr) {
+ virtgpu_ioctl_gem_close(gpu, gem_handle);
+ GGML_LOG_ERROR("virtgpu_ioctl_map FAILED\n");
+ // Fix: report the failure instead of exit(1)-ing from library code;
+ // every call site already checks the return value and aborts cleanly.
+ return 1;
+ }
+
+ shmem->res_id = res_id;
+ shmem->mmap_size = size;
+ shmem->mmap_ptr = ptr;
+ shmem->gem_handle = gem_handle;
+
+ return 0;
+}
--- /dev/null
+#pragma once
+
+#include "virtgpu-utils.h"
+
+#include <sys/mman.h>
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+struct virtgpu;
+
+// A guest-host shared memory region backed by a virtio-gpu blob resource.
+struct virtgpu_shmem {
+ uint32_t res_id; // virtio-gpu resource id, sent to the host to identify the region
+ size_t mmap_size; // size of the guest mapping (rounded up at creation)
+ void * mmap_ptr; // guest virtual address of the mapping
+
+ uint32_t gem_handle; // DRM GEM handle, needed to release the resource
+};
+
+// Returns 0 on success, non-zero on failure.
+int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem);
+void virtgpu_shmem_destroy(virtgpu * gpu, virtgpu_shmem * shmem);
--- /dev/null
+#include "virtgpu-utils.h"
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include <cstring>
+
+#define NODE_ALLOC_ALIGN 64
+#define NODE_PTR_MASK (~((uintptr_t) NODE_ALLOC_ALIGN - 1))
+#define NODE_LEVEL_MASK ((uintptr_t) NODE_ALLOC_ALIGN - 1)
+#define NULL_NODE 0
+
+// Fix: removed `#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align)`.
+// os_malloc_aligned() is a real posix_memalign-based function defined below; the
+// function-like macro rewrote that function's own name into `_aligned_malloc`
+// (a Windows CRT symbol) at its definition site and at every call site.
+#define os_free_aligned(_ptr) free(_ptr)
+#define p_atomic_cmpxchg(v, old, _new) __sync_val_compare_and_swap((v), (old), (_new))
+
+// floor(log2(n)) for n > 0; returns 0 for n == 0 (the builtin path ORs in 1).
+static inline uint64_t util_logbase2_64(uint64_t n) {
+#if defined(HAVE___BUILTIN_CLZLL)
+ return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
+#else
+ // Portable fallback: binary search on the bit position.
+ uint64_t pos = 0ull;
+ if (n >= 1ull << 32) {
+ n >>= 32;
+ pos += 32;
+ }
+ if (n >= 1ull << 16) {
+ n >>= 16;
+ pos += 16;
+ }
+ if (n >= 1ull << 8) {
+ n >>= 8;
+ pos += 8;
+ }
+ if (n >= 1ull << 4) {
+ n >>= 4;
+ pos += 4;
+ }
+ if (n >= 1ull << 2) {
+ n >>= 2;
+ pos += 2;
+ }
+ if (n >= 1ull << 1) {
+ pos += 1;
+ }
+ return pos;
+#endif
+}
+
+// Prepare `arr` for use; `node_size` must be a power of two >= 2.
+void util_sparse_array_init(util_sparse_array * arr, size_t elem_size, size_t node_size) {
+ *arr = util_sparse_array{}; // zero every field, including the root handle
+ arr->elem_size = elem_size;
+ arr->node_size_log2 = util_logbase2_64(node_size);
+ assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2));
+}
+
+// Aligned allocation via posix_memalign(); returns NULL on failure.
+static inline void * os_malloc_aligned(size_t size, size_t alignment) {
+ void * ptr;
+ // posix_memalign() requires the alignment to be a multiple of sizeof(void *).
+ alignment = (alignment + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
+ if (posix_memalign(&ptr, alignment, size) != 0) {
+ return NULL;
+ }
+ return ptr;
+}
+
+// Node handles are tagged pointers: the allocation is NODE_ALLOC_ALIGN-aligned,
+// so the low bits are free to store the node's tree level.
+
+// Strip the level tag to recover the allocation pointer.
+static inline void * _util_sparse_array_node_data(uintptr_t handle) {
+ return (void *) (handle & NODE_PTR_MASK);
+}
+
+// Tree level of a node (0 = leaf), stored in the low bits of the handle.
+static inline unsigned _util_sparse_array_node_level(uintptr_t handle) {
+ return handle & NODE_LEVEL_MASK;
+}
+
+// Recursively free a node and, for interior nodes, all of its children.
+static inline void _util_sparse_array_node_finish(util_sparse_array * arr, uintptr_t node) {
+ if (_util_sparse_array_node_level(node) > 0) {
+ uintptr_t * children = (uintptr_t *) _util_sparse_array_node_data(node);
+ size_t node_size = 1ull << arr->node_size_log2;
+ for (size_t i = 0; i < node_size; i++) {
+ if (children[i]) {
+ _util_sparse_array_node_finish(arr, children[i]);
+ }
+ }
+ }
+
+ os_free_aligned(_util_sparse_array_node_data(node));
+}
+
+// Pack a node pointer and its level into a single tagged handle.
+static inline uintptr_t _util_sparse_array_node(void * data, unsigned level) {
+ assert(data != NULL);
+ assert(((uintptr_t) data & NODE_LEVEL_MASK) == 0);
+ assert((level & NODE_PTR_MASK) == 0);
+ return (uintptr_t) data | level;
+}
+
+// Allocate a zeroed node for the given tree level: leaves (level 0) hold
+// elements, interior nodes hold child handles.
+inline uintptr_t _util_sparse_array_node_alloc(util_sparse_array * arr, unsigned level) {
+ size_t size;
+ if (level == 0) {
+ size = arr->elem_size << arr->node_size_log2;
+ } else {
+ size = sizeof(uintptr_t) << arr->node_size_log2;
+ }
+
+ void * data = os_malloc_aligned(size, NODE_ALLOC_ALIGN);
+ if (data == NULL) {
+ // Fix: fail loudly on OOM instead of letting memset() dereference NULL.
+ abort();
+ }
+ memset(data, 0, size);
+
+ return _util_sparse_array_node(data, level);
+}
+
+// Atomically publish `node` at `node_ptr` if it still holds `cmp_node`.
+// Returns the node that ended up installed (the winner of the race).
+static inline uintptr_t _util_sparse_array_set_or_free_node(uintptr_t * node_ptr, uintptr_t cmp_node, uintptr_t node) {
+ uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node);
+
+ if (prev_node != cmp_node) {
+ /* We lost the race. Free this one and return the one that was already
+ * allocated.
+ */
+ os_free_aligned(_util_sparse_array_node_data(node));
+ return prev_node;
+ } else {
+ return node;
+ }
+}
+
+// Return a pointer to element `idx`, growing the radix tree on demand.
+// Concurrent callers race via compare-and-swap (_util_sparse_array_set_or_free_node);
+// losers free their speculative node and adopt the winner's.
+void * util_sparse_array_get(util_sparse_array * arr, uint64_t idx) {
+ const unsigned node_size_log2 = arr->node_size_log2;
+ uintptr_t root = p_atomic_read(&arr->root);
+ if (unlikely(!root)) {
+ // First access: allocate a root deep enough to reach `idx`.
+ unsigned root_level = 0;
+ uint64_t idx_iter = idx >> node_size_log2;
+ while (idx_iter) {
+ idx_iter >>= node_size_log2;
+ root_level++;
+ }
+ uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level);
+ root = _util_sparse_array_set_or_free_node(&arr->root, NULL_NODE, new_root);
+ }
+
+ while (1) {
+ unsigned root_level = _util_sparse_array_node_level(root);
+ uint64_t root_idx = idx >> (root_level * node_size_log2);
+ if (likely(root_idx < (1ull << node_size_log2))) {
+ break;
+ }
+
+ /* In this case, we have a root but its level is low enough that the
+ * requested index is out-of-bounds.
+ */
+ uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1);
+
+ uintptr_t * new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root);
+ new_root_children[0] = root;
+
+ /* We only add one at a time instead of the whole tree because it's
+ * easier to ensure correctness of both the tree building and the
+ * clean-up path. Because we're only adding one node we never have to
+ * worry about trying to free multiple things without freeing the old
+ * things.
+ */
+ root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root);
+ }
+
+ // Walk down from the root, allocating missing children along the path.
+ void * node_data = _util_sparse_array_node_data(root);
+ unsigned node_level = _util_sparse_array_node_level(root);
+ while (node_level > 0) {
+ uint64_t child_idx = (idx >> (node_level * node_size_log2)) & ((1ull << node_size_log2) - 1);
+
+ uintptr_t * children = (uintptr_t *) node_data;
+ uintptr_t child = p_atomic_read(&children[child_idx]);
+
+ if (unlikely(!child)) {
+ child = _util_sparse_array_node_alloc(arr, node_level - 1);
+ child = _util_sparse_array_set_or_free_node(&children[child_idx], NULL_NODE, child);
+ }
+
+ node_data = _util_sparse_array_node_data(child);
+ node_level = _util_sparse_array_node_level(child);
+ }
+
+ // node_data now points at a leaf; index into it by element size.
+ uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1);
+ return (void *) ((char *) node_data + (elem_idx * arr->elem_size));
+}
--- /dev/null
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cerrno>
+#include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define likely(x) __builtin_expect(!!(x), 1)
+
+#ifndef UNUSED
+# define UNUSED(x) (void) (x)
+#endif
+
+/** Checks is a value is a power of two. Does not handle zero. */
+#define IS_POT(v) (((v) & ((v) - 1)) == 0)
+
+/** Checks is a value is a power of two. Zero handled. */
+#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v))
+
+/** Align a value to a power of two */
+#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
+
+#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE)
+
+// True iff `v` is a non-zero power of two.
+static inline bool util_is_power_of_two_nonzero64(uint64_t v) {
+ return IS_POT_NONZERO(v);
+}
+
+// Round `value` up to the next multiple of `alignment` (a non-zero power of two).
+static inline uint64_t align64(uint64_t value, uint64_t alignment) {
+ assert(util_is_power_of_two_nonzero64(alignment));
+ return ALIGN_POT(value, alignment);
+}
+
+// Intrusive doubly-linked list node.
+struct list_head {
+ list_head * prev;
+ list_head * next;
+};
+
+// Sparse array implemented as a radix tree with 2^node_size_log2 entries per
+// node; updated with atomic compare-and-swap (see virtgpu-utils.cpp).
+struct util_sparse_array {
+ size_t elem_size; // size in bytes of one element
+ unsigned node_size_log2; // log2 of the per-node fan-out
+
+ uintptr_t root; // tagged node handle: pointer | tree level
+};
+
+// Returns a pointer to element `idx`, allocating intermediate nodes on demand.
+void * util_sparse_array_get(util_sparse_array * arr, uint64_t idx);
+void util_sparse_array_init(util_sparse_array * arr, size_t elem_size, size_t node_size);
+
+// Sleep for `usecs` microseconds, restarting after signal interruptions
+// (clock_nanosleep updates `time` with the remaining interval on EINTR).
+inline void os_time_sleep(int64_t usecs) {
+ timespec time;
+ time.tv_sec = usecs / 1000000;
+ time.tv_nsec = (usecs % 1000000) * 1000;
+ while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR)
+ ;
+}
+
+// Accumulates wall-time statistics across start/stop pairs (CLOCK_MONOTONIC, ns).
+struct timer_data {
+ long long start; // timestamp of the last start_timer(), in ns
+ long long total; // sum of all measured durations, in ns
+ long long count; // number of completed start/stop pairs
+};
+
+// Record the current monotonic time as the start of a measurement.
+static inline void start_timer(timer_data * timer) {
+ timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ timer->start = (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+// returns the duration in ns
+static inline long long stop_timer(timer_data * timer) {
+ timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ long long timer_end = (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+ long long duration = (timer_end - timer->start);
+ timer->total += duration;
+ timer->count += 1;
+
+ return duration;
+}
--- /dev/null
+#include "virtgpu.h"
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cerrno>
+#include <cstdlib>
+
+/* local helpers: device discovery and opening */
+static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev);
+static virt_gpu_result_t virtgpu_open(virtgpu * gpu);
+
+/* local helpers: capset and context initialization */
+static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu);
+static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu);
+
+/* local helpers: thin ioctl wrappers */
+static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id);
+static int virtgpu_ioctl_get_caps(virtgpu * gpu,
+                                  virgl_renderer_capset id,
+                                  uint32_t version,
+                                  void * capset,
+                                  size_t capset_size);
+static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param);
+static void virtgpu_init_renderer_info(virtgpu * gpu);
+
+static void log_call_duration(long long call_duration_ns, const char * name);
+
+// upper bounds on how long to busy-wait for the host reply (see remote_call)
+const uint64_t APIR_HANDSHAKE_MAX_WAIT_MS = 2 * 1000;     // 2s
+const uint64_t APIR_LOADLIBRARY_MAX_WAIT_MS = 60 * 1000;  // 60s
+
+// Exchange APIR protocol versions with the virglrenderer host backend.
+// Returns 0 on success, 1 on failure. Most error paths GGML_ABORT first
+// (matching the style of the rest of this file), so the `return 1` after each
+// abort is defensive only.
+static int virtgpu_handshake(virtgpu * gpu) {
+    apir_encoder * encoder;
+    apir_decoder * decoder;
+
+    encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0);
+    if (!encoder) {
+        GGML_ABORT("%s: failed to prepare the remote call encoder", __func__);
+        return 1;
+    }
+
+    /* write handshake props */
+
+    uint32_t guest_major = APIR_PROTOCOL_MAJOR;
+    uint32_t guest_minor = APIR_PROTOCOL_MINOR;
+    apir_encode_uint32_t(encoder, &guest_major);
+    apir_encode_uint32_t(encoder, &guest_minor);
+
+    /* *** */
+
+    uint32_t ret_magic;
+    long long call_duration_ns;
+    ret_magic = remote_call(gpu, encoder, &decoder, APIR_HANDSHAKE_MAX_WAIT_MS, &call_duration_ns);
+    log_call_duration(call_duration_ns, "API Remoting handshake");
+
+    if (!decoder) {
+        GGML_ABORT(
+            "%s: failed to initiate the communication with the virglrenderer library. "
+            "Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
+            __func__);
+        return 1;
+    }
+
+    /* read handshake return values */
+
+    uint32_t host_major;
+    uint32_t host_minor;
+
+    if (ret_magic != APIR_HANDSHAKE_MAGIC) {
+        // %u, not %d: ret_magic is a uint32_t magic value and may exceed INT_MAX
+        GGML_ABORT("%s: handshake with the virglrenderer failed (code=%u | %s)", __func__, ret_magic,
+                   apir_backend_initialize_error(ret_magic));
+    } else {
+        apir_decode_uint32_t(decoder, &host_major);
+        apir_decode_uint32_t(decoder, &host_minor);
+    }
+
+    remote_call_finish(gpu, encoder, decoder);
+
+    if (ret_magic != APIR_HANDSHAKE_MAGIC) {
+        return 1;
+    }
+
+    GGML_LOG_INFO("%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor);
+    GGML_LOG_INFO("%s: Host is running with %u.%u\n", __func__, host_major, host_minor);
+
+    // a major-version mismatch is an error, a minor-version mismatch only a warning;
+    // neither aborts: the caller decides what to do with the return code
+    if (guest_major != host_major) {
+        GGML_LOG_ERROR("Host major (%u) and guest major (%u) version differ\n", host_major, guest_major);
+    } else if (guest_minor != host_minor) {
+        GGML_LOG_WARN("Host minor (%u) and guest minor (%u) version differ\n", host_minor, guest_minor);
+    }
+
+    return 0;
+}
+
+// Ask the host virglrenderer to load and initialize the API Remoting backend
+// library. Returns APIR_LOAD_LIBRARY_SUCCESS on success; every failure path
+// GGML_ABORTs first, so the other returns are defensive only.
+//
+// NOTE(review): the failure decoding below assumes a layered return-code scheme
+// where values >= APIR_LOAD_LIBRARY_INIT_BASE_INDEX carry a nested code from
+// the next layer down (virglrenderer -> apir backend -> backend library), each
+// layer offset by the same base index — confirm against api_remoting.h.
+static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
+    apir_encoder * encoder;
+    apir_decoder * decoder;
+    ApirLoadLibraryReturnCode ret;
+
+    encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LOADLIBRARY, 0);
+    if (!encoder) {
+        GGML_ABORT("%s: hypercall error: failed to prepare the remote call encoder", __func__);
+        return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
+    }
+
+    long long call_duration_ns;
+
+    // loading the backend library on the host can be slow, hence the 60s budget
+    ret = (ApirLoadLibraryReturnCode) remote_call(gpu, encoder, &decoder, APIR_LOADLIBRARY_MAX_WAIT_MS,
+                                                  &call_duration_ns);
+    log_call_duration(call_duration_ns, "API Remoting LoadLibrary");
+
+    if (!decoder) {
+        GGML_ABORT("%s: hypercall error: failed to kick the API remoting hypercall.\n", __func__);
+        return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
+    }
+
+    remote_call_finish(gpu, encoder, decoder);
+
+    if (ret == APIR_LOAD_LIBRARY_SUCCESS) {
+        GGML_LOG_INFO("%s: The API Remoting backend was successfully loaded and initialized\n", __func__);
+
+        return ret;
+    }
+
+    // something wrong happened, find out what.
+
+    // below the base index: virglrenderer itself failed to load the backend library
+    if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
+        GGML_ABORT("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", __func__,
+                   apir_load_library_error(ret), ret);
+        return ret;
+    }
+
+    GGML_LOG_INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__);
+
+    // strip the virglrenderer layer to recover the apir backend's own code
+    ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
+
+    if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
+        // the apir backend failed before reaching its backend library
+        GGML_ABORT("%s: the API Remoting backend library couldn't load the backend library: apir code=%d | %s)",
+                   __func__, apir_ret, apir_load_library_error(apir_ret));
+    } else {
+        // strip one more layer: the backend library's own initialization code
+        uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
+        GGML_ABORT("%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
+                   lib_ret);
+    }
+    return ret;
+}
+
+// Allocate and fully initialize the virtgpu frontend: open the DRM device,
+// negotiate the capset/context, map the shared reply/data pages, then perform
+// the APIR handshake and load the host backend library.
+// Returns the new virtgpu on success; every failure path GGML_ABORTs (so the
+// NULL returns are defensive and the `gpu` allocation is never really leaked).
+virtgpu * create_virtgpu() {
+    virtgpu * gpu = new virtgpu();
+
+    // env var escape hatch while virglrenderer transitions from Venus to APIR
+    gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr;
+    util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024);
+
+    if (virtgpu_open(gpu) != APIR_SUCCESS) {
+        GGML_ABORT("%s: failed to open the virtgpu device", __func__);
+        return NULL;
+    }
+
+    if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
+        GGML_ABORT("%s: failed to initialize the GPU capset", __func__);
+        return NULL;
+    }
+
+    if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
+        GGML_ABORT("%s: failed to initialize the GPU context", __func__);
+        return NULL;
+    }
+
+    // shared memory pages used by remote_call(): one for replies, one for data
+    if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
+        GGML_ABORT("%s: failed to create the shared reply memory pages", __func__);
+        return NULL;
+    }
+
+    if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
+        GGML_ABORT("%s: failed to create the shared data memory pages", __func__);
+        return NULL;
+    }
+
+    if (virtgpu_handshake(gpu)) {
+        GGML_ABORT("%s: failed to handshake with the virglrenderer library", __func__);
+        return NULL;
+    }
+
+    if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
+        GGML_ABORT("%s: failed to load the backend library", __func__);
+        return NULL;
+    }
+
+    return gpu;
+}
+
+// Enumerate the DRM devices and open the first one that virtgpu_open_device()
+// accepts (i.e. the first virtio_gpu render node).
+static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
+    drmDevicePtr devices[8];
+
+    int device_count = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
+    if (device_count < 0) {
+        GGML_LOG_ERROR("%s: failed to enumerate DRM devices\n", __func__);
+        return APIR_ERROR_INITIALIZATION_FAILED;
+    }
+
+    // keep trying until one device opens successfully
+    virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED;
+    for (int dev_idx = 0; dev_idx < device_count; dev_idx++) {
+        result = virtgpu_open_device(gpu, devices[dev_idx]);
+        if (result == APIR_SUCCESS) {
+            break;
+        }
+    }
+
+    drmFreeDevices(devices, device_count);
+
+    return result;
+}
+
+// Try to open `dev` as a virtio_gpu render node and store its fd in gpu->fd.
+//
+// The caller (virtgpu_open) iterates over all DRM devices and relies on a
+// failure *return* to move on to the next candidate, so this function must not
+// abort on a device that merely isn't ours: it logs and returns
+// APIR_ERROR_INITIALIZATION_FAILED instead. (The previous version GGML_ABORTed
+// on the first non-virtio device, killing the process before the loop could
+// try the remaining devices, and dereferenced nodes[DRM_NODE_RENDER] without
+// checking that the device exposes a render node at all.)
+static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev) {
+    // skip devices that don't expose a render node (nodes[DRM_NODE_RENDER] would be invalid)
+    if (!(dev->available_nodes & (1 << DRM_NODE_RENDER))) {
+        return APIR_ERROR_INITIALIZATION_FAILED;
+    }
+
+    const char * node_path = dev->nodes[DRM_NODE_RENDER];
+
+    int fd = open(node_path, O_RDWR | O_CLOEXEC);
+    if (fd < 0) {
+        GGML_LOG_ERROR("failed to open %s\n", node_path);
+        return APIR_ERROR_INITIALIZATION_FAILED;
+    }
+
+    // only accept the virtio_gpu driver, major version 0
+    drmVersionPtr version = drmGetVersion(fd);
+    if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
+        if (version) {
+            GGML_LOG_ERROR("unknown DRM driver %s version %d\n", version->name, version->version_major);
+            drmFreeVersion(version);
+        } else {
+            GGML_LOG_ERROR("failed to get DRM driver version\n");
+        }
+        close(fd);
+        return APIR_ERROR_INITIALIZATION_FAILED;
+    }
+
+    gpu->fd = fd;
+
+    drmFreeVersion(version);
+
+    GGML_LOG_INFO("using DRM device %s\n", node_path);
+
+    return APIR_SUCCESS;
+}
+
+// Create the virtgpu execution context bound to the capset chosen by
+// virtgpu_init_capset() (which must have run first).
+static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) {
+    // only capset version 0 is supported (virtgpu_init_capset sets it to 0)
+    assert(!gpu->capset.version);
+    const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id);
+    if (ret) {
+        GGML_LOG_INFO("failed to initialize context: %s\n", strerror(errno));
+        return APIR_ERROR_INITIALIZATION_FAILED;
+    }
+
+    return APIR_SUCCESS;
+}
+
+// Select the capset to use (APIR, or Venus during the transition period) and
+// fetch its data from the kernel into gpu->capset.data.
+static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
+    if (gpu->use_apir_capset) {
+        GGML_LOG_INFO("Using the APIR capset\n");
+        gpu->capset.id = VIRTGPU_DRM_CAPSET_APIR;
+    } else {
+        GGML_LOG_INFO("Using the Venus capset\n");
+        gpu->capset.id = VIRTGPU_DRM_CAPSET_VENUS;
+    }
+    gpu->capset.version = 0;
+
+    int ret =
+        virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));
+
+    if (ret) {
+        // NOTE(review): this message says "APIR" even when the Venus capset was selected
+        GGML_LOG_INFO("failed to get APIR v%d capset: %s\n", gpu->capset.version, strerror(errno));
+        return APIR_ERROR_INITIALIZATION_FAILED;
+    }
+
+    // blob resources are required for the shared reply/data pages
+    assert(gpu->capset.data.supports_blob_resources);
+
+    return APIR_SUCCESS;
+}
+
+// Issue DRM_IOCTL_VIRTGPU_CONTEXT_INIT: create a context bound to `capset_id`
+// with a single ring and no drm_event generation on fence signaling.
+// Returns the raw ioctl result (0 on success).
+static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id) {
+    drm_virtgpu_context_set_param ctx_set_params[3] = {
+        {
+         .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID,
+         .value = capset_id,
+         },
+        {
+         .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS,
+         .value = 1,
+         },
+        {
+         .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK,
+         .value = 0, /* don't generate drm_events on fence signaling */
+        },
+    };
+
+    drm_virtgpu_context_init args = {
+        .num_params = ARRAY_SIZE(ctx_set_params),
+        .pad = 0,
+        // the kernel reads the param array through this user-space address
+        .ctx_set_params = (uintptr_t) &ctx_set_params,
+    };
+
+    return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args);
+}
+
+// Issue DRM_IOCTL_VIRTGPU_GET_CAPS: have the kernel copy capset `id`
+// (version `version`) into the `capset` buffer of `capset_size` bytes.
+// Returns the raw ioctl result (0 on success).
+static int virtgpu_ioctl_get_caps(virtgpu * gpu,
+                                  virgl_renderer_capset id,
+                                  uint32_t version,
+                                  void * capset,
+                                  size_t capset_size) {
+    drm_virtgpu_get_caps args = {};  // zero-init covers the `pad` field
+    args.cap_set_id = id;
+    args.cap_set_ver = version;
+    args.addr = (uintptr_t) capset;
+    args.size = (__u32) capset_size;
+
+    return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args);
+}
+
+// Query a single VIRTGPU_PARAM_* value via DRM_IOCTL_VIRTGPU_GETPARAM.
+// Returns the parameter value, or 0 if the ioctl fails.
+static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param) {
+    /* val must be zeroed because kernel only writes the lower 32 bits */
+    uint64_t val = 0;
+
+    drm_virtgpu_getparam args = {};
+    args.param = param;
+    args.value = (uintptr_t) &val;
+
+    if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args) != 0) {
+        return 0;
+    }
+    return val;
+}
+
+// Set up the command encoder for one remote call: reset the (static) command
+// buffer and write the header common to all commands.
+// Returns a pointer to the encoder to fill with command-specific arguments.
+//
+// NOTE(review): the buffer and encoder are function-local statics, so this is
+// single-threaded by construction — one remote call may be in flight at a
+// time. Confirm that the callers never invoke this concurrently.
+apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, int32_t cmd_flags) {
+    /*
+     * Prepare the command encoder and its buffer
+     */
+
+    static char encoder_buffer[4096];
+
+    static apir_encoder enc;
+    enc = {
+        .cur = encoder_buffer,
+        .start = encoder_buffer,
+        .end = encoder_buffer + sizeof(encoder_buffer),
+        .fatal = false,
+    };
+
+    /*
+     * Fill the command encoder with the common args:
+     * - cmd_type (int32_t)
+     * - cmd_flags (int32_t)
+     * - reply res id (uint32_t)
+     */
+
+    int32_t cmd_type = apir_cmd_type;
+
+    // for testing during the hypervisor transition
+    if (!gpu->use_apir_capset) {
+        // on the Venus capset, APIR commands live after the Venus command range
+        cmd_type += VENUS_COMMAND_TYPE_LENGTH;
+    }
+    apir_encode_int32_t(&enc, &cmd_type);
+    apir_encode_int32_t(&enc, &cmd_flags);
+
+    // tell the host which shmem page to write the reply into
+    uint32_t reply_res_id = gpu->reply_shmem.res_id;
+    apir_encode_uint32_t(&enc, &reply_res_id);
+
+    return &enc;
+}
+
+// Close out a remote call: report encode/decode failures that occurred during
+// the exchange. Purely diagnostic — nothing is freed (the encoder and decoder
+// are statics owned by remote_call_prepare()/remote_call()).
+//
+// Fix: the previous version logged a null encoder/decoder but then still
+// passed the null pointer to apir_encoder_get_fatal()/apir_decoder_get_fatal();
+// the fatal checks are now guarded by the null checks.
+void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec) {
+    UNUSED(gpu);
+
+    if (!enc) {
+        GGML_LOG_ERROR("Invalid (null) encoder\n");
+    } else if (apir_encoder_get_fatal(enc)) {
+        GGML_LOG_ERROR("Failed to encode the output parameters.\n");
+    }
+
+    if (!dec) {
+        GGML_LOG_ERROR("Invalid (null) decoder\n");
+    } else if (apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR("Failed to decode the input parameters.\n");
+    }
+}
+
+// Execute one remote call: submit the encoded command via the EXECBUFFER
+// ioctl, then poll the first word of the reply shmem page until the host
+// writes a non-zero notification value there.
+//
+// The notification word carries the host's return value + 1 (so that 0 can
+// mean "no reply yet"); the +1 is stripped before returning.
+// On success, *decoder is pointed at the reply payload (which starts right
+// after the notification word). On timeout, *decoder stays NULL and
+// APIR_FORWARD_TIMEOUT is returned. A max_wait_ms of 0 means wait forever.
+// If call_duration_ns is non-NULL, it receives the time spent waiting.
+//
+// NOTE(review): the response decoder is a function-local static, so like
+// remote_call_prepare() this is limited to one call in flight at a time.
+uint32_t remote_call(virtgpu * gpu,
+                     apir_encoder * encoder,
+                     apir_decoder ** decoder,
+                     float max_wait_ms,
+                     long long * call_duration_ns) {
+    /*
+     * Prepare the reply notification pointer
+     */
+
+    // clear the notification word before submitting, so the poll loop below
+    // only sees a value written by the host for *this* call
+    volatile std::atomic_uint * atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem.mmap_ptr;
+    *atomic_reply_notif = 0;
+
+    /*
+     * Trigger the execbuf ioctl
+     */
+
+    drm_virtgpu_execbuffer args = {
+        .flags = VIRTGPU_EXECBUF_RING_IDX,
+        .size = (uint32_t) (encoder->cur - encoder->start),
+        .command = (uintptr_t) encoder->start,
+
+        .bo_handles = 0,
+        .num_bo_handles = 0,
+
+        .fence_fd = 0,
+        .ring_idx = 0,
+        .syncobj_stride = 0,
+        .num_in_syncobjs = 0,
+        .num_out_syncobjs = 0,
+        .in_syncobjs = 0,
+        .out_syncobjs = 0,
+    };
+
+    *decoder = NULL;
+
+    int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
+
+    if (ret != 0) {
+        GGML_ABORT("%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret);
+    }
+
+    /*
+     * Wait for the response notification
+     */
+    timer_data wait_host_reply_timer = { 0, 0, 0 };
+
+    start_timer(&wait_host_reply_timer);
+
+    timespec ts_start, ts_end;
+    clock_gettime(CLOCK_MONOTONIC, &ts_start);
+    long long start_time = (long long) ts_start.tv_sec * 1000000000LL + ts_start.tv_nsec;
+
+    bool timedout = false;
+    uint32_t notif_value = 0;
+    while (true) {
+        // acquire ordering: the reply payload written by the host before the
+        // notification must be visible once we observe a non-zero value
+        notif_value = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire);
+
+        if (notif_value != 0) {
+            break;
+        }
+
+        // short sleep between polls to avoid pegging the CPU
+        int64_t base_sleep_us = 15;
+
+        os_time_sleep(base_sleep_us);
+
+        if (max_wait_ms) {
+            clock_gettime(CLOCK_MONOTONIC, &ts_end);
+            long long end_time = (long long) ts_end.tv_sec * 1000000000LL + ts_end.tv_nsec;
+            // integer ns->ms division: sub-millisecond precision is not needed here
+            float duration_ms = (end_time - start_time) / 1000000;
+
+            if (duration_ms > max_wait_ms) {
+                timedout = true;
+                break;
+            }
+        }
+    }
+
+    if (call_duration_ns) {
+        *call_duration_ns = stop_timer(&wait_host_reply_timer);
+    }
+
+    if (max_wait_ms && timedout) {
+        GGML_LOG_ERROR("timed out waiting for the host answer...\n");
+        return APIR_FORWARD_TIMEOUT;
+    }
+
+    /*
+     * Prepare the decoder
+     */
+    static apir_decoder response_dec;
+    // reply payload starts right after the notification word
+    response_dec.cur = (char *) gpu->reply_shmem.mmap_ptr + sizeof(*atomic_reply_notif);
+    response_dec.end = (char *) gpu->reply_shmem.mmap_ptr + gpu->reply_shmem.mmap_size;
+    *decoder = &response_dec;
+
+    // extract the actual return value from the notif flag
+    uint32_t returned_value = notif_value - 1;
+    return returned_value;
+}
+
+// Log how long the remote call `name` waited for its host reply, picking
+// seconds, milliseconds, or nanoseconds as the most readable unit.
+static void log_call_duration(long long call_duration_ns, const char * name) {
+    const double duration_ms = (double) call_duration_ns / 1e6;  // 1 millisecond = 1e6 nanoseconds
+    const double duration_s  = (double) call_duration_ns / 1e9;  // 1 second = 1e9 nanoseconds
+
+    if (duration_s > 1) {
+        GGML_LOG_INFO("%s: waited %.2fs for the %s host reply...\n", __func__, duration_s, name);
+    } else if (duration_ms > 1) {
+        GGML_LOG_INFO("%s: waited %.2fms for the %s host reply...\n", __func__, duration_ms, name);
+    } else {
+        GGML_LOG_INFO("%s: waited %lldns for the %s host reply...\n", __func__, call_duration_ns, name);
+    }
+}
--- /dev/null
+#pragma once
+
+#include "virtgpu-utils.h"
+#include "virtgpu-shm.h"
+#include "virtgpu-apir.h"
+
+#include "backend/shared/api_remoting.h"
+#include "backend/shared/apir_cs.h"
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <threads.h>
+#include <xf86drm.h>
+
+#include <cstring>
+
+#define VIRGL_RENDERER_UNSTABLE_APIS 1
+#include "apir_hw.h"
+#include <drm/virtgpu_drm.h>
+#include "venus_hw.h"
+
+#ifndef VIRTGPU_DRM_CAPSET_APIR
+// Will be defined include/drm/virtgpu_drm.h when
+// https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs
+// is merged
+#define VIRTGPU_DRM_CAPSET_APIR 10
+#endif
+
+// Mesa/Virglrenderer Venus internal. Only necessary during the
+// Venus->APIR transition in Virglrenderer
+#define VENUS_COMMAND_TYPE_LENGTH 331
+
+#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
+#define VIRTGPU_DRM_CAPSET_VENUS 4
+#endif
+
+typedef uint32_t virgl_renderer_capset;
+
+/* from src/virtio/vulkan/vn_renderer_virtgpu.c */
+#define VIRTGPU_PCI_VENDOR_ID 0x1af4
+#define VIRTGPU_PCI_DEVICE_ID 0x1050
+#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004
+#define VIRTGPU_PARAM_GUEST_VRAM 9
+
+#define SHMEM_DATA_SIZE 0x1830000 // ~24 MiB (24 MiB + 192 KiB)
+#define SHMEM_REPLY_SIZE 0x4000   // 16 KiB
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+// Result codes returned by the virtgpu initialization helpers.
+enum virt_gpu_result_t {
+    APIR_SUCCESS = 0,
+    APIR_ERROR_INITIALIZATION_FAILED = -1,
+};
+
+#define PRINTFLIKE(f, a) __attribute__((format(__printf__, f, a)))
+
+// State of the guest-side virtgpu frontend, created by create_virtgpu().
+struct virtgpu {
+    // true when GGML_REMOTING_USE_APIR_CAPSET is set: use the APIR capset
+    // instead of Venus (transition knob, see VENUS_COMMAND_TYPE_LENGTH)
+    bool use_apir_capset;
+
+    // fd of the opened virtio_gpu DRM render node
+    int fd;
+
+    // capset selected by virtgpu_init_capset() and its kernel-provided data
+    struct {
+        virgl_renderer_capset id;
+        uint32_t version;
+        virgl_renderer_capset_apir data;
+    } capset;
+
+    // per-res-id bookkeeping of the shmem regions (indexed by res_id)
+    util_sparse_array shmem_array;
+
+    /* APIR communication pages */
+    virtgpu_shmem reply_shmem;  // host writes call replies here (see remote_call)
+    virtgpu_shmem data_shmem;   // bulk data exchange page
+};
+
+// Forward an ioctl to the device fd; returns drmIoctl()'s result (0 on success).
+static inline int virtgpu_ioctl(virtgpu * gpu, unsigned long request, void * args) {
+    return drmIoctl(gpu->fd, request, args);
+}
+
+// Open and fully initialize the virtgpu frontend (aborts on failure).
+virtgpu * create_virtgpu();
+
+// Start a remote call: returns the encoder to fill with the command arguments.
+apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, int32_t cmd_flags);
+
+// Submit the encoded command and wait (up to max_wait_ms; 0 = forever) for the
+// host reply; on success *dec points at the reply payload.
+uint32_t remote_call(virtgpu * gpu,
+                     apir_encoder * enc,
+                     apir_decoder ** dec,
+                     float max_wait_ms,
+                     long long * call_duration_ns);
+
+// Finish a remote call: log any encode/decode failure (diagnostic only).
+void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec);