#include <regex>
#include <sycl/sycl.hpp>
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
+#endif
#include <sycl/half_type.hpp>
#include "ggml-sycl.h"
int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
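+// Set during initialization: non-zero when SYCL graphs are enabled and every device supports async memory allocation.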
+int g_ggml_sycl_use_async_mem_op = 0;
static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/
-
+ // Currently we only use async malloc / free when graphs are enabled, since that is required for these calls to
+ // be recorded properly. As this SYCL extension matures it may become beneficial to make it the default path and
+ // to use it in other places.
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+ g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
+ if (g_ggml_sycl_use_async_mem_op) {
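+ // Only keep the async path if every SYCL device reports the async memory allocation aspect.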
+ for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
+ if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
+ g_ggml_sycl_use_async_mem_op = 0;
+ break;
+ }
+ }
+ }
+#endif
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
}
}
+// Helper functions to unify device memory allocation for both async and sync paths
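+// The async path uses the stream-ordered syclex::async_malloc / async_free calls from the oneAPI async memory
+// allocation extension (SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC), which can be recorded into a SYCL graph; the sync
+// path falls back to regular blocking USM allocation and free calls.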
+static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size) {
+ bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+ if (use_async) {
+ return syclex::async_malloc(*stream, sycl::usm::alloc::device, size);
+ }
+#else
+ // If the async allocation extension is not available, use_async should always be false.
+ GGML_ASSERT(!use_async);
+#endif
+ return sycl::malloc(size, *stream, sycl::usm::alloc::device);
+}
+
+static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
+ bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+ if (use_async) {
+ syclex::async_free(*stream, ptr);
+ return;
+ }
+#else
+ // If the async allocation extension is not available, use_async should always be false.
+ GGML_ASSERT(!use_async);
+#endif
+ sycl::free(ptr, *stream);
+}
+
static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
dpct::queue_ptr stream) {
- auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
- SYCL_CHECK(
- CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
- .wait()));
+ uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+ sycl::event copy_event;
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
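+ // Host-side waits cannot be recorded into a SYCL graph, so only block on the copy in the synchronous path;
+ // on the async path the in-order queue keeps the copy, the reorder kernel and the free correctly ordered.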
+ if (!g_ggml_sycl_use_async_mem_op) {
+ copy_event.wait();
+ }
+
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
int offset_blks = offset / sizeof(block_q4_0);
auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
- stream->parallel_for(
+ auto reorder_event = stream->parallel_for(
size / sizeof(block_q4_0),
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
const block_q4_0* x = (const block_q4_0*)tmp_buf;
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
}
*(d_ptr + ib) = x[ib].d;
- }).wait_and_throw();
-
- sycl::free(tmp_buf, *stream);
+ });
+ if (!g_ggml_sycl_use_async_mem_op) {
+ reorder_event.wait_and_throw();
+ }
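+ // On the async path the free is enqueued after the reorder kernel; the synchronous path has already waited above.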
+ sycl_ext_free(stream, tmp_buf);
}
static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
const int nblocks = size / sizeof(block_q4_K);
- auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
- SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+ uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+ sycl::event copy_event;
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+ if (!g_ggml_sycl_use_async_mem_op) {
+ copy_event.wait();
+ }
auto * qs_ptr = data_device;
auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
- stream->parallel_for(nblocks, [=](auto i) {
+ auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q4_K * x = (const block_q4_K *) tmp_buf;
const int ib = i;
}
dm_ptr[ib] = x[ib].dm;
- }).wait_and_throw();
-
- sycl::free(tmp_buf, *stream);
+ });
+ if (!g_ggml_sycl_use_async_mem_op) {
+ reorder_event.wait_and_throw();
+ }
+ sycl_ext_free(stream, tmp_buf);
}
static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
const int nblocks = size / sizeof(block_q6_K);
- auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
- SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+ uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+ sycl::event copy_event;
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+ if (!g_ggml_sycl_use_async_mem_op) {
+ copy_event.wait();
+ }
auto * ql_ptr = data_device;
auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
- stream
- ->parallel_for(nblocks,
- [=](auto i) {
- const block_q6_K * x = (const block_q6_K *) tmp_buf;
- const int ib = i;
-
- const uint8_t * ql = x[ib].ql;
- const uint8_t * qh = x[ib].qh;
- uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
- uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
- uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
+ auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
+ const block_q6_K * x = (const block_q6_K *) tmp_buf;
+ const int ib = i;
- for (int j = 0; j < QK_K / 2; ++j) {
- base_ql_ptr[j] = ql[j];
- }
- for (int j = 0; j < QK_K / 4; ++j) {
- base_qh_ptr[j] = qh[j];
- }
+ const uint8_t * ql = x[ib].ql;
+ const uint8_t * qh = x[ib].qh;
+ uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
+ uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
+ uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
- for (int j = 0; j < QK_K / 16; ++j) {
- base_scales_ptr[j] = x[ib].scales[j];
- }
+ for (int j = 0; j < QK_K / 2; ++j) {
+ base_ql_ptr[j] = ql[j];
+ }
+ for (int j = 0; j < QK_K / 4; ++j) {
+ base_qh_ptr[j] = qh[j];
+ }
- dm_ptr[ib] = x[ib].d;
- })
- .wait_and_throw();
+ for (int j = 0; j < QK_K / 16; ++j) {
+ base_scales_ptr[j] = x[ib].scales[j];
+ }
- sycl::free(tmp_buf, *stream);
+ dm_ptr[ib] = x[ib].d;
+ });
+ if (!g_ggml_sycl_use_async_mem_op) {
+ reorder_event.wait_and_throw();
+ }
+ sycl_ext_free(stream, tmp_buf);
}
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
ggml_op_name(node_op));
return false;
+ case GGML_OP_MUL_MAT:
+ // We cannot use graphs with ggml_sycl_mul_mat() when the SYCL async memory allocation extension is not
+ // available, because the reorder path then performs SYCL malloc / free and host wait calls, none of which are
+ // supported while recording to a graph.
+ if (!g_ggml_sycl_use_async_mem_op) {
+ GGML_LOG_INFO(
+ "%s: disabling SYCL graphs due to unsupported node type %s (the compiler lacks the "
+ "oneAPI async memory allocation extension)\n",
+ __func__, ggml_op_name(node_op));
+ return false;
+ }
}
}
return true;