int work_group_size = 0;
std::string gpus_list = "";
- /*
- Use all GPU with same top max compute units
- */
sycl_gpu_mgr() {
detect_sycl_gpu_list_with_max_cu();
get_allow_gpus();
create_context_with_gpus();
}
- /*
- Use the assigned GPU as only one
- */
- sycl_gpu_mgr(int main_gpu_id) {
- sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
- dpct::device_info prop;
- dpct::get_device_info(prop, device);
- gpus.push_back(main_gpu_id);
- devices.push_back(device);
- work_group_size = prop.get_max_work_group_size();
- max_compute_units = prop.get_max_compute_units();
-
- get_allow_gpus();
- create_context_with_gpus();
- }
-
void create_context_with_gpus() {
sycl::context ctx = sycl::context(devices);
assert(gpus.size() > 0);
gpus_list += std::to_string(gpus[i]);
gpus_list += ",";
}
- if (gpus_list.length() > 1) {
+ if (gpus_list.length() > 2) {
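+ // remove the trailing comma appended by the loop above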
gpus_list.pop_back();
}
}
if (gpus[i] == id)
return i;
}
- printf("miss to get device index by id=%d\n", id);
- GGML_ASSERT(false);
+ assert(false);
+ return -1;
}
int get_next_index(int id) {
if (gpus[i] == id)
return i;
}
- GGML_ASSERT(false);
+ assert(false);
+ return -1;
}
};
static int g_all_sycl_device_count = -1;
static int g_main_device = -1;
static int g_main_device_id = -1;
-static bool g_ggml_backend_sycl_buffer_type_initialized = false;
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
}
void print_gpu_device_list() {
- fprintf(stderr, "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n",
+ fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
g_sycl_gpu_mgr->get_gpu_count(),
g_sycl_gpu_mgr->gpus_list.c_str(),
g_sycl_gpu_mgr->max_compute_units);
#else
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif
-
-/* NOT REMOVE, keep it for next optimize for XMX.
-#if defined(SYCL_USE_XMX)
- fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
- fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-*/
-
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
}
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();
- if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
- print_gpu_device_list();
- initialized = true;
- g_sycl_loaded = true;
- }
+ if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
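+ // cache the detected GPU count and the shared max work-group size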
+ g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+ g_work_group_size = g_sycl_gpu_mgr->work_group_size;
- g_device_count = g_sycl_gpu_mgr->get_gpu_count();
- g_work_group_size = g_sycl_gpu_mgr->work_group_size;
+ print_gpu_device_list();
- int64_t total_vram = 0;
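+ // total VRAM across all detected devices, used to derive the default tensor split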
+ int64_t total_vram = 0;
+/* DO NOT REMOVE: kept for a future XMX optimization.
+#if defined(SYCL_USE_XMX)
+ fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+ fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+*/
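+ // reset capability and tensor-split tables for every possible device slot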
+ for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+ g_device_caps[id].vmm = 0;
+ g_device_caps[id].device_id = -1;
+ g_device_caps[id].cc = 0;
+ g_tensor_split[id] = 0;
+ g_default_tensor_split[id] = 0;
+ }
- for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
- g_device_caps[id].vmm = 0;
- g_device_caps[id].device_id = -1;
- g_device_caps[id].cc = 0;
- g_tensor_split[id] = 0;
- g_default_tensor_split[id] = 0;
- }
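+ // fill in the capabilities of each detected SYCL device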
+ for (int i = 0; i < g_device_count; ++i) {
+ int device_id = g_sycl_gpu_mgr->gpus[i];
+ g_device_caps[i].vmm = 0;
- for (int i = 0; i < g_device_count; ++i) {
- int device_id = g_sycl_gpu_mgr->gpus[i];
- g_device_caps[i].vmm = 0;
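+ // query the device properties for this SYCL device id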
+ dpct::device_info prop;
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+ prop, dpct::dev_mgr::instance().get_device(device_id))));
- dpct::device_info prop;
- SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
- prop, dpct::dev_mgr::instance().get_device(device_id))));
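+ // record the cumulative VRAM seen so far, then add this device's global memory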
+ g_default_tensor_split[i] = total_vram;
+ total_vram += prop.get_global_mem_size();
- g_default_tensor_split[i] = total_vram;
- total_vram += prop.get_global_mem_size();
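+ // encode the compute capability as major*100 + minor*10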
+ g_device_caps[i].cc =
+ 100 * prop.get_major_version() + 10 * prop.get_minor_version();
+ }
- g_device_caps[i].cc =
- 100 * prop.get_major_version() + 10 * prop.get_minor_version();
- }
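+ // convert the cumulative sums into fractions of total VRAM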
+ for (int i = 0; i < g_device_count; ++i) {
+ g_default_tensor_split[i] /= total_vram;
+ }
- for (int i = 0; i < g_device_count; ++i) {
- g_default_tensor_split[i] /= total_vram;
- }
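+ // per-device setup: select the device, then create its streams and handle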
+ for (int i = 0; i < g_device_count; ++i) {
+ SYCL_CHECK(ggml_sycl_set_device(i));
- for (int i = 0; i < g_device_count; ++i) {
- SYCL_CHECK(ggml_sycl_set_device(i));
+ // create sycl streams
+ for (int is = 0; is < MAX_STREAMS; ++is) {
+ SYCL_CHECK(CHECK_TRY_ERROR(
+ g_syclStreams[i][is] =
+ dpct::get_current_device().create_queue(
+ g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+ }
- // create sycl streams
- for (int is = 0; is < MAX_STREAMS; ++is) {
- SYCL_CHECK(CHECK_TRY_ERROR(
- g_syclStreams[i][is] =
- dpct::get_current_device().create_queue(
- g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+ const dpct::queue_ptr stream = g_syclStreams[i][0];
+ // create sycl handle
+ SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
}
- const dpct::queue_ptr stream = g_syclStreams[i][0];
- // create sycl handle
- SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
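+ // mark the backend as initialized only after every device is set up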
+ initialized = true;
+ g_sycl_loaded = true;
}
}
catch (sycl::exception const &exc) {
/* .is_host = */ nullptr,
};
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
- if (device_index>=g_device_count or device_index<0) {
- printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
- device_index, g_device_count-1);
- GGML_ASSERT(device_index<g_device_count);
- }
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
- if (!g_ggml_backend_sycl_buffer_type_initialized) {
+ static bool ggml_backend_sycl_buffer_type_initialized = false;
+
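+ // lazily build one buffer type per detected device on first use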
+ if (!ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < g_device_count; i++) {
ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
};
}
- g_ggml_backend_sycl_buffer_type_initialized = true;
+ ggml_backend_sycl_buffer_type_initialized = true;
}
- return &ggml_backend_sycl_buffer_types[device_index];
+
+ return &ggml_backend_sycl_buffer_types[device];
}
// sycl split buffer type
return g_sycl_gpu_mgr->get_index(device_id);
}
-GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
- GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
- printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
- if (g_sycl_gpu_mgr) {
- delete g_sycl_gpu_mgr;
- }
- g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
- ggml_init_sycl();
- g_ggml_backend_sycl_buffer_type_initialized = false;
-}
-
extern "C" int ggml_backend_sycl_reg_devices();
int ggml_backend_sycl_reg_devices() {