#include "llama.h"
#include "common.h"
#include "ggml-cuda.h"
+#include "ggml-sycl.h"
// utils
static uint64_t get_time_ns() {
id += "/";
}
}
+#endif
+#ifdef GGML_USE_SYCL
+ int device_list[GGML_SYCL_MAX_DEVICES];
+ ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
+
+ for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
+ if (device_list[i] >0 ){
+ char buf[128];
+ ggml_sycl_get_device_description(i, buf, sizeof(buf));
+ id += buf;
+ id += "/";
+ }
+ }
+ if (id.length() >2 ) {
+ id.pop_back();
+ }
#endif
// TODO: other backends
return id;
std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q;
std::vector<std::vector<float>> tensor_split;
+ std::vector<bool> use_mmap;
int reps;
bool verbose;
output_formats output_format;
/* no_kv_offload */ {false},
/* mul_mat_q */ {true},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
+ /* use_mmap */ {true},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+ printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
}
auto p = split<bool>(argv[i], split_delim);
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
+ } else if (arg == "-mmp" || arg == "--mmap") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = split<bool>(argv[i], split_delim);
+ params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
+ if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params;
bool no_kv_offload;
bool mul_mat_q;
std::vector<float> tensor_split;
+ bool use_mmap;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
+ mparams.use_mmap = use_mmap;
return mparams;
}
n_gpu_layers == other.n_gpu_layers &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
+ use_mmap == other.use_mmap &&
tensor_split == other.tensor_split;
}
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
+ for (const auto & mmp : params.use_mmap)
for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
+ /* .use_mmap = */ mmp,
};
instances.push_back(instance);
}
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
+ /* .use_mmap = */ mmp,
};
instances.push_back(instance);
}
static const bool vulkan;
static const bool kompute;
static const bool metal;
+ static const bool sycl;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
bool no_kv_offload;
bool mul_mat_q;
std::vector<float> tensor_split;
+ bool use_mmap;
int n_prompt;
int n_gen;
std::string test_time;
no_kv_offload = inst.no_kv_offload;
mul_mat_q = inst.mul_mat_q;
tensor_split = inst.tensor_split;
+ use_mmap = inst.use_mmap;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
// RFC 3339 date-time format
if (metal) {
return "Metal";
}
+ if (sycl) {
+ return GGML_SYCL_NAME;
+ }
if (gpu_blas) {
return "GPU BLAS";
}
if (blas) {
return "BLAS";
}
+
return "CPU";
}
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
- "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
+ "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload",
- "mul_mat_q", "tensor_split",
+ "mul_mat_q", "tensor_split", "use_mmap",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
return INT;
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
- field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
- field == "mul_mat_q") {
+ field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+ field == "mul_mat_q" || field == "use_mmap") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
- std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+ std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload),
- std::to_string(mul_mat_q), tensor_split_str,
+ std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
+const bool test::sycl = !!ggml_cpu_has_sycl();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
if (field == "no_kv_offload") {
return "nkvo";
}
+ if (field == "use_mmap") {
+ return "mmap";
+ }
if (field == "tensor_split") {
return "ts";
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.push_back("tensor_split");
}
+ if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
+ fields.push_back("use_mmap");
+ }
fields.push_back("test");
fields.push_back("t/s");
void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
void ggml_sycl_set_scratch_size(size_t scratch_size);
void ggml_sycl_free_scratch(void);
-int ggml_sycl_get_device_count(void);
void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
bool ggml_backend_is_sycl(ggml_backend_t backend);
int ggml_backend_sycl_get_device(ggml_backend_t backend);
return true;
}
+GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+ int max_compute_units = -1;
+ for(int i=0;i<max_len;i++) id_list[i] = 0;
+
+ int device_count = dpct::dev_mgr::instance().device_count();
+
+ for(int id=0; id< device_count; id++){
+ sycl::device device = dpct::dev_mgr::instance().get_device(id);
+ if (!device.is_gpu()) continue;
+ dpct::device_info prop;
+ dpct::get_device_info(prop, device);
+ if(max_compute_units < prop.get_max_compute_units()) max_compute_units = prop.get_max_compute_units();
+ }
+
+ for(int id=0;id< device_count;id++){
+ sycl::device device = dpct::dev_mgr::instance().get_device(id);
+ if (!device.is_gpu()) continue;
+ dpct::device_info prop;
+ dpct::get_device_info(prop, device);
+ if(max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1 ){
+ id_list[id] = 1;
+ }
+ }
+ return;
+}
+catch (sycl::exception const &exc) {
+ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+ << ", line:" << __LINE__ << std::endl;
+ std::exit(1);
+}
+
int ggml_sycl_get_device_count() try {
int device_count;
if (CHECK_TRY_ERROR(device_count =
std::exit(1);
}
-void ggml_sycl_get_device_description(int device, char *description,
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
size_t description_size) try {
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(