// for the first partial layer of a device varying parts can overflow; all further partial layers use LAYER_FRACTION_MOE:
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
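+ // number of fully offloaded layers on this device, i.e. layers for which no tensors overflow: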
+ uint32_t n_full() const {
+ assert(n_layer >= n_part);
+ return n_layer - n_part;
+ }
};
const size_t ntbo = llama_max_tensor_buft_overrides();
size_t itbo = 0;
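// emit one buffer type override per partial layer; the full layers of each device are skipped by advancing il0: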
for (size_t id = 0; id < nd; id++) {
- il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
+ il0 += ngl_per_device[id].n_full();
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
+ // assumption: the truncated statement here was a throw; the exact message prefix is reconstructed:
+ throw std::runtime_error("llama_max_tensor_buft_overrides()=" +
+     std::to_string(ntbo) + " is insufficient for model");
}
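// the first partial layer uses the device-specific overflow type and buft,
// all further partial layers overflow their MoE tensors to system memory: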
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
- tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+ tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
itbo++;
}
il0 += ngl_per_device[id].n_part;
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
- std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
overflow_bufts.reserve(nd);
- for (size_t id = 0; id < nd - 1; ++id) {
- overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
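+ // initially all overflow goes to system memory; an entry may later be redirected
+ // to the next device's buffer type when squeezing in an extra partial layer (see below):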
+ for (size_t id = 0; id < nd; id++) {
+ overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
}
- overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
- if (hp_nex > 0) {
- for (size_t id = 0; id < nd; id++) {
- ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
- }
- }
// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
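// - ngl_per_device_high has more layers for each device, upper bound
// memory use is approximately linear in the number of layers, so instead of bisecting,
// interpolate between the bounds: step_size = delta * (target - mem_low) / (mem_high - mem_low)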
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
- if (hp_nex > 0 && size_t(id) == nd - 1) {
- delta--;
- }
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
- ngl_per_device_test[id].n_part += step_size;
+ ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+ step_size - 1 : step_size; // the first layer on the last device is the output layer, which must always be a full layer
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
assert(id_dense_start < nd);
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
- for (size_t id = 0; id <= id_dense_start; id++) {
+ for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
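// upper bound: move as many layers as possible from the devices at and after id_dense_start: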
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
- assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
- assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
- assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
- uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+ uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;
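+ // stop converting once the device at id_dense_start_test has at least one partial layer left: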
- if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
+ if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
break;
}
}
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
- delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+ delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
}
} else {
ngl_per_device = ngl_per_device_high;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
- if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
+ if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
id_dense_start_test++;
}
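// try to fit one extra layer on device id, testing overflow types UP, GATE, and ATTN in turn: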
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
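+ // the extra layer's overflow goes to the next device if there is one, otherwise to system memory: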
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+ if (id < nd - 1) {
+ overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+ }
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
- std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+ std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
LLAMA_LOG_INFO("%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
+ // print info for devices that were not changed during the conversion from dense-only to full layers:
+ for (size_t id = id_dense_start + 1; id < nd; id++) {
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
+ LLAMA_LOG_INFO(
+ "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+ }
+
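// commit the final layer distribution, tensor split, and buffer type overrides to the model params: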
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}