int32_t opt_i = 0;
bool loss_per_datapoint = false;
- ggml_opt_get_optimizer_params get_opt_pars = nullptr;
- void * get_opt_pars_ud = nullptr;
- struct ggml_tensor * adamw_params = nullptr;
+ ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+ void * get_opt_pars_ud = nullptr;
+ struct ggml_tensor * opt_step_params = nullptr; // Stores output of get_opt_pars.
+
+ enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
};
struct ggml_opt_result {
result.adamw.eps = 1e-8f;
result.adamw.wd = 0.0f;
+ result.sgd.alpha = 1e-3f;
+ result.sgd.wd = 0.0f;
+
return result;
}
+
struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
return *((struct ggml_opt_optimizer_params *) userdata);
}
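+// Illustrative usage (a sketch, not code from this file): keep the optimizer parameters fixed for
+// the whole run by passing a caller-owned struct through the userdata pointer of a ggml_opt_params
+// instance (here called opt_params):
+//
+//     struct ggml_opt_optimizer_params pars = ggml_opt_get_default_optimizer_params(nullptr);
+//     pars.sgd.alpha             = 1e-2f;
+//     opt_params.get_opt_pars    = ggml_opt_get_constant_optimizer_params;
+//     opt_params.get_opt_pars_ud = &pars; // must stay alive for as long as the context is used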
/*opt_period =*/ 1,
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
/*get_opt_pars_ud =*/ nullptr,
+ /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
};
}
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
+ const enum ggml_opt_optimizer_type optimizer = opt_ctx->optimizer;
+
const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
!(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
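+ // Only AdamW keeps per-parameter state (first/second moments), so the momentum tensors
+ // below are allocated for AdamW only.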
+ const bool need_momenta = opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT &&
+ opt_ctx->optimizer == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+
ggml_set_input(opt_ctx->inputs);
ggml_set_output(opt_ctx->outputs);
// - pred (if using static graphs)
// - ncorrect (if using static graphs, 2 tensors).
constexpr size_t n_loss = 1;
- const size_t tensors_per_param = (accumulate ? 1 : 0) +
- (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+ const size_t tensors_per_param = (accumulate ? 1 : 0) + (need_momenta ? 2 : 0);
const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
struct ggml_init_params params = {
}
}
- if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
+ if (need_momenta && opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
opt_ctx->grad_m.resize(n_nodes);
opt_ctx->grad_v.resize(n_nodes);
for (int i = 0; i < n_nodes; ++i) {
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
- opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
- ggml_set_input(opt_ctx->adamw_params);
- ggml_set_name(opt_ctx->adamw_params, "adamw_params");
-
+ opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2);
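+ // Filled in by ggml_opt_eval(); layout:
+ //   AdamW: [alpha, beta1, beta2, eps, wd, beta1h, beta2h]  (7 floats)
+ //   SGD:   [alpha, wd]                                     (2 floats)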
+ ggml_tensor * adamw_params = opt_ctx->opt_step_params;
+ ggml_set_input(adamw_params);
+ const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer);
+ ggml_format_name(adamw_params, "%s_params", optimizer_name);
for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
- struct ggml_tensor * m = opt_ctx->grad_m[i];
- struct ggml_tensor * v = opt_ctx->grad_v[i];
- struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
-
- ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
- ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
- ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
-
+ struct ggml_tensor * m = nullptr;
+ struct ggml_tensor * v = nullptr;
+ if (need_momenta) {
+ m = opt_ctx->grad_m[i];
+ v = opt_ctx->grad_v[i];
+ ggml_format_name(m, "AdamW m for %s", node->name);
+ ggml_format_name(v, "AdamW v for %s", node->name);
+ }
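+ // A sketch of the standard update rules the backend step kernels implement (per element):
+ //   AdamW: m = beta1*m + (1-beta1)*g;  v = beta2*v + (1-beta2)*g*g
+ //          x = x*(1 - alpha*wd) - alpha*(m*beta1h) / (sqrt(v*beta2h) + eps)
+ //   SGD:   x = x*(1 - alpha*wd) - alpha*g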
+ struct ggml_tensor * opt_step;
+ switch (optimizer) {
+ case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
+ opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, adamw_params);
+ break;
+ case GGML_OPT_OPTIMIZER_TYPE_SGD:
+ opt_step = ggml_opt_step_sgd(opt_ctx->ctx_compute, node, grad, adamw_params);
+ break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+ ggml_format_name(opt_step, "%s step for %s", optimizer_name, node->name);
ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
}
}
result->opt_period = params.opt_period;
result->get_opt_pars = params.get_opt_pars;
result->get_opt_pars_ud = params.get_opt_pars_ud;
+ result->optimizer = params.optimizer;
GGML_ASSERT(result->opt_period >= 1);
void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
GGML_ASSERT(opt_ctx->eval_ready);
if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
- struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
-
- GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
- GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
- GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
- GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
-
- // beta1, beta2 after applying warmup
- const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
- const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
-
- float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
- adamw_par_data[0] = opt_pars.adamw.alpha;
- adamw_par_data[1] = opt_pars.adamw.beta1;
- adamw_par_data[2] = opt_pars.adamw.beta2;
- adamw_par_data[3] = opt_pars.adamw.eps;
- adamw_par_data[4] = opt_pars.adamw.wd;
- adamw_par_data[5] = beta1h;
- adamw_par_data[6] = beta2h;
+ const ggml_opt_optimizer_params & opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
+
+ switch (opt_ctx->optimizer) {
+ case GGML_OPT_OPTIMIZER_TYPE_ADAMW: {
+ GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
+ GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
+ GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
+ GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
+
+ // bias-correction factors for beta1, beta2 at the current iteration
+ const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
+ const float beta2h = 1.0f / (1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
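+ // e.g. at iter=1 with beta1=0.9 the usual first-moment update gives m = 0.1*g, and
+ // beta1h = 1/(1 - 0.9) = 10, so m*beta1h recovers the raw gradient despite the zero-initialized moment.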
+
+ float * adamw_par_data = ggml_get_data_f32(opt_ctx->opt_step_params);
+ adamw_par_data[0] = opt_pars.adamw.alpha;
+ adamw_par_data[1] = opt_pars.adamw.beta1;
+ adamw_par_data[2] = opt_pars.adamw.beta2;
+ adamw_par_data[3] = opt_pars.adamw.eps;
+ adamw_par_data[4] = opt_pars.adamw.wd;
+ adamw_par_data[5] = beta1h;
+ adamw_par_data[6] = beta2h;
+ } break;
+ case GGML_OPT_OPTIMIZER_TYPE_SGD: {
+ GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);
+ GGML_ASSERT(opt_pars.sgd.wd >= 0.0f);
+ GGML_ASSERT(opt_pars.sgd.wd <= 1.0f);
+ float * sgd = ggml_get_data_f32(opt_ctx->opt_step_params);
+ sgd[0] = opt_pars.sgd.alpha;
+ sgd[1] = opt_pars.sgd.wd;
+ } break;
+ default:
+ GGML_ABORT("fatal error");
+ }
}
ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
ggml_tensor * outputs,
ggml_opt_dataset_t dataset,
enum ggml_opt_loss_type loss_type,
+ enum ggml_opt_optimizer_type optimizer,
ggml_opt_get_optimizer_params get_opt_pars,
int64_t nepoch,
int64_t nbatch_logical,
params.opt_period = opt_period;
params.get_opt_pars = get_opt_pars;
params.get_opt_pars_ud = &epoch;
+ params.optimizer = optimizer;
ggml_opt_context_t opt_ctx = ggml_opt_init(params);
// Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
ggml_opt_result_free(result_train);
ggml_opt_result_free(result_val);
}
+
+enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t c) {
+ return c->optimizer;
+}
+
+GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type o) {
+ switch (o) {
+ case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
+ return "adamw";
+ case GGML_OPT_OPTIMIZER_TYPE_SGD:
+ return "sgd";
+ default:
+ return "undefined";
+ }
+}
+// TODO refactor
+
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml-opt.h"
+#include "../ggml/src/ggml-impl.h"
+#include "../common/common.h"
#include <cmath>
#include <cinttypes>
+#include <cstring> // strcmp
#include <thread>
#include <vector>
+#define TEST_LOG(...) GGML_LOG_DEBUG(__VA_ARGS__)
+
static bool almost_equal(const double a, const double b, const double atol) {
return fabs(a - b) < atol;
}
// These default values make it easier to check optimization results vs. expected values.
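+// With alpha = 1, beta1 = beta2 = 0, eps = 0 and wd = 0 the standard AdamW update reduces to
+// x -= sign(grad) (m = grad, v = grad*grad, so alpha*m/sqrt(v) = sign(grad)); with sgd.alpha = 1 and
+// sgd.wd = 0 plain SGD reduces to x -= grad. Either way a single optimizer step changes the test
+// weights by a known amount (a sketch based on the usual update rules; the exact kernels live in
+// the backends).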
static ggml_opt_optimizer_params helper_get_test_opt_pars(void * userdata) {
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
+
result.adamw.alpha = 1.0f;
result.adamw.beta1 = 0.0f;
result.adamw.beta2 = 0.0f;
result.adamw.eps = 0.0f;
+ result.adamw.wd = 0.0f;
+ result.sgd.wd = 0.0f;
+ result.sgd.alpha = 1.0f;
+
return result;
}
static helper_ctx_data helper_get_ctx_data(
+ enum ggml_opt_optimizer_type optim,
ggml_backend_sched_t backend_sched,
ggml_backend_t backend,
const bool init_opt_ctx = true,
opt_params.inputs = inputs;
opt_params.outputs = outputs;
opt_params.opt_period = opt_period;
+ opt_params.optimizer = optim;
if (!optimizer_defaults) {
opt_params.get_opt_pars = helper_get_test_opt_pars;
}
+ GGML_ASSERT(opt_params.get_opt_pars);
ggml_opt_context_t opt_ctx = init_opt_ctx ? ggml_opt_init(opt_params) : nullptr;
+ GGML_ASSERT(!opt_ctx || ggml_opt_context_optimizer_type(opt_ctx) == opt_params.optimizer);
ggml_opt_result_t result = ggml_opt_result_init();
ggml_opt_result_t result2 = ggml_opt_result_init();
ggml_opt_dataset_free(ctx_data.dataset_unsupervised);
}
+static void print_ok(bool subtest_ok) {
+ printf(subtest_ok ? "\033[1;32mOK\033[0m\n" : "\033[1;31mFAIL\033[0m\n");
+}
+
static void helper_after_test(
+ enum ggml_opt_optimizer_type optim,
const char * func, const bool high_level, const std::string options,
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
- printf(" %s(high_level=%s%s, subtest=%s): ",
- func, high_level ? "yes" : "no", options.c_str(), subtest.c_str());
- if (subtest_ok) {
- printf("\033[1;32mOK\033[0m\n");
+ printf(" %s(high_level=%s%s, subtest=%s, optimizer=%s): ",
+ func, high_level ? "yes" : "no", options.c_str(), subtest.c_str(), ggml_opt_optimizer_name(optim));
+ print_ok(subtest_ok);
+ if (subtest_ok) {
npass++;
- } else {
- printf("\033[1;31mFAIL\033[0m\n");
- }
+ }
ntest++;
}
-static std::pair<int, int> test_dataset(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) {
+static void print_ok(const char * func, bool subtest_ok, int & npass, int & ntest, const char * args = "") {
+ printf("  %s(%s): ", func, args);
+ print_ok(subtest_ok);
+ if (subtest_ok) {
+ npass++;
+ }
+ ntest++;
+}
+
+static std::pair<int, int> test_dataset(
+ enum ggml_opt_optimizer_type optim,
+ ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) {
int ntest = 0;
int npass = 0;
- struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend);
+ struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend);
for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
ggml_opt_dataset_t dataset = cd.datasets_supervised[ndata_shard-1];
return std::make_pair(npass, ntest);
}
-static std::pair<int, int> test_grad(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+static std::pair<int, int> test_grad(
+ enum ggml_opt_optimizer_type optim,
+ ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
int ntest = 0;
int npass = 0;
- struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false,
+ struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false,
/*nbatch_logical =*/ 999999, /*nbatch_physical =*/ 1);
std::vector<float> grad_history(ndata);
for (int idata = 0; idata < ndata; ++idata) {
const float idataf = idata;
ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
ggml_opt_eval(cd.opt_ctx, cd.result);
ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, sizeof(float));
}
static void helper_after_test_forward_backward(
+ enum ggml_opt_optimizer_type optim,
const char * func, const bool high_level, const bool shuffle,
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
std::string options = ", shuffle=";
options += shuffle ? "yes" : "no";
- helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
+ helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass);
}
static std::pair<int, int> test_forward_backward(
+ enum ggml_opt_optimizer_type optim,
ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level, const bool shuffle) {
int ntest = 0;
int npass = 0;
- struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
+ struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
std::vector<float> loss_history(ndata);
double accuracy_unc;
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
const bool subtest_ok = ndata == 0 && loss == 0.0 && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc);
- helper_after_test_forward_backward(__func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
+ helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
}
if (high_level) {
float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
const bool subtest_ok = weights == ndata/2;
- helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
+ helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
}
{
int64_t ndata;
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
- helper_after_test_forward_backward(__func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass);
+ helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass);
}
float w0;
ggml_backend_tensor_get(cd.weights, &w0, 0, sizeof(float));
for (int i = 0; i < 10; ++i) {
ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
ggml_opt_eval(cd.opt_ctx, cd.result);
}
ggml_backend_tensor_set(cd.weights, &w0, 0, sizeof(float));
{
float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
- const bool subtest_ok = weights == -ndata/2;
- helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
+ const bool subtest_ok = weights == -ndata * 0.5;
+ TEST_LOG("%s: ndata=%d weights=%f\n", __func__, (int) ndata, (double) weights);
+ helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
}
{
int64_t ndata;
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
- helper_after_test_forward_backward(__func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass);
+ helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass);
}
helper_free_ctx_data(cd);
return std::make_pair(npass, ntest);
}
-static std::pair<int, int> test_epoch_vs_fit(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+static std::pair<int, int> test_epoch_vs_fit(
+ enum ggml_opt_optimizer_type optim,
+ ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
int ntest = 0;
int npass = 0;
float weights_fit;
{
- struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true);
+ struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true);
ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
ggml_backend_tensor_get(cd.weights, &weights_epoch, 0, ggml_nbytes(cd.weights));
helper_free_ctx_data(cd);
}
{
- struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ false);
+ struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ false);
ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
- ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset,
- GGML_OPT_LOSS_TYPE_SUM, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true);
+ ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset, GGML_OPT_LOSS_TYPE_SUM,
+ optim, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true);
ggml_backend_tensor_get(cd.weights, &weights_fit, 0, ggml_nbytes(cd.weights));
helper_free_ctx_data(cd);
const bool subtest_ok = weights_epoch == weights_fit;
- printf(" %s(): ", __func__);
- if (subtest_ok) {
- printf("\033[1;32mOK\033[0m\n");
- npass++;
- } else {
- printf("\033[1;31mFAIL\033[0m\n");
- }
- ntest++;
+ print_ok(__func__, subtest_ok, npass, ntest);
return std::make_pair(npass, ntest);
}
static void helper_after_test_idata_split(
+ enum ggml_opt_optimizer_type optim,
const char * func, const bool high_level, const int epoch,
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
std::string options = ", epoch=";
options += std::to_string(epoch);
- helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
+ helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass);
}
-static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) {
+static std::pair<int, int> test_idata_split(
+ enum ggml_opt_optimizer_type optim,
+ ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) {
int ntest = 0;
int npass = 0;
- struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
+ struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
const int idata_split = ndata * 2/3;
loss_history[idata] = NAN;
}
+ const bool adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
for (int epoch = 1; epoch <= 4; ++epoch) {
if (high_level) {
ggml_opt_epoch(cd.opt_ctx, cd.dataset_unsupervised, cd.result, cd.result2, idata_split, nullptr, nullptr);
}
}
- {
+ if (adamw) {
float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
const bool subtest_ok = weights == ndata/2 - epoch*idata_split;
- helper_after_test_idata_split(__func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
+ helper_after_test_idata_split(optim, __func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
}
- {
+ if (adamw) {
int64_t ndata_result;
ggml_opt_result_ndata(cd.result, &ndata_result);
bool subtest_ok = ndata_result == idata_split;
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
- helper_after_test_idata_split(__func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
+ helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
}
- {
+ if (adamw) {
int64_t ndata_result;
ggml_opt_result_ndata(cd.result2, &ndata_result);
bool subtest_ok = ndata_result == ndata - idata_split;
ggml_opt_result_accuracy(cd.result2, &accuracy, &accuracy_unc);
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
- helper_after_test_idata_split(__func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass);
+ helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass);
}
ggml_opt_result_reset(cd.result);
}
static void helper_after_test_gradient_accumulation(
+ enum ggml_opt_optimizer_type optim,
const char * func, const int nbatch_physical, const enum ggml_opt_loss_type loss_type, const int epoch,
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
std::string options = ", nbatch_physical=";
options += loss_type == GGML_OPT_LOSS_TYPE_MEAN ? "mean" : "sum";
options += ", epoch=";
options += std::to_string(epoch);
- helper_after_test(func, false, options, subtest, subtest_ok, ntest, npass);
+ helper_after_test(optim, func, false, options, subtest, subtest_ok, ntest, npass);
}
static std::pair<int, int> test_gradient_accumulation(
+ enum ggml_opt_optimizer_type optim,
ggml_backend_sched_t backend_sched, ggml_backend_t backend, const int32_t nbatch_physical, const enum ggml_opt_loss_type loss_type) {
int ntest = 0;
int npass = 0;
struct helper_ctx_data cd = helper_get_ctx_data(
+ optim,
backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, /*nbatch_logical =*/ 6, nbatch_physical, loss_type);
std::vector<float> grad_history(ndata);
grad_history[idata] = NAN;
}
+ const bool adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+ // The per-epoch gradient and weight expectations below assume AdamW updates, so skip the loop for SGD.
+ if (adamw)
for (int epoch = 1; epoch <= 4; ++epoch) {
if (nbatch_physical == 1) {
for (int idata = 0; idata < ndata; ++idata) {
} else {
GGML_ASSERT(false);
}
- helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass);
+ helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass);
}
- {
+ if (adamw) {
float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
const bool subtest_ok = weights == (ndata/2) - epoch;
- helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
+ helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
}
{
int64_t ndata_result;
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
- helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass);
+ helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass);
}
ggml_opt_result_reset(cd.result);
return std::make_pair(npass, ntest);
}
+// SGD needs a much smaller learning rate and many more epochs than AdamW to reach a roughly
+// comparable fit in the regression test below.
+constexpr float g_sgd_lr = 1e-4f;
+
+constexpr int g_sgd_epochs = 900;
+
static ggml_opt_optimizer_params helper_get_regression_opt_pars(void * userdata) {
- ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
+ const int64_t epoch = *((const int64_t *) userdata);
+ ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
result.adamw.alpha = 0.1f;
+ // Exponentially decay the SGD learning rate: over the full g_sgd_epochs run alpha shrinks by a
+ // factor of 0.99^1000 (roughly 4e-5) relative to g_sgd_lr.
+ result.sgd.alpha = g_sgd_lr * std::pow(0.99, 1000.0 * epoch / g_sgd_epochs);
+ result.sgd.wd = 1e-10f;
return result;
}
-static std::pair<int, int> test_regression(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+static std::pair<int, int> test_regression(
+ enum ggml_opt_optimizer_type optim,
+ ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
int ntest = 0;
int npass = 0;
ggml_backend_tensor_set(a, &a0, 0, sizeof(float));
ggml_backend_tensor_set(b, &b0, 0, sizeof(float));
- ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
- helper_get_regression_opt_pars, 100, ndata_regression, 0.0f, true);
+ const bool adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+ const int64_t n_epoch = adamw ? 100 : g_sgd_epochs;
+ ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, optim,
+ helper_get_regression_opt_pars, n_epoch, ndata_regression, 0.0f, true);
{
float a_fit;
ggml_backend_tensor_get(a, &a_fit, 0, sizeof(float));
float b_fit;
ggml_backend_tensor_get(b, &b_fit, 0, sizeof(float));
- const bool subtest_ok = almost_equal(a_fit, a_true, 1e-2) && almost_equal(b_fit, b_true, 1e-2);
- printf(" %s(subtest=weights): ", __func__);
- if (subtest_ok) {
- printf("\033[1;32mOK\033[0m\n");
- npass++;
- } else {
- printf("\033[1;31mFAIL\033[0m\n");
- }
- ntest++;
+ const double tol = adamw ? 1e-2 : 5e-2;
+ const bool aok = almost_equal(a_fit, a_true, tol);
+ if (!aok) {
+ TEST_LOG("%s: a_fit=%f a_true=%f\n", __func__, (double)a_fit, (double)a_true);
+ }
+ const bool bok = almost_equal(b_fit, b_true, tol);
+ if (!bok) {
+ TEST_LOG("%s: b_fit=%f b_true=%f\n", __func__, (double)b_fit, (double)b_true);
+ }
+ const bool subtest_ok = aok && bok;
+ print_ok(__func__, subtest_ok, npass, ntest, "subtest=weights");
}
ggml_backend_buffer_free(buf);
return std::make_pair(npass, ntest);
}
-static std::pair<int, int> test_backend(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+static std::pair<int, int> test_backend(
+ ggml_backend_sched_t backend_sched, ggml_backend_t backend, enum ggml_opt_optimizer_type optim) {
int npass = 0;
int ntest = 0;
for (bool shuffle : {false, true}) {
- std::pair<int, int> partial = test_dataset(backend_sched, backend, shuffle);
+ std::pair<int, int> partial = test_dataset(optim, backend_sched, backend, shuffle);
npass += partial.first;
ntest += partial.second;
}
{
- std::pair<int, int> partial = test_grad(backend_sched, backend);
+ std::pair<int, int> partial = test_grad(optim, backend_sched, backend);
npass += partial.first;
ntest += partial.second;
}
continue;
}
- std::pair<int, int> partial = test_forward_backward(backend_sched, backend, high_level, shuffle);
+ std::pair<int, int> partial = test_forward_backward(optim, backend_sched, backend, high_level, shuffle);
npass += partial.first;
ntest += partial.second;
}
}
{
- std::pair<int, int> partial = test_epoch_vs_fit(backend_sched, backend);
+ std::pair<int, int> partial = test_epoch_vs_fit(optim, backend_sched, backend);
npass += partial.first;
ntest += partial.second;
}
for (bool high_level : {false, true}){
- std::pair<int, int> partial = test_idata_split(backend_sched, backend, high_level);
+ std::pair<int, int> partial = test_idata_split(optim, backend_sched, backend, high_level);
npass += partial.first;
ntest += partial.second;
}
- for (int32_t nbatch_physical : {2, 1}) {
- for (enum ggml_opt_loss_type loss_type : {GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN}) {
- std::pair<int, int> partial = test_gradient_accumulation(backend_sched, backend, nbatch_physical, loss_type);
- npass += partial.first;
- ntest += partial.second;
+ const bool adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+ if (adamw) {
+ for (int32_t nbatch_physical : { 2, 1 }) {
+ for (enum ggml_opt_loss_type loss_type : { GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN }) {
+ std::pair<int, int> partial =
+ test_gradient_accumulation(optim, backend_sched, backend, nbatch_physical, loss_type);
+ npass += partial.first;
+ ntest += partial.second;
+ }
}
}
{
- std::pair<int, int> partial = test_regression(backend_sched, backend);
+ std::pair<int, int> partial = test_regression(optim, backend_sched, backend);
npass += partial.first;
ntest += partial.second;
}
return std::make_pair(npass, ntest);
}
+
int main(void) {
+ ggml_log_set(nullptr, nullptr);
const size_t dev_count = ggml_backend_dev_count();
printf("Testing %zu devices\n\n", dev_count);
size_t n_ok = 0;
ggml_backend_t backend = ggml_backend_dev_init(devs[i], NULL);
GGML_ASSERT(backend != NULL);
-
+#ifndef _MSC_VER
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
}
-
+#endif
backends.push_back(backend);
}
- for (size_t i = 0; i < dev_count; ++i) {
- // Put the backend to be tested in front so that it's prioritized:
- std::vector<ggml_backend_t> backends_modded = {backends[i]};
- backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
-
- ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
- backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);
-
- printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
- printf(" Device description: %s\n", ggml_backend_dev_description(devs[i]));
- size_t free, total; // NOLINT
- ggml_backend_dev_memory(devs[i], &free, &total);
- printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
- printf("\n");
-
- std::pair<int, int> result = test_backend(backend_sched, backends[i]);
-
- printf(" %d/%d tests passed\n", result.first, result.second);
- printf(" Backend %s: ", ggml_backend_name(backends[i]));
- if (result.first == result.second) {
- printf("\033[1;32mOK\033[0m\n");
- n_ok++;
- } else {
- printf("\033[1;31mFAIL\033[0m\n");
+ size_t n_total = 0;
+ for (enum ggml_opt_optimizer_type optim : { GGML_OPT_OPTIMIZER_TYPE_ADAMW, GGML_OPT_OPTIMIZER_TYPE_SGD }) {
+ for (size_t i = 0; i < dev_count; ++i) {
+ // Put the backend to be tested in front so that it's prioritized:
+ std::vector<ggml_backend_t> backends_modded = { backends[i] };
+ backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
+
+ ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
+ backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);
+
+ const char * devname = ggml_backend_dev_name(devs[i]);
+ printf("Backend %zu/%zu: %s\n", i + 1, dev_count, devname);
+ printf(" Device description: %s\n", ggml_backend_dev_description(devs[i]));
+ size_t free, total; // NOLINT
+ ggml_backend_dev_memory(devs[i], &free, &total);
+ printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+ printf("\n");
+
+ if (optim == GGML_OPT_OPTIMIZER_TYPE_SGD && !strcmp(devname, "Vulkan0")) {
+ // TODO: even though the backend reports the SGD op as unsupported,
+ //       this explicit skip is still needed.
+ continue;
+ }
+ if (!strcmp(devname, "WebGPU")) {
+ // GGML_OP_SUM implementation missing.
+ continue;
+ }
+ std::pair<int, int> result = test_backend(backend_sched, backends[i], optim);
+
+ printf(" %d/%d tests passed\n", result.first, result.second);
+
+ printf(" Backend %s %s: ", ggml_backend_name(backends[i]), ggml_opt_optimizer_name(optim));
+ if (result.first == result.second) {
+ printf("\033[1;32mOK\033[0m\n");
+ n_ok++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ++n_total;
+ printf("\n");
+ ggml_backend_sched_free(backend_sched);
}
-
- printf("\n");
-
- ggml_backend_sched_free(backend_sched);
}
for (ggml_backend_t backend : backends) {
ggml_backend_free(backend);
}
- printf("%zu/%zu backends passed\n", n_ok, dev_count);
- if (n_ok != dev_count) {
- printf("\033[1;31mFAIL\033[0m\n");
- return 1;
- }
- printf("\033[1;32mOK\033[0m\n");
- return 0;
+ printf("%zu/%zu backend*optimizer passed\n", n_ok, n_total);
+ bool ok = n_ok == n_total;
+ print_ok(ok);
+ return ok ? 0 : 1;
}