GGML_OPT_BUILD_TYPE_OPT = 30,
};
+ enum ggml_opt_optimizer_type {
+ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
+ GGML_OPT_OPTIMIZER_TYPE_SGD,
+
+ GGML_OPT_OPTIMIZER_TYPE_COUNT
+ };
+
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
struct ggml_opt_optimizer_params {
- // AdamW optimizer parameters
struct {
float alpha; // learning rate
- float beta1;
- float beta2;
+ float beta1; // AdamW: exponential decay rate for the first moment
+ float beta2; // AdamW: exponential decay rate for the second moment
float eps; // epsilon for numerical stability
- float wd; // weight decay for AdamW, use 0.0f to disable
+ float wd; // weight decay, 0.0f to disable
} adamw;
+ struct {
+ float alpha; // learning rate
+ float wd; // weight decay, 0.0f to disable
+ } sgd;
};
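+
+ // illustrative sketch (the callback name is an example only): a get_opt_pars callback
+ // that starts from the defaults and overrides the SGD learning rate:
+ //
+ // struct ggml_opt_optimizer_params my_sgd_pars(void * userdata) {
+ // struct ggml_opt_optimizer_params p = ggml_opt_get_default_optimizer_params(userdata);
+ // p.sgd.alpha = 1e-2f; // example value
+ // return p;
+ // }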
// callback to calculate optimizer parameters prior to a backward pass
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
- ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
- void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+ ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+ void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+
+ // which optimizer to use; only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs the m, v momenta per parameter tensor
+ enum ggml_opt_optimizer_type optimizer;
};
// get parameters for an optimization context with defaults set where possible
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
+ GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); // TODO: consistent naming scheme
+
+ GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
+
// ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize
+ enum ggml_opt_optimizer_type optimizer, // SGD or AdamW
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
int64_t nepoch, // how many times the dataset should be iterated over
int64_t nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
bool silent); // whether or not info prints to stderr should be suppressed
+
#ifdef __cplusplus
}
#endif
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW,
+ GGML_OP_OPT_STEP_SGD,
GGML_OP_GLU,
struct ggml_tensor * grad,
struct ggml_tensor * m,
struct ggml_tensor * v,
- struct ggml_tensor * adamw_params); // parameters such a the learning rate
+ struct ggml_tensor * adamw_params); // parameters such as the learning rate
+
+ // stochastic gradient descent step (with weight decay)
+ GGML_API struct ggml_tensor * ggml_opt_step_sgd(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * grad,
+ struct ggml_tensor * sgd_params); // alpha, weight decay
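+
+ // per element this computes a = a*(1.0f - alpha*wd) - alpha*grad,
+ // i.e. weight decay is decoupled from the gradient step, as for AdamW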
//
// automatic differentiation
ggml_compute_forward_opt_step_adamw(params, tensor);
}
break;
+ case GGML_OP_OPT_STEP_SGD:
+ {
+ ggml_compute_forward_opt_step_sgd(params, tensor);
+ }
+ break;
case GGML_OP_NONE:
{
// nop
case GGML_OP_CROSS_ENTROPY_LOSS:
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
case GGML_OP_OPT_STEP_ADAMW:
+ case GGML_OP_OPT_STEP_SGD:
{
n_tasks = n_threads;
} break;
const int ir1 = MIN(ir0 + dr, nr);
const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
+
const float alpha = adamw_params_ptr[0];
const float beta1 = adamw_params_ptr[1];
const float beta2 = adamw_params_ptr[2];
const float wd = adamw_params_ptr[4];
const float beta1h = adamw_params_ptr[5];
const float beta2h = adamw_params_ptr[6];
-
+ const float keep = 1.f - alpha * wd;
for (int ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
// The weight decay is applied independently of the Adam momenta m and v.
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
// See: https://arxiv.org/pdf/1711.05101v3.pdf
- w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
+ w[i00] = w[i00] * keep - alpha * mh / vh;
}
}
}
}
}
}
+
+static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src0_grad = dst->src[1];
+ const ggml_tensor * sgd_params = dst->src[2];
+
+ GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+ GGML_ASSERT(ggml_nelements(sgd_params) == 2);
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ // rows per thread
+ const int dr = (nr + nth - 1) / nth;
+
+ // row range for this thread
+ const int ir0 = dr * ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ // params layout: [0] = alpha (learning rate), [1] = wd (weight decay)
+ const float * sgd_params_ptr = ggml_get_data_f32(sgd_params);
+ const float alpha = sgd_params_ptr[0];
+ const float keep = 1.f - alpha * sgd_params_ptr[1];
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ const int64_t i03 = ir / (ne02 * ne01);
+ const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
+ const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
+
+ const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01;
+
+ float * w = (float *) ((char *) src0->data + offset); // weight
+ const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad
+
+ for (int i00 = 0; i00 < ne00; ++i00) {
+ w[i00] = w[i00] * keep - alpha * g[i00];
+ }
+ }
+}
+
+void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_opt_step_sgd_f32(params, dst);
+ }
+ break;
+ default:
+ {
+ GGML_ABORT("fatal error - sgd is F32 only");
+ }
+ }
+}
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);

#ifdef __cplusplus
}
#endif
#include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/norm.cuh"
#include "ggml-cuda/opt-step-adamw.cuh"
+#include "ggml-cuda/opt-step-sgd.cuh"
#include "ggml-cuda/out-prod.cuh"
#include "ggml-cuda/pad.cuh"
#include "ggml-cuda/pool2d.cuh"
case GGML_OP_OPT_STEP_ADAMW:
ggml_cuda_opt_step_adamw(ctx, dst);
break;
+ case GGML_OP_OPT_STEP_SGD:
+ ggml_cuda_opt_step_sgd(ctx, dst);
+ break;
default:
return false;
}
case GGML_OP_CROSS_ENTROPY_LOSS:
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
case GGML_OP_OPT_STEP_ADAMW:
+ case GGML_OP_OPT_STEP_SGD:
return true;
default:
return false;
--- /dev/null
+#include "ggml-impl.h"
+#include "opt-step-sgd.cuh"
+
+#include <cstdint>
+
+static __global__ void opt_step_sgd_f32(
+ float * __restrict__ x, const float * __restrict__ g,
+ const float * __restrict__ pars, const int64_t k) {
+
+ const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
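+ // pars[0] = alpha (learning rate), pars[1] = wd (weight decay), matching the CPU kernel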
+ x[i] = x[i] * (1.0f - pars[0] * pars[1]) - pars[0] * g[i];
+}
+
+static void opt_step_sgd_f32_cuda(
+ float * x, const float * g, const float * __restrict__ pars, const int64_t k, cudaStream_t stream) {
+
+ const dim3 block_dims(CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
+ const dim3 block_nums((k + CUDA_OPT_STEP_SGD_BLOCK_SIZE - 1) / CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
+ opt_step_sgd_f32<<<block_nums, block_dims, 0, stream>>>(x, g, pars, k);
+}
+
+void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src0_grad = dst->src[1];
+ const ggml_tensor * params = dst->src[2];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src0_grad->type == GGML_TYPE_F32);
+ GGML_ASSERT(params->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src0_grad));
+ GGML_ASSERT(ggml_is_contiguous(params));
+ GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+ GGML_ASSERT(ggml_nelements(params) == 2);
+
+ float * src0_d = (float *) src0->data;
+ const float * src0_grad_d = (const float *) src0_grad->data;
+ const float * params_d = (const float *) params->data;
+
+ cudaStream_t stream = ctx.stream();
+
+ const int64_t ne = ggml_nelements(src0);
+
+ opt_step_sgd_f32_cuda(src0_d, src0_grad_d, params_d, ne, stream);
+}
--- /dev/null
+#include "common.cuh"
+
+#define CUDA_OPT_STEP_SGD_BLOCK_SIZE 256
+
+void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
int32_t opt_i = 0;
bool loss_per_datapoint = false;
- ggml_opt_get_optimizer_params get_opt_pars = nullptr;
- void * get_opt_pars_ud = nullptr;
- struct ggml_tensor * adamw_params = nullptr;
+ ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+ void * get_opt_pars_ud = nullptr;
+ struct ggml_tensor * opt_step_params = nullptr; // Stores output of get_opt_pars.
+
+ enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
};
struct ggml_opt_result {
result.adamw.eps = 1e-8f;
result.adamw.wd = 0.0f;
+ result.sgd.alpha = 1e-3f;
+ result.sgd.wd = 0.0f;
+
return result;
}
+
struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
return *((struct ggml_opt_optimizer_params *) userdata);
}
/*opt_period =*/ 1,
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
/*get_opt_pars_ud =*/ nullptr,
+ /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
};
}
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
+ const enum ggml_opt_optimizer_type optimizer = opt_ctx->optimizer;
+
const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
!(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
+ const bool need_momenta = opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT &&
+ optimizer == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+
ggml_set_input(opt_ctx->inputs);
ggml_set_output(opt_ctx->outputs);
// - pred (if using static graphs)
// - ncorrect (if using static graphs, 2 tensors).
constexpr size_t n_loss = 1;
- const size_t tensors_per_param = (accumulate ? 1 : 0) +
- (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+ const size_t tensors_per_param = (accumulate ? 1 : 0) + (need_momenta ? 2 : 0);
const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
struct ggml_init_params params = {
}
}
- if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
+ if (need_momenta && opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
opt_ctx->grad_m.resize(n_nodes);
opt_ctx->grad_v.resize(n_nodes);
for (int i = 0; i < n_nodes; ++i) {
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
- opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
- ggml_set_input(opt_ctx->adamw_params);
- ggml_set_name(opt_ctx->adamw_params, "adamw_params");
-
+ opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2);
+ ggml_tensor * opt_step_params = opt_ctx->opt_step_params;
+ ggml_set_input(opt_step_params);
+ const char * optimizer_name = ggml_opt_optimizer_name(optimizer);
+ ggml_format_name(opt_step_params, "%s_params", optimizer_name);
for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
- struct ggml_tensor * m = opt_ctx->grad_m[i];
- struct ggml_tensor * v = opt_ctx->grad_v[i];
- struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
-
- ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
- ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
- ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
-
+ struct ggml_tensor * m = nullptr;
+ struct ggml_tensor * v = nullptr;
+ if (need_momenta) {
+ m = opt_ctx->grad_m[i];
+ v = opt_ctx->grad_v[i];
+ ggml_format_name(m, "AdamW m for %s", node->name);
+ ggml_format_name(v, "AdamW v for %s", node->name);
+ }
+ struct ggml_tensor * opt_step;
+ switch (optimizer) {
+ case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
+ opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_step_params);
+ break;
+ case GGML_OPT_OPTIMIZER_TYPE_SGD:
+ opt_step = ggml_opt_step_sgd(opt_ctx->ctx_compute, node, grad, opt_step_params);
+ break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+ ggml_format_name(opt_step, "%s step for %s", optimizer_name, node->name);
ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
}
}
result->opt_period = params.opt_period;
result->get_opt_pars = params.get_opt_pars;
result->get_opt_pars_ud = params.get_opt_pars_ud;
+ result->optimizer = params.optimizer;
GGML_ASSERT(result->opt_period >= 1);
void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
GGML_ASSERT(opt_ctx->eval_ready);
if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
- struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
-
- GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
- GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
- GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
- GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
- GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
-
- // beta1, beta2 after applying warmup
- const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
- const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
-
- float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
- adamw_par_data[0] = opt_pars.adamw.alpha;
- adamw_par_data[1] = opt_pars.adamw.beta1;
- adamw_par_data[2] = opt_pars.adamw.beta2;
- adamw_par_data[3] = opt_pars.adamw.eps;
- adamw_par_data[4] = opt_pars.adamw.wd;
- adamw_par_data[5] = beta1h;
- adamw_par_data[6] = beta2h;
+ const ggml_opt_optimizer_params & opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
+
+ switch (opt_ctx->optimizer) {
+ case GGML_OPT_OPTIMIZER_TYPE_ADAMW: {
+ GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
+ GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
+ GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
+ GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
+ GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
+
+ // beta1, beta2 after applying warmup
+ const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
+ const float beta2h = 1.0f / (1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
+
+ float * adamw_par_data = ggml_get_data_f32(opt_ctx->opt_step_params);
+ adamw_par_data[0] = opt_pars.adamw.alpha;
+ adamw_par_data[1] = opt_pars.adamw.beta1;
+ adamw_par_data[2] = opt_pars.adamw.beta2;
+ adamw_par_data[3] = opt_pars.adamw.eps;
+ adamw_par_data[4] = opt_pars.adamw.wd;
+ adamw_par_data[5] = beta1h;
+ adamw_par_data[6] = beta2h;
+ } break;
+ case GGML_OPT_OPTIMIZER_TYPE_SGD: {
+ GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);
+ GGML_ASSERT(opt_pars.sgd.wd >= 0.0f);
+ GGML_ASSERT(opt_pars.sgd.wd <= 1.0f);
+ float * sgd_par_data = ggml_get_data_f32(opt_ctx->opt_step_params);
+ sgd_par_data[0] = opt_pars.sgd.alpha;
+ sgd_par_data[1] = opt_pars.sgd.wd;
+ } break;
+ default:
+ GGML_ABORT("fatal error");
+ }
}
ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
ggml_tensor * outputs,
ggml_opt_dataset_t dataset,
enum ggml_opt_loss_type loss_type,
+ enum ggml_opt_optimizer_type optimizer,
ggml_opt_get_optimizer_params get_opt_pars,
int64_t nepoch,
int64_t nbatch_logical,
params.opt_period = opt_period;
params.get_opt_pars = get_opt_pars;
params.get_opt_pars_ud = &epoch;
+ params.optimizer = optimizer;
ggml_opt_context_t opt_ctx = ggml_opt_init(params);
// Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
ggml_opt_result_free(result_train);
ggml_opt_result_free(result_val);
}
+
+enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t c) {
+ return c->optimizer;
+}
+
+GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type o) {
+ switch (o) {
+ case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
+ return "adamw";
+ case GGML_OPT_OPTIMIZER_TYPE_SGD:
+ return "sgd";
+ default:
+ return "undefined";
+ }
+}
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
vk_pipeline pipeline_opt_step_adamw_f32;
+ vk_pipeline pipeline_opt_step_sgd_f32;
vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
vk_pipeline pipeline_conv2d_dw_whcn_f32;
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+
// conv2d
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
uint32_t conv2d_WG_SIZE = 256;
return ctx->device->pipeline_opt_step_adamw_f32;
}
return nullptr;
+ case GGML_OP_OPT_STEP_SGD:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_opt_step_sgd_f32;
+ }
+ return nullptr;
case GGML_OP_LEAKY_RELU:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_leaky_relu_f32;
ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
+ } else if (op == GGML_OP_OPT_STEP_SGD) {
+ // OPT_STEP_SGD updates src0 in place and does not use dst
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements);
} else if (use_src2) {
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
);
}
+static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+ const size_t n = ggml_nelements(dst->src[0]);
+
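+ // push constants: KX = number of elements; alpha and wd are read from the params buffer (binding 2)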
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun);
+}
+
static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
int * op_params = (int *)dst->op_params;
case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
case GGML_OP_OPT_STEP_ADAMW:
+ case GGML_OP_OPT_STEP_SGD:
break;
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_LEAKY_RELU:
+ case GGML_OP_OPT_STEP_SGD:
{
// These operations all go through ggml_vk_op_f32, so short-circuit and
// do the only thing needed for the dryrun.
case GGML_OP_OPT_STEP_ADAMW:
ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun);
+ break;
+
+ case GGML_OP_OPT_STEP_SGD:
+ ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node, dryrun);
break;
default:
return false;
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_OPT_STEP_ADAMW:
+ case GGML_OP_OPT_STEP_SGD:
buf = tensor->buffer;
-
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) {
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
+ case GGML_OP_LEAKY_RELU:
+ case GGML_OP_OPT_STEP_ADAMW:
+ case GGML_OP_OPT_STEP_SGD:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_UPSCALE:
case GGML_OP_ACC:
case GGML_OP_POOL_2D:
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
- case GGML_OP_LEAKY_RELU:
- case GGML_OP_OPT_STEP_ADAMW:
return true;
case GGML_OP_CONV_TRANSPOSE_1D:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
src_clone[0]->flags = src0->flags;
tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
src_clone[2], src_clone[3], src_clone[4]);
+ } else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
+ src_clone[0]->flags = src0->flags;
+ tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
}
else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
--- /dev/null
+#version 450
+
+#include "generic_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) buffer X {A_TYPE data_x[];};
+layout (binding = 1) readonly buffer G {A_TYPE data_grad[];};
+layout (binding = 2) readonly buffer P {float data_params[2];};
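+// data_params[0] = alpha (learning rate), data_params[1] = wd (weight decay)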
+
+void main() {
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+ if (i >= p.KX) {
+ return;
+ }
+
+ const float alpha = data_params[0];
+ const float keep = 1.f - alpha * data_params[1];
+
+ data_x[i] = data_x[i] * keep - alpha * data_grad[i];
+}
string_to_spv("rwkv_wkv7_f32", "wkv7.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+ string_to_spv("opt_step_sgd_f32", "opt_step_sgd.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
string_to_spv("conv2d_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
string_to_spv("conv2d_f16_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
"CROSS_ENTROPY_LOSS",
"CROSS_ENTROPY_LOSS_BACK",
"OPT_STEP_ADAMW",
+ "OPT_STEP_SGD",
"GLU",
};
-static_assert(GGML_OP_COUNT == 87, "GGML_OP_COUNT != 87");
+static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"cross_entropy_loss(x,y)",
"cross_entropy_loss_back(x,y)",
"adamw(x)",
+ "sgd(x)",
"glu(x)",
};
-static_assert(GGML_OP_COUNT == 87, "GGML_OP_COUNT != 87");
+static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
-
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"ABS",
"SGN",
return result;
}
+// ggml_opt_step_sgd
+
+struct ggml_tensor * ggml_opt_step_sgd(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * grad,
+ struct ggml_tensor * params) {
+ GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
+ GGML_ASSERT(ggml_are_same_shape(a, grad));
+ GGML_ASSERT(params->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_nelements(params) == 2);
+
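+ // the result is a view of a: evaluating the op updates the weights of a in place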
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+ result->op = GGML_OP_OPT_STEP_SGD;
+ result->src[0] = a;
+ result->src[1] = grad;
+ result->src[2] = params;
+
+ return result;
+}
+
////////////////////////////////////////////////////////////////////////////////
struct ggml_hash_set ggml_hash_set_new(size_t size) {