uint32_t prof_pkts;
};
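+// Format and print a one-line verbose summary of an op (names, dims, types,
+// strides, buffers) for the given session; shared by the enqueue paths below.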
+static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
+ char dims[64 * GGML_MAX_SRC];
+ char strides[64 * GGML_MAX_SRC];
+ char types[16 * GGML_MAX_SRC];
+ char buffs[64 * GGML_MAX_SRC];
+ char names[64 * GGML_MAX_SRC];
+
+ hex_format_op_dims(dims, op);
+ hex_format_op_strides(strides, op);
+ hex_format_op_types(types, op);
+ hex_format_op_buffs(buffs, op);
+ hex_format_op_names(names, op);
+
+ HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
+ names, dims, types, strides, buffs, req_flags);
+}
+
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
return true;
}
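+// Check that every given tensor is either null, not yet backed by a buffer,
+// or backed by a Hexagon buffer that belongs to this session. Null tensors are
+// accepted so that optional sources (e.g. src2) can be passed unconditionally.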
+template <typename... TTensor>
+static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, TTensor... tensors) {
+ return ([&]() -> bool {
+ return !tensors || !tensors->buffer ||
+ (ggml_backend_buffer_is_hexagon(tensors->buffer) &&
+ ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
+ }() && ...);
+}
+
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
}
// src0 & src1 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (src1->buffer &&
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
// src0 (weights) must be repacked and mapped to the same session
// src1 & src2 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (src1->buffer &&
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
- return false;
- }
- if (src2->buffer &&
- (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
return false;
}
}
// src0, src1 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (src1->buffer &&
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
}
// src0, src1 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (src1->buffer &&
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
- return false;
- }
- if (src2->buffer &&
- (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
return false;
}
}
// src0 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, dst)) {
return false;
}
}
// src0, src1 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (src1 && src1->buffer &&
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
}
// src0, src1 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (src1 && src1->buffer &&
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
}
// src0, src1, src2 & dst must be mapped to the same session
- if (src0->buffer &&
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
- return false;
- }
- if (src1->buffer &&
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
- return false;
- }
- if (src2 && src2->buffer &&
- (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
- return false;
- }
- if (dst->buffer &&
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+ if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
return false;
}
h->nb[3] = t->nb[3];
}
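+// Fill one dspqueue_buffer descriptor for the given tensor and return the number
+// of descriptors written (0 for a null tensor, 1 otherwise) so callers can
+// accumulate n_bufs. flush_host flushes CPU (sender) caches, flush_htp
+// invalidates DSP (recipient) caches.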
+static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
+ if (!t) {
+ return 0;
+ }
+
+ memset(buf, 0, sizeof(*buf));
+ auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+ buf->fd = tensor_buf->fd;
+ buf->ptr = t->data;
+ buf->offset = (uint8_t *) t->data - tensor_buf->base;
+ buf->size = ggml_nbytes(t);
+ buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
+ buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
+ return 1;
+}
+
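+// Primary DSP session is the one that owns the tensor's Hexagon buffer.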
+static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
+ return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
+}
+
static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
auto sess = buf->sess;
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op;
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
uint64_t t1, t2;
t1 = ggml_time_us();
}
dspqueue_buffer bufs[3];
- memset(bufs, 0, sizeof(bufs));
// First buffer Weights.
// The content is static, there is no need to do any cache management
- bufs[0].fd = src0_buf->fd;
- bufs[0].ptr = src0->data;
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
- bufs[0].size = ggml_nbytes(src0);
- bufs[0].flags = 0;
+ dspqueue_buffers_init(bufs, src0, false, false);
// Second buffer Input Activations. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and
// invalidate DSP ones. On platforms with I/O coherency support the
// framework will automatically skip cache operations where possible.
- bufs[1].fd = src1_buf->fd;
- bufs[1].ptr = src1->data;
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
- bufs[1].size = ggml_nbytes(src1);
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
// Third buffer Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start.
- bufs[2].fd = dst_buf->fd;
- bufs[2].ptr = dst->data;
- bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
- bufs[2].size = ggml_nbytes(dst);
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+ dspqueue_buffers_init(&bufs[2], dst, true, false);
- // Primary DSP session from the src0 (normally weight) tensor
- auto sess = src0_buf->sess;
+ auto * sess = get_session_from_tensor(src0);
if (opt_verbose) {
- char dims[64 * GGML_MAX_SRC];
- char strides[64 * GGML_MAX_SRC];
- char types[16 * GGML_MAX_SRC];
- char buffs[64 * GGML_MAX_SRC];
- char names[64 * GGML_MAX_SRC];
-
- hex_format_op_dims(dims, op);
- hex_format_op_strides(strides, op);
- hex_format_op_types(types, op);
- hex_format_op_buffs(buffs, op);
- hex_format_op_names(names, op);
-
- HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
- names, dims, types, strides, buffs, req.flags);
+ hex_print_op_info(op, sess, req.flags);
if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]);
const struct ggml_tensor * src2 = op->src[2];
const struct ggml_tensor * dst = op;
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
- auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
uint64_t t1, t2;
t1 = ggml_time_us();
}
dspqueue_buffer bufs[4];
- memset(bufs, 0, sizeof(bufs));
-
// First buffer Weights.
// The content is static, there is no need to do any cache management
- bufs[0].fd = src0_buf->fd;
- bufs[0].ptr = src0->data;
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
- bufs[0].size = ggml_nbytes(src0);
- bufs[0].flags = 0;
+ dspqueue_buffers_init(bufs, src0, false, false);
// Second buffer Input Activations. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and
// invalidate DSP ones. On platforms with I/O coherency support the
// framework will automatically skip cache operations where possible.
- bufs[1].fd = src1_buf->fd;
- bufs[1].ptr = src1->data;
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
- bufs[1].size = ggml_nbytes(src1);
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
// Third buffer expert IDs. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and
// invalidate DSP ones. On platforms with I/O coherency support the
// framework will automatically skip cache operations where possible.
- bufs[2].fd = src2_buf->fd;
- bufs[2].ptr = src2->data;
- bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
- bufs[2].size = ggml_nbytes(src2);
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
+ dspqueue_buffers_init(&bufs[2], src2, true, true);
// Fourth buffer Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start.
- bufs[3].fd = dst_buf->fd;
- bufs[3].ptr = dst->data;
- bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
- bufs[3].size = ggml_nbytes(dst);
- bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+ dspqueue_buffers_init(&bufs[3], dst, true, false);
- // Primary DSP session from the src0 (normally weight) tensor
- auto sess = src0_buf->sess;
+ auto * sess = get_session_from_tensor(src0);
if (opt_verbose) {
- char dims[64 * GGML_MAX_SRC];
- char strides[64 * GGML_MAX_SRC];
- char types[16 * GGML_MAX_SRC];
- char buffs[64 * GGML_MAX_SRC];
- char names[64 * GGML_MAX_SRC];
-
- hex_format_op_dims(dims, op);
- hex_format_op_types(types, op);
- hex_format_op_buffs(buffs, op);
- hex_format_op_names(names, op);
-
- HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
- names, dims, types, strides, buffs, req.flags);
-
+ hex_print_op_info(op, sess, req.flags);
if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]);
const struct ggml_tensor * src1 = node->src[1];
const struct ggml_tensor * dst = node;
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
uint64_t t1 = 0;
uint64_t t2 = 0;
init_htp_tensor(&req.dst, dst);
dspqueue_buffer bufs[3];
- memset(bufs, 0, sizeof(bufs));
-
// First buffer = First Operand of Binary op
// This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip
// cache operations where possible.
- bufs[0].fd = src0_buf->fd;
- bufs[0].ptr = src0->data;
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
- bufs[0].size = ggml_nbytes(src0);
- bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
+ dspqueue_buffers_init(bufs, src0, true, true);
// Second buffer = Second Operand of Binary op
// This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip
// cache operations where possible.
- bufs[1].fd = src1_buf->fd;
- bufs[1].ptr = src1->data;
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
- bufs[1].size = ggml_nbytes(src1);
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
// Third buffer = Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start.
- bufs[2].fd = dst_buf->fd;
- bufs[2].ptr = dst->data;
- bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
- bufs[2].size = ggml_nbytes(dst);
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+ dspqueue_buffers_init(&bufs[2], dst, true, false);
- // Primary DSP session from the src0 tensor
- ggml_hexagon_session * sess = src0_buf->sess;
+ auto * sess = get_session_from_tensor(src0);
if (opt_verbose) {
- char dims[64 * GGML_MAX_SRC];
- char strides[16 * GGML_MAX_SRC];
- char types[16 * GGML_MAX_SRC];
- char buffs[64 * GGML_MAX_SRC];
- char names[64 * GGML_MAX_SRC];
-
- hex_format_op_dims(dims, op);
- hex_format_op_strides(strides, op);
- hex_format_op_types(types, op);
- hex_format_op_buffs(buffs, op);
- hex_format_op_names(names, op);
-
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
- ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
+ hex_print_op_info(op, sess, req.flags);
if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]);
const struct ggml_tensor * src2 = node->src[2];
const struct ggml_tensor * dst = node;
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
- auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
uint64_t t1 = 0;
uint64_t t2 = 0;
init_htp_tensor(&req.dst, dst);
dspqueue_buffer bufs[4];
- memset(bufs, 0, sizeof(bufs));
-
// First buffer = input activations
- bufs[0].fd = src0_buf->fd;
- bufs[0].ptr = src0->data;
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
- bufs[0].size = ggml_nbytes(src0);
- bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
-
+ dspqueue_buffers_init(bufs, src0, true, true);
// Second buffer = experts bias
- bufs[1].fd = src1_buf->fd;
- bufs[1].ptr = src1->data;
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
- bufs[1].size = ggml_nbytes(src1);
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
-
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
// Third buffer = activated experts
- bufs[2].fd = src2_buf->fd;
- bufs[2].ptr = src2->data;
- bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
- bufs[2].size = ggml_nbytes(src2);
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
-
+ dspqueue_buffers_init(&bufs[2], src2, true, true);
// Fourth buffer = output activations
- bufs[3].fd = dst_buf->fd;
- bufs[3].ptr = dst->data;
- bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
- bufs[3].size = ggml_nbytes(dst);
- bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+ dspqueue_buffers_init(&bufs[3], dst, true, false);
- // Primary DSP session from the src0 tensor
- ggml_hexagon_session * sess = src0_buf->sess;
+ auto * sess = get_session_from_tensor(src0);
if (opt_verbose) {
- char dims[64 * GGML_MAX_SRC];
- char strides[16 * GGML_MAX_SRC];
- char types[16 * GGML_MAX_SRC];
- char buffs[64 * GGML_MAX_SRC];
- char names[64 * GGML_MAX_SRC];
-
- hex_format_op_dims(dims, op);
- hex_format_op_strides(strides, op);
- hex_format_op_types(types, op);
- hex_format_op_buffs(buffs, op);
- hex_format_op_names(names, op);
-
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
- ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
-
+ hex_print_op_info(op, sess, req.flags);
if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]);
}
dspqueue_buffer bufs[3];
- int n_bufs = 0;
-
- memset(bufs, 0, sizeof(bufs));
// First buffer = Only Operand of Unary op
// This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip
// cache operations where possible.
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
- bufs[n_bufs].fd = src0_buf->fd;
- bufs[n_bufs].ptr = src0->data;
- bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
- bufs[n_bufs].size = ggml_nbytes(src0);
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
- ++n_bufs;
+ size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
- if (src1) {
- // Second buffer = Second Operand of Binary op
- // This is a buffer that the CPU writes and the DSP reads, so we'll
- // need to flush CPU caches and invalidate DSP ones. On platforms
- // with I/O coherency support the framework will automatically skip
- // cache operations where possible.
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
- bufs[n_bufs].fd = src1_buf->fd;
- bufs[n_bufs].ptr = src1->data;
- bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
- bufs[n_bufs].size = ggml_nbytes(src1);
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
- ++n_bufs;
- }
+ // Second buffer (nullable) = Second Operand of Binary op
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
+ // need to flush CPU caches and invalidate DSP ones. On platforms
+ // with I/O coherency support the framework will automatically skip
+ // cache operations where possible.
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
// Second or third buffer = Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start.
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
- bufs[n_bufs].fd = dst_buf->fd;
- bufs[n_bufs].ptr = dst->data;
- bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
- bufs[n_bufs].size = ggml_nbytes(dst);
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
- ++n_bufs;
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
// Primary DSP session from the src0 tensor
- ggml_hexagon_session * sess = src0_buf->sess;
+ auto * sess = get_session_from_tensor(src0);
if (opt_verbose) {
- char dims[64 * GGML_MAX_SRC];
- char strides[64 * GGML_MAX_SRC];
- char types[16 * GGML_MAX_SRC];
- char buffs[64 * GGML_MAX_SRC];
- char names[64 * GGML_MAX_SRC];
-
- hex_format_op_dims(dims, op);
- hex_format_op_strides(strides, op);
- hex_format_op_types(types, op);
- hex_format_op_buffs(buffs, op);
- hex_format_op_names(names, op);
-
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
- names, dims, types, strides, buffs, req.flags);
+ hex_print_op_info(op, sess, req.flags);
if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]);
if (src1) {
}
dspqueue_buffer bufs[4];
- int n_bufs = 0;
-
- memset(bufs, 0, sizeof(bufs));
// First buffer
// This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip
// cache operations where possible.
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
- bufs[n_bufs].fd = src0_buf->fd;
- bufs[n_bufs].ptr = src0->data;
- bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
- bufs[n_bufs].size = ggml_nbytes(src0);
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
- ++n_bufs;
+ size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
// Second buffer
// This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip
// cache operations where possible.
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
- bufs[n_bufs].fd = src1_buf->fd;
- bufs[n_bufs].ptr = src1->data;
- bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
- bufs[n_bufs].size = ggml_nbytes(src1);
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
- ++n_bufs;
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
- if (src2) {
- // Third buffer
- // This is a buffer that the CPU writes and the DSP reads, so we'll
- // need to flush CPU caches and invalidate DSP ones. On platforms
- // with I/O coherency support the framework will automatically skip
- // cache operations where possible.
- auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
- bufs[n_bufs].fd = src2_buf->fd;
- bufs[n_bufs].ptr = src2->data;
- bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
- bufs[n_bufs].size = ggml_nbytes(src2);
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
- ++n_bufs;
- }
+ // Third buffer (nullable)
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
+ // need to flush CPU caches and invalidate DSP ones. On platforms
+ // with I/O coherency support the framework will automatically skip
+ // cache operations where possible.
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
// Final buffer = Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start.
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
- bufs[n_bufs].fd = dst_buf->fd;
- bufs[n_bufs].ptr = dst->data;
- bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
- bufs[n_bufs].size = ggml_nbytes(dst);
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
- ++n_bufs;
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
// Primary DSP session from the src0 tensor
- ggml_hexagon_session * sess = src0_buf->sess;
+ auto * sess = get_session_from_tensor(src0);
if (opt_verbose) {
- char dims[64 * GGML_MAX_SRC];
- char strides[64 * GGML_MAX_SRC];
- char types[16 * GGML_MAX_SRC];
- char buffs[64 * GGML_MAX_SRC];
- char names[64 * GGML_MAX_SRC];
-
- hex_format_op_dims(dims, op);
- hex_format_op_strides(strides, op);
- hex_format_op_types(types, op);
- hex_format_op_buffs(buffs, op);
- hex_format_op_names(names, op);
-
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
- names, dims, types, strides, buffs, req.flags);
+ hex_print_op_info(op, sess, req.flags);
if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]);
if (src1) {