* Extract common debugging functions; plug eval-callback and mtmd's MTMD_DEBUG_GRAPH with same functionality
* Move to common
* Remove unneeded header
* Unlink from common
* chore: update webui build output
* Cleanup; properly pass params to mtmd without depending on common; factorize debug.cpp to use common debug code.
* Revert change to webapp
* Post-merge adjust
* Apply suggestions from code review
Co-authored-by: Xuan-Son Nguyen <redacted>
* Apply code review changes
* Remove changes to server-context
* Remove mtmd.h include
* Remove utility functions from header
* Apply suggestions from code review
Co-authored-by: Xuan-Son Nguyen <redacted>
* Rename functions
* Update tools/mtmd/clip.cpp
Co-authored-by: Xuan-Son Nguyen <redacted>
* Update tools/mtmd/clip.cpp
Co-authored-by: Xuan-Son Nguyen <redacted>
* Update tools/mtmd/clip.cpp
Co-authored-by: Xuan-Son Nguyen <redacted>
---------
Co-authored-by: Xuan-Son Nguyen <redacted>
common.h
console.cpp
console.h
+ debug.cpp
+ debug.h
download.cpp
download.h
http.h
--- /dev/null
+#include "debug.h"
+
+#include "log.h"
+
+#include <cmath>
+#include <string>
+
+// Renders every entry of t->ne (GGML_MAX_DIMS of them) as decimal text, joined with ", ".
+static std::string common_ggml_ne_string(const ggml_tensor * t) {
+    std::string dims;
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        // separator goes in front of every entry except the first
+        if (d > 0) {
+            dims += ", ";
+        }
+        dims += std::to_string(t->ne[d]);
+    }
+    return dims;
+}
+
+// Reads the element at index (i0, i1, i2, i3) from a raw tensor buffer and converts it to float.
+//   data - first byte of the tensor's storage
+//   type - element type; F16, F32, I64, I32, I16, I8 and BF16 are decoded, any other type aborts
+//   nb   - byte stride of each of the four dimensions
+static float common_ggml_get_float_value(const uint8_t * data,
+                                         ggml_type       type,
+                                         const size_t *  nb,
+                                         size_t          i0,
+                                         size_t          i1,
+                                         size_t          i2,
+                                         size_t          i3) {
+    const size_t    offset = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+    const uint8_t * elem   = &data[offset];
+
+    switch (type) {
+        case GGML_TYPE_F16:
+            return ggml_fp16_to_fp32(*(const ggml_fp16_t *) elem);
+        case GGML_TYPE_F32:
+            return *(const float *) elem;
+        case GGML_TYPE_I64:
+            return (float) *(const int64_t *) elem;
+        case GGML_TYPE_I32:
+            return (float) *(const int32_t *) elem;
+        case GGML_TYPE_I16:
+            return (float) *(const int16_t *) elem;
+        case GGML_TYPE_I8:
+            return (float) *(const int8_t *) elem;
+        case GGML_TYPE_BF16:
+            return ggml_bf16_to_fp32(*(const ggml_bf16_t *) elem);
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+template <bool abort>
+void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+ GGML_ASSERT(n > 0);
+ float sum = 0;
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+ const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+ sum += v;
+ }
+ }
+ }
+ }
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+ LOG_ERR(" [\n");
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+ if (i2 == n && ne[2] > 2 * n) {
+ LOG_ERR(" ..., \n");
+ i2 = ne[2] - n;
+ }
+ LOG_ERR(" [\n");
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+ if (i1 == n && ne[1] > 2 * n) {
+ LOG_ERR(" ..., \n");
+ i1 = ne[1] - n;
+ }
+ LOG_ERR(" [");
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+ if (i0 == n && ne[0] > 2 * n) {
+ LOG_ERR("..., ");
+ i0 = ne[0] - n;
+ }
+ const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+ LOG_ERR("%12.4f", v);
+ if (i0 < ne[0] - 1) {
+ LOG_ERR(", ");
+ }
+ }
+ LOG_ERR("],\n");
+ }
+ LOG_ERR(" ],\n");
+ }
+ LOG_ERR(" ]\n");
+ LOG_ERR(" sum = %f\n", sum);
+ }
+
+ if constexpr (abort) {
+ if (std::isnan(sum)) {
+ LOG_ERR("encountered NaN - aborting\n");
+ exit(0);
+ }
+ }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * Logs one header line per selected tensor (name, type, op, source shapes, result shape) and, for
+ * non-quantized tensors, its values through common_debug_print_tensor. A tensor is selected when the
+ * filter list in the callback data is empty or when any filter matches its name.
+ *
+ * Tensor bytes are copied into cb_data->data only when the tensor lives in a non-host buffer and is
+ * actually going to be printed; tensors that are filtered out or quantized are not copied.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ * see ggml_backend_sched_eval_callback
+ * @param user_data pointer to a base_callback_data instance
+ * @return true to receive data or continue the graph, false otherwise
+ */
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    if (ask) {
+        return true;  // Always retrieve data
+    }
+
+    auto * cb_data = (base_callback_data *) user_data;
+
+    // an empty filter list selects every tensor
+    bool matches_filter = cb_data->tensor_filters.empty();
+    for (const auto & filter : cb_data->tensor_filters) {
+        if (std::regex_search(t->name, filter)) {
+            matches_filter = true;
+            break;
+        }
+    }
+
+    if (!matches_filter) {
+        return true;  // nothing to print: skip the formatting and the device-to-host copy
+    }
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    // either source may be absent, so neither is dereferenced without a check
+    const std::string src0_ne = src0 ? common_ggml_ne_string(src0) : std::string();
+
+    char src1_str[128] = { 0 };
+    if (src1) {
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
+    }
+
+    LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
+            ggml_op_desc(t), src0 ? src0->name : "", src0_ne.c_str(), src1 ? src1_str : "",
+            common_ggml_ne_string(t).c_str());
+
+    if (ggml_is_quantized(t->type)) {
+        return true;  // values of quantized tensors are not printed
+    }
+
+    uint8_t * data = (uint8_t *) t->data;
+    if (!ggml_backend_buffer_is_host(t->buffer)) {
+        // the tensor is not directly addressable from the host: copy it into the scratch buffer first
+        const size_t n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+        data = cb_data->data.data();
+    }
+
+    common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
+
+    return true;
+}
+
+// Explicit template instantiations: both templates are defined in this file only, so each abort_on_nan variant is instantiated here for use from other translation units
+template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
+template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
+template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
+template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
--- /dev/null
+#pragma once
+#include "common.h"
+#include <string>
+#include <vector>
+#include <regex>
+
+// common debug functions and structs
+
+// Print a tensor's values (eliding the middle of long dimensions) and the sum of all its elements; when abort_on_nan is true the process exits if that sum is NaN
+// data - the tensor's data as raw bytes
+// type - the tensor's element type (must be F16, F32, BF16, I8, I16, I32 or I64)
+// ne - the tensor dimensions array (number of elements per dimension)
+// nb - the tensor strides array (in bytes)
+// n - the number of leading and trailing entries to print per dimension before the middle is elided
+template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
+
+// Intended to be used as a callback of type ggml_backend_sched_eval_callback
+// prints tensors that are processed in the computation graph
+// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
+// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
+// The template parameter determines whether the process should exit whenever the sum of a printed
+// tensor is NaN (useful for stopping debug sessions on the first erroneous tensor)
+// The callback data will be passed as the third parameter (user_data)
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+// State handed to common_debug_cb_eval through its user_data pointer.
+struct base_callback_data {
+    std::vector<uint8_t>    data;            // scratch space for tensor bytes copied out of non-host buffers
+    std::vector<std::regex> tensor_filters;  // name filters; an empty list selects every tensor
+
+    base_callback_data() = default;
+
+    // Compiles each entry of filter_patterns, anchored to the start of the tensor name with '^',
+    // then registers common_debug_cb_eval<false> in params with this object as its user data.
+    // params keeps the address of this object, so it has to stay alive (and must not be relocated)
+    // for as long as the callback may run.
+    // Throws std::runtime_error when a pattern is not a valid regular expression; params is left
+    // untouched in that case.
+    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
+        for (const std::string & expr : filter_patterns) {
+            try {
+                tensor_filters.emplace_back("^" + expr, std::regex::optimize);
+            } catch (const std::regex_error & err) {
+                throw std::runtime_error("Invalid regex pattern '" + expr + "': " + err.what());
+            }
+        }
+        params.cb_eval           = common_debug_cb_eval<false>;
+        params.cb_eval_user_data = this;
+    }
+};
-{
+{
"version": 4,
"configurePresets": [
{
+#include "debug.h"
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
-#include "ggml.h"
-#include <cmath>
-#include <cstdint>
#include <cstdlib>
#include <string>
#include <vector>
#include <fstream>
#include <regex>
-static void print_usage(int, char ** argv) {
+static void print_usage(int /*argc*/, char ** argv) {
const std::string usage_template = R"(
example usage:
LOG("%s\n", usage.c_str());
}
-static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data);
-
-struct callback_data {
- std::vector<uint8_t> data;
- std::vector<std::regex> tensor_filters;
-
- callback_data() = default;
-
- callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
- for (const auto & pattern : filter_patterns) {
- try {
- std::string anchored_pattern = "^" + pattern;
- tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
- } catch (const std::regex_error & e) {
- throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
- }
- }
- params.cb_eval = ggml_debug;
- params.cb_eval_user_data = this;
- }
-};
-
static bool has_pooling(llama_context * ctx) {
switch (llama_pooling_type(ctx)) {
case LLAMA_POOLING_TYPE_NONE:
}
};
-static std::string ggml_ne_string(const ggml_tensor * t) {
- std::string str;
- for (int i = 0; i < GGML_MAX_DIMS; ++i) {
- str += std::to_string(t->ne[i]);
- if (i + 1 < GGML_MAX_DIMS) {
- str += ", ";
- }
- }
- return str;
-}
-
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
- union {
- float f;
- uint32_t i;
- } u;
- u.i = (uint32_t)h.bits << 16;
- return u.f;
-}
-
-static float ggml_get_float_value(const uint8_t * data, ggml_type type,
- const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
- size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
- switch (type) {
- case GGML_TYPE_F16:
- return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
- case GGML_TYPE_F32:
- return *(const float *) &data[i];
- case GGML_TYPE_I64:
- return (float) *(const int64_t *) &data[i];
- case GGML_TYPE_I32:
- return (float) *(const int32_t *) &data[i];
- case GGML_TYPE_I16:
- return (float) *(const int16_t *) &data[i];
- case GGML_TYPE_I8:
- return (float) *(const int8_t *) &data[i];
- case GGML_TYPE_BF16:
- return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
- default:
- GGML_ABORT("fatal error");
- }
-}
-
-static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
- GGML_ASSERT(n > 0);
- float sum = 0;
- float sum_sq = 0.0;
- for (int64_t i3 = 0; i3 < ne[3]; i3++) {
- for (int64_t i2 = 0; i2 < ne[2]; i2++) {
- for (int64_t i1 = 0; i1 < ne[1]; i1++) {
- for (int64_t i0 = 0; i0 < ne[0]; i0++) {
- const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
- sum += v;
- sum_sq += v * v;
- }
- }
- }
- }
- for (int64_t i3 = 0; i3 < ne[3]; i3++) {
- LOG_DBG(" [\n");
- for (int64_t i2 = 0; i2 < ne[2]; i2++) {
- if (i2 == n && ne[2] > 2*n) {
- LOG_DBG(" ..., \n");
- i2 = ne[2] - n;
- }
- LOG_DBG(" [\n");
- for (int64_t i1 = 0; i1 < ne[1]; i1++) {
- if (i1 == n && ne[1] > 2*n) {
- LOG_DBG(" ..., \n");
- i1 = ne[1] - n;
- }
- LOG_DBG(" [");
- for (int64_t i0 = 0; i0 < ne[0]; i0++) {
- if (i0 == n && ne[0] > 2*n) {
- LOG_DBG("..., ");
- i0 = ne[0] - n;
- }
- const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
- LOG_DBG("%12.4f", v);
- if (i0 < ne[0] - 1) {
- LOG_DBG(", ");
- }
- }
- LOG_DBG("],\n");
- }
- LOG_DBG(" ],\n");
- }
- LOG_DBG(" ]\n");
- LOG_DBG(" sum = %f\n", sum);
- LOG_DBG(" sum_sq = %f\n", sum_sq);
- }
-
- if (std::isnan(sum)) {
- LOG_ERR("encountered NaN - aborting\n");
- exit(0);
- }
-}
-
-/**
- * GGML operations callback during the graph execution.
- *
- * @param t current tensor
- * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
- * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
- * see ggml_backend_sched_eval_callback
- * @param user_data user data to pass at each call back
- * @return true to receive data or continue the graph, false otherwise
- */
-static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
- auto * cb_data = (callback_data *) user_data;
-
- const struct ggml_tensor * src0 = t->src[0];
- const struct ggml_tensor * src1 = t->src[1];
-
- if (ask) {
- return true; // Always retrieve data
- }
-
- bool matches_filter = cb_data->tensor_filters.empty();
-
- if (!matches_filter) {
- for (const auto & filter : cb_data->tensor_filters) {
- if (std::regex_search(t->name, filter)) {
- matches_filter = true;
- break;
- }
- }
- }
-
- char src1_str[128] = {0};
- if (src1) {
- snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
- }
-
- if (matches_filter) {
- LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
- t->name,
- ggml_type_name(t->type),
- ggml_op_desc(t),
- src0->name,
- ggml_ne_string(src0).c_str(),
- src1 ? src1_str : "",
- ggml_ne_string(t).c_str());
- }
-
- const bool is_host = ggml_backend_buffer_is_host(t->buffer);
-
- if (!is_host) {
- auto n_bytes = ggml_nbytes(t);
- cb_data->data.resize(n_bytes);
- ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
- }
-
- if (!ggml_is_quantized(t->type) && matches_filter) {
- uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
- ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
- }
-
- return true;
-}
-
-
static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
std::filesystem::create_directory(output_dir);
auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
llama_backend_init();
llama_numa_init(params.numa);
- callback_data cb_data(params, params.tensor_filter);
+ base_callback_data cb_data(params, params.tensor_filter);
auto llama_init = common_init_from_params(params);
#include "arg.h"
#include "common.h"
+#include "debug.h"
#include "log.h"
#include "llama.h"
-#include "ggml.h"
-
-#include <cmath>
-#include <cstdio>
+#include "llama-cpp.h"
#include <string>
#include <vector>
-/**
- * This the arbitrary data which will be passed to each callback.
- * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
- */
-struct callback_data {
- std::vector<uint8_t> data;
-};
-
-static std::string ggml_ne_string(const ggml_tensor * t) {
- std::string str;
- for (int i = 0; i < GGML_MAX_DIMS; ++i) {
- str += std::to_string(t->ne[i]);
- if (i + 1 < GGML_MAX_DIMS) {
- str += ", ";
- }
- }
- return str;
-}
-
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
- union {
- float f;
- uint32_t i;
- } u;
- u.i = (uint32_t)h.bits << 16;
- return u.f;
-}
-
-static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
- size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
- float v;
- if (type == GGML_TYPE_F16) {
- v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
- } else if (type == GGML_TYPE_F32) {
- v = *(const float *) &data[i];
- } else if (type == GGML_TYPE_I64) {
- v = (float) *(const int64_t *) &data[i];
- } else if (type == GGML_TYPE_I32) {
- v = (float) *(const int32_t *) &data[i];
- } else if (type == GGML_TYPE_I16) {
- v = (float) *(const int16_t *) &data[i];
- } else if (type == GGML_TYPE_I8) {
- v = (float) *(const int8_t *) &data[i];
- } else if (type == GGML_TYPE_BF16) {
- v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
- } else {
- GGML_ABORT("fatal error");
- }
- return v;
-}
-
-static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
- GGML_ASSERT(n > 0);
- float sum = 0;
- for (int64_t i3 = 0; i3 < ne[3]; i3++) {
- for (int64_t i2 = 0; i2 < ne[2]; i2++) {
- for (int64_t i1 = 0; i1 < ne[1]; i1++) {
- for (int64_t i0 = 0; i0 < ne[0]; i0++) {
- const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
- sum += v;
- }
- }
- }
- }
- for (int64_t i3 = 0; i3 < ne[3]; i3++) {
- LOG(" [\n");
- for (int64_t i2 = 0; i2 < ne[2]; i2++) {
- if (i2 == n && ne[2] > 2*n) {
- LOG(" ..., \n");
- i2 = ne[2] - n;
- }
- LOG(" [\n");
- for (int64_t i1 = 0; i1 < ne[1]; i1++) {
- if (i1 == n && ne[1] > 2*n) {
- LOG(" ..., \n");
- i1 = ne[1] - n;
- }
- LOG(" [");
- for (int64_t i0 = 0; i0 < ne[0]; i0++) {
- if (i0 == n && ne[0] > 2*n) {
- LOG("..., ");
- i0 = ne[0] - n;
- }
- const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
- LOG("%12.4f", v);
- if (i0 < ne[0] - 1) LOG(", ");
- }
- LOG("],\n");
- }
- LOG(" ],\n");
- }
- LOG(" ]\n");
- LOG(" sum = %f\n", sum);
- }
-
- // TODO: make this abort configurable/optional?
- if (std::isnan(sum)) {
- LOG_ERR("encountered NaN - aborting\n");
- exit(0);
- }
-}
-
-/**
- * GGML operations callback during the graph execution.
- *
- * @param t current tensor
- * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
- * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
- * see ggml_backend_sched_eval_callback
- * @param user_data user data to pass at each call back
- * @return true to receive data or continue the graph, false otherwise
- */
-static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
- auto * cb_data = (callback_data *) user_data;
-
- const struct ggml_tensor * src0 = t->src[0];
- const struct ggml_tensor * src1 = t->src[1];
-
- if (ask) {
- return true; // Always retrieve data
- }
-
- char src1_str[128] = {0};
- if (src1) {
- snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
- }
-
- LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
- t->name, ggml_type_name(t->type), ggml_op_desc(t),
- src0->name, ggml_ne_string(src0).c_str(),
- src1 ? src1_str : "",
- ggml_ne_string(t).c_str());
-
-
- // copy the data from the GPU memory if needed
- const bool is_host = ggml_backend_buffer_is_host(t->buffer);
-
- if (!is_host) {
- auto n_bytes = ggml_nbytes(t);
- cb_data->data.resize(n_bytes);
- ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
- }
-
- if (!ggml_is_quantized(t->type)) {
- uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
- ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
- }
-
- return true;
-}
-
static bool run(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
}
int main(int argc, char ** argv) {
- callback_data cb_data;
+ base_callback_data cb_data;
common_params params;
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
- params.cb_eval = ggml_debug;
+ params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
const float kq_scale;
const clip_flash_attn_type flash_attn_type;
- // for debugging
- const bool debug_graph;
- std::vector<ggml_tensor *> & debug_print_tensors;
-
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;
ggml_backend_t backend_cpu = nullptr;
ggml_backend_buffer_ptr buf;
+
int max_nodes = 8192;
ggml_backend_sched_ptr sched;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
bool is_allocated = false;
- // for debugging
- bool debug_graph = false;
- std::vector<ggml_tensor *> debug_print_tensors;
-
clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type;
- debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!backend_cpu) {
throw std::runtime_error("failed to initialize CPU backend");
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
);
+
+ if (ctx_params.cb_eval != nullptr) {
+ ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
+ }
}
~clip_ctx() {
n_mmproj_embd(clip_n_mmproj_embd(ctx)),
eps(hparams.eps),
kq_scale(1.0f / sqrtf((float)d_head)),
- flash_attn_type(ctx->flash_attn_type),
- debug_graph(ctx->debug_graph),
- debug_print_tensors(ctx->debug_print_tensors) {
+ flash_attn_type(ctx->flash_attn_type) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
}
-void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
- if (debug_graph) {
- ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
- std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
- ggml_set_name(cur, cur_name.c_str());
- ggml_set_output(cur);
- ggml_build_forward_expand(gf, cur);
- debug_print_tensors.push_back(cur);
+void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
+ if (il >= 0) {
+ ggml_format_name(cur, "%s-%d", name, il);
+ } else {
+ ggml_set_name(cur, name);
}
}
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
- model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
- model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
+ model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
+ model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
- model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
- model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
+ model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
+ model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
} break;
case PROJECTOR_TYPE_LLAMA4:
{
}
// build the inference graph
- ctx->debug_print_tensors.clear();
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
return false;
}
- // print debug nodes
- if (ctx->debug_graph) {
- LOG_INF("\n\n---\n\n");
- LOG_INF("\n\nDebug graph:\n\n");
- for (ggml_tensor * t : ctx->debug_print_tensors) {
- std::vector<uint8_t> data(ggml_nbytes(t));
- ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
- print_tensor_shape(t);
- print_tensor_data(t, data.data(), 3);
- }
- }
-
// the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
//
// API for debugging
//
-
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img;
img.nx = w;
for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value);
}
- bool cur_debug_graph = ctx->debug_graph;
- ctx->debug_graph = true;
clip_image_encode(ctx, 1, &img, nullptr);
- ctx->debug_graph = cur_debug_graph;
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}
#pragma once
#include "ggml.h"
+#include "mtmd.h"
#include <stddef.h>
#include <stdint.h>
int image_min_tokens;
int image_max_tokens;
bool warmup;
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
};
struct clip_init_result {
#include "arg.h"
+#include "debug.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
int n_threads = 1;
llama_pos n_past = 0;
+ base_callback_data cb_data;
+
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
model = llama_init->model();
lctx = llama_init->context();
mparams.warmup = params.warmup;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
+ if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
+ mparams.cb_eval_user_data = &cb_data;
+ mparams.cb_eval = common_debug_cb_eval<false>;
+ }
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path);
/* warmup */ true,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
+ /* cb_eval */ nullptr,
+ /* cb_eval_user_data */ nullptr,
};
return params;
}
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup,
+ /* cb_eval */ ctx_params.cb_eval,
+ /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
};
auto res = clip_init(mmproj_fname, ctx_clip_params);
// limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
+
+ // callback function passed over to mtmd proper
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
};
MTMD_API const char * mtmd_default_marker(void);
ptr.reset(mtmd_bitmap_init(nx, ny, data));
}
~bitmap() = default;
- uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
- uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
- const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
- size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
- std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
- void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
+ uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
+ uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
+ const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
+ size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
+ std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
+ void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
};
struct bitmaps {
input_chunks() = default;
input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
~input_chunks() = default;
- size_t size() { return mtmd_input_chunks_size(ptr.get()); }
- const mtmd_input_chunk * operator[](size_t idx) {
+ size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
+ const mtmd_input_chunk * operator[](size_t idx) const {
return mtmd_input_chunks_get(ptr.get(), idx);
}
};