const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp;
+ const float dynatemp_range = params.dynatemp_range;
+ const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
- case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
+ case 't':
+ if (dynatemp_range > 0) {
+ float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
+ float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
+ llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+ } else {
+ llama_sample_temp(ctx_main, &cur_p, temp);
+ }
+ break;
default : break;
}
}
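For reference, the branch above only turns the scalar temperature into a window around temp, clamped at zero, and falls back to the plain llama_sample_temp path when dynatemp_range is not positive. A small standalone sketch of that bound computation (the helper name dynatemp_bounds is illustrative, not part of the patch):

#include <algorithm>
#include <cstdio>

// Illustrative helper (not in the patch): reproduces the [min, max] window
// that the dispatch above hands to llama_sample_entropy.
static void dynatemp_bounds(float temp, float range, float & min_t, float & max_t) {
    min_t = std::max(0.0f, temp - range);
    max_t = std::max(0.0f, temp + range);
}

int main() {
    float lo, hi;
    dynatemp_bounds(0.8f, 0.5f, lo, hi);
    std::printf("temp=0.8, range=0.5 -> [%.2f, %.2f]\n", lo, hi); // [0.30, 1.30]
    dynatemp_bounds(0.3f, 0.5f, lo, hi);
    std::printf("temp=0.3, range=0.5 -> [%.2f, %.2f]\n", lo, hi); // [0.00, 0.80]
    return 0;
}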
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+ float dynatemp_range = 0.00f; // 0.0 = disabled
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
}
}
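The sampler below maps the normalized Shannon entropy of the candidate distribution to a temperature: dyn_temp = min_temp + (max_temp - min_temp) * (entropy / max_entropy)^exponent, with max_entropy = log(number of candidates). A self-contained sketch of that mapping, using an illustrative window of [0.3, 1.3] (note that the default dynatemp_range above is 0.0, i.e. disabled):

#include <cmath>
#include <cstdio>

// Illustrative only: evaluates the entropy -> temperature mapping used by
// llama_sample_entropy for a few normalized entropy values.
int main() {
    const float min_temp = 0.3f, max_temp = 1.3f, exponent = 1.0f;
    const float norm_entropy[] = { 0.0f, 0.25f, 0.5f, 1.0f };
    for (float h : norm_entropy) {
        const float dyn_temp = min_temp + (max_temp - min_temp) * std::pow(h, exponent);
        std::printf("normalized entropy %.2f -> dyn_temp %.2f\n", h, dyn_temp);
    }
    return 0;
}

With exponent 1.0 the mapping is linear: a near-deterministic distribution (entropy close to 0) is sampled near min_temp and a flat distribution at max_temp, while exponents above 1.0 keep the temperature low until the model is genuinely uncertain.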
+void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // no need to do anything if there is only one (or zero) candidates
+ if (candidates_p->size <= 1) {
+ return;
+ }
+
+ // Calculate maximum possible entropy
+ float max_entropy = -logf(1.0f / candidates_p->size);
+
+ llama_sample_softmax(nullptr, candidates_p);
+
+ // Calculate entropy of the softmax probabilities
+ float entropy = 0.0f;
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ float prob = candidates_p->data[i].p;
+ if (prob > 0.0f) { // Ensure no log(0)
+ entropy -= prob * logf(prob);
+ }
+ }
+
+ // Normalize the entropy (max_entropy cannot be 0 here because we return early when candidates_p->size <= 1)
+ float normalized_entropy = entropy / max_entropy;
+
+ // Map the normalized entropy to the desired temperature range using the power function
+ float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+#ifdef DEBUG
+ LLAMA_LOG_INFO("Maximum temperature (max_temp): %f\n", max_temp);
+ LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+ LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+ LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+ LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+ LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+#endif
+
+ // Apply the dynamically calculated temperature scaling
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ candidates_p->data[i].logit /= dyn_temp;
+ }
+
+ // Re-compute softmax probabilities after scaling logits with dynamic temperature
+ // (the candidates were sorted by llama_sample_softmax above, so data[0] holds the maximum logit)
+ double max_l_double = candidates_p->data[0].logit;
+ double cum_sum_double = 0.0;
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ double p = exp(candidates_p->data[i].logit - max_l_double);
+ candidates_p->data[i].p = p; // Store the scaled probability
+ cum_sum_double += p;
+ }
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+ }
+
+#ifdef DEBUG
+ // Print the updated top 25 probabilities after temperature scaling
+ LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+ for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
+ LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
+ }
+#endif
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+}
+
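A hedged usage sketch, outside the patch itself: it builds a small candidate list by hand and calls llama_sample_entropy with ctx = nullptr (the context is only used for timing), assuming the llama_token_data_array layout from llama.h (data, size, sorted) and linking against the library:

#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    // Hand-built candidates; the logits are arbitrary illustrative values.
    std::vector<llama_token_data> cur;
    const float logits[] = { 2.0f, 1.0f, 0.5f, -1.0f };
    for (llama_token i = 0; i < 4; ++i) {
        cur.push_back(llama_token_data{ i, logits[i], 0.0f });
    }
    llama_token_data_array cur_p = { cur.data(), cur.size(), false };

    // Temperature window [0.3, 1.3], linear entropy -> temperature mapping.
    llama_sample_entropy(nullptr, &cur_p, 0.3f, 1.3f, 1.0f);

    for (size_t i = 0; i < cur_p.size; ++i) {
        std::printf("token %d: p = %.4f\n", cur_p.data[i].id, cur_p.data[i].p);
    }
    return 0;
}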
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
const int64_t t_start_sample_us = ggml_time_us();
float p,
size_t min_keep);
+ /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+ LLAMA_API void llama_sample_entropy(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates_p,
+ float min_temp,
+ float max_temp,
+ float exponent_val);
+
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,