From 5eaf9964fc797d4585c214db32a463d557f3ed33 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Fri, 26 Jan 2024 05:06:22 +0900 Subject: [PATCH] llama : dynamic temperature sampling (#4972) * implemented dynamic temperature sampling from koboldcpp * removed trailing whitespace * removed unused temp parameter in llama_sample_entropy * exposed exponent_val in dynamic temp sampler * added debug check for printf statements * use nullptr in llama_sample_softmax call during llama_sample_entropy this avoids counting the time taken stats twice Co-authored-by: Georgi Gerganov * return earlier if there is only 1 candiate (i.e. max_entropy == 0) * reformat 't' case in llama_sample_queue Co-authored-by: Jared Van Bortel * check for one or zero candidates case in llama_sample_entropy --------- Co-authored-by: Georgi Gerganov Co-authored-by: Jared Van Bortel --- common/sampling.cpp | 12 +++++++- common/sampling.h | 2 ++ llama.cpp | 67 +++++++++++++++++++++++++++++++++++++++++++++ llama.h | 8 ++++++ 4 files changed, 88 insertions(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index dd1ffeb1b..efd7eab6e 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -129,6 +129,8 @@ static void sampler_queue( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const float temp = params.temp; + const float dynatemp_range = params.dynatemp_range; + const float dynatemp_exponent = params.dynatemp_exponent; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; @@ -143,7 +145,15 @@ static void sampler_queue( case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + case 't': + if (dynatemp_range > 0) { + float dynatemp_min = std::max(0.0f, temp - dynatemp_range); + float dynatemp_max = std::max(0.0f, temp + dynatemp_range); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + } else { + llama_sample_temp(ctx_main, &cur_p, temp); + } + break; default : break; } } diff --git a/common/sampling.h b/common/sampling.h index 2ee180376..88899c094 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -18,6 +18,8 @@ typedef struct llama_sampling_params { float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/llama.cpp b/llama.cpp index 6a7506e85..823d42d7f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8151,6 +8151,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } +void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) { + const int64_t t_start_sample_us = ggml_time_us(); + + // no need to do anything if there is only one (or zero) candidates + if(candidates_p->size <= 1) { + return; + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / candidates_p->size); + + llama_sample_softmax(nullptr, candidates_p); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < candidates_p->size; ++i) { + float prob = candidates_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + +#ifdef DEBUG + LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); + LLAMA_LOG_INFO("Entropy: %f\n", entropy); + LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); + LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); + LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); + LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); +#endif + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + double max_l_double = candidates_p->data[0].logit; + double cum_sum_double = 0.0; + for (size_t i = 0; i < candidates_p->size; ++i) { + double p = exp(candidates_p->data[i].logit - max_l_double); + candidates_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + +#ifdef DEBUG + // Print the updated top 25 probabilities after temperature scaling + LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) { + LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f); + } +#endif + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); diff --git a/llama.h b/llama.h index bb6054557..7b3634aa6 100644 --- a/llama.h +++ b/llama.h @@ -775,6 +775,14 @@ extern "C" { float p, size_t min_keep); + /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. + LLAMA_API void llama_sample_entropy( + struct llama_context * ctx, + llama_token_data_array * candidates_p, + float min_temp, + float max_temp, + float exponent_val); + LLAMA_API void llama_sample_temp( struct llama_context * ctx, llama_token_data_array * candidates,