mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-24 13:28:50 +01:00
More efficient Hellaswag implementation (#2677)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
parent
1f0bccb279
commit
5e9ff54a67
@ -5,6 +5,7 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
@ -209,50 +210,97 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
double acc = 0.0f;
|
double acc = 0.0f;
|
||||||
const int n_vocab = llama_n_vocab(ctx);
|
const int n_vocab = llama_n_vocab(ctx);
|
||||||
|
|
||||||
|
std::vector<float> tok_logits(n_vocab);
|
||||||
|
|
||||||
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
||||||
|
|
||||||
// Tokenize the context to count tokens
|
// Tokenize the context to count tokens
|
||||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
|
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
|
||||||
size_t context_size = context_embd.size();
|
size_t context_size = context_embd.size();
|
||||||
|
|
||||||
for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
|
// Do the 1st ending
|
||||||
|
// In this case we include the context when evaluating
|
||||||
|
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
|
||||||
|
auto query_size = query_embd.size();
|
||||||
|
//printf("First query: %d\n",(int)query_size);
|
||||||
|
|
||||||
|
// Stop if query wont fit the ctx window
|
||||||
|
if (query_size > (size_t)params.n_ctx) {
|
||||||
|
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Speedup small evaluations by evaluating atleast 32 tokens
|
||||||
|
if (query_size < 32) {
|
||||||
|
query_embd.resize(32);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Evaluate the query
|
||||||
|
if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
|
||||||
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto query_logits = llama_get_logits(ctx);
|
||||||
|
|
||||||
|
std::memcpy(tok_logits.data(), query_logits + (context_size-1)*n_vocab, n_vocab*sizeof(float));
|
||||||
|
const auto first_probs = softmax(tok_logits);
|
||||||
|
|
||||||
|
hs_data[task_idx].ending_logprob_count[0] = 1;
|
||||||
|
hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
|
||||||
|
|
||||||
|
// Calculate the logprobs over the ending
|
||||||
|
for (size_t j = context_size; j < query_size - 1; j++) {
|
||||||
|
|
||||||
|
std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
|
||||||
|
|
||||||
|
const float prob = softmax(tok_logits)[query_embd[j + 1]];
|
||||||
|
|
||||||
|
hs_data[task_idx].ending_logprob[0] += std::log(prob);
|
||||||
|
hs_data[task_idx].ending_logprob_count[0]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the mean token logprob for acc_norm
|
||||||
|
hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
|
||||||
|
|
||||||
|
// Do the remaining endings
|
||||||
|
// For these, we use the bare ending with n_past = context_size
|
||||||
|
//
|
||||||
|
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
|
||||||
|
|
||||||
// Tokenize the query
|
// Tokenize the query
|
||||||
std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
|
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
|
||||||
size_t query_size = query_embd.size();
|
query_size = query_embd.size();
|
||||||
|
//printf("Second query: %d\n",(int)query_size);
|
||||||
|
|
||||||
// Stop if query wont fit the ctx window
|
// Stop if query wont fit the ctx window
|
||||||
if (query_size > (size_t)params.n_ctx) {
|
if (context_size + query_size > (size_t)params.n_ctx) {
|
||||||
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
|
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Speedup small evaluations by evaluating atleast 32 tokens
|
// Speedup small evaluations by evaluating atleast 32 tokens
|
||||||
if (query_size < 32) {
|
// No, resizing to 32 is actually slightly slower (at least on CUDA)
|
||||||
query_embd.resize(32);
|
//if (query_size < 32) {
|
||||||
}
|
// query_embd.resize(32);
|
||||||
|
//}
|
||||||
|
|
||||||
// Evaluate the query
|
// Evaluate the query
|
||||||
if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
|
if (llama_eval(ctx, query_embd.data(), query_embd.size(), context_size, params.n_threads)) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto query_logits = llama_get_logits(ctx);
|
query_logits = llama_get_logits(ctx);
|
||||||
std::vector<float> logits;
|
|
||||||
logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
|
|
||||||
|
|
||||||
hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
|
hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
|
||||||
hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
|
hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
|
||||||
|
|
||||||
// Calculate the logprobs over the ending
|
// Calculate the logprobs over the ending
|
||||||
for (size_t j = context_size-1; j < query_size - 1; j++) {
|
for (size_t j = 0; j < query_size - 1; j++) {
|
||||||
// Calculate probability of next token, given the previous ones.
|
std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
|
||||||
const std::vector<float> tok_logits(
|
|
||||||
logits.begin() + (j + 0) * n_vocab,
|
|
||||||
logits.begin() + (j + 1) * n_vocab);
|
|
||||||
|
|
||||||
const float prob = softmax(tok_logits)[query_embd[ j + 1]];
|
const float prob = softmax(tok_logits)[query_embd[j + 1]];
|
||||||
|
|
||||||
hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
|
hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
|
||||||
hs_data[task_idx].ending_logprob_count[ending_idx]++;
|
hs_data[task_idx].ending_logprob_count[ending_idx]++;
|
||||||
@ -267,9 +315,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Find the ending with maximum logprob
|
// Find the ending with maximum logprob
|
||||||
size_t ending_logprob_max_idx = -1;
|
size_t ending_logprob_max_idx = 0;
|
||||||
double ending_logprob_max_val = -INFINITY;
|
double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
|
||||||
for (size_t j=0; j < 4; j++) {
|
for (size_t j = 1; j < 4; j++) {
|
||||||
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
|
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
|
||||||
ending_logprob_max_idx = j;
|
ending_logprob_max_idx = j;
|
||||||
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
|
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
|
||||||
|
Loading…
Reference in New Issue
Block a user