mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-10-29 22:20:15 +01:00
parallel : add option to load external prompt file (#3416)
* Enable external file and add datestamp * Add name of external file at end * Upload ToK2024 * Delete ToK2024.txt * Experiments with jeopardy * Move ParallelQuestions to /prompts and rename * Interim commit * Interim commit * Final revision * Remove trailing whitespace * remove cmake_all.sh * Remove cmake_all.sh * Changed .gitignore * Improved reporting and new question files. * Corrected typo * More LLM questions * Update LLM-questions.txt * Yet more LLM-questions * Remove jeopardy results file * Reinstate original jeopardy.sh * Update examples/parallel/parallel.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
97af49fa39
commit
a8777ad84e
@ -167,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
// store the external file name in params
|
||||
params.prompt_file = argv[i];
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
|
@ -79,6 +79,7 @@ struct gpt_params {
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string prompt = "";
|
||||
std::string prompt_file = ""; // store the external prompt file name
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <cstdio>
#include <ctime>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
|
||||
|
||||
// trim whitespace from the beginning and end of a string
|
||||
static std::string trim(const std::string & str) {
|
||||
@ -70,6 +71,26 @@ struct client {
|
||||
std::vector<llama_token> tokens_prev;
|
||||
};
|
||||
|
||||
// Print the current local date and time to stdout as a magenta
// "run parameters as at YYYY-MM-DD HH:MM:SS" banner line.
static void print_date_time() {
    const std::time_t current_time = std::time(nullptr);
    const std::tm * local_time = std::localtime(&current_time);

    char buffer[80];
    // std::localtime may return nullptr and std::strftime returns 0 when the
    // result does not fit; in either case fall back to a placeholder instead
    // of printing an indeterminate buffer.
    if (local_time == nullptr ||
        std::strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time) == 0) {
        std::snprintf(buffer, sizeof(buffer), "%s", "unknown");
    }

    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
}
|
||||
|
||||
// Split `input` into the pieces separated by `delimiter`.
//
// Pieces are returned in order of appearance and do not include the delimiter.
// Two adjacent delimiters produce an empty piece; a single trailing delimiter
// does not produce a trailing empty piece, and an empty `input` yields an
// empty vector.
static std::vector<std::string> split_string(const std::string & input, char delimiter) {
    std::vector<std::string> tokens;
    std::istringstream stream(input);
    std::string token;
    while (std::getline(stream, token, delimiter)) {
        // std::getline erases `token` before each read, so it is safe to move
        // from it here instead of copying every piece into the vector.
        tokens.push_back(std::move(token));
    }
    return tokens;
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
srand(1234);
|
||||
|
||||
@ -104,6 +125,23 @@ int main(int argc, char ** argv) {
|
||||
params.logits_all = true;
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
|
||||
// load the prompts from an external file if there are any
|
||||
if (params.prompt.empty()) {
|
||||
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||
} else {
|
||||
// Split the external prompt text in params.prompt into lines, print each one and copy it into k_prompts
|
||||
int index = 0;
|
||||
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
||||
|
||||
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
||||
for (const auto& prompt : prompts) {
|
||||
k_prompts.resize(index + 1);
|
||||
k_prompts[index] = prompt;
|
||||
index++;
|
||||
printf("%3d prompt: %s\n", index, prompt.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
fflush(stderr);
|
||||
|
||||
@ -233,7 +271,7 @@ int main(int argc, char ** argv) {
|
||||
client.n_decoded = 0;
|
||||
client.i_batch = batch.n_tokens - 1;
|
||||
|
||||
LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
|
||||
g_seq_id += 1;
|
||||
|
||||
@ -336,8 +374,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
|
||||
client.id, client.seq_id, client.n_prompt, client.n_decoded,
|
||||
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
||||
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
|
||||
(t_main_end - client.t_start_prompt) / 1e6,
|
||||
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
||||
n_cache_miss,
|
||||
@ -357,13 +395,21 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
print_date_time();
|
||||
|
||||
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
if (params.prompt_file.empty()) {
|
||||
params.prompt_file = "used built-in defaults";
|
||||
}
|
||||
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||
|
||||
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
LOG_TEE("\n");
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
|
10
llama.cpp
10
llama.cpp
@ -8219,14 +8219,14 @@ void llama_print_timings(struct llama_context * ctx) {
|
||||
const llama_timings timings = llama_get_timings(ctx);
|
||||
|
||||
LLAMA_LOG_INFO("\n");
|
||||
LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
|
||||
LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
|
||||
LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
|
||||
LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
|
||||
LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
|
||||
LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
|
||||
LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
|
||||
}
|
||||
|
||||
void llama_reset_timings(struct llama_context * ctx) {
|
||||
|
49
prompts/LLM-questions.txt
Normal file
49
prompts/LLM-questions.txt
Normal file
@ -0,0 +1,49 @@
|
||||
In the context of LLMs, what is "Attention"?
|
||||
In the context of LLMs, what is a completion?
|
||||
In the context of LLMs, what is a prompt?
|
||||
In the context of LLMs, what is GELU?
|
||||
In the context of LLMs, what is RELU?
|
||||
In the context of LLMs, what is softmax?
|
||||
In the context of LLMs, what is decoding?
|
||||
In the context of LLMs, what is encoding?
|
||||
In the context of LLMs, what is tokenizing?
|
||||
In the context of LLMs, what is an embedding?
|
||||
In the context of LLMs, what is quantization?
|
||||
In the context of LLMs, what is a tensor?
|
||||
In the context of LLMs, what is a sparse tensor?
|
||||
In the context of LLMs, what is a vector?
|
||||
In the context of LLMs, how is attention implemented?
|
||||
In the context of LLMs, why is attention all you need?
|
||||
In the context of LLMs, what is "RoPE" and what is it used for?
|
||||
In the context of LLMs, what is "LoRA" and what is it used for?
|
||||
In the context of LLMs, what are weights?
|
||||
In the context of LLMs, what are biases?
|
||||
In the context of LLMs, what are checkpoints?
|
||||
In the context of LLMs, what is "perplexity"?
|
||||
In the context of LLMs, what are models?
|
||||
In the context of machine-learning, what is "catastrophic forgetting"?
|
||||
In the context of machine-learning, what is "elastic weight consolidation (EWC)"?
|
||||
In the context of neural nets, what is a hidden layer?
|
||||
In the context of neural nets, what is a convolution?
|
||||
In the context of neural nets, what is dropout?
|
||||
In the context of neural nets, what is cross-entropy?
|
||||
In the context of neural nets, what is over-fitting?
|
||||
In the context of neural nets, what is under-fitting?
|
||||
What is the difference between an interpreted computer language and a compiled computer language?
|
||||
In the context of software development, what is a debugger?
|
||||
When processing using a GPU, what is off-loading?
|
||||
When processing using a GPU, what is a batch?
|
||||
When processing using a GPU, what is a block?
|
||||
When processing using a GPU, what is the difference between a batch and a block?
|
||||
When processing using a GPU, what is a scratch tensor?
|
||||
When processing using a GPU, what is a layer?
|
||||
When processing using a GPU, what is a cache?
|
||||
When processing using a GPU, what is unified memory?
|
||||
When processing using a GPU, what is VRAM?
|
||||
When processing using a GPU, what is a kernel?
|
||||
When processing using a GPU, what is "metal"?
|
||||
In the context of LLMs, what are "Zero-Shot", "One-Shot" and "Few-Shot" learning models?
|
||||
In the context of LLMs, what is the "Transformer-model" architecture?
|
||||
In the context of LLMs, what is "Multi-Head Attention"?
|
||||
In the context of LLMs, what is "Self-Attention"?
|
||||
In the context of transformer-model architectures, how do attention mechanisms use masks?
|
42
prompts/parallel-questions.txt
Normal file
42
prompts/parallel-questions.txt
Normal file
@ -0,0 +1,42 @@
|
||||
What do you know about Hobbits?
|
||||
What is quantum field theory?
|
||||
Why did the chicken cross the road?
|
||||
Who is the president of the United States?
|
||||
How do I run CMake on MacOS?
|
||||
Do you agree that C++ is a really finicky language compared with Python3?
|
||||
Is it a good idea to invest in technology?
|
||||
Do you like Wagner's Ring?
|
||||
Do you think this file input option is really neat?
|
||||
What should we all do about climate change?
|
||||
Is time-travel possible within the laws of current physics?
|
||||
Is it like anything to be a bat?
|
||||
Once the chicken has crossed the road, does it try to go back?
|
||||
Who is the greatest of all musical composers?
|
||||
What is art?
|
||||
Is there life elsewhere in the universe?
|
||||
What is intelligence?
|
||||
What is the difference between knowledge and intelligence?
|
||||
Will religion ever die?
|
||||
Do we understand ourselves?
|
||||
What is the best way to cook eggs?
|
||||
If you cannot see things, on what basis do you evaluate them?
|
||||
Explain the role of the p-n junction in photovoltaic cells.
|
||||
Is professional sport a good or bad influence on human behaviour?
|
||||
Is capital punishment immoral?
|
||||
Should we care about other people?
|
||||
Who are you?
|
||||
Which sense would you surrender if you could?
|
||||
Was Henry Ford a hero or a villain?
|
||||
Do we need leaders?
|
||||
What is nucleosynthesis?
|
||||
Who is the greatest scientist of all time?
|
||||
Who first observed what came to be known as the photovoltaic effect?
|
||||
What is nuclear fusion and why does it release energy?
|
||||
Can you know that you exist?
|
||||
What is an exoplanet?
|
||||
Do you like cream?
|
||||
What is the difference?
|
||||
Can I know that I exist while I'm dreaming that I'm Descartes?
|
||||
Who said "I didn't know I thought that until I heard myself saying it"?
|
||||
Does anything really matter?
|
||||
Can you explain the unreasonable effectiveness of mathematics?
|
Loading…
Reference in New Issue
Block a user