From 0161372b9a1a72d245b4694b547ed6905c2d7167 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 18 Sep 2023 20:30:05 +0300
Subject: [PATCH] parallel : example for serving multiple users in parallel

---
 common/common.cpp                    |   8 +-
 common/common.h                      |   2 +-
 examples/CMakeLists.txt              |   1 +
 examples/main/main.cpp               |   2 +-
 examples/parallel/CMakeLists.txt     |   8 +
 examples/parallel/parallel.cpp       | 244 +++++++++++++++++++++++++++
 examples/perplexity/perplexity.cpp   |   2 +-
 examples/speculative/speculative.cpp |   6 +-
 llama.cpp                            |   2 +-
 9 files changed, 262 insertions(+), 13 deletions(-)
 create mode 100644 examples/parallel/CMakeLists.txt
 create mode 100644 examples/parallel/parallel.cpp

diff --git a/common/common.cpp b/common/common.cpp
index fd50891f8..ff1b5ee9f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -454,8 +454,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                 params.logdir += DIRECTORY_SEPARATOR;
             }
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
+        } else if (arg == "--perplexity" || arg == "--all-logits") {
+            params.logits_all = true;
         } else if (arg == "--ppl-stride") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -653,7 +653,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
     printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    printf("  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
@@ -735,7 +735,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.f16_kv     = params.memory_f16;
     lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
-    lparams.logits_all = params.perplexity;
+    lparams.logits_all = params.logits_all;
     lparams.embedding  = params.embedding;
     lparams.rope_freq_base  = params.rope_freq_base;
     lparams.rope_freq_scale = params.rope_freq_scale;
diff --git a/common/common.h b/common/common.h
index 2cc11966b..945403274 100644
--- a/common/common.h
+++ b/common/common.h
@@ -113,7 +113,7 @@ struct gpt_params {
     bool ignore_eos  = false; // ignore generated EOS tokens
     bool instruct    = false; // instruction mode (used for Alpaca models)
     bool penalize_nl = true;  // consider newlines as a repeatable token
-    bool perplexity  = false; // compute perplexity over the prompt
+    bool logits_all  = false; // return logits for all tokens in the batch
     bool use_mmap    = true;  // use mmap for faster loads
     bool use_mlock   = false; // use mlock to keep model in memory
     bool numa        = false; // attempt optimizations that help on some NUMA systems
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 884c42764..df7307072 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,6 +24,7 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(speculative)
+    add_subdirectory(parallel)
     add_subdirectory(embd-input)
     add_subdirectory(llama-bench)
     add_subdirectory(beam-search)
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ed2d9e2f7..9c5f2746a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -124,7 +124,7 @@ int main(int argc, char ** argv) {
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
 
-    if (params.perplexity) {
+    if (params.logits_all) {
         printf("\n************\n");
         printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
         printf("************\n\n");
diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt
new file mode 100644
index 000000000..0bbf89eae
--- /dev/null
+++ b/examples/parallel/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET parallel)
+add_executable(${TARGET} parallel.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
new file mode 100644
index 000000000..1bb4d497f
--- /dev/null
+++ b/examples/parallel/parallel.cpp
@@ -0,0 +1,244 @@
+// A basic application simulating a server with multiple clients.
+// The clients submit requests to the server and they are processed in parallel.
+
+#include "build-info.h"
+
+#include "common.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end   = str.size();
+
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+
+    return str.substr(start, end - start);
+}
+
+static std::string k_system = R"(
+Transcript of a dialog, where the User interacts with an Assistant.
+The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Hello, what is the temperature outside?
+Assistant: It is 72 degrees Fahrenheit.
+User: What is the definition of a prime number?
+Assistant: A prime number is a number that is divisible only by itself and 1.
+User: )";
+
+static std::vector<std::string> k_prompts = {
+    "What is the meaning of life?",
+    "What is the population of Europe?",
+    "List all planets in the Solar System.",
+    "What is the capital of France?",
+    "Tell me an interesting fact about llamas.",
+    "What is the best way to cook a steak?",
+    "Are you familiar with the Special Theory of Relativity and can you explain it to me?",
+    "Recommend some interesting books to read.",
+    "What is the best way to learn a new language?",
+    "How to get a job at Google?",
+    "If you could have any superpower, what would it be?",
+    "I want to learn how to play the piano.",
+};
+
+struct client {
+    int32_t id = 0;
+
+    llama_seq_id seq_id = -1;
+
+    llama_token sampled;
+
+    int32_t n_prompt  = 0;
+    int32_t n_decoded = 0;
+    int32_t i_batch   = -1;
+
+    std::string input;
+    std::string prompt;
+    std::string response;
+
+    std::vector<llama_token> last_tokens;
+};
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    const int n_clients = 16;
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("parallel", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
+    // init llama.cpp
+    llama_backend_init(params.numa);
+
+    llama_model * model = NULL;
+
+    llama_context * ctx = NULL;
+
+    // load the target model
+    params.logits_all = true;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+    fprintf(stderr, "\n\n");
+    fflush(stderr);
+
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(ctx);
+
+    std::vector<client> clients(n_clients);
+    for (size_t i = 0; i < clients.size(); ++i) {
+        auto & client = clients[i];
+        client.id = i;
+        client.last_tokens.resize(n_ctx);
+        std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0);
+    }
+
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+
+    auto t_main_start = ggml_time_us();
+
+    int64_t n_tokens_total = 0;
+
+    llama_seq_id g_seq_id = 0;
+
+    std::vector<llama_token>  batch_token;
+    std::vector<llama_pos>    batch_pos;
+    std::vector<llama_seq_id> batch_seq_id;
+    std::vector<client *>     batch_clients;
+
+    while (true) {
+        uint32_t n_tokens = 0;
+
+        batch_token.clear();
+        batch_pos.clear();
+        batch_seq_id.clear();
+
+        for (auto & client : clients) {
+            if (client.seq_id == -1) {
+                client.seq_id   = g_seq_id;
+                client.input    = k_prompts[rand() % k_prompts.size()];
+                client.prompt   = k_system + client.input + "\nAssistant:";
+                client.response = "";
+                std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0);
+
+                std::vector<llama_token> prompt_tokens;
+                prompt_tokens = ::llama_tokenize(ctx, client.prompt, true);
+
+                for (size_t i = 0; i < prompt_tokens.size(); ++i) {
+                    batch_token.push_back(prompt_tokens[i]);
+                    batch_pos.push_back(i);
+                    batch_seq_id.push_back(client.seq_id);
+                    batch_clients.push_back(&client);
+                }
+                client.n_prompt  = prompt_tokens.size();
+                client.n_decoded = prompt_tokens.size();
+                client.i_batch   = batch_token.size() - 1;
+
+                g_seq_id += 1;
+            } else {
+                batch_token.push_back(client.sampled);
+                batch_pos.push_back(client.n_decoded);
+                batch_seq_id.push_back(client.seq_id);
+                batch_clients.push_back(&client);
+                client.n_decoded += 1;
+                client.i_batch = batch_token.size() - 1;
+            }
+        }
+
+        // process in chunks of params.n_batch
+        for (size_t i = 0; i < batch_token.size(); i += params.n_batch) {
+            n_tokens = std::min(params.n_batch, (int32_t) (batch_token.size() - i));
+
+            llama_batch batch = {
+                n_tokens,
+                batch_token.data() + i,
+                nullptr,
+                batch_pos.data() + i,
+                batch_seq_id.data() + i,
+                0, 0, 0, // unused
+            };
+
+            if (llama_decode(ctx, batch, params.n_threads)) {
+                LOG_TEE("%s : failed to decode batch\n", __func__);
+                return 1;
+            }
+
+            for (auto & client : clients) {
+                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
+                    continue;
+                }
+
+                const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i);
+
+                // remember which tokens were sampled - used for repetition penalties during sampling
+                client.last_tokens.erase(client.last_tokens.begin());
+                client.last_tokens.push_back(id);
+
+                const std::string token_str = llama_token_to_piece(ctx, id);
+                client.response += token_str;
+                client.sampled = id;
+
+                //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n",
+                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
+
+                if (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || client.response.find("User:") != std::string::npos) {
+                    const size_t pos = client.response.find("User:");
+                    if (pos != std::string::npos) {
+                        client.response = client.response.substr(0, pos);
+                    }
+
+                    llama_kv_cache_rm_seq(ctx, client.seq_id, 0, n_ctx);
+
+                    const auto t_main_end = ggml_time_us();
+
+                    n_tokens_total += client.n_decoded - client.n_prompt;
+
+                    printf("\033[1mClient %d, seq %d, prompt %d t, response %d t, speed: %.2f t/s\033[0m: \n\nInput: %s\nResponse: %s\n\n",
+                            client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt,
+                            (double) n_tokens_total / (t_main_end - t_main_start) * 1e6,
+                            client.input.c_str(), ::trim(client.response).c_str());
+
+                    client.seq_id = -1;
+                }
+            }
+        }
+
+        static bool is_first = true;
+        if (is_first) {
+            t_main_start   = ggml_time_us();
+            n_tokens_total = 0;
+            is_first       = false;
+        }
+    }
+
+    LOG_TEE("\n\n");
+
+    llama_print_timings(ctx);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    fprintf(stderr, "\n\n");
+
+    return 0;
+}
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index fd2160bbf..8386a3d16 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -681,7 +681,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    params.perplexity = true;
+    params.logits_all = true;
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     if (params.ppl_stride > 0) {
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 053073397..526d98e66 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;
 
     // load the target model
-    params.perplexity = true; // HACK: enable logits_all = true
+    params.logits_all = true;
     std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
 
     // load the draft model
@@ -172,7 +172,6 @@ int main(int argc, char ** argv) {
                 LOG("out of drafted tokens\n");
             }
 
-            llama_kv_cache_rm_seq(ctx_dft, 0, n_past_dft, n_ctx);
             llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads);
             ++n_past_dft;
 
@@ -218,7 +217,6 @@ int main(int argc, char ** argv) {
 
         // sample n_draft tokens from the draft model using greedy decoding
         int n_past_cur = n_past_dft;
-
         for (int i = 0; i < n_draft; ++i) {
             float * logits = llama_get_logits(ctx_dft);
 
@@ -258,7 +256,6 @@ int main(int argc, char ** argv) {
             }
 
             // evaluate the drafted token on the draft model
-            llama_kv_cache_rm_seq(ctx_dft, 0, n_past_cur, n_ctx);
             llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads);
             ++n_past_cur;
 
@@ -268,7 +265,6 @@ int main(int argc, char ** argv) {
         }
 
         // evaluate the target model on the drafted tokens
-        llama_kv_cache_rm_seq(ctx_tgt, 0, n_past_tgt, n_ctx);
         llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads);
         ++n_past_tgt;
 
diff --git a/llama.cpp b/llama.cpp
index a7c7604d9..875fd5227 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6673,7 +6673,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
 
         // build worst-case graph
-        uint32_t n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+        uint32_t n_tokens = std::max((int)hparams.n_ctx, params.n_batch);
         llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
         ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, 0, 0));
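
Note: as a rough illustration of the batching scheme that parallel.cpp above is built on, here is a minimal sketch (not part of the patch) that packs two independent prompts into one llama_batch, tags every token with its own position and sequence id so the KV cache keeps the sequences separate, and evaluates everything in a single llama_decode call. The llama_batch field order, the llama_decode(ctx, batch, n_threads) signature and the ::llama_tokenize helper are assumed to match the branch of the API used in the diff; the function decode_two_prompts is a made-up name for the example.

// Sketch only: decode two independent prompts in a single llama_decode call,
// mirroring how parallel.cpp assembles its batches (assumed API, see note above).
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

static bool decode_two_prompts(llama_context * ctx, const gpt_params & params) {
    const std::vector<std::string> prompts = { "Hello there.", "Bonjour." };

    std::vector<llama_token>  batch_token;
    std::vector<llama_pos>    batch_pos;
    std::vector<llama_seq_id> batch_seq_id;

    for (llama_seq_id seq = 0; seq < (llama_seq_id) prompts.size(); ++seq) {
        const std::vector<llama_token> tokens = ::llama_tokenize(ctx, prompts[seq], true);

        for (size_t i = 0; i < tokens.size(); ++i) {
            batch_token .push_back(tokens[i]);
            batch_pos   .push_back((llama_pos) i); // positions restart for every sequence
            batch_seq_id.push_back(seq);           // each prompt gets its own KV cache sequence
        }
    }

    llama_batch batch = {
        (uint32_t) batch_token.size(),
        batch_token.data(),
        nullptr,             // no embedding input
        batch_pos.data(),
        batch_seq_id.data(),
        0, 0, 0,             // unused, as in parallel.cpp
    };

    // all tokens of both prompts are evaluated in one pass; the logits of each
    // sequence can then be sampled independently, as the clients loop above does
    if (llama_decode(ctx, batch, params.n_threads)) {
        fprintf(stderr, "%s: llama_decode failed\n", __func__);
        return false;
    }

    return true;
}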