From e8b8d32e8663ffc55a02c9721af3a5190382cbb0 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong
Date: Thu, 5 Oct 2023 09:02:55 -0500
Subject: [PATCH] server : fix incorrect num_tokens_predicted (#3480)

---
 examples/server/server.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 921eb5da4..6e31e1332 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -504,9 +504,11 @@ struct llama_server_context
             });
         }
 
+        bool tg = true;
         while (n_past < embd.size())
         {
             int n_eval = (int)embd.size() - n_past;
+            tg = n_eval == 1;
             if (n_eval > params.n_batch)
             {
                 n_eval = params.n_batch;
@@ -633,7 +635,9 @@ struct llama_server_context
 
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(result.tok);
-            num_tokens_predicted++;
+            if (tg) {
+                num_tokens_predicted++;
+            }
         }
 
         // add it to the context
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
 {
     const auto timings = llama_get_timings(llama.ctx);
 
-    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
-
     return json{
         {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},
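
Note on the fix: llama.cpp's internal timings classify each eval call by batch
size, with a single-token call counted toward timings.n_eval (generation) and a
multi-token batch toward timings.n_p_eval (prompt processing). The server
incremented num_tokens_predicted once per sampled token regardless, so after
any batched (re)evaluation of context the counter diverged from timings.n_eval,
which is the condition the removed assert checked. The patch mirrors llama.cpp's
classification: tg records whether the final eval in the loop was single-token,
and only those samples are counted.

The following is a minimal, self-contained sketch of that counting rule, not
the server code itself: the llama_eval call is stubbed out, and the step()
driver, n_batch value, and token values are illustrative, not from the patch.

    // Sketch of the counting rule introduced above (real code lives in
    // examples/server/server.cpp inside llama_server_context).
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_batch = 4;                      // stand-in for params.n_batch
        std::vector<int> embd = {1, 2, 3, 4, 5, 6}; // tokens pending evaluation
        size_t n_past = 0;                          // tokens already in the KV cache
        size_t num_tokens_predicted = 0;

        // One server step: evaluate everything pending, then "sample" one token.
        auto step = [&](int sampled_tok) {
            bool tg = true;                   // "tg" = text generation
            while (n_past < embd.size()) {
                int n_eval = (int)embd.size() - n_past;
                tg = n_eval == 1;             // >1 token at once is prompt-style
                if (n_eval > n_batch) {       // processing, timed as n_p_eval,
                    n_eval = n_batch;         // not n_eval
                }
                // ... llama_eval(ctx, embd.data() + n_past, n_eval, n_past) ...
                n_past += n_eval;
            }
            embd.push_back(sampled_tok);
            if (tg) {
                num_tokens_predicted++;       // count only single-token evals so
            }                                 // the total tracks timings.n_eval
        };

        step(7); // a batched prompt eval precedes this sample: not counted
        step(8); // single-token evals precede these samples: counted
        step(9);
        printf("num_tokens_predicted = %zu\n", num_tokens_predicted); // prints 2
        return 0;
    }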