mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 00:39:00 +01:00
batched-bench : pass custom set of PP, TG and PL
This commit is contained in:
parent
c062ffd18c
commit
daeb834da9
@ -10,13 +10,16 @@ There are 2 modes of operation:
|
||||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||
|
||||
```bash
|
||||
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL]
|
||||
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>
|
||||
|
||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
|
||||
|
||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
|
||||
|
||||
# custom set of batches
|
||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32
|
||||
```
|
||||
|
||||
## Sample results
|
||||
|
@ -7,11 +7,34 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// mutates the input string
|
||||
static std::vector<int> parse_list(char * p) {
|
||||
std::vector<int> ret;
|
||||
|
||||
char * q = p;
|
||||
|
||||
while (*p) {
|
||||
if (*p == ',') {
|
||||
*p = '\0';
|
||||
ret.push_back(std::atoi(q));
|
||||
q = p + 1;
|
||||
}
|
||||
|
||||
++p;
|
||||
}
|
||||
|
||||
ret.push_back(std::atoi(q));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL]\n" , argv[0]);
|
||||
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
|
||||
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
||||
printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
@ -40,6 +63,18 @@ int main(int argc, char ** argv) {
|
||||
n_gpu_layers = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
n_pp = parse_list(argv[5]);
|
||||
}
|
||||
|
||||
if (argc >= 7) {
|
||||
n_tg = parse_list(argv[6]);
|
||||
}
|
||||
|
||||
if (argc >= 8) {
|
||||
n_pl = parse_list(argv[7]);
|
||||
}
|
||||
|
||||
// init LLM
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
Loading…
Reference in New Issue
Block a user