commit 82df7f9f0e

Merge pull request #1 from HanClinto/bins-rename-nits

Nits found in binary renames
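For quick reference, the binaries touched by this commit follow the new `llama-` prefix; a minimal sketch of the renamed invocations, reusing the example model paths from the updated docs in the diff below:

```shell
# Renamed binaries as invoked in the updated docs (model paths are the docs' examples, not requirements)
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw
./llama-export-lora --help
```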
@@ -733,7 +733,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
 
 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
 
@@ -958,7 +958,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
 
 ### Docs
 
-- [main](./examples/main/README.md)
+- [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [BLIS](./docs/BLIS.md)
@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
 
 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
 
-Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
 
 ## GGUF specification
 
@@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
 Usage:
 
 ```shell
-eval-callback \
+llama-eval-callback \
     --hf-repo ggml-org/models \
     --hf-file phi-2/ggml-model-q4_0.gguf \
     --model phi-2-q4_0.gguf \
@@ -3,7 +3,7 @@
 Apply LORA adapters to base model and export the resulting model.
 
 ```
-usage: export-lora [options]
+usage: llama-export-lora [options]
 
 options:
   -h, --help show this help message and exit
@@ -11,14 +11,14 @@
 #include <unordered_map>
 #include <vector>
 
-static void print_usage() {
+static void print_usage(char* argv0) {
     fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
-    fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
+    fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0);
 }
 
 int main(int argc, char ** argv){
     if (argc < 3) {
-        print_usage();
+        print_usage(argv[0]);
         exit(1);
     }
 
@@ -27,7 +27,7 @@ int main(int argc, char ** argv){
     for (int i = 0; i < argc-1; ++i) {
         args[i] = argv[i+1];
         if (args[i] == "-h" || args[i] == "--help") {
-            print_usage();
+            print_usage(argv[0]);
             exit(0);
         }
     }
@@ -64,7 +64,7 @@ llama-cli.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
 
 ## Common Options
 
-In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
 
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
 - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
@@ -74,7 +74,7 @@ In this section, we cover the most commonly used options for running the `main`
 
 ## Input Prompts
 
-The `main` program provides several ways to interact with the LLaMA models using input prompts:
+The `llama-cli` program provides several ways to interact with the LLaMA models using input prompts:
 
 - `--prompt PROMPT`: Provide a prompt directly as a command-line option.
 - `--file FNAME`: Provide a file containing a prompt or multiple prompts.
@@ -82,7 +82,7 @@ The `main` program provides several ways to interact with the LLaMA models using
 
 ## Interaction
 
-The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
+The `llama-cli` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
 
 In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
 
@@ -476,7 +476,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 }
 
 // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+// Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
 // Output: `perplexity: 13.5106 [114/114]`
 // BOS tokens will be added for each chunk before eval
 
@@ -10,7 +10,7 @@ import yaml
 
 logger = logging.getLogger("run-with-preset")
 
-CLI_ARGS_MAIN_PERPLEXITY = [
+CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
     "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
     "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
@@ -29,7 +29,7 @@ CLI_ARGS_LLAMA_BENCH = [
     "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
 ]
 
-CLI_ARGS_SERVER = [
+CLI_ARGS_LLAMA_SERVER = [
     "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
     "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
     "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
@@ -37,7 +37,7 @@ CLI_ARGS_SERVER = [
 ]
 
 description = """Run llama.cpp binaries with presets from YAML file(s).
-To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported).
+To specify which binary should be run, specify the "binary" property (llama-cli, llama-perplexity, llama-bench, and llama-server are supported).
 To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
 
 Formatting considerations:
@@ -77,19 +77,19 @@ for yaml_file in known_args.yaml_files:
 
 props = {prop.replace("_", "-"): val for prop, val in props.items()}
 
-binary = props.pop("binary", "main")
+binary = props.pop("binary", "llama-cli")
 if known_args.binary:
     binary = known_args.binary
 
 if os.path.exists(f"./{binary}"):
     binary = f"./{binary}"
 
-if binary.lower().endswith("main") or binary.lower().endswith("perplexity"):
-    cli_args = CLI_ARGS_MAIN_PERPLEXITY
+if binary.lower().endswith("llama-cli") or binary.lower().endswith("llama-perplexity"):
+    cli_args = CLI_ARGS_LLAMA_CLI_PERPLEXITY
 elif binary.lower().endswith("llama-bench"):
     cli_args = CLI_ARGS_LLAMA_BENCH
-elif binary.lower().endswith("server"):
-    cli_args = CLI_ARGS_SERVER
+elif binary.lower().endswith("llama-server"):
+    cli_args = CLI_ARGS_LLAMA_SERVER
 else:
     logger.error(f"Unknown binary: {binary}")
     sys.exit(1)