mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-27 04:23:06 +01:00)
scripts : add non-interactive server-llm.sh (#5303)
* Update server-llm.sh

Add a --non-interactive flag that allows running the script without asking for permission.

* Update scripts/server-llm.sh

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 5d55b0cd82
commit 4be04c8965
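Before the diff, a minimal invocation sketch using the new flag (illustrative only; the option names and defaults come from the usage text updated below, and running from a repo checkout is assumed):

    # run from a checkout of the repo, skipping the confirmation prompt
    ./scripts/server-llm.sh --non-interactive

    # the same, with an explicit port and weights type
    ./scripts/server-llm.sh --non-interactive --port 8888 --wtype q4_0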
scripts/server-llm.sh

@@ -47,6 +47,7 @@ if ! command -v make &> /dev/null; then
 fi

 # parse arguments
+is_interactive=1
 port=8888
 repo=""
 wtype=""
@@ -66,15 +67,16 @@ verbose=0

 function print_usage {
     printf "Usage:\n"
-    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
-    printf "  --port: port number, default is 8888\n"
-    printf "  --repo: path to a repo containing GGUF model files\n"
-    printf "  --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
-    printf "  --backend: cpu, cuda, metal, opencl, depends on the OS\n"
-    printf "  --gpu-id: gpu id, default is 0\n"
-    printf "  --n-parallel: number of parallel requests, default is 8\n"
-    printf "  --n-kv: KV cache size, default is 4096\n"
-    printf "  --verbose: verbose output\n\n"
+    printf "  ./server-llm.sh [-interactive] [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
+    printf "  --non-interactive: run without asking a permision to run\n"
+    printf "  --port: port number, default is 8888\n"
+    printf "  --repo: path to a repo containing GGUF model files\n"
+    printf "  --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
+    printf "  --backend: cpu, cuda, metal, opencl, depends on the OS\n"
+    printf "  --gpu-id: gpu id, default is 0\n"
+    printf "  --n-parallel: number of parallel requests, default is 8\n"
+    printf "  --n-kv: KV cache size, default is 4096\n"
+    printf "  --verbose: verbose output\n\n"
     printf "Example:\n\n"
     printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
 }
@@ -82,6 +84,10 @@ function print_usage {
 while [[ $# -gt 0 ]]; do
     key="$1"
     case $key in
+        --non-interactive)
+            is_interactive=0
+            shift
+            ;;
         --port)
             port="$2"
             shift
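The new case arm follows the script's existing while/case parsing loop. A generic sketch of the pattern (illustrative only, not the script verbatim; the handling of value-taking options beyond this hunk is assumed):

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --non-interactive)   # boolean switch: nothing to read, shift once
                is_interactive=0
                shift
                ;;
            --port)              # value-taking option: read $2, then shift past flag and value
                port="$2"
                shift
                shift
                ;;
            *)                   # anything else: stop with an error (illustrative default)
                echo "unknown option: $1" >&2
                exit 1
                ;;
        esac
    done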
@@ -176,31 +182,32 @@ repos=(
     "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
     "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
 )
-
-printf "\n"
-printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
-printf "    Based on the options that follow, the script might download a model file\n"
-printf "    from the internet, which can be a few GBs in size. The script will also\n"
-printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
-printf "\n"
-printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
-printf "    model using llama.cpp for demonstration purposes.\n"
-printf "\n"
-printf "    Please note:\n"
-printf "\n"
-printf "    - All new data will be stored in the current folder\n"
-printf "    - The server will be listening on all network interfaces\n"
-printf "    - The server will run with default settings which are not always optimal\n"
-printf "    - Do not judge the quality of a model based on the results from this script\n"
-printf "    - Do not use this script to benchmark llama.cpp\n"
-printf "    - Do not use this script in production\n"
-printf "    - This script is only for demonstration purposes\n"
-printf "\n"
-printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
-printf "\n"
-printf "    Press Enter to continue ...\n\n"
+if [ $is_interactive -eq 1 ]; then
+    printf "\n"
+    printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
+    printf "    Based on the options that follow, the script might download a model file\n"
+    printf "    from the internet, which can be a few GBs in size. The script will also\n"
+    printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
+    printf "\n"
+    printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
+    printf "    model using llama.cpp for demonstration purposes.\n"
+    printf "\n"
+    printf "    Please note:\n"
+    printf "\n"
+    printf "    - All new data will be stored in the current folder\n"
+    printf "    - The server will be listening on all network interfaces\n"
+    printf "    - The server will run with default settings which are not always optimal\n"
+    printf "    - Do not judge the quality of a model based on the results from this script\n"
+    printf "    - Do not use this script to benchmark llama.cpp\n"
+    printf "    - Do not use this script in production\n"
+    printf "    - This script is only for demonstration purposes\n"
+    printf "\n"
+    printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
+    printf "\n"
+    printf "    Press Enter to continue ...\n\n"

-read
+    read
+fi

 if [[ -z "$repo" ]]; then
     printf "[+] No repo provided from the command line\n"
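With the confirmation block gated on is_interactive, the script should be runnable unattended. A hedged sketch combining the one-liner from the usage text with the new flag (forwarding arguments through bash -c is an assumption, not something this commit shows):

    # the "--" is consumed as $0 by the downloaded script; --non-interactive arrives as $1
    bash -c "$(curl -s https://ggml.ai/server-llm.sh)" -- --non-interactive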