From 5265c15d4cd0f7255328c622c6281f8fd58809cd Mon Sep 17 00:00:00 2001
From: Olivier Chafik <ochafik@google.com>
Date: Mon, 10 Jun 2024 15:34:14 +0100
Subject: [PATCH] rename llama|main -> llama-cli; consistent RPM bin prefixes

---
 .devops/cloud-v-pipeline                   |  2 +-
 .devops/llama-cpp-clblast.srpm.spec        | 14 ++---
 .devops/llama-cpp-cuda.srpm.spec           | 14 ++---
 .devops/llama-cpp.srpm.spec                | 14 ++---
 .devops/llama-cuda.Dockerfile              |  4 +-
 .devops/llama-intel.Dockerfile             |  4 +-
 .devops/llama-rocm.Dockerfile              |  2 +-
 .devops/llama-vulkan.Dockerfile            |  4 +-
 .devops/llama.Dockerfile                   |  4 +-
 .devops/nix/package.nix                    |  2 +-
 .devops/tools.sh                           |  2 +-
 .dockerignore                              |  2 +-
 .github/ISSUE_TEMPLATE/01-bug-low.yml      |  2 +-
 .github/ISSUE_TEMPLATE/02-bug-medium.yml   |  2 +-
 .github/ISSUE_TEMPLATE/03-bug-high.yml     |  2 +-
 .github/ISSUE_TEMPLATE/04-bug-critical.yml |  2 +-
 .gitignore                                 |  1 -
 Makefile                                   | 10 ++--
 README-sycl.md                             |  6 +-
 README.md                                  | 14 ++---
 ci/run.sh                                  | 66 +++++++++++-----------
 docs/token_generation_performance_tips.md  |  4 +-
 examples/Miku.sh                           |  2 +-
 examples/alpaca.sh                         |  2 +-
 examples/baby-llama/CMakeLists.txt         |  2 +-
 examples/base-translate.sh                 |  2 +-
 examples/chat-13B.sh                       |  2 +-
 examples/chat-persistent.sh                | 10 ++--
 examples/chat-vicuna.sh                    |  2 +-
 examples/chat.sh                           |  2 +-
 examples/convert-llama2c-to-ggml/README.md |  2 +-
 examples/finetune/README.md                |  6 +-
 examples/gguf-split/tests.sh               |  2 +-
 examples/gpt4all.sh                        |  2 +-
 examples/jeopardy/jeopardy.sh              |  2 +-
 examples/json_schema_to_grammar.py         |  2 +-
 examples/llama2-13b.sh                     |  2 +-
 examples/llama2.sh                         |  2 +-
 examples/llava/CMakeLists.txt              |  4 +-
 examples/llava/MobileVLM-README.md         | 16 +++---
 examples/llava/README.md                   |  8 +--
 examples/llava/android/adb_run.sh          |  2 +-
 examples/main/CMakeLists.txt               |  1 -
 examples/main/README.md                    | 10 ++--
 examples/quantize/tests.sh                 |  2 +-
 examples/reason-act.sh                     |  2 +-
 examples/rpc/README.md                     |  2 +-
 examples/sycl/run-llama2.sh                |  8 +--
 examples/train-text-from-scratch/README.md |  2 +-
 grammars/README.md                         |  2 +-
 scripts/hf.sh                              |  6 +-
 51 files changed, 142 insertions(+), 144 deletions(-)

diff --git a/.devops/cloud-v-pipeline b/.devops/cloud-v-pipeline
index 90e5bbd9e..af8c0cea6 100644
--- a/.devops/cloud-v-pipeline
+++ b/.devops/cloud-v-pipeline
@@ -15,7 +15,7 @@ node('x86_runner1'){            // Running on x86 runner containing latest vecto
     stage('Running llama.cpp'){
         sh'''#!/bin/bash
             module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc
-            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
+            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
             cat llama_log.txt                   # Printing results
         '''
     }
diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec
index dcf803cb4..013952191 100644
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama %{buildroot}%{_bindir}/llamaclblast
-cp -p llama-server %{buildroot}%{_bindir}/llamaclblastserver
-cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llamaclblast
-%{_bindir}/llamaclblastserver
-%{_bindir}/llamaclblastsimple
+%{_bindir}/llama-clblast-cli
+%{_bindir}/llama-clblast-server
+%{_bindir}/llama-clblast-simple
 /usr/lib/systemd/system/llamaclblast.service
 %config /etc/sysconfig/llama
 
diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec
index cec6052fa..cbdf43626 100644
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama %{buildroot}%{_bindir}/llamacppcuda
-cp -p llama-server %{buildroot}%{_bindir}/llamacppcudaserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llamacppcuda
-%{_bindir}/llamacppcudaserver
-%{_bindir}/llamacppcudasimple
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama
 
diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec
index 6d9de60ad..4d5560089 100644
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -38,9 +38,9 @@ make -j
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama %{buildroot}%{_bindir}/llama
-cp -p llama-server %{buildroot}%{_bindir}/llamaserver
-cp -p simple %{buildroot}%{_bindir}/llamasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llama
-%{_bindir}/llamaserver
-%{_bindir}/llamasimple
+%{_bindir}/llama-cli
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama
 
diff --git a/.devops/llama-cuda.Dockerfile b/.devops/llama-cuda.Dockerfile
index 5ec554aea..c2a3914ed 100644
--- a/.devops/llama-cuda.Dockerfile
+++ b/.devops/llama-cuda.Dockerfile
@@ -30,6 +30,6 @@ FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 RUN apt-get update && \
     apt-get install -y libgomp1
 
-COPY --from=build /app/llama /llama
+COPY --from=build /app/llama-cli /llama-cli
 
-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-intel.Dockerfile b/.devops/llama-intel.Dockerfile
index 0c1f4468f..027d1a62d 100644
--- a/.devops/llama-intel.Dockerfile
+++ b/.devops/llama-intel.Dockerfile
@@ -27,8 +27,8 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 
-COPY --from=build /app/build/bin/llama /llama
+COPY --from=build /app/build/bin/llama-cli /llama-cli
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-rocm.Dockerfile b/.devops/llama-rocm.Dockerfile
index 009ac826c..86b11b07f 100644
--- a/.devops/llama-rocm.Dockerfile
+++ b/.devops/llama-rocm.Dockerfile
@@ -42,4 +42,4 @@ ENV CXX=/opt/rocm/llvm/bin/clang++
 
 RUN make -j$(nproc) llama
 
-ENTRYPOINT [ "/app/main" ]
+ENTRYPOINT [ "/app/llama-cli" ]
diff --git a/.devops/llama-vulkan.Dockerfile b/.devops/llama-vulkan.Dockerfile
index 1310eb90b..7a0abe71f 100644
--- a/.devops/llama-vulkan.Dockerfile
+++ b/.devops/llama-vulkan.Dockerfile
@@ -19,9 +19,9 @@ RUN cmake -B build -DLLAMA_VULKAN=1 && \
 
 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama /llama && \
+RUN cp /app/build/bin/llama-cli /llama-cli && \
     rm -rf /app
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama.Dockerfile b/.devops/llama.Dockerfile
index c5479800e..21fdf2a66 100644
--- a/.devops/llama.Dockerfile
+++ b/.devops/llama.Dockerfile
@@ -16,8 +16,8 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
 RUN apt-get update && \
     apt-get install -y libgomp1
 
-COPY --from=build /app/llama /llama
+COPY --from=build /app/llama-cli /llama-cli
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 107cc995f..87bb3a20f 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -292,7 +292,7 @@ effectiveStdenv.mkDerivation (
       license = lib.licenses.mit;
 
       # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
+      mainProgram = "llama-cli";
 
       # These people might respond, on the best effort basis, if you ping them
       # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
diff --git a/.devops/tools.sh b/.devops/tools.sh
index 1ed20d56b..335382f69 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -12,7 +12,7 @@ if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama "$@"
+    ./llama-cli "$@"
 elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
     ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
diff --git a/.dockerignore b/.dockerignore
index 42d0ad165..8916e2a66 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -12,7 +12,7 @@ build*/
 
 models/*
 
-/llama
+/llama-cli
 /llama-quantize
 
 arm_neon.h
diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml
index 67b274092..54785854f 100644
--- a/.github/ISSUE_TEMPLATE/01-bug-low.yml
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@@ -24,7 +24,7 @@ body:
       label: Name and Version
       description: Which executable and which version of our software are you running? (use `--version` to get a version string)
       placeholder: |
-        $./llama --version
+        $./llama-cli --version
         version: 2999 (42b4109e)
         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
     validations:
diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
index 9f5adbe2f..a6285c6f0 100644
--- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@@ -24,7 +24,7 @@ body:
       label: Name and Version
       description: Which executable and which version of our software are you running? (use `--version` to get a version string)
       placeholder: |
-        $./llama --version
+        $./llama-cli --version
         version: 2999 (42b4109e)
         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
     validations:
diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml
index 7fcc8a7d5..ff816b937 100644
--- a/.github/ISSUE_TEMPLATE/03-bug-high.yml
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@@ -24,7 +24,7 @@ body:
       label: Name and Version
       description: Which executable and which version of our software are you running? (use `--version` to get a version string)
       placeholder: |
-        $./llama --version
+        $./llama-cli --version
         version: 2999 (42b4109e)
         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
     validations:
diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
index 848deab31..7af42a80b 100644
--- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@@ -24,7 +24,7 @@ body:
       label: Name and Version
       description: Which executable and which version of our software are you running? (use `--version` to get a version string)
       placeholder: |
-        $./llama --version
+        $./llama-cli --version
         version: 2999 (42b4109e)
         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
     validations:
diff --git a/.gitignore b/.gitignore
index 12fb39076..529659495 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,7 +47,6 @@ models-mnt
 
 /Pipfile
 /libllama.so
-/llama
 /llama-*
 llama-batched-swift
 /common/build-info.cpp
diff --git a/Makefile b/Makefile
index 852022016..c320b2fa0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,12 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
-	llama \
 	llama-baby \
 	llama-batched \
 	llama-batched-bench \
 	llama-bench \
 	llama-benchmark-matmult \
+	llama-cli \
 	llama-convert-llama2c-to-ggml \
 	llama-embedding \
 	llama-eval-callback \
@@ -17,7 +17,7 @@ BUILD_TARGETS = \
 	llama-gritlm \
 	llama-imatrix \
 	llama-infill \
-	llama-llava \
+	llama-llava-cli \
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -828,11 +828,11 @@ clean:
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
 
-llama: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-cli: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 	@echo
-	@echo '====  Run ./llama -h for help.  ===='
+	@echo '====  Run ./llama-cli -h for help.  ===='
 	@echo
 
 llama-infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
@@ -923,7 +923,7 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
 libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
 
-llama-llava: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
diff --git a/README-sycl.md b/README-sycl.md
index ad750b797..720d2ced9 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -77,7 +77,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 *Notes:*
 
 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
 
   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
 
@@ -313,7 +313,7 @@ Examples:
 - Use device 0:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
 ```
 or run by script:
 
@@ -324,7 +324,7 @@ or run by script:
 - Use multiple devices:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
 
 Otherwise, you can run the script:
diff --git a/README.md b/README.md
index 1f077cd15..54998bf5e 100644
--- a/README.md
+++ b/README.md
@@ -218,7 +218,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 Here is a typical run using LLaMA v2 13B on M2 Ultra:
 
 ```
-$ make -j && ./llama -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
+$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
@@ -585,7 +585,7 @@ Building the program with BLAS support may lead to some performance improvements
   cmake -B build -DLLAMA_VULKAN=1
   cmake --build build --config Release
   # Test the output binary (with "-ngl 33" to offload all layers to GPU)
-  ./bin/llama -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+  ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
 
   # You should see in the output, ggml_vulkan detected your GPU. For example:
   # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@@ -632,7 +632,7 @@ python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
 
 ```bash
 # start inference on a gguf model
-./llama -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
+./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
 ```
 
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -731,7 +731,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh
 
 # custom arguments using a 13B model
-./llama -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -740,7 +740,7 @@ Note the use of `--color` to distinguish between user input and generated text.
 
 ### Persistent Interaction
 
-The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
 
 ```bash
 # Start a new chat
@@ -762,7 +762,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 `llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
 
 ```bash
-./llama -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
 ```
 
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
@@ -869,7 +869,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
 Now, you can start chatting:
 ```
 $cd /data/data/com.termux/files/home/bin
-$./llama -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
 ```
 
 Here's a demo of an interactive session running on Pixel 5 phone:
diff --git a/ci/run.sh b/ci/run.sh
index f3d1ea70d..291c44f47 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -314,17 +314,17 @@ function gg_run_open_llama_7b_v2 {
     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/llama --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -448,17 +448,17 @@ function gg_run_pythia_1_4b {
     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/llama --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -580,17 +580,17 @@ function gg_run_pythia_2_8b {
     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/llama --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md
index c6a201abe..c0840cad5 100644
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with CUDA
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./llama -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
+./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
 
 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -27,7 +27,7 @@ RAM: 32GB
 
 Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
 
-Run command: `./llama -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
 
 Result:
 
diff --git a/examples/Miku.sh b/examples/Miku.sh
index 114ccbb4f..0f6c8c878 100755
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
 fi
 
-./llama "${GEN_OPTIONS[@]}" \
+./llama-cli "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
     --in-prefix " " \
     --in-suffix "${AI_NAME}:" \
diff --git a/examples/alpaca.sh b/examples/alpaca.sh
index b8148dbf2..ed245d2c4 100755
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
+./llama-cli -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
        --color \
        -f ./prompts/alpaca.txt \
        --ctx_size 2048 \
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
index c7a3062d5..71b82105c 100644
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET llama-baby)
+set(TARGET llama-baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/base-translate.sh b/examples/base-translate.sh
index 971b8ebc2..103a52f55 100755
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@@ -58,4 +58,4 @@ echo "$2
 model=$1
 
 # generate the most likely continuation until the string "===" is found
-./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh
index c27763a36..1828903c3 100755
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
      $PROMPT_TEMPLATE > $PROMPT_FILE
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./llama $GEN_OPTIONS \
+./llama-cli $GEN_OPTIONS \
   --model "$MODEL" \
   --threads "$N_THREAD" \
   --n_predict "$N_PREDICTS" \
diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh
index 151da6625..d9cab9836 100755
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -62,7 +62,7 @@ fi
 if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
     echo 'Prompt cache does not exist, building...'
     # Default batch_size to 64 here for better user feedback during initial prompt processing
-    ./llama 2>>"$LOG" \
+    ./llama-cli 2>>"$LOG" \
         --batch_size 64 \
         "${OPTS[@]}" \
         --prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
 
     printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
 
-    ./llama 2>>"$LOG" "${OPTS[@]}" \
+    ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
             --prompt-cache "$CUR_PROMPT_CACHE" \
             --prompt-cache-all \
             --file "$CUR_PROMPT_FILE" \
             --reverse-prompt "${USER_NAME}:" \
             --n_predict "$n_predict" |
-        skip_bytes 1 |                  # skip BOS token added by ./llama
+        skip_bytes 1 |                  # skip BOS token added by ./llama-cli
         tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
         skip_bytes "$n_prompt_len_pre"  # print generation
 
@@ -133,7 +133,7 @@ while read -e line; do
     # TODO get both messages in one go
     if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
         ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-        echo >&2 "Couldn't get number of tokens from ./llama output!"
+        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
         exit 1
     fi
 
@@ -144,7 +144,7 @@ while read -e line; do
     fi
 
     # Update cache for next prompt in background, ideally during user input
-    ./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+    ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
           --prompt-cache "$NEXT_PROMPT_CACHE" \
           --file "$NEXT_PROMPT_FILE" \
           --n_predict 1 &
diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh
index c243c1adc..ffdd20084 100755
--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
      $PROMPT_TEMPLATE > $PROMPT_FILE
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/llama $GEN_OPTIONS \
+./bin/llama-cli $GEN_OPTIONS \
   --model "$MODEL" \
   --threads "$N_THREAD" \
   --n_predict "$N_PREDICTS" \
diff --git a/examples/chat.sh b/examples/chat.sh
index 70db33c79..9f85d1e26 100755
--- a/examples/chat.sh
+++ b/examples/chat.sh
@@ -11,6 +11,6 @@ cd ..
 #
 #   "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
     --repeat_penalty 1.0 --color -i \
     -r "User:" -f prompts/chat-with-bob.txt
diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md
index 1dc67a23c..5774ac83c 100644
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b
 
 Now you can use the model with a command like:
 
-`$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
diff --git a/examples/finetune/README.md b/examples/finetune/README.md
index 814430cfb..ea72f5d95 100644
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
         --use-checkpointing
 
 # predict
-./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
 ```
 
 **Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@@ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog
 For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
 
 ```bash
-./bin/llama -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
   --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
   --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
 ```
@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
 For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
 
 ```bash
-./bin/llama -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
   --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
   --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
   --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
index e7006a989..8d33f8531 100755
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -19,7 +19,7 @@ fi
 set -x
 
 SPLIT=$1/gguf-split
-MAIN=$1/llama
+MAIN=$1/llama-cli
 WORK_PATH=$TMP_DIR/gguf-split
 ROOT_DIR=$(realpath $(dirname $0)/../../)
 
diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh
index 7194a4728..8232d6777 100755
--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./llama --color --instruct --threads 4 \
+./llama-cli --color --instruct --threads 4 \
        --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
        --file ./prompts/alpaca.txt \
        --batch_size 8 --ctx_size 2048 -n -1 \
diff --git a/examples/jeopardy/jeopardy.sh b/examples/jeopardy/jeopardy.sh
index 502b0cac8..07bcb3b8d 100755
--- a/examples/jeopardy/jeopardy.sh
+++ b/examples/jeopardy/jeopardy.sh
@@ -21,7 +21,7 @@ counter=1
 echo 'Running'
 while IFS= read -r question
 do
-  exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+  exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
   echo $counter
   echo "Current Question: $question"
   eval "$exe_cmd"
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index f741c391e..c8e108df0 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -524,7 +524,7 @@ class SchemaConverter:
 def main(args_in = None):
     parser = argparse.ArgumentParser(
         description='''
-            Generates a grammar (suitable for use in ./llama) that produces JSON conforming to a
+            Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
             given JSON schema. Only a subset of JSON schema features are supported; more may be
             added in the future.
         ''',
diff --git a/examples/llama2-13b.sh b/examples/llama2-13b.sh
index cc76b5672..37bcda3d9 100755
--- a/examples/llama2-13b.sh
+++ b/examples/llama2-13b.sh
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
+./llama-cli -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
        --color \
        --ctx_size 2048 \
        -n -1 \
diff --git a/examples/llama2.sh b/examples/llama2.sh
index 4ed7f0932..5977efa22 100755
--- a/examples/llama2.sh
+++ b/examples/llama2.sh
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
+./llama-cli -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
        --color \
        --ctx_size 2048 \
        -n -1 \
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index d807b2934..e9fa73acb 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -30,9 +30,9 @@ if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
 endif()
 
-set(TARGET llama-llava)
+set(TARGET llama-llava-cli)
 add_executable(${TARGET} llava-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index a35d6fd2b..05a8207e6 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
 Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
 
 ## Usage
-Build with cmake or run `make llama-llava` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llama-llava` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llama-llava -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
     --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
     --image path/to/an/image.jpg \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
 ### case 1
 **input**
 ```sh
-/data/local/tmp/llama-llava \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \
@@ -102,7 +102,7 @@ llama_print_timings:       total time =   34731.93 ms
 ### case 2
 **input**
 ```sh
-/data/local/tmp/llama-llava \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \
@@ -126,7 +126,7 @@ llama_print_timings:       total time =   34570.79 ms
 #### llava-cli release-b2005
 **input**
 ```sh
-/data/local/tmp/llama-llava \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \
@@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ### case 1
 **input**
 ```sh
-./llama-llava \
+./llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     --image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings:       total time =    1352.63 ms /   252 tokens
 ### case 2
 **input**
 ```sh
-./llama-llava \
+./llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
diff --git a/examples/llava/README.md b/examples/llava/README.md
index f808297f1..f4554de67 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
-Build with cmake or run `make llama-llava` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llama-llava` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llama-llava -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -97,7 +97,7 @@ python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow
 
 7) And finally we can run the llava cli using the 1.6 model version:
 ```console
-./llama-llava -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
 
 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh
index aca375d07..45ccf8d70 100755
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
@@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
 # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 
 program_dir="build_64/bin"
-binName="llama-llava"
+binName="llama-llava-cli"
 n_threads=4
 
 
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index cdc6373c9..5f6efaa9a 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,6 +1,5 @@
 set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/main/README.md b/examples/main/README.md
index 96423ea4e..d3582cad0 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -20,7 +20,7 @@ To get started right away, run the following command, making sure to use the cor
 #### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"
+./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```
 
 #### Windows:
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
 #### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
+./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
 'User: Hi
 AI: Hello. I am an AI chatbot. Would you like to talk?
 User: Sure!
@@ -53,7 +53,7 @@ The following command generates "infinite" text from a starting prompt (you can
 #### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./llama -m models/7B/ggml-model.bin --ignore-eos -n -1
+./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1
 ```
 
 #### Windows:
@@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
 The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
 
 ```sh
-./llama -r "User:" --in-prefix " "
+./llama-cli -r "User:" --in-prefix " "
 ```
 
 ### In-Suffix
@@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
 The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
 
 ```sh
-./llama -r "User:" --in-prefix " " --in-suffix "Assistant:"
+./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
 ```
 
 ## Context Management
diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh
index 5d2778c05..428d9eda6 100644
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
@@ -20,7 +20,7 @@ set -x
 
 SPLIT=$1/gguf-split
 QUANTIZE=$1/quantize
-MAIN=$1/llama
+MAIN=$1/llama-cli
 WORK_PATH=$TMP_DIR/quantize
 ROOT_DIR=$(realpath $(dirname $0)/../../)
 
diff --git a/examples/reason-act.sh b/examples/reason-act.sh
index e84993bdd..06d592799 100755
--- a/examples/reason-act.sh
+++ b/examples/reason-act.sh
@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
   MODEL="-m $2 "
 fi
 
-./llama $MODEL --color \
+./llama-cli $MODEL --color \
     -f ./prompts/reason-act.txt \
     -i --interactive-first \
     --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
diff --git a/examples/rpc/README.md b/examples/rpc/README.md
index 66343ea6b..86544e3fe 100644
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -70,5 +70,5 @@ cmake --build . --config Release
 Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
 
 ```bash
-$ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
 ```
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index 33929197f..da0e4aaba 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -23,15 +23,15 @@ fi
 if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
     echo "use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
 else
     #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
 fi
 
 #use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
 
 #use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
 
diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md
index d40f53d2e..3abae2380 100644
--- a/examples/train-text-from-scratch/README.md
+++ b/examples/train-text-from-scratch/README.md
@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
         --no-checkpointing
 
 # predict
-./bin/llama -m ggml-shakespeare-256x16-f32.gguf
+./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
 ```
 
 Output files will be saved every N iterations (config with `--save-every N`).
diff --git a/grammars/README.md b/grammars/README.md
index 1675f6e7b..81c900857 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -91,7 +91,7 @@ item ::= [^\n]+ "\n"
 
 This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
 ```
-./llama -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
+./llama-cli -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
 ```
 
 ## Troubleshooting
diff --git a/scripts/hf.sh b/scripts/hf.sh
index 6a614cf54..85c2c4d9a 100755
--- a/scripts/hf.sh
+++ b/scripts/hf.sh
@@ -3,9 +3,9 @@
 # Shortcut for downloading HF models
 #
 # Usage:
-#   ./llama -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./llama -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./llama -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
 #
 
 # all logs go to stderr