make : deprecate (#10514)

* make : deprecate

ggml-ci

* ci : disable Makefile builds

ggml-ci

* docs : remove make references [no ci]

* ci : disable swift build

ggml-ci

* docs : remove obsolete make references, scripts, examples

ggml-ci

* basic fix for compare-commits.sh

* update build.md

* more build.md updates

* more build.md updates

* more build.md updates

* Update Makefile

Co-authored-by: Diego Devesa <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Georgi Gerganov 2024-12-02 21:22:53 +02:00 committed by GitHub
parent 64ed2091b2
commit 8648c52101
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 139 additions and 1011 deletions


@ -160,66 +160,6 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-focal-make:
runs-on: ubuntu-20.04
env:
LLAMA_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8
- uses: actions/setup-node@v4
with:
node-version: "20"
- uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
run: |
CC=gcc-8 make -j $(nproc)
- name: Test
id: make_test
run: |
CC=gcc-8 make tests -j $(nproc)
make test -j $(nproc)
ubuntu-focal-make-curl:
runs-on: ubuntu-20.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
LLAMA_CURL: 1
run: |
CC=gcc-8 make -j $(nproc)
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@ -517,36 +457,6 @@ jobs:
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
run: |
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
- name: Test
id: make_test
run: |
GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
@ -642,33 +552,35 @@ jobs:
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

# TODO: tmp disabled. see for possible re-enable:
# https://github.com/ggerganov/llama.cpp/pull/10525
# macOS-latest-swift:
# runs-on: macos-latest
#
# strategy:
# matrix:
# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Dependencies
# id: depends
# continue-on-error: true
# run: |
# brew update
#
# - name: xcodebuild for swift package
# id: xcodebuild
# run: |
# xcodebuild -scheme llama -destination "${{ matrix.destination }}"
#
# - name: Build Swift Example
# id: make_build_swift_example
# run: |
# make swift
windows-msys2:
runs-on: windows-latest
@ -695,21 +607,6 @@ jobs:
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make GGML_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
run: |
@ -1257,9 +1154,7 @@ jobs:
runs-on: ubuntu-latest
needs:
- ubuntu-focal-make
- ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-2019-cmake-cuda


@ -1,3 +1,7 @@
ifndef LLAMA_MAKEFILE
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
endif
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
libllava.a \
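The guard added above means that a plain `make` now aborts immediately with the deprecation message, while explicitly defining `LLAMA_MAKEFILE` on the command line still lets the legacy targets run. A minimal sketch of both behaviours (illustrative only; the CMake build is the supported path):

```bash
# Fails with: "The Makefile build is deprecated. Use the CMake build instead. ..."
make

# Explicitly opt back in to the deprecated Makefile build (not recommended)
make LLAMA_MAKEFILE=1 -j $(nproc)
```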


@ -27,13 +27,6 @@ We recommend using openmp since it's easier to modify the cores being used.
### llama.cpp compilation
Makefile:
```bash
make GGML_BLIS=1 -j
# make GGML_BLIS=1 llama-benchmark-matmult
```
CMake:
```bash
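# The rest of this code block is cut off by the diff view. A minimal sketch of the
# CMake-based BLIS build (assumes GGML_BLAS_VENDOR=FLAME selects BLIS, as in the upstream docs):
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME
cmake --build build --config Release
```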


@ -7,124 +7,63 @@ git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

In order to build llama.cpp you have four different options.
- Using `make`:
- On Linux or MacOS:
```bash
make
```
- On Windows (x86/x64 only, arm64 requires cmake):
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Notes:
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
**Notes**:
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- Tab Workload: Desktop-development with C++
- Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
- For Windows on ARM (arm64, WoA) build with:
```bash
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line argument.

The following sections describe how to build with different backends and options.
## CPU Build
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
**Notes**:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/)
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- Tab Workload: Desktop-development with C++
- Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
- For Windows on ARM (arm64, WoA) build with:
```bash
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.

## BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:
### Accelerate Framework
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
### OpenBLAS
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make GGML_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
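# Continuation not shown in this hunk. A typical OpenBLAS CMake invocation
# (sketch; option names assumed from the surrounding docs):
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
```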
@ -136,14 +75,6 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
Check [BLIS.md](./backend/BLIS.md) for more information.
### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
@ -161,16 +92,29 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
### CUDA
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
- Using `make`:
```bash
make GGML_CUDA=1
```

### Other BLAS libraries
Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors.
## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument.
## SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
## CUDA
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).

- Using `CMake`:
```bash
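# Continuation not shown in this hunk. A typical CUDA CMake build (sketch):
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```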
@ -192,14 +136,10 @@ The following compilation options are also available to tweak performance:
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |

## MUSA
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
- Using `make`:
```bash
make GGML_MUSA=1
```
- Using `CMake`:
```bash
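# Continuation not shown in this hunk. A typical MUSA CMake build (sketch):
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
```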
@ -213,16 +153,12 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.

## HIP
This provides GPU acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
make GGML_HIP=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
@ -247,11 +183,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
@ -265,11 +196,11 @@ You can download it from your Linux distro's package manager or from here: [ROCm
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.

## Vulkan

**Windows**

### w64devkit

Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
@ -289,9 +220,14 @@ Libs: -lvulkan-1
EOF
```
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
Switch into the `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```

### Git Bash MINGW64

Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
@ -310,20 +246,21 @@ cmake --build build --config Release
Now you can load the model in conversation mode using `Vulkan`

```sh
build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
```

### MSYS2

Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh
pacman -S git \
mingw-w64-ucrt-x86_64-gcc \
mingw-w64-ucrt-x86_64-cmake \
mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc
```

Switch into the `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
@ -372,7 +309,7 @@ cmake --build build --config Release
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```

## CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
@ -387,22 +324,26 @@ cmake --build build --config release
You can test with:

```bash
./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32
```

If the following info is output on screen, you are using `llama.cpp` with the CANN backend:

```bash
llm_load_tensors: CANN model buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
```

For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
## Android

To read documentation for how to build on Android, [click here](./android.md)

### Arm CPU optimized mulmat kernels

Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).

## Notes about GPU-accelerated backends

The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option.
Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building.
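As an illustration of the multi-backend notes above, a hypothetical CUDA+Vulkan build and device selection could look like the following (the device name and model path are assumptions, not taken from this diff):

```bash
# Build one binary with both CUDA and Vulkan backends, loadable as dynamic libraries
cmake -B build -DGGML_CUDA=ON -DGGML_VULKAN=ON -DGGML_BACKEND_DL=ON
cmake --build build --config Release

# List the devices this binary can see, then pin inference to one of them
./build/bin/llama-cli --list-devices
./build/bin/llama-cli -m model.gguf -ngl 99 --device CUDA0
```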


@ -1,61 +0,0 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
#
if [ $# -lt 2 ]; then
echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
exit 1
fi
eargs=""
if [ $# -gt 2 ]; then
eargs="${@:3}"
fi
ftmp="__llama.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT
echo "Translate from English to French:
===
sea otter, peppermint, plush girafe:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
===
violin
violin => violon
===
phone, computer, mouse, keyboard:
phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier
===
" > $ftmp
echo "$2
" >> $ftmp
model=$1
# generate the most likely continuation until the string "===" is found
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs


@ -2,11 +2,8 @@
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository.
`$ make -j`
After successful compilation, following usage options are available:
```
usage: ./llama-convert-llama2c-to-ggml [options]


@ -25,8 +25,6 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
## Example

```bash
GGML_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
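# The hunk is truncated here. A typical follow-up step (sketch; file names assumed)
# is to use the importance matrix when quantizing:
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ggml-model-q4_k_m.gguf q4_k_m
```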


@ -188,12 +188,6 @@ services:
`llama-server` is built alongside everything else from the root of the project
- Using `make`:
```bash
make llama-server
```
- Using `CMake`:
```bash
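# Continuation not shown in this hunk. A typical CMake build of the server target (sketch):
cmake -B build
cmake --build build --config Release -t llama-server
```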
@ -207,15 +201,6 @@ services:
`llama-server` can also be built with SSL support using OpenSSL 3
- Using `make`:
```bash
# NOTE: For non-system openssl, use the following:
# CXXFLAGS="-I /path/to/openssl/include"
# LDFLAGS="-L /path/to/openssl/lib"
make LLAMA_SERVER_SSL=true llama-server
```
- Using `CMake`:
```bash
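# Continuation not shown in this hunk. A sketch of the SSL-enabled CMake build,
# assuming LLAMA_SERVER_SSL is the corresponding CMake option:
cmake -B build -DLLAMA_SERVER_SSL=ON
cmake --build build --config Release -t llama-server
```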


@ -16,15 +16,21 @@ bench_args="${@:3}"
rm -f llama-bench.sqlite > /dev/null
# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
if [ -n "$GGML_CUDA" ]; then
cmake_opts="-DGGML_CUDA=ON"
fi
function run {
rm -fr build > /dev/null
cmake -B build -S . $cmake_opts > /dev/null
cmake --build build -t llama-bench > /dev/null
build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
}
git checkout $1 > /dev/null
make clean > /dev/null
make -j$(nproc) $make_opts llama-bench > /dev/null
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
run

git checkout $2 > /dev/null
make clean > /dev/null
make -j$(nproc) $make_opts llama-bench > /dev/null
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
run

./scripts/compare-llama-bench.py -b $1 -c $2
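For context, an invocation of the updated script might look like the following; the branch names and model path are placeholders, and the `GGML_CUDA=1` prefix mirrors the comment at the top of the script:

```bash
# Compare llama-bench results between two commits, building each with the CUDA backend via CMake
GGML_CUDA=1 ./scripts/compare-commits.sh master my-feature-branch -m models/7B/ggml-model-q4_0.gguf
```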


@ -1,212 +0,0 @@
#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#
if [ -z "$1" ]; then
echo "Usage: $0 <data>"
echo " 0: no models"
echo " 1: tinyllama-1b"
echo " 2: codellama-7b"
echo " 3: codellama-13b"
echo " 4: codellama-34b"
echo " 5: codellama-7b-instruct"
echo " 6: codellama-13b-instruct"
echo " 7: codellama-34b-instruct"
exit 1
fi
set -x
# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
git-lfs install
if [ ! -d "/workspace" ]; then
ln -sfn $(pwd) /workspace
fi
# download data
cd /workspace
# this is useful to git clone repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
GGML_CUDA=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
pip install -r requirements.txt
# cmake
cd /workspace/llama.cpp
mkdir build-cublas
cd build-cublas
cmake -DGGML_CUDA=1 ../
make -j
if [ "$1" -eq "0" ]; then
exit 0
fi
# more models
if [ "$1" -eq "1" ]; then
cd /workspace
git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "2" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors*
rm -v ./CodeLlama-7b-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "3" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
rm -v ./CodeLlama-13b-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "4" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
rm -v ./CodeLlama-34b-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "5" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "6" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "7" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "1" ]; then
# perf + perplexity
cd /workspace/llama.cpp/build-cublas
make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"
../scripts/get-wikitext-2.sh
unzip wikitext-2-raw-v1.zip
make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
# batched
cd /workspace/llama.cpp
GGML_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench
cd /workspace/llama.cpp
GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel
cd /workspace/llama.cpp
GGML_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi
# speculative
#if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp
#
# GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
# more benches
#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1


@ -1,418 +0,0 @@
#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]
#
# --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
# --backend: cpu, cuda, metal, depends on the OS
# --gpu-id: gpu id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096
# --verbose: verbose output
# --non-interactive: run without asking a permission to run
#
# Example:
#
# bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
set -e
# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
printf "[-] curl not found\n"
exit 1
fi
if ! command -v git &> /dev/null; then
printf "[-] git not found\n"
exit 1
fi
if ! command -v make &> /dev/null; then
printf "[-] make not found\n"
exit 1
fi
# parse arguments
is_interactive=1
port=8888
repo=""
wtype=""
backend="cpu"
# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
function print_usage {
printf "Usage:\n"
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n"
printf " --port: port number, default is 8888\n"
printf " --repo: path to a repo containing GGUF model files\n"
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
printf " --backend: cpu, cuda, metal, depends on the OS\n"
printf " --gpu-id: gpu id, default is 0\n"
printf " --n-parallel: number of parallel requests, default is 8\n"
printf " --n-kv: KV cache size, default is 4096\n"
printf " --verbose: verbose output\n\n"
printf " --non-interactive: run without asking a permission to run\n"
printf "Example:\n\n"
printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--non-interactive)
is_interactive=0
shift
;;
--port)
port="$2"
shift
shift
;;
--repo)
repo="$2"
shift
shift
;;
--wtype)
wtype="$2"
shift
shift
;;
--backend)
backend="$2"
shift
shift
;;
--gpu-id)
gpu_id="$2"
shift
shift
;;
--n-parallel)
n_parallel="$2"
shift
shift
;;
--n-kv)
n_kv="$2"
shift
shift
;;
--verbose)
verbose=1
shift
;;
--help)
print_usage
exit 0
;;
*)
echo "Unknown argument: $key"
print_usage
exit 1
;;
esac
done
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
wfiles=()
for wt in "${wtypes[@]}"; do
wfiles+=("")
done
# map wtype input to index
if [[ ! -z "$wtype" ]]; then
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]')
if [[ "$uwt" == "$wtype" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
exit 1
fi
wtype="$iw"
fi
# sample repos
repos=(
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
"https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
"https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
"https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
if [ $is_interactive -eq 1 ]; then
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
printf " model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf " Please note:\n"
printf "\n"
printf " - All new data will be stored in the current folder\n"
printf " - The server will be listening on all network interfaces\n"
printf " - The server will run with default settings which are not always optimal\n"
printf " - Do not judge the quality of a model based on the results from this script\n"
printf " - Do not use this script to benchmark llama.cpp\n"
printf " - Do not use this script in production\n"
printf " - This script is only for demonstration purposes\n"
printf "\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf " Press Enter to continue ...\n\n"
read
fi
if [[ -z "$repo" ]]; then
printf "[+] No repo provided from the command line\n"
printf " Please select a number from the list below or enter an URL:\n\n"
is=0
for r in "${repos[@]}"; do
printf " %2d) %s\n" $is "$r"
is=$((is+1))
done
# ask for repo until index of sample repo is provided or an URL
while [[ -z "$repo" ]]; do
printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
read -p "[+] Select repo: " repo
# check if the input is a number
if [[ "$repo" =~ ^[0-9]+$ ]]; then
if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
repo="${repos[$repo]}"
else
printf "[-] Invalid repo index: %s\n" "$repo"
repo=""
fi
elif [[ "$repo" =~ ^https?:// ]]; then
repo="$repo"
else
printf "[-] Invalid repo URL: %s\n" "$repo"
repo=""
fi
done
fi
# remove suffix
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
printf "[+] Checking for GGUF model files in %s\n" "$repo"
# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
# list all files in the provided git repo
printf "[+] Model files:\n\n"
for file in $model_files; do
# determine iw by grepping the filename with wtypes
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
if [[ "$ufile" =~ "$wt" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
continue
fi
wfiles[$iw]="$file"
have=" "
if [[ -f "$file" ]]; then
have="*"
fi
printf " %2d) %s %s\n" $iw "$have" "$file"
done
wfile="${wfiles[$wtype]}"
# ask for weights type until provided and available
while [[ -z "$wfile" ]]; do
printf "\n"
read -p "[+] Select weight type: " wtype
wfile="${wfiles[$wtype]}"
if [[ -z "$wfile" ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
wtype=""
fi
done
printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"
# check file if the model has been downloaded before
chk="$wfile.chk"
# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0
if [[ ! -f "$wfile" ]]; then
do_download=1
elif [[ ! -f "$chk" ]]; then
do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
do_download=1
fi
if [[ $do_download -eq 1 ]]; then
printf "[+] Downloading weights from %s\n" "$url"
# download the weights file
curl -o "$wfile" -# -L "$url"
# create a check file if successful
if [[ $? -eq 0 ]]; then
printf "[+] Creating check file %s\n" "$chk"
touch "$chk"
fi
else
printf "[+] Using cached weights %s\n" "$wfile"
fi
# get latest llama.cpp and build
printf "[+] Downloading latest llama.cpp\n"
llama_cpp_dir="__llama_cpp_port_${port}__"
if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
# if the dir exists and there isn't a file "__ggml_script__" in it, abort
printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
printf "[-] Please remove it and try again\n"
exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
printf "[+] Using cached llama.cpp\n"
cd "$llama_cpp_dir"
git reset --hard
git fetch
git checkout origin/master
cd ..
else
printf "[+] Cloning llama.cpp\n"
git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi
# mark that that the directory is made by this script
touch "$llama_cpp_dir/__ggml_script__"
if [[ $verbose -eq 1 ]]; then
set -x
fi
# build
cd "$llama_cpp_dir"
make clean
log="--silent"
if [[ $verbose -eq 1 ]]; then
log=""
fi
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
GGML_CUDA=1 make -j llama-server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j llama-server $log
elif [[ "$backend" == "metal" ]]; then
printf "[+] Building with Metal backend\n"
make -j llama-server $log
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
# run the server
printf "[+] Running server\n"
args=""
if [[ "$backend" == "cuda" ]]; then
export CUDA_VISIBLE_DEVICES=$gpu_id
args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
args="-ngl 999"
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
if [[ $verbose -eq 1 ]]; then
args="$args --verbose"
fi
./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
exit 0