mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 13:58:46 +01:00
docker : add build for SYCL, Vulkan + update readme (#5228)
* add vulkan dockerfile * intel dockerfile: compile sycl by default * fix vulkan dockerfile * add docs for vulkan * docs: sycl build in docker * docs: remove trailing spaces * docs: sycl: add docker section * docs: clarify install vulkan SDK outside docker * sycl: use intel/oneapi-basekit docker image * docs: correct TOC * docs: correct docker image for Intel oneMKL
This commit is contained in:
parent
e805f0fa99
commit
6b91b1e0a9
@ -1,8 +1,8 @@
|
|||||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||||
|
|
||||||
|
ARG LLAMA_SYCL_F16=OFF
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y git
|
apt-get install -y git
|
||||||
|
|
||||||
@ -10,16 +10,18 @@ WORKDIR /app
|
|||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
|
||||||
RUN mkdir build && \
|
RUN mkdir build && \
|
||||||
cd build && \
|
cd build && \
|
||||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||||
cmake --build . --config Release --target main server
|
echo "LLAMA_SYCL_F16 is set" && \
|
||||||
|
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||||
|
fi && \
|
||||||
|
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||||
|
cmake --build . --config Release --target main
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/main /main
|
COPY --from=build /app/build/bin/main /main
|
||||||
COPY --from=build /app/build/bin/server /server
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
29
.devops/main-vulkan.Dockerfile
Normal file
29
.devops/main-vulkan.Dockerfile
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
ARG UBUNTU_VERSION=jammy
|
||||||
|
|
||||||
|
FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
|
|
||||||
|
# Install build tools
|
||||||
|
RUN apt update && apt install -y git build-essential cmake wget
|
||||||
|
|
||||||
|
# Install Vulkan SDK
|
||||||
|
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||||
|
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||||
|
apt update -y && \
|
||||||
|
apt-get install -y vulkan-sdk
|
||||||
|
|
||||||
|
# Build it
|
||||||
|
WORKDIR /app
|
||||||
|
COPY . .
|
||||||
|
RUN mkdir build && \
|
||||||
|
cd build && \
|
||||||
|
cmake .. -DLLAMA_VULKAN=1 && \
|
||||||
|
cmake --build . --config Release --target main
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
WORKDIR /
|
||||||
|
RUN cp /app/build/bin/main /main && \
|
||||||
|
rm -rf /app
|
||||||
|
|
||||||
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/main" ]
|
@ -1,8 +1,8 @@
|
|||||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||||
|
|
||||||
|
ARG LLAMA_SYCL_F16=OFF
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y git
|
apt-get install -y git
|
||||||
|
|
||||||
@ -10,13 +10,16 @@ WORKDIR /app
|
|||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
|
||||||
RUN mkdir build && \
|
RUN mkdir build && \
|
||||||
cd build && \
|
cd build && \
|
||||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||||
cmake --build . --config Release --target main server
|
echo "LLAMA_SYCL_F16 is set" && \
|
||||||
|
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||||
|
fi && \
|
||||||
|
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||||
|
cmake --build . --config Release --target server
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/server /server
|
COPY --from=build /app/build/bin/server /server
|
||||||
|
|
||||||
|
29
.devops/server-vulkan.Dockerfile
Normal file
29
.devops/server-vulkan.Dockerfile
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
ARG UBUNTU_VERSION=jammy
|
||||||
|
|
||||||
|
FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
|
|
||||||
|
# Install build tools
|
||||||
|
RUN apt update && apt install -y git build-essential cmake wget
|
||||||
|
|
||||||
|
# Install Vulkan SDK
|
||||||
|
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||||
|
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||||
|
apt update -y && \
|
||||||
|
apt-get install -y vulkan-sdk
|
||||||
|
|
||||||
|
# Build it
|
||||||
|
WORKDIR /app
|
||||||
|
COPY . .
|
||||||
|
RUN mkdir build && \
|
||||||
|
cd build && \
|
||||||
|
cmake .. -DLLAMA_VULKAN=1 && \
|
||||||
|
cmake --build . --config Release --target server
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
WORKDIR /
|
||||||
|
RUN cp /app/build/bin/server /server && \
|
||||||
|
rm -rf /app
|
||||||
|
|
||||||
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/server" ]
|
102
README-sycl.md
102
README-sycl.md
@ -1,22 +1,15 @@
|
|||||||
# llama.cpp for SYCL
|
# llama.cpp for SYCL
|
||||||
|
|
||||||
[Background](#background)
|
- [Background](#background)
|
||||||
|
- [OS](#os)
|
||||||
[OS](#os)
|
- [Intel GPU](#intel-gpu)
|
||||||
|
- [Docker](#docker)
|
||||||
[Intel GPU](#intel-gpu)
|
- [Linux](#linux)
|
||||||
|
- [Windows](#windows)
|
||||||
[Linux](#linux)
|
- [Environment Variable](#environment-variable)
|
||||||
|
- [Known Issue](#known-issue)
|
||||||
[Windows](#windows)
|
- [Q&A](#qa)
|
||||||
|
- [Todo](#todo)
|
||||||
[Environment Variable](#environment-variable)
|
|
||||||
|
|
||||||
[Known Issue](#known-issue)
|
|
||||||
|
|
||||||
[Q&A](#q&a)
|
|
||||||
|
|
||||||
[Todo](#todo)
|
|
||||||
|
|
||||||
## Background
|
## Background
|
||||||
|
|
||||||
@ -36,7 +29,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
|||||||
|
|
||||||
|OS|Status|Verified|
|
|OS|Status|Verified|
|
||||||
|-|-|-|
|
|-|-|-|
|
||||||
|Linux|Support|Ubuntu 22.04|
|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|
||||||
|Windows|Support|Windows 11|
|
|Windows|Support|Windows 11|
|
||||||
|
|
||||||
|
|
||||||
@ -50,7 +43,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
|||||||
|Intel Data Center Flex Series| Support| Flex 170|
|
|Intel Data Center Flex Series| Support| Flex 170|
|
||||||
|Intel Arc Series| Support| Arc 770, 730M|
|
|Intel Arc Series| Support| Arc 770, 730M|
|
||||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
|
||||||
|
|
||||||
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
|
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
|
||||||
|
|
||||||
@ -64,6 +57,38 @@ For iGPU, please make sure the shared memory from host memory is enough. For lla
|
|||||||
|
|
||||||
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
|
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
|
||||||
|
|
||||||
|
## Docker
|
||||||
|
|
||||||
|
Note:
|
||||||
|
- Only docker on Linux is tested. Docker on WSL may not work.
|
||||||
|
- You may need to install the Intel GPU driver on the host machine (see the [Linux](#linux) section for instructions).
|
||||||
|
|
||||||
|
### Build the image
|
||||||
|
|
||||||
|
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
|
||||||
|
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# For F16:
|
||||||
|
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
|
||||||
|
|
||||||
|
# Or, for F32:
|
||||||
|
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
|
||||||
|
|
||||||
|
# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Firstly, find all the DRI cards:
|
||||||
|
ls -la /dev/dri
|
||||||
|
# Then, pick the card that you want to use.
|
||||||
|
|
||||||
|
# For example with "/dev/dri/card1"
|
||||||
|
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
|
```
|
||||||
|
|
||||||
## Linux
|
## Linux
|
||||||
|
|
||||||
### Setup Environment
|
### Setup Environment
|
||||||
@ -76,7 +101,7 @@ Note: for iGPU, please install the client GPU driver.
|
|||||||
|
|
||||||
b. Add user to group: video, render.
|
b. Add user to group: video, render.
|
||||||
|
|
||||||
```
|
```sh
|
||||||
sudo usermod -aG render username
|
sudo usermod -aG render username
|
||||||
sudo usermod -aG video username
|
sudo usermod -aG video username
|
||||||
```
|
```
|
||||||
@ -85,7 +110,7 @@ Note: re-login to enable it.
|
|||||||
|
|
||||||
c. Check
|
c. Check
|
||||||
|
|
||||||
```
|
```sh
|
||||||
sudo apt install clinfo
|
sudo apt install clinfo
|
||||||
sudo clinfo -l
|
sudo clinfo -l
|
||||||
```
|
```
|
||||||
@ -103,7 +128,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
|
|||||||
|
|
||||||
2. Install Intel® oneAPI Base toolkit.
|
2. Install Intel® oneAPI Base toolkit.
|
||||||
|
|
||||||
|
|
||||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||||
|
|
||||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||||
@ -112,7 +136,7 @@ Following guide use the default folder as example. If you use other folder, plea
|
|||||||
|
|
||||||
b. Check
|
b. Check
|
||||||
|
|
||||||
```
|
```sh
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
sycl-ls
|
sycl-ls
|
||||||
@ -131,21 +155,25 @@ Output (example):
|
|||||||
|
|
||||||
2. Build locally:
|
2. Build locally:
|
||||||
|
|
||||||
```
|
Note:
|
||||||
|
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
|
||||||
|
- By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.
|
||||||
|
|
||||||
|
```sh
|
||||||
mkdir -p build
|
mkdir -p build
|
||||||
cd build
|
cd build
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
#for FP16
|
# For FP16:
|
||||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
#for FP32
|
# Or, for FP32:
|
||||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
#build example/main only
|
# Build example/main only
|
||||||
#cmake --build . --config Release --target main
|
#cmake --build . --config Release --target main
|
||||||
|
|
||||||
#build all binary
|
# Or, build all binary
|
||||||
cmake --build . --config Release -v
|
cmake --build . --config Release -v
|
||||||
|
|
||||||
cd ..
|
cd ..
|
||||||
@ -153,14 +181,10 @@ cd ..
|
|||||||
|
|
||||||
or
|
or
|
||||||
|
|
||||||
```
|
```sh
|
||||||
./examples/sycl/build.sh
|
./examples/sycl/build.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
Note:
|
|
||||||
|
|
||||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
|
||||||
|
|
||||||
### Run
|
### Run
|
||||||
|
|
||||||
1. Put model file to folder **models**
|
1. Put model file to folder **models**
|
||||||
@ -177,10 +201,10 @@ source /opt/intel/oneapi/setvars.sh
|
|||||||
|
|
||||||
Run without parameter:
|
Run without parameter:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
./build/bin/ls-sycl-device
|
./build/bin/ls-sycl-device
|
||||||
|
|
||||||
or
|
# or run the "main" executable and look at the output log:
|
||||||
|
|
||||||
./build/bin/main
|
./build/bin/main
|
||||||
```
|
```
|
||||||
@ -209,13 +233,13 @@ found 4 SYCL devices:
|
|||||||
|
|
||||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||||
|
|
||||||
```
|
```sh
|
||||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
```
|
```
|
||||||
or run by script:
|
or run by script:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
./examples/sycl/run-llama2.sh
|
./examples/sycl/run_llama2.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
|
64
README.md
64
README.md
@ -393,28 +393,28 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
|
|
||||||
Check [BLIS.md](docs/BLIS.md) for more information.
|
Check [BLIS.md](docs/BLIS.md) for more information.
|
||||||
|
|
||||||
|
- #### SYCL
|
||||||
|
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||||
|
|
||||||
|
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||||
|
|
||||||
|
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
|
||||||
|
|
||||||
- #### Intel oneMKL
|
- #### Intel oneMKL
|
||||||
|
Building through oneAPI compilers will make the avx_vnni instruction set available for Intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
|
||||||
|
|
||||||
- Using manual oneAPI installation:
|
- Using manual oneAPI installation:
|
||||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically be selected. Otherwise please install oneAPI and follow the below steps:
|
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically be selected. Otherwise please install oneAPI and follow the below steps:
|
||||||
```bash
|
```bash
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
|
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
|
||||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using oneAPI docker image:
|
- Using oneAPI docker image:
|
||||||
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
|
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using the Intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
|
||||||
|
|
||||||
```bash
|
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
|
|
||||||
|
|
||||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
||||||
|
|
||||||
@ -601,14 +601,48 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
|
|
||||||
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
||||||
|
|
||||||
- #### SYCL
|
- #### Vulkan
|
||||||
|
|
||||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
**With docker**:
|
||||||
|
|
||||||
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
You don't need to install the Vulkan SDK. It will be installed inside the container.
|
||||||
|
|
||||||
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
|
```sh
|
||||||
|
# Build the image
|
||||||
|
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
|
||||||
|
|
||||||
|
# Then, use it:
|
||||||
|
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
|
```
|
||||||
|
|
||||||
|
**Without docker**:
|
||||||
|
|
||||||
|
First, make sure you have installed the [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html).
|
||||||
|
|
||||||
|
For example, on Ubuntu 22.04 (jammy), use the command below:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
|
||||||
|
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||||
|
apt update -y
|
||||||
|
apt-get install -y vulkan-sdk
|
||||||
|
# To verify the installation, use the command below:
|
||||||
|
vulkaninfo
|
||||||
|
```
|
||||||
|
|
||||||
|
Then, build llama.cpp using the cmake command below:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p build
|
||||||
|
cd build
|
||||||
|
cmake .. -DLLAMA_VULKAN=1
|
||||||
|
cmake --build . --config Release
|
||||||
|
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||||
|
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||||
|
|
||||||
|
# In the output, you should see that ggml_vulkan detected your GPU. For example:
|
||||||
|
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||||
|
```
|
||||||
|
|
||||||
### Prepare Data & Run
|
### Prepare Data & Run
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user