Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-02-06 08:30:33 +01:00)

Commit 5b87db0802: Merge branch 'ggerganov:master' into master

.github/ISSUE_TEMPLATE/010-bug-compilation.yml (vendored; 16 changed lines)

@@ -66,11 +66,21 @@ body:
     validations:
       required: false
   - type: textarea
-    id: logs
+    id: command
     attributes:
-      label: Relevant log output
+      label: Compile command
       description: >
-        Please copy and paste any relevant log output, including the command that you entered and any generated text.
+        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+        Please copy and paste any relevant log output, including any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:

.github/ISSUE_TEMPLATE/019-bug-misc.yml (vendored; 12 changed lines)

@@ -52,6 +52,16 @@ body:
         - Other (Please specify in the next section)
     validations:
       required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Command line
+      description: >
+        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: false
   - type: textarea
     id: info
     attributes:
@@ -74,7 +84,7 @@ body:
     attributes:
       label: Relevant log output
       description: >
-        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
+        If applicable, please copy and paste any relevant log output, including any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:

.github/workflows/build.yml (vendored; 30 changed lines)

@@ -60,8 +60,7 @@ jobs:
            -DLLAMA_CURL=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
-           -DGGML_RPC=ON \
-           -DBUILD_SHARED_LIBS=OFF
+           -DGGML_RPC=ON
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -123,8 +122,7 @@ jobs:
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_CURL=ON \
            -DGGML_METAL=OFF \
-           -DGGML_RPC=ON \
-           -DBUILD_SHARED_LIBS=OFF
+           -DGGML_RPC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -181,7 +179,7 @@ jobs:
        run: |
          mkdir build
          cd build
-         cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+         cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
          cmake --build . --config Release -j $(nproc)

      - name: Test
@@ -651,23 +649,23 @@ jobs:
      matrix:
        include:
          - build: 'noavx-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
          - build: 'avx2-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
          - build: 'avx-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
          - build: 'avx512-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
          - build: 'openblas-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
          - build: 'vulkan-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
          - build: 'llvm-arm64'
-           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
          - build: 'msvc-arm64'
-           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
          - build: 'llvm-arm64-opencl-adreno'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

@@ -914,7 +912,7 @@ jobs:
      shell: cmd
      run: |
        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-       cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+       cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
        set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
        cmake --build build --config Release -j %NINJA_JOBS% -t ggml
        cmake --build build --config Release
@@ -1239,7 +1237,7 @@ jobs:

      - name: Create release
        id: create_release
-       uses: anzz1/action-create-release@v1
+       uses: ggml-org/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:

.github/workflows/docker.yml (vendored; 3 changed lines)

@@ -97,10 +97,9 @@ jobs:
          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-     # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
        if: ${{ matrix.config.free_disk_space == true }}
-       uses: jlumbroso/free-disk-space@main
+       uses: ggml-org/free-disk-space@v1.3.1
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB

.github/workflows/editorconfig.yml (vendored; 4 changed lines)

@@ -23,5 +23,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
-     - uses: editorconfig-checker/action-editorconfig-checker@main
+     - uses: editorconfig-checker/action-editorconfig-checker@v2
+       with:
+         version: v3.0.3
      - run: editorconfig-checker

@@ -1,5 +1,11 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
 
 /ci/ @ggerganov
-/.devops/ @ngxson
+/.devops/*.Dockerfile @ngxson
 /examples/server/ @ngxson
+/ggml/src/ggml-cuda/fattn* @JohannesGaessler
+/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
+/ggml/src/ggml-opt.cpp @JohannesGaessler
+/ggml/src/gguf.cpp @JohannesGaessler

@@ -69,6 +69,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
 - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
+- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
@@ -201,6 +202,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
+- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 
 </details>
 

@@ -22,6 +22,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
     return *this;
 }
 
+common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
+    this->excludes = std::move(excludes);
+    return *this;
+}
+
 common_arg & common_arg::set_env(const char * env) {
     help = help + "\n(env: " + env + ")";
     this->env = env;
@@ -37,6 +42,10 @@ bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
 
+bool common_arg::is_exclude(enum llama_example ex) {
+    return excludes.find(ex) != excludes.end();
+}
+
 bool common_arg::get_value_from_env(std::string & output) {
     if (env == nullptr) return false;
     char * value = std::getenv(env);
@@ -420,7 +429,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
-       if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
+       if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
@@ -649,7 +658,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.prompt = value;
        }
-   ));
+   ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--no-perf"},
        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -673,7 +682,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.prompt.pop_back();
            }
        }
-   ));
+   ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--in-file"}, "FNAME",
        "an input file (repeat to specify multiple files)",
@@ -700,7 +709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.prompt = ss.str();
            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
        }
-   ));
+   ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-e", "--escape"},
        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
@@ -1512,7 +1521,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--lora"}, "FNAME",
        "path to LoRA adapter (can be repeated to use multiple adapters)",
        [](common_params & params, const std::string & value) {
-           params.lora_adapters.push_back({ std::string(value), 1.0 });
+           params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
        }
        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -1520,7 +1529,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--lora-scaled"}, "FNAME", "SCALE",
        "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
        [](common_params & params, const std::string & fname, const std::string & scale) {
-           params.lora_adapters.push_back({ fname, std::stof(scale) });
+           params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
        }
        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
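
The hunks above add an exclusion mechanism to the argument parser: an option can opt out of specific examples, and add_opt() now skips any option whose exclusion set contains the current example. Below is a hedged sketch of how a new option would use it; the flag name "--demo-opt" and its handler are hypothetical, only the registration pattern mirrors the --prompt options above.

    // inside common_params_parser_init(): register an option for the common
    // example set, but hide it from llama-server via the new exclusion list
    add_opt(common_arg(
        {"--demo-opt"}, "VALUE",                  // hypothetical flag, for illustration only
        "an option that should not be exposed by llama-server",
        [](common_params & params, const std::string & value) {
            params.prompt = value;                // reuse an existing field for the sketch
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    // add_opt() keeps an option only when
    //   (in_example(ex) || in_example(LLAMA_EXAMPLE_COMMON)) && !is_exclude(ex),
    // so when ex == LLAMA_EXAMPLE_SERVER the option above is filtered out.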

@@ -12,6 +12,7 @@
 
 struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
     const char * value_hint   = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
@@ -53,9 +54,11 @@ struct common_arg {
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
 
     common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
+    bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();

@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -18,6 +21,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -62,7 +66,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+# if !defined(PATH_MAX)
+#  define PATH_MAX MAX_PATH
+# endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -843,7 +849,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }
 
     if (model == NULL) {
@@ -870,7 +876,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (!ok) {
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
@@ -881,14 +887,13 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
+        llama_model_free(model);
         return iparams;
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
@@ -898,7 +903,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
@@ -911,7 +916,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                 params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
@@ -919,20 +924,21 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -979,7 +985,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (llama_model_has_encoder(model)) {
            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-           if (decoder_start_token_id == -1) {
+           if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                decoder_start_token_id = bos;
            }
            tmp.clear();
@@ -993,17 +999,17 @@ struct common_init_result common_init_from_params(common_params & params) {
        llama_perf_context_reset(lctx);
    }
 
-   iparams.model = model;
-   iparams.context = lctx;
+   iparams.model.reset(model);
+   iparams.context.reset(lctx);
 
    return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
    llama_lora_adapter_clear(ctx);
-   for (auto & la : lora_adapters) {
+   for (auto & la : lora) {
        if (la.scale != 0.0f) {
-           llama_lora_adapter_set(ctx, la.adapter, la.scale);
+           llama_lora_adapter_set(ctx, la.ptr, la.scale);
        }
    }
 }
@@ -1148,8 +1154,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1409,7 +1414,7 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(local_path.c_str(), params);
+    return llama_model_load_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
@@ -1612,6 +1617,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
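
A hedged usage sketch for the helper added in the last hunk, combined with the renamed model-loading API from the earlier hunks; the model path parameter is a placeholder and error handling is kept to a minimum.

    #include "common.h"   // common_get_builtin_chat_template, common_chat_verify_template
    #include "llama.h"

    #include <cstdio>

    // sketch: print the chat template embedded in a GGUF model, if any
    static void print_builtin_template(const char * path_model) {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file(path_model, mparams);
        if (model == NULL) {
            return;
        }
        const std::string tmpl = common_get_builtin_chat_template(model);
        if (!tmpl.empty() && common_chat_verify_template(tmpl)) {
            printf("built-in chat template:\n%s\n", tmpl.c_str());
        }
        llama_model_free(model);
    }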

@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -27,10 +27,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -478,10 +476,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model * model = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -571,6 +571,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 
@@ -637,6 +640,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
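
A hedged sketch of what the new common_init_result layout means for callers: the result now owns the model, context, and adapters through the smart-pointer types from llama-cpp.h (llama_model_ptr, llama_context_ptr, llama_lora_adapter_ptr), so callers borrow raw handles with .get() and no longer free them manually. The function below is illustrative only.

    #include "common.h"   // common_params, common_init_result, common_init_from_params

    // sketch: initialize, borrow the raw handles, let RAII clean up
    static bool run_once(common_params & params) {
        common_init_result init = common_init_from_params(params);

        llama_model   * model = init.model.get();    // owned by init
        llama_context * ctx   = init.context.get();  // owned by init
        if (model == nullptr || ctx == nullptr) {
            return false;                            // initialization failed
        }

        // ... run inference with model/ctx ...

        return true;
        // no llama_free()/llama_model_free() here: init's smart pointers
        // release the context, model, and LoRA adapters on destruction
    }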

@@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
     common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     const common_ngram_cache_part part_static = part_static_it->second;
 
     int max_count_static = 0;
     int sum_count_static = 0;
-    llama_token max_token = -1;
+    llama_token max_token = LLAMA_TOKEN_NULL;
 
     for (std::pair<llama_token, int> token_count_static : part_static) {
         const llama_token token = token_count_static.first;
@@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
     }
 
     if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     return max_token;
 }
@@ -98,9 +98,9 @@ static llama_token try_draft(
     common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {
 
-    llama_token drafted_token = -1;
+    llama_token drafted_token = LLAMA_TOKEN_NULL;
 
-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
         const common_ngram ngram_primary = ngrams_primary[i];
 
         common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@@ -112,7 +112,7 @@ static llama_token try_draft(
         int max_count_primary = 0;
         int max_count_static = 0;
         int sum_count_primary = 0;
-        llama_token max_token = -1;
+        llama_token max_token = LLAMA_TOKEN_NULL;
 
         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
@@ -154,7 +154,7 @@ void common_ngram_cache_draft(
     }
 
     while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token = -1;
+        llama_token drafted_token = LLAMA_TOKEN_NULL;
 
         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
         common_ngram ngram_static;
@@ -177,17 +177,17 @@ void common_ngram_cache_draft(
             }
             ngrams_cd.push_back(ngram_cd);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_static, ngram_static);
         }
 
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             break;
         }
 

@@ -17,13 +17,13 @@ struct common_ngram {
 
     common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = -1;
+            tokens[i] = LLAMA_TOKEN_NULL;
         }
     }
 
     common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : -1;
+            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
         }
     }
 
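
The same sentinel change runs through the whole n-gram cache: unused slots and failed drafts are now LLAMA_TOKEN_NULL rather than the bare literal -1. A hedged sketch of client code under the new convention follows; the helper function itself is hypothetical.

    #include "ngram-cache.h"   // common_ngram, LLAMA_NGRAM_MAX (llama.cpp common library)

    // sketch: count how many slots of an n-gram are actually filled,
    // comparing against the named sentinel instead of -1
    static int count_used_tokens(const common_ngram & ng) {
        int n = 0;
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            if (ng.tokens[i] != LLAMA_TOKEN_NULL) {
                ++n;
            }
        }
        return n;
    }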
|
@ -687,6 +687,9 @@ class Model:
|
|||||||
if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
|
if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
|
||||||
# ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
|
# ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
|
||||||
res = "megrez"
|
res = "megrez"
|
||||||
|
if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
|
||||||
|
# ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
|
||||||
|
res = "deepseek-v3"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
logger.warning("\n")
|
logger.warning("\n")
|
||||||
@ -1764,25 +1767,19 @@ class DeciModel(Model):
|
|||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
self.gguf_writer.add_token_types(toktypes)
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
special_vocab = gguf.SpecialVocab(
|
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||||
self.dir_model, load_merges=True,
|
|
||||||
special_token_types = ['bos', 'eos', 'eom', 'eot']
|
|
||||||
)
|
|
||||||
special_vocab._set_special_token("bos", 128000)
|
|
||||||
special_vocab._set_special_token("eos", 128001)
|
|
||||||
special_vocab._set_special_token("eom", 128008)
|
|
||||||
special_vocab._set_special_token("eot", 128009)
|
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
else:
|
else:
|
||||||
# DeciLM-7B
|
# DeciLM-7B
|
||||||
self._set_vocab_llama_hf()
|
self._set_vocab_llama_hf()
|
||||||
# self._set_vocab_gpt2()
|
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
|
if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
|
||||||
assert self.block_count == len(self._num_kv_heads)
|
assert self.block_count == len(self._num_kv_heads)
|
||||||
assert self.block_count == len(self._num_heads)
|
assert self.block_count == len(self._num_heads)
|
||||||
assert self.block_count == len(self._ffn_dims)
|
assert self.block_count == len(self._ffn_dims)
|
||||||
|
if (rope_theta := self.hparams.get("rope_theta")) is not None:
|
||||||
|
self.gguf_writer.add_rope_freq_base(rope_theta)
|
||||||
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
||||||
self.gguf_writer.add_head_count(self._num_heads)
|
self.gguf_writer.add_head_count(self._num_heads)
|
||||||
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
|
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
|
||||||
@ -2565,6 +2562,63 @@ class Phi3MiniModel(Model):
|
|||||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("PhiMoEForCausalLM")
|
||||||
|
class PhiMoeModel(Phi3MiniModel):
|
||||||
|
model_arch = gguf.MODEL_ARCH.PHIMOE
|
||||||
|
|
||||||
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
|
||||||
|
self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
# process the experts separately
|
||||||
|
if name.find("block_sparse_moe.experts") != -1:
|
||||||
|
n_experts = self.hparams["num_local_experts"]
|
||||||
|
assert bid is not None
|
||||||
|
|
||||||
|
if self._experts is None:
|
||||||
|
self._experts = [{} for _ in range(self.block_count)]
|
||||||
|
|
||||||
|
self._experts[bid][name] = data_torch
|
||||||
|
|
||||||
|
if len(self._experts[bid]) >= n_experts * 3:
|
||||||
|
tensors: list[tuple[str, Tensor]] = []
|
||||||
|
|
||||||
|
# merge the experts into a single 3d tensor
|
||||||
|
for w_name in ["w1", "w2", "w3"]:
|
||||||
|
datas: list[Tensor] = []
|
||||||
|
|
||||||
|
for xid in range(n_experts):
|
||||||
|
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
|
||||||
|
datas.append(self._experts[bid][ename])
|
||||||
|
del self._experts[bid][ename]
|
||||||
|
|
||||||
|
data_torch = torch.stack(datas, dim=0)
|
||||||
|
|
||||||
|
merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
|
||||||
|
|
||||||
|
new_name = self.map_tensor_name(merged_name)
|
||||||
|
|
||||||
|
tensors.append((new_name, data_torch))
|
||||||
|
return tensors
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|
||||||
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
def prepare_tensors(self):
|
||||||
|
super().prepare_tensors()
|
||||||
|
|
||||||
|
if self._experts is not None:
|
||||||
|
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||||
|
experts = [k for d in self._experts for k in d.keys()]
|
||||||
|
if len(experts) > 0:
|
||||||
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
@Model.register("PlamoForCausalLM")
|
@Model.register("PlamoForCausalLM")
|
||||||
class PlamoModel(Model):
|
class PlamoModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.PLAMO
|
model_arch = gguf.MODEL_ARCH.PLAMO
|
||||||
@ -3379,6 +3433,24 @@ class CommandR2Model(Model):
|
|||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("Cohere2ForCausalLM")
|
||||||
|
class Cohere2Model(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.COHERE2
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
|
||||||
|
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
|
||||||
|
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
||||||
|
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||||
|
|
||||||
|
rotary_pct = self.hparams["rotary_pct"]
|
||||||
|
hidden_size = self.hparams["hidden_size"]
|
||||||
|
num_attention_heads = self.hparams["num_attention_heads"]
|
||||||
|
self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
|
||||||
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("OlmoForCausalLM")
|
@Model.register("OlmoForCausalLM")
|
||||||
@Model.register("OLMoForCausalLM")
|
@Model.register("OLMoForCausalLM")
|
||||||
class OlmoModel(Model):
|
class OlmoModel(Model):
|
||||||
@ -3837,6 +3909,7 @@ class DeepseekModel(Model):
|
|||||||
|
|
||||||
|
|
||||||
@Model.register("DeepseekV2ForCausalLM")
|
@Model.register("DeepseekV2ForCausalLM")
|
||||||
|
@Model.register("DeepseekV3ForCausalLM")
|
||||||
class DeepseekV2Model(Model):
|
class DeepseekV2Model(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||||
|
|
||||||
@ -3858,6 +3931,15 @@ class DeepseekV2Model(Model):
|
|||||||
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
|
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
|
||||||
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
|
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
|
||||||
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
|
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
|
||||||
|
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
|
||||||
|
|
||||||
|
if hparams["scoring_func"] == "sigmoid":
|
||||||
|
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
||||||
|
elif hparams["scoring_func"] == "softmax":
|
||||||
|
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
|
||||||
|
|
||||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||||
|
|
||||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||||
@ -3870,6 +3952,16 @@ class DeepseekV2Model(Model):
|
|||||||
_experts: list[dict[str, Tensor]] | None = None
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
# rename e_score_correction_bias tensors
|
||||||
|
if name.endswith("e_score_correction_bias"):
|
||||||
|
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
||||||
|
|
||||||
|
# skip Multi-Token Prediction (MTP) layers
|
||||||
|
block_count = self.hparams["num_hidden_layers"]
|
||||||
|
match = re.match(r"model.layers.(\d+)", name)
|
||||||
|
if match and int(match.group(1)) >= block_count:
|
||||||
|
return []
|
||||||
|
|
||||||
# process the experts separately
|
# process the experts separately
|
||||||
if name.find("mlp.experts") != -1:
|
if name.find("mlp.experts") != -1:
|
||||||
n_experts = self.hparams["n_routed_experts"]
|
n_experts = self.hparams["n_routed_experts"]
|
||||||
|

@@ -107,6 +107,7 @@ models = [
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
+    {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
 ]
 
 

@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name
 
 
@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
        "--base", type=Path,
        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
    )
+   parser.add_argument(
+       "--base-model-id", type=str,
+       help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+   )
    parser.add_argument(
        "lora_path", type=Path,
        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ if __name__ == '__main__':
 
    dir_base_model: Path | None = args.base
    dir_lora: Path = args.lora_path
+   base_model_id: str | None = args.base_model_id
    lora_config = dir_lora / "adapter_config.json"
    input_model = dir_lora / "adapter_model.safetensors"
 
@@ -313,7 +321,10 @@ if __name__ == '__main__':
        lparams: dict[str, Any] = json.load(f)
 
    # load base model
-   if dir_base_model is None:
+   if base_model_id is not None:
+       logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+       hparams = load_hparams_from_hf(base_model_id)
+   elif dir_base_model is None:
        if "base_model_name_or_path" in lparams:
            model_id = lparams["base_model_name_or_path"]
            logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,11 +382,16 @@ if __name__ == '__main__':
                if self.lazy:
                    tensor = LazyTorchTensor.from_eager(tensor)
                base_name = get_base_tensor_name(name)
-               is_lora_a = ".lora_A.weight" in name
-               is_lora_b = ".lora_B.weight" in name
+               # note: mergekit-extract-lora also adds token embeddings to the adapter
+               is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+               is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
                if not is_lora_a and not is_lora_b:
                    if ".base_layer.weight" in name:
                        continue
+                   # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
+                   if "_layernorm" in name or ".norm" in name:
+                       yield (base_name, tensor)
+                       continue
                    logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                    if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                        logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@@ -407,9 +423,21 @@ if __name__ == '__main__':
                if name == "lm_head.weight" and len(dest) == 0:
                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                for dest_name, dest_data in dest:
+                   # mergekit-extract-lora add these layernorm to the adapter
+                   if "_norm" in dest_name:
+                       assert dest_data.dim() == 1
+                       yield (dest_name, dest_data)
+                       continue
+
+                   # otherwise, we must get the lora_A and lora_B tensors
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()
+
+                   # note: mergekit-extract-lora flip and transpose A and B
+                   # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
+                   if "token_embd.weight" in dest_name:
+                       lora_a = lora_a.T
+
                    yield (dest_name + ".lora_a", lora_a)
                    yield (dest_name + ".lora_b", lora_b)
@ -127,6 +127,8 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
|
|||||||
|
|
||||||
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
|
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
|
||||||
|
|
||||||
|
If you are using Fedora (Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
|
||||||
|
|
||||||
- Using `CMake`:
|
- Using `CMake`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
317
docs/cuda-fedora.md
Normal file
317
docs/cuda-fedora.md
Normal file
@ -0,0 +1,317 @@
|
|||||||
|
# Setting Up CUDA on Fedora
|
||||||
|
|
||||||
|
In this guide, we set up [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable to:
|
||||||
|
- [Fedora Workstation](https://fedoraproject.org/workstation/)
|
||||||
|
- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/)
|
||||||
|
- [Fedora Spins](https://fedoraproject.org/spins)
|
||||||
|
- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8`, `Arch Linux`, and `Ubuntu`.
|
||||||
|
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
- [Prerequisites](#prerequisites)
|
||||||
|
- [Monitoring NVIDIA CUDA Repositories](#monitoring-nvidia-cuda-repositories)
|
||||||
|
- [Using the Fedora 39 CUDA Repository](#using-the-fedora-39-cuda-repository)
|
||||||
|
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
|
||||||
|
- [Installing Essential Development Tools](#installing-essential-development-tools)
|
||||||
|
- [Adding the CUDA Repository](#adding-the-cuda-repository)
|
||||||
|
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
|
||||||
|
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
|
||||||
|
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
|
||||||
|
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
|
||||||
|
- [Configuring the Environment](#configuring-the-environment)
|
||||||
|
- [Verifying the Installation](#verifying-the-installation)
|
||||||
|
- [Conclusion](#conclusion)
|
||||||
|
- [Troubleshooting](#troubleshooting)
|
||||||
|
- [Additional Notes](#additional-notes)
|
||||||
|
- [References](#references)
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox installed by default; other distributions may need to install the [toolbox package](https://containertoolbx.org/install/).
|
||||||
|
- **NVIDIA Drivers and Graphics Card installed on Host System (optional)** To run CUDA programs such as `llama.cpp`, the host should be set up to access your NVIDIA hardware. Fedora hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA); a minimal example is sketched after this list.
|
||||||
|
- **Internet connectivity** to download packages.
|
||||||
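As one possible host-side setup, here is a minimal sketch assuming the RPM Fusion free and nonfree repositories are already enabled on a Fedora host; the package names are taken from the RPM Fusion howto and should be treated as an assumption, so follow the linked guide for your exact setup:

```bash
# Install the NVIDIA kernel module and the CUDA-capable driver libraries from RPM Fusion.
sudo dnf install akmod-nvidia xorg-x11-drv-nvidia-cuda
```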
|
|
||||||
|
### Monitoring NVIDIA CUDA Repositories
|
||||||
|
|
||||||
|
Before proceeding, it is advisable to check if NVIDIA has updated their CUDA repositories for your Fedora version. NVIDIA's repositories can be found at:
|
||||||
|
|
||||||
|
- [Fedora 40 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora40/x86_64/)
|
||||||
|
- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/)
|
||||||
|
|
||||||
|
As of the latest update, these repositories either do not contain the `cuda` meta-package or are missing essential components; a quick way to check is shown below.
|
||||||
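A minimal sketch of such a check, assuming `curl` and `grep` are available and that the repository directory listing exposes package file names:

```bash
# List the Fedora 41 CUDA repository index and look for published 'cuda' packages.
curl -s https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/ \
  | grep -oE 'cuda-[0-9][^"<]*\.rpm' | sort -u | head
```

If the listing shows no `cuda` meta-package for your Fedora version, fall back to the Fedora 39 repository described next.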
|
|
||||||
|
### Using the Fedora 39 CUDA Repository
|
||||||
|
|
||||||
|
Since the newer repositories are incomplete, we'll use the Fedora 39 repository:
|
||||||
|
|
||||||
|
- [Fedora 39 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/)
|
||||||
|
|
||||||
|
**Note:** Fedora 39 is no longer maintained, so we recommend using a toolbox environment to prevent system conflicts.
|
||||||
|
|
||||||
|
## Creating a Fedora Toolbox Environment
|
||||||
|
|
||||||
|
This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using a Fedora 39 toolbox allows us to install the necessary packages without affecting the host system.
|
||||||
|
|
||||||
|
**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker.
|
||||||
|
|
||||||
|
We do not recommend installing these packages directly on the host system, as Fedora 39 is out of maintenance; instead, keep your host on a maintained version of Fedora.
|
||||||
|
|
||||||
|
1. **Create a Fedora 39 Toolbox:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
toolbox create --image registry.fedoraproject.org/fedora-toolbox:39 --container fedora-toolbox-39-cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Enter the Toolbox:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
toolbox enter --container fedora-toolbox-39-cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
Inside the toolbox, you have root privileges and can install packages without affecting the host system.
|
||||||
|
|
||||||
|
## Installing Essential Development Tools
|
||||||
|
|
||||||
|
1. **Synchronize the DNF Package Manager:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf distro-sync
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Install the Default Text Editor (Optional):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf install vim-default-editor --allowerasing
|
||||||
|
```
|
||||||
|
|
||||||
|
The `--allowerasing` flag allows DNF to remove conflicting packages so the installation can proceed.
|
||||||
|
|
||||||
|
3. **Install Development Tools and Libraries:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf install @c-development @development-tools cmake
|
||||||
|
```
|
||||||
|
|
||||||
|
This installs essential packages for compiling software, including `gcc`, `make`, and other development headers.
|
||||||
|
|
||||||
|
## Adding the CUDA Repository
|
||||||
|
|
||||||
|
Add the NVIDIA CUDA repository to your DNF configuration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/cuda-fedora39.repo
|
||||||
|
```
|
||||||
|
|
||||||
|
After adding the repository, synchronize the package manager again:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf distro-sync
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installing `nvidia-driver-libs`
|
||||||
|
|
||||||
|
Attempt to install `nvidia-driver-libs`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf install nvidia-driver-libs
|
||||||
|
```
|
||||||
|
|
||||||
|
**Explanation:**
|
||||||
|
|
||||||
|
- `nvidia-driver-libs` contains the NVIDIA driver libraries required by CUDA.
|
||||||
|
- This step might fail due to conflicts with existing NVIDIA drivers on the host system.
|
||||||
|
|
||||||
|
## Manually Resolving Package Conflicts
|
||||||
|
|
||||||
|
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
|
||||||
|
|
||||||
|
### 1. Download the `nvidia-driver-libs` RPM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf download --arch x86_64 nvidia-driver-libs
|
||||||
|
```
|
||||||
|
|
||||||
|
You should see a file similar to:
|
||||||
|
|
||||||
|
```
|
||||||
|
nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Attempt to Install the RPM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf install nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected Error:**
|
||||||
|
|
||||||
|
Installation may fail with errors pointing to conflicts with `egl-gbm` and `egl-wayland`.
|
||||||
|
|
||||||
|
**Note: It is important to carefully read the error messages to identify the exact paths that need to be excluded.**
|
||||||
|
|
||||||
|
### 3. Download Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf download --arch x86_64 egl-gbm egl-wayland
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Install `egl-gbm` with Excluded Paths
|
||||||
|
|
||||||
|
Exclude conflicting files during installation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo rpm --install --verbose --hash \
|
||||||
|
--excludepath=/usr/lib64/libnvidia-egl-gbm.so.1.1.2 \
|
||||||
|
--excludepath=/usr/share/egl/egl_external_platform.d/15_nvidia_gbm.json \
|
||||||
|
egl-gbm-1.1.2^20240919gitb24587d-3.fc39.x86_64.rpm
|
||||||
|
```
|
||||||
|
|
||||||
|
**Explanation:**
|
||||||
|
|
||||||
|
- The `--excludepath` option skips installing files that conflict with existing files.
|
||||||
|
- Adjust the paths based on the error messages you receive.
|
||||||
|
|
||||||
|
### 5. Install `egl-wayland` with Excluded Paths
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo rpm --install --verbose --hash \
|
||||||
|
--excludepath=/usr/share/egl/egl_external_platform.d/10_nvidia_wayland.json \
|
||||||
|
egl-wayland-1.1.17^20241118giteeb29e1-5.fc39.x86_64.rpm
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Install `nvidia-driver-libs` with Excluded Paths
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo rpm --install --verbose --hash \
|
||||||
|
--excludepath=/usr/share/glvnd/egl_vendor.d/10_nvidia.json \
|
||||||
|
--excludepath=/usr/share/nvidia/nvoptix.bin \
|
||||||
|
nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:**
|
||||||
|
|
||||||
|
- Replace the paths with the ones causing conflicts in your installation if they differ.
|
||||||
|
- The `--verbose` and `--hash` options provide detailed output during installation.
|
||||||
|
|
||||||
|
## Finalizing the Installation of `nvidia-driver-libs`
|
||||||
|
|
||||||
|
After manually installing the dependencies, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf install nvidia-driver-libs
|
||||||
|
```
|
||||||
|
|
||||||
|
You should receive a message indicating the package is already installed:
|
||||||
|
|
||||||
|
```
|
||||||
|
Package nvidia-driver-libs-3:560.35.05-1.fc39.x86_64 is already installed.
|
||||||
|
Dependencies resolved.
|
||||||
|
Nothing to do.
|
||||||
|
Complete!
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installing the CUDA Meta-Package
|
||||||
|
|
||||||
|
Now that the driver libraries are installed, proceed to install CUDA:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo dnf install cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
This installs the CUDA toolkit and associated packages.
|
||||||
|
|
||||||
|
## Configuring the Environment
|
||||||
|
|
||||||
|
To use CUDA, add its binary directory to your system's `PATH`.
|
||||||
|
|
||||||
|
1. **Create a Profile Script:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo sh -c 'echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /etc/profile.d/cuda.sh'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Explanation:**
|
||||||
|
|
||||||
|
- We add the script to `/etc/profile.d/` because the container's `/etc/` directory is unique to this particular container and is not shared with other containers or the host system.
|
||||||
|
- The backslash `\` before `$PATH` ensures the literal text `$PATH` is written into the script instead of being expanded by the current shell (a quick check of the resulting script is shown after this list).
|
||||||
|
|
||||||
|
2. **Make the Script Executable:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo chmod +x /etc/profile.d/cuda.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Source the Script to Update Your Environment:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source /etc/profile.d/cuda.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** This command updates your current shell session with the new `PATH`. The `/etc/profile.d/cuda.sh` script ensures that the CUDA binaries are available in your `PATH` for all future sessions.
|
||||||
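Before moving on, you can optionally confirm that the profile script contains the expected line (a minimal check, assuming the script was created exactly as above):

```bash
# Print the profile script; it should consist of a single export line.
cat /etc/profile.d/cuda.sh
# Expected output:
# export PATH=$PATH:/usr/local/cuda/bin
```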
|
|
||||||
|
## Verifying the Installation
|
||||||
|
|
||||||
|
To confirm that CUDA is correctly installed and configured, check the version of the NVIDIA CUDA Compiler (`nvcc`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nvcc --version
|
||||||
|
```
|
||||||
|
|
||||||
|
You should see output similar to:
|
||||||
|
|
||||||
|
```
|
||||||
|
nvcc: NVIDIA (R) Cuda compiler driver
|
||||||
|
Copyright (c) 2005-2024 NVIDIA Corporation
|
||||||
|
Built on Tue_Oct_29_23:50:19_PDT_2024
|
||||||
|
Cuda compilation tools, release 12.6, V12.6.85
|
||||||
|
Build cuda_12.6.r12.6/compiler.35059454_0
|
||||||
|
```
|
||||||
|
|
||||||
|
This output confirms that the CUDA compiler is accessible and indicates the installed version.
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 39 CUDA repository. By manually resolving package conflicts and configuring the environment, you can develop CUDA applications without affecting your host system.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
- **Installation Failures:**
|
||||||
|
- If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
|
||||||
|
- Use the `--excludepath` option with `rpm` to exclude conflicting files during manual installations.
|
||||||
|
|
||||||
|
- **Driver Conflicts:**
|
||||||
|
- Since the host system may already have NVIDIA drivers installed, conflicts can arise. Using the toolbox environment helps isolate these issues.
|
||||||
|
|
||||||
|
- **Environment Variables Not Set:**
|
||||||
|
- If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`.
|
||||||
|
- Run `echo $PATH` to check if the path is included.
|
||||||
|
- Re-source the profile script or open a new terminal session (see the example check below).
|
||||||
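For example, a minimal check that re-sources the profile script only when the CUDA directory is missing from `PATH` (assuming the script from the configuration step exists):

```bash
# Re-source the CUDA profile script if /usr/local/cuda/bin is not already on PATH.
if ! echo "$PATH" | tr ':' '\n' | grep -qx '/usr/local/cuda/bin'; then
    source /etc/profile.d/cuda.sh
fi
nvcc --version
```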
|
|
||||||
|
## Additional Notes
|
||||||
|
|
||||||
|
- **Updating CUDA in the Future:**
|
||||||
|
- Keep an eye on the official NVIDIA repositories for updates to your Fedora version.
|
||||||
|
- When an updated repository becomes available, adjust your `dnf` configuration accordingly.
|
||||||
|
|
||||||
|
- **Building `llama.cpp`:**
|
||||||
|
- With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
|
||||||
|
- Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration; a minimal build command is sketched after this list.
|
||||||
|
|
||||||
|
- **Using the Toolbox Environment:**
|
||||||
|
- The toolbox environment is isolated from your host system, which helps prevent conflicts.
|
||||||
|
- Remember that system files and configurations inside the toolbox are separate from the host. By default the home directory of the user is shared between the host and the toolbox.
|
||||||
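As a minimal sketch of such a build (assuming the repository is cloned inside the toolbox and that the `GGML_CUDA` CMake option enables the CUDA backend, as described in the linked build docs):

```bash
# From the root of the llama.cpp checkout, inside the toolbox:
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j "$(nproc)"
```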
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustment based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with the toolbox.
|
||||||
|
|
||||||
|
**Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [Fedora Toolbox Documentation](https://docs.fedoraproject.org/en-US/fedora-silverblue/toolbox/)
|
||||||
|
- [NVIDIA CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
|
||||||
|
- [Podman Documentation](https://podman.io/get-started)
|
||||||
|
|
||||||
|
---
|
@ -28,7 +28,7 @@ The required steps to implement for an HF model are:
|
|||||||
```python
|
```python
|
||||||
@Model.register("MyModelForCausalLM")
|
@Model.register("MyModelForCausalLM")
|
||||||
class MyModel(Model):
|
class MyModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.GROK
|
model_arch = gguf.MODEL_ARCH.MYMODEL
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
|
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
|
||||||
@ -79,14 +79,14 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi
|
|||||||
- `Model#set_vocab`
|
- `Model#set_vocab`
|
||||||
- `Model#write_tensors`
|
- `Model#write_tensors`
|
||||||
|
|
||||||
NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights.
|
NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights.
|
||||||
|
|
||||||
### 2. Define the model architecture in `llama.cpp`
|
### 2. Define the model architecture in `llama.cpp`
|
||||||
|
|
||||||
The model params and tensors layout must be defined in `llama.cpp`:
|
The model params and tensors layout must be defined in `llama.cpp`:
|
||||||
1. Define a new `llm_arch`
|
1. Define a new `llm_arch`
|
||||||
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
||||||
3. Add any non standard metadata in `llm_load_hparams`
|
3. Add any non-standard metadata in `llm_load_hparams`
|
||||||
4. Create the tensors for inference in `llm_load_tensors`
|
4. Create the tensors for inference in `llm_load_tensors`
|
||||||
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
||||||
|
|
||||||
@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc
|
|||||||
|
|
||||||
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
||||||
|
|
||||||
Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
|
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
|
||||||
|
|
||||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
|
||||||
|
|
||||||
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
|
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(params);
|
llama_model_params model_params = common_model_params_to_llama(params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||||
@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(params);
|
llama_model_params model_params = common_model_params_to_llama(params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: error: unable to load model\n" , __func__);
|
LOG_ERR("%s: error: unable to load model\n" , __func__);
|
||||||
@ -120,7 +120,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||||
if (decoder_start_token_id == -1) {
|
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||||
decoder_start_token_id = llama_token_bos(model);
|
decoder_start_token_id = llama_token_bos(model);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_sampler_free(smpl);
|
llama_sampler_free(smpl);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
@ -434,12 +436,12 @@ static void print_matrix(struct ggml_tensor * probs) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_file {
|
struct my_llama_file {
|
||||||
// use FILE * so we don't have to re-open the file to mmap
|
// use FILE * so we don't have to re-open the file to mmap
|
||||||
FILE * fp;
|
FILE * fp;
|
||||||
size_t size;
|
size_t size;
|
||||||
|
|
||||||
llama_file(const char * fname, const char * mode) {
|
my_llama_file(const char * fname, const char * mode) {
|
||||||
fp = std::fopen(fname, mode);
|
fp = std::fopen(fname, mode);
|
||||||
if (fp == NULL) {
|
if (fp == NULL) {
|
||||||
size = 0;
|
size = 0;
|
||||||
@ -500,7 +502,7 @@ struct llama_file {
|
|||||||
return std::string(chars.data(), len);
|
return std::string(chars.data(), len);
|
||||||
}
|
}
|
||||||
|
|
||||||
~llama_file() {
|
~my_llama_file() {
|
||||||
if (fp) {
|
if (fp) {
|
||||||
std::fclose(fp);
|
std::fclose(fp);
|
||||||
}
|
}
|
||||||
@ -508,7 +510,7 @@ struct llama_file {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static bool is_ggml_file(const char * filename) {
|
static bool is_ggml_file(const char * filename) {
|
||||||
llama_file file(filename, "rb");
|
my_llama_file file(filename, "rb");
|
||||||
if (file.size < 4) {
|
if (file.size < 4) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -576,7 +578,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
|
|||||||
} else {
|
} else {
|
||||||
// assume llama2.c vocabulary
|
// assume llama2.c vocabulary
|
||||||
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
|
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
|
||||||
llama_file file(filename, "rb");
|
my_llama_file file(filename, "rb");
|
||||||
if (!file.fp) {
|
if (!file.fp) {
|
||||||
die_fmt("%s: %s", strerror(errno), filename);
|
die_fmt("%s: %s", strerror(errno), filename);
|
||||||
}
|
}
|
||||||
@ -689,8 +691,8 @@ static void save_as_llama_model(
|
|||||||
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
|
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
|
||||||
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
|
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
|
||||||
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
|
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
|
||||||
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
|
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
|
||||||
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
|
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);
|
||||||
|
|
||||||
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
|
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
|
||||||
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
|
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml.h"
|
|
||||||
#include "pca.hpp"
|
#include "pca.hpp"
|
||||||
#include "mean.hpp"
|
#include "mean.hpp"
|
||||||
|
|
||||||
@ -415,12 +417,13 @@ int main(int argc, char ** argv) {
|
|||||||
// load the model to get hparams
|
// load the model to get hparams
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
// int n_ctx = llama_n_ctx(ctx);
|
// int n_ctx = llama_n_ctx(ctx);
|
||||||
int n_layers = llama_n_layer(model);
|
int n_layers = llama_n_layer(model);
|
||||||
int n_embd = llama_n_embd(model);
|
int n_embd = llama_n_embd(model);
|
||||||
|
|
||||||
// get model hint param (a.k.a model arch name)
|
// get model hint param (a.k.a model arch name)
|
||||||
char model_hint[128];
|
char model_hint[128];
|
||||||
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
|
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
|
||||||
@ -474,8 +477,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// done with the model, we can now free it to make gain some memory
|
// done with the model, we can now free it to make gain some memory
|
||||||
printf("Done evaluate prompts, unload model...\n");
|
printf("Done evaluate prompts, unload model...\n");
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
|
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
|
||||||
|
|
||||||
|
@ -97,8 +97,9 @@ int main(int argc, char ** argv) {
|
|||||||
// load the model
|
// load the model
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
@ -316,8 +317,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -162,8 +162,9 @@ int main(int argc, char ** argv) {
|
|||||||
// init
|
// init
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == nullptr || ctx == nullptr) {
|
if (model == nullptr || ctx == nullptr) {
|
||||||
LOG_ERR("%s : failed to init\n", __func__);
|
LOG_ERR("%s : failed to init\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
@ -184,9 +185,6 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
#include "arg.h"
|
|
||||||
#include "common.h"
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
|
#include "arg.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
#include <cstdlib> /* abort() */
|
#include <cstdlib> /* abort() */
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
@ -1,18 +1,19 @@
|
|||||||
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cinttypes>
|
||||||
|
#include <climits>
|
||||||
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <cstring>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <climits>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#ifndef PATH_MAX
|
#ifndef PATH_MAX
|
||||||
@ -297,7 +298,7 @@ struct split_strategy {
|
|||||||
total_size += ggml_nbytes(t);
|
total_size += ggml_nbytes(t);
|
||||||
}
|
}
|
||||||
total_size = total_size / 1000 / 1000; // convert to megabytes
|
total_size = total_size / 1000 / 1000; // convert to megabytes
|
||||||
printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
|
printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
|
||||||
i_split++;
|
i_split++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cinttypes>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <fstream>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#undef MIN
|
#undef MIN
|
||||||
@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
|||||||
|
|
||||||
for (int i = 0; i < n_tensors; ++i) {
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
const char * name = gguf_get_tensor_name (ctx, i);
|
const char * name = gguf_get_tensor_name (ctx, i);
|
||||||
|
const size_t size = gguf_get_tensor_size (ctx, i);
|
||||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||||
|
|
||||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
|||||||
|
|
||||||
for (int i = 0; i < n_tensors; ++i) {
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
const char * name = gguf_get_tensor_name (ctx, i);
|
const char * name = gguf_get_tensor_name (ctx, i);
|
||||||
|
const size_t size = gguf_get_tensor_size (ctx, i);
|
||||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||||
|
|
||||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
|||||||
|
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||||
|
|
||||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
|
printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
|
||||||
|
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
|
||||||
|
|
||||||
// print first 10 elements
|
// print first 10 elements
|
||||||
const float * data = (const float *) cur->data;
|
const float * data = (const float *) cur->data;
|
||||||
@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
|||||||
const float * data = (const float *) cur->data;
|
const float * data = (const float *) cur->data;
|
||||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||||
if (data[j] != 100 + i) {
|
if (data[j] != 100 + i) {
|
||||||
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
|
fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -245,6 +247,8 @@ int main(int argc, char ** argv) {
|
|||||||
check_data = false;
|
check_data = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
srand(123456);
|
||||||
|
|
||||||
const std::string fname(argv[1]);
|
const std::string fname(argv[1]);
|
||||||
const std::string mode (argv[2]);
|
const std::string mode (argv[2]);
|
||||||
|
|
||||||
|
@ -165,7 +165,7 @@ int main(int argc, char * argv[]) {
|
|||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||||
|
|
||||||
// create generation context
|
// create generation context
|
||||||
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||||
@ -219,7 +219,7 @@ int main(int argc, char * argv[]) {
|
|||||||
|
|
||||||
llama_sampler_free(smpl);
|
llama_sampler_free(smpl);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -430,9 +430,10 @@ static void process_logits(
|
|||||||
|
|
||||||
static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
||||||
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
|
||||||
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
@ -618,8 +619,9 @@ int main(int argc, char ** argv) {
|
|||||||
// init
|
// init
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == nullptr || ctx == nullptr) {
|
if (model == nullptr || ctx == nullptr) {
|
||||||
LOG_ERR("%s : failed to init\n", __func__);
|
LOG_ERR("%s : failed to init\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
@ -655,9 +657,6 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -131,8 +131,8 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model.get();
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
@ -581,9 +581,6 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("\n");
|
LOG("\n");
|
||||||
common_perf_print(ctx, smpl);
|
common_perf_print(ctx, smpl);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
common_sampler_free(smpl);
|
common_sampler_free(smpl);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
|
@ -1526,10 +1526,10 @@ int main(int argc, char ** argv) {
|
|||||||
// keep the same model between tests when possible
|
// keep the same model between tests when possible
|
||||||
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
||||||
if (lmodel) {
|
if (lmodel) {
|
||||||
llama_free_model(lmodel);
|
llama_model_free(lmodel);
|
||||||
}
|
}
|
||||||
|
|
||||||
lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
|
lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
|
||||||
if (lmodel == NULL) {
|
if (lmodel == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
|
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
@ -1540,7 +1540,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
|
llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
|
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
|
||||||
llama_free_model(lmodel);
|
llama_model_free(lmodel);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1626,7 +1626,7 @@ int main(int argc, char ** argv) {
|
|||||||
ggml_threadpool_free_fn(threadpool);
|
ggml_threadpool_free_fn(threadpool);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(lmodel);
|
llama_model_free(lmodel);
|
||||||
|
|
||||||
if (p) {
|
if (p) {
|
||||||
p->print_footer();
|
p->print_footer();
|
||||||
|
@ -305,7 +305,9 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
|
|||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
//llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||||
|
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||||
|
delete batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
#include "ggml-cpu.h"
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
//#ifdef GGML_USE_CUDA
|
//#ifdef GGML_USE_CUDA
|
||||||
//#include "ggml-cuda.h"
|
//#include "ggml-cuda.h"
|
||||||
@ -262,7 +263,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|||||||
{
|
{
|
||||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "[";
|
ss << "[";
|
||||||
for (int j = 0; j < arr_n; j++) {
|
for (int j = 0; j < arr_n; j++) {
|
||||||
@ -2734,7 +2735,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
total_size_org += orig_size;
|
total_size_org += orig_size;
|
||||||
total_size_new += new_size;
|
total_size_new += new_size;
|
||||||
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
|
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
|
||||||
gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
|
GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
|
||||||
|
gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
|
||||||
fout.write((const char *)new_data, new_size);
|
fout.write((const char *)new_data, new_size);
|
||||||
size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
|
size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
|
||||||
for (size_t j = 0; j < pad; ++j) {
|
for (size_t j = 0; j < pad; ++j) {
|
||||||
|
@ -221,7 +221,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -265,7 +265,7 @@ static void llava_free(struct llava_context * ctx_llava) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_free(ctx_llava->ctx_llama);
|
llama_free(ctx_llava->ctx_llama);
|
||||||
llama_free_model(ctx_llava->model);
|
llama_model_free(ctx_llava->model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -323,7 +323,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_free(ctx_llava->ctx_llama);
|
llama_free(ctx_llava->ctx_llama);
|
||||||
llama_free_model(ctx_llava->model);
|
llama_model_free(ctx_llava->model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -310,7 +310,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -354,7 +354,7 @@ static void llava_free(struct llava_context * ctx_llava) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_free(ctx_llava->ctx_llama);
|
llama_free(ctx_llava->ctx_llama);
|
||||||
llama_free_model(ctx_llava->model);
|
llama_model_free(ctx_llava->model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -575,7 +575,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -58,8 +58,8 @@ int main(int argc, char ** argv) {
|
|||||||
// load the target model
|
// load the target model
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
// Tokenize the prompt
|
// Tokenize the prompt
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
@ -474,9 +474,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
LOG("\n\n");
|
LOG("\n\n");
|
||||||
|
@ -1,14 +1,9 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ngram-cache.h"
|
#include "ngram-cache.h"
|
||||||
#include "ggml.h"
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
@ -25,16 +20,16 @@ int main(int argc, char ** argv){
|
|||||||
// load the model
|
// load the model
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model_ptr & model = llama_init.model;
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context_ptr & ctx = llama_init.context;
|
||||||
|
|
||||||
GGML_ASSERT(model != nullptr);
|
GGML_ASSERT(model != nullptr);
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = common_tokenize(ctx, params.prompt, true, true);
|
inp = common_tokenize(ctx.get(), params.prompt, true, true);
|
||||||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||||
|
|
||||||
|
|
||||||
common_ngram_cache ngram_cache;
|
common_ngram_cache ngram_cache;
|
||||||
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
|
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
|
||||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
||||||
|
@ -30,12 +30,11 @@ int main(int argc, char ** argv){
|
|||||||
// load the model
|
// load the model
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_context_ptr & ctx = llama_init.context;
|
||||||
llama_context * ctx = llama_init.context;
|
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = common_tokenize(ctx, params.prompt, true, true);
|
inp = common_tokenize(ctx.get(), params.prompt, true, true);
|
||||||
|
|
||||||
common_ngram_cache ngram_cache_context;
|
common_ngram_cache ngram_cache_context;
|
||||||
common_ngram_cache ngram_cache_dynamic;
|
common_ngram_cache ngram_cache_dynamic;
|
||||||
@ -66,7 +65,7 @@ int main(int argc, char ** argv){
|
|||||||
}
|
}
|
||||||
|
|
||||||
const int n_input = inp.size();
|
const int n_input = inp.size();
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx.get());
|
||||||
|
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
int n_accept = 0;
|
int n_accept = 0;
|
||||||
@ -150,9 +149,6 @@ int main(int argc, char ** argv){
|
|||||||
LOG_INF("n_accept = %d\n", n_accept);
|
LOG_INF("n_accept = %d\n", n_accept);
|
||||||
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
LOG("\n\n");
|
LOG("\n\n");
|
||||||
|
@ -33,8 +33,8 @@ int main(int argc, char ** argv){
|
|||||||
// load the model
|
// load the model
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
@ -243,9 +243,6 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
llama_batch_free(batch_tgt);
|
llama_batch_free(batch_tgt);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
LOG("\n\n");
|
LOG("\n\n");
|
||||||
|
@ -145,18 +145,18 @@ int main(int argc, char ** argv) {
|
|||||||
llama_context * ctx = nullptr;
|
llama_context * ctx = nullptr;
|
||||||
common_sampler * smpl = nullptr;
|
common_sampler * smpl = nullptr;
|
||||||
|
|
||||||
std::vector<common_chat_msg> chat_msgs;
|
|
||||||
|
|
||||||
g_model = &model;
|
g_model = &model;
|
||||||
g_ctx = &ctx;
|
g_ctx = &ctx;
|
||||||
g_smpl = &smpl;
|
g_smpl = &smpl;
|
||||||
|
|
||||||
|
std::vector<common_chat_msg> chat_msgs;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model.get();
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: error: unable to load model\n", __func__);
|
||||||
@ -494,7 +494,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||||
if (decoder_start_token_id == -1) {
|
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||||
decoder_start_token_id = llama_token_bos(model);
|
decoder_start_token_id = llama_token_bos(model);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -831,7 +831,7 @@ int main(int argc, char ** argv) {
|
|||||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||||
if (need_insert_eot && format_chat) {
|
if (need_insert_eot && format_chat) {
|
||||||
llama_token eot = llama_token_eot(model);
|
llama_token eot = llama_token_eot(model);
|
||||||
embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
|
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
|
||||||
need_insert_eot = false;
|
need_insert_eot = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -889,9 +889,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_sampler_free(smpl);
|
common_sampler_free(smpl);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
ggml_threadpool_free_fn(threadpool);
|
ggml_threadpool_free_fn(threadpool);
|
||||||
|
@ -132,8 +132,8 @@ int main(int argc, char ** argv) {
|
|||||||
// load the target model
|
// load the target model
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
// load the prompts from an external file if there are any
|
// load the prompts from an external file if there are any
|
||||||
if (params.prompt.empty()) {
|
if (params.prompt.empty()) {
|
||||||
@ -416,9 +416,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
LOG("\n\n");
|
LOG("\n\n");
|
||||||
|
@ -63,7 +63,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(params);
|
llama_model_params model_params = common_model_params_to_llama(params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
@ -266,7 +266,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
|
@ -1987,8 +1987,9 @@ int main(int argc, char ** argv) {
|
|||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
@ -2023,9 +2024,6 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#include "common.h"
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "llama-impl.h"
|
#include "llama-context.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -9,11 +9,9 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <map>
|
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
@ -311,7 +309,7 @@ int main(int argc, char ** argv) {
|
|||||||
auto mparams = llama_model_default_params();
|
auto mparams = llama_model_default_params();
|
||||||
mparams.use_mlock = false;
|
mparams.use_mlock = false;
|
||||||
|
|
||||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||||
@ -325,18 +323,18 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto &tensors = llama_internal_get_tensor_map(ctx);
|
const auto & tensors = llama_internal_get_tensor_map(ctx);
|
||||||
|
|
||||||
// check layer tensors
|
// check layer tensors
|
||||||
int included_layers = 0;
|
int included_layers = 0;
|
||||||
int64_t max_nelements = 0;
|
int64_t max_nelements = 0;
|
||||||
bool is_f16 = false;
|
bool is_f16 = false;
|
||||||
for (const auto& kv_tensor : tensors) {
|
for (const auto & kv_tensor : tensors) {
|
||||||
if (!layer_included(params, kv_tensor.first)) {
|
if (!layer_included(params, kv_tensor.first)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -349,7 +347,7 @@ int main(int argc, char ** argv) {
|
|||||||
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
||||||
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
included_layers++;
|
included_layers++;
|
||||||
@ -382,7 +380,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
error_stats global_stats {};
|
error_stats global_stats {};
|
||||||
|
|
||||||
for (const auto& kv_tensor : tensors) {
|
for (const auto & kv_tensor : tensors) {
|
||||||
if (!layer_included(params, kv_tensor.first)) {
|
if (!layer_included(params, kv_tensor.first)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -411,7 +409,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_model_free(model);
|
||||||
// report timing
|
// report timing
|
||||||
{
|
{
|
||||||
const int64_t t_main_end_us = ggml_time_us();
|
const int64_t t_main_end_us = ggml_time_us();
|
||||||
|
@ -151,8 +151,8 @@ int main(int argc, char ** argv) {
|
|||||||
// load the model
|
// load the model
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
@ -298,7 +298,5 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
llama_batch_free(query_batch);
|
llama_batch_free(query_batch);
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
# include <windows.h>
|
# include <windows.h>
|
||||||
|
# include <io.h>
|
||||||
#else
|
#else
|
||||||
# include <sys/file.h>
|
# include <sys/file.h>
|
||||||
# include <sys/ioctl.h>
|
# include <sys/ioctl.h>
|
||||||
@ -10,6 +11,8 @@
|
|||||||
# include <curl/curl.h>
|
# include <curl/curl.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <signal.h>
|
||||||
|
|
||||||
#include <climits>
|
#include <climits>
|
||||||
#include <cstdarg>
|
#include <cstdarg>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
@ -24,6 +27,13 @@
|
|||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
#include "llama-cpp.h"
|
#include "llama-cpp.h"
|
||||||
|
|
||||||
|
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
|
||||||
|
[[noreturn]] static void sigint_handler(int) {
|
||||||
|
printf("\n");
|
||||||
|
exit(0); // not ideal, but it's the only way to guarantee exit in all cases
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
GGML_ATTRIBUTE_FORMAT(1, 2)
|
GGML_ATTRIBUTE_FORMAT(1, 2)
|
||||||
static std::string fmt(const char * fmt, ...) {
|
static std::string fmt(const char * fmt, ...) {
|
||||||
va_list ap;
|
va_list ap;
|
||||||
@ -82,6 +92,7 @@ class Opt {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
|
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
|
||||||
|
ctx_params.n_ctx = ctx_params.n_batch;
|
||||||
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
|
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
|
||||||
temperature = temperature >= 0 ? temperature : temperature_default;
|
temperature = temperature >= 0 ? temperature : temperature_default;
|
||||||
|
|
||||||
@ -253,7 +264,7 @@ class File {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
OVERLAPPED overlapped = { 0 };
|
OVERLAPPED overlapped = {};
|
||||||
if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
|
if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
|
||||||
&overlapped)) {
|
&overlapped)) {
|
||||||
fd = -1;
|
fd = -1;
|
||||||
@ -277,7 +288,7 @@ class File {
|
|||||||
if (fd >= 0) {
|
if (fd >= 0) {
|
||||||
# ifdef _WIN32
|
# ifdef _WIN32
|
||||||
if (hFile != INVALID_HANDLE_VALUE) {
|
if (hFile != INVALID_HANDLE_VALUE) {
|
||||||
OVERLAPPED overlapped = { 0 };
|
OVERLAPPED overlapped = {};
|
||||||
UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
|
UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
|
||||||
}
|
}
|
||||||
# else
|
# else
|
||||||
@ -293,7 +304,7 @@ class File {
|
|||||||
private:
|
private:
|
||||||
int fd = -1;
|
int fd = -1;
|
||||||
# ifdef _WIN32
|
# ifdef _WIN32
|
||||||
HANDLE hFile;
|
HANDLE hFile = nullptr;
|
||||||
# endif
|
# endif
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -464,7 +475,7 @@ class HttpClient {
|
|||||||
return (now_downloaded_plus_file_size * 100) / total_to_download;
|
return (now_downloaded_plus_file_size * 100) / total_to_download;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); }
|
static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
|
||||||
|
|
||||||
static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
|
static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
|
||||||
const auto now = std::chrono::steady_clock::now();
|
const auto now = std::chrono::steady_clock::now();
|
||||||
@ -663,7 +674,7 @@ class LlamaData {
|
|||||||
"\r%*s"
|
"\r%*s"
|
||||||
"\rLoading model",
|
"\rLoading model",
|
||||||
get_terminal_width(), " ");
|
get_terminal_width(), " ");
|
||||||
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
|
llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
|
||||||
if (!model) {
|
if (!model) {
|
||||||
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
|
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
|
||||||
}
|
}
|
||||||
@ -799,7 +810,20 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
|
|||||||
|
|
||||||
static int read_user_input(std::string & user) {
|
static int read_user_input(std::string & user) {
|
||||||
std::getline(std::cin, user);
|
std::getline(std::cin, user);
|
||||||
return user.empty(); // Should have data in happy path
|
if (std::cin.eof()) {
|
||||||
|
printf("\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (user == "/bye") {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (user.empty()) {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0; // Should have data in happy path
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function to generate a response based on the prompt
|
// Function to generate a response based on the prompt
|
||||||
@ -866,7 +890,25 @@ static bool is_stdout_a_terminal() {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function to tokenize the prompt
|
// Function to handle user input
|
||||||
|
static int get_user_input(std::string & user_input, const std::string & user) {
|
||||||
|
while (true) {
|
||||||
|
const int ret = handle_user_input(user_input, user);
|
||||||
|
if (ret == 1) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ret == 2) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Main chat loop function
|
||||||
static int chat_loop(LlamaData & llama_data, const std::string & user) {
|
static int chat_loop(LlamaData & llama_data, const std::string & user) {
|
||||||
int prev_len = 0;
|
int prev_len = 0;
|
||||||
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
|
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
|
||||||
@ -874,7 +916,8 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) {
|
|||||||
while (true) {
|
while (true) {
|
||||||
// Get user input
|
// Get user input
|
||||||
std::string user_input;
|
std::string user_input;
|
||||||
while (handle_user_input(user_input, user)) {
|
if (get_user_input(user_input, user) == 1) {
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
add_message("user", user.empty() ? user_input : user, llama_data);
|
add_message("user", user.empty() ? user_input : user, llama_data);
|
||||||
@ -915,7 +958,23 @@ static std::string read_pipe_data() {
|
|||||||
return result.str();
|
return result.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ctrl_c_handling() {
|
||||||
|
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
|
||||||
|
struct sigaction sigint_action;
|
||||||
|
sigint_action.sa_handler = sigint_handler;
|
||||||
|
sigemptyset(&sigint_action.sa_mask);
|
||||||
|
sigint_action.sa_flags = 0;
|
||||||
|
sigaction(SIGINT, &sigint_action, NULL);
|
||||||
|
#elif defined(_WIN32)
|
||||||
|
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||||
|
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||||
|
};
|
||||||
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, const char ** argv) {
|
int main(int argc, const char ** argv) {
|
||||||
|
ctrl_c_handling();
|
||||||
Opt opt;
|
Opt opt;
|
||||||
const int ret = opt.init(argc, argv);
|
const int ret = opt.init(argc, argv);
|
||||||
if (ret == 2) {
|
if (ret == 2) {
|
||||||
|
@ -30,8 +30,8 @@ int main(int argc, char ** argv) {
|
|||||||
// init
|
// init
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
if (model == nullptr || ctx == nullptr) {
|
if (model == nullptr || ctx == nullptr) {
|
||||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||||
@ -89,8 +89,6 @@ int main(int argc, char ** argv) {
|
|||||||
if (llama_decode(ctx, batch)) {
|
if (llama_decode(ctx, batch)) {
|
||||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
n_past += 1;
|
n_past += 1;
|
||||||
@ -98,11 +96,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
printf("\n\n");
|
printf("\n\n");
|
||||||
|
|
||||||
// free old context
|
|
||||||
llama_free(ctx);
|
|
||||||
|
|
||||||
// make new context
|
// make new context
|
||||||
auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
|
llama_context * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
|
||||||
|
|
||||||
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
@ -123,8 +118,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
|
if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
|
||||||
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||||
llama_free(ctx2);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,8 +141,6 @@ int main(int argc, char ** argv) {
|
|||||||
if (llama_decode(ctx2, batch)) {
|
if (llama_decode(ctx2, batch)) {
|
||||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
llama_free(ctx2);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
n_past += 1;
|
n_past += 1;
|
||||||
@ -157,15 +148,13 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
printf("\n\n");
|
printf("\n\n");
|
||||||
|
|
||||||
llama_free(ctx2);
|
|
||||||
|
|
||||||
if (result0 != result1) {
|
if (result0 != result1) {
|
||||||
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
|
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// make new context
|
// make new context
|
||||||
auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
|
llama_context * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
|
||||||
|
|
||||||
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
@ -186,8 +175,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
|
if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
|
||||||
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||||
llama_free(ctx3);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -204,8 +191,6 @@ int main(int argc, char ** argv) {
|
|||||||
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
|
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
|
||||||
if (ncopy != seq_store.size()) {
|
if (ncopy != seq_store.size()) {
|
||||||
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||||
llama_free(ctx3);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||||
@ -218,8 +203,6 @@ int main(int argc, char ** argv) {
|
|||||||
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
|
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
|
||||||
if (nset != seq_store.size()) {
|
if (nset != seq_store.size()) {
|
||||||
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||||
llama_free(ctx3);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
||||||
@ -239,8 +222,6 @@ int main(int argc, char ** argv) {
|
|||||||
if (llama_decode(ctx3, batch)) {
|
if (llama_decode(ctx3, batch)) {
|
||||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
llama_free(ctx3);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
n_past += 1;
|
n_past += 1;
|
||||||
@ -253,8 +234,6 @@ int main(int argc, char ** argv) {
|
|||||||
llama_sampler_free(smpl3);
|
llama_sampler_free(smpl3);
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
llama_free(ctx3);
|
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
if (result0 != result2) {
|
if (result0 != result2) {
|
||||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||||
|
@ -45,10 +45,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||||
| `-p, --prompt PROMPT` | prompt to start generation with |
|
|
||||||
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
|
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
|
||||||
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
|
||||||
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
|
||||||
| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
|
| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||||
| `--no-escape` | do not process escape sequences |
|
| `--no-escape` | do not process escape sequences |
|
||||||
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
|
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
|
||||||
@ -345,7 +342,7 @@ node index.js
|
|||||||
|
|
||||||
> [!IMPORTANT]
|
> [!IMPORTANT]
|
||||||
>
|
>
|
||||||
> This endpoint is **not** OAI-compatible
|
> This endpoint is **not** OAI-compatible. For an OAI-compatible client, use `/v1/completions` instead.
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
|
|
||||||
@ -452,6 +449,8 @@ These words will not be included in the completion, so make sure to add them to
|
|||||||
|
|
||||||
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
|
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
|
||||||
|
|
||||||
|
`lora`: A list of LoRA adapters to be applied to this specific request. Each object in the list must contain `id` and `scale` fields. For example: `[{"id": 0, "scale": 0.5}, {"id": 1, "scale": 1.1}]`. If a LoRA adapter is not specified in the list, its scale will default to `0.0`. Please note that requests with different LoRA configurations will not be batched together, which may result in performance degradation.
|
||||||
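
As an illustration, a minimal sketch combining the two fields above (it assumes a `llama-server` instance on `localhost:8080` with at least one adapter loaded via `--lora`; the prompt and scale values are made up):

```python
# Hedged sketch: trim the /completion response with `response_fields` and scale
# the first loaded LoRA adapter for this request only via `lora`.
# Assumes llama-server is running on localhost:8080 with one adapter loaded.
import requests

payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 32,
    "response_fields": ["content", "generation_settings/n_predict"],
    "lora": [{"id": 0, "scale": 0.5}],
}

res = requests.post("http://localhost:8080/completion", json=payload)
res.raise_for_status()
# Only "content" and the unnested "n_predict" field are returned.
print(res.json())
```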
|
|
||||||
**Response format**
|
**Response format**
|
||||||
|
|
||||||
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
|
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
|
||||||
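
As a minimal sketch (assuming a `llama-server` instance on `localhost:8080`), the stream can be consumed with plain HTTP instead of `EventSource`, since the latter cannot issue `POST` requests:

```python
# Hedged sketch: read the SSE stream of a /completion request line by line.
# Each event arrives as a "data: {...}" line; the final chunk sets "stop".
import json
import requests

payload = {"prompt": "The capital of France is", "n_predict": 16, "stream": True}

with requests.post("http://localhost:8080/completion", json=payload, stream=True) as res:
    for line in res.iter_lines():
        if not line.startswith(b"data: "):
            continue  # skip blank keep-alive lines and other SSE fields
        chunk = json.loads(line[len(b"data: "):])
        print(chunk.get("content", ""), end="", flush=True)
        if chunk.get("stop"):
            break
```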
@ -523,6 +522,7 @@ These words will not be included in the completion, so make sure to add them to
|
|||||||
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
|
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
|
||||||
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
|
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
|
||||||
|
|
||||||
|
|
||||||
### POST `/tokenize`: Tokenize a given text
|
### POST `/tokenize`: Tokenize a given text
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
@ -574,6 +574,10 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
|
|||||||
|
|
||||||
### POST `/embedding`: Generate embedding of a given text
|
### POST `/embedding`: Generate embedding of a given text
|
||||||
|
|
||||||
|
> [!IMPORTANT]
|
||||||
|
>
|
||||||
|
> This endpoint is **not** OAI-compatible. For an OAI-compatible client, use `/v1/embeddings` instead.
|
||||||
|
|
||||||
The same as [the embedding example](../embedding) does.
|
The same as [the embedding example](../embedding) does.
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
@ -744,96 +748,6 @@ To use this endpoint with POST method, you need to start server with `--props`
|
|||||||
|
|
||||||
- None yet
|
- None yet
|
||||||
|
|
||||||
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
|
|
||||||
|
|
||||||
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
|
|
||||||
|
|
||||||
*Options:*
|
|
||||||
|
|
||||||
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
|
|
||||||
|
|
||||||
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
|
|
||||||
|
|
||||||
*Examples:*
|
|
||||||
|
|
||||||
You can use either Python `openai` library with appropriate checkpoints:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import openai
|
|
||||||
|
|
||||||
client = openai.OpenAI(
|
|
||||||
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
|
|
||||||
api_key = "sk-no-key-required"
|
|
||||||
)
|
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
|
||||||
model="gpt-3.5-turbo",
|
|
||||||
messages=[
|
|
||||||
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
|
|
||||||
{"role": "user", "content": "Write a limerick about python exceptions"}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print(completion.choices[0].message)
|
|
||||||
```
|
|
||||||
|
|
||||||
... or raw HTTP requests:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl http://localhost:8080/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer no-key" \
|
|
||||||
-d '{
|
|
||||||
"model": "gpt-3.5-turbo",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"role": "system",
|
|
||||||
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Write a limerick about python exceptions"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
|
|
||||||
|
|
||||||
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
|
|
||||||
|
|
||||||
*Options:*
|
|
||||||
|
|
||||||
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
|
|
||||||
|
|
||||||
*Examples:*
|
|
||||||
|
|
||||||
- input as string
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl http://localhost:8080/v1/embeddings \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer no-key" \
|
|
||||||
-d '{
|
|
||||||
"input": "hello",
|
|
||||||
"model":"GPT-4",
|
|
||||||
"encoding_format": "float"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
- `input` as string array
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl http://localhost:8080/v1/embeddings \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer no-key" \
|
|
||||||
-d '{
|
|
||||||
"input": ["hello", "world"],
|
|
||||||
"model":"GPT-4",
|
|
||||||
"encoding_format": "float"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
### POST `/embeddings`: non-OpenAI-compatible embeddings API
|
### POST `/embeddings`: non-OpenAI-compatible embeddings API
|
||||||
|
|
||||||
This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using the Euclidean norm.
|
This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using the Euclidean norm.
|
||||||
@ -1030,6 +944,8 @@ This endpoint returns the loaded LoRA adapters. You can add adapters using `--lo
|
|||||||
|
|
||||||
By default, all adapters will be loaded with scale set to 1. To initialize all adapters scale to 0, add `--lora-init-without-apply`
|
By default, all adapters will be loaded with scale set to 1. To initialize all adapters scale to 0, add `--lora-init-without-apply`
|
||||||
|
|
||||||
|
Please note that this value will be overwritten by the `lora` field for each request.
|
||||||
|
|
||||||
If an adapter is disabled, the scale will be set to 0.
|
If an adapter is disabled, the scale will be set to 0.
|
||||||
|
|
||||||
**Response format**
|
**Response format**
|
||||||
@ -1051,6 +967,8 @@ If an adapter is disabled, the scale will be set to 0.
|
|||||||
|
|
||||||
### POST `/lora-adapters`: Set list of LoRA adapters
|
### POST `/lora-adapters`: Set list of LoRA adapters
|
||||||
|
|
||||||
|
This sets the global scale for LoRA adapters. Please note that this value will be overwritten by the `lora` field for each request.
|
||||||
|
|
||||||
To disable an adapter, either remove it from the list below, or set scale to 0.
|
To disable an adapter, either remove it from the list below, or set scale to 0.
|
||||||
|
|
||||||
**Request format**
|
**Request format**
|
||||||
@ -1064,6 +982,161 @@ To know the `id` of the adapter, use GET `/lora-adapters`
|
|||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## OpenAI-compatible API Endpoints
|
||||||
|
|
||||||
|
### GET `/v1/models`: OpenAI-compatible Model Info API
|
||||||
|
|
||||||
|
Returns information about the loaded model. See [OpenAI Models API documentation](https://platform.openai.com/docs/api-reference/models).
|
||||||
|
|
||||||
|
The returned list always contains a single element.
|
||||||
|
|
||||||
|
By default, the model `id` field is the path to the model file specified via `-m`. You can set a custom value for the model `id` field via the `--alias` argument. For example, `--alias gpt-4o-mini`.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"object": "list",
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"id": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
|
||||||
|
"object": "model",
|
||||||
|
"created": 1735142223,
|
||||||
|
"owned_by": "llamacpp",
|
||||||
|
"meta": {
|
||||||
|
"vocab_type": 2,
|
||||||
|
"n_vocab": 128256,
|
||||||
|
"n_ctx_train": 131072,
|
||||||
|
"n_embd": 4096,
|
||||||
|
"n_params": 8030261312,
|
||||||
|
"size": 4912898304
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
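
A short usage sketch with the `openai` Python client (it assumes a `llama-server` instance on `localhost:8080`; the printed `id` is the model path or the `--alias` value):

```python
# Hedged sketch: list the single model entry exposed by the server.
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

for model in client.models.list():
    print(model.id, model.owned_by)
```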
|
|
||||||
|
### POST `/v1/completions`: OpenAI-compatible Completions API
|
||||||
|
|
||||||
|
Given an input `prompt`, it returns the predicted completion. Streaming mode is also supported. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
See [OpenAI Completions API documentation](https://platform.openai.com/docs/api-reference/completions).
|
||||||
|
|
||||||
|
llama.cpp `/completion`-specific features such as `mirostat` are supported.
|
||||||
|
|
||||||
|
*Examples:*
|
||||||
|
|
||||||
|
Example usage with the `openai` Python library:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import openai
|
||||||
|
|
||||||
|
client = openai.OpenAI(
|
||||||
|
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
|
||||||
|
api_key = "sk-no-key-required"
|
||||||
|
)
|
||||||
|
|
||||||
|
completion = client.completions.create(
|
||||||
|
model="davinci-002",
|
||||||
|
prompt="I believe the meaning of life is",
|
||||||
|
max_tokens=8
|
||||||
|
)
|
||||||
|
|
||||||
|
print(completion.choices[0].text)
|
||||||
|
```
|
||||||
|
|
||||||
|
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
|
||||||
|
|
||||||
|
Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
|
||||||
|
|
||||||
|
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants", "type": "array" } } } }`), similar to other OpenAI-inspired API providers; see the sketch after the raw HTTP example below.
|
||||||
|
|
||||||
|
*Examples:*
|
||||||
|
|
||||||
|
You can use either Python `openai` library with appropriate checkpoints:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import openai
|
||||||
|
|
||||||
|
client = openai.OpenAI(
|
||||||
|
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
|
||||||
|
api_key = "sk-no-key-required"
|
||||||
|
)
|
||||||
|
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
|
||||||
|
{"role": "user", "content": "Write a limerick about python exceptions"}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
print(completion.choices[0].message)
|
||||||
|
```
|
||||||
|
|
||||||
|
... or raw HTTP requests:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl http://localhost:8080/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer no-key" \
|
||||||
|
-d '{
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Write a limerick about python exceptions"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
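
As a sketch of the `response_format` parameter described above (assuming a `llama-server` instance on `localhost:8080`; the meeting schema is a made-up illustration):

```python
# Hedged sketch: request schema-constrained JSON output via response_format.
import requests

payload = {
    "messages": [{"role": "user", "content": "Invent a short meeting and describe it."}],
    "response_format": {
        "type": "json_schema",
        "schema": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "date": {"type": "string"},
                "participants": {"type": "array", "items": {"type": "string"}},
            },
            "required": ["name", "date", "participants"],
        },
    },
}

res = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
res.raise_for_status()
# The returned message content is a JSON string matching the schema.
print(res.json()["choices"][0]["message"]["content"])
```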
|
|
||||||
|
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
|
||||||
|
|
||||||
|
This endpoint requires that the model uses a pooling type other than `none`. The embeddings are normalized using the Euclidean norm.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
|
||||||
|
|
||||||
|
*Examples:*
|
||||||
|
|
||||||
|
- input as string
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl http://localhost:8080/v1/embeddings \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer no-key" \
|
||||||
|
-d '{
|
||||||
|
"input": "hello",
|
||||||
|
"model":"GPT-4",
|
||||||
|
"encoding_format": "float"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
- `input` as string array
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl http://localhost:8080/v1/embeddings \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer no-key" \
|
||||||
|
-d '{
|
||||||
|
"input": ["hello", "world"],
|
||||||
|
"model":"GPT-4",
|
||||||
|
"encoding_format": "float"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
## More examples
|
## More examples
|
||||||
|
|
||||||
### Interactive mode
|
### Interactive mode
|
||||||
|
@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
|
|||||||
|
|
||||||
SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
|
SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
|
||||||
|
|
||||||
Example:
|
Example (assuming golang >= 1.21 is installed):
|
||||||
```shell
|
```shell
|
||||||
go install go.k6.io/xk6/cmd/xk6@latest
|
go install go.k6.io/xk6/cmd/xk6@latest
|
||||||
xk6 build master \
|
$GOPATH/bin/xk6 build master \
|
||||||
--with github.com/phymbert/xk6-sse
|
--with github.com/phymbert/xk6-sse
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
```shell
|
```shell
|
||||||
server --host localhost --port 8080 \
|
llama-server --host localhost --port 8080 \
|
||||||
--model ggml-model-q4_0.gguf \
|
--model ggml-model-q4_0.gguf \
|
||||||
--cont-batching \
|
--cont-batching \
|
||||||
--metrics \
|
--metrics \
|
||||||
|
@ -189,12 +189,12 @@ xychart-beta
|
|||||||
"pp": {
|
"pp": {
|
||||||
"p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
|
"p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
|
||||||
"avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
|
"avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
|
||||||
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
|
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
|
||||||
},
|
},
|
||||||
"tg": {
|
"tg": {
|
||||||
"p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
|
"p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
|
||||||
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
|
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
|
||||||
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
|
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
with open("results.github.env", 'a') as github_env:
|
with open("results.github.env", 'a') as github_env:
|
||||||
@ -214,11 +214,14 @@ def start_benchmark(args):
|
|||||||
k6_args = [
|
k6_args = [
|
||||||
'run', args.scenario,
|
'run', args.scenario,
|
||||||
'--no-color',
|
'--no-color',
|
||||||
|
'--no-connection-reuse',
|
||||||
|
'--no-vu-connection-reuse',
|
||||||
]
|
]
|
||||||
k6_args.extend(['--duration', args.duration])
|
k6_args.extend(['--duration', args.duration])
|
||||||
k6_args.extend(['--iterations', args.n_prompts])
|
k6_args.extend(['--iterations', args.n_prompts])
|
||||||
k6_args.extend(['--vus', args.parallel])
|
k6_args.extend(['--vus', args.parallel])
|
||||||
k6_args.extend(['--summary-export', 'k6-results.json'])
|
k6_args.extend(['--summary-export', 'k6-results.json'])
|
||||||
|
k6_args.extend(['--out', 'csv=k6-results.csv'])
|
||||||
args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
|
args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
|
||||||
args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
|
args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
|
||||||
print(f"bench: starting k6 with: {args}")
|
print(f"bench: starting k6 with: {args}")
|
||||||
@ -231,7 +234,7 @@ def start_server(args):
|
|||||||
server_process = start_server_background(args)
|
server_process = start_server_background(args)
|
||||||
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
max_attempts = 20
|
max_attempts = 600
|
||||||
if 'GITHUB_ACTIONS' in os.environ:
|
if 'GITHUB_ACTIONS' in os.environ:
|
||||||
max_attempts *= 2
|
max_attempts *= 2
|
||||||
|
|
||||||
@ -242,7 +245,15 @@ def start_server(args):
|
|||||||
print(f"bench: waiting for server to start ...")
|
print(f"bench: waiting for server to start ...")
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
print("bench: server started.")
|
attempts = 0
|
||||||
|
while not is_server_ready(args.host, args.port):
|
||||||
|
attempts += 1
|
||||||
|
if attempts > max_attempts:
|
||||||
|
assert False, "server not ready"
|
||||||
|
print(f"bench: waiting for server to be ready ...")
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
print("bench: server started and ready.")
|
||||||
return server_process
|
return server_process
|
||||||
|
|
||||||
|
|
||||||
@ -255,11 +266,6 @@ def start_server_background(args):
|
|||||||
'--host', args.host,
|
'--host', args.host,
|
||||||
'--port', args.port,
|
'--port', args.port,
|
||||||
]
|
]
|
||||||
model_file = args.model_path_prefix + os.path.sep + args.hf_file
|
|
||||||
model_dir = os.path.dirname(model_file)
|
|
||||||
if not os.path.exists(model_dir):
|
|
||||||
os.makedirs(model_dir)
|
|
||||||
server_args.extend(['--model', model_file])
|
|
||||||
server_args.extend(['--hf-repo', args.hf_repo])
|
server_args.extend(['--hf-repo', args.hf_repo])
|
||||||
server_args.extend(['--hf-file', args.hf_file])
|
server_args.extend(['--hf-file', args.hf_file])
|
||||||
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
|
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
|
||||||
@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
|
|||||||
return _is_server_listening
|
return _is_server_listening
|
||||||
|
|
||||||
|
|
||||||
|
def is_server_ready(server_fqdn, server_port):
|
||||||
|
url = f"http://{server_fqdn}:{server_port}/health"
|
||||||
|
response = requests.get(url)
|
||||||
|
return response.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
def escape_metric_name(metric_name):
|
def escape_metric_name(metric_name):
|
||||||
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
|
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
|
||||||
|
|
||||||
|
@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
|
|||||||
|
|
||||||
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
|
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
|
||||||
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
|
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
|
||||||
|
const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
|
||||||
|
|
||||||
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
|
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
|
||||||
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
|
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
|
||||||
@ -89,6 +90,9 @@ export default function () {
|
|||||||
],
|
],
|
||||||
"model": model,
|
"model": model,
|
||||||
"stream": true,
|
"stream": true,
|
||||||
|
"stream_options": {
|
||||||
|
"include_usage": true, // False to be supported in llama.cpp server
|
||||||
|
},
|
||||||
"seed": 42,
|
"seed": 42,
|
||||||
"max_tokens": max_tokens,
|
"max_tokens": max_tokens,
|
||||||
"stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
|
"stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
|
||||||
@ -105,13 +109,21 @@ export default function () {
|
|||||||
client.on('event', function (event) {
|
client.on('event', function (event) {
|
||||||
if (promptEvalEndTime == null) {
|
if (promptEvalEndTime == null) {
|
||||||
promptEvalEndTime = new Date()
|
promptEvalEndTime = new Date()
|
||||||
|
llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (event.data === '[DONE]' || event.data === '') {
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
let chunk = JSON.parse(event.data)
|
let chunk = JSON.parse(event.data)
|
||||||
|
|
||||||
|
if (chunk.choices && chunk.choices.length > 0) {
|
||||||
let choice = chunk.choices[0]
|
let choice = chunk.choices[0]
|
||||||
if (choice.finish_reason) {
|
if (choice.finish_reason) {
|
||||||
finish_reason = choice.finish_reason
|
finish_reason = choice.finish_reason
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (chunk.usage) {
|
if (chunk.usage) {
|
||||||
prompt_tokens = chunk.usage.prompt_tokens
|
prompt_tokens = chunk.usage.prompt_tokens
|
||||||
|
Binary file not shown.
@ -67,6 +67,13 @@ enum server_task_type {
|
|||||||
SERVER_TASK_TYPE_SET_LORA,
|
SERVER_TASK_TYPE_SET_LORA,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum oaicompat_type {
|
||||||
|
OAICOMPAT_TYPE_NONE,
|
||||||
|
OAICOMPAT_TYPE_CHAT,
|
||||||
|
OAICOMPAT_TYPE_COMPLETION,
|
||||||
|
OAICOMPAT_TYPE_EMBEDDING,
|
||||||
|
};
|
||||||
|
|
||||||
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
|
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
|
||||||
enum error_type {
|
enum error_type {
|
||||||
ERROR_TYPE_INVALID_REQUEST,
|
ERROR_TYPE_INVALID_REQUEST,
|
||||||
@ -91,6 +98,8 @@ struct slot_params {
|
|||||||
int64_t t_max_prompt_ms = -1; // TODO: implement
|
int64_t t_max_prompt_ms = -1; // TODO: implement
|
||||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||||
|
|
||||||
|
std::vector<common_lora_adapter_info> lora;
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
std::vector<std::string> antiprompt;
|
||||||
std::vector<std::string> response_fields;
|
std::vector<std::string> response_fields;
|
||||||
bool timings_per_token = false;
|
bool timings_per_token = false;
|
||||||
@ -102,8 +111,7 @@ struct slot_params {
|
|||||||
|
|
||||||
// OAI-compat fields
|
// OAI-compat fields
|
||||||
bool verbose = false;
|
bool verbose = false;
|
||||||
bool oaicompat = false;
|
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
|
||||||
bool oaicompat_chat = true;
|
|
||||||
std::string oaicompat_model;
|
std::string oaicompat_model;
|
||||||
std::string oaicompat_cmpl_id;
|
std::string oaicompat_cmpl_id;
|
||||||
|
|
||||||
@ -114,6 +122,11 @@ struct slot_params {
|
|||||||
samplers.emplace_back(common_sampler_type_to_str(sampler));
|
samplers.emplace_back(common_sampler_type_to_str(sampler));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
json lora = json::array();
|
||||||
|
for (size_t i = 0; i < this->lora.size(); ++i) {
|
||||||
|
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
|
||||||
|
}
|
||||||
|
|
||||||
return json {
|
return json {
|
||||||
{"n_predict", n_predict}, // Server configured n_predict
|
{"n_predict", n_predict}, // Server configured n_predict
|
||||||
{"seed", sampling.seed},
|
{"seed", sampling.seed},
|
||||||
@ -154,6 +167,7 @@ struct slot_params {
|
|||||||
{"speculative.p_min", speculative.p_min},
|
{"speculative.p_min", speculative.p_min},
|
||||||
{"timings_per_token", timings_per_token},
|
{"timings_per_token", timings_per_token},
|
||||||
{"post_sampling_probs", post_sampling_probs},
|
{"post_sampling_probs", post_sampling_probs},
|
||||||
|
{"lora", lora},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -183,6 +197,9 @@ struct server_task {
|
|||||||
// used by SERVER_TASK_TYPE_METRICS
|
// used by SERVER_TASK_TYPE_METRICS
|
||||||
bool metrics_reset_bucket = false;
|
bool metrics_reset_bucket = false;
|
||||||
|
|
||||||
|
// used by SERVER_TASK_TYPE_SET_LORA
|
||||||
|
std::vector<common_lora_adapter_info> set_lora;
|
||||||
|
|
||||||
server_task(server_task_type type) : type(type) {}
|
server_task(server_task_type type) : type(type) {}
|
||||||
|
|
||||||
static slot_params params_from_json_cmpl(
|
static slot_params params_from_json_cmpl(
|
||||||
@ -245,6 +262,16 @@ struct server_task {
|
|||||||
params.speculative.n_min = std::max(params.speculative.n_min, 2);
|
params.speculative.n_min = std::max(params.speculative.n_min, 2);
|
||||||
params.speculative.n_max = std::max(params.speculative.n_max, 0);
|
params.speculative.n_max = std::max(params.speculative.n_max, 0);
|
||||||
|
|
||||||
|
if (data.contains("lora")) {
|
||||||
|
if (data.at("lora").is_array()) {
|
||||||
|
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
params.lora = params_base.lora_adapters;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: add more sanity checks for the input parameters
|
// TODO: add more sanity checks for the input parameters
|
||||||
|
|
||||||
if (params.sampling.penalty_last_n < -1) {
|
if (params.sampling.penalty_last_n < -1) {
|
||||||
@ -530,8 +557,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||||||
|
|
||||||
// OAI-compat fields
|
// OAI-compat fields
|
||||||
bool verbose = false;
|
bool verbose = false;
|
||||||
bool oaicompat = false;
|
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
|
||||||
bool oaicompat_chat = true; // TODO: support oaicompat for non-chat
|
|
||||||
std::string oaicompat_model;
|
std::string oaicompat_model;
|
||||||
std::string oaicompat_cmpl_id;
|
std::string oaicompat_cmpl_id;
|
||||||
|
|
||||||
@ -544,9 +570,16 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||||||
}
|
}
|
||||||
|
|
||||||
virtual json to_json() override {
|
virtual json to_json() override {
|
||||||
return oaicompat
|
switch (oaicompat) {
|
||||||
? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat())
|
case OAICOMPAT_TYPE_NONE:
|
||||||
: to_json_non_oaicompat();
|
return to_json_non_oaicompat();
|
||||||
|
case OAICOMPAT_TYPE_COMPLETION:
|
||||||
|
return to_json_oaicompat();
|
||||||
|
case OAICOMPAT_TYPE_CHAT:
|
||||||
|
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false && "Invalid oaicompat_type");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
json to_json_non_oaicompat() {
|
json to_json_non_oaicompat() {
|
||||||
@@ -574,6 +607,50 @@ struct server_task_result_cmpl_final : server_task_result {
    return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
}

+json to_json_oaicompat() {
+    std::time_t t = std::time(0);
+    json logprobs = json(nullptr); // OAI default to null
+    if (!stream && probs_output.size() > 0) {
+        logprobs = json{
+            {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
+        };
+    }
+    json finish_reason = "length";
+    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+        finish_reason = "stop";
+    }
+    json res = json {
+        {"choices", json::array({
+            json{
+                {"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
+                {"index", index},
+                {"logprobs", logprobs},
+                {"finish_reason", finish_reason},
+            }
+        })},
+        {"created", t},
+        {"model", oaicompat_model},
+        {"system_fingerprint", build_info},
+        {"object", "text_completion"},
+        {"usage", json {
+            {"completion_tokens", n_decoded},
+            {"prompt_tokens", n_prompt_tokens},
+            {"total_tokens", n_decoded + n_prompt_tokens}
+        }},
+        {"id", oaicompat_cmpl_id}
+    };

+    // extra fields for debugging purposes
+    if (verbose) {
+        res["__verbose"] = to_json_non_oaicompat();
+    }
+    if (timings.prompt_n >= 0) {
+        res.push_back({"timings", timings.to_json()});
+    }

+    return res;
+}

json to_json_oaicompat_chat() {
    std::string finish_reason = "length";
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@@ -672,8 +749,7 @@ struct server_task_result_cmpl_partial : server_task_result {

// OAI-compat fields
bool verbose = false;
-bool oaicompat = false;
-bool oaicompat_chat = true; // TODO: support oaicompat for non-chat
+oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;

@@ -686,7 +762,16 @@ struct server_task_result_cmpl_partial : server_task_result {
}

virtual json to_json() override {
-    return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat();
+    switch (oaicompat) {
+        case OAICOMPAT_TYPE_NONE:
+            return to_json_non_oaicompat();
+        case OAICOMPAT_TYPE_COMPLETION:
+            return to_json_oaicompat();
+        case OAICOMPAT_TYPE_CHAT:
+            return to_json_oaicompat_chat();
+        default:
+            GGML_ASSERT(false && "Invalid oaicompat_type");
+    }
}

json to_json_non_oaicompat() {
@@ -711,6 +796,41 @@ struct server_task_result_cmpl_partial : server_task_result {
}

json to_json_oaicompat() {
+    std::time_t t = std::time(0);
+    json logprobs = json(nullptr); // OAI default to null
+    if (prob_output.probs.size() > 0) {
+        logprobs = json{
+            {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+        };
+    }
+    json res = json {
+        {"choices", json::array({
+            json{
+                {"text", content},
+                {"index", index},
+                {"logprobs", logprobs},
+                {"finish_reason", nullptr},
+            }
+        })},
+        {"created", t},
+        {"model", oaicompat_model},
+        {"system_fingerprint", build_info},
+        {"object", "text_completion"},
+        {"id", oaicompat_cmpl_id}
+    };

+    // extra fields for debugging purposes
+    if (verbose) {
+        res["__verbose"] = to_json_non_oaicompat();
+    }
+    if (timings.prompt_n >= 0) {
+        res.push_back({"timings", timings.to_json()});
+    }

+    return res;
+}

+json to_json_oaicompat_chat() {
    bool first = n_decoded == 0;
    std::time_t t = std::time(0);
    json choices;
@@ -789,14 +909,16 @@ struct server_task_result_embd : server_task_result {
int32_t n_tokens;

// OAI-compat fields
-bool oaicompat = false;
+oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;

virtual int get_index() override {
    return index;
}

virtual json to_json() override {
-    return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat();
+    return oaicompat == OAICOMPAT_TYPE_EMBEDDING
+        ? to_json_oaicompat()
+        : to_json_non_oaicompat();
}

json to_json_non_oaicompat() {
@@ -1009,6 +1131,8 @@ struct server_slot {

common_speculative * spec = nullptr;

+std::vector<common_lora_adapter_info> lora;

// the index relative to completion multi-task request
size_t index = 0;

@@ -1090,6 +1214,11 @@ struct server_slot {
    return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
}

+bool can_batch_with(server_slot & other_slot) {
+    return is_non_causal() == other_slot.is_non_causal()
+        && are_lora_equal(lora, other_slot.lora);
+}

bool has_budget(const common_params & global_params) {
    if (params.n_predict == -1 && global_params.n_predict == -1) {
        return true; // limitless
@@ -1497,11 +1626,15 @@ struct server_response {
struct server_context {
    common_params params_base;

+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
-    std::vector<common_lora_adapter_container> loras;

    llama_model * model_dft = nullptr;

    llama_context_params cparams_dft;

    llama_batch batch = {};
@@ -1525,21 +1658,6 @@ struct server_context {
    float slot_prompt_similarity = 0.0f;

    ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }

-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }

-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }

        // Clear any sampling context
        for (server_slot & slot : slots) {
            common_sampler_free(slot.smpl);
@@ -1562,11 +1680,10 @@ struct server_context {

params_base = params;

-common_init_result llama_init = common_init_from_params(params_base);
+llama_init = common_init_from_params(params_base);

-model = llama_init.model;
-ctx = llama_init.context;
-loras = llama_init.lora_adapters;
+model = llama_init.model.get();
+ctx = llama_init.context.get();

if (model == nullptr) {
    SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
@@ -1589,25 +1706,22 @@ struct server_context {
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;

-common_init_result llama_init_dft = common_init_from_params(params_dft);
+llama_init_dft = common_init_from_params(params_dft);

-model_dft = llama_init_dft.model;
+model_dft = llama_init_dft.model.get();

if (model_dft == nullptr) {
    SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
    return false;
}

-if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
    SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());

-    llama_free (llama_init_dft.context);
-    llama_free_model(llama_init_dft.model);

    return false;
}

-const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());

cparams_dft = common_context_params_to_llama(params_dft);
cparams_dft.n_batch = n_ctx_dft;
@@ -1615,26 +1729,16 @@ struct server_context {
    // force F16 KV cache for the draft model for extra performance
    cparams_dft.type_k = GGML_TYPE_F16;
    cparams_dft.type_v = GGML_TYPE_F16;

-    // the context is not needed - we will create one for each slot
-    llama_free(llama_init_dft.context);
}

return true;
}

-bool validate_model_chat_template() const {
+bool validate_builtin_chat_template() const {
-    std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-    std::string template_key = "tokenizer.chat_template";
-    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-    if (res >= 0) {
    llama_chat_message chat[] = {{"user", "test"}};
-    std::string tmpl = std::string(model_template.data(), model_template.size());
-    int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
    return chat_res > 0;
}
-    return false;
-}

void init() {
    const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
@@ -1772,6 +1876,12 @@ struct server_context {
slot.params = std::move(task.params);
slot.prompt_tokens = std::move(task.prompt_tokens);

+if (!are_lora_equal(task.params.lora, slot.lora)) {
+    // if lora is changed, we cannot reuse cached tokens
+    slot.cache_tokens.clear();
+    slot.lora = task.params.lora;
+}

SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2044,7 +2154,6 @@ struct server_context {

res->verbose = slot.params.verbose;
res->oaicompat = slot.params.oaicompat;
-res->oaicompat_chat = slot.params.oaicompat_chat;
res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;

@@ -2085,7 +2194,6 @@ struct server_context {
res->verbose = slot.params.verbose;
res->stream = slot.params.stream;
res->oaicompat = slot.params.oaicompat;
-res->oaicompat_chat = slot.params.oaicompat_chat;
res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;

@@ -2465,7 +2573,7 @@ struct server_context {
} break;
case SERVER_TASK_TYPE_SET_LORA:
    {
-        common_lora_adapters_apply(ctx, loras);
+        params_base.lora_adapters = std::move(task.set_lora);
        auto res = std::make_unique<server_task_result_apply_lora>();
        res->id = task.id;
        queue_results.send(std::move(res));
@@ -2542,12 +2650,22 @@ struct server_context {
// start populating the batch for this iteration
common_batch_clear(batch);

+// track if given slot can be batched with slots already in the batch
+server_slot * slot_batched = nullptr;

// frist, add sampled tokens from any ongoing sequences
for (auto & slot : slots) {
    if (slot.state != SLOT_STATE_GENERATING) {
        continue;
    }

+    // check if we can batch this slot with the previous one
+    if (!slot_batched) {
+        slot_batched = &slot;
+    } else if (!slot_batched->can_batch_with(slot)) {
+        continue;
+    }

    slot.i_batch = batch.n_tokens;

    common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
@@ -2566,15 +2684,18 @@ struct server_context {
int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx);

-// track if this is an embedding or non-embedding batch
-// if we've added sampled tokens above, we are in non-embedding mode
-// -1: none, 0: non-embedding, 1: embedding
-// TODO: make enum
-int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;

// next, batch any pending prompts without exceeding n_batch
if (params_base.cont_batching || batch.n_tokens == 0) {
    for (auto & slot : slots) {
+        // check if we can batch this slot with the previous one
+        if (slot.is_processing()) {
+            if (!slot_batched) {
+                slot_batched = &slot;
+            } else if (!slot_batched->can_batch_with(slot)) {
+                continue;
+            }
+        }

        // this slot still has a prompt to be processed
        if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
            auto & prompt_tokens = slot.prompt_tokens;
@@ -2735,14 +2856,6 @@ struct server_context {
    }
}

-// check that we are in the right batch_type, if not defer the slot
-int slot_type = slot.is_non_causal();
-if (batch_type == -1) {
-    batch_type = slot_type;
-} else if (batch_type != slot_type) {
-    continue;
-}

// keep only the common part
if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
    // could not partially delete (likely using a non-Transformer model)
@@ -2810,8 +2923,12 @@ struct server_context {

SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);

+if (slot_batched) {
    // make sure we're in the right embedding mode
-    llama_set_embeddings(ctx, batch_type == 1);
+    llama_set_embeddings(ctx, slot_batched->is_non_causal());
+    // apply lora, only need to do it once per batch
+    common_lora_adapters_apply(ctx, slot_batched->lora);
+}

// process the created batch of tokens
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
@@ -3484,7 +3601,7 @@ int main(int argc, char ** argv) {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model },
-{ "chat_template", llama_get_chat_template(ctx_server.model) },
+{ "chat_template", common_get_builtin_chat_template(ctx_server.model) },
{ "build_info", build_info },
};

@@ -3506,12 +3623,11 @@ int main(int argc, char ** argv) {

// handle completion-like requests (completion, chat, infill)
// we can optionally provide a custom format for partial results and final results
-const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](
+const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
        server_task_type type,
        json & data,
        httplib::Response & res,
-        bool oaicompat = false,
-        bool oaicompat_chat = false) {
+        oaicompat_type oaicompat) {
    GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);

    if (ctx_server.params_base.embedding) {
@@ -3532,12 +3648,15 @@ int main(int argc, char ** argv) {
task.index = i;

task.prompt_tokens = std::move(tokenized_prompts[i]);
-task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data);
+task.params = server_task::params_from_json_cmpl(
+    ctx_server.model,
+    ctx_server.ctx,
+    ctx_server.params_base,
+    data);
task.id_selected_slot = json_value(data, "id_slot", -1);

// OAI-compat
task.params.oaicompat = oaicompat;
-task.params.oaicompat_chat = oaicompat_chat;
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl

@@ -3589,7 +3708,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) {
    server_sent_event(sink, "error", error_data);
});
-if (oaicompat) {
+if (oaicompat != OAICOMPAT_TYPE_NONE) {
    static const std::string ev_done = "data: [DONE]\n\n";
    sink.write(ev_done.data(), ev_done.size());
}
@@ -3605,17 +3724,25 @@ int main(int argc, char ** argv) {
    }
};

-const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
    json data = json::parse(req.body);
-    return handle_completions_generic(
+    return handle_completions_impl(
        SERVER_TASK_TYPE_COMPLETION,
        data,
        res,
-        /* oaicompat */ false,
-        /* oaicompat_chat */ false);
+        OAICOMPAT_TYPE_NONE);
};

-const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+    json data = oaicompat_completion_params_parse(json::parse(req.body));
+    return handle_completions_impl(
+        SERVER_TASK_TYPE_COMPLETION,
+        data,
+        res,
+        OAICOMPAT_TYPE_COMPLETION);
+};

+const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
    // check model compatibility
    std::string err;
    if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
@@ -3670,7 +3797,7 @@ int main(int argc, char ** argv) {
data["input_extra"] = input_extra; // default to empty array if it's not exist

std::string prompt = json_value(data, "prompt", std::string());
-std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
data["prompt"] = format_infill(
    ctx_server.ctx,
@@ -3684,22 +3811,25 @@ int main(int argc, char ** argv) {
    tokenized_prompts[0]
);

-return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res);
+return handle_completions_impl(
+    SERVER_TASK_TYPE_INFILL,
+    data,
+    res,
+    OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
};

-const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
    if (ctx_server.params_base.embedding) {
        res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
        return;
    }

-    json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
-    return handle_completions_generic(
+    json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
+    return handle_completions_impl(
        SERVER_TASK_TYPE_COMPLETION,
        data,
        res,
-        /* oaicompat */ true,
-        /* oaicompat_chat */ true);
+        OAICOMPAT_TYPE_CHAT);
};

const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
@@ -3772,10 +3902,10 @@ int main(int argc, char ** argv) {
    res_ok(res, data);
};

-const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) {
+const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
    const json body = json::parse(req.body);

-    if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+    if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
        res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
        return;
    }
@@ -3785,7 +3915,7 @@ int main(int argc, char ** argv) {
if (body.count("input") != 0) {
    prompt = body.at("input");
} else if (body.contains("content")) {
-    oaicompat = false;
+    oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible
    prompt = body.at("content");
} else {
    res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
@@ -3854,16 +3984,18 @@ int main(int argc, char ** argv) {
}

// write JSON response
-json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses);
+json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING
+    ? format_embeddings_response_oaicompat(body, responses, use_base64)
+    : json(responses);
res_ok(res, root);
};

const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
-    handle_embeddings_impl(req, res, false);
+    handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
};

const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
-    handle_embeddings_impl(req, res, true);
+    handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING);
};

const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
|||||||
|
|
||||||
const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
|
const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
|
||||||
json result = json::array();
|
json result = json::array();
|
||||||
for (size_t i = 0; i < ctx_server.loras.size(); ++i) {
|
const auto & loras = ctx_server.params_base.lora_adapters;
|
||||||
auto & lora = ctx_server.loras[i];
|
for (size_t i = 0; i < loras.size(); ++i) {
|
||||||
|
auto & lora = loras[i];
|
||||||
result.push_back({
|
result.push_back({
|
||||||
{"id", i},
|
{"id", i},
|
||||||
{"path", lora.path},
|
{"path", lora.path},
|
||||||
@@ -3959,27 +4092,14 @@ int main(int argc, char ** argv) {
};

const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
-    const std::vector<json> body = json::parse(req.body);
-    int max_idx = ctx_server.loras.size();
+    const json body = json::parse(req.body);
+    if (!body.is_array()) {
+        res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
+        return;
-    // clear existing value
-    for (auto & lora : ctx_server.loras) {
-        lora.scale = 0.0f;
    }

-    // set value
-    for (auto entry : body) {
-        int id = entry.at("id");
-        float scale = entry.at("scale");
-        if (0 <= id && id < max_idx) {
-            ctx_server.loras[id].scale = scale;
-        } else {
-            throw std::runtime_error("invalid adapter id");
-        }
-    }

    server_task task(SERVER_TASK_TYPE_SET_LORA);
    task.id = ctx_server.queue_tasks.get_new_id();
+    task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
    ctx_server.queue_results.add_waiting_task_id(task.id);
    ctx_server.queue_tasks.post(task);

@@ -4033,7 +4153,7 @@ int main(int argc, char ** argv) {
svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
svr->Post("/completion", handle_completions); // legacy
svr->Post("/completions", handle_completions);
-svr->Post("/v1/completions", handle_completions);
+svr->Post("/v1/completions", handle_completions_oai);
svr->Post("/chat/completions", handle_chat_completions);
svr->Post("/v1/chat/completions", handle_chat_completions);
svr->Post("/infill", handle_infill);
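For illustration, `/v1/completions` is now routed through `handle_completions_oai`, which applies the OpenAI-style parameter parsing added later in this diff (a required `prompt`, `stop`, `n`, and so on). A minimal sketch; the host/port and the model/prompt values are assumptions that mirror the new test, not part of the route change itself:

```shell
# hedged example: OAI-style text completion against the re-routed endpoint
curl -s http://localhost:8080/v1/completions -H "Content-Type: application/json" \
    -d '{"model": "davinci-002", "prompt": "I believe the meaning of life is", "max_tokens": 8}'
```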
@@ -4113,14 +4233,16 @@ int main(int argc, char ** argv) {

// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
-    if (!ctx_server.validate_model_chat_template()) {
+    if (!ctx_server.validate_builtin_chat_template()) {
        LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
        params.chat_template = "chatml";
    }
}

// print sample chat example to make it clear which template is used
-LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
+LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+    params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
+    common_chat_format_example(ctx_server.model, params.chat_template).c_str());

ctx_server.queue_tasks.on_new_task(std::bind(
    &server_context::process_single_task, &ctx_server, std::placeholders::_1));
@@ -44,6 +44,12 @@ To run with stdout/stderr display in real time (verbose output, but useful for d
DEBUG=1 ./tests.sh -s -v -x
```

+To run single test unit:

+```shell
+./tests.sh unit/test_{name of test case here}.py -v -x
+```

Hint: You can compile and run test in single command, useful for local developement:

```shell
@@ -5,3 +5,4 @@ numpy~=1.26.4
openai~=1.55.3
prometheus-client~=0.20.0
requests~=2.32.3
+wget~=3.2
@@ -83,7 +83,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
def test_chat_completion_with_openai_library():
    global server
    server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.chat.completions.create(
        model="gpt-3.5-turbo-instruct",
        messages=[
@@ -100,6 +100,23 @@ def test_chat_completion_with_openai_library():
    assert match_regex("(Suddenly)+", res.choices[0].message.content)


+def test_chat_template():
+    global server
+    server.chat_template = "llama3"
+    server.debug = True # to get the "__verbose" object in the response
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "messages": [
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "__verbose" in res.body
+    assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"


@pytest.mark.parametrize("response_format,n_predicted,re_content", [
    ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
    ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
@@ -170,7 +187,7 @@ def test_chat_completion_with_timings_per_token():
def test_logprobs():
    global server
    server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.chat.completions.create(
        model="gpt-3.5-turbo-instruct",
        temperature=0.0,

@@ -197,7 +214,7 @@ def test_logprobs():
def test_logprobs_stream():
    global server
    server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.chat.completions.create(
        model="gpt-3.5-turbo-instruct",
        temperature=0.0,
@@ -1,5 +1,6 @@
import pytest
import time
+from openai import OpenAI
from utils import *

server = ServerPreset.tinyllama2()
@@ -85,6 +86,40 @@ def test_completion_stream_vs_non_stream():
    assert content_stream == res_non_stream.body["content"]


+def test_completion_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.completions.create(
+        model="davinci-002",
+        prompt="I believe the meaning of life is",
+        max_tokens=8,
+    )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
+    assert res.choices[0].finish_reason == "length"
+    assert res.choices[0].text is not None
+    assert match_regex("(going|bed)+", res.choices[0].text)


+def test_completion_stream_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.completions.create(
+        model="davinci-002",
+        prompt="I believe the meaning of life is",
+        max_tokens=8,
+        stream=True,
+    )
+    output_text = ''
+    for data in res:
+        choice = data.choices[0]
+        if choice.finish_reason is None:
+            assert choice.text is not None
+            output_text += choice.text
+    assert match_regex("(going|bed)+", output_text)


@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
    global server
@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
    "input_suffix": "}\n",
})
assert res.status_code == 200
-assert match_regex("(Ann|small|shiny)+", res.body["content"])
+assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])


def test_infill_with_input_extra():
@@ -1,5 +1,4 @@
import pytest
-import os
from utils import *

server = ServerPreset.stories15m_moe()
@@ -10,15 +9,7 @@ LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe
def create_server():
    global server
    server = ServerPreset.stories15m_moe()
-    # download lora file if needed
-    file_name = LORA_FILE_URL.split('/').pop()
-    lora_file = f'../../../{file_name}'
-    if not os.path.exists(lora_file):
-        print(f"Downloading {LORA_FILE_URL} to {lora_file}")
-        with open(lora_file, 'wb') as f:
-            f.write(requests.get(LORA_FILE_URL).content)
-        print(f"Done downloading lora file")
-    server.lora_files = [lora_file]
+    server.lora_files = [download_file(LORA_FILE_URL)]


@pytest.mark.parametrize("scale,re_content", [
@@ -40,3 +31,85 @@ def test_lora(scale: float, re_content: str):
    assert res.status_code == 200
    assert match_regex(re_content, res.body["content"])


+def test_lora_per_request():
+    global server
+    server.n_slots = 4
+    server.start()

+    # running the same prompt with different lora scales, all in parallel
+    # each prompt will be processed by a different slot
+    prompt = "Look in thy glass"
+    lora_config = [
+        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
+        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
+        ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
+        ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
+        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
+        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
+    ]

+    tasks = [(
+        server.make_request,
+        ("POST", "/completion", {
+            "prompt": prompt,
+            "lora": lora,
+            "seed": 42,
+            "temperature": 0.0,
+            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+    ) for lora, _ in lora_config]
+    results = parallel_function_calls(tasks)

+    assert all([res.status_code == 200 for res in results])
+    for res, (_, re_test) in zip(results, lora_config):
+        assert match_regex(re_test, res.body["content"])


+@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
+def test_with_big_model():
+    server = ServerProcess()
+    server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
+    server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
+    server.model_alias = "Llama-3.2-8B-Instruct"
+    server.n_slots = 4
+    server.n_ctx = server.n_slots * 1024
+    server.n_predict = 64
+    server.temperature = 0.0
+    server.seed = 42
+    server.lora_files = [
+        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
+        # TODO: find & add other lora adapters for this model
+    ]
+    server.start(timeout_seconds=600)

+    # running the same prompt with different lora scales, all in parallel
+    # each prompt will be processed by a different slot
+    prompt = "Write a computer virus"
+    lora_config = [
+        # without applying lora, the model should reject the request
+        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
+        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
+        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
+        # with 0.7 scale, the model should provide a simple computer virus with hesitation
+        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
+        # with 1.5 scale, the model should confidently provide a computer virus
+        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
+        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
+    ]

+    tasks = [(
+        server.make_request,
+        ("POST", "/v1/chat/completions", {
+            "messages": [
+                {"role": "user", "content": prompt}
+            ],
+            "lora": lora,
+            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+    ) for lora, _ in lora_config]
+    results = parallel_function_calls(tasks)

+    assert all([res.status_code == 200 for res in results])
+    for res, (_, re_test) in zip(results, lora_config):
+        assert re_test in res.body["choices"][0]["message"]["content"]
@@ -10,16 +10,8 @@ MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tiny
def create_server():
    global server
    server = ServerPreset.stories15m_moe()
-    # download draft model file if needed
-    file_name = MODEL_DRAFT_FILE_URL.split('/').pop()
-    model_draft_file = f'../../../{file_name}'
-    if not os.path.exists(model_draft_file):
-        print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}")
-        with open(model_draft_file, 'wb') as f:
-            f.write(requests.get(MODEL_DRAFT_FILE_URL).content)
-        print(f"Done downloading draft model file")
    # set default values
-    server.model_draft = model_draft_file
+    server.model_draft = download_file(MODEL_DRAFT_FILE_URL)
    server.draft_min = 4
    server.draft_max = 8

@@ -23,6 +23,7 @@ from typing import (
    Set,
)
from re import RegexFlag
+import wget


class ServerResponse:

@@ -74,6 +75,7 @@ class ServerProcess:
    draft_min: int | None = None
    draft_max: int | None = None
    no_webui: bool | None = None
+    chat_template: str | None = None

    # session variables
    process: subprocess.Popen | None = None
@@ -164,6 +166,8 @@ class ServerProcess:
    server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
    server_args.append("--no-webui")
+if self.chat_template:
+    server_args.extend(["--chat-template", self.chat_template])

args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
@@ -378,5 +382,25 @@ def match_regex(regex: str, text: str) -> bool:
        is not None
    )


+def download_file(url: str, output_file_path: str | None = None) -> str:
+    """
+    Download a file from a URL to a local path. If the file already exists, it will not be downloaded again.

+    output_file_path is the local path to save the downloaded file. If not provided, the file will be saved in the root directory.

+    Returns the local path of the downloaded file.
+    """
+    file_name = url.split('/').pop()
+    output_file = f'./tmp/{file_name}' if output_file_path is None else output_file_path
+    if not os.path.exists(output_file):
+        print(f"Downloading {url} to {output_file}")
+        wget.download(url, out=output_file)
+        print(f"Done downloading to {output_file}")
+    else:
+        print(f"File already exists at {output_file}")
+    return output_file


def is_slow_test_allowed():
    return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"
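A related usage note for the test utilities above: `is_slow_test_allowed()` gates the heavyweight cases (such as `test_with_big_model` earlier in this diff) behind the `SLOW_TESTS` environment variable. A hedged example; the exact test file name is an assumption:

```shell
# opt in to the slow tests when running the suite
SLOW_TESTS=1 ./tests.sh unit/test_lora.py -v -x
```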
@@ -382,19 +382,6 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
    return formatted_chat;
}

-static std::string llama_get_chat_template(const struct llama_model * model) {
-    std::string template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
-    if (res < 2) {
-        return "";
-    } else {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-}

//
// base64 utils (TODO: move to common in the future)
//
@@ -520,7 +507,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {

// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
-    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
+    std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);

    // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
@@ -549,7 +536,46 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
// OAI utils
//

-static json oaicompat_completion_params_parse(
+static json oaicompat_completion_params_parse(const json & body) {
+    json llama_params;

+    if (!body.contains("prompt")) {
+        throw std::runtime_error("\"prompt\" is required");
+    }

+    // Handle "stop" field
+    if (body.contains("stop") && body.at("stop").is_string()) {
+        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }

+    // Handle "n" field
+    int n_choices = json_value(body, "n", 1);
+    if (n_choices != 1) {
+        throw std::runtime_error("Only one completion choice is allowed");
+    }

+    // Params supported by OAI but unsupported by llama.cpp
+    static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
+    for (const auto & param : unsupported_params) {
+        if (body.contains(param)) {
+            throw std::runtime_error("Unsupported param: " + param);
+        }
+    }

+    // Copy remaining properties to llama_params
+    for (const auto & item : body.items()) {
+        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
+        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
+            llama_params[item.key()] = item.value();
+        }
+    }

+    return llama_params;
+}

+static json oaicompat_chat_completion_params_parse(
    const struct llama_model * model,
    const json & body, /* openai api json semantics */
    const std::string & chat_template) {
@@ -771,3 +797,44 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx

    return cur;
}

+static bool are_lora_equal(
+        const std::vector<common_lora_adapter_info> & l1,
+        const std::vector<common_lora_adapter_info> & l2) {
+    if (l1.size() != l2.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < l1.size(); ++i) {
+        // we don't check lora.path to reduce the time complexity
+        if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
+            return false;
+        }
+    }
+    return true;
+}

+// parse lora config from JSON request, returned a copy of lora_base with updated scale
+static std::vector<common_lora_adapter_info> parse_lora_request(
+        const std::vector<common_lora_adapter_info> & lora_base,
+        const json & data) {
+    std::vector<common_lora_adapter_info> lora(lora_base);
+    int max_idx = lora.size();

+    // clear existing value
+    for (auto & entry : lora) {
+        entry.scale = 0.0f;
+    }

+    // set value
+    for (const auto & entry : data) {
+        int id = json_value(entry, "id", -1);
+        float scale = json_value(entry, "scale", 0.0f);
+        if (0 <= id && id < max_idx) {
+            lora[id].scale = scale;
+        } else {
+            throw std::runtime_error("invalid adapter id");
+        }
+    }

+    return lora;
+}
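Combined with the request-parsing changes earlier in the diff, `parse_lora_request` lets a completion request override adapter scales per request. A minimal sketch; the host, port and scale value are assumptions, and the adapter `id` refers to the index reported by the adapter-listing handler (`handle_lora_adapters_list`) shown above:

```shell
# hedged example: apply LoRA adapter 0 at a reduced scale for this request only
curl -s http://localhost:8080/completion -H "Content-Type: application/json" \
    -d '{"prompt": "Look in thy glass", "lora": [{"id": 0, "scale": 0.5}]}'
```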
@@ -62,18 +62,19 @@
<!-- action buttons (top right) -->
<div class="flex items-center">
    <div v-if="messages.length > 0" class="dropdown dropdown-end">
-        <!-- "more" button -->
+        <!-- "..." button -->
        <button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
                <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
            </svg>
        </button>
-        <!-- "more" dropdown menu -->
+        <!-- "delete" dropdown menu -->
        <ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
            <li @click="downloadConv(viewingConvId)"><a>Download</a></li>
            <li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
        </ul>
    </div>
+    <div class="tooltip tooltip-bottom" data-tip="Settings">
    <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
        <!-- settings button -->
        <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">

@@ -81,8 +82,10 @@
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
</svg>
</button>
+</div>

<!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
+<div class="tooltip tooltip-bottom" data-tip="Themes">
<div class="dropdown dropdown-end dropdown-bottom">
<div tabindex="0" role="button" class="btn m-1">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
@@ -112,6 +115,7 @@
</div>
</div>
</div>
+</div>

<!-- chat messages -->
<div id="messages-list" class="flex flex-col grow overflow-y-auto">
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

-   llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+   llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
    if (!model) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
    }
    llama_sampler_free(smpl);
    llama_free(ctx);
-   llama_free_model(model);
+   llama_model_free(model);

    return 0;
}
@@ -83,7 +83,7 @@ int main(int argc, char ** argv) {
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

-   llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+   llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {

    llama_sampler_free(smpl);
    llama_free(ctx);
-   llama_free_model(model);
+   llama_model_free(model);

    return 0;
}
@@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);

    llama_model * model_tgt = NULL;
-   llama_model * model_dft = NULL;
+   //llama_model * model_dft = NULL;

    llama_context * ctx_tgt = NULL;
    llama_context * ctx_dft = NULL;
@@ -42,8 +42,8 @@ int main(int argc, char ** argv) {
    // load the target model
    common_init_result llama_init_tgt = common_init_from_params(params);

-   model_tgt = llama_init_tgt.model;
-   ctx_tgt = llama_init_tgt.context;
+   model_tgt = llama_init_tgt.model.get();
+   ctx_tgt = llama_init_tgt.context.get();

    // load the draft model
    params.devices = params.speculative.devices;
@@ -59,8 +59,8 @@ int main(int argc, char ** argv) {
    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    common_init_result llama_init_dft = common_init_from_params(params);

-   model_dft = llama_init_dft.model;
-   ctx_dft = llama_init_dft.context;
+   //model_dft = llama_init_dft.model.get();
+   ctx_dft = llama_init_dft.context.get();

    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        return 1;
@@ -251,12 +251,6 @@ int main(int argc, char ** argv) {
    common_sampler_free(smpl);
    common_speculative_free(spec);

-   llama_free(ctx_tgt);
-   llama_free_model(model_tgt);
-
-   llama_free(ctx_dft);
-   llama_free_model(model_dft);
-
    llama_backend_free();

    LOG("\n\n");
@@ -72,8 +72,9 @@ int main(int argc, char ** argv) {

    // load the target model
    common_init_result llama_init_tgt = common_init_from_params(params);
-   model_tgt = llama_init_tgt.model;
-   ctx_tgt = llama_init_tgt.context;
+
+   model_tgt = llama_init_tgt.model.get();
+   ctx_tgt = llama_init_tgt.context.get();

    // load the draft model
    params.devices = params.speculative.devices;
@@ -85,8 +86,9 @@ int main(int argc, char ** argv) {

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    common_init_result llama_init_dft = common_init_from_params(params);
-   model_dft = llama_init_dft.model;
-   ctx_dft = llama_init_dft.context;
+
+   model_dft = llama_init_dft.model.get();
+   ctx_dft = llama_init_dft.context.get();

    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
    LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
@@ -631,12 +633,6 @@ int main(int argc, char ** argv) {

    llama_batch_free(batch_dft);

-   llama_free(ctx_tgt);
-   llama_free_model(model_tgt);
-
-   llama_free(ctx_dft);
-   llama_free_model(model_dft);
-
    llama_backend_free();

    LOG("\n\n");
@@ -31,6 +31,7 @@ static void print_usage_information(const char * argv0) {
    printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
    printf(" --stdin read prompt from standard input.\n");
    printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+   printf(" --no-escape do not escape input (such as \\n, \\t, etc.).\n");
    printf(" --no-parse-special do not parse control tokens.\n");
    printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
    printf(" --show-count print the total number of tokens.\n");
@@ -198,6 +199,7 @@ int main(int raw_argc, char ** raw_argv) {
    // variables where to put any arguments we see.
    bool printing_ids = false;
    bool no_bos = false;
+   bool no_escape = false;
    bool no_parse_special = false;
    bool disable_logging = false;
    bool show_token_count = false;
@@ -233,6 +235,9 @@ int main(int raw_argc, char ** raw_argv) {
    else if (arg == "--no-bos") {
        no_bos = true;
    }
+   else if (arg == "--no-escape") {
+       no_escape = true;
+   }
    else if (arg == "--no-parse-special") {
        no_parse_special = true;
    }
@@ -333,7 +338,7 @@ int main(int raw_argc, char ** raw_argv) {

    llama_model_params model_params = llama_model_default_params();
    model_params.vocab_only = true;
-   llama_model * model = llama_load_model_from_file(model_path, model_params);
+   llama_model * model = llama_model_load_from_file(model_path, model_params);
    if (!model) {
        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
        return 1;
@@ -363,6 +368,11 @@ int main(int raw_argc, char ** raw_argv) {
    const bool model_wants_add_bos = llama_add_bos_token(model);
    const bool add_bos = model_wants_add_bos && !no_bos;
    const bool parse_special = !no_parse_special;
+   const bool escape = !no_escape;
+
+   if (escape) {
+       string_process_escapes(prompt);
+   }

    std::vector<llama_token> tokens;
    tokens = common_tokenize(model, prompt, add_bos, parse_special);
@@ -398,7 +408,7 @@ int main(int raw_argc, char ** raw_argv) {
    }
    // silence valgrind
    llama_free(ctx);
-   llama_free_model(model);
+   llama_model_free(model);

    return 0;
}
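The new --no-escape flag above decides whether backslash escapes in the prompt are processed before tokenization. The sketch below only illustrates the effect: process_escapes is a simplified hypothetical stand-in for the string_process_escapes call shown in the diff and handles just \n and \t.

#include <cstdio>
#include <string>

// hypothetical, simplified stand-in for escape processing
static std::string process_escapes(const std::string & in) {
    std::string out;
    for (size_t i = 0; i < in.size(); ++i) {
        if (in[i] == '\\' && i + 1 < in.size()) {
            if (in[i + 1] == 'n') { out += '\n'; ++i; continue; }
            if (in[i + 1] == 't') { out += '\t'; ++i; continue; }
        }
        out += in[i];
    }
    return out;
}

int main() {
    const std::string prompt = "Hello\\nWorld";
    printf("escaped    : [%s]\n", process_escapes(prompt).c_str()); // two lines, as with escaping enabled
    printf("not escaped: [%s]\n", prompt.c_str());                  // literal backslash-n, as with --no-escape
}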
@@ -458,8 +458,9 @@ int main(int argc, char ** argv) {
    llama_context * ctx_cts = NULL;

    common_init_result llama_init_ttc = common_init_from_params(params);
-   model_ttc = llama_init_ttc.model;
-   ctx_ttc = llama_init_ttc.context;
+
+   model_ttc = llama_init_ttc.model.get();
+   ctx_ttc = llama_init_ttc.context.get();

    // TODO: refactor in a common struct
    params.model = params.vocoder.model;
@@ -470,8 +471,9 @@ int main(int argc, char ** argv) {
    params.embedding = true;

    common_init_result llama_init_cts = common_init_from_params(params);
-   model_cts = llama_init_cts.model;
-   ctx_cts = llama_init_cts.context;
+
+   model_cts = llama_init_cts.model.get();
+   ctx_cts = llama_init_cts.context.get();

    std::vector<common_sampler *> smpl(n_parallel);
    for (int i = 0; i < n_parallel; ++i) {
@@ -920,12 +922,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14

    LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());

-   llama_free(ctx_ttc);
-   llama_free_model(model_ttc);
-
-   llama_free(ctx_cts);
-   llama_free_model(model_cts);
-
    llama_backend_free();

    return 0;
@@ -243,7 +243,8 @@ set(GGML_PUBLIC_HEADERS
    include/ggml-metal.h
    include/ggml-rpc.h
    include/ggml-sycl.h
-   include/ggml-vulkan.h)
+   include/ggml-vulkan.h
+   include/gguf.h)

set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
@@ -252,26 +253,6 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)

-# FIXME: this should be done in the backend cmake files
-if (GGML_METAL)
-    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
-    install(
-        FILES src/ggml-metal/ggml-metal.metal
-        PERMISSIONS
-            OWNER_READ
-            OWNER_WRITE
-            GROUP_READ
-            WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_BINDIR})
-
-    if (NOT GGML_METAL_EMBED_LIBRARY)
-        install(
-            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-        )
-    endif()
-endif()
-
if (GGML_STANDALONE)
    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
@@ -7,6 +7,7 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
+#include "gguf.h"
#include <memory>

// Smart pointers for ggml types
@@ -241,12 +241,6 @@
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24

-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
-
-#define GGUF_DEFAULT_ALIGNMENT 32
-
#define GGML_UNUSED(x) (void)(x)

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -403,12 +397,6 @@ extern "C" {
        GGML_PREC_F32,
    };

-   enum ggml_backend_type {
-       GGML_BACKEND_TYPE_CPU = 0,
-       GGML_BACKEND_TYPE_GPU = 10,
-       GGML_BACKEND_TYPE_GPU_SPLIT = 20,
-   };
-
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN = -1,
@@ -587,8 +575,6 @@ extern "C" {
    struct ggml_tensor {
        enum ggml_type type;

-       GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
-
        struct ggml_backend_buffer * buffer;

        int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -2111,132 +2097,6 @@ extern "C" {
            int64_t n_per_row,
            const float * imatrix);

-   //
-   // gguf
-   //
-
-   enum gguf_type {
-       GGUF_TYPE_UINT8 = 0,
-       GGUF_TYPE_INT8 = 1,
-       GGUF_TYPE_UINT16 = 2,
-       GGUF_TYPE_INT16 = 3,
-       GGUF_TYPE_UINT32 = 4,
-       GGUF_TYPE_INT32 = 5,
-       GGUF_TYPE_FLOAT32 = 6,
-       GGUF_TYPE_BOOL = 7,
-       GGUF_TYPE_STRING = 8,
-       GGUF_TYPE_ARRAY = 9,
-       GGUF_TYPE_UINT64 = 10,
-       GGUF_TYPE_INT64 = 11,
-       GGUF_TYPE_FLOAT64 = 12,
-       GGUF_TYPE_COUNT, // marks the end of the enum
-   };
-
-   struct gguf_context;
-
-   struct gguf_init_params {
-       bool no_alloc;
-
-       // if not NULL, create a ggml_context and allocate the tensor data in it
-       struct ggml_context ** ctx;
-   };
-
-   GGML_API struct gguf_context * gguf_init_empty(void);
-   GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-   //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
-   GGML_API void gguf_free(struct gguf_context * ctx);
-
-   GGML_API const char * gguf_type_name(enum gguf_type type);
-
-   GGML_API int gguf_get_version (const struct gguf_context * ctx);
-   GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
-   GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
-   GGML_API void * gguf_get_data (const struct gguf_context * ctx);
-
-   GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
-   GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
-   GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
-
-   GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
-   GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
-
-   // will abort if the wrong type is used for the key
-   GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
-   GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
-   GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
-   GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
-   GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
-   GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
-   GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
-   GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
-   GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
-   GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
-   GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
-   GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
-   GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
-   GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
-   GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
-   GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
-   GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
-   GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
-   GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-   GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
-   GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
-
-   // removes key if it exists
-   GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
-
-   // overrides existing values or adds a new one
-   GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
-   GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
-   GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
-   GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
-   GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
-   GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
-   GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
-   GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
-   GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
-   GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
-   GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
-   GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
-   GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
-   GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
-
-   // set or add KV pairs from another context
-   GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-
-   // manage tensor info
-   GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-   GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-   GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
-
-   // writing gguf files can be done in 2 ways:
-   //
-   // - write the entire gguf_context to a binary file in a single pass:
-   //
-   //   gguf_write_to_file(ctx, fname);
-   //
-   // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
-   //
-   //   FILE * f = fopen(fname, "wb");
-   //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
-   //   fwrite(f, ...);
-   //   void * data = gguf_meta_get_meta_data(ctx);
-   //   fseek(f, 0, SEEK_SET);
-   //   fwrite(f, data, gguf_get_meta_size(ctx));
-   //   free(data);
-   //   fclose(f);
-   //
-
-   // write the entire context to a binary file
-   GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
-   // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-   GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
-   GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
#ifdef __cplusplus
// restrict not standard in C++
# if defined(__GNUC__)
202
ggml/include/gguf.h
Normal file
@@ -0,0 +1,202 @@
+// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
+// GGUF files have the following structure:
+//
+// 1. File magic "GGUF" (4 bytes).
+// 2. File version (uint32_t).
+// 3. Number of ggml tensors in file (int64_t).
+// 4. Number of key-value-pairs in file (int64_t).
+// 5. For each KV pair:
+//    1. The key (string).
+//    2. The value type (gguf_type).
+//    3a. If the value type is GGUF_TYPE_ARRAY:
+//        1. The type of the array (gguf_type).
+//        2. The number of elements in the array (uint64_t).
+//        3. The binary representation of each element in the array.
+//    3b. Otherwise:
+//        1. The binary representation of the value.
+// 6. For each ggml tensor:
+//    1. The tensor name (string).
+//    2. The number of dimensions of the tensor (uint32_t).
+//    3. For each dimension:
+//        1. The size of the tensor in the dimension (int64_t).
+//    4. The tensor data type (ggml_type).
+//    5. The tensor data offset in the tensor data binary blob (uint64_t).
+// 7. The tensor data binary blob (optional, aligned).
+//
+// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
+// All enums are stored as int32_t.
+// All bool values are stored as int8_t.
+// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
+//   otherwise GGUF_DEFAULT_ALIGNMENT is used.
+//
+// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+#pragma once
+
+#include "ggml.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#define GGUF_MAGIC "GGUF"
+#define GGUF_VERSION 3
+
+#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    // types that can be stored as GGUF KV data
+    enum gguf_type {
+        GGUF_TYPE_UINT8 = 0,
+        GGUF_TYPE_INT8 = 1,
+        GGUF_TYPE_UINT16 = 2,
+        GGUF_TYPE_INT16 = 3,
+        GGUF_TYPE_UINT32 = 4,
+        GGUF_TYPE_INT32 = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL = 7,
+        GGUF_TYPE_STRING = 8,
+        GGUF_TYPE_ARRAY = 9,
+        GGUF_TYPE_UINT64 = 10,
+        GGUF_TYPE_INT64 = 11,
+        GGUF_TYPE_FLOAT64 = 12,
+        GGUF_TYPE_COUNT, // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
+
+    GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx);
+    GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
+    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
+
+    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
+
+    // will abort if the wrong type is used for the key
+    GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
+    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
+    GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id);
+
+    // get raw pointer to the first element of the array with the given key_id
+    // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
+    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
+
+    // get ith C string from array with given key_id
+    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
+
+    GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx);
+    GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
+    GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id);
+
+    // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
+    GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
+
+    // overrides an existing KV pair or adds a new one, the new KV pair is always at the back
+    GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
+    GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
+    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
+    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+
+    // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
+
+    // creates a new array with n strings and copies the corresponding strings from data
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
+
+    // add tensor to GGUF context, tensor name must be unique
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+
+    // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
+    //   in such a way that the tensor data remains as one contiguous block (except for padding)
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+
+    // assumes that at least gguf_get_tensor_size bytes can be read from data
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
+
+    // writing gguf files can be done in 3 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
+    //
+    // - write only the meta data to a file, then re-open the file and append the tensor data:
+    //
+    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
+    //   FILE * f = fopen(fname, "ab");
+    //   fwrite(f, ...); // write tensor data
+    //   fclose(f);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   const size_t size_meta = gguf_get_meta_size(ctx);
+    //   fseek(f, size_meta, SEEK_SET);
+    //   fwrite(f, ...); // write tensor data
+    //   void * data = malloc(size_meta);
+    //   gguf_get_meta_data(ctx, data);
+    //   rewind(f);
+    //   fwrite(data, 1, data, f);
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
+
+    // writes the meta data to pointer "data"
+    GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
+
+#ifdef __cplusplus
+}
+#endif
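As a usage note for the new header: the sketch below writes a metadata-only GGUF file and reads it back using only functions declared above. It assumes gguf.h is on the include path and the program links against ggml; error handling is minimal.

#include "gguf.h"

#include <cstdio>

int main() {
    // write: empty context plus one KV pair, metadata only (no tensors)
    struct gguf_context * ctx_out = gguf_init_empty();
    gguf_set_val_str(ctx_out, "example.note", "hello gguf");
    gguf_write_to_file(ctx_out, "example.gguf", /*only_meta =*/ true);
    gguf_free(ctx_out);

    // read: no_alloc = true and ctx = NULL, since we only look at metadata
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx_in = gguf_init_from_file("example.gguf", params);
    if (!ctx_in) {
        fprintf(stderr, "failed to open example.gguf\n");
        return 1;
    }
    printf("version: %u, n_kv: %lld\n", gguf_get_version(ctx_in), (long long) gguf_get_n_kv(ctx_in));
    for (int64_t i = 0; i < gguf_get_n_kv(ctx_in); ++i) {
        printf("key %lld: %s\n", (long long) i, gguf_get_key(ctx_in, i));
    }
    gguf_free(ctx_in);
    return 0;
}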
@@ -208,6 +208,7 @@ add_library(ggml-base
    ../include/ggml-backend.h
    ../include/ggml-cpp.h
    ../include/ggml-opt.h
+   ../include/gguf.h
    ggml.c
    ggml-alloc.c
    ggml-backend.cpp
@@ -215,7 +216,8 @@ add_library(ggml-base
    ggml-threading.cpp
    ggml-threading.h
    ggml-quants.c
-   ggml-quants.h)
+   ggml-quants.h
+   gguf.cpp)

target_include_directories(ggml-base PRIVATE .)

|
|||||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
|
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
|
||||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
|
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
|
||||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||||
if (NOT MSVC)
|
|
||||||
# MSVC doesn't support AVX-VNNI or AMX
|
|
||||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
|
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
|
||||||
|
if (NOT MSVC)
|
||||||
|
# MSVC doesn't support AMX
|
||||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||||
endif()
|
endif()
|
||||||
else ()
|
else ()
|
||||||
|
@@ -574,4 +574,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("opencl", silent, dir_path);
    ggml_backend_load_best("musa", silent, dir_path);
    ggml_backend_load_best("cpu", silent, dir_path);
+   // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
+   const char * backend_path = std::getenv("GGML_BACKEND_PATH");
+   if (backend_path) {
+       ggml_backend_load(backend_path);
+   }
}
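Usage note for the new GGML_BACKEND_PATH check above: an application does not call anything new, it only has to trigger backend loading with the environment variable pointing at the out-of-tree backend library. The sketch below assumes the existing dynamic-loading and registry entry points from ggml-backend.h (ggml_backend_load_all, ggml_backend_reg_count, ggml_backend_reg_get, ggml_backend_reg_name); the library path is a placeholder.

// Run with e.g.:  GGML_BACKEND_PATH=/path/to/libggml-custom-backend.so ./my-app
#include "ggml-backend.h"

#include <cstdio>

int main() {
    // loads built-in/discovered backends; per the change above this should also
    // load the library named by GGML_BACKEND_PATH, if the variable is set
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
        printf("backend reg: %s\n", ggml_backend_reg_name(ggml_backend_reg_get(i)));
    }
    return 0;
}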
@@ -764,7 +764,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
    if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
        int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
        // check if a backend with higher prio wants to offload the op
-       if (src_backend_id == sched->n_backends - 1) {
+       if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
            for (int b = 0; b < src_backend_id; b++) {
                if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                    SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
    for (int i = 0; i < graph->n_nodes; i++) {
        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-           GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+           GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                sched->splits[cur_split].n_inputs);
            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+               if (j == 0) {
+                   GGML_LOG_DEBUG(": ");
+               }
                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
            }
@@ -215,8 +215,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            list(APPEND ARCH_DEFINITIONS GGML_SSE42)
        endif()
        if (GGML_AVX_VNNI)
-           # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
-           #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
+           list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
        endif()
    else ()
        if (GGML_NATIVE)
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
}

static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    const __m256i zero = _mm256_setzero_si256();
    return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+   const __m256i zero = _mm256_setzero_si256();
+   return _mm256_dpbusd_avx_epi32(zero, ax, sy);
#else
    // Perform multiplication and create 16-bit values
    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -4166,6 +4169,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g
    buffer->buft = buft;
    buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
    buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
+   buffer->iface.get_tensor = nullptr;
+   buffer->iface.cpy_tensor = nullptr;
    return buffer;
}

@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
}

static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    const __m256i zero = _mm256_setzero_si256();
    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
    return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+   const __m256i zero = _mm256_setzero_si256();
+   const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+   return _mm256_cvtepi32_ps(summed_pairs);
#else
    // Perform multiplication and create 16-bit values
    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
|||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
#include <array>
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#define NOINLINE __declspec(noinline)
|
#define NOINLINE __declspec(noinline)
|
||||||
@ -1000,8 +1001,10 @@ class tinyBLAS_Q0_AVX {
|
|||||||
|
|
||||||
inline __m256 updot(__m256i u, __m256i s) {
|
inline __m256 updot(__m256i u, __m256i s) {
|
||||||
__m256i res;
|
__m256i res;
|
||||||
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
||||||
res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
|
res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
|
||||||
|
#elif defined(__AVXVNNI__)
|
||||||
|
res = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), u, s);
|
||||||
#else
|
#else
|
||||||
res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
|
res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
|
||||||
#endif
|
#endif
|
||||||
@@ -1049,6 +1052,704 @@ class tinyBLAS_Q0_AVX {
    } \
    } \

+template <typename TA, typename TB, typename TC>
+class tinyBLAS_Q0_PPC {
+  public:
+    tinyBLAS_Q0_PPC(int64_t k,
+                    const TA *A, int64_t lda,
+                    const TB *B, int64_t ldb,
+                    TC *C, int64_t ldc,
+                    int ith, int nth)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    }
+
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
+    }
+
+  private:
+
+    template<int RM, int RN>
+    inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
+        for (int I = 0; I < RM; I++) {
+            for (int J = 0; J < RN; J++) {
+                *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
+            }
+        }
+    }
+
+    template<int size>
+    inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
+        vector signed int vec_C[4];
+        vector float CA[4] = {0};
+        vector float res[4] = {0};
+        __builtin_mma_disassemble_acc(vec_C, ACC);
+        for (int i = 0; i < 4; i++) {
+            CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
+            res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+            fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
+        }
+    }
+
+    template<typename VA, typename VB>
+    void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+        int64_t i, j;
+        TA *aoffset = NULL;
+        VA *vecOffset = NULL;
+        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+        __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
+        VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
+        VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
+        VB t1, t2, t3, t4, t5, t6, t7, t8;
+        vector unsigned char xor_vector;
+        uint8_t flip_vec = 0x80;
+        xor_vector = vec_splats(flip_vec);
+        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
+        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+
+        aoffset = const_cast<TA*>(a);
+        vecOffset = vec;
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;
+
+                i = (cols >> 3);
+                if (i > 0) {
+                    do {
+                        C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
+                        C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
+                        C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
+                        C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
+                        C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
+                        C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
+                        C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
+                        C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
+
+                        __builtin_vsx_disassemble_pair(c1, &C1);
+                        __builtin_vsx_disassemble_pair(c2, &C2);
+                        __builtin_vsx_disassemble_pair(c3, &C3);
+                        __builtin_vsx_disassemble_pair(c4, &C4);
+                        __builtin_vsx_disassemble_pair(c5, &C5);
+                        __builtin_vsx_disassemble_pair(c6, &C6);
+                        __builtin_vsx_disassemble_pair(c7, &C7);
+                        __builtin_vsx_disassemble_pair(c8, &C8);
+
+                        t1 = vec_perm(c1[0], c2[0], swiz1);
+                        t2 = vec_perm(c1[0], c2[0], swiz2);
+                        t3 = vec_perm(c3[0], c4[0], swiz1);
+                        t4 = vec_perm(c3[0], c4[0], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        if (flip == true) {
+                            t5 = vec_xor(t5, xor_vector);
+                            t6 = vec_xor(t6, xor_vector);
+                            t7 = vec_xor(t7, xor_vector);
+                            t8 = vec_xor(t8, xor_vector);
+                        }
+                        vec_xst(t5, 0, vecOffset);
+                        vec_xst(t6, 0, vecOffset+16);
+                        vec_xst(t7, 0, vecOffset+32);
+                        vec_xst(t8, 0, vecOffset+48);
+
+                        t1 = vec_perm(c1[1], c2[1], swiz1);
+                        t2 = vec_perm(c1[1], c2[1], swiz2);
+                        t3 = vec_perm(c3[1], c4[1], swiz1);
+                        t4 = vec_perm(c3[1], c4[1], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        if (flip == true) {
+                            t5 = vec_xor(t5, xor_vector);
+                            t6 = vec_xor(t6, xor_vector);
+                            t7 = vec_xor(t7, xor_vector);
+                            t8 = vec_xor(t8, xor_vector);
+                        }
+                        vec_xst(t5, 0, vecOffset+64);
+                        vec_xst(t6, 0, vecOffset+80);
+                        vec_xst(t7, 0, vecOffset+96);
+                        vec_xst(t8, 0, vecOffset+112);
+
+                        t1 = vec_perm(c5[0], c6[0], swiz1);
+                        t2 = vec_perm(c5[0], c6[0], swiz2);
+                        t3 = vec_perm(c7[0], c8[0], swiz1);
+                        t4 = vec_perm(c7[0], c8[0], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        if (flip == true) {
+                            t5 = vec_xor(t5, xor_vector);
+                            t6 = vec_xor(t6, xor_vector);
+                            t7 = vec_xor(t7, xor_vector);
+                            t8 = vec_xor(t8, xor_vector);
+                        }
+                        vec_xst(t5, 0, vecOffset+128);
+                        vec_xst(t6, 0, vecOffset+144);
+                        vec_xst(t7, 0, vecOffset+160);
+                        vec_xst(t8, 0, vecOffset+176);
+
+                        t1 = vec_perm(c5[1], c6[1], swiz1);
+                        t2 = vec_perm(c5[1], c6[1], swiz2);
+                        t3 = vec_perm(c7[1], c8[1], swiz1);
+                        t4 = vec_perm(c7[1], c8[1], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        if (flip == true) {
+                            t5 = vec_xor(t5, xor_vector);
+                            t6 = vec_xor(t6, xor_vector);
+                            t7 = vec_xor(t7, xor_vector);
+                            t8 = vec_xor(t8, xor_vector);
+                        }
+                        vec_xst(t5, 0, vecOffset+192);
+                        vec_xst(t6, 0, vecOffset+208);
+                        vec_xst(t7, 0, vecOffset+224);
+                        vec_xst(t8, 0, vecOffset+240);
+
+                        aoffset1 += lda;
+                        aoffset2 += lda;
+                        aoffset3 += lda;
+                        aoffset4 += lda;
+                        aoffset5 += lda;
+                        aoffset6 += lda;
+                        aoffset7 += lda;
+                        aoffset8 += lda;
+                        vecOffset += 256;
+                        i--;
+                    } while(i > 0);
+                }
+                j--;
+            } while(j > 0);
+        }
+
+        if (rows & 4) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            aoffset4 = aoffset3 + lda;
+            aoffset += 4 * lda;
+
+            i = (cols >> 3);
+            if (i > 0) {
+                do {
+                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
+                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
+                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
+                    C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
+
+                    __builtin_vsx_disassemble_pair(c1, &C1);
+                    __builtin_vsx_disassemble_pair(c2, &C2);
+                    __builtin_vsx_disassemble_pair(c3, &C3);
+                    __builtin_vsx_disassemble_pair(c4, &C4);
+
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    aoffset4 += lda;
+                    vecOffset += 128;
+                    i--;
+                } while(i > 0);
+            }
+        }
+        if (rows & 3) {
+            aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
|
||||||
|
aoffset3 = aoffset2 + lda;
|
||||||
|
i = (cols >> 3);
|
||||||
|
if (i > 0) {
|
||||||
|
do {
|
||||||
|
switch(rows) {
|
||||||
|
case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||||
|
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||||
|
case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||||
|
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||||
|
case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||||
|
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||||
|
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||||
|
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||||
|
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
|
t8 = vec_perm(t2, t4, swiz4);
|
||||||
|
if (flip == true) {
|
||||||
|
t5 = vec_xor(t5, xor_vector);
|
||||||
|
t6 = vec_xor(t6, xor_vector);
|
||||||
|
t7 = vec_xor(t7, xor_vector);
|
||||||
|
t8 = vec_xor(t8, xor_vector);
|
||||||
|
}
|
||||||
|
vec_xst(t5, 0, vecOffset);
|
||||||
|
vec_xst(t6, 0, vecOffset+16);
|
||||||
|
vec_xst(t7, 0, vecOffset+32);
|
||||||
|
vec_xst(t8, 0, vecOffset+48);
|
||||||
|
|
||||||
|
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||||
|
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||||
|
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||||
|
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
|
t8 = vec_perm(t2, t4, swiz4);
|
||||||
|
if (flip == true) {
|
||||||
|
t5 = vec_xor(t5, xor_vector);
|
||||||
|
t6 = vec_xor(t6, xor_vector);
|
||||||
|
t7 = vec_xor(t7, xor_vector);
|
||||||
|
t8 = vec_xor(t8, xor_vector);
|
||||||
|
}
|
||||||
|
vec_xst(t5, 0, vecOffset+64);
|
||||||
|
vec_xst(t6, 0, vecOffset+80);
|
||||||
|
vec_xst(t7, 0, vecOffset+96);
|
||||||
|
vec_xst(t8, 0, vecOffset+112);
|
||||||
|
|
||||||
|
aoffset1 += lda;
|
||||||
|
aoffset2 += lda;
|
||||||
|
aoffset3 += lda;
|
||||||
|
vecOffset += 128;
|
||||||
|
i--;
|
||||||
|
} while(i > 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
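    // mnpack: pick the largest kernel shape that still fits the remaining rows and
    // columns (8x8, 8x4, 4x8 via gemm<>, smaller tails via gemm_small<>), then recurse
    // on the leftover edges of the tile grid.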
    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        int m_rem = MIN(m - m0, 8);
        int n_rem = MIN(n - n0, 8);
        // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
        // issues. After resolving them, below code will be enabled.
        /*if (m_rem >= 16 && n_rem >= 8) {
            mc = 16;
            nc = 8;
            gemm<16,8>(m0, m, n0, n);
        } else if(m_rem >= 8 && n_rem >= 16) {
            mc = 8;
            nc = 16;
            gemm<8,16>(m0, m, n0, n);
        }*/
        if (m_rem >= 8 && n_rem >= 8) {
            mc = 8;
            nc = 8;
            gemm<8,8>(m0, m, n0, n);
        } else if (m_rem >= 4 && n_rem >= 8) {
            mc = 4;
            nc = 8;
            gemm<4,8>(m0, m, n0, n);
        } else if (m_rem >= 8 && n_rem >= 4) {
            mc = 8;
            nc = 4;
            gemm<8,4>(m0, m, n0, n);
        } else if (m_rem >= 4 && n_rem >= 4) {
            mc = 4;
            nc = 4;
            gemm_small<4, 4>(m0, m, n0, n);
        } else if ((m_rem < 4) && (n_rem > 4)) {
            nc = 4;
            switch(m_rem) {
                case 1:
                    mc = 1;
                    gemm_small<1, 4>(m0, m, n0, n);
                    break;
                case 2:
                    mc = 2;
                    gemm_small<2, 4>(m0, m, n0, n);
                    break;
                case 3:
                    mc = 3;
                    gemm_small<3, 4>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else if ((m_rem > 4) && (n_rem < 4)) {
            mc = 4;
            switch(n_rem) {
                case 1:
                    nc = 1;
                    gemm_small<4, 1>(m0, m, n0, n);
                    break;
                case 2:
                    nc = 2;
                    gemm_small<4, 2>(m0, m, n0, n);
                    break;
                case 3:
                    nc = 3;
                    gemm_small<4, 3>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch((m_rem << 4) | n_rem) {
                case 0x43:
                    mc = 4;
                    nc = 3;
                    gemm_small<4, 3>(m0, m, n0, n);
                    break;
                case 0x42:
                    mc = 4;
                    nc = 2;
                    gemm_small<4, 2>(m0, m, n0, n);
                    break;
                case 0x41:
                    mc = 4;
                    nc = 1;
                    gemm_small<4, 1>(m0, m, n0, n);
                    break;
                case 0x34:
                    mc = 3;
                    nc = 4;
                    gemm_small<3, 4>(m0, m, n0, n);
                    break;
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm_small<3, 3>(m0, m, n0, n);
                    break;
                case 0x32:
                    mc = 3;
                    nc = 2;
                    gemm_small<3, 2>(m0, m, n0, n);
                    break;
                case 0x31:
                    mc = 3;
                    nc = 1;
                    gemm_small<3, 1>(m0, m, n0, n);
                    break;
                case 0x24:
                    mc = 2;
                    nc = 4;
                    gemm_small<2, 4>(m0, m, n0, n);
                    break;
                case 0x23:
                    mc = 2;
                    nc = 3;
                    gemm_small<2, 3>(m0, m, n0, n);
                    break;
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm_small<2, 2>(m0, m, n0, n);
                    break;
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm_small<2, 1>(m0, m, n0, n);
                    break;
                case 0x14:
                    mc = 1;
                    nc = 4;
                    gemm_small<1, 4>(m0, m, n0, n);
                    break;
                case 0x13:
                    mc = 1;
                    nc = 3;
                    gemm_small<1, 3>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm_small<1, 2>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm_small<1, 1>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

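    // KERNEL_4x8 / KERNEL_8x4 / KERNEL_8x8: pack one column block of quantized A and B,
    // accumulate the int8 products with MMA (__builtin_mma_xvi8ger4pp), and weight the
    // result with the per-block d scales collected in vs. comparray stores the per-row
    // sums of A's quants, which compute<>() uses (as gemm_small does explicitly below)
    // to undo the unsigned offset applied to B during packing.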
    void KERNEL_4x8(int64_t ii, int64_t jj) {
        vec_t vec_A[8], vec_B[16] = {0};
        acc_t acc_0, acc_1;
        std::array<int, 4> comparray;
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
            }
            for (int I = 0; I<4; I++) {
                for (int J = 0; J<4; J++) {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                    *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
            auto aoffset = A+(ii*lda)+l;
            for (int i = 0; i < 4; i++) {
                comparray[i] = 0;
                int ca = 0;
                const int8_t *at = aoffset->qs;
                for (int j = 0; j < 32; j++)
                    ca += (int)*at++;
                comparray[i] = ca;
                aoffset += lda;
            }
            compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
        }
        save_res<4, 4>(ii, jj, 0, fin_res);
        save_res<4, 4>(ii, jj+4, 4, fin_res);
    }

    void KERNEL_8x4(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[8] = {0};
        acc_t acc_0, acc_1;
        std::array<int, 8> comparray;
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
            }
            for (int I = 0; I<8; I++) {
                for (int J = 0; J<4; J++) {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                }
            }
            auto aoffset = A+(ii*lda)+l;
            for (int i = 0; i < 8; i++) {
                comparray[i] = 0;
                int ca = 0;
                const int8_t *at = aoffset->qs;
                for (int j = 0; j < 32; j++)
                    ca += (int)*at++;
                comparray[i] = ca;
                aoffset += lda;
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
        }
        save_res<4, 4>(ii, jj, 0, fin_res);
        save_res<4, 4>(ii+4, jj, 4, fin_res);
    }

    void KERNEL_8x8(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[16] = {0};
        acc_t acc_0, acc_1, acc_2, acc_3;
        std::array<int, 8> comparray;
        vector float fin_res[16] = {0};
        vector float vs[16] = {0};
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            __builtin_mma_xxsetaccz(&acc_2);
            __builtin_mma_xxsetaccz(&acc_3);
            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
                __builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
                __builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
            }
            for (int I = 0; I<8; I++) {
                for (int J = 0; J<4; J++) {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                    *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
            auto aoffset = A+(ii*lda)+l;
            for (int i = 0; i < 8; i++) {
                comparray[i] = 0;
                int ca = 0;
                const int8_t *at = aoffset->qs;
                for (int j = 0; j < 32; j++)
                    ca += (int)*at++;
                comparray[i] = ca;
                aoffset += lda;
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
            compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
            compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
        }
        save_res<4, 4>(ii, jj, 0, fin_res);
        save_res<4, 4>(ii+4, jj, 4, fin_res);
        save_res<4, 4>(ii, jj+4, 8, fin_res);
        save_res<4, 4>(ii+4, jj+4, 12, fin_res);
    }

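    // gemm_small handles the tile shapes below 4x8/8x4 with a single accumulator per tile.
    // The -128 * row-sum term (CA) is added explicitly before the per-block scales in vs
    // are applied, correcting for the unsigned representation of the packed B quants.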
    template<int RM, int RN>
    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        vec_t vec_A[8], vec_B[8] = {0};
        vector signed int vec_C[4];
        acc_t acc_0;

        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            std::array<int, RM> comparray;
            vector float res[4] = {0};
            vector float fin_res[4] = {0};
            vector float vs[4] = {0};
            vector float CA[4] = {0};
            __builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
            __builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
            for (int l = 0; l < k; l++) {
                __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_mma_xxsetaccz(&acc_0);
                packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
                packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
                for(int x = 0; x < 8; x+=4) {
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
                }
                for (int I = 0; I<RM; I++) {
                    for (int J = 0; J<RN; J++) {
                        *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                    }
                }
                __builtin_mma_disassemble_acc(vec_C, &acc_0);
                auto aoffset = A+(ii*lda)+l;
                for (int i = 0; i < RM; i++) {
                    comparray[i] = 0;
                    int ca = 0;
                    const int8_t *at = aoffset->qs;
                    for (int j = 0; j < 32; j++)
                        ca += (int)*at++;
                    comparray[i] = ca;
                    aoffset += lda;
                }

                for (int i = 0; i < RM; i++) {
                    CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
                    res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
                    fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
                }
            }
            save_res<RM, RN>(ii, jj, 0, fin_res);
        }
    }

    template<int RM, int RN>
    inline void kernel(int64_t ii, int64_t jj) {
        if constexpr(RM == 4 && RN == 8) {
            KERNEL_4x8(ii,jj);
        } else if constexpr(RM == 8 && RN == 4) {
            KERNEL_8x4(ii,jj);
        } else if constexpr(RM == 8 && RN == 8) {
            KERNEL_8x8(ii,jj);
        } else {
            static_assert(false, "RN/RM values not supported");
        }
    }

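    // gemm<RM, RN> walks the RM x RN tile grid, splits the tiles evenly across the nth
    // threads (ith selects this thread's share), and dispatches each tile to kernel<RM, RN>().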
    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            kernel<RM, RN>(ii, jj);
        }
    }

    const TA *const A;
    const TB *const B;
    TC *C;
    TA *At;
    TB *Bt;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;
    const int nth;
};

template <typename TA, typename TB, typename TC>
class tinyBLAS_PPC {
  public:
@ -1068,13 +1769,17 @@ class tinyBLAS_PPC {

    void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);

-   void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
+   template<typename VA>
+   void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
        int64_t i, j;
-       float *aoffset = NULL, *boffset = NULL;
-       float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
-       float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
-       aoffset = const_cast<float*>(a);
+       TA *aoffset = NULL, *boffset = NULL;
+       TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+       TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+       __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
+       VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
+       VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
+       VA t1, t2, t3, t4, t5, t6, t7, t8;
+       aoffset = const_cast<TA*>(a);
        boffset = vec;
        j = (rows >> 3);
        if (j > 0) {
@ -1090,9 +1795,6 @@ class tinyBLAS_PPC {
            aoffset += 8 * lda;
            i = (cols >> 3);
            if (i > 0) {
-               __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
-               vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
-               vector float t1, t2, t3, t4, t5, t6, t7, t8;
                do {
                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
@ -1172,21 +1874,19 @@ class tinyBLAS_PPC {
                } while(i > 0);
            }
            if (cols & 4) {
-               vector float c1, c2, c3, c4, c5, c6, c7, c8;
-               vector float t1, t2, t3, t4, t5, t6, t7, t8;
-               c1 = vec_xl(0, aoffset1);
-               c2 = vec_xl(0, aoffset2);
-               c3 = vec_xl(0, aoffset3);
-               c4 = vec_xl(0, aoffset4);
-               c5 = vec_xl(0, aoffset5);
-               c6 = vec_xl(0, aoffset6);
-               c7 = vec_xl(0, aoffset7);
-               c8 = vec_xl(0, aoffset8);
+               c1[0] = vec_xl(0, aoffset1);
+               c2[0] = vec_xl(0, aoffset2);
+               c3[0] = vec_xl(0, aoffset3);
+               c4[0] = vec_xl(0, aoffset4);
+               c5[0] = vec_xl(0, aoffset5);
+               c6[0] = vec_xl(0, aoffset6);
+               c7[0] = vec_xl(0, aoffset7);
+               c8[0] = vec_xl(0, aoffset8);

-               t1 = vec_mergeh(c1, c2);
-               t2 = vec_mergeh(c3, c4);
-               t3 = vec_mergeh(c5, c6);
-               t4 = vec_mergeh(c7, c8);
+               t1 = vec_mergeh(c1[0], c2[0]);
+               t2 = vec_mergeh(c3[0], c4[0]);
+               t3 = vec_mergeh(c5[0], c6[0]);
+               t4 = vec_mergeh(c7[0], c8[0]);
                t5 = vec_xxpermdi(t1, t2, 0);
                t6 = vec_xxpermdi(t3, t4, 0);
                t7 = vec_xxpermdi(t1, t2, 3);
@ -1196,10 +1896,10 @@ class tinyBLAS_PPC {
                vec_xst(t7, 0, boffset+8);
                vec_xst(t8, 0, boffset+12);

-               t1 = vec_mergel(c1, c2);
-               t2 = vec_mergel(c3, c4);
-               t3 = vec_mergel(c5, c6);
-               t4 = vec_mergel(c7, c8);
+               t1 = vec_mergel(c1[0], c2[0]);
+               t2 = vec_mergel(c3[0], c4[0]);
+               t3 = vec_mergel(c5[0], c6[0]);
+               t4 = vec_mergel(c7[0], c8[0]);
                t5 = vec_xxpermdi(t1, t2, 0);
                t6 = vec_xxpermdi(t3, t4, 0);
                t7 = vec_xxpermdi(t1, t2, 3);
@ -1221,9 +1921,6 @@ class tinyBLAS_PPC {
            aoffset += 4 * lda;
            i = (cols >> 3);
            if (i > 0) {
-               __vector_pair C1, C2, C3, C4;
-               vector float c1[2], c2[2], c3[2], c4[2];
-               vector float t1, t2, t3, t4, t5, t6, t7, t8;
                do {
                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
@ -1270,22 +1967,20 @@ class tinyBLAS_PPC {
            }

            if (cols & 4) {
-               vector float c1, c2, c3, c4;
-               vector float t1, t2, t3, t4;
-               c1 = vec_xl(0, aoffset1);
-               c2 = vec_xl(0, aoffset2);
-               c3 = vec_xl(0, aoffset3);
-               c4 = vec_xl(0, aoffset4);
+               c1[0] = vec_xl(0, aoffset1);
+               c2[0] = vec_xl(0, aoffset2);
+               c3[0] = vec_xl(0, aoffset3);
+               c4[0] = vec_xl(0, aoffset4);

-               t1 = vec_mergeh(c1, c2);
-               t2 = vec_mergeh(c3, c4);
+               t1 = vec_mergeh(c1[0], c2[0]);
+               t2 = vec_mergeh(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset);
                vec_xst(t4, 0, boffset+4);

-               t1 = vec_mergel(c1, c2);
-               t2 = vec_mergel(c3, c4);
+               t1 = vec_mergel(c1[0], c2[0]);
+               t2 = vec_mergel(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset+8);
@ -1297,21 +1992,19 @@ class tinyBLAS_PPC {
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            if (cols & 4) {
-               vector float c1, c2, c3, c4 = {0};
-               vector float t1, t2, t3, t4;
-               c1 = vec_xl(0, aoffset1);
-               c2 = vec_xl(0, aoffset2);
-               c3 = vec_xl(0, aoffset3);
+               c1[0] = vec_xl(0, aoffset1);
+               c2[0] = vec_xl(0, aoffset2);
+               c3[0] = vec_xl(0, aoffset3);

-               t1 = vec_mergeh(c1, c2);
-               t2 = vec_mergeh(c3, c4);
+               t1 = vec_mergeh(c1[0], c2[0]);
+               t2 = vec_mergeh(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset);
                vec_xst(t4, 0, boffset+4);

-               t1 = vec_mergel(c1, c2);
-               t2 = vec_mergel(c3, c4);
+               t1 = vec_mergel(c1[0], c2[0]);
+               t2 = vec_mergel(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset+8);
@ -1319,14 +2012,13 @@ class tinyBLAS_PPC {
            }
        }
    }

    void KERNEL_4x4(int64_t ii, int64_t jj) {
        vec_t vec_A[4], vec_B[4], vec_C[4];
        acc_t acc_0;
        __builtin_mma_xxsetaccz(&acc_0);
        for (int l = 0; l < k; l+=4) {
-           READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
-           READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+           packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+           packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
@ -1341,8 +2033,8 @@ class tinyBLAS_PPC {
        __builtin_mma_xxsetaccz(&acc_0);
        __builtin_mma_xxsetaccz(&acc_1);
        for (int64_t l = 0; l < k; l+=4) {
-           READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
-           READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
+           packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+           packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
@ -1362,8 +2054,8 @@ class tinyBLAS_PPC {
        __builtin_mma_xxsetaccz(&acc_0);
        __builtin_mma_xxsetaccz(&acc_1);
        for (int64_t l = 0; l < k; l+=4) {
-           READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
-           READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+           packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
+           packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
@ -1385,8 +2077,8 @@ class tinyBLAS_PPC {
        __builtin_mma_xxsetaccz(&acc_2);
        __builtin_mma_xxsetaccz(&acc_3);
        for (int l = 0; l < k; l+=8) {
-           READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
-           READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
+           packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
+           packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
            for(int x = 0; x < 16; x+=2) {
                __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
                __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
@ -1569,15 +2261,15 @@ class tinyBLAS_PPC {
        vec_t vec_A[4], vec_B[4];
        for (int l=0; l<k; l+=4) {
            if (RN >= 4 && RM == 1) {
-               float* a = const_cast<float*>(A+(ii)*lda+l);
-               READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+               TA* a = const_cast<TA*>(A+(ii)*lda+l);
+               packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
                vec_A[0] = (vec_t)vec_xl(0,a);
-               vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
-               vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
-               vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
+               vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
+               vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
+               vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
            } else {
-               READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
-               READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
+               packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
+               packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
            }
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
@ -1587,7 +2279,7 @@ class tinyBLAS_PPC {
        __builtin_mma_disassemble_acc(vec_C, &acc_0);
        for (int I = 0; I < RM; I++) {
            for (int J = 0; J < RN; J++) {
-               *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
+               *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
            }
        }
    }
@ -1810,6 +2502,20 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
+
+#elif defined(__MMA__)
+       if (n < 8 && n != 4)
+           return false;
+       if (m < 8 && m != 4)
+           return false;
+       tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
+           k, (const block_q8_0 *)A, lda,
+           (const block_q8_0 *)B, ldb,
+           (float *)C, ldc,
+           params->ith, params->nth};
+       tb.matmul(m, n);
+       return true;

#else
        return false;
#endif
@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
        uint64_t nb1,
        uint64_t nb2,
        uint64_t nb3){
-   static_assert(dim >= 0 && dim <= 3);
+   static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
@ -680,6 +680,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half>;
+       case GGML_TYPE_BF16:
+           return convert_unary_cuda<nv_bfloat16>;
        default:
            return nullptr;
    }
@ -1728,7 +1728,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

-   bool use_mul_mat_vec = src0->type == GGML_TYPE_F16
+   bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@ -2869,6 +2869,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_TYPE_IQ3_XXS:
                case GGML_TYPE_IQ4_NL:
                case GGML_TYPE_IQ4_XS:
+               case GGML_TYPE_BF16:
#ifdef GGML_USE_MUSA
                    if (a->type == GGML_TYPE_Q3_K) {
                        return false;
@ -1,9 +1,9 @@
#include "common.cuh"
#include "mmv.cuh"

-template <typename type_acc, int block_size>
+template <typename T, typename type_acc, int block_size>
static __global__ void mul_mat_vec(
-       const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
+       const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
    const int64_t row = blockIdx.x;
    const int64_t channel = blockIdx.z;
@ -13,7 +13,6 @@ static __global__ void mul_mat_vec(
    y   += channel *stride_channel_y;
    dst += channel *stride_channel_dst;

-   const half2  * x2 = (const half2  *) x;
    const float2 * y2 = (const float2 *) y;

    extern __shared__ char data_mmv[];
@ -28,6 +27,9 @@ static __global__ void mul_mat_vec(

    float sumf;

+   if constexpr (std::is_same<T, half>::value) {
+       const half2 * x2 = (const half2 *) x;
+
    if (std::is_same<type_acc, float>::value) {
        sumf = 0.0f;

@ -51,6 +53,19 @@ static __global__ void mul_mat_vec(
        NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
    }
+   } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
+       const int * x2 = (const int *) x;
+       sumf = 0.0f;
+
+       for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+           const int    tmpx = x2[col2];
+           const float2 tmpy = y2[col2];
+           sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+           sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+       }
+   } else {
+       static_assert(std::is_same<T, void>::value, "unsupported type");
+   }

    sumf = warp_reduce_sum(sumf);

@ -71,9 +86,9 @@ static __global__ void mul_mat_vec(
    dst[row] = sumf;
}

-template <typename type_acc>
+template <typename T, typename type_acc>
static void launch_mul_mat_vec_cuda(
-       const half * x, const float * y, float * dst,
+       const T * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
        cudaStream_t stream) {
@ -97,35 +112,35 @@ static void launch_mul_mat_vec_cuda(
    const dim3 block_dims(block_size_best, 1, 1);
    switch (block_size_best) {
        case 32: {
-           mul_mat_vec<type_acc, 32><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 32><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 64: {
-           mul_mat_vec<type_acc, 64><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 64><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 96: {
-           mul_mat_vec<type_acc, 96><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 96><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 128: {
-           mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 160: {
-           mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 192: {
-           mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 224: {
-           mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case 256: {
-           mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
+           mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        default: {
@ -134,25 +149,25 @@ static void launch_mul_mat_vec_cuda(
    }
}

+template<typename T>
static void mul_mat_vec_cuda(
-       const half * x, const float * y, float * dst,
+       const T * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
        enum ggml_prec prec, cudaStream_t stream) {
    switch (prec) {
        case GGML_PREC_DEFAULT: {
-           launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+           launch_mul_mat_vec_cuda<T, half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
        } break;
        case GGML_PREC_F32: {
-           launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+           launch_mul_mat_vec_cuda<T, float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
        } break;
    }
}

void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-   GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);

@ -164,7 +179,6 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;

-   const half  * src0_d = (const half  *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float       * dst_d  = (float       *) dst->data;

@ -181,7 +195,20 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
    const int64_t channel_stride_y   = src1->nb[2] / ggml_type_size(src1->type);
    const int64_t channel_stride_dst =  dst->nb[2] / ggml_type_size( dst->type);

-   mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+   switch (src0->type) {
+       case GGML_TYPE_F16: {
+           const half * src0_d = (const half *) src0->data;
+           mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+               channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+       } break;
+       case GGML_TYPE_BF16: {
+           const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
+           mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+               channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+       } break;
+       default:
+           GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+   }
}

void ggml_cuda_op_mul_mat_vec(
@ -190,7 +217,6 @@ void ggml_cuda_op_mul_mat_vec(
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

-   GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);

@ -211,8 +237,20 @@ void ggml_cuda_op_mul_mat_vec(
    const int64_t channel_stride_y   = 0;
    const int64_t channel_stride_dst = 0;

-   mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
-       nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+   switch (src0->type) {
+       case GGML_TYPE_F16: {
+           const half * src0_d = (const half *) src0_dd_i;
+           mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+               nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+       } break;
+       case GGML_TYPE_BF16: {
+           const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
+           mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+               nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+       } break;
+       default:
+           GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+   }

    GGML_UNUSED(ctx);
    GGML_UNUSED(src1);
1 ggml/src/ggml-cuda/vendors/cuda.h vendored
@ -3,6 +3,7 @@
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
+#include <cuda_bf16.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
3 ggml/src/ggml-cuda/vendors/hip.h vendored
@ -3,6 +3,7 @@
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
@ -121,6 +122,8 @@
#define __has_builtin(x) 0
#endif

+typedef hip_bfloat16 nv_bfloat16;
+
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
3 ggml/src/ggml-cuda/vendors/musa.h vendored
@ -3,6 +3,7 @@
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
+#include <musa_bf16.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
@ -132,3 +133,5 @@
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
+
+typedef mt_bfloat16 nv_bfloat16;
@ -3,6 +3,8 @@
|
|||||||
// GGML internal header
|
// GGML internal header
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||||
@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
|||||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||||
|
|
||||||
// expose GGUF internals for test code
|
|
||||||
|
|
||||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
|
||||||
|
|
||||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
|
||||||
|
|
||||||
struct gguf_buf {
|
|
||||||
void * data;
|
|
||||||
size_t size;
|
|
||||||
size_t offset;
|
|
||||||
};
|
|
||||||
GGML_API struct gguf_buf gguf_buf_init(size_t size);
|
|
||||||
GGML_API void gguf_buf_free(struct gguf_buf buf);
|
|
||||||
|
|
||||||
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// expose GGUF internals for test code
|
||||||
|
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||||
|
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||||
|
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
|
||||||
|
#endif // __cplusplus
|
||||||
|
@ -103,3 +103,19 @@ else()
|
|||||||
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||||
)
|
)
|
||||||
endif() # GGML_METAL_EMBED_LIBRARY
|
endif() # GGML_METAL_EMBED_LIBRARY
|
||||||
|
|
||||||
|
if (NOT GGML_METAL_EMBED_LIBRARY)
|
||||||
|
install(
|
||||||
|
FILES src/ggml-metal/ggml-metal.metal
|
||||||
|
PERMISSIONS
|
||||||
|
OWNER_READ
|
||||||
|
OWNER_WRITE
|
||||||
|
GROUP_READ
|
||||||
|
WORLD_READ
|
||||||
|
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||||
|
|
||||||
|
install(
|
||||||
|
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||||
|
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
@ -2067,8 +2067,8 @@ static void ggml_metal_encode_node(
|
|||||||
GGML_ASSERT(ne12 % ne02 == 0);
|
GGML_ASSERT(ne12 % ne02 == 0);
|
||||||
GGML_ASSERT(ne13 % ne03 == 0);
|
GGML_ASSERT(ne13 % ne03 == 0);
|
||||||
|
|
||||||
const uint r2 = ne12/ne02;
|
const uint32_t r2 = ne12/ne02;
|
||||||
const uint r3 = ne13/ne03;
|
const uint32_t r3 = ne13/ne03;
|
||||||
|
|
||||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||||
// to the matrix-vector kernel
|
// to the matrix-vector kernel
|
||||||
|
@ -2744,13 +2744,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||||||
cl_image_format img_fmt_1d;
|
cl_image_format img_fmt_1d;
|
||||||
cl_image_desc img_desc_1d;
|
cl_image_desc img_desc_1d;
|
||||||
cl_buffer_region region;
|
cl_buffer_region region;
|
||||||
cl_mem A_image1d;
|
cl_mem A_image1d = nullptr;
|
||||||
cl_mem B_image1d;
|
cl_mem B_image1d = nullptr;
|
||||||
cl_mem B_sub_buffer;
|
cl_mem B_sub_buffer = nullptr;
|
||||||
cl_mem C_d;
|
cl_mem C_d = nullptr;
|
||||||
// for B transpose
|
// for B transpose
|
||||||
cl_mem B_d;
|
cl_mem B_d = nullptr;
|
||||||
cl_mem B_d_input_image;
|
cl_mem B_d_input_image = nullptr;
|
||||||
// <--------------------------------------------> //
|
// <--------------------------------------------> //
|
||||||
|
|
||||||
// define matrix dimensions
|
// define matrix dimensions
|
||||||
|
@@ -27,15 +27,6 @@
 #endif
 #include <cstring>

-#define UNUSED GGML_UNUSED
-
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
 #ifdef _WIN32
 typedef SOCKET sockfd_t;
 using ssize_t = __int64;
@@ -93,9 +84,23 @@ enum rpc_cmd {
     RPC_CMD_COPY_TENSOR,
     RPC_CMD_GRAPH_COMPUTE,
     RPC_CMD_GET_DEVICE_MEMORY,
+    RPC_CMD_INIT_TENSOR,
+    RPC_CMD_GET_ALLOC_SIZE,
     RPC_CMD_COUNT,
 };

+struct rpc_msg_get_alloc_size_req {
+    rpc_tensor tensor;
+};
+
+struct rpc_msg_get_alloc_size_rsp {
+    uint64_t alloc_size;
+};
+
+struct rpc_msg_init_tensor_req {
+    rpc_tensor tensor;
+};
+
 struct rpc_msg_alloc_buffer_req {
     uint64_t size;
 };
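These messages travel over the socket as raw bytes, so their layout has to stay trivially copyable and densely packed. A small compile-time check a test could add (the struct below mirrors the one declared above so the sketch is self-contained; the size is derived from the single member shown):

#include <cstdint>
#include <type_traits>

// mirror of rpc_msg_get_alloc_size_rsp as declared above
struct rpc_msg_get_alloc_size_rsp_mirror {
    uint64_t alloc_size;
};

static_assert(std::is_trivially_copyable<rpc_msg_get_alloc_size_rsp_mirror>::value,
              "RPC messages are copied to/from the socket as raw bytes");
static_assert(sizeof(rpc_msg_get_alloc_size_rsp_mirror) == sizeof(uint64_t),
              "the GET_ALLOC_SIZE response is a single 64-bit value");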
@@ -397,7 +402,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
         initialized = true;
     }
 #else
-    UNUSED(initialized);
+    GGML_UNUSED(initialized);
 #endif
     auto sock = socket_connect(host.c_str(), port);
     if (sock == nullptr) {
@@ -461,10 +466,18 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
 }

 static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    UNUSED(buffer);
-    if (ggml_is_quantized(tensor->type)) {
-        // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
-        GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor");
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+
+    // CUDA backend on the server pads everything to 512 due to CUDA limitations.
+    // Due to bandwidth constraints, we only call the server init tensor functions if necessary.
+    // In particular, only quantized tensors need padding
+    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+        rpc_msg_init_tensor_req request;
+
+        request.tensor = serialize_tensor(tensor);
+
+        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
+        GGML_ASSERT(status);
     }
 }

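The client now skips the init-tensor round trip whenever the server-side call would be a no-op. Restated as a standalone predicate (a sketch mirroring the condition above, not a helper that exists in the file):

#include "ggml.h"

// true only for quantized, non-view tensors whose row length is not already a
// multiple of 512 -- the one case where the server may have to pad the allocation
static bool rpc_tensor_needs_server_init(const ggml_tensor * tensor) {
    return ggml_is_quantized(tensor->type) &&
           (tensor->ne[0] % 512 != 0)      &&
           (tensor->view_src == nullptr);
}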
@@ -577,8 +590,23 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
 }

 static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    UNUSED(buft);
-    return ggml_nbytes(tensor);
+    // See comments in init_tensor.
+    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+        ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+        auto sock = get_socket(buft_ctx->endpoint);
+
+        rpc_msg_get_alloc_size_req request;
+
+        request.tensor = serialize_tensor(tensor);
+
+        rpc_msg_get_alloc_size_rsp response;
+        bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
+        GGML_ASSERT(status);
+
+        return response.alloc_size;
+    } else {
         return ggml_nbytes(tensor);
+    }
 }

 static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
@@ -603,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
 }

 static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
+    GGML_UNUSED(backend);
     // this is no-op because we don't have any async operations
 }

@@ -757,6 +785,8 @@ public:
     bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
     bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
     bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
+    bool init_tensor(const rpc_msg_init_tensor_req & request);
+    bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);

 private:
     ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@@ -770,6 +800,36 @@ private:
     std::unordered_set<ggml_backend_buffer_t> buffers;
 };

+bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
+    ggml_backend_buffer_type_t buft;
+    struct ggml_init_params params {
+        /*.mem_size =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    if (tensor->buffer == nullptr) {
+        //No buffer allocated.
+        buft = ggml_backend_get_default_buffer_type(backend);
+    } else {
+        buft = tensor->buffer->buft;
+    }
+
+    response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
+
+    ggml_free(ctx);
+    return true;
+}
+
 void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
@@ -781,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
         GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
         buffers.insert(buffer);
     } else {
-        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
+        GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
     }
 }

@@ -803,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
     if (buffers.find(buffer) == buffers.end()) {
-        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
         return false;
     }
     void * base = ggml_backend_buffer_get_base(buffer);
@@ -815,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
     if (buffers.find(buffer) == buffers.end()) {
-        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
         return false;
     }
     ggml_backend_buffer_free(buffer);
@@ -827,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
     if (buffers.find(buffer) == buffers.end()) {
-        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
         return false;
     }
     ggml_backend_buffer_clear(buffer, request.value);
@@ -883,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
     struct ggml_context * ctx = ggml_init(params);
     ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
     if (tensor == nullptr) {
-        GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
         ggml_free(ctx);
         return false;
     }
@@ -905,6 +965,40 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
     return true;
 }

+bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
+    struct ggml_init_params params {
+        /*.mem_size =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    // Call the backend's buffer_init_tensor function
+    ggml_backend_buffer_t buffer = tensor->buffer;
+    if (buffer && buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    } else {
+        GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
+    }
+
+    if (tensor->extra != nullptr) {
+        // This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
+        // Currently unimplemented.
+        GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    ggml_free(ctx);
+    return true;
+}
+
 bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
     struct ggml_init_params params {
         /*.mem_size =*/ ggml_tensor_overhead(),
@@ -914,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
     struct ggml_context * ctx = ggml_init(params);
     ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
     if (tensor == nullptr) {
-        GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
         ggml_free(ctx);
         return false;
     }
@@ -948,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
     ggml_tensor * src = deserialize_tensor(ctx, &request.src);
     ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
     if (src == nullptr || dst == nullptr) {
-        GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
+        GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
         ggml_free(ctx);
         return false;
     }
@@ -1058,6 +1152,18 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
                 }
                 break;
             }
+            case RPC_CMD_GET_ALLOC_SIZE: {
+                rpc_msg_get_alloc_size_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_get_alloc_size_rsp response;
+                server.get_alloc_size(request, response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
             case RPC_CMD_GET_ALIGNMENT: {
                 if (!recv_msg(sockfd, nullptr, 0)) {
                     return;
@@ -1133,6 +1239,19 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
                 }
                 break;
             }
+            case RPC_CMD_INIT_TENSOR: {
+                rpc_msg_init_tensor_req request;
+                if (!recv_msg(sockfd, &request,sizeof(request))) {
+                    return;
+                }
+                if (!server.init_tensor(request)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                break;
+            }
             case RPC_CMD_GET_TENSOR: {
                 rpc_msg_get_tensor_req request;
                 if (!recv_msg(sockfd, &request, sizeof(request))) {
@@ -1257,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *

     ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);

-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }

 static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
     // TODO: obtain value from the server
     return GGML_BACKEND_DEVICE_TYPE_GPU;

-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }

 static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@@ -1285,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const

     return ggml_backend_rpc_init(ctx->endpoint.c_str());

-    UNUSED(params);
+    GGML_UNUSED(params);
 }

 static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -1293,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b

     return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());

-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }

 static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    UNUSED(dev);
-    UNUSED(op);
+    GGML_UNUSED(dev);
+    GGML_UNUSED(op);
     //TODO: call the remote backend and cache the results
     return true;
 }
@@ -1335,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
 static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
     return "RPC";

-    UNUSED(reg);
+    GGML_UNUSED(reg);
 }

 static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
     return 0;

-    UNUSED(reg);
+    GGML_UNUSED(reg);
 }

 static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
     GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");

-    UNUSED(reg);
-    UNUSED(index);
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
 }

 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
@@ -1357,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     }
     return NULL;

-    UNUSED(reg);
+    GGML_UNUSED(reg);
 }

 static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
@@ -131,7 +131,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
             [=](sycl::nd_item<3> item_ct1) {
                 rwkv_wkv_f32_kernel(
                     B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
-                    item_ct1, shared_mem_acc.get_pointer()
+                    item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
                 );
             });
         });
@@ -8,6 +8,20 @@ if (Vulkan_FOUND)
         ../../include/ggml-vulkan.h
         )

+    # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
+    # If it's not, there will be an error to stderr.
+    # If it's supported, set a define to indicate that we should compile those shaders
+    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
+                    OUTPUT_VARIABLE glslc_output
+                    ERROR_VARIABLE glslc_error)
+
+    if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
+        message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
+    else()
+        message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
+        add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    endif()
+
     # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
     # If it's not, there will be an error to stderr.
     # If it's supported, set a define to indicate that we should compile those shaders
@@ -69,6 +83,10 @@ if (Vulkan_FOUND)

     file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")

+    if (NOT CMAKE_CROSSCOMPILING)
+        set(_ggml_vk_genshaders_cmd "$<TARGET_FILE_DIR:vulkan-shaders-gen>/${_ggml_vk_genshaders_cmd}")
+    endif ()
+
     add_custom_command(
         OUTPUT ${_ggml_vk_header}
             ${_ggml_vk_source}
@@ -145,6 +145,8 @@ class vk_perf_logger;
 #endif
 static void ggml_vk_destroy_buffer(vk_buffer& buf);

+static constexpr uint32_t mul_mat_vec_max_cols = 8;
+
 struct vk_device_struct {
     std::mutex mutex;

@@ -202,8 +204,8 @@ struct vk_device_struct {
     vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];

     vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
     vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];

     vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
@ -1643,6 +1645,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
#undef CREATE_MM2
|
#undef CREATE_MM2
|
||||||
} else
|
} else
|
||||||
#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
|
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||||
if (device->coopmat_support) {
|
if (device->coopmat_support) {
|
||||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||||
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
@ -1737,7 +1740,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
}
|
}
|
||||||
#undef CREATE_MM2
|
#undef CREATE_MM2
|
||||||
#undef CREATE_MM
|
#undef CREATE_MM
|
||||||
} else if (device->fp16) {
|
} else
|
||||||
|
#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||||
|
if (device->fp16) {
|
||||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||||
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
if (device->mul_mat ## ID ## _l) \
|
if (device->mul_mat ## ID ## _l) \
|
||||||
@ -1866,33 +1871,35 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL)
|
} else if (device->vendor_id == VK_VENDOR_ID_INTEL)
|
||||||
rm_stdq = 2;
|
rm_stdq = 2;
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1), mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32_"+std::to_string(i+1), mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
@ -2036,6 +2043,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
std::cerr << "Done!" << std::endl;
|
std::cerr << "Done!" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
|
||||||
|
|
||||||
static vk_device ggml_vk_get_device(size_t idx) {
|
static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
|
VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
|
||||||
|
|
||||||
@ -2171,9 +2180,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
|
|
||||||
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
|
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
|
||||||
|
|
||||||
if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
|
if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
|
||||||
// Intel drivers don't support coopmat properly yet
|
|
||||||
// Only RADV supports coopmat properly on AMD
|
|
||||||
device->coopmat_support = false;
|
device->coopmat_support = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2238,6 +2245,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
|
last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(VK_KHR_cooperative_matrix)
|
||||||
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
||||||
coopmat_features.pNext = nullptr;
|
coopmat_features.pNext = nullptr;
|
||||||
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
||||||
@ -2247,6 +2255,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
last_struct->pNext = (VkBaseOutStructure *)&coopmat_features;
|
last_struct->pNext = (VkBaseOutStructure *)&coopmat_features;
|
||||||
last_struct = (VkBaseOutStructure *)&coopmat_features;
|
last_struct = (VkBaseOutStructure *)&coopmat_features;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(VK_NV_cooperative_matrix2)
|
#if defined(VK_NV_cooperative_matrix2)
|
||||||
VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {};
|
VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {};
|
||||||
@ -2279,7 +2288,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
device_extensions.push_back("VK_EXT_subgroup_size_control");
|
device_extensions.push_back("VK_EXT_subgroup_size_control");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(VK_KHR_cooperative_matrix)
|
||||||
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
|
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (coopmat2_support) {
|
if (coopmat2_support) {
|
||||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
@ -2372,6 +2383,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(VK_KHR_cooperative_matrix)
|
||||||
if (device->coopmat_support) {
|
if (device->coopmat_support) {
|
||||||
// Query supported shapes
|
// Query supported shapes
|
||||||
std::vector<VkCooperativeMatrixPropertiesKHR> cm_props;
|
std::vector<VkCooperativeMatrixPropertiesKHR> cm_props;
|
||||||
@ -2438,7 +2450,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
if (device->coopmat_support) {
|
if (device->coopmat_support) {
|
||||||
device_extensions.push_back("VK_KHR_cooperative_matrix");
|
device_extensions.push_back("VK_KHR_cooperative_matrix");
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
device->name = GGML_VK_NAME + std::to_string(idx);
|
device->name = GGML_VK_NAME + std::to_string(idx);
|
||||||
|
|
||||||
device_create_info = {
|
device_create_info = {
|
||||||
@ -2511,7 +2523,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
return vk_instance.devices[idx];
|
return vk_instance.devices[idx];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void ggml_vk_print_gpu_info(size_t idx) {
|
static void ggml_vk_print_gpu_info(size_t idx) {
|
||||||
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
||||||
size_t dev_num = vk_instance.device_indices[idx];
|
size_t dev_num = vk_instance.device_indices[idx];
|
||||||
@ -2550,9 +2561,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|||||||
fp16_storage = true;
|
fp16_storage = true;
|
||||||
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
||||||
fp16_compute = true;
|
fp16_compute = true;
|
||||||
|
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||||
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
|
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
|
||||||
!getenv("GGML_VK_DISABLE_COOPMAT")) {
|
!getenv("GGML_VK_DISABLE_COOPMAT")) {
|
||||||
coopmat_support = true;
|
coopmat_support = true;
|
||||||
|
#endif
|
||||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
|
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
|
||||||
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
|
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
|
||||||
@ -2561,9 +2574,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
|
if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
|
||||||
// Intel drivers don't support coopmat properly yet
|
|
||||||
// Only RADV supports coopmat properly on AMD
|
|
||||||
coopmat_support = false;
|
coopmat_support = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2592,6 +2603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|||||||
// Pointer to the last chain element
|
// Pointer to the last chain element
|
||||||
VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features;
|
VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features;
|
||||||
|
|
||||||
|
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||||
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
||||||
coopmat_features.pNext = nullptr;
|
coopmat_features.pNext = nullptr;
|
||||||
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
||||||
@ -2607,6 +2619,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|||||||
fp16 = fp16 && vk12_features.shaderFloat16;
|
fp16 = fp16 && vk12_features.shaderFloat16;
|
||||||
|
|
||||||
coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix;
|
coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix;
|
||||||
|
#endif
|
||||||
|
|
||||||
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
||||||
|
|
||||||
@ -2892,9 +2905,10 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|||||||
return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
|
return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
|
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols) {
|
||||||
VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
|
VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
|
||||||
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
|
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
|
||||||
|
GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols);
|
||||||
|
|
||||||
switch (a_type) {
|
switch (a_type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
@ -2915,7 +2929,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type];
|
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type][num_cols-1];
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
|
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
|
||||||
@ -3925,8 +3939,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
const uint64_t ne12 = src1->ne[2];
|
const uint64_t ne12 = src1->ne[2];
|
||||||
const uint64_t ne13 = src1->ne[3];
|
const uint64_t ne13 = src1->ne[3];
|
||||||
|
|
||||||
GGML_ASSERT(ne11 == 1);
|
|
||||||
|
|
||||||
const uint64_t ne20 = dst->ne[0];
|
const uint64_t ne20 = dst->ne[0];
|
||||||
const uint64_t ne21 = dst->ne[1];
|
const uint64_t ne21 = dst->ne[1];
|
||||||
const uint64_t ne22 = dst->ne[2];
|
const uint64_t ne22 = dst->ne[2];
|
||||||
@ -3935,6 +3947,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
const uint64_t r2 = ne12 / ne02;
|
const uint64_t r2 = ne12 / ne02;
|
||||||
const uint64_t r3 = ne13 / ne03;
|
const uint64_t r3 = ne13 / ne03;
|
||||||
|
|
||||||
|
// batch_n indicates that we need to compute a few vector results, and this assumes
|
||||||
|
// ne12 and ne13 are 1. It overloads the batch_strides to hold the row strides.
|
||||||
|
GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1);
|
||||||
|
bool batch_n = ne11 > 1;
|
||||||
|
|
||||||
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||||
@ -3985,7 +4002,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
} else {
|
} else {
|
||||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||||
}
|
}
|
||||||
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
|
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11);
|
||||||
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
||||||
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
||||||
GGML_ASSERT(dmmv != nullptr);
|
GGML_ASSERT(dmmv != nullptr);
|
||||||
@ -4057,8 +4074,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t stride_batch_x = ne00*ne01;
|
// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
|
||||||
uint32_t stride_batch_y = ne10*ne11;
|
uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01;
|
||||||
|
uint32_t stride_batch_y = batch_n ? ne10 : (ne10*ne11);
|
||||||
|
uint32_t stride_batch_d = batch_n ? ne20 : (ne20*ne21);
|
||||||
|
|
||||||
if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
|
if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
|
||||||
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
|
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
|
||||||
@ -4081,7 +4100,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
// compute
|
// compute
|
||||||
const vk_mat_vec_push_constants pc = {
|
const vk_mat_vec_push_constants pc = {
|
||||||
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
||||||
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
|
stride_batch_x, stride_batch_y, stride_batch_d,
|
||||||
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
||||||
};
|
};
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
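Note: the program below is a standalone C++ sketch, not part of the diff, illustrating how the overloaded batch strides from the hunks above address A, B and D when batch_n is set: A is reused for every "batch" (stride 0), while each "batch" of B and D is a single column. The shapes and the a_base/b_base/d_base names are hypothetical; only the three stride expressions mirror the change.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical shapes (ggml convention assumed): ne00 = K, ne01 = M for A;
    // ne10 = K, ne11 = N columns for B; ne20 = M, ne21 = N for D.
    const uint32_t ne00 = 8, ne01 = 4;
    const uint32_t ne10 = 8, ne11 = 3;
    const uint32_t ne20 = 4, ne21 = 3;

    const bool batch_n = ne11 > 1;  // ne12 == ne13 == 1 assumed, as in the hunk

    const uint32_t stride_batch_x = batch_n ? 0    : ne00*ne01;  // A reused for every "batch"
    const uint32_t stride_batch_y = batch_n ? ne10 : ne10*ne11;  // one column of B per "batch"
    const uint32_t stride_batch_d = batch_n ? ne20 : ne20*ne21;  // one column of D per "batch"

    // With batch_n, "batch" j is column j of B/D, while A stays at offset 0.
    for (uint32_t j = 0; j < ne11; ++j) {
        const uint32_t a_base = j * stride_batch_x;
        const uint32_t b_base = j * stride_batch_y;
        const uint32_t d_base = j * stride_batch_d;
        std::printf("col %u: A offset %u, B offset %u, D offset %u\n", j, a_base, b_base, d_base);
    }
    return 0;
}

Overloading the existing batch-stride push constants this way (as the in-code comment notes) lets the shader handle a few output columns without adding a new interface for the multi-column case.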
@@ -4261,7 +4280,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
     } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
                !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
         ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
-    } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
+    // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
+    // when ne12 and ne13 are one.
+    } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) &&
+               (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun);
     } else {
         ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun);
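Note: a minimal C++ sketch (not the llama.cpp implementation) of the new dispatch predicate above, with the src0 type check left out. The value of mul_mat_vec_max_cols is not shown in this hunk; the 4 below is only assumed from the "up to four" comment.

#include <cstdint>
#include <cstdio>

// Returns true when the mul_mat_vec path may be used for these shapes.
static bool use_mul_mat_vec(int64_t dst_ne1, int64_t src1_ne2, int64_t src1_ne3, int64_t max_cols) {
    // Either a single output column (any ne12*ne13 batching), or a handful of
    // columns when there is no ne12/ne13 batching.
    return dst_ne1 == 1 || (dst_ne1 <= max_cols && src1_ne2 * src1_ne3 == 1);
}

int main() {
    const int64_t max_cols = 4;  // assumed stand-in for mul_mat_vec_max_cols
    std::printf("%d\n", use_mul_mat_vec(1, 8, 1, max_cols));  // 1: single column, batched over ne12
    std::printf("%d\n", use_mul_mat_vec(3, 1, 1, max_cols));  // 1: a few columns, no ne12/ne13 batching
    std::printf("%d\n", use_mul_mat_vec(3, 2, 1, max_cols));  // 0: falls back to the full mul_mat path
    return 0;
}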
@@ -8075,6 +8097,25 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
     UNUSED(instance_extensions);
 }
 
+static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
+    switch (props.vendorID) {
+    case VK_VENDOR_ID_INTEL:
+        // Intel drivers don't support coopmat properly yet
+        return false;
+    case VK_VENDOR_ID_AMD:
+        if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
+            // Workaround for AMD proprietary driver reporting support on all GPUs
+            const std::string name = props.deviceName;
+            return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 ||       // RDNA 3 consumer GPUs
+                   name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 ||   // RDNA 3 workstation GPUs
+                   name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0;               // RDNA 3 APUs
+        }
+        return true;
+    default:
+        return true;
+    }
+}
+
 // checks
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
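Note: the RDNA 3 allow-list above relies on std::string::rfind(prefix, 0) == 0 as a "starts with" test; searching backwards from position 0 can only match at the very start of the string. A standalone C++ illustration (the device names are examples only, not an authoritative list):

#include <cstdio>
#include <string>

// True when s begins with prefix.
static bool starts_with(const std::string & s, const char * prefix) {
    return s.rfind(prefix, 0) == 0;
}

int main() {
    std::printf("%d\n", starts_with("AMD Radeon RX 7900 XTX", "AMD Radeon RX 7"));   // 1: matches at position 0
    std::printf("%d\n", starts_with("AMD Radeon RX 6800",     "AMD Radeon RX 7"));   // 0: no match -> npos
    std::printf("%d\n", starts_with("AMD Radeon(TM) 780M",    "AMD Radeon(TM) 7"));  // 1: APU-style name
    return 0;
}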
@@ -9,9 +9,6 @@
 
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
-layout (constant_id = 0) const uint BLOCK_SIZE = 32;
-layout (constant_id = 1) const uint NUM_ROWS = 1;
-
 #if !defined(DATA_A_F32) && !defined(DATA_A_F16)
 #define K_PER_ITER 8
 #else
@@ -21,23 +18,22 @@ layout (constant_id = 1) const uint NUM_ROWS = 1;
 
 uint a_offset, b_offset, d_offset, y_offset;
 
-shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
-
-void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
+void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
 {
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
     const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
     const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
     const uint iybs = col - col%QUANT_K; // y block start index
 
 #if K_PER_ITER == 8
 #if QUANT_R == 2
-    const B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
-    const B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
+    const B_TYPE_VEC4 bv02 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
+    const B_TYPE_VEC4 bv13 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4];
     const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
     const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
 #else
-    const vec4 bv0 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4]);
-    const vec4 bv1 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4 + 1]);
+    const vec4 bv0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
+    const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
 #endif
 #else
     // Check if the second of the pair of elements is OOB, and don't fetch B or
@@ -48,9 +44,9 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
     const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
 
     FLOAT_TYPE b0 = 0, b1 = 0;
-    b0 = FLOAT_TYPE(data_b[b_offset + iybs + iqs]);
+    b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
     if (!OOB) {
-        b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
+        b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
     }
 #endif
     uint ibi = first_row*p.ncols;
@@ -75,17 +71,18 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
             if (dm.y == 0)
                 rowtmp *= dm.x;
 
-            temp[n] += rowtmp;
+            temp[j][n] += rowtmp;
 #else
             const vec2 v = dequantize(ib, iqs, a_offset);
 
             // matrix multiplication
-            temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]);
+            temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
             if (!OOB) {
-                temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
+                temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
             }
 #endif
         }
+    }
 }
 
 void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
@@ -96,10 +93,12 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
 
     y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
 
-    FLOAT_TYPE temp[NUM_ROWS];
+    FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
 
-    for (uint i = 0; i < NUM_ROWS; ++i) {
-        temp[i] = FLOAT_TYPE(0);
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
     }
 
     uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
@@ -131,24 +130,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
         i++;
     }
 
-    // sum up partial sums and write back result
-    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        tmpsh[n][tid] = temp[n];
-    }
-    barrier();
-    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
-        if (tid < s) {
-            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-                tmpsh[n][tid] += tmpsh[n][tid + s];
-            }
-        }
-        barrier();
-    }
-    if (tid == 0) {
-        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
-        }
-    }
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
 }
 
 void main() {
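Note: as a scalar reference for the shader hunks above, the C++ sketch below (not GLSL, with made-up sizes) computes what one workgroup now accumulates: NUM_ROWS rows of A against NUM_COLS columns of B into temp[j][n], with each column of B starting at a batch_stride_b-like offset, before the partial sums are handed to reduce_result().

#include <cstdio>
#include <vector>

int main() {
    const int K = 8, NUM_ROWS = 2, NUM_COLS = 3;  // hypothetical sizes
    std::vector<float> A(NUM_ROWS * K), B(NUM_COLS * K);
    for (int i = 0; i < (int)A.size(); ++i) A[i] = 0.5f * i;
    for (int i = 0; i < (int)B.size(); ++i) B[i] = 1.0f + i;

    float temp[3][2] = {};                        // temp[NUM_COLS][NUM_ROWS]
    for (int j = 0; j < NUM_COLS; ++j) {          // like the added [[unroll]] j loop
        for (int n = 0; n < NUM_ROWS; ++n) {
            for (int k = 0; k < K; ++k) {
                // column j of B starts at j*K here, mirroring j*p.batch_stride_b in the shader
                temp[j][n] += A[n*K + k] * B[j*K + k];
            }
        }
    }
    for (int j = 0; j < NUM_COLS; ++j)
        for (int n = 0; n < NUM_ROWS; ++n)
            std::printf("D[col %d][row %d] = %g\n", j, n, temp[j][n]);
    return 0;
}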
@@ -83,3 +83,36 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
                batch_idx * p.batch_stride_d;
 #endif
 }
+
+layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
+layout (constant_id = 2) const uint NUM_COLS = 1;
+
+shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
+
+void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
+    // sum up partial sums and write back result
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            tmpsh[j][n][tid] = temp[j][n];
+        }
+    }
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
+        if (tid < s) {
+            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+                [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                    tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
+                }
+            }
+        }
+        barrier();
+    }
+    if (tid == 0) {
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
+            }
+        }
+    }
+}
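Note: reduce_result() above is a standard shared-memory tree reduction; each halving step is synchronized across the workgroup with barrier(). The sketch below (sequential C++, hypothetical values, not the shader) models the same halving for a single (column, row) pair.

#include <cstdio>

int main() {
    const unsigned BLOCK_SIZE = 32;              // must be a power of two, as in the shader
    float tmpsh[32];
    float expected = 0.0f;
    for (unsigned t = 0; t < BLOCK_SIZE; ++t) {  // pretend each invocation wrote its partial sum
        tmpsh[t] = 0.25f * t;
        expected += tmpsh[t];
    }
    // log2(BLOCK_SIZE) halving steps; in the shader these run in parallel between barriers
    for (unsigned s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        for (unsigned tid = 0; tid < s; ++tid) {
            tmpsh[tid] += tmpsh[tid + s];
        }
    }
    std::printf("reduced %g, expected %g\n", tmpsh[0], expected);  // both print the full sum
    return 0;
}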
Some files were not shown because too many files have changed in this diff.