mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-06 02:48:57 +01:00)

Merge master into server-cfg
commit 890d1b8446

.gitignore (vendored, 5 changes)
@ -16,6 +16,8 @@ build/
|
||||
build-em/
|
||||
build-debug/
|
||||
build-release/
|
||||
build-ci-debug/
|
||||
build-ci-release/
|
||||
build-static/
|
||||
build-cublas/
|
||||
build-opencl/
|
||||
@ -25,9 +27,10 @@ build-no-accel/
|
||||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
out/
|
||||
tmp/
|
||||
|
||||
models/*
|
||||
*.bin
|
||||
models-mnt
|
||||
|
||||
/main
|
||||
/quantize
|
||||
|
@ -186,7 +186,16 @@ if (LLAMA_BLAS)
|
||||
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
|
||||
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
|
||||
# all Intel* libraries share the same include path
|
||||
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
|
||||
pkg_check_modules(DepBLAS mkl-sdl)
|
||||
if (NOT DepBLAS)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set(LINK_METHOD dynamic)
|
||||
else()
|
||||
set(LINK_METHOD static)
|
||||
endif()
|
||||
string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
|
||||
pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
|
||||
endif()
|
||||
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
|
||||
# this doesn't provide pkg-config
|
||||
# suggest to assign BLAS_INCLUDE_DIRS on your own
|
||||
@ -512,6 +521,7 @@ if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
|
||||
target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
install(TARGETS ggml_shared LIBRARY)
|
||||
endif()
|
||||
|
||||
add_library(llama
|
||||
@ -533,8 +543,32 @@ if (BUILD_SHARED_LIBS)
|
||||
if (LLAMA_METAL)
|
||||
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
endif()
|
||||
install(TARGETS llama LIBRARY)
|
||||
endif()
|
||||
|
||||
include(GNUInstallDirs)
|
||||
install(
|
||||
FILES convert.py
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
OWNER_EXECUTE
|
||||
GROUP_READ
|
||||
GROUP_EXECUTE
|
||||
WORLD_READ
|
||||
WORLD_EXECUTE
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install(
|
||||
FILES convert-lora-to-ggml.py
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
OWNER_EXECUTE
|
||||
GROUP_READ
|
||||
GROUP_EXECUTE
|
||||
WORLD_READ
|
||||
WORLD_EXECUTE
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
#
|
||||
# programs, examples and tests
|
||||
|
Makefile (20 changes)
@ -151,14 +151,11 @@ ifdef LLAMA_MPI
|
||||
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
OBJS += ggml-mpi.o
|
||||
|
||||
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
|
||||
LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
@ -226,9 +223,6 @@ ifdef LLAMA_METAL
|
||||
CXXFLAGS += -DGGML_USE_METAL
|
||||
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||
OBJS += ggml-metal.o
|
||||
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
@ -253,6 +247,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
|
||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_MPI
|
||||
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_NO_K_QUANTS
|
||||
k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
|
||||
|
||||
```bash
|
||||
# run the verification script
|
||||
python3 .\scripts\verify-checksum-models.py
|
||||
./scripts/verify-checksum-models.py
|
||||
```
|
||||
|
||||
- On Linux or macOS it is also possible to run the following commands to verify that you have the latest files in your self-installed `./models` subdirectory:
|
||||
|
build.zig (32 changes)
@ -1,9 +1,19 @@
|
||||
const std = @import("std");
|
||||
const commit_hash = @embedFile(".git/refs/heads/master");
|
||||
|
||||
// Zig Version: 0.11.0-dev.3379+629f0d23b
|
||||
// Zig Version: 0.11.0-dev.3986+e05c242cd
|
||||
pub fn build(b: *std.build.Builder) void {
|
||||
const target = b.standardTargetOptions(.{});
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
|
||||
const config_header = b.addConfigHeader(
|
||||
.{ .style = .blank, .include_path = "build-info.h" },
|
||||
.{
|
||||
.BUILD_NUMBER = 0,
|
||||
.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
|
||||
},
|
||||
);
|
||||
|
||||
const lib = b.addStaticLibrary(.{
|
||||
.name = "llama",
|
||||
.target = target,
|
||||
@ -13,24 +23,21 @@ pub fn build(b: *std.build.Builder) void {
|
||||
lib.linkLibCpp();
|
||||
lib.addIncludePath(".");
|
||||
lib.addIncludePath("./examples");
|
||||
lib.addCSourceFiles(&.{
|
||||
"ggml.c",
|
||||
}, &.{"-std=c11"});
|
||||
lib.addCSourceFiles(&.{
|
||||
"llama.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
lib.addConfigHeader(config_header);
|
||||
lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
|
||||
lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
|
||||
b.installArtifact(lib);
|
||||
|
||||
const examples = .{
|
||||
"main",
|
||||
"baby-llama",
|
||||
"embedding",
|
||||
// "metal",
|
||||
"metal",
|
||||
"perplexity",
|
||||
"quantize",
|
||||
"quantize-stats",
|
||||
"save-load-state",
|
||||
// "server",
|
||||
"server",
|
||||
"simple",
|
||||
"train-text-from-scratch",
|
||||
};
|
||||
@ -43,16 +50,19 @@ pub fn build(b: *std.build.Builder) void {
|
||||
});
|
||||
exe.addIncludePath(".");
|
||||
exe.addIncludePath("./examples");
|
||||
exe.addConfigHeader(config_header);
|
||||
exe.addCSourceFiles(&.{
|
||||
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
|
||||
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
|
||||
"examples/common.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
exe.linkLibrary(lib);
|
||||
b.installArtifact(exe);
|
||||
|
||||
const run_cmd = b.addRunArtifact(exe);
|
||||
run_cmd.step.dependOn(b.getInstallStep());
|
||||
if (b.args) |args| run_cmd.addArgs(args);
|
||||
const run_step = b.step("run_" ++ example_name, "Run the app");
|
||||
|
||||
const run_step = b.step("run-" ++ example_name, "Run the app");
|
||||
run_step.dependOn(&run_cmd.step);
|
||||
}
|
||||
}
|
||||
|
ci/README.md (new file, 20 lines)
@ -0,0 +1,20 @@
# CI

In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:

https://github.com/ggml-org/ci

It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also, with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.

Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
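
For example, a commit that should trigger the custom CI could carry the keyword anywhere in its message (the message text below is hypothetical and only illustrates the `ggml-ci` keyword):

```bash
# hypothetical example: the "ggml-ci" keyword in the commit message triggers the custom CI
git commit -m "server : tidy config handling (ggml-ci)"
```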

It is good practice, before publishing changes, to execute the full CI locally on your machine:

```bash
mkdir tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
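
The script also checks the `GG_BUILD_LOW_PERF` environment variable (see `ci/run.sh` below): when it is set, the model download, conversion, and inference steps are skipped and only the lighter builds/tests run. A minimal sketch of a local low-performance run, assuming the variable is honored as in the script:

```bash
# skip the heavier OpenLLaMA model runs on a low-performance machine
GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```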
ci/run.sh (new file, 262 lines)
@ -0,0 +1,262 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "$1"
|
||||
mkdir -p "$2"
|
||||
|
||||
OUT=$(realpath "$1")
|
||||
MNT=$(realpath "$2")
|
||||
|
||||
rm -v $OUT/*.log
|
||||
rm -v $OUT/*.exit
|
||||
rm -v $OUT/*.md
|
||||
|
||||
sd=`dirname $0`
|
||||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
function gg_wget {
|
||||
local out=$1
|
||||
local url=$2
|
||||
|
||||
local cwd=`pwd`
|
||||
|
||||
mkdir -p $out
|
||||
cd $out
|
||||
|
||||
# should not re-download if file is the same
|
||||
wget -nv -N $url
|
||||
|
||||
cd $cwd
|
||||
}
|
||||
|
||||
function gg_printf {
|
||||
printf -- "$@" >> $OUT/README.md
|
||||
}
|
||||
|
||||
function gg_run {
|
||||
ci=$1
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
gg_run_$ci | tee $OUT/$ci.log
|
||||
cur=$?
|
||||
echo "$cur" > $OUT/$ci.exit
|
||||
|
||||
set +x
|
||||
set +o pipefail
|
||||
|
||||
gg_sum_$ci
|
||||
|
||||
ret=$((ret | cur))
|
||||
}
|
||||
|
||||
## ci
|
||||
|
||||
# ctest_debug
|
||||
|
||||
function gg_run_ctest_debug {
|
||||
cd ${SRC}
|
||||
|
||||
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_ctest_debug {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest in debug mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '\n'
|
||||
}
|
||||
|
||||
# ctest_release
|
||||
|
||||
function gg_run_ctest_release {
|
||||
cd ${SRC}
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z $GG_BUILD_LOW_PERF ]; then
|
||||
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
else
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
fi
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_ctest_release {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest in release mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_3b_v2
|
||||
|
||||
function gg_run_open_llama_3b_v2 {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||
|
||||
path_models="../models-mnt/open-llama/3B-v2"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.bin"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.bin"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.bin"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.bin"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.bin"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.bin"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.bin"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.bin"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.bin"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.bin"
|
||||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_open_llama_3b_v2 {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
}
|
||||
|
||||
## main
|
||||
|
||||
if [ -z $GG_BUILD_LOW_PERF ]; then
|
||||
rm -rf ${SRC}/models-mnt
|
||||
|
||||
mnt_models=$(realpath ${MNT}/models)
|
||||
mkdir -p ${mnt_models}
|
||||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||
|
||||
python3 -m pip install -r ${SRC}/requirements.txt
|
||||
fi
|
||||
|
||||
ret=0
|
||||
|
||||
#test $ret -eq 0 && gg_run ctest_debug
|
||||
#test $ret -eq 0 && gg_run ctest_release
|
||||
|
||||
if [ -z $GG_BUILD_LOW_PERF ]; then
|
||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
||||
fi
|
||||
|
||||
exit $ret
|
convert-lora-to-ggml.py (normal file → executable file, 1 change)
@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
convert.py (normal file → executable file, 1 change)
@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import copy
|
||||
|
@ -1,4 +1,5 @@
|
||||
set(TARGET baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET benchmark)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-freq-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--memory-f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--top-p") {
|
||||
@ -267,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_keep = std::stoi(argv[i]);
|
||||
} else if (arg == "--chunks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_chunks = std::stoi(argv[i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@ -493,6 +511,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
@ -501,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
if (llama_mlock_supported()) {
|
||||
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
@ -573,6 +594,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
lparams.logits_all = params.perplexity;
|
||||
lparams.embedding = params.embedding;
|
||||
lparams.rope_freq_base = params.rope_freq_base;
|
||||
lparams.rope_freq_scale = params.rope_freq_scale;
|
||||
|
||||
return lparams;
|
||||
}
|
||||
|
@ -28,10 +28,13 @@ struct gpt_params {
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
float rope_freq_base = 10000.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
||||
|
||||
// sampling parameters
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET embdinput)
|
||||
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
@ -8,6 +9,7 @@ endif()
|
||||
|
||||
set(TARGET embd-input-test)
|
||||
add_executable(${TARGET} embd-input-test.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -17,7 +17,7 @@ make
|
||||
import torch
|
||||
|
||||
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
|
||||
pth_path = "./examples/embd_input/llava_projection.pth"
|
||||
pth_path = "./examples/embd-input/llava_projection.pth"
|
||||
|
||||
dic = torch.load(bin_path)
|
||||
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
|
||||
|
@ -59,7 +59,7 @@ if __name__=="__main__":
|
||||
# Alternatively, pytorch_model-00003-of-00003.bin can be used directly here.
|
||||
a.load_projection(os.path.join(
|
||||
os.path.dirname(__file__) ,
|
||||
"llava_projetion.pth"))
|
||||
"llava_projection.pth"))
|
||||
respose = a.chat_with_image(
|
||||
Image.open("./media/llama1-logo.png").convert('RGB'),
|
||||
"what is the text in the picture?")
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET main)
|
||||
add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 10000.0) {
|
||||
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 1.0) {
|
||||
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
|
||||
" you are on your own\n", __func__, params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
|
@ -1,3 +1,4 @@
|
||||
set(TEST_TARGET metal)
|
||||
add_executable(${TEST_TARGET} metal.cpp)
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET perplexity)
|
||||
add_executable(${TARGET} perplexity.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
int count = 0;
|
||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||
|
||||
const int n_chunk = tokens.size() / params.n_ctx;
|
||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
int count = 0;
|
||||
double nll = 0.0;
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
|
@ -1,4 +1,5 @@
|
||||
set(TARGET quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -14,103 +14,27 @@ struct quant_option {
|
||||
};
|
||||
|
||||
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{
|
||||
"Q4_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0,
|
||||
" 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
|
||||
},
|
||||
{
|
||||
"Q4_1",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1,
|
||||
" 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
|
||||
},
|
||||
{
|
||||
"Q5_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0,
|
||||
" 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
|
||||
},
|
||||
{
|
||||
"Q5_1",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
" 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
|
||||
},
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", },
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
{
|
||||
"Q2_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K,
|
||||
" 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"Q3_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M,
|
||||
"alias for Q3_K_M"
|
||||
},
|
||||
{
|
||||
"Q3_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_S,
|
||||
" 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
|
||||
},
|
||||
{
|
||||
"Q3_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M,
|
||||
" 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
|
||||
},
|
||||
{
|
||||
"Q3_K_L",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_L,
|
||||
" 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
|
||||
},
|
||||
{
|
||||
"Q4_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M,
|
||||
"alias for Q4_K_M",
|
||||
},
|
||||
{
|
||||
"Q4_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_S,
|
||||
" 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
|
||||
},
|
||||
{
|
||||
"Q4_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M,
|
||||
" 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q5_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M,
|
||||
"alias for Q5_K_M",
|
||||
},
|
||||
{
|
||||
"Q5_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_S,
|
||||
" 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q5_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M,
|
||||
" 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q6_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K,
|
||||
" 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
|
||||
},
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
|
||||
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
|
||||
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
|
||||
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
|
||||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", },
|
||||
#endif
|
||||
{
|
||||
"Q8_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0,
|
||||
" 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"F16",
|
||||
LLAMA_FTYPE_MOSTLY_F16,
|
||||
"13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"F32",
|
||||
LLAMA_FTYPE_ALL_F32,
|
||||
"26.00G @ 7B - absolutely huge, lossless - not recommended",
|
||||
},
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
};
|
||||
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -2,6 +2,7 @@ set(TARGET server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp json.hpp httplib.h)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
|
@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
|
||||
```sh
|
||||
curl --request POST \
|
||||
--url http://localhost:8080/completion \
|
||||
--header "Content-Type: application/json" \
|
||||
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
|
||||
```
|
||||
|
||||
|
@ -32,6 +32,7 @@ tokenize() {
|
||||
--silent \
|
||||
--request POST \
|
||||
--url "${API_URL}/tokenize" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
|
||||
| jq '.tokens[]'
|
||||
}
|
||||
@ -64,6 +65,7 @@ chat_completion() {
|
||||
--no-buffer \
|
||||
--request POST \
|
||||
--url "${API_URL}/completion" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "${DATA}")
|
||||
|
||||
printf "\n"
|
||||
|
@ -611,6 +611,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params,
|
||||
fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
@ -698,6 +700,18 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-freq-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--memory-f32" || arg == "--memory_f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--threads" || arg == "-t") {
|
||||
|
@ -1,5 +1,6 @@
|
||||
set(TARGET simple)
|
||||
add_executable(${TARGET} simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
@ -1,4 +1,5 @@
|
||||
set(TARGET train-text-from-scratch)
|
||||
add_executable(${TARGET} train-text-from-scratch.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
flake.nix (67 changes)
@ -6,24 +6,27 @@
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
inherit (pkgs.stdenv) isAarch64 isDarwin;
|
||||
inherit (pkgs.lib) optionals;
|
||||
isM1 = isAarch64 && isDarwin;
|
||||
osSpecific = if isM1 then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
MetalKit
|
||||
MetalPerformanceShaders
|
||||
MetalPerformanceShadersGraph
|
||||
]
|
||||
else if isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else
|
||||
[ ];
|
||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
|
||||
osSpecific = with pkgs; [ openmpi ] ++
|
||||
(
|
||||
if isAarch64 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
MetalKit
|
||||
MetalPerformanceShaders
|
||||
MetalPerformanceShadersGraph
|
||||
]
|
||||
else if isAarch32 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else if isx86_32 || isx86_64 then
|
||||
with pkgs; [ mkl ]
|
||||
else
|
||||
with pkgs; [ openblas ]
|
||||
);
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
llama-python =
|
||||
pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
@ -31,26 +34,36 @@
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
postPatch = if isM1 then ''
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
'' else
|
||||
"";
|
||||
nativeBuildInputs = with pkgs; [ cmake ];
|
||||
'';
|
||||
nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
|
||||
++ (if isAarch64 && isDarwin then [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
] else if isx86_32 || isx86_64 then [
|
||||
"-DLLAMA_BLAS=ON"
|
||||
"-DLLAMA_BLAS_VENDOR=Intel10_lp64"
|
||||
] else [
|
||||
"-DLLAMA_BLAS=ON"
|
||||
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
||||
]);
|
||||
installPhase = ''
|
||||
mkdir -p $out/bin
|
||||
mv bin/* $out/bin/
|
||||
runHook preInstall
|
||||
|
||||
install -D bin/* -t $out/bin
|
||||
install -Dm644 lib*.so -t $out/lib
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
|
||||
cat ${./convert.py} >> $out/bin/convert.py
|
||||
chmod +x $out/bin/convert.py
|
||||
|
||||
runHook postInstall
|
||||
'';
|
||||
meta.mainProgram = "llama";
|
||||
};
|
||||
|
ggml-cuda.cu (487 changes)
@ -13,6 +13,8 @@
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(
|
||||
|
||||
#define QK4_0 32
|
||||
#define QR4_0 2
|
||||
#define QI4_0 4
|
||||
#define QI4_0 (QK4_0 / (4 * QR4_0))
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
|
||||
|
||||
#define QK4_1 32
|
||||
#define QR4_1 2
|
||||
#define QI4_1 4
|
||||
#define QI4_1 (QK4_1 / (4 * QR4_1))
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
half m; // min
|
||||
@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
|
||||
|
||||
#define QK5_0 32
|
||||
#define QR5_0 2
|
||||
#define QI5_0 4
|
||||
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
|
||||
|
||||
#define QK5_1 32
|
||||
#define QR5_1 2
|
||||
#define QI5_1 4
|
||||
#define QI5_1 (QK5_1 / (4 * QR5_1))
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
half m; // min
|
||||
@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
|
||||
|
||||
#define QK8_0 32
|
||||
#define QR8_0 1
|
||||
#define QI8_0 8
|
||||
#define QI8_0 (QK8_0 / (4 * QR8_0))
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
|
||||
|
||||
#define QK8_1 32
|
||||
#define QR8_1 1
|
||||
#define QI8_1 8
|
||||
#define QI8_1 (QK8_1 / (4 * QR8_1))
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
half s; // unquantized sum
|
||||
@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
#define QR2_K 4
|
||||
#define QI2_K (QK_K / (4*QR2_K))
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
uint8_t qs[QK_K/4]; // quants
|
||||
@ -151,6 +155,8 @@ typedef struct {
|
||||
} block_q2_K;
|
||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||
|
||||
#define QR3_K 4
|
||||
#define QI3_K (QK_K / (4*QR3_K))
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
@ -163,6 +169,8 @@ typedef struct {
|
||||
} block_q3_K;
|
||||
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
|
||||
|
||||
#define QR4_K 2
|
||||
#define QI4_K (QK_K / (4*QR4_K))
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
half d[2]; // super-block scales/mins
|
||||
@ -180,6 +188,8 @@ typedef struct {
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||
#endif
|
||||
|
||||
#define QR5_K 2
|
||||
#define QI5_K (QK_K / (4*QR5_K))
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
half d; // super-block scale
|
||||
@ -199,6 +209,8 @@ typedef struct {
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
#endif
|
||||
|
||||
#define QR6_K 2
|
||||
#define QI6_K (QK_K / (4*QR6_K))
|
||||
typedef struct {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
|
||||
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
|
||||
};
|
||||
|
||||
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
|
||||
static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
if (i >= kx) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] + y[i];
|
||||
dst[i] = x[i] + y[i%ky];
|
||||
}
|
||||
|
||||
static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
|
||||
@ -1271,8 +1283,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
|
||||
y[iybs + iqs + y_offset] = v.y;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
|
||||
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
|
||||
|
||||
int vi;
|
||||
@ -1293,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
|
||||
return sumi*d;
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= 610
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
|
||||
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
|
||||
|
||||
const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
|
||||
@ -1318,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
|
||||
return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= 610
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
|
||||
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
|
||||
|
||||
int qs;
|
||||
@ -1353,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
|
||||
return sumi*d;
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= 610
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
|
||||
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
|
||||
|
||||
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
|
||||
@ -1387,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
|
||||
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= 610
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
|
||||
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
||||
|
||||
int vi;
|
||||
@ -1406,7 +1423,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
|
||||
return sumi*d;
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= 610
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q2_K * bq2_K = (const block_q2_K *) vbq;
|
||||
|
||||
const int bq8_offset = QR2_K * (iqs / QI8_1);
|
||||
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
float sumf_m = 0.0f;
|
||||
|
||||
const float d = bq2_K->d;
|
||||
const float dmin = bq2_K->dmin;
|
||||
|
||||
const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
|
||||
|
||||
for (int i = 0; i < QR2_K; ++i) {
|
||||
const int sc = bq2_K->scales[scale_offset + 2*i];
|
||||
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
const float d8i = bq8i->d;
|
||||
|
||||
const int vi = (v >> (2*i)) & 0x03030303;
|
||||
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
||||
|
||||
sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
|
||||
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
|
||||
}
|
||||
|
||||
return d*sumf_d - dmin*sumf_m;
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q3_K * bq3_K = (const block_q3_K *) vbq;
|
||||
|
||||
const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
|
||||
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
const float d = bq3_K->d;
|
||||
|
||||
int vl;
|
||||
memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
|
||||
|
||||
int vh;
|
||||
memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
|
||||
vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
|
||||
vh >>= bq8_offset;
|
||||
|
||||
for (int i = 0; i < QR3_K; ++i) {
|
||||
const int isc = scale_offset + 2*i;
|
||||
|
||||
const int isc_low = isc % (QK_K/32);
|
||||
const int sc_shift_low = 4 * (isc / (QK_K/32));
|
||||
const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
|
||||
|
||||
const int isc_high = isc % (QK_K/64);
|
||||
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
||||
const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
||||
|
||||
const int sc = (sc_low | sc_high) - 32;
|
||||
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
||||
const float d8i = bq8i->d;
|
||||
|
||||
const int vil = (vl >> (2*i)) & 0x03030303;
|
||||
|
||||
const int vih = ((vh >> i) << 2) & 0x04040404;
|
||||
|
||||
const int vi = __vsubss4(vil, vih);
|
||||
|
||||
sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
|
||||
}
|
||||
|
||||
return d*sumf;
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
||||
|
||||
const int bq8_offset = QR4_K * (iqs / QI8_1);
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
float sumf_m = 0.0f;
|
||||
|
||||
const float d = bq4_K->d;
|
||||
const float dmin = bq4_K->dmin;
|
||||
|
||||
const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
|
||||
|
||||
for (int i = 0; i < QR4_K; ++i) {
|
||||
const int isc = bq8_offset + i;
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(isc, bq4_K->scales, sc, m);
|
||||
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
||||
const float d8i = bq8i->d;
|
||||
|
||||
const int vi = (v >> (4*i)) & 0x0F0F0F0F;
|
||||
|
||||
sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
|
||||
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
|
||||
}
|
||||
|
||||
return d*sumf_d - dmin*sumf_m;
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
||||
|
||||
const int bq8_offset = QR5_K * (iqs / QI8_1);
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
float sumf_m = 0.0f;
|
||||
|
||||
const float d = bq5_K->d;
|
||||
const float dmin = bq5_K->dmin;
|
||||
|
||||
const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
|
||||
|
||||
const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
|
||||
|
||||
for (int i = 0; i < QR5_K; ++i) {
|
||||
const int isc = bq8_offset + i;
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(isc, bq5_K->scales, sc, m);
|
||||
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
||||
const float d8i = bq8i->d;
|
||||
|
||||
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
||||
|
||||
const int vih = ((vh >> i) << 4) & 0x10101010;
|
||||
|
||||
const int vi = vil | vih;
|
||||
|
||||
sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
|
||||
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
|
||||
}
|
||||
|
||||
return d*sumf_d - dmin*sumf_m;
|
||||
#else
|
||||
return 0.0f; // only to satisfy the compiler
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q6_K * bq6_K = (const block_q6_K *) vbq;
|
||||
|
||||
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
|
||||
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
|
||||
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
const float d = bq6_K->d;
|
||||
|
||||
int vl;
|
||||
memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
|
||||
|
||||
int vh;
|
||||
memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
|
||||
|
||||
for (int i = 0; i < QR6_K; ++i) {
|
||||
const int sc = bq6_K->scales[scale_offset + 4*i];
|
||||
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
|
||||
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
|
||||
const float d8i = bq8i->d;
|
||||
|
||||
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
||||
|
||||
const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
|
||||
|
||||
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
||||
|
||||
sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
}

return d*sumf;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
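Note: every K-quant vec_dot kernel above leans on the same __dp4a building block: four quantized values are shifted and masked into one 32-bit integer and multiplied lane-wise against four q8_1 values in a single instruction. As a rough scalar reference (illustration only, not part of this patch), __dp4a(a, b, c) with signed 8-bit lanes behaves like:

// scalar stand-in for the CUDA __dp4a intrinsic, signed 8-bit lanes (sketch only)
static int dp4a_ref(int a, int b, int c) {
    int sum = c;
    for (int k = 0; k < 4; ++k) {
        const int8_t ak = (int8_t)((a >> (8*k)) & 0xFF); // k-th byte of a as signed
        const int8_t bk = (int8_t)((b >> (8*k)) & 0xFF); // k-th byte of b as signed
        sum += ak * bk;                                   // accumulate the lane product
    }
    return sum;
}

which is why the kernels only need to unpack the quants into int lanes (and subtract offsets with __vsubss4) before the dot product.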
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||
@ -1429,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
||||
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
|
||||
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
|
||||
|
||||
const int iby = i + threadIdx.x / qi; // y block index
|
||||
const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
|
||||
|
||||
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
|
||||
|
||||
@ -1667,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int half_n_dims = ncols/4;
|
||||
|
||||
if (col >= half_n_dims) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i = row*ncols + col;
|
||||
|
||||
const float col_theta_scale = powf(theta_scale, col);
|
||||
|
||||
const float theta = p*col_theta_scale;
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + half_n_dims];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
|
||||
|
||||
const float block_theta = block_p*col_theta_scale;
|
||||
const float sin_block_theta = sinf(block_theta);
|
||||
const float cos_block_theta = cosf(block_theta);
|
||||
|
||||
const float x2 = x[i + half_n_dims * 2];
|
||||
const float x3 = x[i + half_n_dims * 3];
|
||||
|
||||
dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
|
||||
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
|
||||
}
|
||||
|
||||
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
@ -1732,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
|
||||
dst[i] = scale * x[i];
|
||||
}
|
||||
|
||||
static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
|
||||
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
|
||||
static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
|
||||
const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
|
||||
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
||||
}
|
||||
|
||||
static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
|
||||
@ -1928,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
@ -1937,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % QK4_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
@ -1946,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % QK5_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
@ -1955,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % QK5_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
@ -1964,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
@ -1972,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
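Each of the *_cuda launchers above uses the same geometry: block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y blocks of WARP_SIZE x GGML_CUDA_MMV_Y threads. Assuming GGML_CUDA_MMV_Y is left at 1 (a build-time constant, assumed here for illustration), a 4096-row weight matrix launches 4096 thread blocks of 32 threads, each warp reducing one quantized row against the q8_1 vector.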
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
@ -2064,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
|
||||
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
|
||||
}
|
||||
|
||||
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
||||
GGML_ASSERT(nrows % 4 == 0);
|
||||
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
||||
const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(num_blocks_x, nrows, 1);
|
||||
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
|
||||
}
|
||||
|
||||
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
|
||||
@ -2293,17 +2610,15 @@ inline void ggml_cuda_op_add(
|
||||
GGML_ASSERT(src1_ddf_i != nullptr);
|
||||
GGML_ASSERT(dst_ddf_i != nullptr);
|
||||
|
||||
// TODO: support broadcasting
|
||||
GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t i01_diff = i01_high - i01_low;
|
||||
|
||||
// const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
|
||||
// compute
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
||||
add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
||||
} else {
|
||||
@ -2327,19 +2642,12 @@ inline void ggml_cuda_op_mul(
|
||||
GGML_ASSERT(dst_ddf_i != nullptr);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t i01_diff = i01_high - i01_low;
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
|
||||
for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
|
||||
const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
|
||||
|
||||
float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
|
||||
float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
|
||||
float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
|
||||
|
||||
// compute
|
||||
mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
|
||||
}
|
||||
mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
|
||||
|
||||
(void) dst;
|
||||
(void) src0_ddq_i;
|
||||
@ -2452,13 +2760,22 @@ inline void ggml_cuda_op_mul_mat_vec(
|
||||
int id;
|
||||
CUDA_CHECK(cudaGetDevice(&id));
|
||||
|
||||
const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
|
||||
bool mul_mat_vec_q_implemented =
|
||||
src0->type == GGML_TYPE_Q4_0 ||
|
||||
src0->type == GGML_TYPE_Q4_1 ||
|
||||
src0->type == GGML_TYPE_Q5_0 ||
|
||||
src0->type == GGML_TYPE_Q5_1 ||
|
||||
src0->type == GGML_TYPE_Q8_0;
|
||||
#if QK_K == 256
|
||||
mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
|
||||
src0->type == GGML_TYPE_Q2_K ||
|
||||
src0->type == GGML_TYPE_Q3_K ||
|
||||
src0->type == GGML_TYPE_Q4_K ||
|
||||
src0->type == GGML_TYPE_Q5_K ||
|
||||
src0->type == GGML_TYPE_Q6_K;
|
||||
#endif // QK_K == 256
|
||||
|
||||
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
|
||||
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
|
||||
#endif
|
||||
|
||||
if (use_mul_mat_vec_q) {
|
||||
@ -2484,6 +2801,21 @@ inline void ggml_cuda_op_mul_mat_vec(
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
@ -2618,13 +2950,21 @@ inline void ggml_cuda_op_rope(
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
GGML_ASSERT(mode == 0);
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
|
||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||
const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
|
||||
|
||||
bool is_glm = mode & 4;
|
||||
|
||||
// compute
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
if (is_glm) {
const float id_p = min(p, n_ctx - 2.f);
const float block_p = max(p - (n_ctx - 2.f), 0.f);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
} else {
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
}

(void) dst;
(void) src0_ddq_i;
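To make the GLM branch concrete: with n_ctx = 2048 and a position p = 2050, id_p = min(2050, 2046) = 2046 and block_p = max(2050 - 2046, 0) = 4, so the regular rotation fed into rope_glm_f32 is clamped at n_ctx - 2 while the block rotation (the upper two quarters of each row in the kernel earlier in this file) carries the 4-position overflow. The numbers here are only a worked illustration of the min/max clamping shown in the code.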
@ -3197,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
||||
(void) dst;
|
||||
}
|
||||
|
||||
void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_cpy(src0, dst, nullptr);
|
||||
(void) src1;
|
||||
}
|
||||
|
||||
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
|
||||
@ -3306,6 +3651,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
||||
delete extra;
|
||||
}
|
||||
|
||||
static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
||||
static size_t g_temp_tensor_extra_index = 0;
|
||||
|
||||
static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||
if (g_temp_tensor_extras == nullptr) {
|
||||
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
||||
}
|
||||
|
||||
size_t alloc_index = g_temp_tensor_extra_index;
|
||||
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
||||
struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
return extra;
|
||||
}
|
||||
|
||||
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
|
||||
if (scratch && g_scratch_size == 0) {
|
||||
return;
|
||||
@ -3314,7 +3675,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
// recursively assign CUDA buffers until a compute tensor is found
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
||||
const ggml_op src0_op = tensor->src[0]->op;
|
||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
|
||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
||||
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
|
||||
}
|
||||
}
|
||||
@ -3323,8 +3684,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
struct ggml_tensor_extra_gpu * extra;
|
||||
|
||||
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
||||
tensor->op == GGML_OP_VIEW ||
|
||||
@ -3339,10 +3699,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
if (tensor->op == GGML_OP_VIEW) {
|
||||
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
|
||||
}
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src0_ddc + offset;
|
||||
} else if (tensor->op == GGML_OP_CPY) {
|
||||
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
||||
void * src1_ddv = src1_extra->data_device[g_main_device];
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src1_ddv;
|
||||
} else if (scratch) {
|
||||
GGML_ASSERT(size <= g_scratch_size);
|
||||
@ -3355,6 +3717,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
||||
g_scratch_buffer = data;
|
||||
}
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = data + g_scratch_offset;
|
||||
|
||||
g_scratch_offset += size;
|
||||
@ -3364,6 +3727,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
void * data;
|
||||
CUDA_CHECK(cudaMalloc(&data, size));
|
||||
CUDA_CHECK(cudaMemset(data, 0, size));
|
||||
extra = new ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
extra->data_device[g_main_device] = data;
|
||||
}
|
||||
|
||||
@ -3416,6 +3781,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_DUP:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_dup;
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
@ -3470,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
}
|
||||
func = ggml_cuda_cpy;
|
||||
break;
|
||||
case GGML_OP_CONT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_dup;
|
||||
break;
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
|
53
ggml-metal.m
53
ggml-metal.m
@ -739,12 +739,8 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_1) {
|
||||
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q2_K ||
|
||||
src0t == GGML_TYPE_Q3_K ||
|
||||
@ -885,28 +881,35 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const int n_past = ((int32_t *)(src1->data))[0];
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_rope];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
|
188
ggml-metal.metal
188
ggml-metal.metal
@ -395,9 +395,12 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
// each thread in a SIMD group deals with 1 block.
|
||||
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
||||
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
sumy *= (-8.f);
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
@ -405,39 +408,50 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
|
||||
// calculate
|
||||
float d = qb_curr.d;
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
float acc = sumy;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
acc[1] += yl[i] + yl[i+16];
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
sumf[row] += d * (acc[0] - 8.f*acc[1]);
|
||||
sumf[row] += d * acc;
|
||||
qb_curr = qb_next;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
float d = qb_curr.d;
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
acc[1] += yl[i] + yl[i+16];
|
||||
if (nb % N_SIMDWIDTH == 0) {
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
if (tiisg < nb % N_SIMDWIDTH) {
|
||||
sumf[row] += d * (acc[0] - 8.f*acc[1]);
|
||||
}
|
||||
qb_curr = qb_next;
|
||||
} else {
|
||||
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
sumy *= (-8.f);
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
float d = qb_curr.d;
|
||||
float acc = sumy;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
if (tiisg < nb % N_SIMDWIDTH) {
|
||||
sumf[row] += d * acc;
|
||||
}
|
||||
qb_curr = qb_next;
|
||||
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -449,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne0,
|
||||
threadgroup float * sum [[threadgroup(0)]],
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
const int nb = ne00/QK4_1;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
const int nb = ne00/QK4_0;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
|
||||
device const float * y = (device const float *) src1 + r1*ne10;
|
||||
block_q4_1 qb_curr, qb_next;
|
||||
float4 y_curr[8]; // src1 vector cache
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
thread float * yl=(thread float *)y_curr;
|
||||
|
||||
const uint nth = tptg.x*tptg.y;
|
||||
const uint ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
const int ix = tpitg.y/4; // 0 or 1
|
||||
const int iy = tpitg.y - 4*ix; // 0...3
|
||||
|
||||
const int first = 4 * iy;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
|
||||
|
||||
const float d = (float)x[i].d;
|
||||
const float m = (float)x[i].m;
|
||||
|
||||
device const uint8_t * xl = x[i].qs + first;
|
||||
device const float * yl = y + i * QK4_1 + first;
|
||||
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
|
||||
acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
|
||||
acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
|
||||
// bootstrap
|
||||
qb_curr = x[tiisg];
|
||||
// each thread in a SIMD group deals with 1 block.
|
||||
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
||||
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
|
||||
sumf += acc[0] + acc[1];
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
const float d = qb_curr.d;
|
||||
const float m = qb_curr.m;
|
||||
float acc = 0.f;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
sumf[row] += d * acc + m * sumy;
|
||||
qb_curr = qb_next;
|
||||
}
|
||||
}
|
||||
|
||||
sum[ith] = sumf;
|
||||
if (nb % N_SIMDWIDTH == 0) {
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
//
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%4 == 0) {
|
||||
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%16 == 0) {
|
||||
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
const float d = qb_curr.d;
|
||||
const float m = qb_curr.m;
|
||||
float acc = 0.f;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
if (tiisg < nb % N_SIMDWIDTH) {
|
||||
sumf[row] += d * acc + m * sumy;
|
||||
}
|
||||
qb_curr = qb_next;
|
||||
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -624,17 +656,19 @@ kernel void kernel_rope(
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
uint3 tpig[[thread_position_in_grid]]) {
|
||||
const int64_t i3 = tpig[2];
|
||||
const int64_t i2 = tpig[1];
|
||||
const int64_t i1 = tpig[0];
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const float theta_scale = pow(10000.0, -2.0f/n_dims);
|
||||
const float theta_scale = pow(freq_base, -2.0f/n_dims);
|
||||
|
||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||
|
||||
float theta = (float)p;
|
||||
float theta = freq_scale * (float)p;
|
||||
|
||||
if (!is_neox) {
|
||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
|
174
ggml.c
174
ggml.c
@ -31,11 +31,17 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
@ -112,10 +118,6 @@ typedef void * thread_ret_t;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __HAIKU__
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#endif
|
||||
|
||||
/*#define GGML_PERF*/
|
||||
#define GGML_DEBUG 0
|
||||
#define GGML_GELU_FP16
|
||||
@ -4410,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
|
||||
if (&g_state.contexts[i].context == ctx) {
|
||||
g_state.contexts[i].used = false;
|
||||
|
||||
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
|
||||
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
|
||||
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
|
||||
__func__, i, ggml_used_mem(ctx));
|
||||
|
||||
if (ctx->mem_buffer_owned) {
|
||||
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
||||
@ -6954,6 +6956,8 @@ struct ggml_tensor * ggml_rope_impl(
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
int n_ctx,
|
||||
bool inplace) {
|
||||
GGML_ASSERT(n_past >= 0);
|
||||
@ -6967,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl(
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
|
||||
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
|
||||
|
||||
((int32_t *) b->data)[0] = n_past;
|
||||
((int32_t *) b->data)[1] = n_dims;
|
||||
((int32_t *) b->data)[2] = mode;
|
||||
((int32_t *) b->data)[3] = n_ctx;
|
||||
memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
|
||||
memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
@ -6991,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_inplace(
|
||||
@ -7001,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
}
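A call to the new custom-RoPE entry point (sketch only; ctx0, Kcur, n_past, n_rot and the 0.5f scale are placeholder names and values, mirroring the freq_base/freq_scale fields added to llama_context_params in this merge) would look like:

// hypothetical usage of the API added above
struct ggml_tensor * Krot = ggml_rope_custom_inplace(
    ctx0, Kcur, n_past, n_rot, /*mode=*/0,
    /*freq_base=*/10000.0f, /*freq_scale=*/0.5f, n_ctx);

A freq_scale below 1.0f compresses positions (linear RoPE scaling), while raising freq_base above 10000 lowers the per-dimension rotation frequencies instead.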
// ggml_rope_back
|
||||
@ -10684,6 +10702,8 @@ static void ggml_compute_forward_mul_mat(
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
@ -10747,7 +10767,7 @@ static void ggml_compute_forward_mul_mat(
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
if (type != GGML_TYPE_F32) {
|
||||
float * const wdata = params->wdata;
|
||||
float * const wdata = params->wdata;
|
||||
ggml_to_float_t const to_float = type_traits[type].to_float;
|
||||
|
||||
size_t id = 0;
|
||||
@ -10805,7 +10825,7 @@ static void ggml_compute_forward_mul_mat(
|
||||
// src1 rows
|
||||
const int64_t nr1 = ne11*ne12*ne13;
|
||||
|
||||
void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
||||
|
||||
for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
|
||||
@ -10828,7 +10848,15 @@ static void ggml_compute_forward_mul_mat(
|
||||
const int64_t i3 = i13;
|
||||
|
||||
const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
|
||||
const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
|
||||
|
||||
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
||||
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
||||
// the original src1 data pointer, so we should index using the indices directly
|
||||
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
||||
const char * src1_col = (const char *) wdata +
|
||||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||
|
||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||
|
||||
@ -12062,16 +12090,21 @@ static void ggml_compute_forward_rope_f32(
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 4);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 6);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
@ -12100,7 +12133,7 @@ static void ggml_compute_forward_rope_f32(
|
||||
// row index used to determine which thread to use
|
||||
int ir = 0;
|
||||
|
||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const float theta_scale = powf(freq_base, -2.0f/n_dims);

const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
@ -12112,7 +12145,7 @@ static void ggml_compute_forward_rope_f32(
if (ir++ < ir0) continue;
if (ir > ir1) break;

float theta = (float)p;
float theta = freq_scale * (float)p;

if (is_glm) {
theta = MIN(p, n_ctx - 2);
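Put differently, for rotation pair i the non-GLM path now computes theta_i = freq_scale * p * freq_base^(-2*i/n_dims): freq_base replaces the hard-coded 10000 and freq_scale linearly rescales the position, so with freq_scale = 0.5f a token at position 4096 is rotated as if it sat at position 2048. (The worked numbers are illustrative; only the theta and theta_scale lines above are from the patch.)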
@ -12189,16 +12222,21 @@ static void ggml_compute_forward_rope_f16(
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 4);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 6);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
@ -12227,7 +12265,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
// row index used to determine which thread to use
|
||||
int ir = 0;
|
||||
|
||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
@ -12239,7 +12277,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
|
||||
float theta = (float)p;
|
||||
float theta = freq_scale * (float)p;
|
||||
|
||||
if (is_glm) {
|
||||
theta = MIN(p, n_ctx - 2);
|
||||
@ -12300,7 +12338,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
||||
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
||||
|
||||
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
||||
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
||||
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
}
|
||||
@ -12982,12 +13020,13 @@ static void ggml_compute_forward_conv_1d(
|
||||
};
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d_sk_p0
|
||||
// ggml_compute_forward_conv_2d
|
||||
|
||||
static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
static void ggml_compute_forward_conv_2d_f16_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
const struct ggml_tensor * opt0,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
@ -13007,28 +13046,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
// size of the convolution row - the kernel size unrolled across all channels
|
||||
const int ew0 = nk0*nk1*ne02;
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
|
||||
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
|
||||
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
|
||||
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
|
||||
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
|
||||
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
// TODO: fix this memset (wsize is overestimated)
|
||||
memset(params->wdata, 0, params->wsize);
|
||||
|
||||
// prepare source data (src1)
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
|
||||
for (int i13 = 0; i13 < ne13; i13++) {
|
||||
for (int i12 = 0; i12 < ne12; i12++) {
|
||||
const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
|
||||
ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
|
||||
for (int i12 = 0; i12 < ne12; i12++) {
|
||||
const float * const src = (float *)((char *) src1->data + i12*nb12);
|
||||
ggml_fp16_t * dst_data = wdata;
|
||||
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
||||
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
||||
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
||||
const int idx0 = i0*s0 + ik0*d0 - p0;
|
||||
const int idx1 = i1*s1 + ik1*d1 - p1;
|
||||
|
||||
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
|
||||
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
||||
GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
|
||||
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -13071,19 +13119,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_sk_p0(
|
||||
static void ggml_compute_forward_conv_2d(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
const struct ggml_tensor * opt0,
|
||||
struct ggml_tensor * dst
|
||||
) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
|
||||
ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
//ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
|
||||
//ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
@ -13093,32 +13143,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d
|
||||
|
||||
static void ggml_compute_forward_conv_2d(
|
||||
const struct ggml_compute_params* params,
|
||||
const struct ggml_tensor* src0,
|
||||
const struct ggml_tensor* src1,
|
||||
const struct ggml_tensor* opt0,
|
||||
struct ggml_tensor* dst) {
|
||||
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
|
||||
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
|
||||
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
|
||||
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
|
||||
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
|
||||
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
|
||||
GGML_ASSERT(d0 == 1); // dilation not supported
GGML_ASSERT(d1 == 1);
GGML_ASSERT(p0 == 0); // padding not supported
GGML_ASSERT(p1 == 0);

if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
} else {
GGML_ASSERT(false); // only stride equal to kernel size is supported
}
}
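With the generalized path above (which makes this removed wrapper and its stride/padding asserts unnecessary), the output extent presumably follows the usual convolution relation ne0 = (ne10 + 2*p0 - d0*(nk0 - 1) - 1)/s0 + 1, and likewise for ne1; for example a 224-wide input with a 3-wide kernel, stride 2, padding 1 and dilation 1 gives (224 + 2 - 2 - 1)/2 + 1 = 112. The formula is the standard one assumed here for illustration; the patch itself only shows the gather indexing idx0 = i0*s0 + ik0*d0 - p0 with out-of-range taps skipped.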
// ggml_compute_forward_pool_1d_sk_p0
|
||||
|
||||
static void ggml_compute_forward_pool_1d_sk_p0(
|
||||
@ -15712,7 +15736,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
// necessary for llama
|
||||
if (src0->grad) {
|
||||
assert(src1->type == GGML_TYPE_I32);
|
||||
assert(ggml_nelements(src1) == 4);
|
||||
assert(ggml_nelements(src1) == 6);
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
@ -15733,7 +15757,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
{
|
||||
if (src0->grad) {
|
||||
assert(src1->type == GGML_TYPE_I32);
|
||||
assert(ggml_nelements(src1) == 4);
|
||||
assert(ggml_nelements(src1) == 3);
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
@ -16293,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.nth = n_tasks_arr[node_n];
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
|
||||
// distribute new work or execute it direct if 1T
|
||||
@ -16324,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.type = GGML_TASK_FINALIZE;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@ -16575,19 +16600,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||
const int64_t ne11 = node->src[1]->ne[1]; // H
|
||||
const int64_t ne12 = node->src[1]->ne[2]; // C
|
||||
|
||||
const int64_t ne0 = node->ne[0];
|
||||
const int64_t ne1 = node->ne[1];
|
||||
const int64_t ne2 = node->ne[2];
|
||||
const int64_t nk = ne00*ne01;
|
||||
const int64_t ew0 = nk * ne02;
|
||||
|
||||
UNUSED(ne02);
|
||||
UNUSED(ne03);
|
||||
UNUSED(nk);
|
||||
UNUSED(ne2);
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
if (node->src[0]->type == GGML_TYPE_F16 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
|
||||
} else if (node->src[0]->type == GGML_TYPE_F32 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)* (ne10*ne11*ne12);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
@ -16864,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
||||
}
|
||||
|
||||
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
||||
//assert(cgraph->work == NULL);
|
||||
//assert(cgraph->work_size == 0);
|
||||
|
||||
uint64_t size_eval = 0;
|
||||
|
||||
// compute size of intermediate results
|
||||
@ -17305,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
||||
|
||||
GGML_PRINT("=== GRAPH ===\n");
|
||||
|
||||
GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
|
||||
GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
|
||||
|
||||
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
11
ggml.h
11
ggml.h
@ -1121,6 +1121,17 @@ extern "C" {
|
||||
int mode,
|
||||
int n_ctx);
|
||||
|
||||
// custom RoPE, in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
int n_ctx);
|
||||
|
||||
// rotary position embedding backward, i.e compute dx from dy
|
||||
// a - dy
|
||||
GGML_API struct ggml_tensor * ggml_rope_back(
|
||||
|
@ -15,6 +15,14 @@
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
169
llama.cpp
169
llama.cpp
@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
||||
// memory sizes
|
||||
//
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 256ull * MB },
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
{ MODEL_65B, 1024ull * MB },
/* empirical scaling, still a guess */
{ MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
{ MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
{ MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
{ MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
};
return k_sizes;
}
@ -140,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()

// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 512ull * MB },
{ MODEL_7B, 768ull * MB },
{ MODEL_13B, 1024ull * MB },
{ MODEL_30B, 1280ull * MB },
{ MODEL_65B, 1536ull * MB },
{ MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
{ MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
{ MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
{ MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
};
return k_sizes;
}
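For a 7B model the new context-dependent formulas work out to 512/16 + 256 = 288 MB of scratch0 and 512/256 + 768 = 770 MB of eval memory at n_ctx = 512, growing to 8192/16 + 256 = 768 MB and 8192/256 + 768 = 800 MB at n_ctx = 8192 (arithmetic worked out here for illustration; the tables above are the only values in the diff).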
@ -189,6 +190,10 @@ struct llama_hparams {
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
|
||||
float rope_freq_base = 10000.0f;
|
||||
float rope_freq_scale = 1.0f;
|
||||
|
||||
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
@ -303,7 +308,7 @@ struct llama_model {
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
||||
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
||||
#ifdef GGML_USE_METAL
|
||||
~llama_context() {
|
||||
if (ctx_metal) {
|
||||
@ -324,7 +329,6 @@ struct llama_context {
|
||||
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
|
||||
const llama_model & model;
|
||||
const llama_vocab & vocab;
|
||||
|
||||
bool model_owner = false;
|
||||
|
||||
@ -648,7 +652,7 @@ struct llama_model_loader {
*ctx_size_p = *mmapped_size_p = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) {
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
}
}
@ -844,6 +848,8 @@ struct llama_context_params llama_context_default_params() {
/*.gpu_layers =*/ 0,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ {0},
/*.rope_freq_base =*/ 10000.0f,
/*.rope_freq_scale =*/ 1.0f,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.low_vram =*/ false,
@ -869,6 +875,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
return result;
}

int llama_max_devices() {
return LLAMA_MAX_DEVICES;
}

bool llama_mmap_supported() {
return llama_mmap::SUPPORTED;
}
@ -967,6 +977,8 @@ static void llama_model_load_internal(
int n_gpu_layers,
int main_gpu,
const float * tensor_split,
float rope_freq_base,
float rope_freq_scale,
bool low_vram,
ggml_type memory_type,
bool use_mmap,
@ -1001,22 +1013,27 @@ static void llama_model_load_internal(
}

hparams.n_ctx = n_ctx;

hparams.rope_freq_base = rope_freq_base;
hparams.rope_freq_scale = rope_freq_scale;
}

const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

{
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
}

if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
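Note: for orientation, the n_ff expression in this hunk works out as follows for a 7B model, assuming the usual 7B hyperparameters (n_embd = 4096, n_mult = 256, not restated here): 2*(4*4096)/3 = 10922, and rounding up to the next multiple of 256 gives (10922 + 255)/256 * 256 = 11008, the familiar LLaMA-7B feed-forward width.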
@ -1165,9 +1182,9 @@ static void llama_model_load_internal(
const size_t mem_required =
ctx_size +
mmapped_size - vram_weights + // weights in VRAM not in memory
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at (model.type);
MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

// this is the memory required by one llama_state
const size_t mem_required_state =
@ -1271,6 +1288,8 @@ static bool llama_model_load(
int n_gpu_layers,
int main_gpu,
float * tensor_split,
float rope_freq_base,
float rope_freq_scale,
bool low_vram,
ggml_type memory_type,
bool use_mmap,
@ -1279,7 +1298,7 @@ static bool llama_model_load(
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::exception & err) {
@ -1331,6 +1350,9 @@ static bool llama_eval_internal(
const int n_rot = hparams.n_embd/hparams.n_head;
const int n_gpu_layers = model.n_gpu_layers;

const float freq_base = hparams.rope_freq_base;
const float freq_scale = hparams.rope_freq_scale;

auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute;

@ -1428,11 +1450,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");

struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");
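Note: as a rough mental model of how the two new parameters are expected to enter the rotation — a sketch assuming the usual linear-scaling RoPE formula; the actual math lives inside ggml_rope_custom_inplace and may differ in detail:

// Sketch only: angle applied to dimension pair 2*i of a token at position pos.
// freq_base replaces the hard-coded 10000 of plain RoPE; freq_scale < 1 compresses
// positions, e.g. freq_scale = 0.5 maps a 4096-token context onto the 0..2047 range.
#include <cmath>

static float rope_theta(int pos, int i, int n_rot, float freq_base, float freq_scale) {
    return freq_scale * (float) pos * powf(freq_base, -2.0f * (float) i / (float) n_rot);
}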
@ -2006,9 +2028,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
}

// Normalize the second derivatives
float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
{
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);

if (second_derivatives_sum > 1e-6f) {
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
}
} else {
for (float & value : second_derivatives) {
value = 1.0f / second_derivatives.size();
}
}
}

float cum_sum = 0.0f;
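Note: the new branch guards the tail-free normalization against a (near-)flat candidate distribution: when all second derivatives are close to zero, the old code divided by a near-zero sum and produced inf/NaN weights, whereas the patch falls back to a uniform weighting. The same pattern as a standalone sketch (a hypothetical helper, not part of the patch):

#include <numeric>
#include <vector>

// Normalize v in place; if the total mass is ~0, fall back to a uniform distribution.
static void normalize_or_uniform(std::vector<float> & v) {
    const float sum = std::accumulate(v.begin(), v.end(), 0.0f);
    if (sum > 1e-6f) {
        for (float & x : v) x /= sum;
    } else {
        for (float & x : v) x = 1.0f / v.size();
    }
}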
@ -2187,7 +2218,7 @@ void llama_sample_classifier_free_guidance(
struct llama_context * guidance_ctx,
float scale,
float smooth_factor) {
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
int64_t t_start_sample_us = ggml_time_us();

assert(ctx);
auto n_vocab = llama_n_vocab(ctx);
@ -2675,8 +2706,9 @@ struct llama_model * llama_load_model_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
params.progress_callback_user_data)) {
delete model;
fprintf(stderr, "%s: failed to load model\n", __func__);
return nullptr;
@ -2697,7 +2729,7 @@ struct llama_context * llama_new_context_with_model(
return nullptr;
}

llama_context * ctx = new llama_context(*model, model->vocab);
llama_context * ctx = new llama_context(*model);

if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
@ -2751,9 +2783,9 @@ struct llama_context * llama_new_context_with_model(
ctx->embedding.resize(hparams.n_embd);
}

ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
}
@ -3535,13 +3567,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
return 0;
}

int llama_tokenize(
struct llama_context * ctx,
int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
auto res = llama_tokenize(ctx->vocab, text, add_bos);
auto res = llama_tokenize(model->vocab, text, add_bos);

if (n_max_tokens < (int) res.size()) {
fprintf(stderr, "%s: too many tokens\n", __func__);
@ -3555,8 +3587,29 @@ int llama_tokenize(
return res.size();
}

int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
}
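Note: a minimal usage sketch for the new model-level entry point — with llama_tokenize_with_model a prompt can be tokenized from a bare llama_model, without creating a llama_context. The model path and prompt are placeholders, and llama_free_model is assumed to be the matching cleanup call:

// Sketch: tokenize a prompt straight from a llama_model (no llama_context needed).
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model.bin",   // placeholder path
                                                     llama_context_default_params());
    if (!model) return 1;

    std::vector<llama_token> tokens(llama_n_ctx_from_model(model));
    const int n = llama_tokenize_with_model(model, "Hello world", tokens.data(), (int) tokens.size(), /*add_bos=*/ true);
    printf("tokenized into %d tokens\n", n < 0 ? -n : n);   // negative return means the buffer was too small

    llama_free_model(model);
    return 0;
}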
int llama_n_vocab_from_model(const struct llama_model * model) {
return model->vocab.id_to_token.size();
}

int llama_n_ctx_from_model(const struct llama_model * model) {
return model->hparams.n_ctx;
}

int llama_n_embd_from_model(const struct llama_model * model) {
return model->hparams.n_embd;
}

int llama_n_vocab(const struct llama_context * ctx) {
return ctx->vocab.id_to_token.size();
return ctx->model.vocab.id_to_token.size();
}

int llama_n_ctx(const struct llama_context * ctx) {
@ -3567,17 +3620,25 @@ int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity) {
int n = std::min(capacity, (int) model->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = model->vocab.id_to_token[i].tok.c_str();
scores[i] = model->vocab.id_to_token[i].score;
}
return n;
}

int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity) {
int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
scores[i] = ctx->vocab.id_to_token[i].score;
}
return n;
return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
}
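Note: a small sketch of reading the vocabulary directly from a loaded model with the new getters; buffer sizes come from llama_n_vocab_from_model, and the limit of 10 printed entries is arbitrary:

// Sketch: dump the first few vocabulary entries of a loaded model.
#include <cstdio>
#include <vector>
#include "llama.h"

static void dump_vocab(const llama_model * model) {
    const int n_vocab = llama_n_vocab_from_model(model);
    std::vector<const char *> strings(n_vocab);
    std::vector<float>        scores (n_vocab);
    const int n = llama_get_vocab_from_model(model, strings.data(), scores.data(), n_vocab);
    for (int i = 0; i < n && i < 10; ++i) {
        printf("%6d: '%s' (score %.3f)\n", i, strings[i], scores[i]);
    }
}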
float * llama_get_logits(struct llama_context * ctx) {
@ -3588,12 +3649,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}

const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
if (token >= llama_n_vocab(ctx)) {
const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
if (token >= llama_n_vocab_from_model(model)) {
return nullptr;
}

return ctx->vocab.id_to_token[token].tok.c_str();
return model->vocab.id_to_token[token].tok.c_str();
}

const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
return llama_token_to_str_with_model(&ctx->model, token);
}

llama_token llama_token_bos() {
32
llama.h
32
llama.h
@ -89,6 +89,11 @@ extern "C" {
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs

// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency
float rope_freq_scale; // RoPE frequency scaling factor

// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
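Note: a usage sketch for the two new fields; the concrete values are illustrative, not a recommendation. Per the referenced PR, a scale of 0.5 roughly doubles the usable context of a model trained at 2048 tokens:

// Sketch: request a 4096-token context with linearly scaled RoPE positions.
#include "llama.h"

static llama_context_params long_context_params() {
    llama_context_params params = llama_context_default_params();
    params.n_ctx           = 4096;
    params.rope_freq_base  = 10000.0f; // keep the default base frequency
    params.rope_freq_scale = 0.5f;     // compress positions 2x (linear scaling)
    return params;
}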
@ -148,6 +153,8 @@ extern "C" {
int32_t n_eval;
};

LLAMA_API int llama_max_devices();

LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
@ -270,10 +277,21 @@ extern "C" {
int n_max_tokens,
bool add_bos);

LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);

LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);

LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);

// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
@ -282,6 +300,12 @@ extern "C" {
float * scores,
int capacity);

LLAMA_API int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
@ -294,7 +318,13 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
LLAMA_API const char * llama_token_to_str(
const struct llama_context * ctx,
llama_token token);

LLAMA_API const char * llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token);

// Special tokens
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
2
scripts/verify-checksum-models.py
Normal file → Executable file
2
scripts/verify-checksum-models.py
Normal file → Executable file
@ -1,3 +1,5 @@
#!/bin/env python3

import os
import hashlib
@ -1,6 +1,7 @@
function(llama_add_test source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE llama)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
@ -200,4 +200,6 @@ int main(void) {
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);

printf("OK\n");

return 0;
}