2023-07-18 14:24:43 +03:00
#!/bin/bash
2023-07-22 11:48:22 +03:00
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug functio, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add know issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export funciton print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug.
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math.
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 21:26:23 +05:30
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
2023-07-18 14:24:43 +03:00
# Both positional arguments are required: <output-dir> and <mnt-dir>.
if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

# Resolve to absolute paths: the script changes directory further down,
# so relative paths given on the command line would stop being valid.
OUT=$(realpath "$1")
MNT=$(realpath "$2")
2024-01-26 07:18:00 -05:00
# Remove results left over from a previous run.
# The glob must stay OUTSIDE the quotes, otherwise rm receives the literal
# name "*.log" and nothing is deleted. ${OUT:?} aborts if OUT is empty so the
# patterns can never degrade to "/*.log".
rm -f "${OUT:?}"/*.log
rm -f "${OUT:?}"/*.exit
rm -f "${OUT:?}"/*.md
2023-07-18 14:24:43 +03:00
# SRC is the parent of the directory that contains this script.
sd=$(dirname "$0")
cd "$sd/../" || exit 1
SRC=$(pwd)
2024-02-17 16:03:14 -05:00
# Extra flags appended to every cmake invocation below.
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

# GG_BUILD_METAL: any non-empty value enables Metal shader debugging.
if [ -n "${GG_BUILD_METAL:-}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi

# GG_BUILD_CUDA: any non-empty value enables the CUDA backend.
if [ -n "${GG_BUILD_CUDA:-}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
fi
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug functio, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add know issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export funciton print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug.
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math.
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 21:26:23 +05:30
# GG_BUILD_SYCL: any non-empty value enables the SYCL backend, which needs an
# activated oneAPI environment (ONEAPI_ROOT set).
if [ -n "${GG_BUILD_SYCL:-}" ]; then
    if [ -z "${ONEAPI_ROOT:-}" ]; then
        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi

    # Every cache entry needs its own -D prefix; without it cmake would treat
    # "DCMAKE_C_COMPILER=icx" as a path argument instead of a definition.
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
2023-07-18 14:24:43 +03:00
## helpers
# Download a file if it does not exist or if it is outdated.
#
# Arguments:
#   $1 - destination directory (created if missing)
#   $2 - URL to fetch
#
# The download runs in a subshell, so the caller's working directory is never
# changed and wget cannot run in the wrong directory when the cd fails.
# A failed download is reported on stderr but is not fatal: the function
# still returns 0, as it did before.
function gg_wget {
    local out=$1
    local url=$2

    mkdir -p "$out"

    (
        cd "$out" || exit 1

        # should not re-download if file is the same
        wget -nv -N "$url"
    ) || echo "gg_wget: failed to fetch $url into $out" >&2
}
# Append printf-formatted text to the run summary at $OUT/README.md.
#
# Arguments: a printf format string followed by its arguments.
# "--" keeps a format that starts with "-" from being parsed as an option.
function gg_printf {
    printf -- "$@" >> "$OUT/README.md"
}
# Run one CI step and record its outcome.
#
# Arguments:
#   $1 - step name; gg_run_<name> and gg_sum_<name> must exist
#
# Globals:
#   ci  (written) - current step name, read by the gg_run_* / gg_sum_* functions
#   cur (written) - exit status of the step
#   ret (updated) - bitwise OR of all step statuses so far
#   OUT (read)    - results directory
function gg_run {
    ci=$1

    # pipefail makes $? reflect the step itself rather than tee
    set -o pipefail
    set -x

    "gg_run_${ci}" | tee "$OUT/${ci}.log"
    cur=$?
    echo "$cur" > "$OUT/${ci}.exit"

    set +x
    set +o pipefail

    "gg_sum_${ci}"

    ret=$((ret | cur))
}
## ci
# ctest_debug
# Configure and build a fresh Debug tree in $SRC/build-ci-debug, then run the
# ctest tests labelled "main", excluding test-opt.
# Output of each stage is appended to $OUT/${ci}-{cmake,make,ctest}.log.
function gg_run_ctest_debug {
    cd "${SRC}" || return 1

    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug

    set -e

    # CMAKE_EXTRA is deliberately unquoted: it holds several space-separated flags
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                                          ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"

    set +e
}
# Append the summary of the ctest_debug step (exit status and ctest log)
# to the README via gg_printf.
function gg_sum_ctest_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in debug mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-ctest.log")"
    gg_printf '```\n'
    gg_printf '\n'
}
# ctest_release
# Configure and build a fresh Release tree in $SRC/build-ci-release, then run
# the ctest tests labelled "main".
# When GG_BUILD_LOW_PERF is set (non-empty), test-opt is excluded.
# Output of each stage is appended to $OUT/${ci}-{cmake,make,ctest}.log.
function gg_run_ctest_release {
    cd "${SRC}" || return 1

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    # CMAKE_EXTRA is deliberately unquoted: it holds several space-separated flags
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                                            ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    if [ -z "${GG_BUILD_LOW_PERF:-}" ]; then
        (time ctest --output-on-failure -L main ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    else
        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    fi

    set +e
}
# Append the summary of the ctest_release step (exit status and ctest log)
# to the README via gg_printf.
function gg_sum_ctest_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in release mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-ctest.log")"
    gg_printf '```\n'
}
2024-04-14 13:12:59 +02:00
# test_scripts_debug
# Run the tests.sh scripts of the gguf-split and quantize examples against
# the Debug binaries. Each script receives the binary directory and the
# models directory; output is appended to $OUT/${ci}-scripts.log.
function gg_run_test_scripts_debug {
    cd "${SRC}" || return 1

    set -e

    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a "$OUT/${ci}-scripts.log"
    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a "$OUT/${ci}-scripts.log"

    set +e
}
# Append the summary of the test_scripts_debug step (exit status and script
# log) to the README via gg_printf.
function gg_sum_test_scripts_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs test scripts in debug mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-scripts.log")"
    gg_printf '```\n'
    gg_printf '\n'
}
# test_scripts_release
# Run the tests.sh scripts of the gguf-split and quantize examples against
# the Release binaries. Each script receives the binary directory and the
# models directory; output is appended to $OUT/${ci}-scripts.log.
function gg_run_test_scripts_release {
    cd "${SRC}" || return 1

    set -e

    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a "$OUT/${ci}-scripts.log"
    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a "$OUT/${ci}-scripts.log"

    set +e
}
# Append the summary of the test_scripts_release step (exit status and script
# log) to the README via gg_printf.
function gg_sum_test_scripts_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs test scripts in release mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-scripts.log")"
    gg_printf '```\n'
    gg_printf '\n'
}
2024-01-26 07:18:00 -05:00
# Print (without trailing newline) the path of the first non-empty f16 GGUF
# model found under $MNT/models/open-llama, preferring 3B-v2 over 7B-v2.
#
# When neither file exists, a message is written to stderr and the shell
# exits with status 1. The callers in this file invoke the function through
# command substitution, so that exit only ends the substitution subshell.
function gg_get_model {
    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"

    if [[ -s "$gguf_3b" ]]; then
        printf '%s' "$gguf_3b"
    elif [[ -s "$gguf_7b" ]]; then
        printf '%s' "$gguf_7b"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
    fi
}
# Run the ctest tests labelled "model" inside the existing Debug build tree,
# passing the model chosen by gg_get_model via LLAMACPP_TEST_MODELFILE.
# Returns non-zero without running ctest when no model is available.
function gg_run_ctest_with_model_debug {
    cd "${SRC}" || return 1

    local model
    model=$(gg_get_model) || return 1

    cd build-ci-debug || return 1
    set -e
    # "time" must come first: after a VAR=value prefix bash no longer treats it
    # as the reserved word and would look for an external `time` program.
    (time LLAMACPP_TEST_MODELFILE="$model" ctest --output-on-failure -L model) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    set +e
    cd ..
}
# Run the ctest tests labelled "model" inside the existing Release build tree,
# passing the model chosen by gg_get_model via LLAMACPP_TEST_MODELFILE.
# Returns non-zero without running ctest when no model is available.
function gg_run_ctest_with_model_release {
    cd "${SRC}" || return 1

    local model
    model=$(gg_get_model) || return 1

    cd build-ci-release || return 1
    set -e
    # "time" must come first: after a VAR=value prefix bash no longer treats it
    # as the reserved word and would look for an external `time` program.
    (time LLAMACPP_TEST_MODELFILE="$model" ctest --output-on-failure -L model) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    set +e
    cd ..
}
# Append the summary of the ctest_with_model_debug step (exit status and
# ctest log) to the README via gg_printf.
function gg_sum_ctest_with_model_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in debug mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-ctest.log")"
    gg_printf '```\n'
}
# Append the summary of the ctest_with_model_release step (exit status and
# ctest log) to the README via gg_printf.
function gg_sum_ctest_with_model_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in release mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-ctest.log")"
    gg_printf '```\n'
}
2023-07-18 14:24:43 +03:00
# open_llama_3b_v2
# End-to-end check with OpenLLaMA 3B-v2:
#   1. download the model files and the wikitext-2 archive (gg_wget)
#   2. build a fresh Release tree with -DLLAMA_QKK_64=1
#   3. convert the model to f16 GGUF and quantize it to every type in `quants`
#   4. run text generation and a 1-chunk perplexity pass for each model;
#      both append to $OUT/${ci}-tg-<type>.log
#   5. run imatrix on f16 and save-load-state (with and without -fa) on q4_0
#   6. fail if any perplexity value is missing or greater than 20.0
#
# Globals read: SRC, OUT, ci, CMAKE_EXTRA
function gg_run_open_llama_3b_v2 {
    cd "${SRC}" || return 1

    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    # only the first 60 lines of the test set are used, to keep the run short
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    # relative to the build directory entered below
    local path_models="../models-mnt/open-llama/3B-v2"
    local path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    # CMAKE_EXTRA is deliberately unquoted: it holds several space-separated flags
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                                                             ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    python3 ../convert.py "${path_models}"

    # quantization types, in the order they are produced and evaluated
    local -a quants=(q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k)
    local q

    local model_f16="${path_models}/ggml-model-f16.gguf"
    local model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    local wiki_test_60="${path_wiki}/wiki.test-60.raw"

    for q in "${quants[@]}"; do
        ./bin/quantize "${model_f16}" "${path_models}/ggml-model-${q}.gguf" "${q}"
    done

    for q in f16 "${quants[@]}"; do
        (time ./bin/main --model "${path_models}/ggml-model-${q}.gguf" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-${q}.log"
    done

    for q in f16 "${quants[@]}"; do
        (time ./bin/perplexity --model "${path_models}/ggml-model-${q}.gguf" -f "${wiki_test_60}" -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a "$OUT/${ci}-tg-${q}.log"
    done

    (time ./bin/imatrix --model "${model_f16}" -f "${wiki_test_60}" -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a "$OUT/${ci}-imatrix.log"

    (time ./bin/save-load-state     --model "${model_q4_0}" ) 2>&1 | tee -a "$OUT/${ci}-save-load-state.log"
    (time ./bin/save-load-state -fa --model "${model_q4_0}" ) 2>&1 | tee -a "$OUT/${ci}-save-load-state.log"

    # check_ppl <label> <perplexity output line(s)>
    # Takes the last decimal number in $2 as the perplexity and prints one
    # result line; returns 20 when the value is missing or greater than 20.0.
    function check_ppl {
        local qnt="$1"
        local ppl
        # "|| true": an empty match is handled explicitly below instead of
        # tripping set -e / pipefail without any message
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) || true

        if [ -z "$ppl" ]; then
            printf ' - %s @ (FAIL: no perplexity value found)\n' "$qnt"
            return 20
        fi

        if [ "$(echo "$ppl > 20.0" | bc)" -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    # lines starting with "[1]" are the ones handed to check_ppl
    for q in f16 "${quants[@]}"; do
        check_ppl "${q}" "$(grep "^\[1\]" "$OUT/${ci}-tg-${q}.log")" | tee -a "$OUT/${ci}-ppl.log"
    done

    grep "Final" "$OUT/${ci}-imatrix.log" >> "$OUT/${ci}-imatrix-sum.log"

    set +e
}
# Append the summary of the open_llama_3b_v2 step to the README via gg_printf:
# exit status, perplexity results, imatrix summary, the per-model logs and
# the save-load-state log.
function gg_sum_open_llama_3b_v2 {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '- perplexity:\n%s\n' "$(cat "$OUT/${ci}-ppl.log")"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat "$OUT/${ci}-imatrix-sum.log")"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat "$OUT/${ci}-tg-f16.log")"

    local q
    for q in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
        gg_printf '- %s:\n```\n%s\n```\n' "${q}" "$(cat "$OUT/${ci}-tg-${q}.log")"
    done

    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat "$OUT/${ci}-save-load-state.log")"
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA

# Validate the perplexity reported for one quantization.
#
# Arguments:
#   $1 - quantization label (e.g. "f16")
#   $2 - text containing the perplexity estimates; the last decimal number
#        found in it is the value that gets checked
# Outputs:
#   one result line on stdout
# Returns:
#   0 when the value is not above 20.0, 20 when it is above 20.0 or missing
function check_ppl {
    local qnt="$1"
    local ppl

    ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) || ppl=""

    # With no number to compare, the bc expression below is malformed, the
    # numeric test errors out and the quantization would be reported as OK.
    if [ -z "$ppl" ]; then
        printf ' - %s (FAIL: no perplexity value found)\n' "$qnt"
        return 20
    fi

    if [ "$(echo "$ppl > 20.0" | bc)" -eq 1 ]; then
        printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
        return 20
    fi

    printf ' - %s @ %s OK\n' "$qnt" "$ppl"
    return 0
}

# Download OpenLLaMA 7B-v2 and the wikitext-2 test set, build with CUDA
# enabled, convert and quantize the model, then run text generation,
# perplexity, imatrix and save-load-state on it and validate the perplexities.
#
# Globals:
#   SRC, OUT, ci, CMAKE_EXTRA (read)
#   path_models, path_wiki, wiki_test, model_* (written)
# Note:
#   errexit is switched on after the downloads and switched off again at the
#   end, so any failing step in between aborts the run.
function gg_run_open_llama_7b_v2 {
    local q

    cd "${SRC}" || return 1

    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    path_models="../models-mnt/open-llama/7B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    # ${CMAKE_EXTRA} stays unquoted on purpose so it can expand to several flags
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                                                          ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    python3 ../convert.py "${path_models}"

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    ./bin/quantize "${model_f16}" "${model_q8_0}" q8_0
    ./bin/quantize "${model_f16}" "${model_q4_0}" q4_0
    ./bin/quantize "${model_f16}" "${model_q4_1}" q4_1
    ./bin/quantize "${model_f16}" "${model_q5_0}" q5_0
    ./bin/quantize "${model_f16}" "${model_q5_1}" q5_1
    ./bin/quantize "${model_f16}" "${model_q2_k}" q2_k
    ./bin/quantize "${model_f16}" "${model_q3_k}" q3_k
    ./bin/quantize "${model_f16}" "${model_q4_k}" q4_k
    ./bin/quantize "${model_f16}" "${model_q5_k}" q5_k
    ./bin/quantize "${model_f16}" "${model_q6_k}" q6_k

    # text generation
    (time ./bin/main --model "${model_f16}"  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-f16.log"
    (time ./bin/main --model "${model_q8_0}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q8_0.log"
    (time ./bin/main --model "${model_q4_0}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q4_0.log"
    (time ./bin/main --model "${model_q4_1}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q4_1.log"
    (time ./bin/main --model "${model_q5_0}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q5_0.log"
    (time ./bin/main --model "${model_q5_1}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q5_1.log"
    (time ./bin/main --model "${model_q2_k}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q2_k.log"
    (time ./bin/main --model "${model_q3_k}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q3_k.log"
    (time ./bin/main --model "${model_q4_k}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q4_k.log"
    (time ./bin/main --model "${model_q5_k}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q5_k.log"
    (time ./bin/main --model "${model_q6_k}" -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q6_k.log"

    # perplexity; appended to the same per-quantization logs as text generation
    (time ./bin/perplexity --model "${model_f16}"  -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-f16.log"
    (time ./bin/perplexity --model "${model_q8_0}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q8_0.log"
    (time ./bin/perplexity --model "${model_q4_0}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q4_0.log"
    (time ./bin/perplexity --model "${model_q4_1}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q4_1.log"
    (time ./bin/perplexity --model "${model_q5_0}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q5_0.log"
    (time ./bin/perplexity --model "${model_q5_1}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q5_1.log"
    (time ./bin/perplexity --model "${model_q2_k}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q2_k.log"
    (time ./bin/perplexity --model "${model_q3_k}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q3_k.log"
    (time ./bin/perplexity --model "${model_q4_k}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q4_k.log"
    (time ./bin/perplexity --model "${model_q5_k}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q5_k.log"
    (time ./bin/perplexity --model "${model_q6_k}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-tg-q6_k.log"

    (time ./bin/imatrix --model "${model_f16}" -f "${wiki_test}" -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a "$OUT/${ci}-imatrix.log"

    # save/load state with partial and full offload, with and without -fa
    (time ./bin/save-load-state     -ngl 10 --model "${model_q4_0}" ) 2>&1 | tee -a "$OUT/${ci}-save-load-state.log"
    (time ./bin/save-load-state -fa -ngl 10 --model "${model_q4_0}" ) 2>&1 | tee -a "$OUT/${ci}-save-load-state.log"
    (time ./bin/save-load-state     -ngl 99 --model "${model_q4_0}" ) 2>&1 | tee -a "$OUT/${ci}-save-load-state.log"
    (time ./bin/save-load-state -fa -ngl 99 --model "${model_q4_0}" ) 2>&1 | tee -a "$OUT/${ci}-save-load-state.log"

    # the "[1]..." line of each log carries the running perplexity estimates
    for q in f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
        check_ppl "${q}" "$(grep "^\[1\]" "$OUT/${ci}-tg-${q}.log")" | tee -a "$OUT/${ci}-ppl.log"
    done

    grep "Final" "$OUT/${ci}-imatrix.log" >> "$OUT/${ci}-imatrix-sum.log"

    set +e
}
# Emit the Markdown summary for the open_llama_7b_v2 step.
#
# Globals:
#   ci  (read) - name of the current CI step, used as the log-file prefix
#   OUT (read) - directory that holds the step's log files
# Outputs:
#   every line is emitted through gg_printf
function gg_sum_open_llama_7b_v2 {
    local qnt

    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 7B-v2:\n'
    gg_printf '- status: %s\n'             "$(cat "$OUT/${ci}.exit")"
    gg_printf '- perplexity:\n%s\n'        "$(cat "$OUT/${ci}-ppl.log")"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat "$OUT/${ci}-imatrix-sum.log")"

    # f16 is printed on its own because its label carries a trailing space
    gg_printf '- f16: \n```\n%s\n```\n'    "$(cat "$OUT/${ci}-tg-f16.log")"

    for qnt in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
        gg_printf '- %s:\n```\n%s\n```\n' "${qnt}" "$(cat "$OUT/${ci}-tg-${qnt}.log")"
    done

    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat "$OUT/${ci}-save-load-state.log")"
}
2024-02-13 13:01:29 +02:00
# bge-small
# Download BGE-small, build in Release mode, convert the model to GGUF,
# quantize it to q8_0 and run the embedding example on both variants.
#
# Globals:
#   SRC, OUT, ci, CMAKE_EXTRA (read)
#   path_models, model_f16, model_q8_0 (written)
# Note:
#   errexit is switched on after the downloads and switched off again at the
#   end, so any failing step in between aborts the run.
function gg_run_embd_bge_small {
    # bail out early: the rm -rf below must not run in an unexpected directory
    cd "${SRC}" || return 1

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json

    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json

    path_models="../models-mnt/bge-small"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    # ${CMAKE_EXTRA} stays unquoted on purpose so it can expand to several flags
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                                          ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    python3 ../convert-hf-to-gguf.py "${path_models}"

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

    ./bin/quantize "${model_f16}" "${model_q8_0}" q8_0

    (time ./bin/embedding --model "${model_f16}"  -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-f16.log"
    (time ./bin/embedding --model "${model_q8_0}" -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-q8_0.log"

    set +e
}
# Emit the Markdown summary for the embd_bge_small step.
#
# Globals:
#   ci  (read) - name of the current CI step, used as the log-file prefix
#   OUT (read) - directory that holds the step's log files
# Outputs:
#   every line is emitted through gg_printf
function gg_sum_embd_bge_small {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'BGE Small (BERT):\n'
    gg_printf '- status: %s\n'          "$(cat "$OUT/${ci}.exit")"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat "$OUT/${ci}-tg-f16.log")"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat "$OUT/${ci}-tg-q8_0.log")"
}
2023-07-18 14:24:43 +03:00
## main
#
# Prepares the model mount and the python environment (skipped when
# GG_BUILD_LOW_PERF is set), then runs the CI steps in order. Each step is
# only started while $ret is still 0; the script exits with $ret.

if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    # Create symlink: ${SRC}/models-mnt -> ${MNT}/models
    # (":?" aborts instead of touching /models-mnt when SRC is empty)
    rm -rf "${SRC:?}/models-mnt"
    mnt_models="${MNT}/models"
    mkdir -p "${mnt_models}"
    ln -sfn "${mnt_models}" "${SRC}/models-mnt"

    # Create a fresh python3 venv and enter it
    python3 -m venv "$MNT/venv"
    source "$MNT/venv/bin/activate"

    pip install -r "${SRC}/requirements.txt" --disable-pip-version-check
    pip install --editable gguf-py --disable-pip-version-check
fi

ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    test $ret -eq 0 && gg_run embd_bge_small

    # script tests run everywhere except on cloud runners, unless explicitly requested
    if [ -z "${GG_BUILD_CLOUD}" ] || [ -n "${GG_BUILD_EXTRA_TESTS_0}" ]; then
        test $ret -eq 0 && gg_run test_scripts_debug
        test $ret -eq 0 && gg_run test_scripts_release
    fi

    # model tests run when the VRAM size is unspecified or at least 8 GB
    if [ -z "${GG_BUILD_VRAM_GB}" ] || [ "${GG_BUILD_VRAM_GB}" -ge 8 ]; then
        if [ -z "${GG_BUILD_CUDA}" ]; then
            #test $ret -eq 0 && gg_run open_llama_3b_v2
            date # dummy
        else
            test $ret -eq 0 && gg_run open_llama_7b_v2
        fi

        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
fi

exit $ret