llama.cpp/scripts/check-requirements.sh

#!/bin/bash
set -euo pipefail

#
# check-requirements.sh checks all requirements files for each top-level
# convert*.py script.
#
# WARNING: This is quite IO intensive, because a fresh venv is set up for every
# python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately
# sized tmpfs /tmp or ramdisk is recommended if running this frequently.
#
# usage:    check-requirements.sh [<working_dir>]
#           check-requirements.sh nocleanup [<working_dir>]
#
# where:
#           - <working_dir> is a directory that can be used as the base for
#               setting up the venvs. Defaults to `/tmp`.
#           - 'nocleanup' as the first argument will disable automatic cleanup
#               of the files created by this script.
#
# requires:
#           - bash >= 3.2.57
#           - shellcheck
#           - python3 (with the venv module and pip)
#           - realpath, mktemp, grep
#
# For each script, it creates a fresh venv, `pip install`s the requirements, and
# finally imports the python script to check for `ImportError`.
#

# log LEVEL [MSG...]
# Writes "LEVEL: MSG" plus a newline to stderr.
# All arguments after LEVEL make up the message; several arguments are joined
# with the first character of IFS (a space by default) so no caller-supplied
# text is dropped. A missing message yields an empty one instead of tripping
# `set -u`.
log() {
    local level=${1-}
    if (( $# > 0 )); then
        shift
    fi
    printf >&2 '%s: %s\n' "$level" "$*"
}

# debug MSG: hand MSG to log, tagged with the DEBUG level.
debug() { log DEBUG "$@"; }

# info MSG: hand MSG to log, tagged with the INFO level.
info() { log INFO "$@"; }

# fatal MSG: hand MSG to log at the FATAL level, then terminate the shell with
# exit status 1.
fatal() { log FATAL "$@"; exit 1; }

# cleanup
# Recursively deletes the working directory, provided $workdir is set and
# names an existing, writable directory; otherwise does nothing.
# Globals:  workdir (read)
# Outputs:  stderr only -- an INFO line before and after the removal, with a
#           sparse row of progress dots in between, so stdout stays clean and
#           the progress output stays with the rest of the script's log.
cleanup() {
    if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then
        info "Removing $workdir"
        # `rm -v` prints one line per removed entry; print a single '.' each
        # time the counter passes 750 and then restart it. The counter is
        # modified inside the pipeline's subshell, which is fine because it
        # is only needed while the loop runs.
        local count=0
        rm -rfv -- "$workdir" | while read -r; do
            if (( count++ > 750 )); then
                printf >&2 .
                count=0
            fi
        done
        printf >&2 '\n'
        info "Removed $workdir"
    fi
}

# Cleanup is enabled unless the first argument is the literal word
# 'nocleanup'; that word is consumed so the optional <working_dir> is left
# in $1.
case ${1-} in
    nocleanup)
        do_cleanup=0
        shift
        ;;
    *)
        do_cleanup=1
        ;;
esac

# With cleanup enabled, INT and TERM are turned into a regular exit, which in
# turn fires the EXIT trap that runs cleanup.
if [[ $do_cleanup -ne 0 ]]; then
    trap cleanup EXIT
    trap exit INT TERM
fi

# Resolve this script's absolute path, then move to the parent of the
# directory that contains it.
this=$(realpath -- "$0")
readonly this
cd "$(dirname "$this")/.." # PWD should stay in llama.cpp project directory

# Lint this very script before doing any real work.
shellcheck "$this"

# Directory, relative to the current directory, that holds the per-script
# requirements files.
reqs_dir=requirements
readonly reqs_dir

# Pick the base directory for the venvs: the optional <working_dir> argument,
# which must resolve to an existing, writable directory, or /tmp otherwise.
if [[ ${1+x} ]]; then
    tmp_dir=$(realpath -- "$1")
    if [[ ! ( -d $tmp_dir && -w $tmp_dir ) ]]; then
        fatal "$tmp_dir is not a writable directory"
    fi
else
    tmp_dir=/tmp
fi

# Create a uniquely named scratch directory below it. The template ends in six
# X's: POSIX mkdtemp() requires exactly that, so stricter mktemp
# implementations (e.g. BusyBox) reject shorter runs. It also makes the random
# suffix harder to guess.
workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXXXX"); readonly workdir
info "Working directory: $workdir"

# check_requirements FILE
# Installs the requirements listed in FILE using whichever `pip` is first on
# PATH (quietly, and with pip's own version check disabled). An INFO line is
# logged before the install and another one after it.
check_requirements() {
    local requirements_file=$1

    info "${requirements_file}: beginning check"
    pip --disable-pip-version-check install -qr "$requirements_file"
    info "${requirements_file}: OK"
}

# check_convert_script PY
# Checks that the Python script PY can be imported after its own requirements
# file has been installed into a fresh venv.
# Globals:   reqs_dir, workdir, do_cleanup (all read)
# Arguments: $1 - path to the script, e.g. ./convert_hf_to_gguf.py
# Exits:     through fatal when "$reqs_dir/requirements-<name>.txt" is not
#            readable (<name> is the script's basename without ".py")
check_convert_script() {
    local py=$1             # e.g. ./convert_hf_to_gguf.py
    local pyname=${py##*/}  # e.g. convert_hf_to_gguf.py
    pyname=${pyname%.py}    # e.g. convert_hf_to_gguf

    info "$py: beginning check"

    local reqs="$reqs_dir/requirements-$pyname.txt"
    if [[ ! -r $reqs ]]; then
        fatal "$py missing requirements. Expected: $reqs"
    fi

    local venv="$workdir/$pyname-venv"
    python3 -m venv "$venv"

    # Run in a subshell so that activating the venv does not leak into the
    # rest of the script.
    (
        # shellcheck source=/dev/null
        source "$venv/bin/activate"

        check_requirements "$reqs"

        # Import the script under its own module name. Loader.load_module() is
        # deprecated and scheduled for removal from Python, so the module is
        # built from a spec and run with exec_module() instead. The explicit
        # SourceFileLoader keeps the original's behaviour of loading the file
        # as Python source.
        python - "$py" "$pyname" <<'EOF'
import sys
from importlib.machinery import SourceFileLoader
from importlib.util import module_from_spec, spec_from_loader
py, pyname = sys.argv[1:]
loader = SourceFileLoader(pyname, py)
module = module_from_spec(spec_from_loader(pyname, loader))
sys.modules[pyname] = module
loader.exec_module(module)
EOF
    )

    # The venv is kept for inspection when cleanup is disabled.
    if (( do_cleanup )); then
        rm -rf -- "$venv"
    fi

    info "$py: imports OK"
}

# Marker that exempts a requirements line from the "no exact pins" rule below.
readonly ignore_eq_eq='check_requirements: ignore "=="'

for req in "$reqs_dir"/*; do
    # Check that all sub-requirements are added to top-level requirements.txt
    if ! grep -qF "$req" requirements.txt; then
        fatal "$req needs to be added to requirements.txt"
    fi

    # Make sure exact release versions aren't being pinned in the requirements.
    # Lines that carry the ignore string are exempt.
    #
    # This is done in a single awk process rather than `grep -v | grep -q`:
    # under `set -o pipefail`, a `grep -q` that exits on its first match can
    # leave the upstream grep to die from SIGPIPE, which makes the whole
    # pipeline count as failed and lets the pin go unreported.
    if awk -v ignore="$ignore_eq_eq" '
            index($0, ignore) == 0 && index($0, "==") > 0 { found = 1; exit }
            END { exit !found }
        ' "$req"; then
        tab=$'\t'
        cat >&2 <<EOF
FATAL: Avoid pinning exact package versions. Use '~=' instead.
You can suppress this error by appending the following to the line:
$tab# $ignore_eq_eq
EOF
        exit 1
    fi
done

# Install the top-level requirements.txt into a venv of its own.
all_venv="$workdir/all-venv"
python3 -m venv "$all_venv"

# Activate inside a subshell so the venv does not stay active afterwards.
(
    # shellcheck source=/dev/null
    . "$all_venv/bin/activate"
    check_requirements requirements.txt
)

# Drop the venv right away unless cleanup was disabled.
if [[ $do_cleanup -ne 0 ]]; then
    rm -rf -- "$all_venv"
fi

# The legacy llama converter lives under examples/ and is checked explicitly.
check_convert_script examples/convert_legacy_llama.py

# Then check each convert_*.py script in the current directory.
for py in convert_*.py; do
    # convert_hf_to_gguf_update.py is skipped.
    # TODO: the check is failing for some reason:
    #       https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
    if [[ $py != convert_hf_to_gguf_update.py ]]; then
        check_convert_script "$py"
    fi
done

info 'Done! No issues found.'
python : add check-requirements.sh and GitHub workflow (#4585) * python: add check-requirements.sh and GitHub workflow This script and workflow forces package versions to remain compatible across all convert.py scripts, while allowing secondary convert scripts to import dependencies not wanted in convert.py. Move requirements into ./requirements * Fail on "==" being used for package requirements (but can be suppressed) * Enforce "compatible release" syntax instead of == * Update workflow * Add upper version bound for transformers and protobuf * improve check-requirements.sh * small syntax change * don't remove venvs if nocleanup is passed * See if this fixes docker workflow * Move check-requirements.sh into ./scripts/ --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai> 2023-12-29 15:50:29 +01:00			`#!/bin/bash`
			`set -euo pipefail`

			`#`
			`# check-requirements.sh checks all requirements files for each top-level`
			`# convert*.py script.`
			`#`
			`# WARNING: This is quite IO intensive, because a fresh venv is set up for every`
			`# python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately`
			`# sized tmpfs /tmp or ramdisk is recommended if running this frequently.`
			`#`
			`# usage: check-requirements.sh [<working_dir>]`
			`# check-requirements.sh nocleanup [<working_dir>]`
			`#`
			`# where:`
			`# - <working_dir> is a directory that can be used as the base for`
			# setting up the venvs. Defaults to `/tmp`.
			`# - 'nocleanup' as the first argument will disable automatic cleanup`
			`# of the files created by this script.`
			`#`
			`# requires:`
			`# - bash >= 3.2.57`
			`# - shellcheck`
			`#`
			# For each script, it creates a fresh venv, `pip install`s the requirements, and
			# finally imports the python script to check for `ImportError`.
			`#`

			`log() {`
			`local level=$1 msg=$2`
			`printf >&2 '%s: %s\n' "$level" "$msg"`
			`}`

			`debug() {`
			`log DEBUG "$@"`
			`}`

			`info() {`
			`log INFO "$@"`
			`}`

			`fatal() {`
			`log FATAL "$@"`
			`exit 1`
			`}`

			`cleanup() {`
			`if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then`
			`info "Removing $workdir"`
			`local count=0`
			`rm -rfv -- "$workdir" \| while read -r; do`
			`if (( count++ > 750 )); then`
			`printf .`
			`count=0`
			`fi`
			`done`
			`printf '\n'`
			`info "Removed $workdir"`
			`fi`
			`}`

			`do_cleanup=1`
			`if [[ ${1-} == nocleanup ]]; then`
			`do_cleanup=0; shift`
			`fi`

			`if (( do_cleanup )); then`
			`trap exit INT TERM`
			`trap cleanup EXIT`
			`fi`

			`this=$(realpath -- "$0"); readonly this`
			`cd "$(dirname "$this")/.." # PWD should stay in llama.cpp project directory`

			`shellcheck "$this"`

			`readonly reqs_dir=requirements`

			`if [[ ${1+x} ]]; then`
			`tmp_dir=$(realpath -- "$1")`
			`if [[ ! ( -d $tmp_dir && -w $tmp_dir ) ]]; then`
			`fatal "$tmp_dir is not a writable directory"`
			`fi`
			`else`
			`tmp_dir=/tmp`
			`fi`

			`workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXX"); readonly workdir`
			`info "Working directory: $workdir"`

			`check_requirements() {`
			`local reqs=$1`

			`info "$reqs: beginning check"`
			`pip --disable-pip-version-check install -qr "$reqs"`
			`info "$reqs: OK"`
			`}`

			`check_convert_script() {`
py : switch to snake_case (#8305) * py : switch to snake_case ggml-ci * cont ggml-ci * cont ggml-ci * cont : fix link * gguf-py : use snake_case in scripts entrypoint export * py : rename requirements for convert_legacy_llama.py Needed for scripts/check-requirements.sh --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-07-05 06:53:33 +02:00			`local py=$1 # e.g. ./convert_hf_to_gguf.py`
			`local pyname=${py##*/} # e.g. convert_hf_to_gguf.py`
			`pyname=${pyname%.py} # e.g. convert_hf_to_gguf`
python : add check-requirements.sh and GitHub workflow (#4585) * python: add check-requirements.sh and GitHub workflow This script and workflow forces package versions to remain compatible across all convert.py scripts, while allowing secondary convert scripts to import dependencies not wanted in convert.py. Move requirements into ./requirements * Fail on "==" being used for package requirements (but can be suppressed) * Enforce "compatible release" syntax instead of == * Update workflow * Add upper version bound for transformers and protobuf * improve check-requirements.sh * small syntax change * don't remove venvs if nocleanup is passed * See if this fixes docker workflow * Move check-requirements.sh into ./scripts/ --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai> 2023-12-29 15:50:29 +01:00
			`info "$py: beginning check"`

			`local reqs="$reqs_dir/requirements-$pyname.txt"`
			`if [[ ! -r $reqs ]]; then`
			`fatal "$py missing requirements. Expected: $reqs"`
			`fi`

			`local venv="$workdir/$pyname-venv"`
			`python3 -m venv "$venv"`

			`(`
			`# shellcheck source=/dev/null`
			`source "$venv/bin/activate"`

			`check_requirements "$reqs"`

			`python - "$py" "$pyname" <<'EOF'`
			`import sys`
			`from importlib.machinery import SourceFileLoader`
			`py, pyname = sys.argv[1:]`
			`SourceFileLoader(pyname, py).load_module()`
			`EOF`
			`)`

			`if (( do_cleanup )); then`
			`rm -rf -- "$venv"`
			`fi`

			`info "$py: imports OK"`
			`}`

			`readonly ignore_eq_eq='check_requirements: ignore "=="'`

			`for req in "$reqs_dir"/*; do`
			`# Check that all sub-requirements are added to top-level requirements.txt`
			`if ! grep -qF "$req" requirements.txt; then`
			`fatal "$req needs to be added to requirements.txt"`
			`fi`

			`# Make sure exact release versions aren't being pinned in the requirements`
			`# Filters out the ignore string`
			`if grep -vF "$ignore_eq_eq" "$req" \| grep -q '=='; then`
			`tab=$'\t'`
			`cat >&2 <<EOF`
			`FATAL: Avoid pinning exact package versions. Use '~=' instead.`
			`You can suppress this error by appending the following to the line:`
			`$tab# $ignore_eq_eq`
			`EOF`
			`exit 1`
			`fi`
			`done`

			`all_venv="$workdir/all-venv"`
			`python3 -m venv "$all_venv"`

			`(`
			`# shellcheck source=/dev/null`
			`source "$all_venv/bin/activate"`
			`check_requirements requirements.txt`
			`)`

			`if (( do_cleanup )); then`
			`rm -rf -- "$all_venv"`
			`fi`

py : switch to snake_case (#8305) * py : switch to snake_case ggml-ci * cont ggml-ci * cont ggml-ci * cont : fix link * gguf-py : use snake_case in scripts entrypoint export * py : rename requirements for convert_legacy_llama.py Needed for scripts/check-requirements.sh --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-07-05 06:53:33 +02:00			`check_convert_script examples/convert_legacy_llama.py`
fix: Update script paths in CI scripts 2024-03-10 18:51:46 +01:00			`for py in convert_*.py; do`
py : switch to snake_case (#8305) * py : switch to snake_case ggml-ci * cont ggml-ci * cont ggml-ci * cont : fix link * gguf-py : use snake_case in scripts entrypoint export * py : rename requirements for convert_legacy_llama.py Needed for scripts/check-requirements.sh --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-07-05 06:53:33 +02:00			`# skip convert_hf_to_gguf_update.py`
llama : fix BPE pre-tokenization (#6920) * merged the changes from deepseeker models to main branch * Moved regex patterns to unicode.cpp and updated unicode.h * Moved header files * Resolved issues * added and refactored unicode_regex_split and related functions * Updated/merged the deepseek coder pr * Refactored code * Adding unicode regex mappings * Adding unicode regex function * Added needed functionality, testing remains * Fixed issues * Fixed issue with gpt2 regex custom preprocessor * unicode : fix? unicode_wstring_to_utf8 * lint : fix whitespaces * tests : add tokenizer tests for numbers * unicode : remove redundant headers * tests : remove and rename tokenizer test scripts * tests : add sample usage * gguf-py : reader prints warnings on duplicate keys * llama : towards llama3 tokenization support (wip) * unicode : shot in the dark to fix tests on Windows * unicode : first try custom implementations * convert : add "tokenizer.ggml.pre" GGUF KV (wip) * llama : use new pre-tokenizer type * convert : fix pre-tokenizer type writing * lint : fix * make : add test-tokenizer-0-llama-v3 * wip * models : add llama v3 vocab file * llama : adapt punctuation regex + add llama 3 regex * minor * unicode : set bomb * unicode : set bomb * unicode : always use std::wregex * unicode : support \p{N}, \p{L} and \p{P} natively * unicode : try fix windows * unicode : category support via std::regex * unicode : clean-up * unicode : simplify * convert : add convert-hf-to-gguf-update.py ggml-ci * lint : update * convert : add falcon ggml-ci * unicode : normalize signatures * lint : fix * lint : fix * convert : remove unused functions * convert : add comments * convert : exercise contractions ggml-ci * lint : fix * cmake : refactor test targets * tests : refactor vocab tests ggml-ci * tests : add more vocabs and tests ggml-ci * unicode : cleanup * scripts : ignore new update script in check-requirements.sh * models : add phi-3, mpt, gpt-2, starcoder * tests : disable obsolete 
ggml-ci * tests : use faster bpe test ggml-ci * llama : more prominent warning for old BPE models * tests : disable test-tokenizer-1-bpe due to slowness ggml-ci --------- Co-authored-by: Jaggzh <jaggz.h@gmail.com> Co-authored-by: Kazim Abrar Mahi <kazimabrarmahi135@gmail.com> 2024-04-29 15:58:41 +02:00			`# TODO: the check is failing for some reason:`
			`# https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920`
fix: Update script paths in CI scripts 2024-03-10 18:51:46 +01:00			`[[ $py == convert_hf_to_gguf_update.py ]] && continue`
llama : fix BPE pre-tokenization (#6920) * merged the changes from deepseeker models to main branch * Moved regex patterns to unicode.cpp and updated unicode.h * Moved header files * Resolved issues * added and refactored unicode_regex_split and related functions * Updated/merged the deepseek coder pr * Refactored code * Adding unicode regex mappings * Adding unicode regex function * Added needed functionality, testing remains * Fixed issues * Fixed issue with gpt2 regex custom preprocessor * unicode : fix? unicode_wstring_to_utf8 * lint : fix whitespaces * tests : add tokenizer tests for numbers * unicode : remove redundant headers * tests : remove and rename tokenizer test scripts * tests : add sample usage * gguf-py : reader prints warnings on duplicate keys * llama : towards llama3 tokenization support (wip) * unicode : shot in the dark to fix tests on Windows * unicode : first try custom implementations * convert : add "tokenizer.ggml.pre" GGUF KV (wip) * llama : use new pre-tokenizer type * convert : fix pre-tokenizer type writing * lint : fix * make : add test-tokenizer-0-llama-v3 * wip * models : add llama v3 vocab file * llama : adapt punctuation regex + add llama 3 regex * minor * unicode : set bomb * unicode : set bomb * unicode : always use std::wregex * unicode : support \p{N}, \p{L} and \p{P} natively * unicode : try fix windows * unicode : category support via std::regex * unicode : clean-up * unicode : simplify * convert : add convert-hf-to-gguf-update.py ggml-ci * lint : update * convert : add falcon ggml-ci * unicode : normalize signatures * lint : fix * lint : fix * convert : remove unused functions * convert : add comments * convert : exercise contractions ggml-ci * lint : fix * cmake : refactor test targets * tests : refactor vocab tests ggml-ci * tests : add more vocabs and tests ggml-ci * unicode : cleanup * scripts : ignore new update script in check-requirements.sh * models : add phi-3, mpt, gpt-2, starcoder * tests : disable obsolete 
ggml-ci * tests : use faster bpe test ggml-ci * llama : more prominent warning for old BPE models * tests : disable test-tokenizer-1-bpe due to slowness ggml-ci --------- Co-authored-by: Jaggzh <jaggz.h@gmail.com> Co-authored-by: Kazim Abrar Mahi <kazimabrarmahi135@gmail.com> 2024-04-29 15:58:41 +02:00
python : add check-requirements.sh and GitHub workflow (#4585) * python: add check-requirements.sh and GitHub workflow This script and workflow forces package versions to remain compatible across all convert.py scripts, while allowing secondary convert scripts to import dependencies not wanted in convert.py. Move requirements into ./requirements * Fail on "==" being used for package requirements (but can be suppressed) * Enforce "compatible release" syntax instead of == * Update workflow * Add upper version bound for transformers and protobuf * improve check-requirements.sh * small syntax change * don't remove venvs if nocleanup is passed * See if this fixes docker workflow * Move check-requirements.sh into ./scripts/ --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai> 2023-12-29 15:50:29 +01:00			`check_convert_script "$py"`
			`done`

			`info 'Done! No issues found.'`