diff --git a/.devops/lamma-cpp-clblast.srpm.spec b/.devops/lamma-cpp-clblast.srpm.spec
new file mode 100644
index 000000000..739c68281
--- /dev/null
+++ b/.devops/lamma-cpp-clblast.srpm.spec
@@ -0,0 +1,58 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
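+#
+# Example build flow (a sketch; assumes rpm-build is installed and the default
+# ~/rpmbuild tree exists, e.g. created with rpmdev-setuptree):
+#   wget -O ~/rpmbuild/SOURCES/master.tar.gz https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+#   rpmbuild -ba .devops/lamma-cpp-clblast.srpm.spec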
+
+Name: llama.cpp-clblast
+Version: master
+Release: 1%{?dist}
+Summary: OpenCL Inference of LLaMA model in pure C/C++
+License: MIT
+Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel
+URL: https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+OpenCL (CLBlast) inference of Meta's LLaMA 2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CLBLAST=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacppclblast
+cp -p server %{buildroot}%{_bindir}/llamacppclblastserver
+cp -p simple %{buildroot}%{_bindir}/llamacppclblastsimple
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacppclblast
+%{_bindir}/llamacppclblastserver
+%{_bindir}/llamacppclblastsimple
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
diff --git a/.devops/lamma-cpp-cublas.srpm.spec b/.devops/lamma-cpp-cublas.srpm.spec
new file mode 100644
index 000000000..75d32fbe7
--- /dev/null
+++ b/.devops/lamma-cpp-cublas.srpm.spec
@@ -0,0 +1,59 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
+
+Name: llama.cpp-cublas
+Version: master
+Release: 1%{?dist}
+Summary: CUDA Inference of LLaMA model in C/C++ (cuBLAS)
+License: MIT
+Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git cuda-toolkit
+Requires: cuda-toolkit
+URL: https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CUDA (cuBLAS) inference of Meta's LLaMA 2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CUBLAS=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacppcublas
+%{_bindir}/llamacppcublasserver
+%{_bindir}/llamacppcublassimple
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec
new file mode 100644
index 000000000..c65251a5a
--- /dev/null
+++ b/.devops/llama-cpp.srpm.spec
@@ -0,0 +1,58 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
+
+Name: llama.cpp
+Version: master
+Release: 1%{?dist}
+Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License: MIT
+Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git
+URL: https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference of Meta's LLaMA 2 models using default options.
+
+%prep
+%autosetup
+
+%build
+make -j
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacpp
+cp -p server %{buildroot}%{_bindir}/llamacppserver
+cp -p simple %{buildroot}%{_bindir}/llamacppsimple
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacpp
+%{_bindir}/llamacppserver
+%{_bindir}/llamacppsimple
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
diff --git a/.gitignore b/.gitignore
index 9c749f1ef..f3121794a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,10 @@
*.o
*.a
*.so
+*.gguf
*.bin
+*.exe
+*.dll
.DS_Store
.build/
.cache/
@@ -47,6 +50,8 @@ models-mnt
/server
/Pipfile
/embd-input-test
+/gguf
+/gguf-llama-simple
/libllama.so
/llama-bench
build-info.h
@@ -65,7 +70,6 @@ perf-*.txt
examples/jeopardy/results.txt
-
pyproject.toml
poetry.lock
poetry.toml
@@ -79,4 +83,3 @@ tests/test-quantize-fns
tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2cf..bb63ef98e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -497,9 +497,11 @@ else()
endif()
#
-# Build libraries
+# libraries
#
+# ggml
+
add_library(ggml OBJECT
ggml.c
ggml.h
@@ -524,10 +526,11 @@ if (BUILD_SHARED_LIBS)
install(TARGETS ggml_shared LIBRARY)
endif()
+# llama
+
add_library(llama
llama.cpp
llama.h
- llama-util.h
)
target_include_directories(llama PUBLIC .)
@@ -546,6 +549,10 @@ if (BUILD_SHARED_LIBS)
install(TARGETS llama LIBRARY)
endif()
+#
+# install
+#
+
include(GNUInstallDirs)
install(
FILES convert.py
@@ -584,6 +591,8 @@ endif()
# programs, examples and tests
#
+add_subdirectory(common)
+
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
diff --git a/Makefile b/Makefile
index 502781c69..d31acc450 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test llama-bench
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
# Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -45,8 +45,8 @@ OPT = -Ofast
else
OPT = -O3
endif
-CFLAGS = -I. $(OPT) -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
+CFLAGS = -I. $(OPT) -std=c11 -fPIC
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
LDFLAGS =
ifdef LLAMA_DEBUG
@@ -329,23 +329,23 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
OBJS += ggml-alloc.o
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-common.o: examples/common.cpp examples/common.h
+common.o: common/common.cpp common/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-console.o: examples/console.cpp examples/console.h
+console.o: common/console.cpp common/console.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
- rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test llama-bench build-info.h $(TEST_TARGETS)
+ rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)
#
# Examples
@@ -385,7 +385,10 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
@@ -418,7 +421,7 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
-tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
diff --git a/README.md b/README.md
index 9f8512dc5..f746c49eb 100644
--- a/README.md
+++ b/README.md
@@ -9,11 +9,17 @@
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
-### 🚧 Incoming breaking change + refactoring:
+### Hot topics
-See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
-To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
+
+### Current `master` should be considered in Beta - expect some issues for a few days!
+
+### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
+
+### Issues with non-GGUF models will be considered with low priority!
----
@@ -33,6 +39,7 @@ To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
Memory/Disk Requirements
Quantization
Interactive mode
+ Constrained output with grammars
Instruction mode with Alpaca
Using OpenLLaMA
Using GPT4All
@@ -291,7 +298,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
Any value larger than 0 will offload the computation to the GPU. For example:
```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
```
### MPI Build
@@ -330,7 +337,7 @@ The above will distribute the computation across 2 processes on the first host a
Finally, you're ready to run a computation using `mpirun`:
```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
### BLAS Build
@@ -513,10 +520,10 @@ python3 convert.py models/7B/
python convert.py models/7B/ --vocabtype bpe
# quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
# run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -572,7 +579,7 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh
# custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -598,6 +605,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```
+### Constrained output with grammars
+
+`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+
+```bash
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
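+
+As a minimal sketch (not one of the bundled grammars), a grammar that restricts the model to answering "yes" or "no" would look like:
+
+```
+root ::= "yes" | "no"
+```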
+
### Instruction mode with Alpaca
1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -635,6 +652,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
+*Note: these instructions are likely obsoleted by the GGUF update*
+
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
- Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -710,7 +729,7 @@ If your issue is with model generation quality, then please at least scan the fo
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
@@ -809,13 +828,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
On completion, you are ready to play!
```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a light image:
```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
### Docker With CUDA
@@ -846,8 +865,8 @@ The resulting images, are essentially the same as the non-CUDA images:
After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
```
### Contributing
@@ -877,3 +896,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GBNF grammars](./grammars/README.md)
diff --git a/ci/run.sh b/ci/run.sh
old mode 100644
new mode 100755
index 8dc394964..54ba6d710
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
python3 ../convert.py ${path_models}
- model_f16="${path_models}/ggml-model-f16.bin"
- model_q8_0="${path_models}/ggml-model-q8_0.bin"
- model_q4_0="${path_models}/ggml-model-q4_0.bin"
- model_q4_1="${path_models}/ggml-model-q4_1.bin"
- model_q5_0="${path_models}/ggml-model-q5_0.bin"
- model_q5_1="${path_models}/ggml-model-q5_1.bin"
- model_q2_k="${path_models}/ggml-model-q2_k.bin"
- model_q3_k="${path_models}/ggml-model-q3_k.bin"
- model_q4_k="${path_models}/ggml-model-q4_k.bin"
- model_q5_k="${path_models}/ggml-model-q5_k.bin"
- model_q6_k="${path_models}/ggml-model-q6_k.bin"
+ model_f16="${path_models}/ggml-model-f16.gguf"
+ model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+ model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+ model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+ model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+ model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+ model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+ model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+ model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+ model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+ model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
@@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {
python3 ../convert.py ${path_models}
- model_f16="${path_models}/ggml-model-f16.bin"
- model_q8_0="${path_models}/ggml-model-q8_0.bin"
- model_q4_0="${path_models}/ggml-model-q4_0.bin"
- model_q4_1="${path_models}/ggml-model-q4_1.bin"
- model_q5_0="${path_models}/ggml-model-q5_0.bin"
- model_q5_1="${path_models}/ggml-model-q5_1.bin"
- model_q2_k="${path_models}/ggml-model-q2_k.bin"
- model_q3_k="${path_models}/ggml-model-q3_k.bin"
- model_q4_k="${path_models}/ggml-model-q4_k.bin"
- model_q5_k="${path_models}/ggml-model-q5_k.bin"
- model_q6_k="${path_models}/ggml-model-q6_k.bin"
+ model_f16="${path_models}/ggml-model-f16.gguf"
+ model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+ model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+ model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+ model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+ model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+ model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+ model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+ model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+ model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+ model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
new file mode 100644
index 000000000..dead56118
--- /dev/null
+++ b/common/CMakeLists.txt
@@ -0,0 +1,20 @@
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+ common.h
+ common.cpp
+ console.h
+ console.cpp
+ grammar-parser.h
+ grammar-parser.cpp
+ )
+
+if (BUILD_SHARED_LIBS)
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
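+
+# Example programs are expected to link against this target together with llama,
+# roughly as follows (a sketch; the actual per-example CMakeLists live under examples/):
+#   target_link_libraries(main PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})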
diff --git a/examples/common.cpp b/common/common.cpp
similarity index 90%
rename from examples/common.cpp
rename to common/common.cpp
index bd39d9220..88a962ae3 100644
--- a/examples/common.cpp
+++ b/common/common.cpp
@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_ctx = std::stoi(argv[i]);
- } else if (arg == "-gqa" || arg == "--gqa") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_gqa = std::stoi(argv[i]);
- } else if (arg == "-eps" || arg == "--rms-norm-eps") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rms_norm_eps = std::stof(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
@@ -301,7 +289,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_batch = std::stoi(argv[i]);
- params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
@@ -400,11 +387,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
- } else if (arg == "--mul-mat-q" || arg == "-mmq") {
+ } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
- params.mul_mat_q = true;
+ params.mul_mat_q = false;
#else
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
@@ -430,6 +417,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.antiprompt.push_back(argv[i]);
} else if (arg == "--perplexity") {
params.perplexity = true;
+ } else if (arg == "--ppl-stride") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.ppl_stride = std::stoi(argv[i]);
+ } else if (arg == "--ppl-output-type") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.ppl_output_type = std::stoi(argv[i]);
} else if (arg == "--hellaswag") {
params.hellaswag = true;
} else if (arg == "--hellaswag-tasks") {
@@ -439,7 +438,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
params.hellaswag_tasks = std::stoi(argv[i]);
} else if (arg == "--ignore-eos") {
- params.logit_bias[llama_token_eos()] = -INFINITY;
+ params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") {
params.penalize_nl = false;
} else if (arg == "-l" || arg == "--logit-bias") {
@@ -561,8 +560,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
- fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
- fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -614,11 +611,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
- fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
- fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
- fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
- fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
- fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+ fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
+ fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+ fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+ fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+ fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
#endif
fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
@@ -650,24 +647,15 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
return "The";
}
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
- // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
- std::vector<llama_token> res(text.size() + (int) add_bos);
- const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
- assert(n >= 0);
- res.resize(n);
-
- return res;
-}
+//
+// Model utils
+//
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
- lparams.n_gqa = params.n_gqa;
- lparams.rms_norm_eps = params.rms_norm_eps;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu;
lparams.tensor_split = params.tensor_split;
@@ -685,7 +673,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return lparams;
}
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto lparams = llama_context_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
@@ -714,5 +702,77 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
}
+ if (params.ignore_eos) {
+ params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+ }
+
return std::make_tuple(model, lctx);
}
+
+//
+// Vocab utils
+//
+
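+// The helpers below follow the two-pass convention of the C API: when the
+// provided buffer is too small, the call returns the negated number of
+// tokens (or bytes) required, so we resize the buffer and call again.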
+std::vector<llama_token> llama_tokenize(
+ struct llama_context * ctx,
+ const std::string & text,
+ bool add_bos) {
+ // upper limit for the number of tokens
+ int n_tokens = text.length() + add_bos;
+ std::vector<llama_token> result(n_tokens);
+ n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+ GGML_ASSERT(check == -n_tokens);
+ } else {
+ result.resize(n_tokens);
+ }
+ return result;
+}
+
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ int check = llama_token_to_str(ctx, token, result.data(), result.size());
+ GGML_ASSERT(check == -n_tokens);
+ } else {
+ result.resize(n_tokens);
+ }
+
+ return std::string(result.data(), result.size());
+}
+
+std::vector<llama_token> llama_tokenize_bpe(
+ struct llama_context * ctx,
+ const std::string & text,
+ bool add_bos) {
+ int n_tokens = text.length() + add_bos;
+ std::vector<llama_token> result(n_tokens);
+ n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+ GGML_ASSERT(check == -n_tokens);
+ } else {
+ result.resize(n_tokens);
+ }
+ return result;
+}
+
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+ GGML_ASSERT(check == -n_tokens);
+ } else {
+ result.resize(n_tokens);
+ }
+
+ return std::string(result.data(), result.size());
+}
+
diff --git a/examples/common.h b/common/common.h
similarity index 81%
rename from examples/common.h
rename to common/common.h
index 375bc0a3d..d68a8ef88 100644
--- a/examples/common.h
+++ b/common/common.h
@@ -22,19 +22,16 @@ struct gpt_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
- float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
// sampling parameters
- std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
@@ -48,12 +45,14 @@ struct gpt_params {
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
+ std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
- std::string model = "models/7B/ggml-model.bin"; // model path
+ std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
@@ -65,11 +64,15 @@ struct gpt_params {
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
+ int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ // (which is more convenient to use for plotting)
+ //
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
- bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
+ bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
@@ -83,6 +86,7 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+ bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
@@ -100,15 +104,31 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
//
// Model utils
//
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+ struct llama_context * ctx,
+ const std::string & text,
+ bool add_bos);
+
+std::vector<llama_token> llama_tokenize_bpe(
+ struct llama_context * ctx,
+ const std::string & text,
+ bool add_bos);
+
+std::string llama_token_to_str(
+ const struct llama_context * ctx,
+ llama_token token);
+
+std::string llama_token_to_str_bpe(
+ const struct llama_context * ctx,
+ llama_token token);
diff --git a/examples/console.cpp b/common/console.cpp
similarity index 100%
rename from examples/console.cpp
rename to common/console.cpp
diff --git a/examples/console.h b/common/console.h
similarity index 100%
rename from examples/console.h
rename to common/console.h
diff --git a/examples/grammar-parser.cpp b/common/grammar-parser.cpp
similarity index 100%
rename from examples/grammar-parser.cpp
rename to common/grammar-parser.cpp
diff --git a/examples/grammar-parser.h b/common/grammar-parser.h
similarity index 100%
rename from examples/grammar-parser.h
rename to common/grammar-parser.h
diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
new file mode 100755
index 000000000..50069db56
--- /dev/null
+++ b/convert-falcon-hf-to-gguf.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+# HF falcon --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+def bytes_to_unicode():
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8+n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+ num_parts = 0
+ for filename in os.listdir(dir_model):
+ if filename.startswith("pytorch_model-"):
+ num_parts += 1
+
+ if num_parts > 0:
+ print("gguf: found " + str(num_parts) + " model parts")
+ return num_parts
+
+
+if len(sys.argv) < 3:
+ print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+ print(" ftype == 0 -> float32")
+ print(" ftype == 1 -> float16")
+ sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+ ftype = int(sys.argv[2])
+ if ftype < 0 or ftype > 1:
+ print("Invalid ftype: " + str(ftype))
+
+ sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+
+if hparams["architectures"][0] != "RWForCausalLM":
+ print("Model architecture not supported: " + hparams["architectures"][0])
+
+ sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.FALCON
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+ # gpt2 tokenizer
+ gguf_writer.add_tokenizer_model("gpt2")
+
+ print("gguf: get gpt2 tokenizer merges")
+
+ with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+ tokenizer_json = json.load(f)
+ merges = tokenizer_json["model"]["merges"]
+
+ gguf_writer.add_token_merges(merges)
+
+ print("gguf: get gpt2 tokenizer vocab")
+
+ vocab_size = len(tokenizer_json["model"]["vocab"])
+
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+ byte_encoder = bytes_to_unicode()
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+ for i in range(vocab_size):
+ if i in reverse_vocab:
+ try:
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+ except KeyError:
+ text = bytearray()
+ for c in reverse_vocab[i]:
+ if ord(c) < 256: # single byte character
+ text.append(byte_decoder[ord(c)])
+ else: # multibyte special token character
+ text.extend(c.encode('utf-8'))
+ else:
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+ pad_token = f"[PAD{i}]".encode("utf8")
+ text = bytearray(pad_token)
+
+ tokens.append(text)
+
+ gguf_writer.add_token_list(tokens)
+
+ if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+ print("gguf: get special token ids")
+
+ with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+ tokenizer_config = json.load(f)
+
+ # find special token ids
+
+ if "bos_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["bos_token"]:
+ gguf_writer.add_bos_token_id(key["id"])
+
+ if "eos_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["eos_token"]:
+ gguf_writer.add_eos_token_id(key["id"])
+
+ if "unk_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["unk_token"]:
+ gguf_writer.add_unk_token_id(key["id"])
+
+ if "sep_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["sep_token"]:
+ gguf_writer.add_sep_token_id(key["id"])
+
+ if "pad_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["pad_token"]:
+ gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# params for qkv transform
+n_head = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+head_dim = hparams["hidden_size"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+ part_names = ("pytorch_model.bin",)
+else:
+ part_names = (
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+ )
+
+for part_name in part_names:
+ print("gguf: loading model part '" + part_name + "'")
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+ for name in model_part.keys():
+ data = model_part[name]
+
+ old_dtype = data.dtype
+
+ # convert any unsupported data types to float32
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
+ data = data.to(torch.float32)
+
+ # QKV tensor transform
+ # The original query_key_value tensor contains n_head_kv "kv groups",
+ # each consisting of n_head/n_head_kv query weights followed by one key
+ # and one value weight (shared by all query heads in the kv group).
+ # This layout makes it a big pain to work with in GGML.
+ # So we rearrange them here, so that we have n_head query weights
+ # followed by n_head_kv key weights followed by n_head_kv value weights,
+ # in contiguous fashion.
+ # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+ if "query_key_value" in name:
+ qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+ q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+ k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+ v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+ data = torch.cat((q,k,v)).reshape_as(data)
+
+ data = data.squeeze().numpy()
+
+ # map tensor names
+ if name.endswith(".weight") and name[:-7] in tensor_map:
+ name = tensor_map[name[:-7]] + ".weight"
+ elif name.endswith(".bias") and name[:-5] in tensor_map:
+ name = tensor_map[name[:-5]] + ".bias"
+ else:
+ print("Can not map tensor '" + name + "'")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+ gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py
new file mode 100755
index 000000000..6eeff5bb1
--- /dev/null
+++ b/convert-gptneox-hf-to-gguf.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# HF gptneox --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8+n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+ num_parts = 0
+ for filename in os.listdir(dir_model):
+ if filename.startswith("pytorch_model-"):
+ num_parts += 1
+
+ if num_parts > 0:
+ print("gguf: found " + str(num_parts) + " model parts")
+ return num_parts
+
+
+if len(sys.argv) < 3:
+ print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+ print(" ftype == 0 -> float32")
+ print(" ftype == 1 -> float16")
+ sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+ ftype = int(sys.argv[2])
+ if ftype < 0 or ftype > 1:
+ print("Invalid ftype: " + str(ftype))
+
+ sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTNeoXForCausalLM":
+ print("Model architecture not supported: " + hparams["architectures"][0])
+
+ sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.GPTNEOX
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+ # gpt2 tokenizer
+ gguf_writer.add_tokenizer_model("gpt2")
+
+ print("gguf: get gpt2 tokenizer merges")
+
+ with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+ tokenizer_json = json.load(f)
+ merges = tokenizer_json["model"]["merges"]
+
+ gguf_writer.add_token_merges(merges)
+
+ print("gguf: get gpt2 tokenizer vocab")
+
+ vocab_size = len(tokenizer_json["model"]["vocab"])
+
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+ byte_encoder = bytes_to_unicode()
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+ for i in range(vocab_size):
+ if i in reverse_vocab:
+ try:
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+ except KeyError:
+ text = bytearray()
+ for c in reverse_vocab[i]:
+ if ord(c) < 256: # single byte character
+ text.append(byte_decoder[ord(c)])
+ else: # multibyte special token character
+ text.extend(c.encode('utf-8'))
+ else:
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+ pad_token = f"[PAD{i}]".encode("utf8")
+ text = bytearray(pad_token)
+
+ tokens.append(text)
+
+ gguf_writer.add_token_list(tokens)
+
+ if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+ print("gguf: get special token ids")
+
+ with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+ tokenizer_config = json.load(f)
+
+ # find special token ids
+
+ if "bos_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["bos_token"]:
+ gguf_writer.add_bos_token_id(key["id"])
+
+ if "eos_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["eos_token"]:
+ gguf_writer.add_eos_token_id(key["id"])
+
+ if "unk_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["unk_token"]:
+ gguf_writer.add_unk_token_id(key["id"])
+
+ if "sep_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["sep_token"]:
+ gguf_writer.add_sep_token_id(key["id"])
+
+ if "pad_token" in tokenizer_config:
+ for key in tokenizer_json["added_tokens"]:
+ if key["content"] == tokenizer_config["pad_token"]:
+ gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+ part_names = ("pytorch_model.bin",)
+else:
+ part_names = (
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+ )
+
+for part_name in part_names:
+ print("gguf: loading model part '" + part_name + "'")
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+ for name in model_part.keys():
+ data = model_part[name]
+
+ # we don't need these
+ if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+ continue
+
+ old_dtype = data.dtype
+
+ # convert any unsupported data types to float32
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
+ data = data.to(torch.float32)
+
+ data = data.squeeze().numpy()
+
+ # map tensor names
+ if name.endswith(".weight") and name[:-7] in tensor_map:
+ name = tensor_map[name[:-7]] + ".weight"
+ elif name.endswith(".bias") and name[:-5] in tensor_map:
+ name = tensor_map[name[:-5]] + ".bias"
+ else:
+ print("Can not map tensor '" + name + "'")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+ gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
new file mode 100755
index 000000000..f103f5f61
--- /dev/null
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+# 7b pth llama --> gguf conversion
+# Only models with a single datafile are supported, like 7B
+# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+def count_model_parts(dir_model: str) -> int:
+ num_parts = 0
+ for filename in os.listdir(dir_model):
+ if filename.startswith("consolidated."):
+ num_parts += 1
+
+ if num_parts > 0:
+ print("gguf: found " + str(num_parts) + " model parts")
+ return num_parts
+
+
+if len(sys.argv) < 3:
+ print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+ print(" ftype == 0 -> float32")
+ print(" ftype == 1 -> float16")
+
+ sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+ ftype = int(sys.argv[2])
+ if ftype < 0 or ftype > 1:
+ print("Invalid ftype: " + str(ftype))
+
+ sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+ print("Model architecture not supported: " + hparams["architectures"][0])
+ sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+if num_parts > 1:
+ print("gguf: Only models with a single datafile are supported.")
+
+ sys.exit()
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+ head_count_kv = hparams["num_key_value_heads"]
+else:
+ head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+ hf_repo = hparams["_name_or_path"]
+else:
+ hf_repo = ""
+
+if "max_sequence_length" in hparams:
+ ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+ ctx_length = hparams["max_position_embeddings"]
+else:
+ print("gguf: can not find ctx length parameter.")
+
+ sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+ if "type" in hparams["rope_scaling"]:
+ if hparams["rope_scaling"]["type"] == "linear":
+ gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+ # vocab type sentencepiece
+ print("gguf: get sentencepiece tokenizer vocab and scores")
+
+ tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+ for i in range(tokenizer.vocab_size()):
+ text: bytes
+ score: float
+
+ piece = tokenizer.id_to_piece(i)
+ text = piece.encode("utf-8")
+ score = tokenizer.get_score(i)
+
+ toktype = 1 # default to normal token type
+ if tokenizer.is_unknown(i):
+ toktype = 2
+ if tokenizer.is_control(i):
+ toktype = 3
+
+ # toktype = 4 is user-defined = tokens from added_tokens.json
+
+ if tokenizer.is_unused(i):
+ toktype = 5
+ if tokenizer.is_byte(i):
+ toktype = 6
+
+ tokens.append(text)
+ scores.append(score)
+ toktypes.append(toktype)
+
+ if Path(dir_model + "/added_tokens.json").is_file():
+ with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+ addtokens_json = json.load(f)
+
+ print("gguf: get added tokens")
+
+ for key in addtokens_json:
+ tokens.append( key.encode("utf-8") )
+ scores.append(-1000.0)
+ toktypes.append(4) # user-defined token type
+
+ gguf_writer.add_tokenizer_model("llama")
+ gguf_writer.add_token_list(tokens)
+ gguf_writer.add_token_scores(scores)
+ gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+ # Look for special tokens in tokenizer.json if it exists
+
+ with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+ tokenizer = json.load(f)
+
+ if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+ with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+ tokenizer_config = json.load(f)
+
+ if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["bos_token"]["content"]:
+ gguf_writer.add_bos_token_id(key["id"])
+
+ if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["eos_token"]["content"]:
+ gguf_writer.add_eos_token_id(key["id"])
+
+ if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["unk_token"]["content"]:
+ gguf_writer.add_unk_token_id(key["id"])
+
+ if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["sep_token"]["content"]:
+ gguf_writer.add_sep_token_id(key["id"])
+
+ if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["pad_token"]["content"]:
+ gguf_writer.add_pad_token_id(key["id"])
+else:
+ # If no tokenizer.json: Look for special tokens in config.json
+
+ if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+ gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+ if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+ gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+ if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+ gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+ if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+ gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+ if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+ gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
+
+for part_name in part_names:
+ print("gguf: loading model part '" + part_name + "'")
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+ for name in model_part.keys():
+ data = model_part[name]
+
+ # we don't need these
+ if name == "rope.freqs":
+ continue
+
+ old_dtype = data.dtype
+
+ # convert any unsupported data types to float32
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
+ data = data.to(torch.float32)
+
+ data = data.squeeze().numpy()
+
+ # map tensor names
+ if name.endswith(".weight") and name[:-7] in tensor_map:
+ name = tensor_map[name[:-7]] + ".weight"
+ elif name.endswith(".bias") and name[:-5] in tensor_map:
+ name = tensor_map[name[:-5]] + ".bias"
+ else:
+ print("Can not map tensor '" + name + "'")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+ gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py
new file mode 100755
index 000000000..3bf93627d
--- /dev/null
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+import sys, struct, math, argparse
+from pathlib import Path
+
+import numpy as np
+
+import gguf
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+ gguf.GGMLQuantizationType.F32 : (1, 4),
+ gguf.GGMLQuantizationType.F16 : (1, 2),
+ gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+ gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+ gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+ gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+ gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+ gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+ gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+ gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+ gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+ gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+ gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+ gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
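+# Each entry above is (weights per block, bytes per block). For example, Q4_0
+# packs 32 weights into a 2-byte scale plus 16 bytes of 4-bit values (18 bytes
+# total), so a tensor's size in bytes is n_elements * type_size // block_size
+# (see Tensor.load below).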
+
+class Hyperparameters:
+ def __init__(self):
+ self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
+ self.n_ff = 0
+
+ def set_n_ff(self, model):
+ ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+ assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+ ff_tensor = model.tensors[ff_tensor_idx]
+ self.n_ff = ff_tensor.dims[1]
+
+ def load(self, data, offset):
+ (
+ self.n_vocab,
+ self.n_embd,
+ self.n_mult,
+ self.n_head,
+ self.n_layer,
+ self.n_rot,
+ self.ftype,
+ ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+ return 4 * 7
+
+ def __str__(self):
+ return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+
+class Vocab:
+ def __init__(self):
+ self.items = []
+
+ def load(self, data, offset, n_vocab):
+ orig_offset = offset
+ for _ in range(n_vocab):
+ itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+ assert itemlen < 4096, 'Absurd vocab item length'
+ offset += 4
+ item_text = bytes(data[offset:offset + itemlen])
+ offset += itemlen
+ item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+ offset += 4
+ self.items.append((item_text, item_score))
+ return offset - orig_offset
+
+class Tensor:
+ def __init__(self):
+ self.name = None
+ self.dims = ()
+ self.dtype = None
+ self.start_offset = 0
+ self.len_bytes = np.int64(0)
+
+ def load(self, data, offset):
+ orig_offset = offset
+ (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+ assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+ assert name_len < 4096, 'Absurd tensor name length'
+ quant = GGML_QUANT_SIZES.get(dtype)
+ assert quant is not None, 'Unknown tensor type'
+ (blksize, tysize) = quant
+ offset += 12
+ self.dtype= dtype
+ self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+ offset += 4 * n_dims
+ self.name = bytes(data[offset:offset + name_len])
+ offset += name_len
+ pad = ((offset + 31) & ~31) - offset
+ offset += pad
+ n_elems = np.prod(self.dims)
+ n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
+ self.start_offset = offset
+ self.len_bytes = n_bytes
+ offset += n_bytes
+ # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+ return offset - orig_offset
+
+class GGMLV3Model:
+ def __init__(self):
+ self.hyperparameters = None
+ self.vocab = None
+ self.tensor_map = {}
+ self.tensors = []
+
+ def validate_header(self, data, offset):
+ if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack(' 0:
+ gguf_writer.add_token_types(toktypes)
+ return
+ print(f'* Adding {hp.n_vocab} vocab item(s)')
+ assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
+ for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+ tt = 1 # Normal
+ # Special handling for UNK, BOS, EOS tokens.
+ if tokid <= 2:
+ if tokid == 0:
+ vbytes = b''
+ tt = 2
+ elif tokid == 1:
+ vbytes = b''
+ tt = 3
+ else:
+ vbytes = b''
+ tt = 3
+ elif len(vbytes) == 0:
+ tt = 3 # Control
+ elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+ vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
+ tt = 6 # Byte
+ else:
+ vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+ toktypes.append(tt)
+ tokens.append(vbytes)
+ scores.append(vscore)
+ gguf_writer.add_token_list(tokens)
+ gguf_writer.add_token_scores(scores)
+ gguf_writer.add_token_types(toktypes)
+ gguf_writer.add_unk_token_id(0)
+ gguf_writer.add_bos_token_id(1)
+ gguf_writer.add_eos_token_id(2)
+
+ def add_tensors(self, gguf_writer):
+ nm = self.name_map
+ data = self.data
+ print(f'* Adding {len(self.model.tensors)} tensor(s)')
+ for tensor in self.model.tensors:
+ name = str(tensor.name, 'UTF-8')
+ if name.endswith('.weight'):
+ name = name[:-7]
+ suffix = '.weight'
+ elif name.endswith('.bias'):
+ name = name[:-5]
+ suffix = '.bias'
+ mapped_name = nm.get(name)
+ assert mapped_name is not None, f'Bad name {name}'
+ mapped_name += suffix
+ tempdims = list(tensor.dims[:])
+ if len(tempdims) > 1:
+ temp = tempdims[1]
+ tempdims[1] = tempdims[0]
+ tempdims[0] = temp
+ # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+ gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+
+def handle_metadata(cfg, hp):
+ import convert
+ assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+ hf_config_path = cfg.model_metadata_dir / "config.json"
+ orig_config_path = cfg.model_metadata_dir / "params.json"
+ # We pass a fake model here. "original" mode will check the shapes of some
+ # tensors if information is missing in the .json file: other than that, the
+ # model data isn't used so this should be safe (at least for now).
+ fakemodel = {
+ 'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+ 'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+ }
+ fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+ fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+ if hf_config_path.exists():
+ params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+ elif orig_config_path.exists():
+ params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+ else:
+ raise ValueError('Unable to load metadata')
+ vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+ convert.check_vocab_size(params, vocab)
+ return (params, vocab)
+
+def handle_args():
+ parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+ parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
+ parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
+ parser.add_argument('--name', help = 'Set model name')
+ parser.add_argument('--desc', help = 'Set model description')
+ parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+ parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+ parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+ parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+ parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+ parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+ return parser.parse_args()
+
+def main():
+ cfg = handle_args()
+ print(f'* Using config: {cfg}')
+ print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+ data = np.memmap(cfg.input, mode = 'r')
+ model = GGMLV3Model()
+ print('* Scanning GGML input file')
+ offset = model.load(data, 0)
+ print(f'* GGML model hyperparameters: {model.hyperparameters}')
+ vocab_override = None
+ params_override = None
+ if cfg.model_metadata_dir is not None:
+ (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+ print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+ print(f'* Overriding params: {params_override}')
+ print(f'* Overriding vocab: {vocab_override}')
+ else:
+ print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+ converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+ converter.save()
+ print(f'* Successful completion. Output saved to: {cfg.output}')
+
+if __name__ == '__main__':
+ main()
diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py
new file mode 100755
index 000000000..08fde238b
--- /dev/null
+++ b/convert-llama-hf-to-gguf.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+# HF llama --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List, Optional
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+ if n_kv_head is not None and n_head != n_kv_head:
+ n_head //= n_kv_head
+
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+ .swapaxes(1, 2)
+ .reshape(weights.shape))
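+# Illustrative example: for LLaMA-7B, wq has shape (4096, 4096) and n_head = 32.
+# The rows are viewed as (32, 2, 64, 4096), the middle two axes are swapped and
+# the result flattened back to (4096, 4096), undoing the interleave that
+# convert_llama_weights_to_hf.py applies for HF's rotary embedding layout.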
+
+
+def count_model_parts(dir_model: str) -> int:
+ num_parts = 0
+
+ for filename in os.listdir(dir_model):
+ if filename.startswith("pytorch_model-"):
+ num_parts += 1
+
+ if num_parts > 0:
+ print("gguf: found " + str(num_parts) + " model parts")
+
+ return num_parts
+
+
+if len(sys.argv) < 3:
+ print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+ print(" ftype == 0 -> float32")
+ print(" ftype == 1 -> float16")
+
+ sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+# ftype == 0 -> float32
+# ftype == 1 -> float16
+
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+ ftype = int(sys.argv[2])
+ if ftype < 0 or ftype > 1:
+ print("Invalid ftype: " + str(ftype))
+
+ sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+ print("Model architecture not supported: " + hparams["architectures"][0])
+
+ sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+ head_count_kv = hparams["num_key_value_heads"]
+else:
+ head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+ hf_repo = hparams["_name_or_path"]
+else:
+ hf_repo = ""
+
+if "max_sequence_length" in hparams:
+ ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+ ctx_length = hparams["max_position_embeddings"]
+else:
+ print("gguf: can not find ctx length parameter.")
+
+ sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+ if "type" in hparams["rope_scaling"]:
+ if hparams["rope_scaling"]["type"] == "linear":
+ gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
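+# Only linear RoPE scaling is recorded here; e.g. a config.json containing
+# "rope_scaling": {"type": "linear", "factor": 2.0} (illustrative) results in
+# add_rope_scale_linear(2.0), while other scaling types are silently skipped.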
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+ # vocab type sentencepiece
+ print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+ tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+ for i in range(tokenizer.vocab_size()):
+ text: bytes
+ score: float
+
+ piece = tokenizer.id_to_piece(i)
+ text = piece.encode("utf-8")
+ score = tokenizer.get_score(i)
+
+ toktype = 1 # default to normal token type
+ if tokenizer.is_unknown(i):
+ toktype = 2
+ if tokenizer.is_control(i):
+ toktype = 3
+
+ # toktype = 4 is user-defined = tokens from added_tokens.json
+
+ if tokenizer.is_unused(i):
+ toktype = 5
+ if tokenizer.is_byte(i):
+ toktype = 6
+
+ tokens.append(text)
+ scores.append(score)
+ toktypes.append(toktype)
+
+ if Path(dir_model + "/added_tokens.json").is_file():
+ with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+ addtokens_json = json.load(f)
+
+ print("gguf: get added tokens")
+
+ for key in addtokens_json:
+ tokens.append( key.encode("utf-8") )
+ scores.append(-1000.0)
+ toktypes.append(4) # user-defined token type
+
+
+ gguf_writer.add_tokenizer_model("llama")
+ gguf_writer.add_token_list(tokens)
+ gguf_writer.add_token_scores(scores)
+ gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+ # Look for special tokens in tokenizer.json if it exists
+
+ with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+ tokenizer = json.load(f)
+
+ if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+ with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+ tokenizer_config = json.load(f)
+
+ if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["bos_token"]["content"]:
+ gguf_writer.add_bos_token_id(key["id"])
+
+ if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["eos_token"]["content"]:
+ gguf_writer.add_eos_token_id(key["id"])
+
+ if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["unk_token"]["content"]:
+ gguf_writer.add_unk_token_id(key["id"])
+
+ if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["sep_token"]["content"]:
+ gguf_writer.add_sep_token_id(key["id"])
+
+ if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+ for key in tokenizer["added_tokens"]:
+ if key["content"] == tokenizer_config["pad_token"]["content"]:
+ gguf_writer.add_pad_token_id(key["id"])
+else:
+ # If no tokenizer.json: Look for special tokens in config.json
+
+ if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+ gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+ if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+ gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+ if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+ gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+ if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+ gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+ if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+ gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+ part_names = ("pytorch_model.bin",)
+else:
+ part_names = (
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+ )
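+# Sharded HF checkpoints follow the pytorch_model-XXXXX-of-YYYYY.bin naming
+# scheme, e.g. a two-shard model yields pytorch_model-00001-of-00002.bin and
+# pytorch_model-00002-of-00002.bin; single-file models use pytorch_model.bin.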
+
+for part_name in part_names:
+ print("gguf: loading model part '" + part_name + "'")
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+ for name in model_part.keys():
+ data = model_part[name]
+
+ # we don't need these
+ if name.endswith(".rotary_emb.inv_freq"):
+ continue
+
+ old_dtype = data.dtype
+
+ # convert any unsupported data types to float32
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
+ data = data.to(torch.float32)
+
+ data = data.squeeze().numpy()
+
+ # reverse permute these
+ if name.endswith(".q_proj.weight"):
+ data = reverse_hf_permute(data, head_count)
+ if name.endswith(".k_proj.weight"):
+ data = reverse_hf_permute(data, head_count, head_count_kv)
+
+ # map tensor names
+ if name.endswith(".weight") and name[:-7] in tensor_map:
+ name = tensor_map[name[:-7]] + ".weight"
+ elif name.endswith(".bias") and name[:-5] in tensor_map:
+ name = tensor_map[name[:-5]] + ".bias"
+ else:
+ print("Can not map tensor '" + name + "'")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+ gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
index b4999ff5a..a94a7d0af 100755
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
import json
import os
import re
@@ -6,23 +6,22 @@ import struct
import sys
from typing import Any, Dict, Sequence, TextIO
+import numpy as np
import torch
-from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
+NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
+
HF_SUBLAYER_TO_GGML = {
- "self_attn.q_proj": "attention.wq",
- "self_attn.k_proj": "attention.wk",
- "self_attn.v_proj": "attention.wv",
- "self_attn.o_proj": "attention.wo",
- "mlp.gate_proj": "feed_forward.w1",
- "mlp.down_proj": "feed_forward.w2",
- "mlp.up_proj": "feed_forward.w3",
- "input_layernorm": "attention_norm",
+ "self_attn.q_proj": "attn_q",
+ "self_attn.k_proj": "attn_k",
+ "self_attn.v_proj": "attn_v",
+ "self_attn.o_proj": "attn_output",
+ "mlp.gate_proj": "ffn_gate",
+ "mlp.down_proj": "ffn_down",
+ "mlp.up_proj": "ffn_up",
+ "input_layernorm": "attn_norm",
"post_attention_layernorm": "ffn_norm",
- # "norm": "norm",
- # "embed_tokens": "tok_embeddings",
- # "lm_head": "output",
}
@@ -39,7 +38,7 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1)
output_string = (
- f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+ f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
)
return output_string
else:
@@ -54,12 +53,14 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
- assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
+ assert (
+ int(params["lora_alpha"]) == params["lora_alpha"]
+ ), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header(
- self, name: str, shape: Sequence[int], data_type: DataType
+ self, name: str, shape: Sequence[int], data_type: np.dtype
) -> None:
sname = name.encode("utf-8")
fout.write(
@@ -67,7 +68,7 @@ def write_tensor_header(
"iii",
len(shape),
len(sname),
- DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
+ NUMPY_TYPE_TO_FTYPE[data_type.name],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
deleted file mode 100644
index dd15393c3..000000000
--- a/convert-pth-to-ggml.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Compatibility stub
-
-import argparse
-
-import convert
-
-parser = argparse.ArgumentParser(
- description="""[DEPRECATED - use `convert.py` instead]
- Convert a LLaMA model checkpoint to a ggml compatible file""")
-parser.add_argument('dir_model', help='directory containing the model checkpoint')
-parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-args = parser.parse_args()
-convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
diff --git a/convert.py b/convert.py
old mode 100644
new mode 100755
index f3bf17980..a701ab41b
--- a/convert.py
+++ b/convert.py
@@ -1,4 +1,6 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
+
+import gguf
import argparse
import concurrent.futures
import copy
@@ -16,13 +18,12 @@ import signal
import struct
import sys
import zipfile
+import numpy as np
+
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from pathlib import Path
-from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
- Literal, Optional, Sequence, Tuple, TypeVar, Union)
-
-import numpy as np
+from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
from sentencepiece import SentencePieceProcessor # type: ignore
if TYPE_CHECKING:
@@ -33,57 +34,47 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+ARCH=gguf.MODEL_ARCH.LLAMA
+NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
+
+#
+# data types
+#
@dataclass(frozen=True)
class UnquantizedDataType:
name: str
-
-DT_F16 = UnquantizedDataType('F16')
-DT_F32 = UnquantizedDataType('F32')
-DT_I32 = UnquantizedDataType('I32')
+DT_F16 = UnquantizedDataType('F16')
+DT_F32 = UnquantizedDataType('F32')
+DT_I32 = UnquantizedDataType('I32')
DT_BF16 = UnquantizedDataType('BF16')
-
-@dataclass(frozen=True)
-class QuantizedDataType:
- groupsize: int
- have_addends: bool
- have_g_idx: bool
-
-
-DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
-DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
-
-DataType = Union[UnquantizedDataType, QuantizedDataType]
-
-DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
- DT_F32: 0,
- DT_F16: 1,
- DT_Q4_0: 2,
- DT_Q4_1: 3,
-}
-
-FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
- {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
+DataType = Union[UnquantizedDataType]
DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
DT_BF16: np.dtype(np.uint16),
- DT_F16: np.dtype(np.float16),
- DT_F32: np.dtype(np.float32),
- DT_I32: np.dtype(np.int32),
+ DT_F16: np.dtype(np.float16),
+ DT_F32: np.dtype(np.float32),
+ DT_I32: np.dtype(np.int32),
}
NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
{dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+ 'BF16': DT_BF16,
+ 'F16': DT_F16,
+ 'F32': DT_F32,
+ 'I32': DT_I32,
+}
-class GGMLFileType(enum.Enum):
- AllF32 = 0
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+class GGMLFileType(enum.IntEnum):
+ AllF32 = 0
MostlyF16 = 1 # except 1d tensors
- MostlyQ4_0 = 2 # except 1d tensors
- MostlyQ4_1 = 3 # except 1d tensors
- PerLayerIsQ4_1 = 4 # but tok_embeddings.weight and output.weight are F16
def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
if len(tensor.shape) == 1:
@@ -93,60 +84,36 @@ class GGMLFileType(enum.Enum):
return DT_F32
elif self == GGMLFileType.MostlyF16:
return DT_F16
- elif self == GGMLFileType.MostlyQ4_0:
- return DT_Q4_0
- elif self == GGMLFileType.MostlyQ4_1:
- return DT_Q4_1
- elif self == GGMLFileType.PerLayerIsQ4_1:
- if name in ('output.weight', 'tok_embeddings.weight'):
- return DT_F16
- else:
- return DT_Q4_1
else:
raise ValueError(self)
-def make_tensors_list() -> List[str]:
- ret = [
- 'tok_embeddings.weight',
- 'norm.weight',
- 'output.weight',
- ]
- for i in range(80): # maximum number of layer
- ret += [
- f'layers.{i}.attention.wq.weight',
- f'layers.{i}.attention.wk.weight',
- f'layers.{i}.attention.wv.weight',
- f'layers.{i}.attention.wo.weight',
- f'layers.{i}.attention_norm.weight',
- f'layers.{i}.feed_forward.w1.weight',
- f'layers.{i}.feed_forward.w2.weight',
- f'layers.{i}.feed_forward.w3.weight',
- f'layers.{i}.ffn_norm.weight',
- ]
- return ret
-
-
-TENSORS_LIST = make_tensors_list()
-TENSORS_SET = set(TENSORS_LIST)
-
-
-def find_n_mult(n_ff: int, n_embd: int) -> int:
- # hardcoded magic range
- for n_mult in range(8192, 1, -1):
- calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
- if calc_ff == n_ff:
- return n_mult
- raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
+#
+# hparams loading
+#
@dataclass
class Params:
- n_vocab: int
- n_embd: int
- n_mult: int
- n_head: int
- n_layer: int
- n_kv_head: Optional[int] # This parameter is only used for Llama 2
+ n_vocab: int
+ n_embd: int
+ n_mult: int
+ n_layer: int
+ n_ctx: int
+ n_ff: int
+ n_head: int
+ n_head_kv: int
+ f_norm_eps: float
+
+ ftype: Optional[GGMLFileType] = None
+
+ @staticmethod
+ def find_n_mult(n_ff: int, n_embd: int) -> int:
+ # hardcoded magic range
+ for n_mult in range(8192, 1, -1):
+ calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+ if calc_ff == n_ff:
+ return n_mult
+ raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@staticmethod
def guessed(model: 'LazyModel') -> 'Params':
@@ -165,37 +132,57 @@ class Params:
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
- n_head=n_embd // 128 # guessed
+ n_head = n_embd // 128 # guessed
+ n_mult = 256 # guessed
+
+ # TODO: verify this
+ n_ff = int(2 * (4 * n_embd) / 3)
+ n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
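+ # Worked example: n_embd = 4096, n_mult = 256 gives
+ # int(2 * 16384 / 3) = 10922, rounded up to the next multiple
+ # of 256 -> 11008, which matches LLaMA-7B's intermediate_size.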
return Params(
- n_vocab = n_vocab,
- n_embd = n_embd,
- n_mult = 256,
- n_head = n_head,
- n_layer = n_layer,
- n_kv_head = None,
+ n_vocab = n_vocab,
+ n_embd = n_embd,
+ n_mult = n_mult,
+ n_layer = n_layer,
+ n_ctx = -1,
+ n_ff = n_ff,
+ n_head = n_head,
+ n_head_kv = n_head,
+ f_norm_eps = 1e-5,
)
@staticmethod
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path))
- n_vocab = config["vocab_size"];
- n_embd = config["hidden_size"];
- n_head = config["num_attention_heads"];
- n_layer = config["num_hidden_layers"];
- n_ff = config["intermediate_size"];
- n_kv_head = config.get("num_key_value_heads")
+ n_vocab = config["vocab_size"]
+ n_embd = config["hidden_size"]
+ n_layer = config["num_hidden_layers"]
+ n_ff = config["intermediate_size"]
+ n_head = config["num_attention_heads"]
+ n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
+ f_norm_eps = config["rms_norm_eps"]
- n_mult = find_n_mult(n_ff, n_embd);
+ n_mult = Params.find_n_mult(n_ff, n_embd)
+
+ if "max_sequence_length" in config:
+ n_ctx = config["max_sequence_length"]
+ elif "max_position_embeddings" in config:
+ n_ctx = config["max_position_embeddings"]
+ else:
+ raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+ "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
return Params(
- n_vocab = n_vocab,
- n_embd = n_embd,
- n_mult = n_mult,
- n_head = n_head,
- n_layer = n_layer,
- n_kv_head = n_kv_head,
+ n_vocab = n_vocab,
+ n_embd = n_embd,
+ n_mult = n_mult,
+ n_layer = n_layer,
+ n_ctx = n_ctx,
+ n_ff = n_ff,
+ n_head = n_head,
+ n_head_kv = n_head_kv,
+ f_norm_eps = f_norm_eps,
)
# LLaMA v2 70B params.json
@@ -204,22 +191,32 @@ class Params:
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path))
- n_vocab = config["vocab_size"];
- n_embd = config["dim"];
- n_head = config["n_heads"];
- n_layer = config["n_layers"];
- n_mult = config["multiple_of"];
+ n_vocab = config["vocab_size"]
+ n_embd = config["dim"]
+ n_layer = config["n_layers"]
+ n_mult = config["multiple_of"]
+ n_ctx = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
+ n_ff = -1
+ n_head = config["n_heads"]
+ n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
+ f_norm_eps = config["norm_eps"]
if n_vocab == -1:
n_vocab = model["tok_embeddings.weight"].shape[0]
+ if n_ff == -1:
+ n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+
return Params(
- n_vocab = n_vocab,
- n_embd = n_embd,
- n_mult = n_mult,
- n_head = n_head,
- n_layer = n_layer,
- n_kv_head = None,
+ n_vocab = n_vocab,
+ n_embd = n_embd,
+ n_mult = n_mult,
+ n_layer = n_layer,
+ n_ctx = n_ctx,
+ n_ff = n_ff,
+ n_head = n_head,
+ n_head_kv = n_head_kv,
+ f_norm_eps = f_norm_eps,
)
@staticmethod
@@ -234,30 +231,73 @@ class Params:
else:
params = Params.guessed(model_plus.model)
- print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
return params
-class SentencePieceVocab:
- def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
- self.vocabtype = vocabtype
- if self.vocabtype == "bpe":
- self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
- else:
- self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+#
+# vocab
+#
+
+class BpeVocab:
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+ self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
added_tokens: Dict[str, int]
if fname_added_tokens is not None:
- added_tokens = json.load(open(fname_added_tokens))
+ added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
added_tokens = {}
- if self.vocabtype == "bpe":
- vocab_size: int = len(self.sentencepiece_tokenizer)
- else:
- vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
- expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
- actual_ids = sorted(added_tokens.values())
+
+ vocab_size: int = len(self.bpe_tokenizer)
+ expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+ actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
+ items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+ self.added_tokens_list = [text for (text, idx) in items]
+ self.vocab_size_base: int = vocab_size
+ self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+ self.fname_tokenizer = fname_tokenizer
+ self.fname_added_tokens = fname_added_tokens
+
+ def bpe_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+ tokenizer = self.bpe_tokenizer
+ from transformers.models.gpt2 import tokenization_gpt2
+ byte_encoder = tokenization_gpt2.bytes_to_unicode()
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
+ for i, item in enumerate(tokenizer):
+ text: bytes = item.encode("utf-8")
+ score: float = -i
+ yield text, score, gguf.TokenType.USER_DEFINED
+
+ def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+ for text in self.added_tokens_list:
+ score = -1000.0
+ yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+ def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+ yield from self.bpe_tokens()
+ yield from self.added_tokens()
+
+ def __repr__(self) -> str:
+ return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+ self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+ added_tokens: Dict[str, int]
+ if fname_added_tokens is not None:
+ added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+ else:
+ added_tokens = {}
+
+ vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+ expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+ actual_ids = sorted(added_tokens.values())
+ if expected_ids != actual_ids:
+ raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
@@ -265,117 +305,66 @@ class SentencePieceVocab:
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
- def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
- if self.vocabtype == "bpe":
- from transformers.models.gpt2 import tokenization_gpt2
- byte_encoder = tokenization_gpt2.bytes_to_unicode()
- byte_decoder = {v: k for k, v in byte_encoder.items()}
- for i, item in enumerate(tokenizer):
- text: bytes
- text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
- score: float = -i
- yield text, score
- else:
- for i in range(tokenizer.vocab_size()):
- text: bytes
- if tokenizer.is_unknown(i):
- text = " \u2047 ".encode("utf-8")
- elif tokenizer.is_control(i):
- text = b""
- elif tokenizer.is_byte(i):
- piece = tokenizer.id_to_piece(i)
- if len(piece) != 6:
- raise Exception(f"Invalid token: {piece}")
- byte_value = int(piece[3:-1], 16)
- text = struct.pack("B", byte_value)
- else:
- text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
- score: float = tokenizer.get_score(i)
- yield text, score
+ for i in range(tokenizer.vocab_size()):
+ piece = tokenizer.id_to_piece(i)
+ text: bytes = piece.encode("utf-8")
+ score: float = tokenizer.get_score(i)
- def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+ toktype = gguf.TokenType.NORMAL
+ if tokenizer.is_unknown(i):
+ toktype = gguf.TokenType.UNKNOWN
+ if tokenizer.is_control(i):
+ toktype = gguf.TokenType.CONTROL
+
+ # NOTE: I think added_tokens are user defined.
+ # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+ # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+ if tokenizer.is_unused(i):
+ toktype = gguf.TokenType.UNUSED
+ if tokenizer.is_byte(i):
+ toktype = gguf.TokenType.BYTE
+
+ yield text, score, toktype
+
+ def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
- yield text.encode("utf-8"), score
+ yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
- def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+ def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f""
-
-class GGMLVocab:
- def __init__(self, tokens: List[Tuple[bytes, float]]):
- self.tokens = tokens
- self.vocab_size = len(tokens)
-
- def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
- return self.tokens
-
- def __repr__(self) -> str:
- return f""
+Vocab = Union[BpeVocab, SentencePieceVocab]
-Vocab = Union[SentencePieceVocab, GGMLVocab]
+#
+# data loading
+# TODO: reuse (probably move to gguf.py?)
+#
-
-def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
- if n_kv_head is not None and n_head != n_kv_head:
- n_head //= n_kv_head
+def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+ #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+ if n_head_kv is not None and n_head != n_head_kv:
+ n_head //= n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
-def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
- # First reinterpret each row from a list of int32s containing 8 values each
- # to a list of uint8s containing 2 values each.
- qvalues_pack8 = qvalues_pack32.view(np.uint8)
-
- # Then split out the two values per int8 (which requires an actual
- # conversion because numpy doesn't natively support int4s).
- qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
- qvalues[:, 0::2] = qvalues_pack8 & 0xf
- qvalues[:, 1::2] = qvalues_pack8 >> 4
-
- assert addends is None or addends.shape == scales.shape
- assert qvalues.shape[0] == scales.shape[0]
- assert qvalues.shape[1] % scales.shape[1] == 0
- if g_idx is None:
- repeat_count = qvalues.shape[1] // scales.shape[1]
- scales = scales[:, :, np.newaxis]
- if addends is not None:
- addends = addends[:, :, np.newaxis]
- # Reshape so that the below computation broadcasts over scales and addends:
- qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
- else:
- # In this case the scale and addend is selected for each column by g_idx:
- assert addends is not None
- scales = scales[:, g_idx]
- addends = addends[:, g_idx]
- if addends is None:
- # Q4_0
- qvalues = qvalues.view(np.int8)
- qvalues -= 8
- # And do the actual 'value = scale * qvalue + addend' computation.
- values = scales * qvalues
- if addends is not None:
- values += addends
- if g_idx is None:
- values.shape = (values.shape[0], values.shape[1] * values.shape[2])
- return values
-
-
class Tensor(metaclass=ABCMeta):
data_type: DataType
@abstractmethod
def astype(self, data_type: DataType) -> 'Tensor': ...
@abstractmethod
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
+ def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
@abstractmethod
@@ -413,8 +402,8 @@ class UnquantizedTensor(Tensor):
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
- return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
+ def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
+ return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -433,183 +422,25 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
return tensor.ndarray
-class GGMLQuantizedTensor(Tensor):
- data_type: QuantizedDataType
-
- def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
- rows, columns = shape
- assert data_type in (DT_Q4_1, DT_Q4_0) # for now
- assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this
- assert columns % data_type.groupsize == 0
- words_in_block = 6 if data_type == DT_Q4_1 else 5
- self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
- self.shape = shape[:]
- self.data_type = data_type
-
- def astype(self, data_type: DataType) -> Tensor:
- if data_type == self.data_type:
- return self
- scales = self.ndarray[:, :, 0].view(np.float32)
- if self.data_type.have_addends:
- addends = self.ndarray[:, :, 1].view(np.float32)
- else:
- addends = None
- qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
-
- dq = dequantize_q4(qweights, scales, addends, g_idx=None)
- return UnquantizedTensor(dq).astype(data_type)
-
- def to_ggml(self) -> 'GGMLQuantizedTensor':
- return self
-
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
- return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
-
- def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
- r = self.ndarray.shape[0] // 3
- return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
-
- def part(self, n_part: int) -> 'UnquantizedTensor':
- r = self.ndarray.shape[0] // 3
- return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-
-GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
+GGMLCompatibleTensor = Union[UnquantizedTensor]
class DeferredPermutedTensor(Tensor):
- def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
+ def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
self.base = base
self.n_head = n_head
- self.n_kv_head = n_kv_head
self.data_type = self.base.data_type
def astype(self, data_type: DataType) -> Tensor:
- return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
+ return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)
def to_ggml(self) -> GGMLCompatibleTensor:
- return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
+ return self.base.to_ggml().permute(self.n_head, self.n_head_kv)
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
+ def permute(self, n_head: int, n_head_kv: int) -> Tensor:
raise Exception("shouldn't permute twice")
-class GPTQForLLaMaQuantizedTensor(Tensor):
- def __init__(self, model: 'LazyModel', namebase: str) -> None:
- qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
- scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
-
- bias = model.get(f"{namebase}.bias")
- if bias is not None:
- # Q4_1 does not support bias; good thing the bias is always all zeros.
- assert not np.any(load_unquantized(bias))
-
- if f"{namebase}.zeros" in model:
- zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
- else:
- qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
- assert qzeros.dtype == np.int32
- zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
- assert zeros.dtype == np.float32
-
- assert zeros.shape == scales.shape
-
- # Output is transposed compared to the input, and addends have their sign flipped.
- # Scales and zeros similarly must be transposed but only for newer
- # versions of GPTQ-for-LLaMa; the older versions can be identified by
- # having shape (n_embd, 1).
- qweight = qweight.T
- if scales.shape[1] != 1:
- scales = scales.T
- zeros = zeros.T
-
- # Output also has signs flipped for the addends.
- self.qweight = qweight
- self.scales = scales
- self.addends = -zeros
-
- self.g_idx: Optional[NDArray]
- if f"{namebase}.g_idx" in model:
- self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
- assert self.g_idx.shape == (qweight.shape[1] * 8,)
- else:
- self.g_idx = None
-
- self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
- self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True,
- have_g_idx=(self.g_idx is not None))
-
- def inspect(self, row: int, col: int) -> None:
- '''For debugging.'''
- qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf
- if self.g_idx is not None:
- group = self.g_idx[col]
- else:
- group = int(col // self.groupsize())
- scale = self.scales[row, group]
- addend = self.addends[row, group]
- with np.printoptions(precision=None, suppress=True):
- print(f'scale:{scale} addend:{addend} qweight:{qweight}')
- print('possible values:', np.arange(16) * scale + addend)
- print('actual value:', qweight * scale + addend)
-
- def astype(self, data_type: DataType) -> Tensor:
- if isinstance(data_type, QuantizedDataType):
- assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
- return self.regroup(data_type.groupsize)
-
- dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
- return UnquantizedTensor(dequantized).astype(data_type)
-
- def groupsize(self) -> int:
- assert self.addends.shape == self.scales.shape
- assert self.shape[1] % self.scales.shape[1] == 0
- return self.shape[1] // self.scales.shape[1]
-
- def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
- # Old versions of GPTQ-for-LLaMa shared scales and addends between all the
- # columns in a row. Newer versions share them between every set of N
- # columns in a row, where N is the `groupsize` parameter, usually 128. The
- # output format shares them between every set of 32 columns. To handle
- # this, duplicate scales and addends for every smaller group.
- # (In the above, 'row' and 'column' are in the sense of the output.)
- assert self.g_idx is None
- old_groupsize = self.groupsize()
- assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
- ret = copy.copy(self)
- ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
- ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
- ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
- return ret
-
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
- return DeferredPermutedTensor(self, n_head, n_kv_head)
-
- def to_ggml(self) -> GGMLQuantizedTensor:
- # The output format looks like this:
- # For each row:
- # For each group of 32 columns:
- # - addend (float32, 4 bytes)
- # - scale (float32, 4 bytes)
- # - weights (int4 * 32, 16 bytes)
-
- if self.groupsize() != 32:
- raise Exception("should have been regrouped before converting to ggml")
-
- # Since the output format is mixed between integers and floats, we have
- # to hackily view the floats as int32s just so numpy will let us
- # concatenate them.
- addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
- scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
-
- # Split into groups of 4 columns (i.e. 32 columns of quantized data):
- grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
-
- # And concatenate:
- grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no')
-
- return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
-
-
@dataclass
class LazyTensor:
_load: Callable[[], Tensor]
@@ -632,17 +463,6 @@ class LazyTensor:
def validate_conversion_to(self, data_type: DataType) -> None:
if data_type == self.data_type:
return
- if isinstance(data_type, QuantizedDataType):
- if not isinstance(self.data_type, QuantizedDataType):
- raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
- if self.data_type.have_g_idx:
- sys.stderr.write(
- "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
- "which is not yet natively supported by GGML. "
- "For now you can still convert this model by passing `--outtype f16` to dequantize, "
- "but that will result in a much larger output file for no quality benefit.\n")
- sys.exit(1)
- assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
LazyModel = Dict[str, LazyTensor]
@@ -713,10 +533,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
return ModelPlus(model, paths, format, vocab)
-def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
def load() -> Tensor:
- return lazy_tensor.load().permute(n_head, n_kv_head)
- return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
+ return lazy_tensor.load().permute(n_head, n_head_kv)
+ return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
def load() -> Tensor:
@@ -732,66 +552,6 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
-def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
- out: LazyModel = {}
- out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
- out["norm.weight"] = model["model.norm.weight"]
- out["output.weight"] = model["lm_head.weight"]
-
- for i in itertools.count():
- if f"model.layers.{i}.self_attn.q_proj.weight" in model:
- out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
- out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
- out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
- elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
- out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
- out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
- out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
- else:
- break
-
- out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
-
- out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
- out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"]
- out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"]
-
- out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"]
- out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"]
- return out
-
-
-def handle_quantization(model: LazyModel) -> LazyModel:
- '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
- (which resolve to UnquantizedTensors with the raw data) to one with entries
- for 'foo.weight' (which resolve to QuantizedTensors).
- '''
- def convert(name: str) -> Tuple[str, LazyTensor]:
- if name.endswith(".qweight"):
- namebase = name.rsplit('.', 1)[0]
- orig_name = namebase + ".weight"
-
- lazy_tensor = model[name]
- assert len(lazy_tensor.shape) == 2
- real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
-
- # Calculate type. This replicates the logic in
- # GPTQForLLaMaQuantizedTensor (which is executed when the modelis
- # actually loaded).
- lazy_scales = model[f"{namebase}.scales"]
- scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
- assert real_shape[1] % scales_width == 0
- groupsize = real_shape[1] // scales_width
- have_g_idx = f"{namebase}.g_idx" in model
- data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
-
- def load() -> Tensor:
- return GPTQForLLaMaQuantizedTensor(model, namebase)
-
- return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
- else:
- return (name, model[name])
- return dict(convert(name) for name in model)
# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
@@ -885,14 +645,6 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
-SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
- 'BF16': DT_BF16,
- 'F16': DT_F16,
- 'F32': DT_F32,
- 'I32': DT_I32,
-}
-
-
def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
header_size, = struct.unpack('<Q', must_read(fp, 8))
@@ ... @@ def must_read(fp: IO[bytes], length: int) -> bytes:
return ret
-def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
- magic = must_read(fp, 4)[::-1]
- if magic in (b'ggmf', b'ggjt'):
- version, = struct.unpack("i", must_read(fp, 4))
- assert version == 1
- else:
- assert magic == b'ggml'
- version = None
- n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28))
-
- tokens: List[Tuple[bytes, float]] = []
- for i in range(n_vocab):
- if i == 32000:
- # HACK: GPT4All messed with the format without changing the magic
- # number. Specifically, they changed the vocab section to contain
- # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
- # extra pad token). Try to detect if we're reading a file like
- # this.
- orig_pos = fp.tell()
- fp.seek(20, io.SEEK_CUR)
- is_gpt4all = fp.read(21) == b'tok_embeddings.weight'
- fp.seek(orig_pos)
- if is_gpt4all:
- break
-
- length, = struct.unpack("i", must_read(fp, 4))
- text = must_read(fp, length)
- if magic != b'ggml':
- score, = struct.unpack("f", must_read(fp, 4))
- tokens.append((text, score))
- vocab = GGMLVocab(tokens) if magic != b'ggml' else None
-
- model: LazyModel = {}
- # Use mmap for the actual data to avoid race conditions with the file offset.
- off = fp.raw.tell()
- mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
- fp.raw.seek(off) # needed on Windows
-
- def read_tensor() -> None: # this is a function so that variables captured in `load` don't change
- shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
- assert 0 <= shape_len <= 3
- shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
- shape = shape[::-1]
- name = must_read(fp, name_len).decode('utf-8')
- data_type = FTYPE_TO_DATA_TYPE[ftype]
-
- if magic == b'ggjt':
- fp.seek((fp.tell() + 31) & -32)
-
- if data_type == DT_Q4_1:
- # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
- size = 24 * (shape[1] // 32) * shape[0]
- elif data_type == DT_Q4_0:
- size = 20 * (shape[1] // 32) * shape[0]
- else:
- numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
- elm_count = math.prod(shape)
- size = elm_count * numpy_dtype.itemsize
- offset = fp.tell()
- buf = mapped[offset:offset+size]
- fp.seek(size, io.SEEK_CUR)
-
- def load() -> Tensor:
- if isinstance(data_type, QuantizedDataType):
- ndarray = np.frombuffer(buf, dtype=np.uint32)
- return GGMLQuantizedTensor(ndarray, shape, data_type)
- else:
- return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
- description = f'ggml offset={offset} type={data_type} path={path}'
- model[name] = LazyTensor(load, shape, data_type, description)
-
- while fp.read(1) != b'':
- fp.seek(-1, io.SEEK_CUR)
- read_tensor()
-
- return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab)
-
-
@functools.lru_cache(maxsize=None)
def lazy_load_file(path: Path) -> ModelPlus:
fp = open(path, 'rb')
@@ -1010,9 +684,6 @@ def lazy_load_file(path: Path) -> ModelPlus:
if first8[:2] == b'PK':
# A zip file, i.e. PyTorch format
return lazy_load_torch_file(fp, path)
- elif first8[2:4] == b'gg':
- # GGML format
- return lazy_load_ggml_file(fp, path)
elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
# Probably safetensors
return lazy_load_safetensors_file(fp, path)
@@ ... @@ def lazy_load_file(path: Path) -> ModelPlus:
In = TypeVar('In')
Out = TypeVar('Out')
-
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
'''Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than
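(As an aside, since the diff context cuts this docstring short: the backpressure idea it describes can be illustrated with a small, self-contained sketch built on `concurrent.futures`. This is an illustration only, not the script's actual implementation, and the helper name is hypothetical.)

```python
import itertools
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, Iterator, TypeVar

In  = TypeVar('In')
Out = TypeVar('Out')

def bounded_parallel_map_sketch(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterator[Out]:
    # Submit at most `concurrency` items up front; afterwards, new work is only
    # submitted when the caller consumes a result, so unconsumed results can
    # never pile up without bound.
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        it = iter(iterable)
        futures = [executor.submit(func, item) for item in itertools.islice(it, concurrency)]
        while futures:
            result = futures.pop(0).result()   # block on the oldest item
            try:
                futures.append(executor.submit(func, next(it)))  # refill one slot
            except StopIteration:
                pass
            yield result
```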
@@ -1043,8 +713,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
def check_vocab_size(params: Params, vocab: Vocab) -> None:
if params.n_vocab != vocab.vocab_size:
- # GGMLVocab comes from the same file as the model so shouldn't mismatch:
- assert isinstance(vocab, SentencePieceVocab)
+ assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
if params.n_vocab == vocab.vocab_size_base:
print("Ignoring added_tokens.json since model matches vocab size without it.")
vocab.added_tokens_list = []
@@ -1061,98 +730,157 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
class OutputFile:
def __init__(self, fname_out: Path) -> None:
- self.fout = open(fname_out, "wb")
+ self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
- def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
- self.fout.write(b"ggjt"[::-1]) # magic
- values = [
- 1, # file version
- params.n_vocab,
- params.n_embd,
- params.n_mult,
- params.n_head,
- params.n_layer,
- params.n_embd // params.n_head, # rot (obsolete)
- file_type.value,
- ]
- self.fout.write(struct.pack("i" * len(values), *values))
+ def add_meta_arch(self, params: Params) -> None:
+ self.gguf.add_name ("LLaMA")
+ self.gguf.add_context_length (params.n_ctx)
+ self.gguf.add_embedding_length (params.n_embd)
+ self.gguf.add_block_count (params.n_layer)
+ self.gguf.add_feed_forward_length (params.n_ff)
+ self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
+ self.gguf.add_head_count (params.n_head)
+ self.gguf.add_head_count_kv (params.n_head_kv)
+ self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
- def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
- sname = name.encode('utf-8')
- self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
- self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
- self.fout.write(sname)
- self.fout.seek((self.fout.tell() + 31) & -32)
+ if params.ftype:
+ self.gguf.add_file_type(params.ftype)
- def write_vocab(self, vocab: Vocab) -> None:
- for text, score in vocab.all_tokens():
- self.fout.write(struct.pack("i", len(text)))
- self.fout.write(text)
- self.fout.write(struct.pack("f", score))
+ def add_meta_vocab(self, vocab: Vocab) -> None:
+ tokens = []
+ scores = []
+ toktypes = []
+ # NOTE: `all_tokens` returns the base vocabulary and added tokens
+ # TODO: add special tokens?
+ for text, score, toktype in vocab.all_tokens():
+ tokens.append(text)
+ scores.append(score)
+ toktypes.append(toktype)
+
+ self.gguf.add_tokenizer_model("llama")
+ self.gguf.add_token_list(tokens)
+ self.gguf.add_token_scores(scores)
+ self.gguf.add_token_types(toktypes)
+
+ def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
+ n_elements = 1
+ for dim in tensor.shape:
+ n_elements *= dim
+ data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
+ data_nbytes = n_elements * data_type.itemsize
+ self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
+
+ def write_meta(self) -> None:
+ self.gguf.write_header_to_file()
+ self.gguf.write_kv_data_to_file()
+
+ def write_tensor_info(self) -> None:
+ self.gguf.write_ti_data_to_file()
+
+ def close(self) -> None:
+ self.gguf.close()
@staticmethod
- def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
- of = OutputFile(fname_out)
- params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
- of = OutputFile(fname_out)
- of.write_file_header(params, file_type=GGMLFileType.AllF32)
- of.write_vocab(vocab)
- of.fout.close()
-
- @staticmethod
- def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
+ def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
check_vocab_size(params, vocab)
+
of = OutputFile(fname_out)
- of.write_file_header(params, file_type)
- print("Writing vocab...")
- of.write_vocab(vocab)
+
+ # meta data
+ of.add_meta_arch(params)
+ of.add_meta_vocab(vocab)
+ of.write_meta()
+
+ of.close()
+
+ @staticmethod
+ def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+ check_vocab_size(params, vocab)
+
+ of = OutputFile(fname_out)
+
+ # meta data
+ of.add_meta_arch(params)
+ of.add_meta_vocab(vocab)
+
+ # tensor info
+ for name, lazy_tensor in model.items():
+ of.add_tensor_info(name, lazy_tensor)
+
+ of.write_meta()
+ of.write_tensor_info()
def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
name, lazy_tensor = item
return lazy_tensor.load().to_ggml().ndarray
+ # tensor data
ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
- of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
- ndarray.tofile(of.fout)
- of.fout.close()
+ of.gguf.write_tensor_data(ndarray)
+ of.close()
def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
- wq_type = model["layers.0.attention.wq.weight"].data_type
- if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
+ wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+
+ if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
return GGMLFileType.AllF32
- if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
+ if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
return GGMLFileType.MostlyF16
- if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and
- wq_type.have_addends):
- if isinstance(model["output.weight"].data_type, QuantizedDataType):
- return GGMLFileType.MostlyQ4_1
- else:
- return GGMLFileType.PerLayerIsQ4_1
- if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
- return GGMLFileType.MostlyQ4_0
+
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
+
raise Exception(f"Unexpected combination of types: {name_to_type}")
-
-def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
- model = handle_quantization(model)
-
- if "lm_head.weight" in model:
- model = convert_transformers_to_orig(model, params)
- model = filter_and_sort_tensors(model)
-
- return model
-
-
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()}
+def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
+ tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
+
+ tmp = model
+
+ # HF models permute or pack some of the tensors, so we need to undo that
+ for i in itertools.count():
+ if f"model.layers.{i}.self_attn.q_proj.weight" in model:
+ print(f"Permuting layer {i}")
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
+ #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+ elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
+ print(f"Unpacking and permuting layer {i}")
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
+ tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
+ else:
+ break
+
+ out: LazyModel = {}
+ for name, lazy_tensor in model.items():
+ name_new = name
+
+ if name in tmap:
+ name_new = tmap[name]
+ elif name.endswith(".weight") and name[:-7] in tmap:
+ name_new = tmap[name[:-7]] + ".weight"
+ elif name.endswith(".bias") and name[:-5] in tmap:
+ name_new = tmap[name[:-5]] + ".bias"
+ else:
+ raise Exception(f"Unexpected tensor name: {name}")
+
+ if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new):
+ print(f"skipping tensor {name_new}")
+ continue
+ else:
+ print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}")
+ out[name_new] = lazy_tensor
+
+ return out
def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
@@ -1203,11 +931,6 @@ def load_some_model(path: Path) -> ModelPlus:
# Try the PyTorch patterns too, with lower priority
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
files = [file for glob in globs for file in path.glob(glob)]
- if not files:
- # Try GGML too, but with lower priority, since if both a non-GGML
- # model and a GGML model exist in the same directory, we assume the
- # latter was converted from the former.
- files = list(path.glob("ggml-model*.bin*"))
if not files:
raise Exception(f"Can't find model in directory {path}")
if len(files) > 1:
@@ -1224,19 +947,14 @@ def load_some_model(path: Path) -> ModelPlus:
return model_plus
-def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
- return {name: model[name] for name in TENSORS_LIST if name in model}
-
-
-def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
- print(f"vocabtype: {vocabtype}")
+def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
if path.is_dir():
vocab_file = "tokenizer.model"
if vocabtype == 'bpe':
- vocab_file = "vocab.json"
+ vocab_file = "vocab.json"
path2 = path / vocab_file
# Use `.parent` instead of /.. to handle the symlink case better.
path3 = path.parent / vocab_file
@@ -1246,23 +964,26 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
path = path3
else:
raise FileNotFoundError(
- f"Could not find tokenizer.model in {path} or its parent; "
+ f"Could not find {vocab_file} in {path} or its parent; "
"if it's in another directory, pass the directory as --vocab-dir")
+
+ print(f"Loading vocab file '{path}', type '{vocabtype}'")
+
added_tokens_path = path.parent / "added_tokens.json"
- print(f"Loading vocab file {path}")
- return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
- vocabtype)
+ if vocabtype == "bpe":
+ return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+ elif vocabtype == "spm":
+ return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+ else:
+ raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
namestr = {
- GGMLFileType.AllF32: "f32",
+ GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
- GGMLFileType.MostlyQ4_0: "q4_0",
- GGMLFileType.MostlyQ4_1: "q4_1",
- GGMLFileType.PerLayerIsQ4_1: "q4_1",
}[file_type]
- ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
+ ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
if ret in model_paths:
sys.stderr.write(
f"Error: Default output path ({ret}) would overwrite the input. "
@@ -1281,44 +1002,68 @@ def do_dump_model(model_plus: ModelPlus) -> None:
def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
- parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
- parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
- parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
- parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
- parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
- parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
- parser.add_argument("model", type=Path,
- help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
- parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
+ parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
+ parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+ parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
+ parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+ parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
+ parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
args = parser.parse_args(args_in)
- vocab: Vocab
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
- elif args.vocab_only:
+
+ model_plus = load_some_model(args.model)
+
+ params = Params.load(model_plus)
+ if params.n_ctx == -1:
+ if args.ctx is None:
+ raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
+ "Please specify one with --ctx:\n"
+ " - LLaMA v1: --ctx 2048\n"
+ " - LLaMA v2: --ctx 4096\n")
+ params.n_ctx = args.ctx
+
+ if args.outtype:
+ params.ftype = {
+ "f32": GGMLFileType.AllF32,
+ "f16": GGMLFileType.MostlyF16,
+ }[args.outtype]
+
+ print(f"params = {params}")
+
+ vocab: Vocab
+ if args.vocab_only:
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
assert args.outfile, "need --outfile if using --vocab-only"
outfile = args.outfile
- OutputFile.write_vocab_only(outfile, vocab)
+ OutputFile.write_vocab_only(outfile, params, vocab)
print(f"Wrote {outfile}")
else:
- model_plus = load_some_model(args.model)
if args.dump:
do_dump_model(model_plus)
return
+
if model_plus.vocab is not None and args.vocab_dir is None:
vocab = model_plus.vocab
else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir, args.vocabtype)
- params = Params.load(model_plus)
- model = model_plus.model
- model = do_necessary_conversions(model, params)
- output_type = pick_output_type(model, args.outtype)
- model = convert_to_output_type(model, output_type)
- outfile = args.outfile or default_outfile(model_plus.paths, output_type)
- OutputFile.write_all(outfile, params, output_type, model, vocab)
+
+ model = model_plus.model
+ model = convert_model_names(model, params)
+ ftype = pick_output_type(model, args.outtype)
+ model = convert_to_output_type(model, ftype)
+ outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+ params.ftype = ftype
+ print(f"Writing {outfile}, format {ftype}")
+
+ OutputFile.write_all(outfile, params, model, vocab)
print(f"Wrote {outfile}")
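To make the new output path easier to follow, here is a minimal sketch of the write ordering `OutputFile.write_all` uses with the `gguf` Python package: all key/value metadata and tensor infos are declared first, the header, KV and tensor-info sections are flushed, and only then is the tensor data streamed out. The file name, hyperparameter values and tensor name below are placeholders, and the sketch omits the tokenizer metadata a real model file needs.

```python
import numpy as np
import gguf  # the helper package the converter now writes through

writer = gguf.GGUFWriter("example.gguf", "llama")  # placeholder path / arch name

# architecture metadata (mirrors OutputFile.add_meta_arch; values are placeholders)
writer.add_name("LLaMA")
writer.add_context_length(4096)
writer.add_embedding_length(4096)
writer.add_block_count(32)
writer.add_feed_forward_length(11008)
writer.add_rope_dimension_count(4096 // 32)
writer.add_head_count(32)
writer.add_head_count_kv(32)
writer.add_layer_norm_rms_eps(1e-5)

# tensor *info* is declared before any data is written (mirrors add_tensor_info)
data = np.zeros((32, 4096), dtype=np.float32)  # placeholder tensor
writer.add_tensor_info("example.weight", list(data.shape), data.dtype, data.nbytes)

# header, key/value section and tensor-info section go out first ...
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_ti_data_to_file()

# ... and the tensor data is streamed afterwards, in the same order as the infos
writer.write_tensor_data(data)
writer.close()
```

Keeping the metadata and tensor-info passes separate from the data pass is what lets the converter stream tensors through `bounded_parallel_map` instead of holding the whole model in memory.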
diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md
index 69ba6173c..c9acff7d4 100644
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -3,7 +3,7 @@
## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
CPU: 7 physical cores
RAM: 32GB
-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4-bit quantization, GGUF)
-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result:
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d53652815..d2176c910 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
# ...
-# common
-
-set(TARGET common)
-
-add_library(${TARGET} OBJECT
- common.h
- common.cpp
- console.h
- console.cpp
- grammar-parser.h
- grammar-parser.cpp
- )
-
-if (BUILD_SHARED_LIBS)
- set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
-
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md
index 868f57d6d..fd561fcbc 100644
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
- --copy-vocab-from-model FNAME model path from which to copy vocab (default 'models/ggml-vocab.bin')
+ --copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default 'ak_llama_model.bin')
```
-An example command is as follows:
+An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model --llama2c-model --llama2c-output-model `
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
-Now you can use the model with command like:
+For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
-`$ ./main -m -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
+`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
+
+Now you can use the model with a command like:
+
+`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 1a238c4dd..f8a58dc7a 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -1,5 +1,6 @@
#include "ggml.h"
#include "llama.h"
+
#include
#include
#include
@@ -16,6 +17,9 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
+#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3 3
+
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
int dim; // transformer dimension
@@ -48,10 +52,10 @@ typedef struct {
// float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
- //float* wcls;
+ float* wcls;
} TransformerWeights;
-void malloc_weights(TransformerWeights* w, Config* p) {
+void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -85,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+
+ if (shared_weights) {
+ w->wcls = NULL;
+ } else {
+ w->wcls = new float[p->vocab_size * p->dim]();
+ printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+ }
}
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -99,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+
+ // Skip freq_cis_real & freq_cis_imag
+ int head_size = p->dim / p->n_heads;
+ fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+ if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+ // Check we didn't forget to read anything
+ auto curr = ftell(f);
+ fseek(f, 0, SEEK_END);
+ auto end = ftell(f);
+ if (curr != end) {
+ printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
+ return 1;
+ }
+
return 0;
}
@@ -114,6 +141,7 @@ void free_weights(TransformerWeights* w) {
delete w->w2;
delete w->w3;
delete w->rms_final_weight;
+ if (w->wcls) delete w->wcls;
}
void print_sample_weights(TransformerWeights *w){
@@ -130,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]);
+ if (w->wcls) printf("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -138,14 +167,16 @@ void print_sample_weights(TransformerWeights *w){
struct llama_vocab {
using id = int32_t;
using token = std::string;
+ using ttype = llama_token_type;
- struct token_score {
- token tok;
+ struct token_data {
+ token text;
float score;
+ ttype type;
};
std::unordered_map<token, id> token_to_id;
- std::vector<token_score> id_to_token;
+ std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
@@ -502,49 +533,51 @@ bool is_ggml_file(const char *filename) {
return false;
}
uint32_t magic = file.read_u32();
- return magic == LLAMA_FILE_MAGIC;
+ return magic == GGUF_MAGIC;
}
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
- // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
- if (is_ggml_file(filename)) {
-
- struct llama_context_params llama_params = llama_context_default_params();
- llama_params.vocab_only = true;
-
- struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
- struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
-
- std::vector<const char *> strings;
- std::vector<float> scores;
- int n_vocab = llama_n_vocab(lctx);
- strings.resize(n_vocab, NULL);
- scores.resize(n_vocab, 0);
- n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
- GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
- vocab->id_to_token.resize(n_vocab);
- for (int i=0; i<n_vocab; i++) {
- std::string tok = std::string(strings[i]);
- float score = scores[i];
- vocab->id_to_token[i].tok = tok;
- vocab->id_to_token[i].score = score;
- vocab->token_to_id.emplace(tok, i);
- }
- llama_free(lctx);
- llama_free_model(lmodel);
- } else { // assume llama2.c vocabulary
+#pragma message("TODO: implement reading vocabulary using gguf")
+// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+// if (is_ggml_file(filename)) {
+//
+// struct llama_context_params llama_params = llama_context_default_params();
+// llama_params.vocab_only = true;
+//
+// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+// struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+//
+// const int n_vocab = llama_n_vocab(lctx);
+// vocab->id_to_token.resize(n_vocab);
+// for (int i=0; i<n_vocab; i++) {
+// vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
+// vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+// vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
+// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+// }
+// llama_free(lctx);
+// llama_free_model(lmodel);
+// } else
+ { // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
llama_file file(filename, "rb");
- uint32_t n_vocab = config->vocab_size;
+ const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
- for (uint32_t i=0; i<n_vocab; i++) {
+ for (int i=0; i<n_vocab; ++i) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
- std::string tok = file.read_string(len);
- vocab->id_to_token[i].tok = tok;
+ std::string text = file.read_string(len);
+ // Special-case handling of <0xXX> single byte tokens.
+ char byte_val;
+ if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+ char cstr[2] = { byte_val, 0 };
+ text = cstr;
+ }
+ vocab->id_to_token[i].text = text;
vocab->id_to_token[i].score = score;
- vocab->token_to_id.emplace(tok, i);
+ vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+ vocab->token_to_id.emplace(text, i);
}
}
}
@@ -590,9 +623,11 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
if (file.fp == NULL) {
return;
}
+
+#pragma message("TODO: implement file saving using gguf")
// write_magic
- file.write_u32(LLAMA_FILE_MAGIC); // magic
- file.write_u32(LLAMA_FILE_VERSION); // version
+ file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic
+ file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
// write_hparams
file.write_u32(model->hparams.n_vocab);
file.write_u32(model->hparams.n_embd);
@@ -605,17 +640,17 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
uint32_t n_vocab = model->hparams.n_vocab;
for (uint32_t i = 0; i < n_vocab; i++) {
- const auto & token_score = vocab->id_to_token.at(i);
- file.write_u32((uint32_t) token_score.tok.size());
- file.write_raw(token_score.tok.data(), token_score.tok.size());
- file.write_raw(&token_score.score, sizeof(token_score.score));
+ const auto & token_data = vocab->id_to_token.at(i);
+ file.write_u32((uint32_t) token_data.text.size());
+ file.write_raw(token_data.text.data(), token_data.text.size());
+ file.write_raw(&token_data.score, sizeof(token_data.score));
}
// stuff AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
- stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+ stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);
@@ -663,7 +698,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
struct train_params get_default_train_params() {
struct train_params params;
- params.fn_vocab_model = "models/ggml-vocab.bin";
+ params.fn_vocab_model = "tokenizer.bin";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
@@ -716,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
- fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+ fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n");
@@ -789,9 +824,12 @@ int main(int argc, char ** argv) {
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+ auto shared_weights = config.vocab_size > 0;
+ config.vocab_size = abs(config.vocab_size);
+
// read in the Transformer weights
- malloc_weights(&weights, &config);
- if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+ malloc_weights(&weights, &config, shared_weights);
+ if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
fclose(file);
}
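(Side note on the `shared_weights` handling just above: llama2.c checkpoints encode whether the classifier weights `wcls` are tied to the token embedding in the sign of `vocab_size`, which is why the converter takes `abs()` of it before allocating. A tiny, hypothetical helper, sketched in Python for brevity, spells the convention out:)

```python
def split_llama2c_vocab_size(raw_vocab_size: int):
    # positive -> classifier weights are shared with the token embedding table
    # negative -> a separate wcls tensor follows the other weights in the file
    shared_weights = raw_vocab_size > 0
    return abs(raw_vocab_size), shared_weights
```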
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 2185b9b0e..8a6ad882e 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
- // float nl_logit = logits[llama_token_nl()];
+ // float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
- // logits[llama_token_nl()] = nl_logit;
+ // logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
@@ -211,7 +211,7 @@ const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
- if (id == llama_token_eos()) {
+ if (id == llama_token_eos(ctx)) {
ret = "";
} else {
ret = llama_token_to_str(ctx, id);
diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py
old mode 100644
new mode 100755
index be2896614..f146acdc1
--- a/examples/embd-input/embd_input.py
+++ b/examples/embd-input/embd_input.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py
old mode 100644
new mode 100755
index bcbdd2bed..06fad55f4
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py
old mode 100644
new mode 100755
index 15c9b77c0..7b13e4a5c
--- a/examples/embd-input/minigpt4.py
+++ b/examples/embd-input/minigpt4.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py
old mode 100644
new mode 100755
index 0cfac5f32..891ad7cc9
--- a/examples/embd-input/panda_gpt.py
+++ b/examples/embd-input/panda_gpt.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 5192d6df5..38395c75b 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -67,28 +67,35 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
- fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+ fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "\n");
}
- if (params.embedding){
- if (embd_inp.size() > 0) {
- if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return 1;
- }
- }
-
- const int n_embd = llama_n_embd(ctx);
- const auto embeddings = llama_get_embeddings(ctx);
-
- for (int i = 0; i < n_embd; i++) {
- printf("%f ", embeddings[i]);
- }
- printf("\n");
+ if (embd_inp.size() > (size_t)params.n_ctx) {
+ fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
+ __func__, embd_inp.size(), params.n_ctx);
+ return 1;
}
+ while (!embd_inp.empty()) {
+ int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
+ if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
+ fprintf(stderr, "%s : failed to eval\n", __func__);
+ return 1;
+ }
+ n_past += n_tokens;
+ embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
+ }
+
+ const int n_embd = llama_n_embd(ctx);
+ const auto embeddings = llama_get_embeddings(ctx);
+
+ for (int i = 0; i < n_embd; i++) {
+ printf("%f ", embeddings[i]);
+ }
+ printf("\n");
+
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
new file mode 100644
index 000000000..dee00df87
--- /dev/null
+++ b/examples/gguf/gguf.cpp
@@ -0,0 +1,246 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+template<typename T>
+static std::string to_string(const T & val) {
+ std::stringstream ss;
+ ss << val;
+ return ss.str();
+}
+
+bool gguf_ex_write(const std::string & fname) {
+ struct gguf_context * ctx = gguf_init_empty();
+
+ gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
+ gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
+ gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
+ gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
+ gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
+ gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
+ gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
+ gguf_set_val_bool(ctx, "some.parameter.bool", true);
+ gguf_set_val_str (ctx, "some.parameter.string", "hello world");
+
+ gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+ gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+ gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ 128ull*1024ull*1024ull,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ false,
+ };
+
+ struct ggml_context * ctx_data = ggml_init(params);
+
+ const int n_tensors = 10;
+
+ // tensor infos
+ for (int i = 0; i < n_tensors; ++i) {
+ const std::string name = "tensor_" + to_string(i);
+
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
+ int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
+
+ for (int j = 0; j < n_dims; ++j) {
+ ne[j] = rand() % 10 + 1;
+ }
+
+ struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
+ ggml_set_name(cur, name.c_str());
+
+ {
+ float * data = (float *) cur->data;
+ for (int j = 0; j < ggml_nelements(cur); ++j) {
+ data[j] = 100 + i;
+ }
+ }
+
+ gguf_add_tensor(ctx, cur);
+ }
+
+ gguf_write_to_file(ctx, fname.c_str(), false);
+
+ fprintf(stdout, "%s: wrote file '%s'\n", __func__, fname.c_str());
+
+ ggml_free(ctx_data);
+ gguf_free(ctx);
+
+ return true;
+}
+
+// just read tensor info
+bool gguf_ex_read_0(const std::string & fname) {
+ struct gguf_init_params params = {
+ /*.no_alloc = */ false,
+ /*.ctx = */ NULL,
+ };
+
+ struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+ fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+ fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+ fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+ // kv
+ {
+ const int n_kv = gguf_get_n_kv(ctx);
+
+ fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+ for (int i = 0; i < n_kv; ++i) {
+ const char * key = gguf_get_key(ctx, i);
+
+ fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+ }
+ }
+
+ // find kv string
+ {
+ const char * findkey = "some.parameter.string";
+
+ const int keyidx = gguf_find_key(ctx, findkey);
+ if (keyidx == -1) {
+ fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+ } else {
+ const char * key_value = gguf_get_val_str(ctx, keyidx);
+ fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+ }
+ }
+
+ // tensor info
+ {
+ const int n_tensors = gguf_get_n_tensors(ctx);
+
+ fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+ for (int i = 0; i < n_tensors; ++i) {
+ const char * name = gguf_get_tensor_name (ctx, i);
+ const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+ fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+ }
+ }
+
+ gguf_free(ctx);
+
+ return true;
+}
+
+// read and create ggml_context containing the tensors and their data
+bool gguf_ex_read_1(const std::string & fname) {
+ struct ggml_context * ctx_data = NULL;
+
+ struct gguf_init_params params = {
+ /*.no_alloc = */ false,
+ /*.ctx = */ &ctx_data,
+ };
+
+ struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+ fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+ fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+ fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+ // kv
+ {
+ const int n_kv = gguf_get_n_kv(ctx);
+
+ fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+ for (int i = 0; i < n_kv; ++i) {
+ const char * key = gguf_get_key(ctx, i);
+
+ fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+ }
+ }
+
+ // tensor info
+ {
+ const int n_tensors = gguf_get_n_tensors(ctx);
+
+ fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+ for (int i = 0; i < n_tensors; ++i) {
+ const char * name = gguf_get_tensor_name (ctx, i);
+ const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+ fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+ }
+ }
+
+ // data
+ {
+ const int n_tensors = gguf_get_n_tensors(ctx);
+
+ for (int i = 0; i < n_tensors; ++i) {
+ fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+
+ const char * name = gguf_get_tensor_name(ctx, i);
+
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+
+ fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+ // print first 10 elements
+ const float * data = (const float *) cur->data;
+
+ printf("%s data[:10] : ", name);
+ for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+ printf("%f ", data[j]);
+ }
+ printf("\n\n");
+
+ // check data
+ {
+ const float * data = (const float *) cur->data;
+ for (int j = 0; j < ggml_nelements(cur); ++j) {
+ if (data[j] != 100 + i) {
+ fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+
+ ggml_free(ctx_data);
+ gguf_free(ctx);
+
+ return true;
+}
+
+int main(int argc, char ** argv) {
+ if (argc < 3) {
+ fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+ return -1;
+ }
+
+ const std::string fname(argv[1]);
+ const std::string mode (argv[2]);
+
+ GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+
+ if (mode == "w") {
+ GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
+ } else if (mode == "r") {
+ GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
+ GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+ }
+
+ return 0;
+}
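For reference, this example is exercised in two passes: running it with `w` (e.g. `./gguf test.gguf w`, assuming the binary is built under the name `gguf`) writes the sample key/value pairs plus ten constant-filled tensors, and running it with `r` on the same file calls `gguf_ex_read_0` to dump the metadata and `gguf_ex_read_1` to re-read the tensors and verify their contents.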
diff --git a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
new file mode 100644
index 000000000..9d433f4b1
--- /dev/null
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
@@ -0,0 +1,1133 @@
+#ifndef CMPNCT_GPT2BPE
+#define CMPNCT_GPT2BPE
+
+#include
+#include
+#include
+#include
+#include
+#include