Merge branch 'master' into add_stop_token

2024-12-26 14:20:31 +01:00 · 2023-05-11 10:26:00 -07:00 · 2023-05-11 10:26:00 -07:00 · 099a07fb87
commit 099a07fb87
parent 331343ab0e 2510c1831f
26 changed files with 3100 additions and 928 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -8,6 +8,8 @@ on:
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
  pull_request:
    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
@ -18,6 +20,8 @@ env:
 jobs:
  ubuntu-latest-make:
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    steps:
@ -37,6 +41,8 @@ jobs:
          make
  ubuntu-latest-cmake:
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    steps:
@ -65,6 +71,8 @@ jobs:
          ctest --verbose
  ubuntu-latest-cmake-sanitizer:
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    continue-on-error: true
@ -73,7 +81,6 @@ jobs:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [Debug, Release]
        accelerate: [ON, OFF]
    steps:
      - name: Clone
@ -91,7 +98,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
+          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }}
      - name: Test
@ -101,6 +108,8 @@ jobs:
          ctest --verbose
  macOS-latest-make:
    if: github.event.pull_request.draft == false
    runs-on: macos-latest
    steps:
@ -119,6 +128,8 @@ jobs:
          make
  macOS-latest-cmake:
    if: github.event.pull_request.draft == false
    runs-on: macOS-latest
    steps:
@ -146,6 +157,8 @@ jobs:
          ctest --verbose
  windows-latest-cmake:
    if: github.event.pull_request.draft == false
    runs-on: windows-latest
    strategy:
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -18,6 +18,8 @@ on:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
--- a/.gitignore
+++ b/.gitignore
@ -1,11 +1,15 @@
 *.o
 *.a
 .DS_Store
 .build/
 .cache/
 .direnv/
 .envrc
 .swiftpm
 .venv
 .vs/
 .vscode/
 .DS_Store
 .build/
 build/
 build-em/
 build-debug/
@ -24,17 +28,15 @@ models/*
 /perplexity
 /embedding
 /benchmark-q4_0-matmult
 /vdot
 /Pipfile
 arm_neon.h
 compile_commands.json
 .envrc
 .direnv/
 .venv
 __pycache__
 .swiftpm
 zig-out/
 zig-cache/
 ppl-*.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED     "llama: enable undefined sanitizer"
 option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
 option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
 option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
 option(LLAMA_AVX512_VBMI            "llama: enable AVX512-VBMI"                             OFF)
 option(LLAMA_AVX512_VNNI            "llama: enable AVX512-VNNI"                             OFF)
 option(LLAMA_FMA                    "llama: enable FMA"                                     ON)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
@ -64,6 +66,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
 option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)
 option(LLAMA_CUBLAS                 "llama: use cuBLAS"                                     OFF)
 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
@ -107,6 +110,7 @@ if (APPLE AND LLAMA_ACCELERATE)
        message(WARNING "Accelerate framework not found")
    endif()
 endif()
 if (LLAMA_OPENBLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@ -140,6 +144,30 @@ if (LLAMA_OPENBLAS)
    endif()
 endif()
 if (LLAMA_CUBLAS)
    cmake_minimum_required(VERSION 3.17)
    find_package(CUDAToolkit)
    if (CUDAToolkit_FOUND)
        message(STATUS "cuBLAS found")
        enable_language(CUDA)
        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
        add_compile_definitions(GGML_USE_CUBLAS)
        if (LLAMA_STATIC)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()
    else()
        message(WARNING "cuBLAS not found")
    endif()
 endif()
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
@ -151,7 +179,6 @@ if (LLAMA_ALL_WARNINGS)
            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
            -Wno-unused-function
        )
        set(cxx_flags
            -Wall
@ -219,11 +246,26 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    message(STATUS "x86 detected")
    if (MSVC)
        if (LLAMA_AVX512)
-            add_compile_options(/arch:AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
            # Do it manually.
            if (LLAMA_AVX512_VBMI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if (LLAMA_AVX512_VNNI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
        elseif (LLAMA_AVX2)
-            add_compile_options(/arch:AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
        elseif (LLAMA_AVX)
-            add_compile_options(/arch:AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
        endif()
    else()
        if (LLAMA_F16C)
@ -240,9 +282,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
        endif()
        if (LLAMA_AVX512)
            add_compile_options(-mavx512f)
-            # add_compile_options(-mavx512cd)
+            add_compile_options(-mavx512bw)
-            # add_compile_options(-mavx512dq)
+        endif()
-            # add_compile_options(-mavx512bw)
+        if (LLAMA_AVX512_VBMI)
            add_compile_options(-mavx512vbmi)
        endif()
        if (LLAMA_AVX512_VNNI)
            add_compile_options(-mavx512vnni)
        endif()
    endif()
 else()
@ -256,7 +302,8 @@ endif()
 add_library(ggml OBJECT
            ggml.c
-            ggml.h)
+            ggml.h
            ${GGML_CUDA_SOURCES})
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
@ -278,6 +325,14 @@ if (BUILD_SHARED_LIBS)
    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()
 if (GGML_CUDA_SOURCES)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
 endif()
 #
 # programs, examples and tests
 #
@ -289,4 +344,5 @@ endif ()
 if (LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
    add_subdirectory(pocs)
 endif()
--- a/31
+++ b/31
@ -1,3 +1,6 @@
 # Define the default target now so that it is always the first target
 default: main quantize quantize-stats perplexity embedding vdot
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@ -36,7 +39,7 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =
 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
 # OS specific
@ -97,6 +100,13 @@ ifdef LLAMA_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
 	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS	+= ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	nvcc -arch=native -c -o $@ $<
 endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
@ -133,8 +143,6 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )
 default: main quantize quantize-stats perplexity embedding
 #
 # Build library
 #
@ -151,32 +159,35 @@ common.o: examples/common.cpp examples/common.h
 clean:
 	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
-main: examples/main/main.cpp ggml.o llama.o common.o
+main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-libllama.so: llama.o ggml.o
+vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 #
 # Tests
 #
-benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
+benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
 	./benchmark-q4_0-matmult
--- a/README.md
+++ b/README.md
@ -7,14 +7,19 @@
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 **Warnings**
 - `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized
 **Hot topics:**
 - [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
 - [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
 - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
 ## Description
-The main goal is to run the model using 4-bit quantization on a MacBook
+The main goal of llama.cpp is to run the llama model using 4-bit quantization on a MacBook.
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
@ -50,6 +55,7 @@ New features will probably be added mostly through community contributions.
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 **UI:**
@ -150,7 +156,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
 ## Usage
-Here are the step for the LLaMA-7B model.
+Here are the steps for the LLaMA-7B model.
 ### Get the Code
@ -208,8 +214,7 @@ When running the larger models, make sure you have enough disk space to store al
 ### Memory/Disk Requirements
-As the models are currently fully loaded into memory, you will need adequate disk space to save them
+As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
 and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
 | model | original size | quantized size (4-bit) |
 |-------|---------------|------------------------|
@ -221,18 +226,18 @@ and sufficient RAM to load them. At the moment, memory and disk requirements are
 ### Interactive mode
 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
-Here is an example few-shot interaction, invoked with the command
+Here is an example of a few-shot interaction, invoked with the command
 ```bash
-# default arguments using 7B model
+# default arguments using a 7B model
 ./examples/chat.sh
-# advanced chat with 13B model
+# advanced chat with a 13B model
 ./examples/chat-13B.sh
-# custom arguments using 13B model
+# custom arguments using a 13B model
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
@ -271,7 +276,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 - Obtain the `gpt4all-lora-quantized.bin` model
- It is distributed in the old `ggml` format which is now obsoleted
+- It is distributed in the old `ggml` format, which is now obsoleted
 - You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
 convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
@ -285,7 +290,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
+- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.**
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
 - Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
@ -297,29 +302,27 @@ convert the model from the old format to the new format with [./migrate-ggml-202
  `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS
- If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
+- If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
-  - LLaMA:
+- LLaMA:
-    - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
+- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
-    - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
+- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
-  - GPT-3
+- GPT-3
-    - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
+- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
-  - GPT-3.5 / InstructGPT / ChatGPT:
+- GPT-3.5 / InstructGPT / ChatGPT:
-    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
+- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
-    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
-### Perplexity (Measuring model quality)
+### Perplexity (measuring model quality)
-You can use the `perplexity` example to measure perplexity over the given prompt.  For more background,
+You can use the `perplexity` example to measure perplexity over the given prompt. For more background, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). However, in general, lower perplexity is better for LLMs.
 see https://huggingface.co/docs/transformers/perplexity.  However, in general, lower perplexity is better for LLMs.
 #### Latest measurements
-The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406).  `llama.cpp` is measuring very well
+The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running
 compared to the baseline implementations.  Quantization has a small negative impact to quality, but, as you can see, running
 13B at q4_0 beats the 7B f16 model by a significant amount.
-All measurements are done against wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
+All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
-Note that the changing the context length will have a significant impact on perplexity (longer context = better perplexity).
+Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity).
 ```
 Perplexity - model options
 5.5985 - 13B, q4_0
@ -361,7 +364,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
 #### Prerequisites
 * Docker must be installed and running on your system.
-* Create a folder to store big models & intermediate files (in ex. im using /llama/models)
+* Create a folder to store big models & intermediate files (ex. /llama/models)
 #### Images
 We have two Docker images available for this project:
@ -375,17 +378,17 @@ The easiest way to download the models, convert them to ggml and optimize them i
 Replace `/path/to/models` below with the actual path where you downloaded the models.
- ```bash
+```bash
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
 ```
-On complete, you are ready to play!
+On completion, you are ready to play!
 ```bash
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
-or with light image:
+or with a light image:
 ```bash
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
@ -406,7 +409,7 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
+- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 ### Docs
--- a/20
+++ b/20
@ -1,12 +1,27 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
 fcb7664c2e69776920b526362a243e912f73c36b1ec892eb354bab940f5edb5a  models/7B/ggml-model-q4_0.bin
 cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe  models/7B/ggml-model-q4_1.bin
 1bc7484c24a87612726d756f1761890e7acf5f412e23378577ce50fbe789b5b8  models/7B/ggml-model-q4_2.bin
 3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b  models/7B/ggml-model-q4_3.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
 4b69e4d6b6e3275230955997b90407fceca7e5ab3daf2e63a2c9e7270a8e1e3e  models/13B/ggml-model-q4_0.bin
 d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb  models/13B/ggml-model-q4_1.bin
 8d55a2077317ec9a928c7851d6a43e08e51f7e9e08360f2a7a7e1deefea3134f  models/13B/ggml-model-q4_2.bin
 4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3  models/13B/ggml-model-q4_3.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
 7a679908ce31c9d6ae2e38d6059bcd4d0ad3a870cd58cc1c8f7b36f2b2f51c73  models/30B/ggml-model-q4_0.bin
 7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd  models/30B/ggml-model-q4_1.bin
 2c82b4954a94a6a284f452f6011c1e4f0d20362c194a0b1eb5737f5fd8a20fb3  models/30B/ggml-model-q4_2.bin
 a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33  models/30B/ggml-model-q4_3.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
@ -16,5 +31,10 @@ e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770  models/65B/con
 a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/consolidated.05.pth
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
 c671fe1bce71499ac732ec999770ebe53ac486623a7891e42c9dfdb6962d2c64  models/65B/ggml-model-q4_0.bin
 4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f  models/65B/ggml-model-q4_1.bin
 4a145a210c56982389b1ed34387e0590c3e0d7325fa9be4f2284fe4d244a3633  models/65B/ggml-model-q4_2.bin
 305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c  models/65B/ggml-model-q4_3.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@ -0,0 +1,124 @@
 import json
 import os
 import re
 import struct
 import sys
 from typing import Any, BinaryIO, Dict, Sequence, TextIO
 import torch
 from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
 # Lookup table: HuggingFace sub-layer path -> ggml tensor name fragment.
 HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attention.wq",
    "self_attn.k_proj": "attention.wk",
    "self_attn.v_proj": "attention.wv",
    "self_attn.o_proj": "attention.wo",
    "mlp.gate_proj": "feed_forward.w1",
    "mlp.down_proj": "feed_forward.w2",
    "mlp.up_proj": "feed_forward.w3",
    "input_layernorm": "attention_norm",
    "post_attention_layernorm": "ffn_norm",
    # Entries below are intentionally left disabled.
    # "norm": "norm",
    # "embed_tokens": "tok_embeddings",
    # "lm_head": "output",
 }
 def translate_tensor_name(t: str) -> str:
    """Map a PEFT LoRA tensor name onto the ggml naming scheme.

    The input must look like ``...layers.<N>.<a>.<b>.lora_(A|B).weight``,
    where ``<a>.<b>`` is a key of ``HF_SUBLAYER_TO_GGML``. The result has the
    form ``layers.<N>.<ggml name>.weight.lora(A|B)``.

    Prints an error and exits the process with status 1 when the name does
    not match the pattern or the sub-layer is not in the table.
    """
    # The pattern only accepts a two-component dotted sub-layer name.
    parsed = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
    if not parsed:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)
    layer_index, hf_sub_layer, lora_kind = parsed.groups()
    ggml_sub_layer = HF_SUBLAYER_TO_GGML.get(hf_sub_layer)
    if ggml_sub_layer is None:
        print(f"Error: unrecognized sub-layer {hf_sub_layer} in tensor {t}")
        sys.exit(1)
    return f"layers.{layer_index}.{ggml_sub_layer}.weight.lora{lora_kind}"
 def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None:
    """Write the fixed-size header of a ggml LoRA adapter file.

    Args:
        fout: Writable *binary* stream; only ``bytes`` are written to it.
        params: Adapter configuration. The integer entries ``"r"`` and
            ``"lora_alpha"`` are read.

    Raises:
        KeyError: if ``"r"`` or ``"lora_alpha"`` is missing from ``params``.
        struct.error: if either value cannot be packed as an ``int``.
    """
    # The magic is stored byte-reversed, i.e. the file starts with b"algg".
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
    # "i"/"ii" use the platform's native int size and byte order.
    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
 def write_tensor_header(
    fout: BinaryIO, name: str, shape: Sequence[int], data_type: DataType
 ) -> None:
    """Write one tensor's header and pad the stream to a 32-byte boundary.

    The first parameter used to be called ``self`` while the body wrote to
    ``fout``, so the function only worked through a module-level ``fout``.
    It now writes to the stream it is given.

    Args:
        fout: Writable, seekable binary stream.
        name: Tensor name; stored UTF-8 encoded.
        shape: Tensor dimensions; written in reversed order.
        data_type: Key into ``NUMPY_TYPE_TO_DATA_TYPE``.
            NOTE(review): the caller passes a numpy dtype (``t.dtype``), so
            the ``DataType`` annotation looks inaccurate - confirm.

    Raises:
        KeyError: if ``data_type`` has no entry in the lookup tables.
    """
    sname = name.encode("utf-8")
    # Fixed part: number of dimensions, name length in bytes, ggml ftype id.
    fout.write(
        struct.pack(
            "iii",
            len(shape),
            len(sname),
            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
        )
    )
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)
    # Round the position up to the next multiple of 32 so the tensor data
    # that follows starts on a 32-byte boundary.
    fout.seek((fout.tell() + 31) & -32)
 # Script entry: convert a HuggingFace PEFT LoRA adapter directory into a
 # single ggml adapter file written next to the inputs.
 # Expects exactly one command-line argument: the adapter directory.
 if len(sys.argv) != 2:
    print(f"Usage: python {sys.argv[0]} <path>")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
    sys.exit(1)
 # Input and output files all live in the directory given on the command line.
 input_json = os.path.join(sys.argv[1], "adapter_config.json")
 input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
 # NOTE(review): torch.load unpickles the file, which can execute arbitrary
 # code - only run this on adapter files from a trusted source.
 model = torch.load(input_model, map_location="cpu")
 with open(input_json, "r") as f:
    params = json.load(f)
 # Reject adapter configurations this converter does not handle.
 if params["peft_type"] != "LORA":
    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
    sys.exit(1)
 if params["fan_in_fan_out"] == True:
    print("Error: param fan_in_fan_out is not supported")
    sys.exit(1)
 if params["bias"] is not None and params["bias"] != "none":
    print("Error: param bias is not supported")
    sys.exit(1)
 # TODO: these seem to be layers that have been trained but without lora.
 # doesn't seem widely used but eventually should be supported
 if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
    print("Error: param modules_to_save is not supported")
    sys.exit(1)
 # Write the file header, then one header + raw data record per tensor.
 with open(output_path, "wb") as fout:
    # Mode "wb" already truncates the file, so this call is redundant.
    fout.truncate()
    write_file_header(fout, params)
    for k, v in model.items():
        if k.endswith("lora_A.weight"):
            # lora_A tensors stay float16/float32 (anything else becomes
            # float32) and are written transposed.
            if v.dtype != torch.float16 and v.dtype != torch.float32:
                v = v.float()
            v = v.T
        else:
            # All other tensors are converted to float32 and not transposed.
            v = v.float()
        t = v.numpy()
        tname = translate_tensor_name(k)
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        # Raw tensor bytes follow the padded header.
        t.tofile(fout)
 print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert.py
+++ b/convert.py
@ -1085,6 +1085,7 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
    namestr = {
        GGMLFileType.AllF32: "f32",
        GGMLFileType.MostlyF16: "f16",
        GGMLFileType.MostlyQ4_0: "q4_0",
        GGMLFileType.MostlyQ4_1: "q4_1",
        GGMLFileType.PerLayerIsQ4_1: "q4_1",
    }[params.file_type]
@ -1108,7 +1109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)")
+    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.model = argv[i];
        } else if (arg == "--lora") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.lora_adapter = argv[i];
            params.use_mmap = false;
        } else if (arg == "--lora-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.lora_base = argv[i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "--embedding") {
@ -243,6 +256,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    }
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
--- a/examples/common.h
+++ b/examples/common.h
@ -31,11 +31,12 @@ struct gpt_params {
    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
-    std::string input_prefix = ""; // string to prefix user inputs with
+    std::string input_prefix = "";       // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base = "";     // base model path for the lora adapter
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -114,6 +114,17 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.lora_adapter.empty()) {
        int err = llama_apply_lora_from_file(ctx,
                                             params.lora_adapter.c_str(),
                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                             params.n_threads);
        if (err != 0) {
            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
            return 1;
        }
    }
    // print system information
    {
        fprintf(stderr, "\n");
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -134,6 +134,17 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.lora_adapter.empty()) {
        int err = llama_apply_lora_from_file(ctx,
                                             params.lora_adapter.c_str(),
                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                             params.n_threads);
        if (err != 0) {
            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
            return 1;
        }
    }
    // print system information
    {
        fprintf(stderr, "\n");
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -15,6 +15,8 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include <thread>
 #include <mutex>
 struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
@ -27,7 +29,6 @@ struct quantize_stats_params {
    std::vector<enum ggml_type> include_types;
 };
 const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
    stats.num_samples += nelements;
 }
 // Fold the statistics collected in `from` into `into`: histogram buckets,
 // total error and sample count are summed, the maximum error is the larger
 // of the two.
 void combine_error_stats(error_stats & into, const error_stats & from) {
    // bucket-wise sum of the error histograms
    for (size_t bucket = 0; bucket < HISTOGRAM_BUCKETS; ++bucket) {
        into.error_histogram[bucket] += from.error_histogram[bucket];
    }

    // keep whichever maximum is larger
    if (into.max_error < from.max_error) {
        into.max_error = from.max_error;
    }

    into.total_error += from.total_error;
    into.num_samples += from.num_samples;
 }
 double find_quantile(const error_stats & stats, double quantile) {
    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
@ -130,6 +138,36 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 // Quantize `chunk_size` elements of `layer` starting at element `offset`,
 // dequantize them again and add the resulting round-trip error to `stats`.
 //
 // For F16 layers the elements are first converted into `input_scratch`;
 // for all other layers the layer's own f32 data is read in place and
 // `input_scratch` is not touched. `use_reference` selects
 // quantize_row_q_reference instead of quantize_row_q.
 void test_roundtrip_on_chunk(
        const ggml_tensor * layer,
        int64_t offset,
        int64_t chunk_size,
        const quantize_fns_t & qfns,
        bool use_reference,
        float * input_scratch,
        char * quantized_scratch,
        float * output_scratch,
        error_stats & stats) {
    // pick the f32 source for this chunk
    float * src = input_scratch;
    if (layer->type != GGML_TYPE_F16) {
        src = ggml_get_data_f32(layer) + offset;
    } else {
        for (int idx = 0; idx < chunk_size; ++idx) {
            src[idx] = ggml_get_f32_1d(layer, idx + offset);
        }
    }

    // f32 -> quantized
    if (!use_reference) {
        qfns.quantize_row_q(src, quantized_scratch, chunk_size);
    } else {
        qfns.quantize_row_q_reference(src, quantized_scratch, chunk_size);
    }

    // quantized -> f32, then compare against the source values
    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
    update_error_stats(chunk_size, src, output_scratch, stats);
 }
 // Run quantization function for a single layer and update error stats
 void test_roundtrip_on_layer(
        std::string & name,
@ -137,40 +175,61 @@ void test_roundtrip_on_layer(
        const quantize_fns_t & qfns,
        bool use_reference,
        const ggml_tensor * layer,
-        float * input_scratch,
+        std::vector<float> & input_scratch,
-        char *quantized_scratch,
+        std::vector<char> & quantized_scratch,
-        float * output_scratch,
+        std::vector<float> & output_scratch,
-        error_stats & total_error) {
+        error_stats & total_error,
        int max_thread = 0) {
    assert(tensor_is_contiguous(layer));
    error_stats layer_error {};
-    int64_t nelements = ggml_nelements(layer);
+    uint64_t nelements = ggml_nelements(layer);
-    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
+    float* input_scratch_ptr = nullptr;
-        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+    if (layer->type == GGML_TYPE_F16) {
-
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
-        if (layer->type == GGML_TYPE_F16) {
+        input_scratch_ptr = input_scratch.data();
            for (int i = 0; i < chunk_size; i++) {
                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
            }
        } else {
            input_scratch = ggml_get_data_f32(layer) + offset;
        }
        if (use_reference) {
            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
        } else {
            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
        }
        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
        if (print_layer_stats) {
            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
        }
    }
    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
    int chunk_size = 32*512;
    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
    if (num_chunks < 2 || max_thread < 2) {
        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
                output_scratch.data(), print_layer_stats ? layer_error : total_error);
    } else {
        auto & stats = print_layer_stats ? layer_error : total_error;
        std::mutex mutex;
        uint64_t counter = 0;
        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
             &quantized_scratch, &output_scratch, chunk_size] () {
            error_stats local_stats {};
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                uint64_t offset = counter; counter += chunk_size;
                if (offset >= nelements) {
                    combine_error_stats(stats, local_stats);
                    break;
                }
                lock.unlock();
                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
            }
        };
        int nthread = std::min(num_chunks, max_thread);
        std::vector<std::thread> workers(nthread-1);
        for (auto& w : workers) w = std::thread(compute);
        compute();
        for (auto& w : workers) w.join();
    }
    if (print_layer_stats) {
        print_error_stats(name, layer_error, false);
        combine_error_stats(total_error, layer_error);
    }
 }
@ -181,6 +240,7 @@ int main(int argc, char ** argv) {
    // read command line
    int max_thread = 0;
    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
@ -221,7 +281,7 @@ int main(int argc, char ** argv) {
                break;
            }
            int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) {
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                // find match
            }
            if (j < GGML_TYPE_COUNT) {
@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                invalid_param = true;
            }
        } else if (arg == "-n" || arg == "--num-threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            max_thread = atoi(argv[i]);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            quantize_stats_print_usage(argc, argv);
@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
    }
    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
    // allocate scratch space
-    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> input_scratch;
-    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
+    std::vector<char> quantized_scratch;
-    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> output_scratch;
    // loop throught quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
                        qfns,
                        params.reference,
                        kv_tensor.second,
-                        input_scratch.data(),
+                        input_scratch,
-                        quantized_scratch.data(),
+                        quantized_scratch,
-                        output_scratch.data(),
+                        output_scratch,
-                        global_stats
+                        global_stats,
                        max_thread
                );
            }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -10,10 +10,12 @@
 int main(int argc, char ** argv) {
    ggml_time_init();
-    if (argc != 4) {
+    if (argc < 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
        return 1;
    }
@ -28,6 +30,7 @@ int main(int argc, char ** argv) {
    const std::string fname_out = argv[2];
    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
    int nthread = argc > 4 ? atoi(argv[4]) : 0;
    const int64_t t_main_start_us = ggml_time_us();
@ -37,7 +40,7 @@ int main(int argc, char ** argv) {
    {
        const int64_t t_start_us = ggml_time_us();
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -0,0 +1,154 @@
 #include <stdint.h>
 #include <cuda_fp16.h>
 #include "ggml-cuda.h"
 typedef uint16_t ggml_fp16_t;
 static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 // Number of values held by one q4_0 block.
 #define QK4_0 32
 // One q4_0 block: a float scale followed by QK4_0 4-bit quants packed two per
 // byte. dequantize_block_q4_0 expands each quant q to (q - 8)*d.
 typedef struct {
    float   d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
 // The struct must contain no padding beyond its two members.
 static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
 // Number of values held by one q4_1 block.
 #define QK4_1 32
 // One q4_1 block: float scale and float minimum followed by QK4_1 4-bit
 // quants packed two per byte. dequantize_block_q4_1 expands each quant q to
 // q*d + m.
 typedef struct {
    float   d;              // delta
    float   m;              // min
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
 // The struct must contain no padding beyond its three members.
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
 // Number of values held by one q4_2 block.
 #define QK4_2 16
 // One q4_2 block: a half-precision scale followed by QK4_2 4-bit quants
 // packed two per byte. dequantize_block_q4_2 expands each quant q to
 // (q - 8)*d.
 typedef struct {
    __half  d;              // delta
    uint8_t qs[QK4_2 / 2];  // nibbles / quants
 } block_q4_2;
 // The struct must contain no padding beyond its two members.
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 // Number of values held by one q4_3 block.
 #define QK4_3 16
 // One q4_3 block: half-precision scale and minimum followed by QK4_3 4-bit
 // quants packed two per byte. dequantize_block_q4_3 expands each quant q to
 // q*d + m.
 typedef struct {
    __half  d;         // delta
    __half  m;         // min
    uint8_t qs[QK4_3 / 2]; // nibbles / quants
 } block_q4_3;
 // The struct must contain no padding beyond its three members.
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 // Dequantize the q4_0 block selected by blockIdx.x.
 // vx points to an array of block_q4_0; the QK4_0 reconstructed floats of
 // block ib are written to y[ib*QK4_0 .. ib*QK4_0 + QK4_0 - 1].
 // Each 4-bit quant q becomes (q - 8)*d.
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
    const int ib = blockIdx.x;
    const block_q4_0 & blk = ((const block_q4_0 *) vx)[ib];
    float * dst = y + ib*QK4_0;

    const float scale = blk.d;

    // every byte carries two quants: low nibble first, high nibble second
    for (int j = 0; j < QK4_0/2; ++j) {
        const uint8_t packed = blk.qs[j];
        const int8_t lo = packed & 0xf;
        const int8_t hi = packed >> 4;
        dst[2*j + 0] = (lo - 8)*scale;
        dst[2*j + 1] = (hi - 8)*scale;
    }
 }
 // Dequantize the q4_1 block selected by blockIdx.x.
 // vx points to an array of block_q4_1; the QK4_1 reconstructed floats of
 // block ib are written to y[ib*QK4_1 .. ib*QK4_1 + QK4_1 - 1].
 // Each 4-bit quant q becomes q*d + m.
 static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
    const int ib = blockIdx.x;
    const block_q4_1 & blk = ((const block_q4_1 *) vx)[ib];
    float * dst = y + ib*QK4_1;

    const float scale = blk.d;
    const float base  = blk.m;

    // every byte carries two quants: low nibble first, high nibble second
    for (int j = 0; j < QK4_1/2; ++j) {
        const uint8_t packed = blk.qs[j];
        const int8_t lo = packed & 0xf;
        const int8_t hi = packed >> 4;
        dst[2*j + 0] = lo*scale + base;
        dst[2*j + 1] = hi*scale + base;
    }
 }
 // Dequantize the q4_2 block selected by blockIdx.x.
 // vx points to an array of block_q4_2; the QK4_2 reconstructed floats of
 // block ib are written to y[ib*QK4_2 .. ib*QK4_2 + QK4_2 - 1].
 // The half-precision scale is converted to float once; each 4-bit quant q
 // becomes (q - 8)*d.
 static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
    const int ib = blockIdx.x;
    const block_q4_2 & blk = ((const block_q4_2 *) vx)[ib];
    float * dst = y + ib*QK4_2;

    const float scale = blk.d;

    // every byte carries two quants: low nibble first, high nibble second
    for (int j = 0; j < QK4_2/2; ++j) {
        const uint8_t packed = blk.qs[j];
        const int8_t lo = packed & 0xf;
        const int8_t hi = packed >> 4;
        dst[2*j + 0] = (lo - 8)*scale;
        dst[2*j + 1] = (hi - 8)*scale;
    }
 }
 // Dequantize the q4_3 block selected by blockIdx.x.
 // vx points to an array of block_q4_3; the QK4_3 reconstructed floats of
 // block ib are written to y[ib*QK4_3 .. ib*QK4_3 + QK4_3 - 1].
 // The half-precision scale and minimum are converted to float once; each
 // 4-bit quant q becomes q*d + m.
 static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
    const int ib = blockIdx.x;
    const block_q4_3 & blk = ((const block_q4_3 *) vx)[ib];
    float * dst = y + ib*QK4_3;

    const float scale = blk.d;
    const float base  = blk.m;

    // every byte carries two quants: low nibble first, high nibble second
    for (int j = 0; j < QK4_3/2; ++j) {
        const uint8_t packed = blk.qs[j];
        const int8_t lo = packed & 0xf;
        const int8_t hi = packed >> 4;
        dst[2*j + 0] = lo*scale + base;
        dst[2*j + 1] = hi*scale + base;
    }
 }
 // Host-side entry points with C linkage. Each one launches the matching
 // dequantize_block_* kernel on `stream` with one single-thread CUDA block per
 // quantized block, i.e. k / QK blocks for k values (integer division).
 extern "C" {
    // q4_0: k values from vx -> y
    __host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
        const int n_blocks = k / QK4_0;
        dequantize_block_q4_0<<<n_blocks, 1, 0, stream>>>(vx, y);
    }

    // q4_1: k values from vx -> y
    __host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
        const int n_blocks = k / QK4_1;
        dequantize_block_q4_1<<<n_blocks, 1, 0, stream>>>(vx, y);
    }

    // q4_2: k values from vx -> y
    __host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
        const int n_blocks = k / QK4_2;
        dequantize_block_q4_2<<<n_blocks, 1, 0, stream>>>(vx, y);
    }

    // q4_3: k values from vx -> y
    __host__ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
        const int n_blocks = k / QK4_3;
        dequantize_block_q4_3<<<n_blocks, 1, 0, stream>>>(vx, y);
    }
 }
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -0,0 +1,12 @@
 // C-linkage declarations of the CUDA dequantization launchers.
 // NOTE(review): cudaStream_t is not declared in this header; includers
 // presumably pull in the CUDA runtime header first - confirm.
 #ifdef  __cplusplus
 extern "C" {
 #endif
 // Each function takes quantized input `vx`, a float output buffer `y`, the
 // number of values `k`, and the CUDA stream the work is issued on.
 void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -204,7 +204,9 @@ enum ggml_type {
    GGML_TYPE_F16  = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
-    GGML_TYPE_Q8_0 = 4,
+    GGML_TYPE_Q4_2 = 4,
    GGML_TYPE_Q4_3 = 5,
    GGML_TYPE_Q8_0 = 6,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@ -359,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 bool ggml_is_quantized(enum ggml_type type);
 struct ggml_context * ggml_init(struct ggml_init_params params);
 void ggml_free(struct ggml_context * ctx);
@ -430,6 +434,12 @@ struct ggml_tensor * ggml_add(
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 struct ggml_tensor * ggml_add_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
 struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@ -620,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(
 // rotary position embedding
 // in-place, returns view(a)
-// if mode == 1, skip n_past elements
+// if mode & 1 == 1, skip n_past elements
 // if (mode & 2) != 0, GPT-NeoX style
 // TODO: avoid creating a new tensor every time
 struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
@ -800,6 +811,10 @@ enum ggml_opt_result ggml_opt(
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 //
 // system info
@ -808,6 +823,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
 int ggml_cpu_has_avx512_vbmi(void);
 int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
@ -815,6 +832,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
 int ggml_cpu_has_cublas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
--- a/llama.cpp
+++ b/llama.cpp
@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #include <cstdint>
 #include <cstdio>
 #endif
 #include "llama_util.h"
@ -22,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
 #include <thread>
 #include <atomic>
 #include <mutex>
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@ -42,35 +47,51 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
-    { MODEL_7B,    512ull*MB },
+{
-    { MODEL_13B,   512ull*MB },
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
-    { MODEL_30B,   512ull*MB },
+        { MODEL_7B,    512ull * MB },
-    { MODEL_65B,   512ull*MB },
+        { MODEL_13B,   512ull * MB },
-};
+        { MODEL_30B,   512ull * MB },
        { MODEL_65B,   512ull * MB },
    };
    return _MEM_REQ_SCRATCH0;
 }
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
-    { MODEL_7B,    512ull*MB },
+{
-    { MODEL_13B,   512ull*MB },
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
-    { MODEL_30B,   512ull*MB },
+        { MODEL_7B,    512ull * MB },
-    { MODEL_65B,   512ull*MB },
+        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
        { MODEL_65B,   512ull * MB },
    };
    return _MEM_REQ_SCRATCH1;
 };
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
-    { MODEL_7B,   1026ull*MB },
+{
-    { MODEL_13B,  1608ull*MB },
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
-    { MODEL_30B,  3124ull*MB },
+        { MODEL_7B,   1026ull * MB },
-    { MODEL_65B,  5120ull*MB },
+        { MODEL_13B,  1608ull * MB },
        { MODEL_30B,  3124ull * MB },
        { MODEL_65B,  5120ull * MB },
    };
    return _MEM_REQ_KV_SELF;
 };
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
-    { MODEL_7B,   768ull*MB },
+{
-    { MODEL_13B, 1024ull*MB },
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
-    { MODEL_30B, 1280ull*MB },
+        { MODEL_7B,   768ull * MB },
-    { MODEL_65B, 1536ull*MB },
+        { MODEL_13B, 1024ull * MB },
        { MODEL_30B, 1280ull * MB },
        { MODEL_65B, 1536ull * MB },
    };
    return _MEM_REQ_EVAL;
 };
 // default hparams (LLaMA 7B)
@ -460,6 +481,8 @@ struct llama_file_loader {
                case GGML_TYPE_F16:
                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q4_2:
                case GGML_TYPE_Q4_3:
                    break;
                default: {
                    throw format("unrecognized tensor type %u\n", shard.type);
@ -532,6 +555,8 @@ struct llama_file_saver {
            case GGML_TYPE_F16:
            case GGML_TYPE_Q4_0:
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q4_2:
            case GGML_TYPE_Q4_3:
                break;
            default: LLAMA_ASSERT(false);
        }
@ -617,6 +642,7 @@ struct llama_model_loader {
            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
        }
        return get_tensor_for(lt);
    }
@ -819,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                      return "mostly Q4_1, some F16";
        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
        default:                      return "unknown, may not work";
    }
 }
@ -899,13 +927,13 @@ static void llama_model_load_internal(
        const size_t mem_required =
            ctx_size +
            mmapped_size +
-            MEM_REQ_SCRATCH0.at(model.type) +
+            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1.at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL.at    (model.type);
+            MEM_REQ_EVAL().at(model.type);
        // this is the memory required by one llama_state
        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF.at(model.type);
+            scale*MEM_REQ_KV_SELF().at(model.type);
        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@ -1047,7 +1075,7 @@ static bool llama_eval_internal(
    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, tokens, N*ggml_element_size(embd));
@ -1547,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
    ggml_type quantized_type;
    switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
        default: throw format("invalid output file type %d\n", ftype);
    };
    if (nthread <= 0) {
        nthread = std::thread::hardware_concurrency();
    }
    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                            /*vocab_only*/ false));
    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@ -1563,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    size_t total_size_new = 0;
    std::vector<int64_t> hist_all(1 << 4, 0);
    std::vector<std::thread> workers;
    std::mutex mutex;
    size_t idx = 0;
    for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
        llama_buffer read_data;
@ -1581,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // quantize only 2D tensors
        quantize &= (tensor.ne.size() == 2);
        // GG: uncomment this to keep the output layer in FP16
        //if (tensor.name.rfind("output")) {
        //    quantize = false;
        //}
        enum ggml_type new_type;
        void * new_data;
        size_t new_size;
@ -1616,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            new_data = work.addr;
            std::vector<int64_t> hist_cur(1 << 4, 0);
-            switch (new_type) {
+            int chunk_size = 32 * 512;
-                case GGML_TYPE_Q4_0:
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
-                    {
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+            if (nthread_use < 2) {
-                    } break;
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
-                case GGML_TYPE_Q4_1:
+            } else {
-                    {
+                size_t counter = 0;
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                new_size = 0;
-                    } break;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
-                default:
+                    std::vector<int64_t> local_hist;
-                    LLAMA_ASSERT(false);
+                    size_t local_size = 0;
                    while (true) {
                        std::unique_lock<std::mutex> lock(mutex);
                        size_t first = counter; counter += chunk_size;
                        if (first >= nelements) {
                            if (!local_hist.empty()) {
                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
                                new_size += local_size;
                            }
                            break;
                        }
                        lock.unlock();
                        size_t last = std::min(nelements, first + chunk_size);
                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                    }
                };
                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
                compute();
                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
            }
            printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@ -1732,10 +1794,10 @@ struct llama_context * llama_init_from_file(
            ctx->embedding.resize(hparams.n_embd);
        }
-        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }
    return ctx;
@ -1748,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
-  enum llama_ftype   ftype) {
+  enum llama_ftype   ftype,
        int          nthread) {
    try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
        return 0;
    } catch (const std::string & err) {
        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@ -1758,6 +1821,254 @@ int llama_model_quantize(
    }
 }
 int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
    auto & model = ctx->model;
    const int64_t t_start_lora_us = ggml_time_us();
    auto fin = std::ifstream(path_lora, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
        return 1;
    }
    // verify magic and version
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic != 'ggla') {
            fprintf(stderr, "%s: bad file magic\n", __func__);
            return 1;
        }
        uint32_t format_version;
        fin.read((char *) &format_version, sizeof(format_version));
        if (format_version != 1) {
            fprintf(stderr, "%s: unsupported file version\n", __func__ );
            return 1;
        }
    }
    int32_t lora_r;
    int32_t lora_alpha;
    fin.read((char *) &lora_r, sizeof(lora_r));
    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
    float scaling = (float)lora_alpha / (float)lora_r;
    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
    // create a temporary ggml context to store the lora tensors
    // todo: calculate size from biggest possible tensor
    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
    struct ggml_init_params params;
    params.mem_size   = lora_buf.size();
    params.mem_buffer = lora_buf.data();
    params.no_alloc   = false;
    ggml_context * lora_ctx = ggml_init(params);
    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
    // create a name -> tensor map of the model to accelerate lookups
    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
    for (auto & kv: model.tensors_by_name) {
        model_tensors.insert(kv);
    }
    // load base model
    std::unique_ptr<llama_model_loader> model_loader;
    ggml_context * base_ctx = NULL;
    llama_buffer base_buf;
    if (path_base_model) {
        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
        size_t ctx_size, mmapped_size;
        model_loader->calc_sizes(&ctx_size, &mmapped_size);
        base_buf.resize(ctx_size);
        ggml_init_params base_params;
        base_params.mem_size   = base_buf.size;
        base_params.mem_buffer = base_buf.addr;
        base_params.no_alloc   = model_loader->use_mmap;
        base_ctx = ggml_init(base_params);
        model_loader->ggml_ctx = base_ctx;
        // maybe this should in llama_model_loader
        if (model_loader->use_mmap) {
            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
        }
    }
    // read tensors and apply
    bool warned = false;
    int n_tensors = 0;
    while (true) {
        int32_t n_dims;
        int32_t length;
        int32_t ftype;
        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
        if (fin.eof()) {
            break;
        }
        int32_t ne[2] = { 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
        }
        std::string name(length, 0);
        fin.read(&name[0], length);
        // check for lora suffix and get the type of tensor
        const std::string lora_suffix = ".lora";
        size_t pos = name.rfind(lora_suffix);
        if (pos == std::string::npos) {
            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
            return 1;
        }
        std::string lora_type = name.substr(pos + lora_suffix.length());
        std::string base_name = name;
        base_name.erase(pos);
        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
            return 1;
        }
        // create ggml tensor
        ggml_type wtype;
        switch (ftype) {
            case 0: wtype = GGML_TYPE_F32;  break;
            case 1: wtype = GGML_TYPE_F16;  break;
            default:
                    {
                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
                                __func__, ftype);
                        return false;
                    }
        }
        ggml_tensor* lora_tensor;
        if (n_dims == 2) {
            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
        }
        else {
            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
            return 1;
        }
        // load tensor data
        size_t offset = fin.tellg();
        size_t tensor_data_size = ggml_nbytes(lora_tensor);
        offset = (offset + 31) & -32;
        fin.seekg(offset);
        fin.read((char*)lora_tensor->data, tensor_data_size);
        lora_tensors[name] = lora_tensor;
        // check if we have both A and B tensors and apply
        if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
            ggml_tensor * dest_t = model_tensors[base_name];
            ggml_tensor * base_t;
            if (model_loader) {
                // load from base model
                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                    return 1;
                }
                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
                lt.data = (uint8_t *) lt.ggml_tensor->data;
                model_loader->load_data_for(lt);
                lt.ggml_tensor->data = lt.data;
            }
            else {
                base_t = dest_t;
            }
            if (ggml_is_quantized(base_t->type)) {
                if (!warned) {
                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                                    "use a f16 or f32 base model with --lora-base\n", __func__);
                    warned = true;
                }
            }
            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
                               " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
                return 1;
            }
            // w = w + BA*s
            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
            if (scaling != 1.0f) {
                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
                BA = ggml_scale(lora_ctx, BA, scale_tensor);
            }
            ggml_tensor * r;
            if (base_t == dest_t) {
                r = ggml_add_inplace(lora_ctx, dest_t, BA);
            }
            else {
                r = ggml_add(lora_ctx, base_t, BA);
                r = ggml_cpy(lora_ctx, r, dest_t);
            }
            struct ggml_cgraph gf = ggml_build_forward(r);
            gf.n_threads = n_threads;
            ggml_graph_compute(lora_ctx, &gf);
            // we won't need these tensors again, reset the context to save memory
            ggml_free(lora_ctx);
            lora_ctx = ggml_init(params);
            lora_tensors.clear();
            n_tensors++;
            if (n_tensors % 4 == 0)
                fprintf(stderr, ".");
        }
    }
    // TODO: this should be in a destructor, it will leak on failure
    ggml_free(lora_ctx);
    if (base_ctx) {
        ggml_free(base_ctx);
    }
    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
    return 0;
 }
 // Public wrapper around llama_apply_lora_from_file_internal: forwards all arguments
 // and converts a thrown std::string error into a message on stderr plus the
 // return value 1. Returns whatever the internal function returns otherwise.
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
    int status = 1;
    try {
        status = llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
    } catch (const std::string & err) {
        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
        status = 1;
    }
    return status;
 }
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
@ -1915,18 +2226,20 @@ const char * llama_print_system_info(void) {
    static std::string s;
    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
+    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
+    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
+    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
+    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
+    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
+    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
+    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
    return s.c_str();
 }
--- a/llama.h
+++ b/llama.h
@ -72,6 +72,8 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
    };
    LLAMA_API struct llama_context_params llama_context_default_params();
@ -91,10 +93,24 @@ extern "C" {
    // TODO: not great API - very likely to change
    // Returns 0 on success
    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-      enum llama_ftype   ftype);
+      enum llama_ftype   ftype,
            int          nthread);
    // Apply a LoRA adapter to a loaded model
    // path_base_model is the path to a higher quality model to use as a base for
    // the layers modified by the adapter. Can be NULL to use the current loaded model.
    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
    // will be applied on top of the previous one
    // Returns 0 on success
    LLAMA_API int llama_apply_lora_from_file(
            struct llama_context * ctx,
                      const char * path_lora,
                      const char * path_base_model,
                             int   n_threads);
    // Returns the KV cache that will contain the context for the
    // ongoing prediction with the model.
--- a/llama_util.h
+++ b/llama_util.h
@ -168,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
@ -180,10 +180,12 @@ struct llama_mmap {
            throw format("mmap failed: %s", strerror(errno));
        }
-        // Advise the kernel to preload the mapped memory
+        if (prefetch) {
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
+            // Advise the kernel to preload the mapped memory
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
-                    strerror(errno));
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }
    }
@ -193,14 +195,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;
        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();
        CloseHandle(hFile);
        if (hMapping == NULL) {
            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@ -215,13 +216,15 @@ struct llama_mmap {
        }
        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
+        if (prefetch) {
-        WIN32_MEMORY_RANGE_ENTRY range;
+            // Advise the kernel to preload the mapped memory
-        range.VirtualAddress = addr;
+            WIN32_MEMORY_RANGE_ENTRY range;
-        range.NumberOfBytes = (SIZE_T)size;
+            range.VirtualAddress = addr;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+            range.NumberOfBytes = (SIZE_T)size;
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                    llama_format_win_err(GetLastError()).c_str());
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
            }
        }
        #else
        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
--- a/pocs/CMakeLists.txt
+++ b/pocs/CMakeLists.txt
@ -0,0 +1,12 @@
 # dependencies
 find_package(Threads REQUIRED)
 # third-party
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 # the proof-of-concept programs are skipped for Emscripten builds
 if (NOT EMSCRIPTEN)
    add_subdirectory(vdot)
 endif()
--- a/pocs/vdot/CMakeLists.txt
+++ b/pocs/vdot/CMakeLists.txt
@ -0,0 +1,4 @@
 # Executable target "vdot", built from the single source file vdot.cpp
 set(TARGET vdot)
 add_executable(${TARGET} vdot.cpp)
 # link against the common and llama targets and the thread library found by find_package(Threads)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 # require at least C++11 for this target
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@ -0,0 +1,305 @@
 #include <cstdio>
 #include <vector>
 #include <random>
 #include <chrono>
 #include <cstdlib>
 #include <cmath>
 #include <cassert>
 #include <cstring>
 #include <array>
 #include <ggml.h>
 // Number of floats in each of the two test vectors (1 << 18 = 262144)
 constexpr int kVecSize = 1 << 18;
 // Returns one sample computed with the Box-Muller transform from two draws of `rndm`.
 // Every transform produces two values: the cosine one is returned right away, the
 // sine one is kept in function-local statics and returned by the following call,
 // which therefore does not advance `rndm`.
 float drawFromGaussianPdf(std::mt19937& rndm) {
    // maps a raw generator output onto [0, 1)
    constexpr double kUnit = 1./(1. + std::mt19937::max());
    // maps a raw generator output onto [0, 2*pi)
    constexpr double kAngleUnit = 6.28318530717958647692*kUnit;
    static float cachedSample;
    static bool  cacheFilled = false;
    if (!cacheFilled) {
        // 1 - u lies in (0, 1], so the logarithm is always finite
        const auto radius = sqrt(-2*log(1 - kUnit*rndm()));
        const auto angle  = kAngleUnit * rndm();
        cachedSample = radius*sin(angle);
        cacheFilled  = true;
        return radius*cos(angle);
    }
    cacheFilled = false;
    return cachedSample;
 }
 // Overwrites every element of `values` with `mean` plus a fresh sample from
 // drawFromGaussianPdf(rndm), in index order.
 void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
    for (size_t idx = 0; idx < values.size(); ++idx) {
        values[idx] = mean + drawFromGaussianPdf(rndm);
    }
 }
 // Copy-pasted from ggml.c
 // One q4_0 block: a float scale followed by QK4_0/2 bytes of packed nibbles.
 #define QK4_0 32
 struct block_q4_0 {
    float   d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
 };
 static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
 // One q4_1 block: a float scale, a float minimum and QK4_1/2 bytes of packed nibbles.
 #define QK4_1 32
 struct block_q4_1 {
    float   d;              // delta
    float   m;              // min
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
 };
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
 // Copy-pasted from ggml.c
 // One q8_0 block: a float scale followed by QK8_0 signed 8-bit quants.
 #define QK8_0 32
 struct block_q8_0 {
    float   d;          // delta
    int8_t  qs[QK8_0];  // quants
 };
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 // "Scalar" dot product between the quantized vector x and float vector y
 // n - number of q4_0 blocks in x
 // x - the n quantized blocks
 // y - the float vector; 32 floats are consumed per block
 // Each byte of a block holds two quants: the low nibble pairs with the even float,
 // the high nibble with the following odd float. A nibble v stands for v - 8.
 // The bytes are read directly; reading them through a uint32_t pointer (as an
 // earlier version did) violates the strict aliasing rule.
 inline double dot(int n, const block_q4_0* x, const float* y) {
    const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
    double sum = 0;
    for (int i=0; i<n; ++i) {
        const float d = x->d;
        const uint8_t* q = x->qs;
        float s = 0;
        // 4 groups of 4 bytes, i.e. 8 quants / floats per group
        for (int k=0; k<4; ++k) {
            s += y[0]*kValues[q[0] & 0xf] + y[1]*kValues[q[0] >> 4] +
                 y[2]*kValues[q[1] & 0xf] + y[3]*kValues[q[1] >> 4] +
                 y[4]*kValues[q[2] & 0xf] + y[5]*kValues[q[2] >> 4] +
                 y[6]*kValues[q[3] & 0xf] + y[7]*kValues[q[3] >> 4];
            y += 8; q += 4;
        }
        // the per-block sum is accumulated in float and scaled by the block delta
        sum += s*d;
        ++x;
    }
    return sum;
 }
 // Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
 // but about the same on X86_64 (Ryzen 7950X CPU).
 // Instead of splitting each byte into nibbles, the whole byte indexes a 256-entry
 // table that holds the pair of values encoded by that byte.
 inline double dot3(int n, const block_q4_0* x, const float* y) {
    // kValues[b] = { value of the low nibble of b, value of the high nibble of b },
    // where a nibble v in [0, 15] stands for v - 8
    const static std::array<std::pair<float,float>, 256> kValues = [] {
        std::array<std::pair<float,float>, 256> table;
        for (int b = 0; b < 256; ++b) {
            table[b].first  = float((b & 0x0f) - 8);
            table[b].second = float((b >> 4)   - 8);
        }
        return table;
    }();
    double sum = 0;
    for (int blk = 0; blk < n; ++blk, ++x) {
        const uint8_t* q = x->qs;
        float s = 0;
        // 4 groups of 4 bytes, i.e. 8 floats per group
        for (int grp = 0; grp < 4; ++grp, y += 8, q += 4) {
            s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
                 y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
                 y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
                 y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
        }
        sum += s*x->d;
    }
    return sum;
 }
 // "Scalar" dot product between the q4_1 quantized vector x and the float vector y.
 // n - number of q4_1 blocks in x
 // x - the n quantized blocks
 // y - the float vector; 32 floats are consumed per block
 // A nibble v contributes v*d + m, so per block the result is
 // d * sum(v_j * y_j) + m * sum(y_j).
 // The bytes are read directly; reading them through a uint32_t pointer (as an
 // earlier version did) violates the strict aliasing rule.
 inline double dot41(int n, const block_q4_1* x, const float* y) {
    const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
    double sum = 0;
    for (int i=0; i<n; ++i) {
        const uint8_t* q = x->qs;
        float s = 0, s1 = 0;
        // 4 groups of 4 bytes, i.e. 8 quants / floats per group
        for (int k=0; k<4; ++k) {
            s += y[0]*kValues[q[0] & 0xf] + y[1]*kValues[q[0] >> 4] +
                 y[2]*kValues[q[1] & 0xf] + y[3]*kValues[q[1] >> 4] +
                 y[4]*kValues[q[2] & 0xf] + y[5]*kValues[q[2] >> 4] +
                 y[6]*kValues[q[3] & 0xf] + y[7]*kValues[q[3] >> 4];
            // plain sum of the floats, multiplied by the block minimum below
            s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
            y += 8; q += 4;
        }
        sum += s*x->d + s1*x->m;
        ++x;
    }
    return sum;
 }
 // Copy-pasted from ggml.c
 static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max
        for (int l = 0; l < QK8_0; l++) {
            const float v = x[i*QK8_0 + l];
            amax = std::max(amax, fabsf(v));
        }
        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f;
        y[i].d = d;
        for (int l = 0; l < QK8_0; ++l) {
            const float   v  = x[i*QK8_0 + l]*id;
            y[i].qs[l] = roundf(v);
        }
    }
 }
 // Copy-pasted from ggml.c
 // Dot product of a q4_0 quantized vector (vx) and a q8_0 quantized vector (vy).
 // n  - number of values; n / QK8_0 blocks are processed
 // s  - receives the result
 // vx - pointer to block_q4_0 data
 // vy - pointer to block_q8_0 data
 static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
    const int nb = n / QK8_0;
    const block_q4_0* x = static_cast<const block_q4_0*>(vx);
    const block_q8_0* y = static_cast<const block_q8_0*>(vy);
    float sumf = 0;
    for (int i = 0; i < nb; i++) {
        const uint8_t * nibbles = x[i].qs;
        const  int8_t * quants  = y[i].qs;
        // integer dot product of the block, scaled once at the end
        int sumi = 0;
        for (int j = 0; j < QK8_0/2; j++) {
            const uint8_t packed = nibbles[j];
            // low nibble pairs with the even quant, high nibble with the odd one
            const int lo = (int8_t) (packed & 0xf) - 8;
            const int hi = (int8_t) (packed >> 4)  - 8;
            sumi += lo*quants[2*j + 0] + hi*quants[2*j + 1];
        }
        sumf += x[i].d*y[i].d*sumi;
    }
    *s = sumf;
 }
 // Benchmark driver. Usage: vdot [nloop] [scalar] [useQ4_1]
 //   nloop   - number of iterations (default 10); must be positive, because all
 //             reported values are averages over nloop
 //   scalar  - non-zero: use the reference q8_0 quantization and dot_q4_q8 from this
 //             file instead of the functions returned by ggml_internal_get_quantize_fn
 //   useQ4_1 - non-zero: quantize x as Q4_1 instead of Q4_0 (cannot be combined with scalar)
 // Returns 0 on success and 1 on invalid arguments.
 int main(int argc, char** argv) {
    int nloop = argc > 1 ? atoi(argv[1]) : 10;
    bool scalar = argc > 2 ? atoi(argv[2]) : false;
    bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;
    if (nloop < 1) {
        // every statistic below is divided by nloop
        printf("The number of iterations must be positive\n");
        return 1;
    }
    if (scalar && useQ4_1) {
        printf("It is not possible to use Q4_1 quantization and scalar implementations\n");
        return 1;
    }
    std::mt19937 rndm(1234);
    std::vector<float> x1(kVecSize), y1(kVecSize);
    // block counts, rounded up to a multiple of 64
    int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
    int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
    auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
    std::vector<block_q4_0> q40;
    std::vector<block_q4_1> q41;
    if (useQ4_1) q41.resize(n4);
    else q40.resize(n4);
    std::vector<block_q8_0> q8(n8);
    double sumt = 0, sumt2 = 0, maxt = 0;
    double sumqt = 0, sumqt2 = 0, maxqt = 0;
    double sum = 0, sumq = 0, exactSum = 0;
    for (int iloop=0; iloop<nloop; ++iloop) {
        // Fill vector x with random numbers
        fillRandomGaussianFloats(x1, rndm);
        // Fill vector y with random numbers
        fillRandomGaussianFloats(y1, rndm);
        // Compute the exact dot product
        for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];
        // quantize x.
        // Note, we do not include this in the timing as in practical application
        // we already have the quantized model weights.
        if (useQ4_1) {
            funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
        } else {
            funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
        }
        // Now measure time the dot product needs using the "scalar" version above
        auto t1 = std::chrono::high_resolution_clock::now();
        if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
        else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
        auto t2 = std::chrono::high_resolution_clock::now();
        // elapsed time in microseconds
        auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
        // And now measure the time needed to quantize y and perform the dot product with the quantized y
        t1 = std::chrono::high_resolution_clock::now();
        float result;
        if (scalar) {
            quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
            dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
        }
        else {
            funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
            if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
            else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
        }
        sumq += result;
        t2 = std::chrono::high_resolution_clock::now();
        t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
        sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);
    }
    // Report the time (and the average of the dot products so the compiler does not come up with the idea
    // of optimizing away the function calls after figuring that the result is not used).
    sum /= nloop; sumq /= nloop;
    exactSum /= nloop;
    printf("Exact result: <dot> = %g\n",exactSum);
    printf("<dot> = %g, %g\n",sum,sumq);
    // mean and standard deviation: sqrt(<t^2> - <t>^2)
    sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
    if (sumt2 > 0) sumt2 = sqrt(sumt2);
    printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt);
    sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
    if (sumqt2 > 0) sumqt2 = sqrt(sumqt2);
    printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt);
    return 0;
 }
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -5,13 +5,17 @@
 #include <map>
 #include <vector>
-static const std::map<std::string, std::vector<llama_token>> k_tests = {
+static const std::map<std::string, std::vector<llama_token>> & k_tests()
-    { "Hello World",        { 1,  10994,   2787, }, },
+{
-    { " Hello World",       { 1,  15043,   2787, }, },
+    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-    { " Hello World!",      { 1,  15043,   2787,  29991, }, },
+        { "Hello World",        { 1,  10994,   2787, }, },
-    { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
+        { " Hello World",       { 1,  15043,   2787, }, },
-    { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
+        { " Hello World!",      { 1,  15043,   2787,  29991, }, },
-    { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+        { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
        { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
        { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
    };
    return _k_tests;
 };
 int main(int argc, char **argv) {
@ -47,7 +51,7 @@ int main(int argc, char **argv) {
        return 2;
    }
-    for (const auto & test_kv : k_tests) {
+    for (const auto & test_kv : k_tests()) {
        std::vector<llama_token> res(test_kv.first.size());
        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
        res.resize(n);