Merge and update

Henri Vasserman 2023-08-09 00:36:11 +03:00
commit 28046d1e52
37 changed files with 6347 additions and 2237 deletions


@ -197,6 +197,8 @@ jobs:
strategy:
matrix:
include:
- build: 'noavx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
- build: 'avx2'
defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'


@ -67,11 +67,13 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
@ -251,6 +253,10 @@ if (LLAMA_CUBLAS)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
# if (LLAMA_CUDA_CUBLAS)
# add_compile_definitions(GGML_CUDA_CUBLAS)
# endif()
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@ -259,8 +265,8 @@ if (LLAMA_CUBLAS)
if (DEFINED LLAMA_CUDA_DMMV_Y)
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
endif()
if (LLAMA_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_DMMV_F16)
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_F16)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@ -271,10 +277,14 @@ if (LLAMA_CUBLAS)
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
if (LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@ -357,6 +367,7 @@ if (LLAMA_ALL_WARNINGS)
-Wshadow
-Wstrict-prototypes
-Wpointer-arith
-Wmissing-prototypes
)
set(cxx_flags
-Wall
@ -496,6 +507,8 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml.h
ggml-alloc.c
ggml-alloc.h
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
${GGML_SOURCES_METAL}


@ -63,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE
endif
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
# OS specific
@ -193,7 +194,7 @@ ifdef LLAMA_CUBLAS
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
else
@ -219,19 +220,30 @@ else ifdef LLAMA_CUDA_DMMV_Y
else
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y
ifdef LLAMA_CUDA_F16
NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_F16
ifdef LLAMA_CUDA_DMMV_F16
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_DMMV_F16
ifdef LLAMA_CUDA_KQUANTS_ITER
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ifdef LLAMA_CUDA_MMQ_Y
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
else
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
endif # LLAMA_CUDA_MMQ_Y
#ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
@ -317,7 +329,12 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h
@ -360,7 +377,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
@ -394,13 +411,13 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)


@ -77,9 +77,10 @@ as the main playground for developing new features for the [ggml](https://github
**Supported models:**
- [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@ -87,6 +88,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
**Bindings:**
@ -399,12 +401,16 @@ Building the program with BLAS support may lead to some performance improvements
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
<!---
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
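For a concrete idea of how these options are passed, here is a small illustrative Python helper that assembles a CMake configure command from a selection of the flags documented above (the option names come from the table; the chosen values and the helper itself are only an example, not part of the build system):

```python
import subprocess

# Example selection of the CUDA options documented above (values are illustrative only).
cuda_options = {
    "LLAMA_CUBLAS": "ON",
    "LLAMA_CUDA_MMQ_Y": 64,
    "LLAMA_CUDA_F16": "ON",
    "LLAMA_CUDA_KQUANTS_ITER": 2,
}

defines = [f"-D{name}={value}" for name, value in cuda_options.items()]
cmd = ["cmake", "..", *defines]
print(" ".join(cmd))                  # inspect the configure command first
# subprocess.run(cmd, check=True)     # uncomment to actually run it from a build directory
```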
- #### CLBlast
@ -487,6 +493,9 @@ Building the program with BLAS support may lead to some performance improvements
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
# [Optional] for models using BPE tokenizers
ls ./models
65B 30B 13B 7B vocab.json
# install Python dependencies
python3 -m pip install -r requirements.txt
@ -494,6 +503,9 @@ python3 -m pip install -r requirements.txt
# convert the 7B model to ggml FP16 format
python3 convert.py models/7B/
# [Optional] for models using BPE tokenizers
python convert.py models/7B/ --vocabtype bpe
# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
@ -650,6 +662,19 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
### Obtaining and using the Facebook LLaMA 2 model
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
- Specify `-eps 1e-5` for best generation quality
- Specify `-gqa 8` for 70B models to work
### Verifying the model files
Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
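A minimal Python sketch of that verification, assuming a `SHA256SUMS` file in the usual `<hash>  <path>` format in the repository root (an illustration, not part of the repo's tooling):

```python
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

for line in Path("SHA256SUMS").read_text().splitlines():
    expected, name = line.split(maxsplit=1)
    path = Path(name.strip().lstrip("*"))   # sha256sum prefixes binary-mode entries with '*'
    if path.exists():
        status = "OK" if sha256_of(path) == expected else "MISMATCH"
        print(f"{path}: {status}")
```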

convert.py Executable file → Normal file

@ -133,7 +133,7 @@ TENSORS_SET = set(TENSORS_LIST)
def find_n_mult(n_ff: int, n_embd: int) -> int:
# hardcoded magic range
for n_mult in range(256, 1, -1):
for n_mult in range(8192, 1, -1):
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
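The wider search range appears to be what makes 70B-sized feed-forward dimensions resolvable; a standalone check with assumed LLaMA-2 70B shapes (n_embd = 8192, n_ff = 28672), shown only as an illustration:

```python
def find_n_mult(n_ff: int, n_embd: int) -> int:
    # Same search as above, reproduced standalone for the check below.
    for n_mult in range(8192, 1, -1):
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    raise ValueError(f"no n_mult found for n_ff={n_ff}, n_embd={n_embd}")

# Assumed LLaMA-2 70B dimensions, used here purely for illustration.
print(find_n_mult(n_ff=28672, n_embd=8192))   # 7168 -- far above the old upper bound of 256
```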
@ -141,11 +141,12 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
@dataclass
class Params:
n_vocab: int
n_embd: int
n_mult: int
n_head: int
n_layer: int
n_vocab: int
n_embd: int
n_mult: int
n_head: int
n_layer: int
n_kv_head: Optional[int] # This parameter is only used for Llama 2
@staticmethod
def guessed(model: 'LazyModel') -> 'Params':
@ -167,11 +168,12 @@ class Params:
n_head=n_embd // 128 # guessed
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = 256,
n_head = n_head,
n_layer = n_layer,
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = 256,
n_head = n_head,
n_layer = n_layer,
n_kv_head = None,
)
@staticmethod
@ -183,15 +185,17 @@ class Params:
n_head = config["num_attention_heads"];
n_layer = config["num_hidden_layers"];
n_ff = config["intermediate_size"];
n_kv_head = config.get("num_key_value_heads")
n_mult = find_n_mult(n_ff, n_embd);
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_kv_head = n_kv_head,
)
# LLaMA v2 70B params.json
@ -200,21 +204,22 @@ class Params:
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path))
n_vocab = config["vocab_size"];
n_embd = config["dim"];
n_head = config["n_heads"];
n_layer = config["n_layers"];
n_mult = config["multiple_of"];
n_vocab = config["vocab_size"];
n_embd = config["dim"];
n_head = config["n_heads"];
n_layer = config["n_layers"];
n_mult = config["multiple_of"];
if n_vocab == -1:
n_vocab = model["tok_embeddings.weight"].shape[0]
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_kv_head = None,
)
@staticmethod
@ -234,14 +239,21 @@ class Params:
class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
self.vocabtype = vocabtype
if self.vocabtype == "bpe":
self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
else:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: Dict[str, int]
if fname_added_tokens is not None:
added_tokens = json.load(open(fname_added_tokens))
else:
added_tokens = {}
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
if self.vocabtype == "bpe":
vocab_size: int = len(self.sentencepiece_tokenizer)
else:
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
@ -255,22 +267,32 @@ class SentencePieceVocab:
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
if self.vocabtype == "bpe":
from transformers.models.gpt2 import tokenization_gpt2
byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i, item in enumerate(tokenizer):
text: bytes
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
raise Exception(f"Invalid token: {piece}")
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
score: float = tokenizer.get_score(i)
text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
score: float = -i
yield text, score
else:
for i in range(tokenizer.vocab_size()):
text: bytes
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
raise Exception(f"Invalid token: {piece}")
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
score: float = tokenizer.get_score(i)
yield text, score
def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
for text in self.added_tokens_list:
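The BPE branch above leans on GPT-2's reversible byte-to-unicode mapping; a small sketch of what that decoding step does to a single vocab entry (it assumes the `transformers` package is installed, just as the code above does):

```python
from transformers.models.gpt2 import tokenization_gpt2

byte_encoder = tokenization_gpt2.bytes_to_unicode()        # byte value -> printable unicode char
byte_decoder = {v: k for k, v in byte_encoder.items()}     # printable unicode char -> byte value

token = "\u0120Hello"                                      # "ĠHello": a leading space encoded as U+0120
raw = bytes(byte_decoder[ch] for ch in token)
print(raw)                                                 # b' Hello'
```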
@ -300,10 +322,12 @@ class GGMLVocab:
Vocab = Union[SentencePieceVocab, GGMLVocab]
def permute(weights: NDArray, n_head: int) -> NDArray:
def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
.swapaxes(1, 2)
.reshape(weights.shape))
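To see what the new `n_kv_head` argument changes: the K projection of a grouped-query model has fewer rows, so the reshape uses `n_head // n_kv_head` groups instead of `n_head`. A small numpy check with assumed 70B-like sizes (n_head = 64, n_kv_head = 8, head size 128), shown only to confirm the shapes line up:

```python
import numpy as np

def permute(weights, n_head, n_kv_head=None):
    # Same logic as the function above, standalone.
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

n_embd, n_head, n_kv_head, head_size = 8192, 64, 8, 128             # assumed 70B-like dimensions
wk = np.zeros((n_kv_head * head_size, n_embd), dtype=np.float32)    # K projection: 1024 x 8192
print(permute(wk, n_head, n_kv_head).shape)                         # (1024, 8192): same shape, reordered rows
```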
def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
@ -351,7 +375,7 @@ class Tensor(metaclass=ABCMeta):
@abstractmethod
def astype(self, data_type: DataType) -> 'Tensor': ...
@abstractmethod
def permute(self, n_head: int) -> 'Tensor': ...
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
@abstractmethod
@ -389,8 +413,8 @@ class UnquantizedTensor(Tensor):
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
def permute(self, n_head: int) -> 'UnquantizedTensor':
return UnquantizedTensor(permute(self.ndarray, n_head))
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@ -438,26 +462,27 @@ class GGMLQuantizedTensor(Tensor):
def to_ggml(self) -> 'GGMLQuantizedTensor':
return self
def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
class DeferredPermutedTensor(Tensor):
def __init__(self, base: Tensor, n_head: int) -> None:
def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
self.base = base
self.n_head = n_head
self.n_kv_head = n_kv_head
self.data_type = self.base.data_type
def astype(self, data_type: DataType) -> Tensor:
return self.base.astype(data_type).permute(self.n_head)
return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
def to_ggml(self) -> GGMLCompatibleTensor:
return self.base.to_ggml().permute(self.n_head)
return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
def permute(self, n_head: int) -> Tensor:
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
raise Exception("shouldn't permute twice")
@ -549,8 +574,8 @@ class GPTQForLLaMaQuantizedTensor(Tensor):
ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
return ret
def permute(self, n_head: int) -> Tensor:
return DeferredPermutedTensor(self, n_head)
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
return DeferredPermutedTensor(self, n_head, n_kv_head)
def to_ggml(self) -> GGMLQuantizedTensor:
# The output format looks like this:
@ -681,10 +706,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
return ModelPlus(model, paths, format, vocab)
def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute(n_head)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
return lazy_tensor.load().permute(n_head, n_kv_head)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
def load() -> Tensor:
@ -709,7 +734,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
@ -1196,14 +1221,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
return {name: model[name] for name in TENSORS_LIST if name in model}
def load_vocab(path: Path) -> SentencePieceVocab:
def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
print(f"vocabtype: {vocabtype}")
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
if path.is_dir():
path2 = path / "tokenizer.model"
vocab_file = "tokenizer.model"
if vocabtype == 'bpe':
vocab_file = "vocab.json"
path2 = path / vocab_file
# Use `.parent` instead of /.. to handle the symlink case better.
path3 = path.parent / "tokenizer.model"
path3 = path.parent / vocab_file
if path2.exists():
path = path2
elif path3.exists():
@ -1214,7 +1243,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
"if it's in another directory, pass the directory as --vocab-dir")
added_tokens_path = path.parent / "added_tokens.json"
print(f"Loading vocab file {path}")
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
vocabtype)
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
@ -1252,6 +1282,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
args = parser.parse_args(args_in)
vocab: Vocab
@ -1259,7 +1290,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
elif args.vocab_only:
vocab = load_vocab(args.vocab_dir or args.model)
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
assert args.outfile, "need --outfile if using --vocab-only"
outfile = args.outfile
OutputFile.write_vocab_only(outfile, vocab)
@ -1273,7 +1304,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
vocab = model_plus.vocab
else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir)
vocab = load_vocab(vocab_dir, args.vocabtype)
params = Params.load(model_plus)
model = model_plus.model
model = do_necessary_conversions(model, params)


@ -8,6 +8,12 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#ifdef LLAMA_DEFAULT_RMS_EPS
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#else
static const float rms_norm_eps = 5e-6f;
#endif
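For reference, the epsilon introduced here is the usual numerical-stability term in RMS normalization; a minimal numpy sketch of the standard formula (an illustration, not the ggml implementation):

```python
import numpy as np

def rms_norm(x: np.ndarray, eps: float = 5e-6) -> np.ndarray:
    # Divide by the root mean square of the activations; eps keeps the denominator away from zero.
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)

x = np.random.randn(4, 8).astype(np.float32)
print(rms_norm(x, eps=1e-5).shape)   # (4, 8); 1e-5 is the value recommended for LLaMA 2 elsewhere in this change
```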
float frand() {
return (float)rand()/(float)RAND_MAX;
}
@ -562,7 +568,7 @@ struct ggml_tensor * forward(
// norm
{
// cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// cur = attention_norm*cur
cur = ggml_mul(ctx0,
@ -685,7 +691,7 @@ struct ggml_tensor * forward(
// norm
{
// cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
// cur = ffn_norm*cur
// cur shape [n_embd,N,1,1]
@ -729,7 +735,7 @@ struct ggml_tensor * forward(
{
// inpL shape [n_embd,N,1,1]
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// inpL = norm*inpL
// inpL shape [n_embd,N,1,1]
@ -817,7 +823,7 @@ struct ggml_tensor * forward_batch(
// norm
{
// cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur
@ -981,7 +987,7 @@ struct ggml_tensor * forward_batch(
// norm
{
// cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur
@ -1034,7 +1040,7 @@ struct ggml_tensor * forward_batch(
{
// inpL shape [n_embd,N*n_batch,1,1]
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL
@ -1104,7 +1110,7 @@ struct ggml_tensor * forward_lora(
// norm
{
// cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// cur = attention_norm*cur
cur = ggml_mul(ctx0,
@ -1251,7 +1257,7 @@ struct ggml_tensor * forward_lora(
// norm
{
// cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
// cur = ffn_norm*cur
// cur shape [n_embd,N,1,1]
@ -1295,7 +1301,7 @@ struct ggml_tensor * forward_lora(
{
// inpL shape [n_embd,N,1,1]
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// inpL = norm*inpL
// inpL shape [n_embd,N,1,1]


@ -177,6 +177,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_gqa = std::stoi(argv[i]);
} else if (arg == "-eps" || arg == "--rms-norm-eps") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rms_norm_eps = std::stof(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
@ -346,7 +352,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
@ -370,13 +376,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--mul-mat-q" || arg == "-mmq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
@ -396,8 +408,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.antiprompt.push_back(argv[i]);
} else if (arg == "--perplexity") {
params.perplexity = true;
} else if (arg == "--perplexity-lines") {
params.perplexity_lines = true;
} else if (arg == "--hellaswag") {
params.hellaswag = true;
} else if (arg == "--hellaswag-tasks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.hellaswag_tasks = std::stoi(argv[i]);
} else if (arg == "--ignore-eos") {
params.logit_bias[llama_token_eos()] = -INFINITY;
} else if (arg == "--no-penalize-nl") {
@ -426,6 +444,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
} else if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
} else if (arg == "--in-prefix") {
if (++i >= argc) {
invalid_param = true;
@ -511,6 +531,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " not supported with --interactive or other interactive options\n");
fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
fprintf(stdout, " -f FNAME, --file FNAME\n");
@ -519,6 +540,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@ -549,8 +571,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n");
fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) {
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
@ -568,6 +591,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
#endif
fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
@ -615,10 +641,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
lparams.n_gqa = params.n_gqa;
lparams.rms_norm_eps = params.rms_norm_eps;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu;
lparams.tensor_split = params.tensor_split;
lparams.low_vram = params.low_vram;
lparams.mul_mat_q = params.mul_mat_q;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;


@ -22,18 +22,19 @@
int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = -1; // RNG seed
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
@ -69,7 +70,11 @@ struct gpt_params {
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
@ -81,10 +86,10 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool perplexity_lines = false; // compute perplexity over each line of the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage


@ -202,9 +202,9 @@ Example usage: `--top-p 0.95`
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P, it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS.
Example usage: `--tfs 2.0`
Example usage: `--tfs 0.95`
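A rough numpy sketch of the procedure described above (an illustration of the idea, not the llama.cpp sampler):

```python
import numpy as np

def tail_free_keep_count(logits: np.ndarray, z: float = 0.95) -> int:
    """Return how many of the most likely tokens TFS would keep (rough sketch)."""
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    sorted_probs = np.sort(probs)[::-1]                 # descending

    d2 = np.abs(np.diff(sorted_probs, n=2))             # absolute second differences
    d2 /= d2.sum()                                      # normalize so they sum to 1

    # Stop adding tokens once the cumulative normalized second derivative reaches z.
    return int(np.searchsorted(np.cumsum(d2), z)) + 1

rng = np.random.default_rng(0)
logits = rng.normal(size=64)
print(tail_free_keep_count(logits, z=0.95), "of", logits.size, "tokens kept")
```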
### Locally Typical Sampling


@ -325,6 +325,10 @@ int main(int argc, char ** argv) {
}
}
if (params.input_prefix_bos) {
fprintf(stderr, "Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
@ -633,16 +637,6 @@ int main(int argc, char ** argv) {
last_n_tokens.push_back(id);
}
// replace end of text token with newline token when in interactive mode
if (id == llama_token_eos() && params.interactive && !params.instruct) {
id = llama_token_newline.front();
if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
}
}
// add it to the context
embd.push_back(id);
@ -708,11 +702,34 @@ int main(int argc, char ** argv) {
}
}
// deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos()) {
if (params.interactive) {
if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
is_antiprompt = true;
}
is_interacting = true;
printf("\n");
console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
fflush(stdout);
} else if (params.instruct) {
is_interacting = true;
}
}
if (n_past > 0 && is_interacting) {
if (params.instruct) {
printf("\n> ");
}
if (params.input_prefix_bos) {
embd_inp.push_back(llama_token_bos());
}
std::string buffer;
if (!params.input_prefix.empty()) {
buffer += params.input_prefix;
@ -776,13 +793,9 @@ int main(int argc, char ** argv) {
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos()) {
if (params.instruct) {
is_interacting = true;
} else {
fprintf(stderr, " [end of text]\n");
break;
}
if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
fprintf(stderr, " [end of text]\n");
break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.


@ -121,8 +121,23 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
printf("\n");
}
void perplexity_lines(llama_context * ctx, const gpt_params & params) {
// Calculates perplexity over each line of the prompt
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Calculates hellaswag score (acc_norm) from prompt
//
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
// All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68
//
// All 10042 tasks should be extracted to keep the results standardized like other implementations.
//
// Datafile layout:
// ['??'] denotes json fields
// 6 lines per task:
// ['activity_label'] + ": " +['ctx'] - The first part of the query, the context
// ['label'] - The index of the best common-sense ending (the gold ending)
// ['endings'][0] - Endings added to the first part of the query
// ['endings'][1]
// ['endings'][2]
// ['endings'][3]
std::vector<std::string> prompt_lines;
std::istringstream strstream(params.prompt);
@ -132,63 +147,149 @@ void perplexity_lines(llama_context * ctx, const gpt_params & params) {
prompt_lines.push_back(line);
}
if( prompt_lines.size() % 6 != 0) {
fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
return;
}
size_t hs_task_count = prompt_lines.size()/6;
fprintf(stderr, "%s : loaded %lu tasks from prompt.\n", __func__, hs_task_count);
// This is needed as usual for LLaMA models
bool prepend_bos = true;
// Number of tasks to use when computing the score
if ( params.hellaswag_tasks < hs_task_count ) {
hs_task_count = params.hellaswag_tasks;
}
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
std::mt19937 rng(1);
// Dataholder for hellaswag tasks
struct hs_data_t {
std::string context;
size_t gold_ending_idx;
std::string ending[4];
size_t ending_logprob_count[4];
double ending_logprob[4];
};
fprintf(stderr, "%s : selecting %lu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
// Select and read data from prompt lines
hs_data_t *hs_data = new hs_data_t[hs_task_count];
for (size_t i=0; i < hs_task_count; i++) {
size_t idx = i;
// Select a random example of those left in the prompt
if (randomize_tasks) {
std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
idx = dist(rng);
}
hs_data[i].context = prompt_lines[idx*6];
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
for (size_t j=0; j < 4; j++) {
hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
}
// Delete the selected random example from the prompt
if (randomize_tasks) {
prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) );
}
}
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
printf("\ntask\tacc_norm\n");
double acc = 0.0f;
const int n_vocab = llama_n_vocab(ctx);
int counttotal = 0;
size_t n_lines = prompt_lines.size();
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
double nll = 0.0;
// Tokenize the context to count tokens
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
size_t context_size = context_embd.size();
fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines);
for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
printf("\nLine\tPPL line\tPPL cumulative\n");
// Tokenize the query
std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
size_t query_size = query_embd.size();
for (size_t i = 0; i < n_lines; ++i) {
// Stop if the query won't fit the ctx window
if (query_size > (size_t)params.n_ctx) {
fprintf(stderr, "%s : number of tokens in query %lu > n_ctxl\n", __func__, query_size);
return;
}
// Tokenize and insert BOS at start
std::vector<int> batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true);
// Speed up small evaluations by evaluating at least 32 tokens
if (query_size < 32) {
query_embd.resize(32);
}
size_t batch_size = batch_embd.size();
// Evaluate the query
if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// Stop if line is too long
if( batch_size > (size_t)params.n_ctx ) {
fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i);
return;
const auto query_logits = llama_get_logits(ctx);
std::vector<float> logits;
logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
// Calculate the logprobs over the ending
for (size_t j = context_size-1; j < query_size - 1; j++) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[query_embd[ j + 1]];
hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
hs_data[task_idx].ending_logprob_count[ending_idx]++;
}
// Calculate the mean token logprob for acc_norm
hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
// printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
// task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
}
if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
// Find the ending with maximum logprob
size_t ending_logprob_max_idx = -1;
double ending_logprob_max_val = -INFINITY;
for (size_t j=0; j < 4; j++) {
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
ending_logprob_max_idx = j;
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
}
}
const auto batch_logits = llama_get_logits(ctx);
std::vector<float> logits;
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
// printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
double nllline = 0.0;
int countline = 0;
// Perplexity over second half of the line
for (size_t j = batch_size/2; j < batch_size - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[batch_embd[ j + 1]];
nllline += -std::log(prob);
++countline;
// If the gold ending got the maximum logprob, add one accuracy point
if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
acc += 1.0;
}
nll += nllline;
counttotal += countline;
// perplexity is e^(average negative log-likelihood)
printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) );
// Print the accumulated accuracy mean x 100
printf("%li\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
fflush(stdout);
}
delete [] hs_data;
printf("\n");
}
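Stripped of the llama_eval plumbing, the scoring rule implemented above amounts to: take the length-normalized (mean) token log-probability of each ending, pick the maximum, and count a hit when it matches the gold index. A tiny Python sketch with made-up numbers:

```python
# Made-up per-token log-probabilities for the four candidate endings of one task.
ending_token_logprobs = [
    [-2.1, -0.9, -1.4],          # ending 0
    [-0.5, -0.7, -0.6, -0.4],    # ending 1
    [-3.0, -2.2],                # ending 2
    [-1.8, -1.1, -1.9, -2.4],    # ending 3
]
gold_ending_idx = 1

# acc_norm uses the mean log-probability so longer endings are not penalized for their length.
mean_logprob = [sum(lp) / len(lp) for lp in ending_token_logprobs]
predicted = max(range(4), key=lambda j: mean_logprob[j])

acc_contribution = 1.0 if predicted == gold_ending_idx else 0.0
print(f"predicted ending {predicted}, gold ending {gold_ending_idx}, acc_norm contribution {acc_contribution}")
```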
@ -240,8 +341,8 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
if (params.perplexity_lines) {
perplexity_lines(ctx, params);
if (params.hellaswag) {
hellaswag_score(ctx, params);
} else {
perplexity(ctx, params);
}


@ -26,6 +26,7 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;


@ -0,0 +1,26 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
# Specify the model you want to use here:
MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-12}"
# Note: you can also override the generation options by specifying them on the command line:
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./server $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--rope-freq-scale 1.0 \
"$@"
# I used this to test the model with MPS, but omitted it from the general-purpose defaults. To use it, just specify it on the command line.
# -ngl 1 \


@ -163,7 +163,7 @@ node .
`content`: Set the text to tokenize.
Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
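For example, a minimal standard-library Python call to this endpoint might look like the following (the address is the default used by the chat script further below, and the prompt is a placeholder; the `tokens` field matches what that script reads with jq):

```python
import json
import urllib.request

req = urllib.request.Request(
    "http://127.0.0.1:8080/tokenize",                       # assumed default host/port
    data=json.dumps({"content": "Hello, llama.cpp"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["tokens"])                        # list of token ids
```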
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.


@ -0,0 +1,109 @@
#!/bin/bash
API_URL="${API_URL:-http://127.0.0.1:8080}"
CHAT=(
"Hello, Assistant."
"Hello. How may I help you today?"
)
INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
trim() {
shopt -s extglob
set -- "${1##+([[:space:]])}"
printf "%s" "${1%%+([[:space:]])}"
}
trim_trailing() {
shopt -s extglob
printf "%s" "${1%%+([[:space:]])}"
}
format_prompt() {
if [[ "${#CHAT[@]}" -eq 0 ]]; then
echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
else
LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
fi
}
tokenize() {
curl \
--silent \
--request POST \
--url "${API_URL}/tokenize" \
--header "Content-Type: application/json" \
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
| jq '.tokens[]'
}
N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
chat_completion() {
PROMPT="$(trim_trailing "$(format_prompt "$1")")"
DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
prompt: .,
temperature: 0.2,
top_k: 40,
top_p: 0.9,
n_keep: $n_keep,
n_predict: 1024,
stop: ["[INST]"],
stream: true
}')"
# Create a temporary file to hold the Python output
TEMPFILE=$(mktemp)
exec 3< <(curl \
--silent \
--no-buffer \
--request POST \
--url "${API_URL}/completion" \
--header "Content-Type: application/json" \
--data-raw "${DATA}")
python -c "
import json
import sys
answer = ''
while True:
line = sys.stdin.readline()
if not line:
break
if line.startswith('data: '):
json_content = line[6:].strip()
content = json.loads(json_content)['content']
sys.stdout.write(content)
sys.stdout.flush()
answer += content
answer = answer.rstrip('\n')
# Write the answer to the temporary file
with open('$TEMPFILE', 'w') as f:
f.write(answer)
" <&3
exec 3<&-
# Read the answer from the temporary file
ANSWER=$(cat $TEMPFILE)
# Clean up the temporary file
rm $TEMPFILE
printf "\n"
CHAT+=("$1" "$(trim "$ANSWER")")
}
while true; do
echo -en "\033[0;32m" # Green color
read -r -e -p "> " QUESTION
echo -en "\033[0m" # Reset color
chat_completion "${QUESTION}"
done

File diff suppressed because it is too large.


@ -3,12 +3,11 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<meta name="color-scheme" content="light dark">
<title>llama.cpp - chat</title>
<style>
body {
background-color: #fff;
color: #000;
font-family: system-ui;
font-size: 90%;
max-width: 600px;
@ -103,6 +102,36 @@
margin: 0;
}
fieldset.two {
display: grid;
grid-template: "a a";
gap: 1em;
}
fieldset.three {
display: grid;
grid-template: "a a a";
gap: 1em;
}
details {
border: 1px solid #aaa;
border-radius: 4px;
padding: 0.5em 0.5em 0;
margin-top: 0.5em;
}
summary {
font-weight: bold;
margin: -0.5em -0.5em 0;
padding: 0.5em;
cursor: pointer;
}
details[open] {
padding: 0.5em;
}
textarea {
padding: 5px;
flex-grow: 1;
@ -122,7 +151,7 @@
fieldset label {
margin: 0.5em 0;
/*display: block;*/
display: block;
}
header, footer {
@ -152,21 +181,21 @@
})
const params = signal({
n_predict: 400,
top_k: 40,
top_p: 0.95,
tfs_z: 1.0,
typical_p: 1.0,
temperature: 0.7,
repeat_penalty: 1.18,
frequency_penalty: 0.0,
presence_penalty: 0.0,
repeat_last_n: 256,
mirostat: 0,
mirostat_tau: 5.0,
mirostat_eta: 0.1,
cfg_scale: 4.0,
frequency_penalty: 0.0, // 0.0 = disabled
mirostat_eta: 0.1, // learning rate
mirostat_tau: 5, // target entropy
mirostat: 0, // 0/1/2
n_predict: 400,
penalize_nl: true,
presence_penalty: 0.0, // 0.0 = disabled
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, // 1.0 = disabled
temperature: 0.7,
tfs_z: 1.0, // 1.0 = disabled
top_k: 40, // <= 0 to use vocab size
top_p: 0.5, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled
})
const llamaStats = signal(null)
@ -305,29 +334,48 @@
`
}
const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
const updateArray = (el) => {
const [name, index] = el.target.name.split(".")
const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
session.value = { ...session.value, [name]: newarr }
}
const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }
const ParamSlider = ({param, min, max, step, children}) => {
const updateParamsFloat = (el) => params.value = { ...params.value, [param]: parseFloat(el.target.value) }
return html`
<div>
<label for="${param}"><code>${param}</code></label>
<label for="${param}">${children}</label>
<input type="range" id="${param}" min="${min}" max="${max}" step="${step}" name="${param}" value="${params.value[param]}" oninput=${updateParamsFloat} />
<span>${params.value[param]}</span>
<span>${children}</span>
</div>
`
}
const ConfigForm = (props) => {
const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }
const updateArray = (el) => {
const [name, index] = el.target.name.split(".")
const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
session.value = { ...session.value, [name]: newarr }
}
const FloatField = ({label, max, min, name, step, value}) => {
return html`
<div>
<label for="${name}">${label}</label>
<input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
<span>${value}</span>
</div>
`
};
const IntField = ({label, max, min, name, value}) => {
return html`
<div>
<label for="${name}">${label}</label>
<input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
<span>${value}</span>
</div>
`
};
const ConfigForm = (props) => {
return html`
<form>
<fieldset>
@ -356,7 +404,9 @@
<textarea type="text" name="system_cfg" value="${session.value.system_cfg}" rows=4 oninput=${updateSession}/>
</div>
`}
</fieldset>
<fieldset class="two">
<${ParamSlider} min=1 max=1000 step=1 param=n_predict>Predict N tokens<//>
<${ParamSlider} min=0 max=1000 step=1 param=repeat_last_n>Penalize last N tokens<//>
${params.value.repeat_last_n > 0 && html`
@ -365,18 +415,26 @@
<${ParamSlider} min=0 max=4 step=0.01 param=presence_penalty>Penalize tokens not present in prompt<//>
`}
<${ParamSlider} min=0 max=2 step=0.01 param=temperature>Temperature<//>
${params.value.temperature > 0 && html`
<div>
<input id=mirostat_0 type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
<label for=mirostat_0>Temperature</label>
</fieldset>
<input id=mirostat_1 type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
<label for=mirostat_1>Mirostat v1</label>
${params.value.temperature > 0 && html`
<fieldset class="three">
<label>
<input type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
Temperature
</label>
<input id=mirostat_2 type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
<label for=mirostat_2>Mirostat v2</label>
</div>
<label><input type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
Mirostat v1
</label>
<label>
<input type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
Mirostat v2
</label>
</fieldset>
<fieldset class="two">
${params.value.mirostat == 0 && html`
<${ParamSlider} min=1 max=1000 step=1 param=top_k>Top K<//>
<${ParamSlider} min=0 max=1 step=0.01 param=tfs_z>Tail free sampling<//>
@ -387,14 +445,17 @@
<${ParamSlider} min=0 max=1 step=0.01 param=mirostat_eta>Mirostat eta, learning rate<//>
<${ParamSlider} min=0 max=1000 step=1 param=mirostat_tau>Mirostat tau, target entropy<//>
`}
</fieldset>
`}
</fieldset>
</form>
`
}
// poor man's markdown replacement
const Markdownish = (params) => {
const md = params.text
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
.replace(/__(.*?)__/g, '<strong>$1</strong>')

View File

@ -613,6 +613,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -632,6 +633,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
fprintf(stdout, " is still suboptimal, especially for q2_K, q3_K, q5_K, and q6_K.\n" );
#endif
fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@ -708,6 +712,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break;
}
params.n_gqa = std::stoi(argv[i]);
} else if (arg == "-eps" || arg == "--rms-norm-eps") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rms_norm_eps = std::stof(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
@ -769,13 +779,19 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
}
}
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
#endif // GGML_USE_CUBLAS
} else if (arg == "--mul-mat-q" || arg == "-mmq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = true;
#else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
#endif // GGML_USE_CUBLAS
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {

View File

@ -16,6 +16,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
struct random_normal_distribution {
std::mt19937 gen;
std::normal_distribution<float> rd;
@ -439,7 +441,7 @@ struct ggml_tensor * forward(
// norm
{
// cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// cur = attention_norm*cur
cur = ggml_mul(ctx0,
@ -562,7 +564,7 @@ struct ggml_tensor * forward(
// norm
{
// cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
// cur = ffn_norm*cur
// cur shape [n_embd,N,1,1]
@ -606,7 +608,7 @@ struct ggml_tensor * forward(
{
// inpL shape [n_embd,N,1,1]
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// inpL = norm*inpL
// inpL shape [n_embd,N,1,1]
@ -694,7 +696,7 @@ struct ggml_tensor * forward_batch(
// norm
{
// cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur
@ -857,7 +859,7 @@ struct ggml_tensor * forward_batch(
// norm
{
// cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur
@ -910,7 +912,7 @@ struct ggml_tensor * forward_batch(
{
// inpL shape [n_embd,N*n_batch,1,1]
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL
@ -979,7 +981,7 @@ struct ggml_tensor * forward_batch_wo_cache(
// norm
{
// cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur
@ -1085,7 +1087,7 @@ struct ggml_tensor * forward_batch_wo_cache(
// norm
{
// cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur
@ -1138,7 +1140,7 @@ struct ggml_tensor * forward_batch_wo_cache(
{
// inpL shape [n_embd,N*n_batch,1,1]
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL
@ -1203,7 +1205,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur
@ -1267,7 +1269,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur
@ -1311,7 +1313,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
// norm
{
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL
@ -1603,7 +1605,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
struct my_llama_layer & layer = model->layers[il];
// tensors with values necessary for backward pass are in persistent buf(-1)
// other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch);
use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
@ -1623,7 +1625,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch);
use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch);
use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch);
@ -1666,7 +1668,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
}
clr_buf(0);
use_buf(0);
struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch);
struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch);
struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch);
struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch);
use_buf(-1);
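Not part of the diff: a minimal sketch of the updated ggml_rms_norm call pattern, now that the epsilon is supplied by the caller instead of being hard-coded. The 16 MiB context size and the 1e-5f value are arbitrary illustrative choices (the diff itself notes 1e-5 for LLaMA-v2; 1e-6 was the old built-in default).

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    // the epsilon is now a caller-supplied parameter
    struct ggml_tensor * y = ggml_rms_norm(ctx, x, 1e-5f);

    struct ggml_cgraph gf = ggml_build_forward(y);
    (void) gf; // graph construction only; computing it is out of scope here

    ggml_free(ctx);
    return 0;
}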

ggml-alloc.c (new file, 541 lines)
View File

@ -0,0 +1,541 @@
#include "ggml-alloc.h"
#include "ggml.h"
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
//#define GGML_ALLOCATOR_DEBUG
//#define AT_PRINTF printf
#define AT_PRINTF(...) ((void)0)
struct hash_node {
struct ggml_tensor * t;
int n_children;
int n_views;
};
static size_t hash(void * p) {
return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
}
static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
size_t h = hash(t);
// linear probing
size_t i = h;
while (hash_table[i].t != NULL) {
if (hash_table[i].t == t) {
return &hash_table[i];
}
i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
if (i == h) {
// hash table is full
GGML_ASSERT(false);
}
}
hash_table[i].t = t;
return &hash_table[i];
}
// TODO: GGML_PAD ?
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
assert(alignment && !(alignment & (alignment - 1))); // power of 2
size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
return offset + align;
}
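aligned_offset() rounds the offset up so that buffer + offset lands on an alignment boundary. A standalone check of the same arithmetic with arbitrary example values (illustrative only, not part of the diff):

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

int main(void) {
    const uintptr_t base = 0x1000;              // already 32-byte aligned
    const size_t offset = 100, alignment = 32;
    // same arithmetic as aligned_offset(): pad until base + offset is aligned
    const size_t align = (alignment - ((base + offset) % alignment)) % alignment;  // 28
    assert(offset + align == 128);
    assert((base + offset + align) % alignment == 0);
    return 0;
}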
struct free_block {
void * addr;
size_t size;
};
#define MAX_FREE_BLOCKS 128
struct ggml_allocr {
void * data;
size_t size;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size;
bool measure;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
};
#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i] == NULL) {
alloc->allocated_tensors[i] = tensor;
return;
}
}
GGML_ASSERT(!"out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i] == tensor ||
(alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
alloc->allocated_tensors[i] = NULL;
return;
}
}
printf("tried to free tensor %s not found\n", tensor->name);
GGML_ASSERT(!"tensor not found");
}
#endif
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor);
UNUSED(alloc);
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
size_t max_avail = 0;
// find the best fitting free block
int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
best_fit_block = i;
best_fit_size = block->size;
}
}
AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
return;
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr;
block->addr = (char*)block->addr + size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
alloc->n_free_blocks--;
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
tensor->data = addr;
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
size_t cur_max = (char*)addr - (char*)alloc->data + size;
if (cur_max > alloc->max_size) {
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i]) {
printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
}
}
printf("\n");
}
#endif
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
// the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it
return;
}
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, tensor);
#endif
// see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
// check if ptr is at the end of the block
if ((char*)block->addr + block->size == ptr) {
block->size += size;
// check if we can merge with the next block
if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
block->size += alloc->free_blocks[i+1].size;
alloc->n_free_blocks--;
for (int j = i+1; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
return;
}
// check if ptr is at the beginning of the block
if ((char*)ptr + size == block->addr) {
block->addr = ptr;
block->size += size;
// check if we can merge with the previous block
if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
alloc->free_blocks[i-1].size += block->size;
alloc->n_free_blocks--;
for (int j = i; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
return;
}
}
// otherwise, add a new block
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
alloc->free_blocks[i] = alloc->free_blocks[i-1];
}
// insert the new block
alloc->free_blocks[insert_pos].addr = ptr;
alloc->free_blocks[insert_pos].size = size;
alloc->n_free_blocks++;
}
void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
alloc->free_blocks[0].size = alloc->size - align_offset;
}
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocr){
/*.data = */ data,
/*.size = */ size,
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ false,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {0},
#endif
};
ggml_allocr_reset(alloc);
return alloc;
}
// address and size of the buffer when measuring
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR,
/*.size = */ MEASURE_MAX_SIZE,
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ true,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {0},
#endif
};
ggml_allocr_reset(alloc);
return alloc;
}
void ggml_allocr_free(struct ggml_allocr * alloc) {
free(alloc);
}
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
return alloc->measure;
}
//////////// compute graph allocator
static bool ggml_is_view(struct ggml_tensor * t) {
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
}
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
if (a->type != b->type) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (a->ne[i] != b->ne[i]) {
return false;
}
if (a->nb[i] != b->nb[i]) {
return false;
}
}
return true;
}
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
switch (t->op) {
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
case GGML_OP_VIEW:
return t->src[0];
case GGML_OP_CPY:
return t->src[1];
default:
return NULL;
}
}
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
struct ggml_tensor * parent = t;
do {
parent = get_view_parent(parent);
} while (ggml_is_view(parent));
return parent;
}
static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
case GGML_OP_SCALE:
case GGML_OP_DIAG_MASK_ZERO:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_ACC:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_LOG:
case GGML_OP_UNARY:
case GGML_OP_ROPE:
case GGML_OP_RMS_NORM:
case GGML_OP_SET:
case GGML_OP_SOFT_MAX:
case GGML_OP_CONT:
return true;
default:
return false;
}
}
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (ggml_is_view(node)) {
size_t offset;
switch(node->op) {
case GGML_OP_VIEW:
memcpy(&offset, node->op_params, sizeof(size_t));
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
} else {
// see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * parent = node->src[i];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
// the parent's data that it will need later (same layout requirement). the problem is that then
// we cannot free the tensor because the original address of the allocation is lost.
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->data = parent->data;
return;
}
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data;
}
return;
}
}
}
ggml_allocr_alloc(alloc, node);
}
}
}
static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
// reset hash table
struct hash_node * ht = alloc->hash_table;
memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
// count number of children and views
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
if (ggml_is_view(node)) {
struct ggml_tensor * view_src = get_view_source(node);
hash_get(ht, view_src)->n_views += 1;
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
hash_get(ht, parent)->n_children += 1;
}
}
}
// allocate tensors
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
// graph inputs are allocated first to ensure that they are not overwritten by each other
if (inputs != NULL && inputs[g] != NULL) {
for (int i = 0; inputs[g][i] != NULL; i++) {
struct ggml_tensor * input = inputs[g][i];
AT_PRINTF("input: %s\n", input->name);
allocate_node(alloc, input);
}
}
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs)
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
allocate_node(alloc, parent);
}
// allocate node
allocate_node(alloc, node);
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
AT_PRINTF("%s", parent->name);
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
AT_PRINTF(", ");
}
}
AT_PRINTF("\n");
// update parents
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
p_hn->n_children -= 1;
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src);
}
}
else {
if (parent->data != node->data) {
ggml_allocator_free_tensor(alloc, parent);
}
}
}
}
AT_PRINTF("\n");
}
// free graph outputs here that wouldn't be freed otherwise because they have no children
if (outputs != NULL && outputs[g] != NULL) {
for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name);
ggml_allocator_free_tensor(alloc, output);
}
}
}
return alloc->max_size;
}
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}
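Not part of the diff: a usage sketch for the allocator above, assuming graphs built from no_alloc tensors. The build callback, the 32-byte alignment and the function name are illustrative; the point is the two-pass pattern — a measuring allocator reports the peak size, then a real allocator places the tensors in one buffer of that size.

#include "ggml.h"
#include "ggml-alloc.h"
#include <stdlib.h>

// build() is assumed to construct a fresh graph on each call; the fake
// addresses assigned during the measuring pass must not be reused.
static void place_graph_tensors(struct ggml_cgraph * (*build)(void)) {
    const size_t alignment = 32; // illustrative value

    // pass 1: measure - no real memory is touched, only the peak size is tracked
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    const size_t buf_size = ggml_allocr_alloc_graph(measure, build()) + alignment; // slack for alignment
    ggml_allocr_free(measure);

    // pass 2: one real buffer of the measured size; tensor->data is assigned for real
    void * buf = malloc(buf_size);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, buf_size, alignment);
    struct ggml_cgraph * gf = build();
    ggml_allocr_alloc_graph(alloc, gf);

    // ... compute gf here, then release everything ...
    ggml_allocr_free(alloc);
    free(buf);
}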

ggml-alloc.h (new file, 22 lines)
View File

@ -0,0 +1,22 @@
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

View File

@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

View File

@ -61,6 +61,13 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
// if the graph has been optimized for concurrent dispatch
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

View File

@ -36,6 +36,9 @@ struct ggml_metal_context {
int n_buffers;
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
int concur_list[GGML_MAX_NODES];
int concur_list_len;
// custom kernels
#define GGML_METAL_DECL_KERNEL(name) \
id<MTLFunction> function_##name; \
@ -98,6 +101,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
ctx->concur_list_len = 0;
// determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) {
@ -217,6 +221,13 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb;
}
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
if (ctx->concur_list_len) {
return true;
}
return false;
}
// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
@ -355,11 +366,98 @@ void ggml_metal_get_tensor(
memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
}
void ggml_metal_graph_find_concurrency(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
int search_depth = gf->n_nodes; // we only look for concurrency within this range to avoid wasting too much time
int nodes_unused[GGML_MAX_NODES];
for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
ctx->concur_list_len = 0;
int n_left = gf->n_nodes;
int n_start = 0; // all nodes before n_start in the nodes_unused array have been sorted and stored back to ctx->concur_list
int level_pos = 0; // in ctx->concur_list, the last layer (level) ends at level_pos
while (n_left > 0) {
// number of nodes at a layer (that can be issued concurrently)
int concurrency = 0;
for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
if (nodes_unused[i]) {
// if the requirements for gf->nodes[i] are satisfied
int exe_flag=1;
// scan all srcs
for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
if (src_cur) {
// if it is a leaf node, the requirement is satisfied.
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
// otherwise this src should be the output from previous nodes.
int is_found = 0;
// scan 2*search_depth back because we inserted barriers.
for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
}
if (is_found == 0) {exe_flag = 0; break;}
}
}
if (exe_flag) {
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
int64_t data_start = (int64_t) gf->nodes[i]->data;
int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
for (int j = n_start; j < i; j++) {
if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
&& gf->nodes[j]->op != GGML_OP_VIEW \
&& gf->nodes[j]->op != GGML_OP_TRANSPOSE \
&& gf->nodes[j]->op != GGML_OP_PERMUTE) {
if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
continue;
} else {
exe_flag = 0;
}
}
}
}
if (exe_flag) {
ctx->concur_list[level_pos + concurrency] = i;
nodes_unused[i] = 0;
concurrency++;
ctx->concur_list_len++;
}
}
}
n_left -= concurrency;
// add a barrier to separate the layers (levels)
ctx->concur_list[level_pos + concurrency] = -1;
ctx->concur_list_len++;
// skip over all nodes that have already been sorted
while (!nodes_unused[n_start]) {n_start++;}
level_pos += concurrency + 1;
}
if (ctx->concur_list_len > GGML_MAX_NODES) {
fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
}
}
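The resulting concur_list is a flat array of node indices grouped level by level, with -1 inserted as a barrier between levels; nodes within one level have no dependencies on each other. A small illustrative C walker (names invented here) that decodes that layout; the compute loop below consumes it in the same way:

#include <stdio.h>

// prints the levels encoded in a concur_list of length `len`
static void print_concur_levels(const int * concur_list, int len) {
    int level = 0;
    printf("level %d:", level);
    for (int i = 0; i < len; i++) {
        if (concur_list[i] == -1) {
            printf("\nlevel %d:", ++level);   // barrier: everything before must finish first
        } else {
            printf(" %d", concur_list[i]);    // node indices within a level are independent
        }
    }
    printf("\n");
}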
void ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
metal_printf("%s: evaluating graph\n", __func__);
// if ctx->concur_list has been populated, dispatch concurrently
// else fall back to serial dispatch
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
// create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel
@ -378,7 +476,7 @@ void ggml_metal_graph_compute(
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
dispatch_async(queue, ^{
size_t offs_src0 = 0;
@ -389,10 +487,21 @@ void ggml_metal_graph_compute(
id<MTLComputeCommandEncoder> encoder = nil;
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
for (int ind = node_start; ind < node_end; ++ind) {
const int i = has_concur ? ctx->concur_list[ind] : ind;
if (i == -1) {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
continue;
}
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
continue;
}
for (int i = node_start; i < node_end; ++i) {
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
@ -463,7 +572,7 @@ void ggml_metal_graph_compute(
case GGML_OP_ADD:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) {
@ -484,7 +593,7 @@ void ggml_metal_graph_compute(
case GGML_OP_MUL:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) {
@ -505,7 +614,7 @@ void ggml_metal_graph_compute(
case GGML_OP_SCALE:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float scale = *(const float *) src1->data;
@ -519,52 +628,60 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SILU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_SILU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_RELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_GELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
default:
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
} break;
case GGML_OP_RELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
[encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_GELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
[encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SOFT_MAX:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32;
@ -582,7 +699,7 @@ void ggml_metal_graph_compute(
case GGML_OP_DIAG_MASK_INF:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *)(dst->op_params))[0];
@ -601,7 +718,8 @@ void ggml_metal_graph_compute(
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
GGML_ASSERT(ne00 == ne10);
GGML_ASSERT(ne02 == ne12);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
GGML_ASSERT(ne03 == ne13);
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
@ -629,11 +747,11 @@ void ggml_metal_graph_compute(
initWithDevice:ctx->device transposeLeft:false transposeRight:true
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
// we need to do ne02 multiplications
// we need to do ne12 multiplications
// TODO: is there a way to do this in parallel - currently very slow ..
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
for (int64_t i02 = 0; i02 < ne02; ++i02) {
size_t offs_src0_cur = offs_src0 + i02*nb02;
for (int64_t i02 = 0; i02 < ne12; ++i02) {
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
size_t offs_src1_cur = offs_src1 + i02*nb12;
size_t offs_dst_cur = offs_dst + i02*nb2;
@ -645,7 +763,7 @@ void ggml_metal_graph_compute(
}
} else {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
int nth0 = 32;
@ -655,8 +773,6 @@ void ggml_metal_graph_compute(
switch (src0t) {
case GGML_TYPE_F16:
{
GGML_ASSERT(ne02 == ne12);
nth0 = 64;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@ -736,16 +852,18 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
@ -772,7 +890,7 @@ void ggml_metal_graph_compute(
case GGML_OP_GET_ROWS:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
switch (src0->type) {
@ -801,10 +919,11 @@ void ggml_metal_graph_compute(
case GGML_OP_RMS_NORM:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float eps = 1e-6f;
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
const int nth = 512;
@ -823,7 +942,7 @@ void ggml_metal_graph_compute(
case GGML_OP_NORM:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float eps = 1e-5f;
@ -845,7 +964,7 @@ void ggml_metal_graph_compute(
case GGML_OP_ALIBI:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
GGML_ASSERT((src0t == GGML_TYPE_F32));
@ -888,7 +1007,7 @@ void ggml_metal_graph_compute(
case GGML_OP_ROPE:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *) dst->op_params)[0];
@ -932,7 +1051,7 @@ void ggml_metal_graph_compute(
case GGML_OP_CONT:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32;
@ -979,8 +1098,10 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
default:
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
}
}

View File

@ -387,87 +387,90 @@ kernel void kernel_rms_norm(
}
}
// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
// function for calculating the inner product between half a q4_0 block and 16 floats (yl); sumy is SUM(yl[i])
// il indicates where the q4 quants begin (0 or QK4_0/4)
// we assume that the yl's have been multiplied with the appropriate scale factor
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
float d = qb_curr->d;
float4 acc = 0.f;
device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
for (int i = 0; i < 16; i+=2) {
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
float2 acc = 0.f;
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
for (int i = 0; i < 8; i+=2) {
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ yl[i + 9] * (qs[i / 2] & 0xF000);
}
return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
return d * (sumy * -8.f + acc[0] + acc[1]);
}
// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
// function for calculating the inner product between half a q4_1 block and 16 floats (yl); sumy is SUM(yl[i])
// il indicates where the q4 quants begin (0 or QK4_0/4)
// we assume that the yl's have been multiplied with the appropriate scale factor
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
float d = qb_curr->d;
float m = qb_curr->m;
float4 acc = 0.f;
device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
for (int i = 0; i < 16; i+=2) {
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
float2 acc = 0.f;
for (int i = 0; i < 8; i+=2) {
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ yl[i + 9] * (qs[i / 2] & 0xF000);
}
return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
return d * (acc[0] + acc[1]) + sumy * m;
}
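The comments above describe the shift-free trick: the y values are pre-divided by 1, 16, 256 and 4096 so the masked (un-shifted) nibbles can be multiplied directly. A standalone C check of that identity on a single 16-bit word, with made-up values rather than the real q4_0 layout:

#include <assert.h>
#include <stdint.h>

int main(void) {
    const uint16_t qs = 0xA3C7;            // four nibbles: 7, 12, 3, 10 (low to high)
    const float y[4]  = { 1.5f, -2.0f, 0.25f, 4.0f };

    // reference: extract each nibble with a shift
    float ref = y[0]*((qs >>  0) & 0xF) + y[1]*((qs >>  4) & 0xF)
              + y[2]*((qs >>  8) & 0xF) + y[3]*((qs >> 12) & 0xF);

    // shift-free: mask only, and fold the shift into the y scale factors
    float alt = (y[0]          ) * (qs & 0x000F)
              + (y[1] / 16.f   ) * (qs & 0x00F0)
              + (y[2] / 256.f  ) * (qs & 0x0F00)
              + (y[3] / 4096.f ) * (qs & 0xF000);

    assert(ref == alt);   // exact: all scale factors are powers of two
    return 0;
}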
// putting them in the kernel causes a significant performance penalty
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
template<typename block_q_type>
// Note: This is a template, but strictly speaking it only applies to
// quantizations where the block size is 32. It also does not
// guard against the number of rows not being divisible by
// N_DST, so this is another explicit assumption of the implementation.
template<typename block_q_type, int nr, int nsg, int nw>
void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
uint2 tgpig, uint tiisg, uint sgitg) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
const int first_row = (r0 * nsg + sgitg) * nr;
device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
device const float * y = (device const float *) src1 + r1*ne10;
float4 y_curr[8]; // src1 vector cache
float sumf[N_DST]={0.f}, all_sum;
thread float * yl=(thread float *)y_curr;
float yl[16]; // src1 vector cache
float sumf[nr]={0.f};
// each thread in a SIMD group deals with 1 block.
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
const int ix = tiisg/2;
const int il = 8*(tiisg%2);
device const float * yb = y + ix * QK4_0 + il;
// each thread in a SIMD group deals with half a block.
for (int ib = ix; ib < nb; ib += nw/2) {
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
for (int i = 0; i < 8; i += 2) {
sumy += yb[i] + yb[i+1];
yl[i+0] = yb[i+ 0];
yl[i+1] = yb[i+ 1]/256.f;
sumy += yb[i+16] + yb[i+17];
yl[i+8] = yb[i+16]/16.f;
yl[i+9] = yb[i+17]/4096.f;
}
for (int row = 0; row < N_DST; row++) {
sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
for (int row = 0; row < nr; row++) {
sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
}
yb += QK4_0 * 16;
}
// from now loads two rows every time and 16 blocks per row
int ir = tiisg / (N_SIMDWIDTH / 2);
int ib = tiisg % (N_SIMDWIDTH / 2);
for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
for (int row = 0; row < N_DST; row+=2) {
if (nb_start + ib < nb) {
sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
}
}
}
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
for (int row = 0; row < nr; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0 && first_row + row < ne01) {
dst[r1*ne0 + first_row + row] = tot;
}
}
}
@ -483,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_q4_1_f32(
@ -497,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_f16_f32(
@ -506,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
@ -526,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
sum[tpitg.x] = 0.0f;
@ -549,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
}
}
kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,

ggml.c (1114 lines changed)

File diff suppressed because it is too large

ggml.h (110 lines changed)
View File

@ -208,6 +208,7 @@
#define GGML_UNUSED(x) (void)(x)
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
#define GGML_ASSERT(x) \
do { \
@ -330,16 +331,6 @@ extern "C" {
GGML_OP_ARGMAX,
GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_ABS,
GGML_OP_SGN,
GGML_OP_NEG,
GGML_OP_STEP,
GGML_OP_TANH,
GGML_OP_ELU,
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_GELU_QUICK,
GGML_OP_SILU,
GGML_OP_SILU_BACK,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
@ -378,6 +369,8 @@ extern "C" {
GGML_OP_WIN_PART,
GGML_OP_WIN_UNPART,
GGML_OP_UNARY,
GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY,
@ -391,6 +384,24 @@ extern "C" {
GGML_OP_COUNT,
};
enum ggml_unary_op {
GGML_UNARY_OP_ABS,
GGML_UNARY_OP_SGN,
GGML_UNARY_OP_NEG,
GGML_UNARY_OP_STEP,
GGML_UNARY_OP_TANH,
GGML_UNARY_OP_ELU,
GGML_UNARY_OP_RELU,
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
};
enum ggml_object_type {
GGML_OBJECT_TENSOR,
GGML_OBJECT_GRAPH,
GGML_OBJECT_WORK_BUFFER
};
// ggml object
struct ggml_object {
@ -399,7 +410,9 @@ extern "C" {
struct ggml_object * next;
char padding[8];
enum ggml_object_type type;
char padding[4];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@ -420,7 +433,7 @@ extern "C" {
enum ggml_op op;
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
bool is_param;
@ -438,7 +451,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
char padding[4];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -459,6 +472,11 @@ extern "C" {
void * abort_callback_data;
};
// next prime after GGML_MAX_NODES
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
#define GGML_GRAPH_HASHTABLE_SIZE 8273
// computation graph
struct ggml_cgraph {
int n_nodes;
@ -468,12 +486,16 @@ extern "C" {
struct ggml_tensor * grads[GGML_MAX_NODES];
struct ggml_tensor * leafs[GGML_MAX_NODES];
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
};
static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
// scratch buffer
struct ggml_scratch {
size_t offs;
@ -535,6 +557,7 @@ extern "C" {
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@ -558,6 +581,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
@ -617,9 +641,11 @@ extern "C" {
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
//
// operations on tensors with backpropagation
@ -629,6 +655,11 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_dup_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_add(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -853,14 +884,17 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
struct ggml_tensor * a,
float eps);
GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
struct ggml_tensor * a,
float eps);
// a - x
// b - dy
// TODO: update with configurable eps
GGML_API struct ggml_tensor * ggml_rms_norm_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -952,11 +986,22 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// a -> b, in-place, return view(b)
GGML_API struct ggml_tensor * ggml_cpy_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// make contiguous
GGML_API struct ggml_tensor * ggml_cont(
struct ggml_context * ctx,
struct ggml_tensor * a);
// make contiguous, in-place
GGML_API struct ggml_tensor * ggml_cont_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape(
@ -1125,7 +1170,18 @@ extern "C" {
int mode,
int n_ctx);
// custom RoPE, in-place, returns view(a)
// custom RoPE
GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
int n_ctx,
float freq_base,
float freq_scale);
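// A hedged usage sketch of the custom RoPE call; the values below are purely
// illustrative (freq_base and freq_scale default to 10000.0f and 1.0f in the
// llama.cpp hparams later in this change):
//
//     struct ggml_tensor * q_rot = ggml_rope_custom(ctx, q, n_past, n_dims,
//                                                   /*mode      =*/ 0,
//                                                   /*n_ctx     =*/ 0,
//                                                   /*freq_base =*/ 10000.0f,
//                                                   /*freq_scale=*/ 0.5f);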
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1268,6 +1324,16 @@ extern "C" {
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
GGML_API struct ggml_tensor * ggml_unary(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
GGML_API struct ggml_tensor * ggml_unary_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
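// A minimal hedged example (assuming GGML_UNARY_OP_GELU is one of the
// ggml_unary_op values):
//
//     struct ggml_tensor * y = ggml_unary(ctx, x, GGML_UNARY_OP_GELU);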
GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1347,11 +1413,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API size_t ggml_graph_overhead(void);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
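// A minimal sketch of how these declarations fit together, assuming
// ggml_graph_compute(cgraph, cplan) is the matching compute entry point:
//
//     struct ggml_cgraph * gf = ggml_new_graph(ctx);
//     ggml_build_forward_expand(gf, out);
//
//     struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads =*/ 4);
//     uint8_t * work = NULL;
//     if (plan.work_size > 0) {
//         work = malloc(plan.work_size);   // caller owns the work buffer
//         plan.work_data = work;
//     }
//     ggml_graph_compute(gf, &plan);
//     free(work);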

View File

@ -39,6 +39,8 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
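// MM256_SET_M128I(a, b) packs two 128-bit halves into one 256-bit vector with
// b in the low lane and a in the high lane; it is written via cast+insert,
// presumably so the code also builds with compilers that lack _mm256_set_m128i.
// A small illustration (assumes AVX and <immintrin.h>):
//
//     __m128i lo = _mm_set1_epi32(1);
//     __m128i hi = _mm_set1_epi32(2);
//     __m256i v  = MM256_SET_M128I(hi, lo);   // 32-bit lanes 0..3 = 1, lanes 4..7 = 2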
//
// 2-6 bit quantization in super-blocks
//
@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
__m256i sumi = _mm256_setzero_si256();
@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
// sumf += -dmin * summs in 32bits*8
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
}
// sumf += dall * isum - dmin * summs in 32bits
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
}
@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
summs += dmin * smin;
const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@ -1666,6 +1668,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc) + summs;
#elif defined __AVX__
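// Plain-AVX fallback: 256-bit integer instructions are only available with
// AVX2, so here the quantized values are unpacked and multiplied in 128-bit
// halves and only the final accumulation uses 256-bit float math. The AVX
// paths added below for the Q3_K to Q6_K dot products follow the same pattern.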
const __m128i m3 = _mm_set1_epi8(3);
__m256 acc = _mm256_setzero_ps();
uint32_t ud, um;
const uint8_t * restrict db = (const uint8_t *)&ud;
const uint8_t * restrict mb = (const uint8_t *)&um;
float summs = 0;
// TODO: optimize this
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
const uint8_t * restrict q2 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
ud = (sc[0] >> 0) & 0x0f0f0f0f;
um = (sc[0] >> 4) & 0x0f0f0f0f;
int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
summs += dmin * smin;
const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
const __m128i q2_0 = _mm_and_si128(q2bits, m3);
const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
}
*s = hsum_float_8(acc) + summs;
#else
float sumf = 0;
@ -1861,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
// high bit
const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@ -2072,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
}
// multiply with block scale and accumulate
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
}
@ -2247,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
aux16[0] = a & 0x0f0f;
aux16[1] = (a >> 4) & 0x0f0f;
const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
memcpy(&aux64, x[i].hmask, 8);
const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
__m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
__m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
__m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@ -2262,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
// prepare low and high bits
const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
@ -2295,6 +2353,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc);
#elif defined __AVX__
const __m128i m3 = _mm_set1_epi8(3);
const __m128i m1 = _mm_set1_epi8(1);
__m256 acc = _mm256_setzero_ps();
uint64_t aux64;
uint16_t aux16[2];
const int8_t * aux8 = (const int8_t *)aux16;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
const uint8_t * restrict q3 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
const uint16_t a = *(const uint16_t *)x[i].scales;
aux16[0] = a & 0x0f0f;
aux16[1] = (a >> 4) & 0x0f0f;
const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
memcpy(&aux64, x[i].hmask, 8);
__m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
__m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
__m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
__m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
// load low 2 bits
const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
// prepare low and high bits
const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
// load Q8 quants
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
// Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
// and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
// and 2 if the high bit was set)
const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
__m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
__m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
__m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
__m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
p16_0 = _mm_sub_epi16(p16_0, q8s_0);
p16_1 = _mm_sub_epi16(p16_1, q8s_1);
p16_2 = _mm_sub_epi16(p16_2, q8s_2);
p16_3 = _mm_sub_epi16(p16_3, q8s_3);
// multiply with scales
p16_0 = _mm_madd_epi16(scale_0, p16_0);
p16_1 = _mm_madd_epi16(scale_1, p16_1);
p16_2 = _mm_madd_epi16(scale_2, p16_2);
p16_3 = _mm_madd_epi16(scale_3, p16_3);
p16_0 = _mm_add_epi32(p16_0, p16_2);
p16_1 = _mm_add_epi32(p16_1, p16_3);
__m256i p16 = MM256_SET_M128I(p16_1, p16_0);
// multiply with block scale and accumulate
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
}
*s = hsum_float_8(acc);
#else
int8_t aux8[QK_K];
@ -2477,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
const __m256i scales = _mm256_set_m128i(sc128, sc128);
const __m256i scales = MM256_SET_M128I(sc128, sc128);
__m256i sumi = _mm256_setzero_si256();
@ -2584,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
}
__m256 vd = _mm256_set1_ps(d);
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
}
@ -2781,6 +2926,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc) - summs;
#elif defined __AVX__
const __m128i m4 = _mm_set1_epi8(0xF);
__m256 acc = _mm256_setzero_ps();
float summs = 0;
uint16_t aux16[2];
const uint8_t * scales = (const uint8_t *)aux16;
for (int i = 0; i < nb; ++i) {
const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
const __m256 vd = _mm256_set1_ps(d);
const uint16_t * a = (const uint16_t *)x[i].scales;
aux16[0] = a[0] & 0x0f0f;
aux16[1] = (a[0] >> 4) & 0x0f0f;
summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
}
*s = hsum_float_8(acc) - summs;
#else
uint8_t aux8[QK_K];
@ -2963,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
summs += dmin * _mm_extract_epi32(hsum, 0);
const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
const __m256i scales = _mm256_set_m128i(sc128, sc128);
const __m256i scales = MM256_SET_M128I(sc128, sc128);
const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
__m256i hmask = mone;
@ -3102,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
}
__m256 vd = _mm256_set1_ps(d);
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
}
@ -3265,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
int64_t aux64;
memcpy(&aux64, x[i].qh, 8);
const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@ -3295,6 +3494,63 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc);
#elif defined __AVX__
const __m128i m4 = _mm_set1_epi8(0xF);
const __m128i mone = _mm_set1_epi8(1);
__m256 acc = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const uint8_t * restrict q5 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
int64_t aux64;
memcpy(&aux64, x[i].qh, 8);
const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
}
*s = hsum_float_8(acc);
#else
int8_t aux8[QK_K];
@ -3671,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
}
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
}
@ -3829,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@ -3857,6 +4113,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc);
#elif defined __AVX__
const __m128i m4 = _mm_set1_epi8(0xF);
const __m128i m2 = _mm_set1_epi8(3);
const __m128i m32s = _mm_set1_epi8(32);
__m256 acc = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
const uint8_t * restrict q4 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
__m128i sumi_0 = _mm_setzero_si128();
__m128i sumi_1 = _mm_setzero_si128();
const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
__m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
__m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
__m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
__m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
__m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
__m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
__m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
__m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
p16_0 = _mm_sub_epi16(p16_0, q8s_0);
p16_1 = _mm_sub_epi16(p16_1, q8s_1);
p16_2 = _mm_sub_epi16(p16_2, q8s_2);
p16_3 = _mm_sub_epi16(p16_3, q8s_3);
p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
}
*s = hsum_float_8(acc);
#else
int8_t aux8[QK_K];

313
llama.cpp
View File

@ -56,8 +56,14 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
#include "ggml-alloc.h"
#define LLAMA_USE_ALLOCATOR
#else
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
#endif
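// In other words: CPU and CLBlast/BLAS builds take the new ggml-alloc based
// allocator path, while CUDA and Metal builds keep the scratch-buffer path.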
// available llama models
enum e_model {
@ -186,6 +192,7 @@ struct llama_hparams {
// LLaMAv2
// TODO: load from model data hparams
float f_ffn_mult = 1.0f;
float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f;
@ -326,13 +333,22 @@ struct llama_model {
struct llama_context {
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
~llama_context() {
if (model_owner) {
delete &model;
}
#ifdef GGML_USE_METAL
if (ctx_metal) {
ggml_metal_free(ctx_metal);
}
}
#endif
#ifdef LLAMA_USE_ALLOCATOR
if (alloc) {
ggml_allocr_free(alloc);
}
#endif
}
std::mt19937 rng;
bool has_evaluated_once = false;
@ -370,7 +386,17 @@ struct llama_context {
// memory buffers used to evaluate the model
// TODO: move in llama_state
llama_ctx_buffer buf_compute;
#ifdef LLAMA_USE_ALLOCATOR
llama_ctx_buffer buf_alloc;
ggml_allocr * alloc = NULL;
#endif
#ifdef LLAMA_USE_SCRATCH
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
#endif
#ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL;
@ -380,9 +406,6 @@ struct llama_context {
ggml_mpi_context * ctx_mpi = NULL;
#endif
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
void use_buf(struct ggml_context * ctx, int i) {
#if defined(LLAMA_USE_SCRATCH)
size_t last_size = 0;
@ -869,6 +892,7 @@ struct llama_context_params llama_context_default_params() {
/*.n_ctx =*/ 512,
/*.n_batch =*/ 512,
/*.n_gqa =*/ 1,
/*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
/*.gpu_layers =*/ 0,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
@ -877,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.low_vram =*/ false,
/*.mul_mat_q =*/ false,
/*.f16_kv =*/ true,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
@ -1000,9 +1025,11 @@ static void llama_model_load_internal(
int n_ctx,
int n_batch,
int n_gqa,
float rms_norm_eps,
int n_gpu_layers,
int main_gpu,
const float * tensor_split,
const bool mul_mat_q,
float rope_freq_base,
float rope_freq_scale,
bool low_vram,
@ -1024,6 +1051,9 @@ static void llama_model_load_internal(
auto & hparams = model.hparams;
// TODO: read from file
hparams.f_rms_norm_eps = rms_norm_eps;
{
switch (hparams.n_layer) {
case 26: model.type = e_model::MODEL_3B; break;
@ -1072,6 +1102,7 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
@ -1127,9 +1158,11 @@ static void llama_model_load_internal(
}
(void) main_gpu;
(void) mul_mat_q;
#if defined(GGML_USE_CUBLAS)
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu);
ggml_cuda_set_mul_mat_q(mul_mat_q);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
#elif defined(GGML_USE_CLBLAST)
@ -1223,12 +1256,16 @@ static void llama_model_load_internal(
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
// this is the total memory required to run the inference
const size_t mem_required =
size_t mem_required =
ctx_size +
mmapped_size - vram_weights + // weights in VRAM not in memory
mmapped_size - vram_weights; // weights in VRAM not in memory
#ifndef LLAMA_USE_ALLOCATOR
mem_required +=
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);
#endif
// this is the memory required by one llama_state
const size_t mem_required_state =
@ -1330,9 +1367,11 @@ static bool llama_model_load(
int n_ctx,
int n_batch,
int n_gqa,
float rms_norm_eps,
int n_gpu_layers,
int main_gpu,
const float * tensor_split,
const bool mul_mat_q,
float rope_freq_base,
float rope_freq_scale,
bool low_vram,
@ -1343,7 +1382,8 @@ static bool llama_model_load(
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::exception & err) {
@ -1352,32 +1392,15 @@ static bool llama_model_load(
}
}
// evaluate the transformer
//
// - lctx: llama context
// - tokens: new batch of tokens to process
// - embd embeddings input
// - n_tokens number of tokens
// - n_past: the context size so far
// - n_threads: number of threads to use
//
static bool llama_eval_internal(
static struct ggml_cgraph * llama_build_graph(
llama_context & lctx,
const llama_token * tokens,
const float * embd,
int n_tokens,
int n_past,
int n_threads,
const char * cgraph_fname) {
int n_past) {
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
#ifdef GGML_USE_MPI
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
#endif
const int64_t t_start_us = ggml_time_us();
const int N = n_tokens;
const auto & model = lctx.model;
@ -1393,39 +1416,48 @@ static bool llama_eval_internal(
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_vocab = hparams.n_vocab;
const int64_t n_embd_gqa = hparams.n_embd_gqa();
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
const float freq_base = hparams.rope_freq_base;
const float freq_scale = hparams.rope_freq_scale;
const float rms_norm_eps = hparams.f_rms_norm_eps;
const int n_gpu_layers = model.n_gpu_layers;
auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
/*.mem_buffer =*/ buf_compute.addr,
/*.no_alloc =*/ false,
};
#ifdef LLAMA_USE_ALLOCATOR
params.no_alloc = true;
#endif
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph gf = {};
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
if (tokens) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc(lctx.alloc, inp_tokens);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
}
#else
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
#endif
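// note: during the measure pass the allocator only records sizes; the tensors
// have no real backing buffer yet, so data writes are skipped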
ggml_set_name(inp_tokens, "inp_tokens");
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@ -1435,7 +1467,15 @@ static bool llama_eval_internal(
#endif
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc(lctx.alloc, inpL);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
}
#else
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
#endif
}
const int i_gpu_start = n_layer - n_gpu_layers;
@ -1462,6 +1502,17 @@ static bool llama_eval_internal(
}
#endif // GGML_USE_CUBLAS
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
#else
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
#endif
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
for (int il = 0; il < n_layer; ++il) {
ggml_format_name(inpL, "layer_inp_%d", il);
@ -1479,7 +1530,7 @@ static bool llama_eval_internal(
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
offload_func(cur);
ggml_set_name(cur, "rms_norm_0");
@ -1531,8 +1582,8 @@ static bool llama_eval_internal(
ggml_set_name(v, "v");
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q =
@ -1557,9 +1608,6 @@ static bool llama_eval_internal(
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd_head)
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
offload_func_kq(KQ_scaled);
@ -1627,7 +1675,7 @@ static bool llama_eval_internal(
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
offload_func(cur);
ggml_set_name(cur, "rms_norm_1");
@ -1675,12 +1723,9 @@ static bool llama_eval_internal(
lctx.use_buf(ctx0, 0);
// used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL;
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
offload_func_nr(cur);
ggml_set_name(cur, "rms_norm_2");
@ -1688,8 +1733,6 @@ static bool llama_eval_internal(
cur = ggml_mul(ctx0, cur, model.norm);
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
ggml_set_name(cur, "result_norm");
embeddings = cur;
}
// lm_head
@ -1701,18 +1744,103 @@ static bool llama_eval_internal(
// logits -> probs
//cur = ggml_soft_max_inplace(ctx0, cur);
// run the computation
ggml_build_forward_expand(&gf, cur);
ggml_build_forward_expand(gf, cur);
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
#if 0
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
ggml_used_mem(ctx0)/1024.0/1024.0,
lctx.get_buf_max_mem(0)/1024.0/1024.0,
lctx.get_buf_max_mem(1)/1024.0/1024.0,
lctx.work_buffer.size()/1024.0/1024.0,
n_past, N);
#endif
ggml_free(ctx0);
return gf;
}
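// Condensed, the eval path below now does the following per call (when
// LLAMA_USE_ALLOCATOR is defined):
//
//     ggml_allocr_reset(lctx.alloc);                               // reuse the allocator buffer
//     ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
//     ggml_allocr_alloc_graph(lctx.alloc, gf);                     // place tensor data
//     ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);  // run the graph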
// evaluate the transformer
//
// - lctx: llama context
// - tokens: new batch of tokens to process
// - embd embeddings input
// - n_tokens number of tokens
// - n_past: the context size so far
// - n_threads: number of threads to use
//
static bool llama_eval_internal(
llama_context & lctx,
const llama_token * tokens,
const float * embd,
int n_tokens,
int n_past,
int n_threads,
const char * cgraph_fname) {
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
const int64_t t_start_us = ggml_time_us();
#ifdef GGML_USE_MPI
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
#endif
const int N = n_tokens;
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & kv_self = lctx.kv_self;
LLAMA_ASSERT(!!kv_self.ctx);
const int64_t n_embd = hparams.n_embd;
const int64_t n_vocab = hparams.n_vocab;
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_reset(lctx.alloc);
#endif
ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc_graph(lctx.alloc, gf);
#endif
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
#if GGML_USE_MPI
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
const int64_t n_layer = hparams.n_layer;
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
// TODO: disabled until #2413 is resolved
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
//}
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
ggml_metal_graph_compute(lctx.ctx_metal, gf);
ggml_metal_get_tensor (lctx.ctx_metal, res);
if (!lctx.embedding.empty()) {
ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
}
} else {
// IMPORTANT:
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@ -1730,34 +1858,32 @@ static bool llama_eval_internal(
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
}
#else
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
#endif
#if GGML_USE_MPI
ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
#endif
// update kv token count
lctx.kv_self.n = n_past + N;
struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
if (cgraph_fname) {
ggml_graph_export(&gf, cgraph_fname);
ggml_graph_export(gf, cgraph_fname);
}
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
// requires GGML_PERF to be defined
ggml_graph_print(&gf);
ggml_graph_print(gf);
#endif
// plot the computation graph in dot format (for debugging purposes)
//if (n_past%100 == 0) {
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
// extract logits
@ -1782,21 +1908,6 @@ static bool llama_eval_internal(
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
}
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
#if 0
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
ggml_used_mem(ctx0)/1024.0/1024.0,
lctx.get_buf_max_mem(0)/1024.0/1024.0,
lctx.get_buf_max_mem(1)/1024.0/1024.0,
lctx.work_buffer.size()/1024.0/1024.0,
n_past, N);
#endif
ggml_free(ctx0);
// measure the performance only for the single-token evals
if (N == 1) {
lctx.t_eval_us += ggml_time_us() - t_start_us;
@ -1908,7 +2019,9 @@ struct llama_tokenizer {
if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < (int) symbol.n; ++j) {
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
// NOTE: old version, before #2420 - not sure what the implications of this are
//llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
output.push_back(token_id);
}
} else {
@ -3084,8 +3197,8 @@ struct llama_model * llama_load_model_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.n_gpu_layers,
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
params.progress_callback_user_data)) {
delete model;
@ -3162,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
ctx->embedding.resize(hparams.n_embd);
}
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
#ifdef LLAMA_USE_ALLOCATOR
{
static const size_t tensor_alignment = 32;
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
// create measure allocator
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
// build worst-case graph
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
int n_past = hparams.n_ctx - n_tokens;
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
// measure memory requirements for the graph
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
// debug - for comparison with scratch buffer
//size_t prev_req =
// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
// MEM_REQ_SCRATCH1().at(ctx->model.type) +
// MEM_REQ_EVAL().at(ctx->model.type);
//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
// recreate allocator with exact memory requirements
ggml_allocr_free(ctx->alloc);
ctx->buf_alloc.resize(alloc_size);
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
}
#else
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
#endif
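// The "worst case" measured above is a full batch at the very end of the
// context window (n_tokens = min(n_ctx, n_batch), n_past = n_ctx - n_tokens),
// so the resulting buffer size should upper-bound any later eval call.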
#ifdef LLAMA_USE_SCRATCH
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
#endif
}
#ifdef GGML_USE_METAL
@ -3235,9 +3385,6 @@ struct llama_context * llama_init_from_file(
}
void llama_free(struct llama_context * ctx) {
if (ctx->model_owner) {
delete &ctx->model;
}
delete ctx;
}
@ -3647,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd;
const int n_embd = hparams.n_embd_gqa();
const int n_ctx = hparams.n_ctx;
const size_t kv_size = kv_self.buf.size;
@ -3750,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd;
const int n_embd = hparams.n_embd_gqa();
const int n_ctx = hparams.n_ctx;
size_t kv_size;

View File

@ -53,6 +53,10 @@
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
#ifndef LLAMA_DEFAULT_RMS_EPS
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
#endif
#ifdef __cplusplus
extern "C" {
#endif
@ -87,6 +91,7 @@ extern "C" {
int32_t n_ctx; // text context
int32_t n_batch; // prompt processing batch size
int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
@ -103,6 +108,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool low_vram; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
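// A hedged usage sketch of the new fields (values purely illustrative):
//
//     struct llama_context_params p = llama_context_default_params();
//     p.rms_norm_eps = 1e-5f;  // override LLAMA_DEFAULT_RMS_EPS (5e-6f)
//     p.mul_mat_q    = true;   // opt in to the experimental mul_mat_q kernels
//     struct llama_context * lctx = llama_new_context_with_model(model, p);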

View File

@ -16,7 +16,8 @@ fi
echo "#ifndef BUILD_INFO_H"
echo "#define BUILD_INFO_H"
echo ""
echo "#define BUILD_NUMBER $BUILD_NUMBER"
echo "#define BUILD_COMMIT \"$BUILD_COMMIT\""
echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n'
echo ""
echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n'
echo ""
echo "#endif // BUILD_INFO_H"

View File

@ -10,5 +10,5 @@ cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
cp -rpv ../ggml/tests/test-opt.c ./tests/test-opt.c
cp -rpv ../ggml/tests/test-grad0.c ./tests/test-grad0.c
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp

View File

@ -6,10 +6,10 @@ function(llama_add_test source)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
# llama_add_test(test-double-float.c) # SLOW
# llama_add_test(test-double-float.cpp) # SLOW
llama_add_test(test-quantize-fns.cpp)
llama_add_test(test-quantize-perf.cpp)
llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
llama_add_test(test-grad0.c) # SLOW
# llama_add_test(test-opt.c) # SLOW
llama_add_test(test-grad0.cpp) # SLOW
# llama_add_test(test-opt.cpp) # SLOW

View File

@ -3,10 +3,11 @@
// This is done by checking all finite (non-NaN, non-infinite) floats.
#undef NDEBUG
#include <assert.h>
#include <cassert>
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#include <cmath>
#include <cstdint>
#include <cstring>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdouble-promotion"
@ -32,8 +33,9 @@ inline static float silu_float(float x) {
int main(void) {
uint32_t x = UINT32_MAX;
do {
float f = *(float *)&x;
assert(!isfinite(f) || (round_orig(f) == round_float(f)));
float f;
memcpy(&f, &x, sizeof(x));
assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
} while (x--);
#ifdef __F16C__

View File

@ -1,10 +1,10 @@
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -47,16 +47,16 @@
#define GGML_PRINT(...) printf(__VA_ARGS__)
float frand(void) {
static float frand(void) {
return (float)rand()/(float)RAND_MAX;
}
int irand(int n) {
static int irand(int n) {
if (n == 0) return 0;
return rand()%n;
}
void get_random_dims(int64_t * dims, int ndims) {
static void get_random_dims(int64_t * dims, int ndims) {
dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) {
@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
}
}
struct ggml_tensor * get_random_tensor(
static struct ggml_tensor * get_random_tensor_f32(
struct ggml_context * ctx0,
int ndims,
int64_t ne[],
@ -112,7 +112,55 @@ struct ggml_tensor * get_random_tensor(
return result;
}
struct ggml_tensor * get_random_tensor_int(
static struct ggml_tensor * get_random_tensor_f16(
struct ggml_context * ctx0,
int ndims,
int64_t ne[],
float fmin,
float fmax) {
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
switch (ndims) {
case 1:
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
break;
case 2:
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
}
break;
case 3:
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
}
}
break;
case 4:
for (int i3 = 0; i3 < ne[3]; i3++) {
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
}
}
}
break;
default:
assert(false);
};
return result;
}
static struct ggml_tensor * get_random_tensor_i32(
struct ggml_context * ctx0,
int ndims,
int64_t ne[],
@ -160,24 +208,7 @@ struct ggml_tensor * get_random_tensor_int(
return result;
}
float get_element(const struct ggml_tensor * t, int idx) {
if (t->type == GGML_TYPE_F32) {
return ((float *)t->data)[idx];
}
if (t->type == GGML_TYPE_I32) {
return ((int32_t *)t->data)[idx];
}
assert(false);
return INFINITY;
}
void set_element(struct ggml_tensor * t, int idx, float value) {
((float *)t->data)[idx] = value;
}
void print_elements(const char* label, const struct ggml_tensor * t) {
static void print_elements(const char* label, const struct ggml_tensor * t) {
if (!t) {
printf("%s: %s = null\n", __func__, label);
return;
@ -186,7 +217,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
printf("%s: %s = [", __func__, label);
for (int k = 0; k < nelements; ++k) {
if (k > 0) { printf(", "); }
printf("%.5f", get_element(t, k));
printf("%.5f", ggml_get_f32_1d(t, k));
}
printf("] shape: [");
for (int k = 0; k < t->n_dims; ++k) {
@ -197,7 +228,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
}
bool check_gradient(
static bool check_gradient(
const char * op_name,
struct ggml_context * ctx0,
struct ggml_tensor * x[],
@ -237,23 +268,23 @@ bool check_gradient(
const int nelements = ggml_nelements(x[i]);
for (int k = 0; k < nelements; ++k) {
// compute gradient using finite differences
const float x0 = get_element(x[i], k);
const float x0 = ggml_get_f32_1d(x[i], k);
const float xm = x0 - eps;
const float xp = x0 + eps;
set_element(x[i], k, xp);
ggml_set_f32_1d(x[i], k, xp);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
const float f0 = ggml_get_f32_1d(f, 0);
set_element(x[i], k, xm);
ggml_set_f32_1d(x[i], k, xm);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
const float f1 = ggml_get_f32_1d(f, 0);
const float g0 = (f0 - f1)/(2.0f*eps);
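// g0 is the symmetric (central) finite-difference estimate of df/dx_k:
//     g0 ~= (f(x0 + eps) - f(x0 - eps)) / (2*eps)
// and is compared against g1, the gradient produced by the backward graph.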
set_element(x[i], k, x0);
ggml_set_f32_1d(x[i], k, x0);
// compute gradient using backward graph
ggml_graph_reset (&gf);
@ -261,7 +292,7 @@ bool check_gradient(
ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
const float g1 = get_element(x[i]->grad, k);
const float g1 = ggml_get_f32_1d(x[i]->grad, k);
const float error_abs = fabsf(g0 - g1);
const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
@ -279,7 +310,7 @@ bool check_gradient(
}
// TODO: clean-up this ..
bool check_mat_mul(
static bool check_mat_mul(
const struct ggml_tensor * y,
const struct ggml_tensor * x0,
const struct ggml_tensor * x1) {
@ -342,9 +373,9 @@ bool check_mat_mul(
int main(int argc, const char ** argv) {
struct ggml_init_params params = {
.mem_size = 128*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
/* .mem_size = */ 128*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
int64_t ne[4];
@ -392,19 +423,35 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * x[MAX_NARGS];
// add
// add f32
{
const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
check_gradient("add", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
}
}
// add f16
{
const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
}
}
@ -414,7 +461,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -430,7 +477,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -446,7 +493,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, 0.5f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -462,7 +509,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -478,7 +525,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -494,7 +541,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -510,7 +557,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -527,7 +574,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -537,6 +584,40 @@ int main(int argc, const char ** argv) {
}
}
// mean, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// argmax
if (0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// repeat
{
int64_t ne2[4];
@ -549,15 +630,36 @@ int main(int argc, const char ** argv) {
const int nargs = 1;
for (int ndims = 1; ndims <= 2; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
}
}
// repeat back
{
int64_t ne2[4];
get_random_dims(ne2, 4);
ne2[0] = ne[0] * ne2[0];
ne2[1] = ne[1] * ne2[1];
ne2[2] = 1;
ne2[3] = 1;
const int nargs = 1;
for (int ndims = 1; ndims <= 2; ++ndims) {
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
}
}
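// ggml_repeat_back is exercised here as the reduction that undoes ggml_repeat: it sums
// the repeated tiles of x[1] back into the shape of x[0], and the squared difference
// above only serves to drive gradients through that reduction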
// abs (finite differences do not work)
@ -566,7 +668,7 @@ int main(int argc, const char ** argv) {
// for (int ndims = 1; ndims <= 2; ++ndims) {
// for (int i = 0; i < nargs; ++i) {
// x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
// x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
// ggml_set_param(ctx0, x[i]);
// }
@ -576,17 +678,82 @@ int main(int argc, const char ** argv) {
// }
//}
// sgn
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// neg
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// step
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
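// sgn and step above are piecewise constant, so their analytic gradient is zero almost
// everywhere and the finite differences agree trivially; neg is linear, so its check is
// exact up to rounding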
// tanh, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// mul_mat
{
const int nargs = 2;
for (int ndims = 2; ndims <= 2; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
{
int64_t ne2[4];
get_random_dims(ne2, 4);
ne2[0] = ne[0];
x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
}
ggml_set_param(ctx0, x[0]);
@ -602,13 +769,63 @@ int main(int argc, const char ** argv) {
}
}
// elu, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// relu
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
}
}
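// note: passing INFINITY as the last argument effectively disables the relative error
// check; for relu the true gradient is exactly zero over half the domain, where a
// relative error would not be meaningful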
// gelu, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// silu
{
const int nargs = 1;
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
@ -629,11 +846,11 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0]));
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
}
@ -647,8 +864,8 @@ int main(int argc, const char ** argv) {
ne2[0] = 1;
for (int ndims = 1; ndims <= 2; ++ndims) {
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]);
@ -659,20 +876,37 @@ int main(int argc, const char ** argv) {
}
}
// cpy
// cpy f32
{
const int nargs = 2;
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
// x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
check_gradient("cpy", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
}
}
// cpy f16
{
const int nargs = 2;
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
// x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
}
}
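// the f16 copy uses much looser tolerances than the f32 one (1e-1 vs. 1e-3): half
// precision carries only about three significant decimal digits, so larger numeric
// error is expected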
@ -689,8 +923,8 @@ int main(int argc, const char ** argv) {
for (int i = 0; i < ndims; ++i) {
ne2[0] *= ne[i];
}
x[0] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
@ -712,8 +946,8 @@ int main(int argc, const char ** argv) {
for (int i = 0; i < ndims; ++i) {
ne2[0] *= ne[i];
}
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
@ -729,7 +963,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 1);
@ -737,7 +971,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 1);
}
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]);
const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@ -758,7 +992,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2;
for (int ndims = 2; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 2);
@ -766,7 +1000,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 2);
}
x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -790,7 +1024,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2;
for (int ndims = 3; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 3);
@ -798,7 +1032,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 3);
}
x[1] = get_random_tensor(ctx0, 3, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -824,7 +1058,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2;
for (int ndims = 4; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 4);
@ -832,7 +1066,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 4);
}
x[1] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -858,7 +1092,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 1);
@ -866,7 +1100,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 1);
}
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]);
const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@ -887,7 +1121,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1;
for (int ndims = 2; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 2);
@ -895,7 +1129,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 2);
}
x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -915,7 +1149,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
@ -941,7 +1175,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
get_random_dims(ne2, 2);
while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
@ -971,7 +1205,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
get_random_dims(ne2, 3);
while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
@ -1010,7 +1244,7 @@ int main(int argc, const char ** argv) {
for (int i=ndims; i<4; ++i) {
ne2[i] = 1;
}
x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
@ -1043,7 +1277,7 @@ int main(int argc, const char ** argv) {
for (int i=ndims; i<4; ++i) {
ne2[i] = 1;
}
x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
@ -1060,8 +1294,8 @@ int main(int argc, const char ** argv) {
int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
const int nargs = 1;
const int ndims = 2;
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_int(ctx0, 1, ne3, 0, ne2[1]);
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
ggml_set_param(ctx0, x[0]);
@ -1075,7 +1309,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1;
const int ndims = 2;
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
int n_past = irand(ne[0]);
@ -1090,7 +1324,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1;
const int ndims = 2;
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
int n_past = irand(ne[0]);
@ -1108,7 +1342,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 4);
for (int ndims = 1; ndims <= 3; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
@ -1125,8 +1359,8 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 4);
for (int ndims = 1; ndims <= 3; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
@ -1136,7 +1370,7 @@ int main(int argc, const char ** argv) {
}
}
// rope
// rope f32
{
const int nargs = 1;
@ -1148,7 +1382,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 3; ndims <= 4; ++ndims) {
for (int mode = 0; mode < 4; ++mode) {
for (int n_past = 1; n_past < ne2[2]; ++n_past) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
@ -1163,14 +1397,48 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
}
}
}
}
// flash_attn
// rope f16
{
const int nargs = 1;
int64_t ne2[4];
get_random_dims(ne2, 4);
ne2[0] += ne2[0] % 2;
int n_rot = ne2[0];
for (int ndims = 3; ndims <= 4; ++ndims) {
for (int mode = 0; mode < 4; ++mode) {
for (int n_past = 1; n_past < ne2[2]; ++n_past) {
x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
const bool skip_past = (mode & 1);
if (skip_past) {
// we have no past, so this would have to work on uninitialized memory.
// we only test the gradients here;
// skip_past should have no influence on gradient computation.
// so when other modes work, we assume that this does as well.
continue;
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
}
}
}
}
// flash_attn f32
{
const int nargs = 3;
@ -1196,16 +1464,57 @@ int main(int argc, const char ** argv) {
nek[3] = 1;
nev[3] = 1;
}
x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]);
ggml_set_param(ctx0, x[2]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
}
}
}
// flash_attn f16, not yet fully implemented
if(0)
{
const int nargs = 3;
int64_t ne2[4];
get_random_dims(ne2, 4);
int64_t D = ne2[0];
int64_t N = ne2[1];
int64_t M = ne2[2] + N;
int64_t B = ne2[3];
for (int masked = 0; masked <= 1; ++masked) {
for (int ndims = 2; ndims <= 4; ++ndims) {
int64_t neq[4] = { D, N, B, ne[3] };
int64_t nek[4] = { D, M, B, ne[3] };
int64_t nev[4] = { M, D, B, ne[3] };
if (ndims == 2) {
neq[2] = 1; neq[3] = 1;
nek[2] = 1; nek[3] = 1;
nev[2] = 1; nev[3] = 1;
} else if (ndims == 3) {
neq[3] = 1;
nek[3] = 1;
nev[3] = 1;
}
x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]);
ggml_set_param(ctx0, x[2]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
}
}
}
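All of the gradient tests above follow the same pattern: build a scalar f = ggml_sum(op(x)), let ggml compute the analytic gradient, and compare it element by element against a finite-difference estimate within the given absolute and relative error bounds. The sketch below shows that kind of check on a toy function in plain, self-contained C; it illustrates the technique only and is not the actual check_gradient implementation, and every name in it (f_sum_sq, the step size, the tolerances) is made up for the example.

#include <math.h>
#include <stdio.h>

// toy objective: f(x) = sum_i x_i^2, with analytic gradient df/dx_i = 2*x_i
static double f_sum_sq(const double * x, int n) {
    double s = 0.0;
    for (int i = 0; i < n; ++i) {
        s += x[i]*x[i];
    }
    return s;
}

int main(void) {
    double x[4] = { 0.5, -1.0, 2.0, 0.25 };

    const int    n       = 4;
    const double eps     = 1e-4; // finite-difference step
    const double max_abs = 1e-6; // absolute error bound
    const double max_rel = 1e-4; // relative error bound

    for (int i = 0; i < n; ++i) {
        const double analytic = 2.0*x[i];

        // central difference: (f(x + eps*e_i) - f(x - eps*e_i)) / (2*eps)
        const double x0 = x[i];
        x[i] = x0 + eps;
        const double fp = f_sum_sq(x, n);
        x[i] = x0 - eps;
        const double fm = f_sum_sq(x, n);
        x[i] = x0;
        const double numeric = (fp - fm)/(2.0*eps);

        const double err_abs = fabs(analytic - numeric);
        const double err_rel = err_abs/fmax(fabs(analytic), 1e-8);
        if (err_abs > max_abs || err_rel > max_rel) {
            printf("gradient check failed at i=%d: analytic=%g numeric=%g\n", i, analytic, numeric);
            return 1;
        }
    }

    printf("gradient check passed\n");
    return 0;
}

Built with any C compiler (for example cc grad_check.c -lm, a hypothetical file name), it prints whether the analytic gradient of the toy function matches the central difference within the chosen tolerances.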
View File
@ -1,9 +1,9 @@
#include "ggml.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#define MAX_NARGS 2
@ -119,15 +119,16 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
int main(void) {
struct ggml_init_params params = {
.mem_size = 1024*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
int64_t ne1[4] = {4, 1024, 1, 1};
int64_t ne2[4] = {4, 2048, 1, 1};
int64_t ne3[4] = {1024, 2048, 1, 1};
int64_t ne1[4] = {4, 128, 1, 1};
int64_t ne2[4] = {4, 256, 1, 1};
int64_t ne3[4] = {128, 256, 1, 1};
struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);