Merge branch 'master' into compilade/batch-splits

commit 9c0a61f8c3
Author: Francis Couture-Harpin
Date:   2024-07-23 13:37:09 -04:00
120 changed files with 17405 additions and 4833 deletions

View File

@@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all

View File

@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

View File

@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
 RUN apt-get update && \
 apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

View File

@@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
@@ -25,7 +25,7 @@ ENV GGML_CUDA=1
 RUN make -j$(nproc) llama-cli
-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
 apt-get install -y libgomp1

View File

@@ -1,6 +1,6 @@
 ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
 cmake --build build --config Release --target llama-cli
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
 COPY --from=build /app/build/bin/llama-cli /llama-cli

View File

@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

View File

@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=jammy
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget libgomp1

View File

@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
 RUN apt-get update && \
 apt-get install -y build-essential git
@@ -11,7 +11,7 @@ COPY . .
 RUN make -j$(nproc) llama-cli
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
 RUN apt-get update && \
 apt-get install -y libgomp1

View File

@@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
@@ -27,7 +27,7 @@ ENV LLAMA_CURL=1
 RUN make -j$(nproc) llama-server
-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

View File

@@ -1,6 +1,6 @@
 ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
 cmake --build build --config Release --target llama-server
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl

View File

@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

View File

@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=jammy
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget

View File

@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
 RUN apt-get update && \
 apt-get install -y build-essential git libcurl4-openssl-dev curl
@@ -13,7 +13,7 @@ ENV LLAMA_CURL=1
 RUN make -j$(nproc) llama-server
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1

View File

@@ -860,7 +860,7 @@ jobs:
 mkdir build
 cd build
 cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
 - name: Determine tag name
 id: tag

View File

@@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
 llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
+llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
 #
 # build the library

View File

@@ -1,13 +1,17 @@
-# Pull requests
+# Pull requests (for contributors)
-- Always squash-merge the PR before merging
-- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Test your changes:
 - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
 - Execute [the full CI locally on your machine](ci/README.md) before publishing
-- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
+- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+# Pull requests (for collaborators)
+- Squash-merge PRs
+- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
+- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
 # Coding guidelines

View File

@@ -876,6 +876,9 @@ OBJ_GGML += \
 OBJ_LLAMA = \
 src/llama.o \
+src/llama-vocab.o \
+src/llama-grammar.o \
+src/llama-sampling.o \
 src/unicode.o \
 src/unicode-data.o
@@ -1055,6 +1058,10 @@ src/unicode-data.o: \
 src/llama.o: \
 src/llama.cpp \
+src/llama-impl.h \
+src/llama-vocab.h \
+src/llama-grammar.h \
+src/llama-sampling.h \
 src/unicode.h \
 include/llama.h \
 ggml/include/ggml-cuda.h \
@@ -1064,6 +1071,29 @@ src/llama.o: \
 ggml/include/ggml-backend.h
 $(CXX) $(CXXFLAGS) -c $< -o $@
+src/llama-vocab.o: \
+src/llama-vocab.cpp \
+src/llama-vocab.h \
+src/llama-impl.h \
+include/llama.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
+src/llama-grammar.o: \
+src/llama-grammar.cpp \
+src/llama-grammar.h \
+src/llama-impl.h \
+src/llama-vocab.h \
+src/llama-sampling.h \
+include/llama.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
+src/llama-sampling.o: \
+src/llama-sampling.cpp \
+src/llama-sampling.h \
+src/llama-impl.h \
+include/llama.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
 $(LIB_LLAMA): \
 $(OBJ_LLAMA) \
 $(LIB_GGML)
@@ -1439,7 +1469,7 @@ run-benchmark-matmult: llama-benchmark-matmult
 .PHONY: run-benchmark-matmult swift
 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
-$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
+$(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View File

@@ -4,6 +4,9 @@ import PackageDescription
 var sources = [
 "src/llama.cpp",
+"src/llama-vocab.cpp",
+"src/llama-grammar.cpp",
+"src/llama-sampling.cpp",
 "src/unicode.cpp",
 "src/unicode-data.cpp",
 "ggml/src/ggml.c",

View File

@@ -3,7 +3,7 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
 [![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

View File

@@ -37,11 +37,18 @@ struct llama_ngram {
 }
 };
+struct llama_token_hash_function {
+size_t operator()(const llama_token token) const {
+// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+return token * 11400714819323198485llu;
+}
+};
 struct llama_ngram_hash_function {
 size_t operator()(const llama_ngram & ngram) const {
-size_t hash = 0;
-for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
+size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
+hash ^= llama_token_hash_function{}(ngram.tokens[i]);
 }
 return hash;
 }
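For illustration only (not part of this diff): a minimal Python sketch of the hashing scheme in the hunk above — Fibonacci-hash each token, then XOR the hashes across the n-gram. The LLAMA_NGRAM_MAX value of 4 is an assumption made for this sketch, and the 64-bit mask stands in for the unsigned wrap-around that size_t gives the C++ code.

LLAMA_NGRAM_MAX = 4  # assumption for this sketch; the real constant lives in the llama.cpp headers

def token_hash(token: int) -> int:
    # Fibonacci hashing: multiply by 2^64 / golden ratio and keep the low 64 bits
    return (token * 11400714819323198485) & 0xFFFFFFFFFFFFFFFF

def ngram_hash(tokens: list[int]) -> int:
    # seed with the first token's hash, then XOR in the rest,
    # mirroring the updated llama_ngram_hash_function above
    h = token_hash(tokens[0])
    for t in tokens[1:LLAMA_NGRAM_MAX]:
        h ^= token_hash(t)
    return h

print(hex(ngram_hash([1, 2, 3, 4])))  # prints the combined 64-bit hash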

View File

@@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
 llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
 // Apply grammar constraints to the single token
-llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
 // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
 bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
 // apply grammar checks before sampling logic
 if (apply_grammar && ctx_sampling->grammar != NULL) {
-llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
 }
 return cur_p;
@@ -455,6 +455,6 @@ void llama_sampling_accept(
 ctx_sampling->prev.push_back(id);
 if (ctx_sampling->grammar != NULL && apply_grammar) {
-llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
+llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
 }
 }

View File

@@ -48,34 +48,39 @@ class Model:
 dir_model: Path
 ftype: gguf.LlamaFileType
+fname_out: Path
 is_big_endian: bool
 endianess: gguf.GGUFEndian
 use_temp_file: bool
 lazy: bool
-model_name: str | None
 part_names: list[str]
 is_safetensors: bool
 hparams: dict[str, Any]
 block_count: int
 tensor_map: gguf.TensorNameMap
 tensor_names: set[str] | None
-fname_out: Path
 gguf_writer: gguf.GGUFWriter
+model_name: str | None
+metadata_override: Path | None
+dir_model_card: Path
 # subclasses should define this!
 model_arch: gguf.MODEL_ARCH
-def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
-model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+use_temp_file: bool = False, eager: bool = False,
+metadata_override: Path | None = None, model_name: str | None = None,
+split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
 if type(self) is Model:
 raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 self.dir_model = dir_model
 self.ftype = ftype
+self.fname_out = fname_out
 self.is_big_endian = is_big_endian
 self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
 self.use_temp_file = use_temp_file
 self.lazy = not eager
-self.model_name = model_name
 self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
 self.is_safetensors = len(self.part_names) > 0
 if not self.is_safetensors:
@@ -84,6 +89,11 @@ class Model:
 self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
 self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
 self.tensor_names = None
+self.metadata_override = metadata_override
+self.model_name = model_name
+self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
 if self.ftype == gguf.LlamaFileType.GUESSED:
 # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
 _, first_tensor = next(self.get_tensors())
@@ -93,10 +103,8 @@ class Model:
 else:
 logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
-ftype_up: str = self.ftype.name.partition("_")[2].upper()
-ftype_lw: str = ftype_up.lower()
-# allow templating the file name with the output ftype, useful with the "auto" ftype
-self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
+# Configure GGUF Writer
 self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
 split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
@@ -193,7 +201,6 @@ class Model:
 return new_name
 def set_gguf_parameters(self):
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_block_count(self.block_count)
 if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -232,6 +239,10 @@ class Model:
 self.gguf_writer.add_expert_used_count(n_experts_used)
 logger.info(f"gguf: experts used count = {n_experts_used}")
+if (head_dim := self.hparams.get("head_dim")) is not None:
+self.gguf_writer.add_key_length(head_dim)
+self.gguf_writer.add_value_length(head_dim)
 self.gguf_writer.add_file_type(self.ftype)
 logger.info(f"gguf: file type = {self.ftype}")
@@ -250,7 +261,7 @@ class Model:
 return False
-def write_tensors(self):
+def prepare_tensors(self):
 max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 for name, data_torch in self.get_tensors():
@@ -333,9 +344,62 @@ class Model:
 self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
+def set_type(self):
+self.gguf_writer.add_type(gguf.GGUFType.MODEL)
+def prepare_metadata(self, vocab_only: bool):
+total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
+self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
+# Fallback to model directory name if metadata name is still missing
+if self.metadata.name is None:
+self.metadata.name = self.dir_model.name
+# Generate parameter weight class (useful for leader boards) if not yet determined
+if self.metadata.size_label is None and total_params > 0:
+self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
+# Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+output_type: str = self.ftype.name.partition("_")[2]
+# Filename Output
+if self.fname_out.is_dir():
+# Generate default filename based on model specification and available metadata
+if not vocab_only:
+fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+else:
+fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+# Use the default filename
+self.fname_out = self.fname_out / f"{fname_default}.gguf"
+else:
+# Output path is a custom defined templated filename
+# Note: `not is_dir()` is used because `.is_file()` will not detect
+# file template strings as it doesn't actually exist as a file
+# Process templated file name with the output ftype, useful with the "auto" ftype
+self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+self.set_type()
+logger.info("Set meta model")
+self.metadata.set_gguf_meta_model(self.gguf_writer)
+logger.info("Set model parameters")
+self.set_gguf_parameters()
+logger.info("Set model tokenizer")
+self.set_vocab()
+logger.info("Set model quantization version")
+self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
 def write(self):
-self.write_tensors()
-self.gguf_writer.write_header_to_file(self.fname_out)
+self.prepare_tensors()
+self.prepare_metadata(vocab_only=False)
+self.gguf_writer.write_header_to_file(path=self.fname_out)
 self.gguf_writer.write_kv_data_to_file()
 self.gguf_writer.write_tensors_to_file(progress=True)
 self.gguf_writer.close()
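For illustration only (not part of this diff): the templated-filename branch in prepare_metadata above takes over what the removed __init__ code used to do with str.format. A rough Python sketch of that behaviour, assuming gguf.fill_templated_filename simply reproduces the old .format() call that this diff removes:

def fill_templated_filename(filename: str, output_type: str | None) -> str:
    # e.g. "ggml-model-{ftype}.gguf" with output_type "Q8_0" becomes "ggml-model-q8_0.gguf"
    ftype_lowercase = output_type.lower() if output_type is not None else ""
    ftype_uppercase = output_type.upper() if output_type is not None else ""
    return filename.format(ftype_lowercase,
                           outtype=ftype_lowercase, ftype=ftype_lowercase,
                           OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase)

print(fill_templated_filename("ggml-model-{ftype}.gguf", "Q8_0"))  # ggml-model-q8_0.gguf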
@@ -343,7 +407,9 @@ class Model:
 def write_vocab(self):
 if len(self.gguf_writer.tensors) != 1:
 raise ValueError('Splitting the vocabulary is not supported')
-self.gguf_writer.write_header_to_file(self.fname_out)
+self.prepare_metadata(vocab_only=True)
+self.gguf_writer.write_header_to_file(path=self.fname_out)
 self.gguf_writer.write_kv_data_to_file()
 self.gguf_writer.close()
@@ -528,6 +594,15 @@ class Model:
 if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
 # ref: https://huggingface.co/core42/jais-13b
 res = "jais"
+if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+# ref: https://huggingface.co/WisdomShell/CodeShell-7B
+res = "codeshell"
+if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+# ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+res = "tekken"
+if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+res = "smollm"
 if res is None:
 logger.warning("\n")
@@ -668,7 +743,7 @@ class Model:
 added_tokens_json = json.load(f)
 for key in added_tokens_json:
 token_id = added_tokens_json[key]
-if (token_id >= vocab_size):
+if token_id >= vocab_size:
 logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 continue
@@ -685,7 +760,8 @@ class Model:
 token_id = int(token_id)
 token: str = token_data["content"]
 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-assert tokens[token_id] == token.encode("utf-8")
+if tokens[token_id] != token.encode("utf-8"):
+logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
 if token_data.get("special") or self.does_token_look_special(token):
 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 else:
@@ -780,7 +856,6 @@ class GPTNeoXModel(Model):
 def set_gguf_parameters(self):
 block_count = self.hparams["num_hidden_layers"]
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
 self.gguf_writer.add_block_count(block_count)
@@ -836,7 +911,6 @@ class BloomModel(Model):
 model_arch = gguf.MODEL_ARCH.BLOOM
 def set_gguf_parameters(self):
-self.gguf_writer.add_name("Bloom")
 n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
 n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
 self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -913,7 +987,6 @@ class MPTModel(Model):
 def set_gguf_parameters(self):
 block_count = self.hparams["n_layers"]
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
 self.gguf_writer.add_embedding_length(self.hparams["d_model"])
 self.gguf_writer.add_block_count(block_count)
@@ -952,7 +1025,6 @@ class OrionModel(Model):
 block_count = self.hparams["num_hidden_layers"]
 head_count = self.hparams["num_attention_heads"]
 head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-hf_repo = self.hparams.get("_name_or_path", "")
 ctx_length = 0
 if "max_sequence_length" in self.hparams:
@@ -965,8 +1037,6 @@ class OrionModel(Model):
 raise ValueError("gguf: can not find ctx length parameter.")
 self.gguf_writer.add_file_type(self.ftype)
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-self.gguf_writer.add_source_hf_repo(hf_repo)
 self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
 self.gguf_writer.add_context_length(ctx_length)
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -990,7 +1060,6 @@ class BaichuanModel(Model):
 block_count = self.hparams["num_hidden_layers"]
 head_count = self.hparams["num_attention_heads"]
 head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-hf_repo = self.hparams.get("_name_or_path", "")
 ctx_length = 0
 if "max_sequence_length" in self.hparams:
@@ -1002,8 +1071,6 @@ class BaichuanModel(Model):
 else:
 raise ValueError("gguf: can not find ctx length parameter.")
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-self.gguf_writer.add_source_hf_repo(hf_repo)
 self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
 self.gguf_writer.add_context_length(ctx_length)
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1117,7 +1184,6 @@ class XverseModel(Model):
 block_count = self.hparams["num_hidden_layers"]
 head_count = self.hparams["num_attention_heads"]
 head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-hf_repo = self.hparams.get("_name_or_path", "")
 ctx_length = 0
 if "max_sequence_length" in self.hparams:
@@ -1129,8 +1195,6 @@ class XverseModel(Model):
 else:
 raise ValueError("gguf: can not find ctx length parameter.")
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-self.gguf_writer.add_source_hf_repo(hf_repo)
 self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
 self.gguf_writer.add_context_length(ctx_length)
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1189,7 +1253,6 @@ class FalconModel(Model):
 if n_head_kv is None:
 n_head_kv = self.hparams.get("n_head_kv", 1) # old name
-self.gguf_writer.add_name("Falcon")
 self.gguf_writer.add_context_length(2048) # not in config.json
 self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1234,7 +1297,6 @@ class StarCoderModel(Model):
 def set_gguf_parameters(self):
 block_count = self.hparams["n_layer"]
-self.gguf_writer.add_name("StarCoder")
 self.gguf_writer.add_context_length(self.hparams["n_positions"])
 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1258,6 +1320,7 @@ class RefactModel(Model):
 special_vocab._set_special_token("prefix", 1)
 special_vocab._set_special_token("suffix", 3)
 special_vocab._set_special_token("middle", 2)
+special_vocab.chat_template = None # do not add it twice
 special_vocab.add_to_gguf(self.gguf_writer)
 def set_gguf_parameters(self):
@@ -1269,7 +1332,6 @@ class RefactModel(Model):
 block_count = self.hparams["n_layer"]
-self.gguf_writer.add_name("Refact")
 # refact uses Alibi. So this is from config.json which might be used by training.
 self.gguf_writer.add_context_length(self.hparams["n_positions"])
 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1324,7 +1386,6 @@ class StableLMModel(Model):
 hparams = self.hparams
 block_count = hparams["num_hidden_layers"]
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 self.gguf_writer.add_block_count(block_count)
@@ -1386,8 +1447,8 @@ class StableLMModel(Model):
 return [(new_name, data_torch)]
-def write_tensors(self):
-super().write_tensors()
+def prepare_tensors(self):
+super().prepare_tensors()
 if self._q_norms is not None or self._k_norms is not None:
 # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
@@ -1430,7 +1491,12 @@ class LlamaModel(Model):
 super().set_gguf_parameters()
 hparams = self.hparams
 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+if "head_dim" in hparams:
+rope_dim = hparams["head_dim"]
+else:
+rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+self.gguf_writer.add_rope_dimension_count(rope_dim)
 if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
 if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1503,8 +1569,8 @@ class LlamaModel(Model):
 return [(self.map_tensor_name(name), data_torch)]
-def write_tensors(self):
-super().write_tensors()
+def prepare_tensors(self):
+super().prepare_tensors()
 if self._experts is not None:
 # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1567,7 +1633,6 @@ class GrokModel(Model):
 def set_gguf_parameters(self):
 super().set_gguf_parameters()
-self.gguf_writer.add_name("Grok")
 _experts: list[dict[str, Tensor]] | None = None
@@ -1616,7 +1681,6 @@ class DbrxModel(Model):
 def set_gguf_parameters(self):
 ffn_config = self.hparams["ffn_config"]
 attn_config = self.hparams["attn_config"]
-self.gguf_writer.add_name(self.hparams["model_type"])
 self.gguf_writer.add_block_count(self.hparams["n_layers"])
 self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1685,7 +1749,6 @@ class MiniCPMModel(Model):
 def set_gguf_parameters(self):
 block_count = self.hparams["num_hidden_layers"]
-self.gguf_writer.add_name("MiniCPM")
 self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
 self.gguf_writer.add_block_count(block_count)
@@ -1755,7 +1818,6 @@ class QwenModel(Model):
 self._set_vocab_qwen()
 def set_gguf_parameters(self):
-self.gguf_writer.add_name("Qwen")
 self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
 self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1831,8 +1893,8 @@ class Qwen2MoeModel(Model):
 return [(self.map_tensor_name(name), data_torch)]
-def write_tensors(self):
-super().write_tensors()
+def prepare_tensors(self):
+super().prepare_tensors()
 if self._experts is not None:
 # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1846,7 +1908,6 @@ class GPT2Model(Model):
 model_arch = gguf.MODEL_ARCH.GPT2
 def set_gguf_parameters(self):
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_block_count(self.hparams["n_layer"])
 self.gguf_writer.add_context_length(self.hparams["n_ctx"])
 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1889,7 +1950,6 @@ class Phi2Model(Model):
 n_embd = self.find_hparam(["hidden_size", "n_embd"])
 n_head = self.find_hparam(["num_attention_heads", "n_head"])
-self.gguf_writer.add_name("Phi2")
 self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
 self.gguf_writer.add_embedding_length(n_embd)
@@ -1951,7 +2011,7 @@ class Phi3MiniModel(Model):
 for key in added_tokens_json:
 token_id = added_tokens_json[key]
-if (token_id >= vocab_size):
+if token_id >= vocab_size:
 logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 continue
@@ -1968,7 +2028,8 @@ class Phi3MiniModel(Model):
 token_id = int(token_id)
 token = foken_data["content"].encode("utf-8")
 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-assert tokens[token_id] == token
+if tokens[token_id] != token:
+logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 tokens[token_id] = token
 scores[token_id] = -1000.0
 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1984,7 +2045,8 @@ class Phi3MiniModel(Model):
 token_id = int(foken_data["id"])
 token = foken_data["content"].encode("utf-8")
 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-assert tokens[token_id] == token
+if tokens[token_id] != token:
+logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 tokens[token_id] = token
 scores[token_id] = -1000.0
 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2011,7 +2073,6 @@ class Phi3MiniModel(Model):
 orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
 rope_dims = n_embd // n_head
-self.gguf_writer.add_name("Phi3")
 self.gguf_writer.add_context_length(max_pos_embds)
 self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
 self.gguf_writer.add_embedding_length(n_embd)
@@ -2026,7 +2087,7 @@ class Phi3MiniModel(Model):
 # write rope scaling for long context (128k) model
 rope_scaling = self.find_hparam(['rope_scaling'], True)
-if (rope_scaling is None):
+if rope_scaling is None:
 return
 scale = max_pos_embds / orig_max_pos_embds
@@ -2068,7 +2129,6 @@ class PlamoModel(Model):
 hparams = self.hparams
 block_count = hparams["num_hidden_layers"]
-self.gguf_writer.add_name("PLaMo")
 self.gguf_writer.add_context_length(4096) # not in config.json
 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
@@ -2113,7 +2173,6 @@ class CodeShellModel(Model):
 def set_gguf_parameters(self):
 block_count = self.hparams["n_layer"]
-self.gguf_writer.add_name("CodeShell")
 self.gguf_writer.add_context_length(self.hparams["n_positions"])
 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
 self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -2226,7 +2285,8 @@ class InternLM2Model(Model):
 chat_eos_token_id = token_id
 token = token.encode("utf-8")
 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-assert(tokens[token_id] == token)
+if tokens[token_id] != token:
+logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 tokens[token_id] = token
 scores[token_id] = -1000.0
 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2245,7 +2305,8 @@ class InternLM2Model(Model):
 chat_eos_token_id = token_id
 token = token.encode("utf-8")
 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-assert(tokens[token_id] == token)
+if tokens[token_id] != token:
+logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
 tokens[token_id] = token
 scores[token_id] = -1000.0
 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2272,7 +2333,6 @@ class InternLM2Model(Model):
 special_vocab.add_to_gguf(self.gguf_writer)
 def set_gguf_parameters(self):
-self.gguf_writer.add_name("InternLM2")
 self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
 self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2432,6 +2492,7 @@ class GemmaModel(Model):
 special_vocab._set_special_token("middle", 68)
 special_vocab._set_special_token("fsep", 70)
 special_vocab._set_special_token("eot", 107)
+special_vocab.chat_template = None # do not add it twice
 special_vocab.add_to_gguf(self.gguf_writer)
 self.gguf_writer.add_add_space_prefix(False)
@@ -2440,7 +2501,6 @@ class GemmaModel(Model):
 hparams = self.hparams
 block_count = hparams["num_hidden_layers"]
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 self.gguf_writer.add_block_count(block_count)
@@ -2481,7 +2541,6 @@ class Gemma2Model(Model):
 hparams = self.hparams
 block_count = hparams["num_hidden_layers"]
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 self.gguf_writer.add_block_count(block_count)
@@ -2556,7 +2615,6 @@ class MambaModel(Model):
 # Fail early for models which don't have a block expansion factor of 2
 assert d_inner == 2 * d_model
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
 self.gguf_writer.add_embedding_length(d_model)
 self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2676,7 +2734,7 @@ class JinaBertV2Model(BertModel):
 yield name, data
-def set_vocab(self, *args, **kwargs):
+def set_vocab(self):
 tokenizer_class = 'BertTokenizer'
 with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
 tokenizer_class = json.load(f)['tokenizer_class']
@@ -2735,7 +2793,6 @@ class OpenELMModel(Model):
 assert self.block_count == len(self._num_query_heads)
 assert self.block_count == len(self._ffn_dims)
-self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
 self.gguf_writer.add_block_count(self.block_count)
 self.gguf_writer.add_context_length(self.hparams["max_context_length"])
 self.gguf_writer.add_embedding_length(n_embd)
@@ -2825,7 +2882,7 @@ class ArcticModel(Model):
 added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
 for token_id, token_json in added_tokens_decoder.items():
 token_id = int(token_id)
-if (token_id >= vocab_size):
+if token_id >= vocab_size:
 logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 continue
@@ -2909,8 +2966,8 @@ class ArcticModel(Model):
 return [(self.map_tensor_name(name), data_torch)]
-def write_tensors(self):
-super().write_tensors()
+def prepare_tensors(self):
+super().prepare_tensors()
 if self._experts is not None:
 # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -2988,8 +3045,8 @@ class DeepseekV2Model(Model):
 return [(self.map_tensor_name(name), data_torch)]
-def write_tensors(self):
-super().write_tensors()
+def prepare_tensors(self):
+super().prepare_tensors()
 if self._experts is not None:
 # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -3074,7 +3131,7 @@ class T5Model(Model):
 added_tokens_json = json.load(f)
 for key in added_tokens_json:
 token_id = added_tokens_json[key]
-if (token_id >= vocab_size):
+if token_id >= vocab_size:
 logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 continue
@@ -3107,7 +3164,6 @@ class T5Model(Model):
 self.gguf_writer.add_add_eos_token(True)
 def set_gguf_parameters(self):
-self.gguf_writer.add_name("T5")
 if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
 logger.warning("Couldn't find context length in config.json, assuming default value of 512")
 n_ctx = 512
@@ -3181,7 +3237,6 @@ class JaisModel(Model):
 self._set_vocab_gpt2()
 def set_gguf_parameters(self):
-self.gguf_writer.add_name(self.dir_model.name)
 self.gguf_writer.add_block_count(self.hparams["n_layer"])
 self.gguf_writer.add_context_length(self.hparams["n_positions"])
 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -3227,8 +3282,8 @@ class JaisModel(Model):
 return tensors
-def write_tensors(self):
-super().write_tensors()
+def prepare_tensors(self):
+super().prepare_tensors()
 self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
@@ -3387,7 +3442,6 @@ class ChatGLMModel(Model):
 special_vocab.add_to_gguf(self.gguf_writer)
 def set_gguf_parameters(self):
-self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
 n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
 n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
 n_head_kv = self.hparams.get("multi_query_group_num", n_head)
@ -3539,6 +3593,10 @@ def parse_args() -> argparse.Namespace:
"--no-tensor-first-split", action="store_true", "--no-tensor-first-split", action="store_true",
help="do not add tensors to the first split (disabled by default)" help="do not add tensors to the first split (disabled by default)"
) )
parser.add_argument(
"--metadata", type=Path,
help="Specify the path for an authorship metadata override file"
)
return parser.parse_args() return parser.parse_args()
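For reference, the override file consumed by the new `--metadata` flag is plain JSON keyed by `general.*` fields, matching the keys the legacy `Metadata.load()` removed later in this patch reads; a minimal sketch with hypothetical values (the exact set of accepted keys is defined by `gguf.Metadata`, not by this sketch):

```python
# Hedged sketch of an authorship metadata override file for --metadata.
# Keys follow the "general.*" naming used elsewhere in this patch;
# all values here are hypothetical.
import json

override = {
    "general.name": "My Model",
    "general.author": "Example Author",
    "general.version": "v1.0",
    "general.description": "Example converted model.",
    "general.license": "apache-2.0",
    "general.url": "https://example.com/my-model",
}

with open("metadata_override.json", "w") as f:
    json.dump(override, f, indent=2)
```

The resulting file would then be passed as `--metadata metadata_override.json` when invoking the converter.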
@ -3564,7 +3622,10 @@ def split_str_to_n_bytes(split_str: str) -> int:
def main() -> None: def main() -> None:
args = parse_args() args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) if args.verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
dir_model = args.model dir_model = args.model
@ -3588,34 +3649,30 @@ def main() -> None:
if args.outfile is not None: if args.outfile is not None:
fname_out = args.outfile fname_out = args.outfile
else: else:
# output in the same directory as the model by default fname_out = dir_model
fname_out = dir_model / 'ggml-model-{ftype}.gguf'
logger.info(f"Loading model: {dir_model.name}") logger.info(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model) hparams = Model.load_hparams(dir_model)
with torch.inference_mode(): with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_architecture = hparams["architectures"][0]
try: try:
model_class = Model.from_model_architecture(hparams["architectures"][0]) model_class = Model.from_model_architecture(model_architecture)
except NotImplementedError: except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported") logger.error(f"Model {model_architecture} is not supported")
sys.exit(1) sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
eager=args.no_lazy,
metadata_override=args.metadata, model_name=args.model_name,
split_max_tensors=args.split_max_tensors,
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
small_first_shard=args.no_tensor_first_split) small_first_shard=args.no_tensor_first_split)
logger.info("Set model parameters")
model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL)
model_instance.set_gguf_parameters()
logger.info("Set model tokenizer")
model_instance.set_vocab()
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
if args.vocab_only: if args.vocab_only:
logger.info("Exporting model vocab...") logger.info("Exporting model vocab...")
model_instance.write_vocab() model_instance.write_vocab()

View File

@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
# TODO: this string has to exercise as much pre-tokenizer functionality as possible # TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome # will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2: if len(sys.argv) == 2:
token = sys.argv[1] token = sys.argv[1]
@ -91,6 +91,9 @@ models = [
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
] ]
@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers) response = sess.get(url, headers=headers)
response.raise_for_status() response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True) os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f: with open(save_path, 'wb') as downloaded_file:
f.write(response.content) downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully") logger.info(f"File {save_path} downloaded successfully")
@ -159,7 +162,7 @@ for model in models:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt) chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest() chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}") logger.info(f"model: {name}")
@ -191,7 +194,7 @@ src_func = f"""
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer # use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)} chktxt = {repr(CHK_TXT)}
chktok = tokenizer.encode(chktxt) chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest() chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -287,7 +290,7 @@ tests = [
"333333333", "333333333",
"Cửa Việt", # llama-bpe fails on this "Cửa Việt", # llama-bpe fails on this
" discards", " discards",
chktxt, CHK_TXT,
] ]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp # write the tests to ./models/ggml-vocab-{name}.gguf.inp

View File

@ -132,6 +132,10 @@ class Tensor:
class GGMLModel: class GGMLModel:
file_format: GGMLFormat
format_version: int
def __init__(self): def __init__(self):
self.hyperparameters = None self.hyperparameters = None
self.vocab = None self.vocab = None
@ -290,7 +294,7 @@ class GGMLToGGUF:
if self.vocab_override is not None: if self.vocab_override is not None:
vo = self.vocab_override vo = self.vocab_override
logger.info('* Adding vocab item(s)') logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes) tokens.append(vbytes)
scores.append(score) scores.append(score)
toktypes.append(ttype) toktypes.append(ttype)

View File

@ -251,6 +251,10 @@ def parse_args() -> argparse.Namespace:
"--verbose", action="store_true", "--verbose", action="store_true",
help="increase output verbosity", help="increase output verbosity",
) )
parser.add_argument(
"--dry-run", action="store_true",
help="only print out what will be done, without writing any new files",
)
parser.add_argument( parser.add_argument(
"--base", type=Path, required=True, "--base", type=Path, required=True,
help="directory containing base model file", help="directory containing base model file",
@ -286,7 +290,7 @@ if __name__ == '__main__':
fname_out = args.outfile fname_out = args.outfile
else: else:
# output in the same directory as the model by default # output in the same directory as the model by default
fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' fname_out = dir_lora
if os.path.exists(input_model): if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format. # lazy import load_file only if lora is in safetensors format.
@ -310,6 +314,23 @@ if __name__ == '__main__':
class LoraModel(model_class): class LoraModel(model_class):
model_arch = model_class.model_arch model_arch = model_class.model_arch
lora_alpha: float
def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
super().__init__(*args, **kwargs)
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {} tensor_map: dict[str, PartialLoraTensor] = {}
@ -350,6 +371,11 @@ if __name__ == '__main__':
yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b) yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel( model_instance = LoraModel(
dir_base_model, dir_base_model,
ftype, ftype,
@ -357,18 +383,11 @@ if __name__ == '__main__':
is_big_endian=args.bigendian, is_big_endian=args.bigendian,
use_temp_file=False, use_temp_file=False,
eager=args.no_lazy, eager=args.no_lazy,
model_name=None, dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
) )
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha = lparams["lora_alpha"]
model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER)
model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
logger.info("Exporting model...") logger.info("Exporting model...")
model_instance.write() model_instance.write()
logger.info(f"Model successfully exported to {model_instance.fname_out}") logger.info(f"Model successfully exported to {model_instance.fname_out}")

View File

@ -31,7 +31,7 @@ int main(int argc, char ** argv) {
int n_parallel = params.n_parallel; int n_parallel = params.n_parallel;
// total length of the sequences including the prompt // total length of the sequences including the prompt
int n_predict = 32; int n_predict = params.n_predict;
// init LLM // init LLM

View File

@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
import numpy as np import numpy as np
@ -346,42 +346,6 @@ class Params:
return params return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
license: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
return metadata
# #
# data loading # data loading
# TODO: reuse (probably move to gguf.py?) # TODO: reuse (probably move to gguf.py?)
@ -806,7 +770,7 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_model(self, params: Params, metadata: Metadata | None) -> None: def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
# Metadata About The Model And Its Provenance # Metadata About The Model And Its Provenance
name = "LLaMA" name = "LLaMA"
if metadata is not None and metadata.name is not None: if metadata is not None and metadata.name is not None:
@ -824,16 +788,73 @@ class OutputFile:
self.gguf.add_author(metadata.author) self.gguf.add_author(metadata.author)
if metadata.version is not None: if metadata.version is not None:
self.gguf.add_version(metadata.version) self.gguf.add_version(metadata.version)
if metadata.url is not None: if metadata.organization is not None:
self.gguf.add_url(metadata.url) self.gguf.add_organization(metadata.organization)
if metadata.finetune is not None:
self.gguf.add_finetune(metadata.finetune)
if metadata.basename is not None:
self.gguf.add_basename(metadata.basename)
if metadata.description is not None: if metadata.description is not None:
self.gguf.add_description(metadata.description) self.gguf.add_description(metadata.description)
if metadata.quantized_by is not None:
self.gguf.add_quantized_by(metadata.quantized_by)
if metadata.size_label is not None:
self.gguf.add_size_label(metadata.size_label)
if metadata.license is not None: if metadata.license is not None:
self.gguf.add_licence(metadata.license) self.gguf.add_license(metadata.license)
if metadata.license_name is not None:
self.gguf.add_license_name(metadata.license_name)
if metadata.license_link is not None:
self.gguf.add_license_link(metadata.license_link)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.doi is not None:
self.gguf.add_doi(metadata.doi)
if metadata.uuid is not None:
self.gguf.add_uuid(metadata.uuid)
if metadata.repo_url is not None:
self.gguf.add_repo_url(metadata.repo_url)
if metadata.source_url is not None: if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url) self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None: if metadata.source_doi is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo) self.gguf.add_source_doi(metadata.source_doi)
if metadata.source_uuid is not None:
self.gguf.add_source_uuid(metadata.source_uuid)
if metadata.source_repo_url is not None:
self.gguf.add_source_repo_url(metadata.source_repo_url)
if metadata.base_models is not None:
self.gguf.add_base_model_count(len(metadata.base_models))
for key, base_model_entry in enumerate(metadata.base_models):
if "name" in base_model_entry:
self.gguf.add_base_model_name(key, base_model_entry["name"])
if "author" in base_model_entry:
self.gguf.add_base_model_author(key, base_model_entry["author"])
if "version" in base_model_entry:
self.gguf.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
if "url" in base_model_entry:
self.gguf.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
self.gguf.add_base_model_doi(key, base_model_entry["doi"])
if "uuid" in base_model_entry:
self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
if "repo_url" in base_model_entry:
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
if metadata.tags is not None:
self.gguf.add_tags(metadata.tags)
if metadata.languages is not None:
self.gguf.add_languages(metadata.languages)
if metadata.datasets is not None:
self.gguf.add_datasets(metadata.datasets)
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself # Metadata About The Neural Architecture Itself
@ -944,7 +965,7 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only( def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -978,7 +999,7 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, pad_vocab: bool = False,
metadata: Metadata | None = None, metadata: gguf.Metadata | None = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1021,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}") raise ValueError(f"Unexpected combination of types: {name_to_type}")
def model_parameter_count(model: LazyModel) -> int: def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
total_model_parameters = 0 total_params = 0
for i, (name, lazy_tensor) in enumerate(model.items()): shared_params = 0
sum_weights_in_tensor = 1 expert_params = 0
for name, lazy_tensor in tensors:
# We don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Got A Tensor
sum_weights_in_tensor: int = 1
# Tensor Volume
for dim in lazy_tensor.shape: for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
if ".experts." in name:
def model_parameter_count_rounded_notation(model_params_count: int) -> str: if ".experts.0." in name:
if model_params_count > 1e12 : expert_params += sum_weights_in_tensor
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else: else:
# Thousands Of Parameters shared_params += sum_weights_in_tensor
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
return f"{round(scaled_model_params)}{scale_suffix}" total_params += sum_weights_in_tensor
return total_params, shared_params, expert_params
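A tiny worked example (hypothetical tensor names and shapes) of the split this estimator returns: expert tensors are only counted via their first expert, so shared + expert approximates the per-model weight count, while the total still includes every expert copy.

```python
# Hedged sketch of per_model_weight_count_estimation() on made-up tensors.
import math

tensors = {
    "token_embd.weight":               (32000, 4096),
    "blk.0.attn_q.weight":             (4096, 4096),
    "blk.0.ffn_gate.experts.0.weight": (14336, 4096),
    "blk.0.ffn_gate.experts.1.weight": (14336, 4096),
}

total = shared = expert = 0
for name, shape in tensors.items():
    n = math.prod(shape)
    if ".experts." in name:
        if ".experts.0." in name:
            expert += n          # count one expert per layer
    else:
        shared += n
    total += n                   # every tensor, including all expert copies

print(total, shared, expert)     # 265289728 147849216 58720256
```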
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@ -1231,34 +1249,24 @@ class VocabFactory:
return vocab, special_vocab return vocab, special_vocab
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str: def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
quantization = { name = metadata.name if metadata.name is not None else None
basename = metadata.basename if metadata.basename is not None else None
finetune = metadata.finetune if metadata.finetune is not None else None
version = metadata.version if metadata.version is not None else None
size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
output_type = {
GGMLFileType.AllF32: "F32", GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "F16", GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0: "Q8_0", GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type] }[file_type]
parameters = model_parameter_count_rounded_notation(model_params_count) return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path: def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata) default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf" ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths: if ret in model_paths:
logger.error( logger.error(
@ -1297,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity") parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file") parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name") parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
args = parser.parse_args(args_in) args = parser.parse_args(args_in)
@ -1310,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None:
else: else:
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
metadata = Metadata.load(args.metadata) model_name = args.model_name
dir_model = args.model
metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
if args.get_outfile: if args.get_outfile:
model_plus = load_some_model(args.model) model_plus = load_some_model(dir_model)
params = Params.load(model_plus) params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown) model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model) model_params_count = per_model_weight_count_estimation(model_plus.model.items())
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
if (metadata is None or metadata.name is None) and params.path_model is not None:
metadata.name = params.path_model.name
print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
return return
if args.no_vocab and args.vocab_only: if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab") raise ValueError("--vocab-only does not make sense with --no-vocab")
if args.dump_single: if args.dump_single:
model_plus = lazy_load_file(args.model) model_plus = lazy_load_file(dir_model)
do_dump_model(model_plus) do_dump_model(model_plus)
return return
if not args.vocab_only: if not args.vocab_only:
model_plus = load_some_model(args.model) model_plus = load_some_model(dir_model)
else: else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
@ -1368,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None:
logger.info(f"params = {params}") logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
vocab_factory = VocabFactory(vocab_path) vocab_factory = VocabFactory(vocab_path)
vocab_types = None if args.no_vocab else args.vocab_type.split(",") vocab_types = None if args.no_vocab else args.vocab_type.split(",")
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
@ -1399,13 +1412,21 @@ def main(args_in: list[str] | None = None) -> None:
assert params is not None assert params is not None
if metadata.name is None and params.path_model is not None:
metadata.name = params.path_model.name
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
logger.info(f"Vocab info: {vocab}") logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}") logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown) model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype) model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata) outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
params.ftype = ftype params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}") logger.info(f"Writing {outfile}, format {ftype}")

View File

@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
auto decoded = decode_utf8(input_str, {}); auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first; const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
size_t pos = 0; size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks; const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) { llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
error_pos = pos; error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks; cur_stacks = prev_stacks;
return false; return false;
} }
++pos; ++pos;
} }
for (const auto & stack : grammar->stacks) { for (const auto & stack : cur_stacks) {
if (stack.empty()) { if (stack.empty()) {
return true; return true;
} }

View File

@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

View File

@ -23,6 +23,10 @@
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml-sycl.h" #include "ggml-sycl.h"
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
// utils // utils
static uint64_t get_time_ns() { static uint64_t get_time_ns() {
using clock = std::chrono::high_resolution_clock; using clock = std::chrono::high_resolution_clock;
@ -120,6 +124,17 @@ static std::string get_gpu_info() {
id += "/"; id += "/";
} }
} }
#endif
#ifdef GGML_USE_CANN
uint32_t count = ggml_backend_cann_get_device_count();
for (uint32_t i = 0; i < count; i++) {
char buf[128];
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif #endif
// TODO: other backends // TODO: other backends
return id; return id;

View File

@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF(""); return nullptr;
} }
auto new_token_chars = llama_token_to_piece(context, new_token_id); auto new_token_chars = llama_token_to_piece(context, new_token_id);

View File

@ -26,11 +26,12 @@ actor LlamaContext {
private var context: OpaquePointer private var context: OpaquePointer
private var batch: llama_batch private var batch: llama_batch
private var tokens_list: [llama_token] private var tokens_list: [llama_token]
var is_done: Bool = false
/// This variable is used to store temporarily invalid cchars /// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar] private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 64 var n_len: Int32 = 1024
var n_cur: Int32 = 0 var n_cur: Int32 = 0
var n_decode: Int32 = 0 var n_decode: Int32 = 0
@ -160,6 +161,7 @@ actor LlamaContext {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len { if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n") print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0]) let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll() temporary_invalid_cchars.removeAll()
return new_token_str return new_token_str

View File

@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
messageLog += "\(text)" messageLog += "\(text)"
Task.detached { Task.detached {
while await llamaContext.n_cur < llamaContext.n_len { while await !llamaContext.is_done {
let result = await llamaContext.completion_loop() let result = await llamaContext.completion_loop()
await MainActor.run { await MainActor.run {
self.messageLog += "\(result)" self.messageLog += "\(result)"

View File

@ -16,6 +16,10 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h" #include "stb_image.h"
@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_TEE("%s: CLIP using Metal backend\n", __func__); LOG_TEE("%s: CLIP using Metal backend\n", __func__);
#endif #endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
#endif
if (!new_clip->backend) { if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init(); new_clip->backend = ggml_backend_cpu_init();

View File

@ -31,7 +31,6 @@ int main(int argc, char ** argv){
// load the model // load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
@ -65,7 +64,7 @@ int main(int argc, char ** argv){
} }
const int n_input = inp.size(); const int n_input = inp.size();
const int n_ctx = params.n_ctx; const int n_ctx = llama_n_ctx(ctx);
int n_drafted = 0; int n_drafted = 0;
int n_accept = 0; int n_accept = 0;

View File

@ -39,7 +39,6 @@ int main(int argc, char ** argv){
// load the model // load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;

examples/pydantic_models_to_grammar_examples.py Normal file → Executable file
View File

@ -1,8 +1,15 @@
# Function calling example using pydantic models. #!/usr/bin/env python3
"""Function calling example using pydantic models."""
from __future__ import annotations from __future__ import annotations
import argparse
import datetime import datetime
import json import json
import logging
import textwrap
import sys
from enum import Enum from enum import Enum
from typing import Optional, Union from typing import Optional, Union
@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
# Function to get completion on the llama.cpp server with grammar. def create_completion(host, prompt, gbnf_grammar):
def create_completion(prompt, grammar): """Calls the /completion API on llama-server.
See
https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
"""
print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = {"prompt": prompt, "grammar": grammar} data = {"prompt": prompt, "grammar": gbnf_grammar}
result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
data = response.json()
assert data.get("error") is None, data assert data.get("error") is None, data
logging.info("Result: %s", result)
print(data["content"]) content = result["content"]
return data["content"] print(f" Model: {result['model']}")
print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
return content
# A function for the agent to send a message to the user. # A function for the agent to send a message to the user.
class SendMessageToUser(BaseModel): class SendMessageToUser(BaseModel):
""" """Send a message to the User."""
Send a message to the User.
"""
chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
message: str = Field(..., description="Message you want to send to the user.") message: str = Field(..., description="Message you want to send to the user.")
def run(self): def run(self):
print(self.message) print(f"SendMessageToUser: {self.message}")
def example_rce(host):
"""Minimal test case where the LLM call an arbitrary python function."""
print("- example_rce")
tools = [SendMessageToUser]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
tools_map = {tool.__name__:tool for tool in tools}
# This finds "SendMessageToUser":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
tool(**json_data["function_parameters"]).run()
return 0
# Enum for the calculator tool. # Enum for the calculator tool.
@ -46,11 +77,11 @@ class MathOperation(Enum):
DIVIDE = "divide" DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. # Simple pydantic calculator tool for the agent that can add, subtract,
# multiply, and divide. Docstring and description of fields will be used in
# system prompt.
class Calculator(BaseModel): class Calculator(BaseModel):
""" """Perform a math operation on two numbers."""
Perform a math operation on two numbers.
"""
number_one: Union[int, float] = Field(..., description="First number.") number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.") operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.") number_two: Union[int, float] = Field(..., description="Second number.")
@ -68,55 +99,61 @@ class Calculator(BaseModel):
raise ValueError("Unknown operation.") raise ValueError("Unknown operation.")
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. def example_calculator(host):
# pydantic_model_list is the list of pydantic models """Have the LLM ask to get a calculation done.
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
# outer_object_content is the name of outer object content. Here the grammar gets generated by passing the available function models to
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") generate_gbnf_grammar_and_documentation function. This also generates a
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") documentation usable by the LLM.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", pydantic_model_list is the list of pydantic models outer_object_name is an
optional name for an outer object around the actual model object. Like a
"function" object with "function_parameters" which contains the actual model
object. If None, no outer object will be generated outer_object_content is
the name of outer object content.
model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
"""
print("- example_calculator")
tools = [SendMessageToUser, Calculator]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
print(gbnf_grammar) user_message1 = "What is 42 * 42?"
print(documentation) prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation json_data = json.loads(text)
expected = {
user_message = "What is 42 * 42?" "function": "Calculator",
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" "function_parameters": {
"number_one": 42,
text = create_completion(prompt=prompt, grammar=gbnf_grammar) "operation": "multiply",
# This should output something like this: "number_two": 42
# { }
# "function": "calculator", }
# "function_parameters": { if json_data != expected:
# "number_one": 42, print(" Result is not as expected!")
# "operation": "multiply", tools_map = {tool.__name__:tool for tool in tools}
# "number_two": 42 # This finds "Calculator":
# } tool = tools_map.get(json_data["function"])
# } if not tool:
function_dictionary = json.loads(text) print(f"Error: unknown tool {json_data['function']}")
if function_dictionary["function"] == "calculator": return 1
function_parameters = {**function_dictionary["function_parameters"]} result = tool(**json_data["function_parameters"]).run()
print(f" Call {json_data['function']} gave result {result}")
print(Calculator(**function_parameters).run()) return 0
# This should output: 1764
# An example structured output based on pydantic models. The LLM will create an entry for a Book database out of unstructured text.
class Category(Enum): class Category(Enum):
""" """The category of the book."""
The category of the book.
"""
Fiction = "Fiction" Fiction = "Fiction"
NonFiction = "Non-Fiction" NonFiction = "Non-Fiction"
class Book(BaseModel): class Book(BaseModel):
""" """Represents an entry about a book."""
Represents an entry about a book.
"""
title: str = Field(..., description="Title of the book.") title: str = Field(..., description="Title of the book.")
author: str = Field(..., description="Author of the book.") author: str = Field(..., description="Author of the book.")
published_year: Optional[int] = Field(..., description="Publishing year of the book.") published_year: Optional[int] = Field(..., description="Publishing year of the book.")
@ -125,33 +162,42 @@ class Book(BaseModel):
summary: str = Field(..., description="Summary of the book.") summary: str = Field(..., description="Summary of the book.")
# We need no additional parameters other than our list of pydantic models. def example_struct(host):
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) """An example structured output based on pydantic models.
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation The LLM will create an entry for a Book database out of an unstructured
text. We need no additional parameters other than our list of pydantic
models.
"""
print("- example_struct")
tools = [Book]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
# In this case, there's no function nor function_parameters.
# Here the result will vary based on the LLM used.
keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
if keys != sorted(json_data.keys()):
print(f"Unexpected result: {sorted(json_data.keys())}")
return 1
book = Book(**json_data)
print(f" As a Book object: %s" % book)
return 0
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text)
print(Book(**json_data))
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
def get_current_datetime(output_format: Optional[str] = None): def get_current_datetime(output_format: Optional[str] = None):
""" """Get the current date and time in the given format.
Get the current date and time in the given format.
Args: Args:
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S' output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
""" """
if output_format is None: return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
output_format = '%Y-%m-%d %H:%M:%S'
return datetime.datetime.now().strftime(output_format)
# Example function to get the weather # Example function to get the weather.
def get_current_weather(location, unit): def get_current_weather(location, unit):
"""Get the current weather in a given location""" """Get the current weather in a given location"""
if "London" in location: if "London" in location:
@ -160,12 +206,16 @@ def get_current_weather(location, unit):
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value}) return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
elif "North Pole" in location: elif "North Pole" in location:
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value}) return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
else:
return json.dumps({"location": location, "temperature": "unknown"}) return json.dumps({"location": location, "temperature": "unknown"})
# Here is a function definition in OpenAI style def example_concurrent(host):
current_weather_tool = { """An example for parallel function calling with a Python function, a pydantic
function model and an OpenAI like function definition.
"""
print("- example_concurrent")
# Function definition in OpenAI style.
current_weather_tool = {
"type": "function", "type": "function",
"function": { "function": {
"name": "get_current_weather", "name": "get_current_weather",
@ -182,46 +232,81 @@ current_weather_tool = {
"required": ["location"], "required": ["location"],
}, },
}, },
} }
# Convert OpenAI function definition into pydantic model.
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model.
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
# Convert OpenAI function definition into pydantic model # Convert normal Python function to a pydantic model.
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
# Add the actual function to a pydantic model
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
# Convert normal Python function to a pydantic model tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
current_datetime_model = create_dynamic_model_from_function(get_current_datetime) gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tool_list, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
expected = [
{
"function": "get_current_datetime",
"params": {
"output_format": "%Y-%m-%d %H:%M:%S"
}
},
{
"function": "get_current_weather",
"params": {
"location": "London",
"unit": "celsius"
}
},
{
"function": "Calculator",
"params": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
]
res = 0
if json_data != expected:
print(" Result is not as expected!")
print(" This can happen on highly quantized models")
res = 1
tools_map = {tool.__name__:tool for tool in tools}
for call in json_data:
tool = tools_map.get(call["function"])
if not tool:
print(f"Error: unknown tool {call['function']}")
return 1
result = tool(**call["params"]).run()
print(f" Call {call['function']} returned {result}")
# Should output something like this:
# Call get_current_datetime returned 2024-07-15 09:50:38
# Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
# Call Calculator returned 1764
return res
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" def main():
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
text = create_completion(prompt=prompt, grammar=gbnf_grammar) parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
args = parser.parse_args()
json_data = json.loads(text) logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
ret = 0
print(json_data) # Comment out below to only run the example you want.
# Should output something like this: ret = ret or example_rce(args.host)
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}] ret = ret or example_calculator(args.host)
ret = ret or example_struct(args.host)
ret = ret or example_concurrent(args.host)
return ret
for call in json_data: if __name__ == "__main__":
if call["function"] == "Calculator": sys.exit(main())
print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
# Should output something like this:
# 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"}
# 1764

View File

@ -444,7 +444,7 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
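For reference, the three parameters above are just fields of the JSON body sent to the server's /completion endpoint. The following is a minimal editorial sketch, not part of this diff; it assumes the cpp-httplib and nlohmann::json headers that llama.cpp already vendors (the include paths are illustrative) and a server listening on localhost:8080, and the prompt text and values are arbitrary.

#include <iostream>
#include "httplib.h"   // cpp-httplib (vendored with the server example); path is an assumption
#include "json.hpp"    // nlohmann::json (vendored in the repo); path is an assumption

int main() {
    // Arbitrary example: predict at most 64 tokens, keep the whole prompt on
    // context overflow (n_keep = -1), and disable streaming.
    nlohmann::json payload = {
        {"prompt", "Building a website can be done in 10 simple steps:"},
        {"n_predict", 64},
        {"n_keep", -1},
        {"stream", false},
    };
    httplib::Client cli("http://localhost:8080");
    auto res = cli.Post("/completion", payload.dump(), "application/json");
    if (res && res->status == 200) {
        std::cout << nlohmann::json::parse(res->body)["content"] << std::endl;
    }
    return 0;
}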

View File

@ -21,7 +21,7 @@ let generation_settings = null;
// //
export async function* llama(prompt, params = {}, config = {}) { export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller; let controller = config.controller;
const api_url = config.api_url || ""; const api_url = config.api_url?.replace(/\/+$/, '') || "";
if (!controller) { if (!controller) {
controller = new AbortController(); controller = new AbortController();
@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => {
// Get the model info from the server. This is useful for getting the context window and so on. // Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => { export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) { if (!generation_settings) {
const api_url = config.api_url || ""; const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json()); const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings; generation_settings = props.default_generation_settings;
} }

View File

@ -14,10 +14,10 @@
<script type="module"> <script type="module">
import { import {
html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
} from '/index.js'; } from './index.js';
import { llama } from '/completion.js'; import { llama } from './completion.js';
import { SchemaConverter } from '/json-schema-to-grammar.mjs'; import { SchemaConverter } from './json-schema-to-grammar.mjs';
import { promptFormats } from './prompt-formats.js'; import { promptFormats } from './prompt-formats.js';
import { systemPrompts } from './system-prompts.js'; // multilingual is wip import { systemPrompts } from './system-prompts.js'; // multilingual is wip
let selected_image = false; let selected_image = false;
@ -225,7 +225,7 @@
throw new Error("already running"); throw new Error("already running");
} }
controller.value = new AbortController(); controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) { for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
const data = chunk.data; const data = chunk.data;
if (data.stop) { if (data.stop) {
while ( while (

View File

@ -479,7 +479,7 @@
throw new Error("already running"); throw new Error("already running");
} }
controller.value = new AbortController(); controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) { for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
const data = chunk.data; const data = chunk.data;
if (data.stop) { if (data.stop) {

View File

@ -1182,7 +1182,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok, false); const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it

flake.lock generated
View File

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1720768451, "lastModified": 1721379653,
"narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=", "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9", "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
"type": "github" "type": "github"
}, },
"original": { "original": {

View File

@ -194,13 +194,19 @@ endif ()
include(GNUInstallDirs) include(GNUInstallDirs)
include(CMakePackageConfigHelpers) include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS set(GGML_PUBLIC_HEADERS
include/ggml.h include/ggml.h
include/ggml-alloc.h include/ggml-alloc.h
include/ggml-backend.h include/ggml-backend.h
"${GGML_HEADERS_CUDA}" include/ggml-blas.h
"${GGML_HEADERS_METAL}" include/ggml-cuda.h
"${GGML_HEADERS_EXTRA}") include/ggml.h
include/ggml-kompute.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL) #if (GGML_METAL)

View File

@ -29,6 +29,7 @@ extern "C" {
enum ggml_backend_buffer_usage { enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0, GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
}; };
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
@ -42,6 +43,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
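As context for the new GGML_BACKEND_BUFFER_USAGE_COMPUTE value and the usage getter added here, a minimal editorial sketch (not part of the diff; it assumes a CPU backend build of ggml):

#include <cstdio>
#include "ggml-backend.h"

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();
    // Allocate a scratch buffer, tag it as compute storage, then read the tag
    // back through the new getter.
    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, 1u << 20);
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
    enum ggml_backend_buffer_usage usage = ggml_backend_buffer_get_usage(buf);
    printf("buffer usage = %d (2 == COMPUTE)\n", (int) usage);
    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    return 0;
}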

ggml/include/ggml-cann.h
View File

@ -0,0 +1,125 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#pragma once
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Maximum number of CANN devices supported.
*/
#define GGML_CANN_MAX_DEVICES 16
/**
* @brief Initializes the CANN backend for a specified device.
*
* This function initializes the CANN backend for the given device.
* It verifies the device index, allocates a context, and creates a backend
* instance.
*
* @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure.
*/
GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
/**
* @brief Checks if a given backend is a CANN backend.
*
* This function verifies if the provided backend is a CANN backend by comparing
* its GUID with the CANN backend's GUID.
*
* @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise.
*/
GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
/**
* @brief Retrieves the CANN buffer type for a specified device.
*
* This function initializes and returns the buffer type interface associated
* with the given device. It ensures thread-safe access using a mutex.
*
* @param device The device index for which to retrieve the buffer type.
* @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range.
*/
GGML_API GGML_CALL ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device);
/**
* @brief Retrieves the number of CANN devices available.
*
* This function returns the number of CANN devices available based on
* information obtained from `ggml_cann_info()`.
*
* @return The number of CANN devices available.
*/
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief Retrieves the description of a specific CANN device.
*
* This function sets the specified device, retrieves the SoC name,
* and writes it into the provided description buffer.
*
* @param device The device index to retrieve the description for.
* @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer.
*/
GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size);
/**
* @brief Retrieves the memory information of a specific CANN device.
*
* This function sets the specified device, retrieves the free and total
* memory information of the specified type (ACL_HBM_MEM), and stores them
* in the provided pointers.
*
* @param device The device index to retrieve memory information for.
* @param free Pointer to a variable where the free memory size will be stored.
* @param total Pointer to a variable where the total memory size will be
* stored.
*/
GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free,
size_t* total);
/**
* @brief Set the logging callback for GGML.
*
* This function sets the logging callback and user data for logging.
*
* @param log_callback The logging callback to set.
* @param user_data User data to pass to the logging callback.
*/
GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
void* user_data);
#ifdef __cplusplus
}
#endif
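To show how the API declared above fits together, here is an editorial usage sketch (not part of the diff); it assumes a build with GGML_CANN enabled and uses only the functions declared in this header plus ggml_backend_free() from ggml-backend.h:

#include <cstdio>
#include "ggml-cann.h"

int main() {
    int32_t n_devices = ggml_backend_cann_get_device_count();
    printf("CANN devices: %d\n", n_devices);
    for (int32_t i = 0; i < n_devices; i++) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cann_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cann_get_device_memory(i, &free_mem, &total_mem);
        printf("  device %d: %s, %zu / %zu bytes free\n", i, desc, free_mem, total_mem);
    }
    if (n_devices > 0) {
        // Returns nullptr on failure, as documented above.
        ggml_backend_t backend = ggml_backend_cann_init(0);
        if (backend != nullptr) {
            ggml_backend_free(backend);
        }
    }
    return 0;
}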

View File

@ -753,6 +753,8 @@ extern "C" {
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
// use this to compute the memory overhead of a tensor // use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void); GGML_API size_t ggml_tensor_overhead(void);
@ -2394,6 +2396,7 @@ extern "C" {
GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void); GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void);
// //
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks

View File

@ -467,15 +467,18 @@ if (GGML_SYCL)
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
endif() endif()
if ( NOT DEFINED ENV{ONEAPI_ROOT}) check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") if ( DEFINED ENV{ONEAPI_ROOT})
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
elseif(SUPPORTS_SYCL)
message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
source /opt/intel/oneapi/setvars.sh")
else()
message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
endif() endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
message(STATUS "SYCL found") message(STATUS "SYCL found")
#todo: AOT
list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL) list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
@ -487,11 +490,9 @@ if (GGML_SYCL)
add_compile_definitions(GGML_SYCL_FORCE_MMQ) add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif() endif()
add_compile_options(-I./) #include DPCT set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA") if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32) add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else() else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16) add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
@ -504,15 +505,15 @@ if (GGML_SYCL)
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
if (WIN32) if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else() else()
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (GGML_SYCL_TARGET STREQUAL "INTEL") if (GGML_SYCL_TARGET STREQUAL "INTEL")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} pthread m dl onemkl)
endif() endif()
endif() endif()
endif() endif()
@ -770,6 +771,74 @@ if (GGML_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind) target_link_libraries(ggml PUBLIC memkind)
endif() endif()
if (GGML_CANN)
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()
if (CANN_INSTALL_DIR)
# Only Support Linux.
if (GGML_CANN)
if (NOT UNIX)
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
endif()
endif()
# Supported platforms: x86-64, arm64
if (GGML_CANN)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
else()
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
endif()
endif()
# Set header and libs
if(GGML_CANN)
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)
# TODO: find libs
link_directories(
${CANN_INSTALL_DIR}/lib64
)
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
set(GGML_HEADERS_CANN "../include/ggml-cann.h")
file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
endif()
else()
set(GGML_CANN OFF)
message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
endif()
if(NOT GGML_CANN)
message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
endif()
endif()
function(get_flags CCID CCVER) function(get_flags CCID CCVER)
set(C_FLAGS "") set(C_FLAGS "")
set(CXX_FLAGS "") set(CXX_FLAGS "")
@ -1184,6 +1253,7 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
ggml-aarch64.c ggml-aarch64.h ggml-aarch64.c ggml-aarch64.h
) )

View File

@ -776,6 +776,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false; return false;
} }
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} }
} }

View File

@ -134,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
} }
} }
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
return buffer->usage;
}
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
return buffer->buft; return buffer->buft;
} }
@ -445,6 +449,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
extern GGML_CALL void ggml_backend_kompute_reg_devices(void); extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
ggml_backend_kompute_reg_devices(); ggml_backend_kompute_reg_devices();
#endif #endif
#ifdef GGML_USE_CANN
extern GGML_CALL int ggml_backend_cann_reg_devices(void);
ggml_backend_cann_reg_devices();
#endif
} }
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
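Once CANN registers its devices here, they are visible through the same registry as every other backend. A small editorial sketch (not part of the diff; it assumes the ggml_backend_reg_* enumeration functions from ggml-backend.h):

#include <cstdio>
#include "ggml-backend.h"

int main() {
    // Iterates the global registry filled in by ggml_backend_registry_init();
    // CANN entries only appear in builds compiled with GGML_USE_CANN.
    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
        printf("backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
    }
    return 0;
}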

ggml/src/ggml-cann.cpp

File diff suppressed because it is too large

View File

@ -0,0 +1,168 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '.*'
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentCaseLabels: true
IndentGotoLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
BasedOnStyle: google
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: Auto
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
...

ggml/src/ggml-cann/Doxyfile

File diff suppressed because it is too large

View File

@ -0,0 +1,198 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "acl_tensor.h"
#include <algorithm>
#include <cstring>
aclDataType ggml_cann_type_mapping(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return ACL_FLOAT;
case GGML_TYPE_F16:
return ACL_FLOAT16;
case GGML_TYPE_I8:
return ACL_INT8;
case GGML_TYPE_I16:
return ACL_INT16;
case GGML_TYPE_I32:
return ACL_INT32;
default:
return ACL_DT_UNDEFINED;
}
return ACL_DT_UNDEFINED;
}
aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
size_t* nb, int64_t dims, aclFormat format,
size_t offset) {
// If the tensor is bcasted, up to GGML_MAX_DIMS additional dimensions will be
// added.
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
int64_t acl_storage_len = 0;
if (ne == nullptr) {
acl_storage_len = ggml_nbytes(tensor);
for (int i = 0; i < GGML_MAX_DIMS; i++) {
acl_ne[i] = tensor->ne[i];
// The step size of acl is in elements.
acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
}
} else {
// With bcast
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
acl_ne[i] = ne[i];
acl_stride[i] = nb[i] / ggml_element_size(tensor);
}
}
// Reverse ne and stride.
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
std::reverse(acl_ne, acl_ne + final_dims);
std::reverse(acl_stride, acl_stride + final_dims);
aclTensor* acl_tensor = aclCreateTensor(
acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
tensor->data);
return acl_tensor;
}
bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
return true;
}
}
return false;
}
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format,
size_t offset) {
int64_t tmp_ne[GGML_MAX_DIMS * 2];
int64_t tmp_stride[GGML_MAX_DIMS * 2];
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
for (int i = 0; i < dims; i++) {
tmp_stride[i] = nb[i] / type_size;
}
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);
int64_t acl_storage_len = 0;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
}
aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);
return acl_tensor;
}
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
const ggml_tensor* src1,
int64_t* bcast_src0_ne,
int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
size_t* bcast_src1_nb) {
GGML_ASSERT(ggml_can_repeat(src1, src0));
int bcast_dim_cnt = 0;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
int64_t nr = src0->ne[i] / src1->ne[i];
bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
bcast_src1_nb[bcast_dim_cnt] = src1->nb[i];
bcast_dim_cnt++;
if (nr != 1) {
// Need to add an extra dim.
bcast_src0_ne[bcast_dim_cnt] = nr;
bcast_src1_ne[bcast_dim_cnt] = 1;
bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
bcast_src0_ne[bcast_dim_cnt - 1];
bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
bcast_src1_ne[bcast_dim_cnt - 1];
bcast_dim_cnt++;
}
}
return bcast_dim_cnt;
}
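To make the loop above concrete, a small worked example (values chosen by the editor for illustration, not taken from the diff):

// Illustrative only: contiguous F32 tensors with
//   src0->ne = {4, 8, 2, 1}, src0->nb = {4, 16, 128, 256}
//   src1->ne = {4, 2, 2, 1}, src1->nb = {4, 16,  32,  64}
// Only dim 1 needs broadcasting (nr = 8 / 2 = 4), so one extra dim is inserted:
//   bcast_src0_ne = {4, 2, 4, 2, 1}, bcast_src0_nb = {4, 16, 32, 128, 256}
//   bcast_src1_ne = {4, 2, 1, 2, 1}, bcast_src1_nb = {4, 16, 32,  32,  64}
// and the function returns bcast_dim_cnt = 5.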
int64_t ggml_cann_get_mulmat_bcast_shape(
const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
// input and dst should have the same shape, except for the first two dims.
GGML_ASSERT(input_ne[2] == dst_ne[2]);
GGML_ASSERT(input_ne[3] == dst_ne[3]);
int bcast_dim_cnt = 0;
// For mul_mat, an extra dimension needs to be inserted in front of the
// dimension along which the weight must be expanded, to satisfy the bcast
// rule of matrix multiplication.
for (int i = 0; i < GGML_MAX_DIMS; i++) {
int64_t nr = input_ne[i] / weight_ne[i];
// Do not use bcast in the first two dimensions because we only support
// the bcast batch dimension. Just copy them.
if (i < 2 || nr == 1) {
bcast_input_ne[bcast_dim_cnt] = input_ne[i];
bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
bcast_input_nb[bcast_dim_cnt] = input_nb[i];
bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
bcast_dim_cnt++;
} else {
// Need to add an extra dim.
bcast_input_ne[bcast_dim_cnt] = nr;
bcast_dst_ne[bcast_dim_cnt] = nr;
bcast_weight_ne[bcast_dim_cnt] = 1;
bcast_input_nb[bcast_dim_cnt] = input_nb[i];
bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
bcast_dim_cnt++;
bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
bcast_input_ne[bcast_dim_cnt - 1];
bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
bcast_dst_ne[bcast_dim_cnt - 1];
bcast_weight_nb[bcast_dim_cnt] =
bcast_weight_nb[bcast_dim_cnt - 1] *
bcast_weight_ne[bcast_dim_cnt - 1];
bcast_dim_cnt++;
}
}
return bcast_dim_cnt;
}

View File

@ -0,0 +1,230 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H
#include <aclnn/aclnn_base.h>
#include "common.h"
/**
* @brief Maps a ggml_type to its corresponding aclDataType.
*
* @details This function takes a ggml_type as input and returns the corresponding
* aclDataType. It supports mapping for various ggml_types. If the input type
* does not match any of the predefined ggml_types, the function returns
* ACL_DT_UNDEFINED.
*
* @param type The ggml_type to be mapped.
* @return The corresponding aclDataType. If the input type is not recognized,
* ACL_DT_UNDEFINED is returned.
*/
aclDataType ggml_cann_type_mapping(ggml_type type);
/**
* @brief Creates an ACL tensor from a ggml_tensor with optional shape.
*
* @details This function creates an ACL tensor based on the properties of the
* provided ggml_tensor. It supports a custom shape by adjusting dimensions
* and strides accordingly. If a custom shape is applied, additional
* dimensions and strides are calculated based on the provided parameters.
*
* @param tensor Pointer to the ggml_tensor to be converted to ACL tensor.
* @param ne Pointer to an array containing dimensions. Defaults to nullptr
* if no custom shape is applied.
* @param nb Pointer to an array containing strides. Defaults to nullptr
* if no custom shape is applied.
* @param dims Number of dimensions in the tensor. Defaults to 0 if no custom
* shape is applied.
* @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
* @return Pointer to the created ACL tensor.
*/
aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
size_t* nb = nullptr, int64_t dims = 0,
aclFormat format = ACL_FORMAT_ND,
size_t offset = 0);
/**
* @brief Creates an ACL tensor from provided parameters.
*
* @details This function creates an ACL tensor using the provided data pointer,
* data type, dimensions, strides, format, offset, and additional parameters.
* It calculates necessary dimensions and strides based on the provided ne and nb
* arrays, adjusting them for the ACL tensor creation. The ACL storage length
* is also calculated based on the provided dimensions and strides.
*
* @param data_ptr Pointer to the data buffer for the ACL tensor.
* @param dtype ACL data type of the tensor.
* @param type_size Size of each element in the tensor data buffer.
* @param ne Pointer to an array containing tensor dimensions.
* @param nb Pointer to an array containing tensor strides.
* @param dims Number of dimensions of the tensor.
* @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
* @return Pointer to the created ACL tensor.
*/
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format = ACL_FORMAT_ND,
size_t offset = 0);
/**
* @brief Checks if tensors require broadcasting based on their shapes.
*
* @details This function determines if two ggml_tensors need to be broadcasted for
* element-wise operations. Broadcasting is needed when a dimension of t1
* differs from the corresponding dimension of t0 and is not equal to 1.
*
* @param t0 Pointer to the first ggml_tensor.
* @param t1 Pointer to the second ggml_tensor.
* @return True if broadcasting is needed, False otherwise.
*
* @remarks This function iterates over the dimensions of t0 and t1. It checks if each
* dimension in t1 differs from t0's corresponding dimension and is not equal
* to 1. If such a dimension is found, broadcasting is required to align t1
* with t0 for element-wise operations.
*/
bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
/**
* @brief Computes broadcast shapes and strides for two ggml_tensors.
*
* @details This function calculates the broadcast shapes and strides for two ggml_tensors,
* following the broadcasting rules similar to numpy. It adjusts dimensions and
* strides to ensure compatibility for element-wise operations where one tensor
* can be broadcasted to match the shape of another tensor.
*
* @param src0 Pointer to the first ggml_tensor.
* @param src1 Pointer to the second ggml_tensor.
* @param bcast_ne_src0 Output array to store broadcasted dimensions for src0.
* @param bcast_ne_src1 Output array to store broadcasted dimensions for src1.
* @param bcast_nb_src0 Output array to store broadcasted strides for src0.
* @param bcast_nb_src1 Output array to store broadcasted strides for src1.
* @return Number of dimensions in the broadcasted shape.
*
* @pre ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted
* to match src0.
*
* @remarks This function iterates over the dimensions of src0 and src1, calculating the
* necessary broadcast dimensions and strides. If a dimension requires broadcasting
* (i.e., its size in src1 is smaller than in src0), an additional dimension is
* added with size calculated to match src0's dimension. This adjustment ensures
* that src1 can be element-wise broadcasted to src0's shape.
*
* How it works:
*
* if dim0 has padding.
* a -> (2, 2) padding = 2
* a: [[1, 2, *, *]
* [2, 3, *, *]]
* nb = (8, 4, 2)
*
* if a should bcast with b -> (2, 4)
* b' -> (2, 2, 2)
* b : [[1, 2, 3, 4, *, *]
* [5, 6, 7, 8, *, *]]
* nb = (12, 6, 1)
*
* after bcast:
* a' -> (2, 1, 2)
* a': [[[1, 2], *, *]
* [[2, 3], *, *]]
* nb = (8, 4, 2, 1)
*
* b' : [[[1, 2], [3, 4], *, *]
* [[5, 6], [7, 8], *, *]]
* nb = (12, 6, 2, 1)
*
* dim1 in a is an inserted dim; an nb entry is added for dim1,
* and all other nb values move to the next position in order.
*/
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
size_t* bcast_nb_src0, size_t* bcast_nb_src1);
// Bcast macro to avoid duplicate code.
#define BCAST_SHAPE(src0, src1) \
int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2]; \
int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2]; \
size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2]; \
size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2]; \
int64_t bcast_dims = ggml_cann_get_bcast_shape( \
src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
bcast_##src1##_nb);
#define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
/**
* @brief Calculates broadcast shapes for matrix multiplication.
*
* @details This function computes the broadcast shapes required for matrix multiplication
* based on the input, weight, and destination tensor shapes. It ensures that the
* dimensions of weight tensors are expanded appropriately to satisfy matrix
* multiplication broadcast rules.
*
* @param input_ne Array containing the dimensions of the input tensor.
* @param weight_ne Array containing the dimensions of the weight tensor.
* @param dst_ne Array containing the dimensions of the destination tensor.
* @param input_nb Array containing the strides of the input tensor.
* @param weight_nb Array containing the strides of the weight tensor.
* @param dst_nb Array containing the strides of the destination tensor.
* @param bcast_input_ne Output array for broadcasted input tensor dimensions.
* @param bcast_weight_ne Output array for broadcasted weight tensor dimensions.
* @param bcast_dst_ne Output array for broadcasted destination tensor dimensions.
* @param bcast_input_nb Output array for broadcasted input tensor strides.
* @param bcast_weight_nb Output array for broadcasted weight tensor strides.
* @param bcast_dst_nb Output array for broadcasted destination tensor strides.
* @return The number of dimensions in the broadcasted tensors.
*
* @remarks This function iterates over the tensor dimensions and calculates the broadcast
* shapes needed for matrix multiplication. It ensures that dimensions where
* weight tensor requires expansion are appropriately handled to conform with
* broadcasting rules.
* @note Compared with ggml_cann_get_bcast_shape, mul_mat broadcasting needs to add this new dim
* in front of the bcast dim.
* @sa ggml_cann_get_bcast_shape
*/
int64_t ggml_cann_get_mulmat_bcast_shape(
const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
// Bcast macro to avoid duplicate code.
#define BCAST_MUL_MAT_SHAPE(input, weight, dst) \
int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2]; \
int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2]; \
int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2]; \
size_t bcast_##input##_nb[GGML_MAX_DIMS * 2]; \
size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2]; \
size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2]; \
int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape( \
input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne, \
bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
#define BCAST_MUL_MAT_PARAM(tensor) \
bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
#endif // CANN_ACL_TENSOR_H
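A hedged sketch of how the helpers and macros above are meant to be combined inside an element-wise CANN operator (editorial, not part of the diff; the function name is hypothetical and the actual aclnn kernel call is omitted):

// Hypothetical example, for illustration only.
static void example_prepare_binary_inputs(ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];

    aclTensor* acl_src0 = nullptr;
    aclTensor* acl_src1 = nullptr;
    if (ggml_cann_need_bcast(src0, src1)) {
        // Expands both shapes so that src1 lines up with src0 element-wise.
        BCAST_SHAPE(src0, src1)
        acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
        acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
    } else {
        acl_src0 = ggml_cann_create_tensor(src0);
        acl_src1 = ggml_cann_create_tensor(src1);
    }
    // ... hand acl_src0 / acl_src1 to the corresponding aclnn kernel here ...
    (void) acl_src0;
    (void) acl_src1;
}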

File diff suppressed because it is too large

View File

@ -0,0 +1,592 @@
#ifndef CANN_ACLNN_OPS
#define CANN_ACLNN_OPS
/**
* @file acl_tensor
* @brief This file contains related functions of ggml_tensor and acl_tensor.
* Contains conversion from ggml_tensor to acl_tensor, broadcast and other
* functions.
* @author hipudding <huafengchun@gmail.com>
* @author wangshuai09 <391746016@qq.com>
* @date July 15, 2024
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <aclnnop/aclnn_add.h>
#include <aclnnop/aclnn_arange.h>
#include <aclnnop/aclnn_argsort.h>
#include <aclnnop/aclnn_cat.h>
#include <aclnnop/aclnn_clamp.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_gelu.h>
#include <aclnnop/aclnn_hardsigmoid.h>
#include <aclnnop/aclnn_hardswish.h>
#include <aclnnop/aclnn_leaky_relu.h>
#include <aclnnop/aclnn_mul.h>
#include <aclnnop/aclnn_relu.h>
#include <aclnnop/aclnn_silu.h>
#include <aclnnop/aclnn_tanh.h>
#include "acl_tensor.h"
#include "common.h"
/**
* @brief Repeats a ggml tensor along each dimension to match the dimensions
* of another tensor.
*
* @details This function repeats the elements of a source ggml tensor along
* each dimension to create a destination tensor with the specified
* dimensions. The operation is performed using the ACL backend and
* executed asynchronously on the device.
*
* @param ctx The CANN context used for operations.
* @param dst The ggml tensor representing the destination, which op is
* GGML_OP_REPEAT and specifies the desired dimensions.
*/
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Adds two ggml tensors using the CANN backend.
*
* @details This function performs an element-wise addition of two tensors. In
* case the tensors do not have the same shape, one or both tensors
* will be broadcasted to match the shape of the other before the
* addition is performed. The formula for the operation is given by:
* \f[
* \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
* \f]
*
* @param ctx The CANN context used for operations.
* @param dst The ggml tensor representing the destination, result of the
* addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
*/
void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies the Leaky ReLU activation function to a tensor using the CANN
* backend.
*
* @details This function computes the Leaky ReLU activation for each element of
* the input tensor. The Leaky ReLU function allows a small gradient
* when the unit is not active (i.e., when the input is negative). The
* Leaky ReLU function is defined as:
* \f[
* \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
* src)
* \f]
* `negativeSlope` is in dst->params.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the result of the Leaky ReLU
* activation is stored, which op is `GGML_OP_LEAKY_RELU`
*/
void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Concatenates multiple tensors along a specified dimension using the
* CANN backend.
*
* @param ctx The CANN context used for operations.
* @param tensorList A pointer to the list of tensors to be concatenated.
* @param dst The destination tensor where the result of the
* concatenation is stored. dst->op is `GGML_OP_CONCAT`.
* @param concat_dim The dimension along which the tensors are concatenated.
*
* @attention The tensorList length should be 2, and the dimension used for concat
* defaults to 1.
*/
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Generates a sequence of evenly spaced values within a specified
* interval for a ggml tensor using the CANN backend.
*
* @details This function creates a sequence of numbers over a specified
* interval, starting from `start`, ending before `stop`, and
* incrementing by `step`. The sequence is stored in the destination
* tensor `dst`.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the generated sequence will be stored.
* `start`, 'stop' and 'step' are in dst->op_params and dst->op is
* `GGML_OP_ARANGE`.
*/
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the square of the elements of a ggml tensor using the CANN
* backend.
* @details The function sets the second source tensor of the destination
* tensor `dst` to be equal to the first source tensor. This is
* effectively squaring the elements since the multiplication becomes
* `element * element`.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the squared values will be stored
* which dst->op is `GGML_OP_SQR`.
*/
void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies a clamp operation to the elements of a ggml tensor using the
* CANN backend.
*
* @details This function clamps the elements of the input tensor `src` to a
* specified range defined by `min` and `max` values. The result is
* stored in the destination tensor `dst`. The operation is defined as:
* \f[
* y = \max(\min(x, max\_value), min\_value)
* \f]
* where `x` is an element of the input tensor, and `y` is the
* corresponding element in the output tensor.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the clamped values will be stored.
* dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
*/
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Scales the elements of a ggml tensor by a constant factor using the
* CANN backend.
*
* @details This function multiplies each element of the input tensor `src` by
* a scaling factor `scale`, storing the result in the destination
* tensor `dst`. The operation is defined as:
* \f[
* dst = src \times scale
* \f]
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the scaled values will be stored.
* dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
*/
void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Sorts the elements of a ggml tensor and returns the indices that
* would sort the tensor using the CANN backend.
*
* @details This function performs an argsort operation on the input tensor
* `src`. It sorts the elements of `src` in either ascending or
* descending order, depending on whether the order is `GGML_SORT_ORDER_DESC`,
* and returns the indices that would sort the original tensor.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the sorted indices will be stored.
* dst->op is `GGML_OP_ARGSORT`.
*/
void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the Layer Normalization for a ggml tensor using the CANN
* backend.
*
* @details This function applies the Layer Normalization operation on the
* input tensor `src` and stores the result in the destination tensor
* `dst`. Layer Normalization normalizes the features at each sample in
* a mini-batch independently. It is commonly used in neural networks
* to normalize the activations of a layer by adjusting and scaling
* the outputs.
* The operation is defined as:
* \f[
* \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
* \f]
* `Var` defaults to dst->ne[0]. `eps` is in dst->params.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the normalized values will be stored.
* @attention `Var` defaults to dst->ne[0].
*/
void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the Group Normalization for a ggml tensor using the CANN
* backend.
*
* @details This function applies the Group Normalization operation on the input
* tensor `src` and stores the result in the destination tensor `dst`.
* Group Normalization divides the channels into groups and normalizes
* the features within each group across spatial locations.
* It is commonly used in convolutional neural networks to improve
* training stability and performance.
* The operation is defined as:
* \f[
* \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
* \f]
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the normalized values will be stored.
* `n_groups` is in dst->params, which split C channel to `n_groups`.
* dst->op is `GGML_OP_GROUP_NORM`.
*
* @attention eps defaults to 1e-6f.
*/
void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the accumulation of tensors using the CANN backend.
*
* @details This function performs an accumulation operation on two tensors.
* Depending on the `inplace` flag, it either updates the destination
* tensor `dst` in place by adding `alpha * src1` to it, or it creates
* a new tensor as the result of `src0 + alpha * src1` and stores it in
* `dst`.
* The operation is defined as:
* \f[
* dst = src0 + alpha \times src1
* \f]
* if `inplace` is `true`, `src0` is equal to 'dst'.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the accumulated values will be stored.
* `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
*/
void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the sum of elements along the last dimension of a ggml tensor
* using the CANN backend.
*
* @details This function performs a reduction sum operation along the last
* dimension of the input tensor `src`. The result of the sum is stored
* in the destination tensor `dst`.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the reduced values will be stored
* dst->op is `GGML_OP_SUM_ROWS`.
*
* @attention `reduce_dims` defaults to 3, which means the last dimension.
*/
void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Upsamples a ggml tensor using nearest neighbor interpolation using
* the CANN backend.
*
* @details This function performs upsampling of the input tensor `src` using
* nearest neighbor interpolation. The upsampling is applied to the
* height and width dimensions (last two dimensions) of the tensor. The
* result is stored in the destination tensor `dst`, which must have
* the appropriate dimensions for the upsampled output.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the upsampled values will be stored.
* dst->op is `GGML_OP_UPSCALE`.
*/
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
ggml_tensor* dst);
/**
* @brief Pads a ggml tensor to match the dimensions of the destination tensor
* using the CANN backend.
*
* @details This function pads the input tensor `src` so that it matches the
* dimensions of the destination tensor `dst`. The amount of padding
* is calculated based on the difference in sizes between `src` and
* `dst` along each dimension. The padded tensor is stored in `dst`.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor, which specifies the target dimensions for
* padding. dst->op is `GGML_OP_PAD`.
*/
void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Executes a 2D pooling operation on a ggml tensor using the CANN
* backend.
*
* @details This function dispatches the execution of a 2D pooling operation on
* the input tensor `dst`. The type of pooling (average or max) is
* determined by the `op` parameter, which is read from the operation
* parameters of `dst`. The function supports average pooling
* (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
* invalid operation is encountered, the function asserts a failure.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor on which the pooling operation is to be
* performed. dst->op is `GGML_OP_POOL_2D`.
*/
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Duplicates a ggml tensor using the CANN backend.
*
* @details This function duplicates the contents of the source tensor `src` to
* the destination tensor `dst`. The function supports various tensor
* types and configurations, including handling of extra data, type
* conversions, and special cases for contiguous and non-contiguous
* tensors.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the duplicated data will be stored.
* dst->op is `GGML_OP_DUP`
*
* @attention Only FP16/FP32 are supported. Not supported when src and dst have
* different shapes and dst is non-contiguous.
* @note This function needs to be simplified.
*/
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor
* using the CANN backend.
*
* @details This function applies RMS normalization to the input tensor `src`
* and stores the result in the destination tensor `dst`. RMS
* normalization involves computing the root mean square of the input
* tensor along a specified dimension and then dividing each element of
* the tensor by this value, adjusted by a small epsilon value to
* prevent division by zero.
* The operation is defined as:
* \f[
* \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
* \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s}
* \f]
* `eps` is in dst->op_params.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the normalized values will be stored.
* dst->op is `GGML_OP_RMS_NORM`.
*/
void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies a diagonal mask to the tensor with a specified value.
*
* @details This function creates a mask tensor filled with ones, then applies
* an upper triangular and lower triangular operation to it based on
* the number of past elements specified. Afterward, it adds the masked
* tensor to the destination tensor in-place.
*
* @param ctx The backend CANN context used for operations.
* @param dst The destination tensor where the result will be stored. dst->op is
* `GGML_OP_DIAG_MASK`
* @param value The value to use for masking.
*/
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
/**
* @brief Performs an image-to-column transformation on the input tensor.
*
* @details This function takes an input tensor and applies an image-to-column
* operation, converting spatial dimensions into column-like
* structures suitable for convolutional operations. It supports both
* half-precision (F16) and single-precision (F32) floating-point data
* types.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor that stores the result of the operation.
* dst->op is `GGML_OP_IM2COL`.
*/
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes time step embeddings using sine and cosine functions.
*
* @details This function calculates time step embeddings by applying sine and
* cosine transformations to a given input tensor, which is typically
* used in temporal models like diffusion models or transformers to
* encode time information effectively.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the result of the embedding operation
* will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
*/
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
// @see ggml_cann_dup.
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the softmax activation with optional masking.
*
* @details This function computes the softmax activation over the input tensor,
* optionally applying a mask and scaling factor. It supports both FP16
* and FP32 data types and can handle masking by broadcasting the mask
* across rows if necessary.
* The function performs the following steps:
* 1. Multiplies the input tensor by a scale factor.
* 2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
* 3. Broadcasts the mask tensor if its dimensions do not match the
* input tensor's dimensions.
* 4. Adds the mask to the scaled input tensor.
* 5. Applies the softmax activation function along the specified
* dimension.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the result will be stored. dst->op is
* `GGML_OP_SOFTMAX`.
*/
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Extracts specific rows from a tensor based on indices.
*
* @details This function retrieves rows from a source tensor src0 according to
* the indices provided in another tensor src1 and stores the result in
* a destination tensor (\p dst). It supports different data types
* including F32, F16, Q4_0, and Q8_0.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the extracted rows will be stored.
* dst->op is `GGML_OP_GET_ROWS`.
*/
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Executes matrix multiplication for the given tensor.
*
* @details This function performs matrix multiplication on the source tensors
* associated with the destination tensor. It supports F32, F16,
* and Q8_0 matrix multiplication.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor for storing the result of the matrix
* multiplication. dst->op is `GGML_OP_MUL_MAT`.
*/
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
*
* @details This function implements the RoPE mechanism, which is a method to
* encode positional information into sequence data, particularly
* useful in transformer models. It supports both F32 and F16 data
* types.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the RoPE-transformed data will be
* stored. dst->op is `GGML_OP_ROPE`.
*
* @note The function currently does not support cases where n_dims is less
*       than the input tensor's first dimension.
* @note The function currently does not support cases where freq_factors is
*       not NULL.
* @note The function currently does not support cases where ext_factor is
*       not equal to 0.
* @note The function currently does not support cases where freq_scale is
*       not equal to 1.
*/
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
aclTensor*, uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src0 = dst->src[0];
ggml_tensor* src1 = dst->src[1];
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
aclTensor* acl_src0;
aclTensor* acl_src1;
aclTensor* acl_dst;
// Broadcasting is needed.
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
BCAST_SHAPE(src0, src1)
acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
} else {
acl_src0 = ggml_cann_create_tensor(src0);
acl_src1 = ggml_cann_create_tensor(src1);
acl_dst = ggml_cann_create_tensor(dst);
}
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
&executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src0));
ACL_CHECK(aclDestroyTensor(acl_src1));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
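// Illustrative instantiation of the template above (the aclnn entry points
// named here are assumptions, shown only to demonstrate how the template
// parameters line up with the aclnn two-phase GetWorkspaceSize/execute API):
//
//   ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);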
// Activation functions template.
template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
const aclrtStream)>
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
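// Illustrative instantiation (the aclnn names are assumptions, shown only to
// demonstrate the expected signatures):
//
//   ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(ctx, dst);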
// Activation functions template for const aclTensors.
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
const aclrtStream)>
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
#endif // CANN_ACLNN_OPS

282
ggml/src/ggml-cann/common.h Normal file
View File

@ -0,0 +1,282 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef CANN_COMMON_H
#define CANN_COMMON_H
#include <acl/acl.h>
#include <cstdio>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "../include/ggml-cann.h"
#include "../include/ggml.h"
#define MATRIX_ROW_PADDING 512
#define GGML_CANN_MAX_STREAMS 8
/**
* @brief Handles CANN-related errors by printing an error message and
* terminating the program.
* @param stmt The statement that caused the error.
* @param func The function in which the error occurred.
* @param file The file in which the error occurred.
* @param line The line number at which the error occurred.
* @param msg The error message.
*/
[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
const char* file, int line, const char* msg);
/**
* @brief Checks the result of a CANN function call and invokes the error
* handler if the call fails.
* @param stmt The CANN function call to check.
* @param success The success code that indicates the call was successful.
* @param error_fn The function to call to retrieve the error message.
*/
#define ACL_CHECK_GEN(stmt, success, error_fn) \
do { \
int err_code = (stmt); \
if (err_code != (success)) { \
ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \
} \
} while (0);
#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg)
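// Illustrative use of the macro (aclrtSetDevice is shown only as an example
// of an ACL call returning a status code):
//
//   ACL_CHECK(aclrtSetDevice(device));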
/**
* @brief Contains information about CANN devices.
*/
struct ggml_cann_device_info {
/**
* @brief Number of CANN devices available.
*/
int32_t device_count;
/**
* @brief Information about a single CANN device.
*/
struct cann_device_info {
int cc; /**< Compute capability. */
size_t smpb; /**< Maximum shared memory per block. */
bool vmm; /**< Virtual memory support. */
size_t vmm_granularity; /**< Granularity of virtual memory. */
size_t total_vram; /**< Total video RAM available on the device. */
};
cann_device_info devices[GGML_CANN_MAX_DEVICES] =
{}; /**< Array of CANN device information. */
};
const ggml_cann_device_info& ggml_cann_info();
void ggml_cann_set_device(int32_t device);
int32_t ggml_cann_get_device();
/**
* @brief Abstract base class for memory pools used by CANN.
*/
struct ggml_cann_pool {
/**
* @brief Virtual destructor for the memory pool.
*/
virtual ~ggml_cann_pool() = default;
/**
* @brief Allocates memory from the pool.
*
* @param size The size of the memory block to allocate.
* @param actual_size Pointer to a variable where the actual allocated size
* will be stored.
* @return Pointer to the allocated memory block.
*/
virtual void* alloc(size_t size, size_t* actual_size) = 0;
/**
* @brief Frees a previously allocated memory block.
*
* @param ptr Pointer to the memory block to free.
* @param size Size of the memory block to free.
* @note All CANN operators run asynchronously. Make sure the memory is
*       still available until this operator has finished.
*/
virtual void free(void* ptr, size_t size) = 0;
};
/**
* @brief RAII wrapper for managing memory allocations from a CANN memory pool.
*/
struct ggml_cann_pool_alloc {
ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
void* ptr = nullptr; /**< Pointer to the allocated memory block. */
size_t actual_size = 0; /**< Actual size of the allocated memory block. */
/**
* @brief Default constructor.
*/
ggml_cann_pool_alloc() = default;
/**
* @brief Constructor that initializes the memory pool.
* @param pool Reference to the memory pool.
*/
explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
/**
* @brief Constructor that initializes the memory pool and allocates memory.
* @param pool Reference to the memory pool.
* @param size Size of the memory block to allocate.
*/
ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
alloc(size);
}
/**
* @brief Destructor that frees the allocated memory block.
*/
~ggml_cann_pool_alloc() {
if (ptr != nullptr) {
pool->free(ptr, actual_size);
}
}
/**
* @brief Allocates memory from the pool.
* @param size Size of the memory block to allocate.
* @return Pointer to the allocated memory block.
*/
void* alloc(size_t size) {
GGML_ASSERT(pool != nullptr);
GGML_ASSERT(ptr == nullptr);
ptr = pool->alloc(size, &this->actual_size);
return ptr;
}
/**
* @brief Allocates memory from a specific memory pool.
* @param pool Reference to the memory pool.
* @param size Size of the memory block to allocate.
* @return Pointer to the allocated memory block.
*/
void* alloc(ggml_cann_pool& pool, size_t size) {
this->pool = &pool;
return alloc(size);
}
/**
* @brief Gets the pointer to the allocated memory block.
* @return Pointer to the allocated memory block.
*/
void* get() { return ptr; }
// Deleted copy constructor
ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
// Deleted move constructor
ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
// Deleted copy assignment operator
ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
// Deleted move assignment operator
ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
};
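// Illustrative RAII usage, mirroring how the operator implementations
// allocate scratch workspaces (a sketch; `ctx` is assumed to be a
// ggml_backend_cann_context):
//
//   ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
//   void* workspaceAddr = workspace_allocator.get();
//   // the block is returned to the pool when workspace_allocator goes out
//   // of scope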
/**
* @brief Context for managing CANN backend operations.
*/
struct ggml_backend_cann_context {
int32_t device; /**< Device ID. */
std::string name; /**< Name of the device. */
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
{nullptr}}; /**< Array of streams for the device. */
/**
* @brief Constructor for initializing the context with a given device.
* @param device Device ID.
*/
explicit ggml_backend_cann_context(int device)
: device(device), name("CANN" + std::to_string(device)) {}
/**
* @brief Destructor for cleaning up resources.
*/
~ggml_backend_cann_context() {
if (copy_event != nullptr) {
ACL_CHECK(aclrtDestroyEvent(copy_event));
}
for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
if (streams[i] != nullptr) {
ACL_CHECK(aclrtDestroyStream(streams[i]));
}
}
}
/**
* @brief Get or create a stream for a given index.
* @param stream Index of the stream.
* @return The stream corresponding to the given index.
*/
aclrtStream stream(int stream) {
if (streams[stream] == nullptr) {
ggml_cann_set_device(device);
ACL_CHECK(aclrtCreateStream(&streams[stream]));
}
return streams[stream];
}
/**
* @brief Get or create the default stream (index 0).
* @return The default stream.
*/
aclrtStream stream() { return stream(0); }
// TODO: each stream should have a memory pool.
std::unique_ptr<ggml_cann_pool>
mem_pool; /**< Memory pool for the device. */
/**
* @brief Create a new memory pool for a given device.
* @param device Device ID.
* @return A unique pointer to the new memory pool.
*/
static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device);
/**
* @brief Get or create the memory pool for the context.
* @return Reference to the memory pool.
*/
ggml_cann_pool& pool() {
if (mem_pool == nullptr) {
mem_pool = new_pool_for_device(device);
}
return *mem_pool;
}
};
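// Illustrative sketch of how the context is used (both helpers lazily create
// their resources on first use, as implemented above):
//
//   ggml_backend_cann_context ctx(0);   // device 0
//   aclrtStream s = ctx.stream();       // creates stream 0 on demand
//   ggml_cann_pool& pool = ctx.pool();  // creates the memory pool on demand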
#endif // CANN_COMMON_H

View File

@ -0,0 +1,32 @@
if (NOT SOC_TYPE)
set (SOC_TYPE "Ascend910B3")
endif()
file(GLOB SRC_FILES
get_row_f32.cpp
get_row_f16.cpp
get_row_q4_0.cpp
get_row_q8_0.cpp
quantize_f32_q8_0.cpp
quantize_f16_q8_0.cpp
dup.cpp
)
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
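# Illustrative configure step for this kernel library (everything except
# SOC_TYPE and CANN_INSTALL_DIR is an assumption about the surrounding build):
#   cmake -B build -DGGML_CANN=on -DSOC_TYPE=Ascend910B3 \
#         -DCANN_INSTALL_DIR=/usr/local/Ascend/ascend-toolkit/latest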

View File

@ -0,0 +1,17 @@
#ifndef ASCENDC_KERNELS_H
#define ASCENDC_KERNELS_H
#include "aclrtlaunch_ascendc_get_row_f32.h"
#include "aclrtlaunch_ascendc_get_row_f16.h"
#include "aclrtlaunch_ascendc_get_row_q8_0.h"
#include "aclrtlaunch_ascendc_get_row_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
#endif // ASCENDC_KERNELS_H

View File

@ -0,0 +1,223 @@
#include "kernel_operator.h"
#include <cmath>
using namespace AscendC;
#define BUFFER_NUM 2
template <typename SRC_T, typename DST_T>
class DupByRows {
public:
__aicore__ inline DupByRows() {}
__aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
size_t *input_nb_ub) {
/* Duplicate by rows when src is contiguous in its first dimension and
dst is contiguous; each kernel processes one row.
*/
// Input has four dims.
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
// param
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
num_elem = input_ne_ub[0];
// index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
/ (input_ne_ub[1]);
idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
- idx_ne2 * input_ne_ub[1];
// src may not be contiguous in dims [1,2,3], so the stride is decided by ne & nb
src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
+ input_nb_ub[1] * idx_ne1;
// dst is contiguous
dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
src_stride));
dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
dst_stride));
pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
32 - 1) / 32 * 32);
pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
32 - 1) / 32 * 32);
}
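// Worked example of the index decomposition above (illustrative): with
// ne = {n, 4, 3, 2} there are 4*3*2 = 24 rows; block index 17 maps to
// idx_ne3 = 17 / 12 = 1, idx_ne2 = (17 - 12) / 4 = 1 and
// idx_ne1 = 17 - 12 - 4 = 1, i.e. row (1, 1, 1).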
__aicore__ inline void copy_in() {
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
DataCopyPadExtParams<SRC_T> padParams;
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
src_queue.EnQue(src_local);
}
__aicore__ inline void copy_out() {
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
DataCopyPad(dst_gm, dst_local, dataCopyParams);
dst_queue.FreeTensor(dst_local);
}
__aicore__ inline void dup() {
// main process: copy one row of data from src to dst.
copy_in();
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
int32_t BLOCK_NUM = 32 / sizeof(DST_T);
DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
/ BLOCK_NUM * BLOCK_NUM);
dst_queue.EnQue<DST_T>(dst_local);
src_queue.FreeTensor(src_local);
copy_out();
}
__aicore__ inline void dup_with_cast() {
// main process: copy one row of data from src to dst,
// casting the dtype from SRC_T to DST_T.
copy_in();
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
dst_queue.EnQue<DST_T>(dst_local);
src_queue.FreeTensor(src_local);
copy_out();
}
private:
TPipe pipe;
GlobalTensor<SRC_T> src_gm;
GlobalTensor<DST_T> dst_gm;
int64_t num_rows;
int64_t num_elem;
int64_t idx_ne3;
int64_t idx_ne2;
int64_t idx_ne1;
int64_t src_stride;
int64_t dst_stride;
TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<half, half> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup();
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<float_t, float_t> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup();
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<float_t, half> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
// copy params from gm to ub.
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<half, float_t> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
}

View File

@ -0,0 +1,186 @@
#include "kernel_operator.h"
// optimize me: use a template to avoid duplicated code.
using namespace AscendC;
#define BUFFER_NUM 2
class GET_ROW_F16 {
public:
__aicore__ inline GET_ROW_F16() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) {
// TODO: use a template for F16/F32.
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// Indices has two dims. n_elements = total number of rows to get.
// dr = number of rows this block should get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
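// Worked example (illustrative): with n_elements = 10 rows and
// op_block_num = 4 blocks, dr = 2 and tails = 2, so blocks 0..3 process
// the row ranges [0,3), [3,6), [6,8) and [8,10).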
input_gm.SetGlobalBuffer((__gm__ half *)input);
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
& ~31);
uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
& ~31);
local_buffer_elems = input_local_buffer_size / sizeof(half);
// TODO: consider long rows that cannot fit in the UB.
// All data should be 32-byte aligned; this is fine because all data is
// already aligned to 32.
pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
}
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(half);
DataCopyPadExtParams<half> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
}
input_queue.EnQue(input_local);
}
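// e.g. for len = 70 the aligned DataCopy above moves 64 elements and the
// remaining 6 go through the padded DataCopyPad path.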
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
}
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_row(int64_t idx) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3];
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3];
copy_in(input_offset, input_ne[0]);
LocalTensor<half> input_local = input_queue.DeQue<half>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
Cast(output_local, input_local, RoundMode::CAST_NONE,
local_buffer_elems);
output_queue.EnQue(output_local);
copy_out(output_offset, input_ne[0]);
input_queue.FreeTensor(input_local);
}
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
calculate_row(i);
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
size_t local_buffer_elems;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<half> input_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_f16(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_F16 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}

View File

@ -0,0 +1,180 @@
#include "kernel_operator.h"
// optimize me: use a template to avoid duplicated code.
using namespace AscendC;
#define BUFFER_NUM 2
class GET_ROW_F32 {
public:
__aicore__ inline GET_ROW_F32() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// Indices has two dims. n_elements = total number of rows to get.
// dr = number of rows this block should get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
input_gm.SetGlobalBuffer((__gm__ float *)input);
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
local_buffer_elems = local_buffer_size / sizeof(float);
// TODO: consider long rows that cannot fit in the UB.
// All data should be 32-byte aligned; this is fine because all data is
// already aligned to 32.
pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
}
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPadExtParams<float> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
}
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
}
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_row(int64_t idx) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3];
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3];
copy_in(input_offset, input_ne[0]);
LocalTensor<float> input_local = input_queue.DeQue<float>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
DataCopy(output_local, input_local, local_buffer_elems);
output_queue.EnQue(output_local);
copy_out(output_offset, input_ne[0]);
input_queue.FreeTensor(input_local);
}
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
calculate_row(i);
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
size_t local_buffer_elems;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<float> input_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_f32(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_F32 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}

View File

@ -0,0 +1,193 @@
#include "kernel_operator.h"
// optimize me: use a template to avoid duplicated code.
using namespace AscendC;
#define BUFFER_NUM 2
#define QK4_0 32
class GET_ROW_Q4_0 {
public:
__aicore__ inline GET_ROW_Q4_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, int64_t *indices_ne_ub,
size_t *indices_nb_ub, int64_t *output_ne_ub,
size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
scale_ne[i] = input_ne_ub[i];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// one scale per group.
scale_ne[0] /= QK4_0;
input_stride[0] = 1;
scale_stride[0] = 1;
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
group_size_in_row = input_ne[0] / QK4_0;
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
input_ne[3] / 2;
// Indices has two dims. n_elements = total number of rows to get.
// dr = number of rows this block should get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
// 32 * sizeof(int4b_t) = 16 bytes, which is not 32-byte aligned; why is there no error?
DataCopy(input_local, input_gm[offset], QK4_0);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
DataCopy(output_gm[offset], output_local, QK4_0);
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3] +
group * QK4_0;
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
indices_ne1_idx * scale_stride[2] +
indices_ne2_idx * scale_stride[3] + group;
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3] +
group * QK4_0;
copy_in(input_offset);
LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
// TODO: cast more data to speed up.
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
// Only the multiplication needs to be computed per group.
half scale = scale_gm.GetValue(scale_offset);
Muls(output_local, output_local, (float)scale, QK4_0);
input_queue.FreeTensor(input_local);
cast_queue.FreeTensor(cast_local);
output_queue.EnQue(output_local);
copy_out(output_offset);
}
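// Illustrative recap of the dequantization above: for each group of
// QK4_0 = 32 4-bit values, y_i = scale * x_i, where the per-group half
// scale lives after the packed 4-bit data (scale_offset = total_elements / 2
// bytes from the start of the input buffer).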
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
calculate_group(i, j);
}
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t scale_ne[4];
size_t scale_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t ir;
int64_t dr;
int64_t group_size_in_row;
TPipe pipe;
GlobalTensor<int4b_t> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_Q4_0 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}

View File

@ -0,0 +1,191 @@
#include "kernel_operator.h"
// optimize me: use a template to avoid duplicated code.
using namespace AscendC;
#define BUFFER_NUM 2
#define QK8_0 32
class GET_ROW_Q8_0 {
public:
__aicore__ inline GET_ROW_Q8_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, int64_t *indices_ne_ub,
size_t *indices_nb_ub, int64_t *output_ne_ub,
size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
scale_ne[i] = input_ne_ub[i];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// one scale per group.
scale_ne[0] /= QK8_0;
input_stride[0] = 1;
scale_stride[0] = 1;
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
group_size_in_row = input_ne[0] / QK8_0;
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
input_ne[3] * sizeof(int8_t);
// Indices has two dims. n_elements = total number of rows to get.
// dr = number of rows this block should get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
DataCopy(input_local, input_gm[offset], QK8_0);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
DataCopy(output_gm[offset], output_local, QK8_0);
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3] +
group * QK8_0;
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
indices_ne1_idx * scale_stride[2] +
indices_ne2_idx * scale_stride[3] + group;
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3] +
group * QK8_0;
copy_in(input_offset);
LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
// TODO: cast more data to speed up.
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
// Only the multiplication needs to be computed per group.
half scale = scale_gm.GetValue(scale_offset);
Muls(output_local, output_local, (float)scale, QK8_0);
input_queue.FreeTensor(input_local);
cast_queue.FreeTensor(cast_local);
output_queue.EnQue(output_local);
copy_out(output_offset);
}
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
calculate_group(i, j);
}
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t scale_ne[4];
size_t scale_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t ir;
int64_t dr;
int64_t group_size_in_row;
TPipe pipe;
GlobalTensor<int8_t> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_Q8_0 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}

View File

@ -0,0 +1,208 @@
#include "kernel_operator.h"
using namespace AscendC;
#define BUFFER_NUM 2
#define QK8_0 32
class QUANTIZE_F16_Q8_0 {
public:
__aicore__ inline QUANTIZE_F16_Q8_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *output_ne_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
output_ne[i] = output_ne_ub[i];
}
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
}
scale_ne = input_ne;
scale_stride[0] = 1;
scale_stride[1] = input_ne[0] / QK8_0;
for (int i = 2; i < 4; i++) {
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
// split input tensor by rows.
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
dr = nr / op_block_num;
uint64_t tails = nr % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
group_size_in_row = scale_stride[1];
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
output_ne[3] * sizeof(uint8_t);
input_gm.SetGlobalBuffer((__gm__ half *)input);
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
group_size_in_row *
sizeof(half)));
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
pipe.InitBuffer(work_queue, 1, 32);
pipe.InitBuffer(max_queue, 1, 32);
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
pipe.InitBuffer(scale_queue, 1, 32);
pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(float));
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
DataCopy(input_local, input_gm[offset], QK8_0);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
DataCopy(output_gm[offset], output_local, QK8_0);
output_queue.FreeTensor(output_local);
}
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
const int64_t i1 =
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
const int64_t input_offset = i1 * input_stride[1] +
i2 * input_stride[2] +
i3 * input_stride[3] + QK8_0 * group;
const int64_t output_offset = i1 * output_stride[1] +
i2 * output_stride[2] +
i3 * output_stride[3] + QK8_0 * group;
copy_in(input_offset);
LocalTensor<half> input_local = input_queue.DeQue<half>();
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
Abs(abs_local, cast_local, QK8_0);
ReduceMax(max_local, abs_local, work_local, QK8_0);
pipe_barrier(PIPE_ALL);
float d = max_local.GetValue(0);
d = d / ((1 << 7) - 1);
if (d != 0) {
Muls(cast_local, cast_local, 1.0f / d, QK8_0);
}
Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
output_queue.EnQue(output_local);
copy_out(output_offset);
input_queue.FreeTensor(input_local);
work_queue.FreeTensor(work_local);
abs_queue.FreeTensor(abs_local);
max_queue.FreeTensor(max_local);
cast_queue.FreeTensor(cast_local);
return (half)d;
}
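// Illustrative recap of the q8_0 math above: for each group of QK8_0 = 32
// values, d = max(|x_i|) / 127 and q_i = round(x_i / d), so a group is
// stored as 32 int8 values plus one half-precision scale d.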
__aicore__ inline void calculate() {
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
uint32_t scale_local_offset = 0;
uint32_t scale_global_offset = 0;
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
half scale = calculate_group(i, j);
scale_local.SetValue(scale_local_offset++, scale);
if (scale_local_offset == 16) {
scale_local_offset = 0;
// TODO: OPTIMIZE ME
pipe_barrier(PIPE_ALL);
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
pipe_barrier(PIPE_ALL);
scale_global_offset += 16;
}
}
}
if (scale_local_offset != 0) {
pipe_barrier(PIPE_ALL);
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
DataCopyPad(scale_gm[scale_global_offset], scale_local,
dataCopyParams);
pipe_barrier(PIPE_ALL);
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t *scale_ne;
size_t scale_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t group_size_in_row;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<half> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int8_t> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, 1> work_queue;
TQue<QuePosition::VECOUT, 1> max_queue;
TQue<QuePosition::VECIN, 1> abs_queue;
TQue<QuePosition::VECOUT, 1> scale_queue;
TQue<QuePosition::VECOUT, 1> cast_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
QUANTIZE_F16_Q8_0 op;
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}

View File

@ -0,0 +1,206 @@
#include "kernel_operator.h"
using namespace AscendC;
#define BUFFER_NUM 2
#define QK8_0 32
class QUANTIZE_F32_Q8_0 {
public:
__aicore__ inline QUANTIZE_F32_Q8_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *output_ne_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
output_ne[i] = output_ne_ub[i];
}
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
}
scale_ne = input_ne;
scale_stride[0] = 1;
scale_stride[1] = input_ne[0] / QK8_0;
for (int i = 2; i < 4; i++) {
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
// split input tensor by rows.
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
dr = nr / op_block_num;
uint64_t tails = nr % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
group_size_in_row = scale_stride[1];
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
output_ne[3] * sizeof(uint8_t);
input_gm.SetGlobalBuffer((__gm__ float *)input);
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
ir * group_size_in_row *
sizeof(half)));
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
pipe.InitBuffer(work_queue, 1, 32);
pipe.InitBuffer(max_queue, 1, 32);
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
pipe.InitBuffer(scale_queue, 1, 32);
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
DataCopy(input_local, input_gm[offset], QK8_0);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
DataCopy(output_gm[offset], output_local, QK8_0);
output_queue.FreeTensor(output_local);
}
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
const int64_t i1 =
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
const int64_t input_offset = i1 * input_stride[1] +
i2 * input_stride[2] +
i3 * input_stride[3] + QK8_0 * group;
const int64_t output_offset = i1 * output_stride[1] +
i2 * output_stride[2] +
i3 * output_stride[3] + QK8_0 * group;
copy_in(input_offset);
LocalTensor<float> input_local = input_queue.DeQue<float>();
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
Abs(abs_local, input_local, QK8_0);
ReduceMax(max_local, abs_local, work_local, QK8_0);
pipe_barrier(PIPE_ALL);
float d = max_local.GetValue(0);
d = d / ((1 << 7) - 1);
if (d != 0) {
Muls(input_local, input_local, 1.0f / d, QK8_0);
}
Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
output_queue.EnQue(output_local);
copy_out(output_offset);
input_queue.FreeTensor(input_local);
work_queue.FreeTensor(work_local);
abs_queue.FreeTensor(abs_local);
max_queue.FreeTensor(max_local);
cast_queue.FreeTensor(cast_local);
return (half)d;
}
__aicore__ inline void calculate() {
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
uint32_t scale_local_offset = 0;
uint32_t scale_global_offset = 0;
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
half scale = calculate_group(i, j);
scale_local.SetValue(scale_local_offset++, scale);
if (scale_local_offset == 16) {
scale_local_offset = 0;
// TODO: OPTIMIZE ME
pipe_barrier(PIPE_ALL);
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
pipe_barrier(PIPE_ALL);
scale_global_offset += 16;
}
}
}
if (scale_local_offset != 0) {
pipe_barrier(PIPE_ALL);
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
DataCopyPad(scale_gm[scale_global_offset], scale_local,
dataCopyParams);
pipe_barrier(PIPE_ALL);
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t *scale_ne;
size_t scale_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t group_size_in_row;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<float> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int8_t> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, 1> work_queue;
TQue<QuePosition::VECOUT, 1> max_queue;
TQue<QuePosition::VECIN, 1> abs_queue;
TQue<QuePosition::VECIN, 1> cast_queue;
TQue<QuePosition::VECOUT, 1> scale_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
QUANTIZE_F32_Q8_0 op;
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}

View File

@@ -464,12 +464,12 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
         return;
     }
-    if (ggml_is_quantized(tensor->type)) {
+    if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
         size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-        if (padded_size > original_size && tensor->view_src == nullptr) {
+        if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
             CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
@@ -1485,6 +1485,13 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
         }
+        // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
+        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+            const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
+            const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
+        }
         if (src1_on_device && src1_is_contiguous) {
             dev[id].src1_ddf = (float *) src1->data;
         } else {

View File

@@ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);

View File

@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q6_K:
             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
             break;
@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             mmq_supported = true;

File diff suppressed because it is too large

View File

@@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
 TYPES_MMQ = [
     "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
     "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
 ]
 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.

View File

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ1_S);

View File

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_S);

View File

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);

View File

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);

View File

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ3_S);

View File

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);

View File

@@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 }
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+    float sumf = 0.0f;
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+    return d8_1*sumf;
+}
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4
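
The new vec_dot_q8_0_16_q8_1_impl above splits the vdr packed ints into sub-blocks of QI8_0/2 ints (16 int8 values each), reduces every sub-block with ggml_cuda_dp4a, scales each partial sum by its own d8_0[] factor, and finally applies the common q8_1 scale d8_1. A scalar equivalent for reference (illustrative only, not part of this commit; assumes ggml's Q8_0 layout where QI8_0 is 8 packed 32-bit ints):

#include <cstdint>

// Scalar reference of the 16-value-per-scale q8_0 x q8_1 dot product.
static float vec_dot_q8_0_16_q8_1_ref(const int8_t * v, const int8_t * u,
                                      const float * d8_0, float d8_1, int vdr) {
    const int QI8_0 = 8;                               // ints per Q8_0 block (32 bytes / 4)
    float sumf = 0.0f;
    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
        int sumi = 0;
        for (int j = 0; j < (QI8_0/2) * 4; ++j) {      // 16 int8 products per sub-block
            sumi += (int) v[i0*4 + j] * (int) u[i0*4 + j];
        }
        sumf += d8_0[i0 / (QI8_0/2)] * sumi;           // per-sub-block scale
    }
    return d8_1 * sumf;                                // common q8_1 scale
}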

View File

@@ -1786,10 +1786,6 @@ static enum ggml_status ggml_metal_graph_compute(
                     }
                 };
-                if (ggml_is_quantized(src0t)) {
-                    GGML_ASSERT(ne00 >= nth0*nth1);
-                }
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];

View File

@@ -4757,7 +4757,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         device const float4 * y4 = (device const float4 *)yb;
         yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
-        for (int row = 0; row < 2; ++row) {
+        for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
             device const block_iq4_nl & xb = x[row*nb + ib];
             device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
@@ -4789,7 +4789,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         yb += 16 * QK4_NL;
     }
-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;

File diff suppressed because it is too large

View File

@@ -152,7 +152,8 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
-    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
+    const size_t n_val_tmp = nth / WARP_SIZE;
+    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + n_val_tmp);
     const uint32_t n_head_kv = nrows_x/nrows_y;
     const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));

View File

@@ -38,8 +38,6 @@
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
-#define VK_NUM_TYPES 16
 #define GGML_VK_MAX_NODES 8192
 #define MAX_VK_BUFFERS 256
@@ -162,23 +160,23 @@ struct vk_device_struct {
     vk_matmul_pipeline pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
-    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];
+    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
     vk_matmul_pipeline pipeline_matmul_id_f32;
     vk_matmul_pipeline pipeline_matmul_id_f16;
     vk_matmul_pipeline pipeline_matmul_id_f16_f32;
-    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];
+    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
     vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
     vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
-    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
+    vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
     vk_pipeline pipeline_mul_f32;
     vk_pipeline pipeline_div_f32;
     vk_pipeline pipeline_add_f32;
@@ -1059,25 +1057,6 @@ static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& event
     );
 }
-static bool ggml_vk_build_shader(ggml_type type) {
-    switch(type) {
-    case GGML_TYPE_F16:
-    case GGML_TYPE_Q4_0:
-    case GGML_TYPE_Q4_1:
-    case GGML_TYPE_Q5_0:
-    case GGML_TYPE_Q5_1:
-    case GGML_TYPE_Q8_0:
-    case GGML_TYPE_Q2_K:
-    case GGML_TYPE_Q3_K:
-    case GGML_TYPE_Q4_K:
-    case GGML_TYPE_Q5_K:
-    case GGML_TYPE_Q6_K:
-        return true;
-    default:
-        return false;
-    }
-}
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
@ -1112,6 +1091,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
@ -1126,6 +1106,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
if (device->fp16) { if (device->fp16) {
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
@ -1226,6 +1207,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@ -1316,6 +1304,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
} else { } else {
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@ -1415,6 +1410,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@ -1505,6 +1507,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
} }
// mul mat vec // mul mat vec
@ -1520,6 +1529,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@ -1533,6 +1543,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@ -1546,6 +1557,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
// dequant shaders // dequant shaders
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@ -1559,6 +1571,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
// get_rows // get_rows
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@ -1568,6 +1581,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@ -1576,6 +1590,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
@ -2087,6 +2102,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2123,6 +2139,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2148,6 +2165,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2181,6 +2199,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2206,6 +2225,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@@ -3431,7 +3451,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t nei0 = ids->ne[0];
     const uint64_t nei1 = ids->ne[1];
-    GGML_ASSERT(nei0 * nei1 <= 2048);
+    GGML_ASSERT(nei0 * nei1 <= 3072);
     const uint32_t nbi1 = ids->nb[1];
     const uint32_t nbi2 = ids->nb[2];
@@ -3443,8 +3463,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t n_as = ne02;
-    GGML_ASSERT(n_as <= 8);
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
@@ -4623,22 +4641,22 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         }
     }
-    ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
+    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
     if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
         if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
             // Resize buffer
             if (ctx->prealloc_split_k != nullptr) {
                 ggml_vk_destroy_buffer(ctx->prealloc_split_k);
             }
-            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
     }
-    vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
     X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
     Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
@@ -4665,12 +4683,12 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         }
     }
-    ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
+    ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
-    ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
+    ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
     vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     for (size_t i = 0; i < num_it; i++) {
-        ggml_vk_ctx_begin(ctx, subctx);
+        ggml_vk_ctx_begin(ctx->device, subctx);
         ggml_vk_matmul(
             ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
             m, n, k,
@@ -4689,7 +4707,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
     // copy dst to host
-    ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne);
+    ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
     float * d_chk = (float *) malloc(sizeof(float) * d_ne);
@@ -4765,7 +4783,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (split_k > 1) {
         float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-        ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+        ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
         std::cerr << "d_buf0: " << std::endl << std::endl;
         ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4785,8 +4803,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     free(d_chk);
-    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
-    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
     ggml_vk_destroy_buffer(d_X);
     ggml_vk_destroy_buffer(d_Y);
@ -4834,90 +4852,23 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
} }
} }
static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
// Check transfers are correct
vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
float * x;
float * y;
if (pinned) {
x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
} else {
x = (float *) malloc(sizeof(float) * ne);
y = (float *) malloc(sizeof(float) * ne);
}
for (size_t i = 0; i < ne; i++) {
x[i] = rand() / (float)RAND_MAX;
}
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ggml_vk_ctx_begin(ctx, subctx);
auto begin = std::chrono::high_resolution_clock::now();
ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
subctx->in_memcpys.clear();
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
ctx->device->device.resetFences({ ctx->fence });
auto end = std::chrono::high_resolution_clock::now();
double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
ggml_vk_ctx_begin(ctx, subctx);
begin = std::chrono::high_resolution_clock::now();
ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
ctx->device->device.resetFences({ ctx->fence });
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
subctx->out_memcpys.clear();
end = std::chrono::high_resolution_clock::now();
double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
double avg_err = 0.0;
for (size_t i = 0; i < ne; i++) {
avg_err += std::fabs(x[i] - y[i]);
}
double kb = ne * sizeof(float) / 1024.0;
std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
ggml_vk_destroy_buffer(buffer);
if (pinned) {
ggml_vk_host_free(ctx, x);
ggml_vk_host_free(ctx, y);
} else {
free(x);
free(y);
}
}
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
} }
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
if (quant == GGML_TYPE_F32) {
memcpy(to, from, sizeof(float) * ne);
return;
}
ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
ggml_to_float_t dequant_fn = tt.to_float;
dequant_fn(from, to, ne);
}
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")"); VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
const size_t x_sz = sizeof(float) * ne; const size_t x_sz = sizeof(float) * ne;
@@ -4925,24 +4876,26 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
     float * x = (float *) malloc(x_sz);
     void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    float * x_ref = (float *) malloc(x_sz);
     ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
     for (size_t i = 0; i < ne; i++) {
         x[i] = rand() / (float)RAND_MAX;
     }
-    vk_pipeline p = ctx->device->pipeline_dequant[quant];
+    vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
     ggml_vk_quantize_data(x, qx, ne, quant);
+    ggml_vk_dequantize_data(qx, x_ref, ne, quant);
-    ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
-    ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
+    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
     vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
-    ggml_vk_ctx_begin(ctx, subctx);
+    ggml_vk_ctx_begin(ctx->device, subctx);
     const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
     ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
     ggml_vk_ctx_end(subctx);
@@ -4956,13 +4909,13 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     auto end = std::chrono::high_resolution_clock::now();
     double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-    ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);
+    ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
     int first_err = -1;
     double avg_err = 0.0;
     for (size_t i = 0; i < ne; i++) {
-        double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
+        double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
         avg_err += error;
         if (first_err < 0 && error > 0.05) {
@@ -4982,7 +4935,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
         }
         std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
         for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
-            std::cerr << x[i] << ", ";
+            std::cerr << x_ref[i] << ", ";
         }
         std::cerr << std::endl;
     }
@@ -4992,6 +4945,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     free(x);
     free(qx);
+    free(x_ref);
     free(x_chk);
 }
@@ -5040,9 +4994,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     float * x = (float *) malloc(x_sz);
     float * y = (float *) malloc(y_sz);
     void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
float * d = (float *) malloc(d_sz); float * d = (float *) malloc(d_sz);
float * d_chk = (float *) malloc(d_sz); float * d_chk = (float *) malloc(d_sz);
@ -5057,25 +5011,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
y[i] = (i % k == i / k) ? 1.0f : 0.0f; y[i] = (i % k == i / k) ? 1.0f : 0.0f;
} }
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it); ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
if (split_k > 1) { if (split_k > 1) {
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
// Resize buffer // Resize buffer
if (ctx->prealloc_split_k != nullptr) { if (ctx->prealloc_split_k != nullptr) {
ggml_vk_destroy_buffer(ctx->prealloc_split_k); ggml_vk_destroy_buffer(ctx->prealloc_split_k);
} }
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
} }
} }
ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz);
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
for (size_t i = 0; i < num_it; i++) { for (size_t i = 0; i < num_it; i++) {
ggml_vk_ctx_begin(ctx, subctx); ggml_vk_ctx_begin(ctx->device, subctx);
ggml_vk_matmul( ggml_vk_matmul(
ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
m, n, k, m, n, k,
@ -5094,7 +5048,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz); ggml_vk_buffer_read(d_buf, 0, d, d_sz);
ggml_init_params iparams = { ggml_init_params iparams = {
/*.mem_size =*/ 1024*1024*1024, /*.mem_size =*/ 1024*1024*1024,
@ -5149,7 +5103,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
if (split_k > 1) { if (split_k > 1) {
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
std::cerr << "d_buf0: " << std::endl << std::endl; std::cerr << "d_buf0: " << std::endl << std::endl;
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@ -5302,12 +5256,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
#if defined(GGML_VULKAN_RUN_TESTS) #if defined(GGML_VULKAN_RUN_TESTS)
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
@ -5319,85 +5270,90 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);
ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2); ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0); ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1); ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2); ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0); // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1); // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2); // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
std::cerr << std::endl; std::cerr << std::endl;
@ -5429,9 +5385,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
std::cerr << std::endl; std::cerr << std::endl;
} }
@ -6263,6 +6219,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return false; return false;
@ -6291,6 +6248,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true; return true;
default: default:
return false; return false;


@ -3341,7 +3341,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
} }
// check if t1 can be represented as a repeatition of t0 // check if t1 can be represented as a repeatition of t0
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return ggml_is_empty(t0) ? ggml_is_empty(t1) : return ggml_is_empty(t0) ? ggml_is_empty(t1) :
@ -13687,6 +13687,7 @@ static void ggml_compute_forward_soft_max(
} }
} }
// ggml_compute_forward_soft_max_back // ggml_compute_forward_soft_max_back
static void ggml_compute_forward_soft_max_back_f32( static void ggml_compute_forward_soft_max_back_f32(
@ -18925,7 +18926,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
FILE * fout = ggml_fopen(fname, "wb"); FILE * fout = ggml_fopen(fname, "wb");
if (!fout) { if (!fout) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname); fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
return; return;
} }
@ -19062,7 +19063,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
{ {
FILE * fin = ggml_fopen(fname, "rb"); FILE * fin = ggml_fopen(fname, "rb");
if (!fin) { if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname); fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
return result; return result;
} }
@ -20736,6 +20737,7 @@ struct gguf_context * gguf_init_empty(void) {
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb"); FILE * file = ggml_fopen(fname, "rb");
if (!file) { if (!file) {
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
return NULL; return NULL;
} }
@ -20920,7 +20922,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
gguf_tensor_info_sanitize(info); gguf_tensor_info_sanitize(info);
// make sure there is no duplicated tensor names // make sure there is no duplicated tensor names
for (uint64_t j = 0; j < i; ++j) { for (uint64_t j = 0; j < i && ok; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) { if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data); fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false; ok = false;
@ -21902,6 +21904,14 @@ int ggml_cpu_has_rpc(void) {
#endif #endif
} }
int ggml_cpu_has_cann(void) {
#if defined(GGML_USE_CANN)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) { int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
} }


@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d; return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
} }
#endif #endif
#if defined(DATA_A_IQ4_NL)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
}
#endif


@ -0,0 +1,30 @@
#version 450
#include "dequant_head.comp"
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() {
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
const uint tid = gl_LocalInvocationID.x % 64;
const uint il = tid/32;
const uint ir = tid%32;
const uint ib = 32*i + ir;
if (ib >= p.nel / 32) {
return;
}
const uint q_idx = 8*il;
const uint b_idx = 1024*i + 32*ir + q_idx;
const float d = float(data_a[ib].d);
[[unroll]] for (uint l = 0; l < 8; ++l) {
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
}
}
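
For reference, a minimal Python sketch of the IQ4_NL dequantization the two shader snippets above implement (block size 32, an fp16 scale plus 16 bytes of packed 4-bit codebook indices; the function name is illustrative and not part of the codebase):

```python
# Non-linear codebook, mirroring kvalues_iq4nl in the shader headers.
KVALUES_IQ4NL = [-127, -104, -83, -65, -49, -35, -22, -10,
                 1, 13, 25, 38, 53, 69, 89, 113]

def dequantize_iq4_nl_block(d: float, qs: bytes) -> list[float]:
    """Dequantize one IQ4_NL block: `d` is the scale, `qs` packs 32 4-bit indices
    into 16 bytes. Low nibbles fill elements 0..15 and high nibbles 16..31,
    matching the `l + 0` / `l + 16` stores in the compute shader."""
    lo = [d * KVALUES_IQ4NL[b & 0x0F] for b in qs]
    hi = [d * KVALUES_IQ4NL[b >> 4] for b in qs]
    return lo + hi
```
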


@ -18,15 +18,13 @@ void main() {
return; return;
} }
const uint b_idx = 1024*i + 32*ir + 8*il; const uint q_idx = 8*il;
const uint b_idx = 1024*i + 32*ir + q_idx;
const float d = float(data_a[ib].d); const float d = float(data_a[ib].d);
const float dm = -8.0f * d;
const uint q_idx = 8*il;
[[unroll]] for (uint l = 0; l < 8; ++l) { [[unroll]] for (uint l = 0; l < 8; ++l) {
data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm); data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + dm); data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f));
} }
} }


@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
shared FLOAT_TYPE buf_b[BN * (BK+1)]; shared FLOAT_TYPE buf_b[BN * (BK+1)];
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
shared u16vec2 row_ids[2048]; shared u16vec2 row_ids[3072];
#endif #endif
void main() { void main() {
@ -380,6 +380,19 @@ void main() {
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32)); buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
#elif defined(DATA_A_IQ4_NL)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
const uint ib = idx / 16;
const uint iqs = idx & 0xF;
const float d = float(data_a[ib].d);
const uint vui = uint(data_a[ib].qs[iqs]);
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
#endif #endif
} }
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) { [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {


@ -177,3 +177,24 @@ struct block_q6_K
#define A_TYPE block_q6_K #define A_TYPE block_q6_K
#endif #endif
// IQuants
#if defined(DATA_A_IQ4_NL)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 32
#define QUANT_R 2
struct block_iq4_nl
{
float16_t d;
uint8_t qs[QUANT_K/2];
};
#define A_TYPE block_iq4_nl
const int8_t kvalues_iq4nl[16] = {
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
};
#endif


@ -52,7 +52,8 @@ const std::vector<std::string> type_names = {
"q3_k", "q3_k",
"q4_k", "q4_k",
"q5_k", "q5_k",
"q6_k" "q6_k",
"iq4_nl"
}; };
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) { void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {


@ -78,5 +78,13 @@ python -m build
python -m twine upload dist/* python -m twine upload dist/*
``` ```
## Run Unit Tests
From the root of this repository, run this command to execute all the unit tests:
```bash
python -m unittest discover ./gguf-py -v
```
## TODO ## TODO
- [ ] Include conversion scripts as command line entry points in this package. - [ ] Include conversion scripts as command line entry points in this package.


@ -5,3 +5,5 @@ from .gguf_writer import *
from .quants import * from .quants import *
from .tensor_mapping import * from .tensor_mapping import *
from .vocab import * from .vocab import *
from .utility import *
from .metadata import *


@ -23,15 +23,56 @@ class Keys:
ARCHITECTURE = "general.architecture" ARCHITECTURE = "general.architecture"
QUANTIZATION_VERSION = "general.quantization_version" QUANTIZATION_VERSION = "general.quantization_version"
ALIGNMENT = "general.alignment" ALIGNMENT = "general.alignment"
FILE_TYPE = "general.file_type"
# Authorship Metadata
NAME = "general.name" NAME = "general.name"
AUTHOR = "general.author" AUTHOR = "general.author"
VERSION = "general.version" VERSION = "general.version"
URL = "general.url" ORGANIZATION = "general.organization"
FINETUNE = "general.finetune"
BASENAME = "general.basename"
DESCRIPTION = "general.description" DESCRIPTION = "general.description"
QUANTIZED_BY = "general.quantized_by"
SIZE_LABEL = "general.size_label"
# Licensing details
LICENSE = "general.license" LICENSE = "general.license"
SOURCE_URL = "general.source.url" LICENSE_NAME = "general.license.name"
SOURCE_HF_REPO = "general.source.huggingface.repository" LICENSE_LINK = "general.license.link"
FILE_TYPE = "general.file_type"
# Typically represents the converted GGUF repo (Unless native)
URL = "general.url" # Model Website/Paper
DOI = "general.doi"
UUID = "general.uuid"
REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...)
# Model Source during conversion
SOURCE_URL = "general.source.url" # Model Website/Paper
SOURCE_DOI = "general.source.doi"
SOURCE_UUID = "general.source.uuid"
SOURCE_REPO_URL = "general.source.repo_url" # Model Source Repository (git/svn/etc...)
# Base Model Source. There can be more than one source if it's a merged
# model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in
# tracing the lineage of models as they are finetuned or merged over time.
BASE_MODEL_COUNT = "general.base_model.count"
BASE_MODEL_NAME = "general.base_model.{id}.name"
BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
BASE_MODEL_VERSION = "general.base_model.{id}.version"
BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper
BASE_MODEL_DOI = "general.base_model.{id}.doi"
BASE_MODEL_UUID = "general.base_model.{id}.uuid"
BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
# Array based KV stores
TAGS = "general.tags"
LANGUAGES = "general.languages"
DATASETS = "general.datasets"
class LLM: class LLM:
VOCAB_SIZE = "{arch}.vocab_size" VOCAB_SIZE = "{arch}.vocab_size"
@ -1233,7 +1274,6 @@ KEY_GENERAL_URL = Keys.General.URL
KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
KEY_GENERAL_LICENSE = Keys.General.LICENSE KEY_GENERAL_LICENSE = Keys.General.LICENSE
KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE
# LLM # LLM
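
A small illustration (not part of the diff) of how the templated per-source keys added above resolve once a base-model index is substituted, assuming gguf-py is importable:

```python
from gguf.constants import Keys

# Each base model gets its own index under general.base_model.{id}.*
print(Keys.General.BASE_MODEL_NAME.format(id=0))      # general.base_model.0.name
print(Keys.General.BASE_MODEL_REPO_URL.format(id=0))  # general.base_model.0.repo_url
```
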


@ -7,6 +7,7 @@ import struct
import tempfile import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum, auto from enum import Enum, auto
from math import prod
from pathlib import Path from pathlib import Path
from io import BufferedWriter from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping from typing import IO, Any, Sequence, Mapping
@ -106,6 +107,53 @@ class GGUFWriter:
self.add_architecture() self.add_architecture()
def get_total_parameter_count(self) -> tuple[int, int, int, int]:
total_params = 0
shared_params = 0
expert_params = 0
expert_sum = 0
n_expert_tensors = 0
last_lora_a: tuple[str, TensorInfo] | None = None
for tensors in self.tensors:
for name, info in tensors.items():
shape = info.shape
if name.endswith(".lora_a"):
last_lora_a = (name, info)
continue
elif name.endswith(".lora_b"):
if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
# Bail when the LoRA pair can't be found trivially
logger.warning("can't measure LoRA size correctly, tensor order is unusual")
return 0, 0, 0, 0
else:
shape = (*shape[:-1], last_lora_a[1].shape[-1])
size = prod(shape)
if "_exps." in name:
expert_params += (size // shape[-3])
expert_sum += shape[-3]
n_expert_tensors += 1
else:
shared_params += size
total_params += size
# Hopefully this should work even for variable-expert-count models
expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
# Negate the total to signal it's likely not exact
if last_lora_a is not None:
total_params = -total_params
# NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
return total_params, shared_params, expert_params, expert_count
def format_shard_names(self, path: Path) -> list[Path]: def format_shard_names(self, path: Path) -> list[Path]:
if len(self.tensors) == 1: if len(self.tensors) == 1:
return [path] return [path]
@ -115,6 +163,7 @@ class GGUFWriter:
if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path): if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
# allow calling this multiple times as long as the path is the same # allow calling this multiple times as long as the path is the same
return return
if self.state is not WriterState.NO_FILE: if self.state is not WriterState.NO_FILE:
raise ValueError(f'Expected output file to be not yet opened, got {self.state}') raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
@ -136,6 +185,8 @@ class GGUFWriter:
if self.dry_run: if self.dry_run:
logger.info("Dry run, not writing files") logger.info("Dry run, not writing files")
for name in filenames:
print(name) # noqa: NP100
exit() exit()
return filenames return filenames
@ -430,29 +481,12 @@ class GGUFWriter:
def add_architecture(self) -> None: def add_architecture(self) -> None:
self.add_string(Keys.General.ARCHITECTURE, self.arch) self.add_string(Keys.General.ARCHITECTURE, self.arch)
def add_author(self, author: str) -> None: def add_quantization_version(self, quantization_version: int) -> None:
self.add_string(Keys.General.AUTHOR, author) self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version)
def add_version(self, version: str) -> None: def add_custom_alignment(self, alignment: int) -> None:
self.add_string(Keys.General.VERSION, version) self.data_alignment = alignment
self.add_uint32(Keys.General.ALIGNMENT, alignment)
def add_tensor_data_layout(self, layout: str) -> None:
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_url(self, url: str) -> None:
self.add_string(Keys.General.URL, url)
def add_description(self, description: str) -> None:
self.add_string(Keys.General.DESCRIPTION, description)
def add_licence(self, licence: str) -> None:
self.add_string(Keys.General.LICENSE, licence)
def add_source_url(self, url: str) -> None:
self.add_string(Keys.General.SOURCE_URL, url)
def add_source_hf_repo(self, repo: str) -> None:
self.add_string(Keys.General.SOURCE_HF_REPO, repo)
def add_file_type(self, ftype: int) -> None: def add_file_type(self, ftype: int) -> None:
self.add_uint32(Keys.General.FILE_TYPE, ftype) self.add_uint32(Keys.General.FILE_TYPE, ftype)
@ -460,13 +494,101 @@ class GGUFWriter:
def add_name(self, name: str) -> None: def add_name(self, name: str) -> None:
self.add_string(Keys.General.NAME, name) self.add_string(Keys.General.NAME, name)
def add_quantization_version(self, quantization_version: int) -> None: def add_author(self, author: str) -> None:
self.add_uint32( self.add_string(Keys.General.AUTHOR, author)
Keys.General.QUANTIZATION_VERSION, quantization_version)
def add_custom_alignment(self, alignment: int) -> None: def add_version(self, version: str) -> None:
self.data_alignment = alignment self.add_string(Keys.General.VERSION, version)
self.add_uint32(Keys.General.ALIGNMENT, alignment)
def add_organization(self, organization: str) -> None:
self.add_string(Keys.General.ORGANIZATION, organization)
def add_finetune(self, finetune: str) -> None:
self.add_string(Keys.General.FINETUNE, finetune)
def add_basename(self, basename: str) -> None:
self.add_string(Keys.General.BASENAME, basename)
def add_description(self, description: str) -> None:
self.add_string(Keys.General.DESCRIPTION, description)
def add_quantized_by(self, quantized: str) -> None:
self.add_string(Keys.General.QUANTIZED_BY, quantized)
def add_size_label(self, size_label: str) -> None:
self.add_string(Keys.General.SIZE_LABEL, size_label)
def add_license(self, license: str) -> None:
self.add_string(Keys.General.LICENSE, license)
def add_license_name(self, license: str) -> None:
self.add_string(Keys.General.LICENSE_NAME, license)
def add_license_link(self, license: str) -> None:
self.add_string(Keys.General.LICENSE_LINK, license)
def add_url(self, url: str) -> None:
self.add_string(Keys.General.URL, url)
def add_doi(self, doi: str) -> None:
self.add_string(Keys.General.DOI, doi)
def add_uuid(self, uuid: str) -> None:
self.add_string(Keys.General.UUID, uuid)
def add_repo_url(self, repo_url: str) -> None:
self.add_string(Keys.General.REPO_URL, repo_url)
def add_source_url(self, url: str) -> None:
self.add_string(Keys.General.SOURCE_URL, url)
def add_source_doi(self, doi: str) -> None:
self.add_string(Keys.General.SOURCE_DOI, doi)
def add_source_uuid(self, uuid: str) -> None:
self.add_string(Keys.General.SOURCE_UUID, uuid)
def add_source_repo_url(self, repo_url: str) -> None:
self.add_string(Keys.General.SOURCE_REPO_URL, repo_url)
def add_base_model_count(self, source_count: int) -> None:
self.add_uint32(Keys.General.BASE_MODEL_COUNT, source_count)
def add_base_model_name(self, source_id: int, name: str) -> None:
self.add_string(Keys.General.BASE_MODEL_NAME.format(id=source_id), name)
def add_base_model_author(self, source_id: int, author: str) -> None:
self.add_string(Keys.General.BASE_MODEL_AUTHOR.format(id=source_id), author)
def add_base_model_version(self, source_id: int, version: str) -> None:
self.add_string(Keys.General.BASE_MODEL_VERSION.format(id=source_id), version)
def add_base_model_organization(self, source_id: int, organization: str) -> None:
self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization)
def add_base_model_url(self, source_id: int, url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)
def add_base_model_doi(self, source_id: int, doi: str) -> None:
self.add_string(Keys.General.BASE_MODEL_DOI.format(id=source_id), doi)
def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
self.add_string(Keys.General.BASE_MODEL_UUID.format(id=source_id), uuid)
def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)
def add_tags(self, tags: Sequence[str]) -> None:
self.add_array(Keys.General.TAGS, tags)
def add_languages(self, languages: Sequence[str]) -> None:
self.add_array(Keys.General.LANGUAGES, languages)
def add_datasets(self, datasets: Sequence[str]) -> None:
self.add_array(Keys.General.DATASETS, datasets)
def add_tensor_data_layout(self, layout: str) -> None:
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_vocab_size(self, size: int) -> None: def add_vocab_size(self, size: int) -> None:
self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size) self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
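
To make the new surface area concrete, a hypothetical usage sketch of the authorship-metadata setters added above (the output path and all values are invented; only the method names come from this diff):

```python
from gguf import GGUFWriter

writer = GGUFWriter("example.gguf", arch="llama")  # hypothetical output file
writer.add_name("Example Model")
writer.add_organization("Example Org")
writer.add_basename("Example")
writer.add_finetune("Instruct")
writer.add_size_label("7B")
writer.add_license("apache-2.0")
writer.add_base_model_count(1)
writer.add_base_model_name(0, "Example Base")
writer.add_base_model_repo_url(0, "https://huggingface.co/example/example-base")
writer.add_tags(["text-generation"])
# ... followed by the usual tensor and file-writing calls.
```
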

gguf-py/gguf/metadata.py Normal file

@ -0,0 +1,503 @@
from __future__ import annotations
import re
import json
import yaml
import logging
from pathlib import Path
from typing import Any, Literal, Optional
from dataclasses import dataclass
from .constants import Keys
import gguf
logger = logging.getLogger("metadata")
@dataclass
class Metadata:
# Authorship Metadata to be written to GGUF KV Store
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
organization: Optional[str] = None
finetune: Optional[str] = None
basename: Optional[str] = None
description: Optional[str] = None
quantized_by: Optional[str] = None
size_label: Optional[str] = None
url: Optional[str] = None
doi: Optional[str] = None
uuid: Optional[str] = None
repo_url: Optional[str] = None
source_url: Optional[str] = None
source_doi: Optional[str] = None
source_uuid: Optional[str] = None
source_repo_url: Optional[str] = None
license: Optional[str] = None
license_name: Optional[str] = None
license_link: Optional[str] = None
base_models: Optional[list[dict]] = None
tags: Optional[list[str]] = None
languages: Optional[list[str]] = None
datasets: Optional[list[str]] = None
@staticmethod
def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata:
# This grabs as much contextual authorship metadata as possible from the model repository,
# converting it as needed to match the GGUF KV store metadata format,
# and gives users the ability to override any authorship metadata that may be incorrect
# Create a new Metadata instance
metadata = Metadata()
model_card = Metadata.load_model_card(model_path)
hf_params = Metadata.load_hf_parameters(model_path)
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
# heuristics
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
# Metadata Override File Provided
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata_override = Metadata.load_metadata_override(metadata_override_path)
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization)
metadata.finetune = metadata_override.get(Keys.General.FINETUNE, metadata.finetune)
metadata.basename = metadata_override.get(Keys.General.BASENAME, metadata.basename)
metadata.description = metadata_override.get(Keys.General.DESCRIPTION, metadata.description)
metadata.quantized_by = metadata_override.get(Keys.General.QUANTIZED_BY, metadata.quantized_by)
metadata.size_label = metadata_override.get(Keys.General.SIZE_LABEL, metadata.size_label)
metadata.license_name = metadata_override.get(Keys.General.LICENSE_NAME, metadata.license_name)
metadata.license_link = metadata_override.get(Keys.General.LICENSE_LINK, metadata.license_link)
metadata.url = metadata_override.get(Keys.General.URL, metadata.url)
metadata.doi = metadata_override.get(Keys.General.DOI, metadata.doi)
metadata.uuid = metadata_override.get(Keys.General.UUID, metadata.uuid)
metadata.repo_url = metadata_override.get(Keys.General.REPO_URL, metadata.repo_url)
metadata.source_url = metadata_override.get(Keys.General.SOURCE_URL, metadata.source_url)
metadata.source_doi = metadata_override.get(Keys.General.SOURCE_DOI, metadata.source_doi)
metadata.source_uuid = metadata_override.get(Keys.General.SOURCE_UUID, metadata.source_uuid)
metadata.source_repo_url = metadata_override.get(Keys.General.SOURCE_REPO_URL, metadata.source_repo_url)
# Base Models is received here as an array of models
metadata.base_models = metadata_override.get("general.base_models", metadata.base_models)
metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags)
metadata.languages = metadata_override.get(Keys.General.LANGUAGES, metadata.languages)
metadata.datasets = metadata_override.get(Keys.General.DATASETS, metadata.datasets)
# Direct Metadata Override (via direct cli argument)
if model_name is not None:
metadata.name = model_name
return metadata
@staticmethod
def load_metadata_override(metadata_override_path: Optional[Path] = None) -> dict[str, Any]:
if metadata_override_path is None or not metadata_override_path.is_file():
return {}
with open(metadata_override_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
model_card_path = model_path / "README.md"
if not model_card_path.is_file():
return {}
# The model card metadata is assumed to always be in YAML
# ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
with open(model_card_path, "r", encoding="utf-8") as f:
if f.readline() == "---\n":
raw = f.read().partition("---\n")[0]
data = yaml.safe_load(raw)
if isinstance(data, dict):
return data
else:
logger.error(f"while reading YAML model card frontmatter, data is {type(data)} instead of dict")
return {}
else:
return {}
@staticmethod
def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
config_path = model_path / "config.json"
if not config_path.is_file():
return {}
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def id_to_title(string):
# Convert capitalization into title form unless acronym or version number
return ' '.join([w.title() if w.islower() and not re.match(r'^(v\d+(?:\.\d+)*|\d.*)$', w) else w for w in string.strip().replace('-', ' ').split()])
@staticmethod
def get_model_id_components(model_id: Optional[str] = None, total_params: int = 0) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]:
# Hugging Face often stores the model id as '<org>/<model name>',
# so let's parse it and, where possible, apply some heuristics to extract the model name components
if model_id is None:
# model ID missing
return None, None, None, None, None, None
if ' ' in model_id:
# model ID is actually a normal human sentence
# which means it's most likely a normal model name only,
# not part of the Hugging Face naming standard, but whatever
return model_id, None, None, None, None, None
if '/' in model_id:
# model ID (huggingface style)
org_component, model_full_name_component = model_id.split('/', 1)
else:
# model ID but missing org components
org_component, model_full_name_component = None, model_id
# Check if we erroneously matched against './' or '../' etc...
if org_component is not None and org_component[0] == '.':
org_component = None
name_parts: list[str] = model_full_name_component.split('-')
# Remove empty parts
for i in reversed(range(len(name_parts))):
if len(name_parts[i]) == 0:
del name_parts[i]
name_types: list[
set[Literal["basename", "size_label", "finetune", "version", "type"]]
] = [set() for _ in name_parts]
# Annotate the name
for i, part in enumerate(name_parts):
# Version
if re.fullmatch(r'(v|iter)?\d+([.]\d+)*', part, re.IGNORECASE):
name_types[i].add("version")
# Quant type (should not be there for base models, but still annotated)
elif re.fullmatch(r'i?q\d(_\w)*|b?fp?(16|32)', part, re.IGNORECASE):
name_types[i].add("type")
name_parts[i] = part.upper()
# Model size
elif i > 0 and re.fullmatch(r'(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)', part, re.IGNORECASE):
part = part.replace("_", ".")
# Handle weird bloom-7b1 notation
if part[-1].isdecimal():
part = part[:-2] + "." + part[-1] + part[-2]
# Normalize the size suffixes
if len(part) > 1 and part[-2].isdecimal():
if part[-1] in "kmbt":
part = part[:-1] + part[-1].upper()
if total_params != 0:
try:
label_params = float(part[:-1]) * pow(1000, " KMBT".find(part[-1]))
# Only use it as a size label if it's close to or bigger than the model size
# Note that LoRA adapters don't necessarily include all layers,
# so this is why bigger label sizes are accepted.
# Do not use the size label when it's smaller than 1/8 of the model size
if (total_params < 0 and label_params < abs(total_params) // 8) or (
# Check both directions when the current model isn't a LoRA adapter
total_params > 0 and abs(label_params - total_params) > 7 * total_params // 8
):
# Likely a context length
name_types[i].add("finetune")
# Lowercase the size when it's a context length
part = part[:-1] + part[-1].lower()
except ValueError:
# Failed to convert the size label to float, use it anyway
pass
if len(name_types[i]) == 0:
name_types[i].add("size_label")
name_parts[i] = part
# Some easy to recognize finetune names
elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
if total_params < 0 and part.lower() == "lora":
# ignore redundant "lora" in the finetune part when the output is a lora adapter
name_types[i].add("type")
else:
name_types[i].add("finetune")
# Ignore word-based size labels when there is at least a number-based one present
# TODO: should word-based size labels always be removed instead?
if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
for n, t in zip(name_parts, name_types):
if "size_label" in t:
if all(c.isalpha() for c in n):
t.remove("size_label")
at_start = True
# Find the basename through the annotated name
for part, t in zip(name_parts, name_types):
if at_start and ((len(t) == 0 and part[0].isalpha()) or "version" in t):
t.add("basename")
else:
if at_start:
at_start = False
if len(t) == 0:
t.add("finetune")
# Remove the basename annotation from trailing version
for part, t in zip(reversed(name_parts), reversed(name_types)):
if "basename" in t and len(t) > 1:
t.remove("basename")
else:
break
basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
# Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
# TODO: should the basename version always be excluded?
# NOTE: multiple finetune versions are joined together
version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
if size_label is None and finetune is None and version is None:
# Too ambiguous, output nothing
basename = None
return model_full_name_component, org_component, basename, finetune, version, size_label
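
As a rough illustration of what these heuristics produce, a hypothetical call with the expected split worked out by hand from the rules above (not part of metadata.py):

```python
from gguf.metadata import Metadata

# Returns (model_full_name_component, org_component, basename, finetune, version, size_label)
print(Metadata.get_model_id_components("mistralai/Mistral-7B-Instruct-v0.2"))
# Expected: ('Mistral-7B-Instruct-v0.2', 'mistralai', 'Mistral', 'Instruct', 'v0.2', '7B')
```
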
@staticmethod
def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = None, hf_params: Optional[dict] = None, model_path: Optional[Path] = None, total_params: int = 0) -> Metadata:
# Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
# Model Card Heuristics
########################
if model_card is not None:
if "model_name" in model_card and metadata.name is None:
# Not part of the Hugging Face model card standard, but some model creators use it,
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.name = model_card.get("model_name")
if "model_creator" in model_card and metadata.author is None:
# Not part of the Hugging Face model card standard, but some model creators use it,
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.author = model_card.get("model_creator")
if "model_type" in model_card and metadata.basename is None:
# Not part of the Hugging Face model card standard, but some model creators use it,
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.basename = model_card.get("model_type")
if "base_model" in model_card:
# This represents the parent models that this is based on
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
metadata_base_models = []
base_model_value = model_card.get("base_model", None)
if base_model_value is not None:
if isinstance(base_model_value, str):
metadata_base_models.append(base_model_value)
elif isinstance(base_model_value, list):
metadata_base_models.extend(base_model_value)
if metadata.base_models is None:
metadata.base_models = []
for model_id in metadata_base_models:
# NOTE: model size of base model is assumed to be similar to the size of the current model
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
base_model = {}
if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title(model_full_name_component)
if org_component is not None:
base_model["organization"] = Metadata.id_to_title(org_component)
if version is not None:
base_model["version"] = version
if org_component is not None and model_full_name_component is not None:
base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
metadata.base_models.append(base_model)
if "license" in model_card and metadata.license is None:
metadata.license = model_card.get("license")
if "license_name" in model_card and metadata.license_name is None:
metadata.license_name = model_card.get("license_name")
if "license_link" in model_card and metadata.license_link is None:
metadata.license_link = model_card.get("license_link")
tags_value = model_card.get("tags", None)
if tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(tags_value, str):
metadata.tags.append(tags_value)
elif isinstance(tags_value, list):
metadata.tags.extend(tags_value)
pipeline_tags_value = model_card.get("pipeline_tag", None)
if pipeline_tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(pipeline_tags_value, str):
metadata.tags.append(pipeline_tags_value)
elif isinstance(pipeline_tags_value, list):
metadata.tags.extend(pipeline_tags_value)
language_value = model_card.get("languages", model_card.get("language", None))
if language_value is not None:
if metadata.languages is None:
metadata.languages = []
if isinstance(language_value, str):
metadata.languages.append(language_value)
elif isinstance(language_value, list):
metadata.languages.extend(language_value)
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
if dataset_value is not None:
if metadata.datasets is None:
metadata.datasets = []
if isinstance(dataset_value, str):
metadata.datasets.append(dataset_value)
elif isinstance(dataset_value, list):
metadata.datasets.extend(dataset_value)
# Hugging Face Parameter Heuristics
####################################
if hf_params is not None:
hf_name_or_path = hf_params.get("_name_or_path")
if hf_name_or_path is not None and hf_name_or_path.count('/') <= 1:
# Use _name_or_path only if it's actually a model name and not some local filesystem path
# e.g. 'meta-llama/Llama-2-7b-hf'
model_id = hf_name_or_path
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
if metadata.name is None and model_full_name_component is not None:
metadata.name = Metadata.id_to_title(model_full_name_component)
if metadata.organization is None and org_component is not None:
metadata.organization = Metadata.id_to_title(org_component)
if metadata.basename is None and basename is not None:
metadata.basename = basename
if metadata.finetune is None and finetune is not None:
metadata.finetune = finetune
if metadata.version is None and version is not None:
metadata.version = version
if metadata.size_label is None and size_label is not None:
metadata.size_label = size_label
# Directory Folder Name Fallback Heuristics
############################################
if model_path is not None:
model_id = model_path.name
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
if metadata.name is None and model_full_name_component is not None:
metadata.name = Metadata.id_to_title(model_full_name_component)
if metadata.organization is None and org_component is not None:
metadata.organization = Metadata.id_to_title(org_component)
if metadata.basename is None and basename is not None:
metadata.basename = basename
if metadata.finetune is None and finetune is not None:
metadata.finetune = finetune
if metadata.version is None and version is not None:
metadata.version = version
if metadata.size_label is None and size_label is not None:
metadata.size_label = size_label
return metadata
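
For context, the heuristics above are layered: the model card runs first, then the Hugging Face config's _name_or_path, then the directory name, and each later step only fills fields that are still None. A minimal sketch of the last two steps of that ordering (pick_model_id is a hypothetical helper written only for illustration; the real code feeds the chosen id through Metadata.get_model_id_components()):

from __future__ import annotations

def pick_model_id(hf_params: dict | None, dir_name: str | None) -> str | None:
    # _name_or_path is only trusted when it looks like 'org/model' or 'model',
    # not like a local filesystem path (more than one '/')
    name_or_path = (hf_params or {}).get("_name_or_path")
    if name_or_path is not None and name_or_path.count('/') <= 1:
        return name_or_path
    # otherwise fall back to the folder the model was loaded from
    return dir_name

pick_model_id({"_name_or_path": "meta-llama/Llama-2-7b-hf"}, "Llama-2-7b-hf")  # 'meta-llama/Llama-2-7b-hf'
pick_model_id({"_name_or_path": "/home/user/models/llama"}, "Llama-2-7b-hf")   # 'Llama-2-7b-hf'
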
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
assert self.name is not None
gguf_writer.add_name(self.name)
if self.author is not None:
gguf_writer.add_author(self.author)
if self.version is not None:
gguf_writer.add_version(self.version)
if self.organization is not None:
gguf_writer.add_organization(self.organization)
if self.finetune is not None:
gguf_writer.add_finetune(self.finetune)
if self.basename is not None:
gguf_writer.add_basename(self.basename)
if self.description is not None:
gguf_writer.add_description(self.description)
if self.quantized_by is not None:
gguf_writer.add_quantized_by(self.quantized_by)
if self.size_label is not None:
gguf_writer.add_size_label(self.size_label)
if self.license is not None:
gguf_writer.add_license(self.license)
if self.license_name is not None:
gguf_writer.add_license_name(self.license_name)
if self.license_link is not None:
gguf_writer.add_license_link(self.license_link)
if self.url is not None:
gguf_writer.add_url(self.url)
if self.doi is not None:
gguf_writer.add_doi(self.doi)
if self.uuid is not None:
gguf_writer.add_uuid(self.uuid)
if self.repo_url is not None:
gguf_writer.add_repo_url(self.repo_url)
if self.source_url is not None:
gguf_writer.add_source_url(self.source_url)
if self.source_doi is not None:
gguf_writer.add_source_doi(self.source_doi)
if self.source_uuid is not None:
gguf_writer.add_source_uuid(self.source_uuid)
if self.source_repo_url is not None:
gguf_writer.add_source_repo_url(self.source_repo_url)
if self.base_models is not None:
gguf_writer.add_base_model_count(len(self.base_models))
for key, base_model_entry in enumerate(self.base_models):
if "name" in base_model_entry:
gguf_writer.add_base_model_name(key, base_model_entry["name"])
if "author" in base_model_entry:
gguf_writer.add_base_model_author(key, base_model_entry["author"])
if "version" in base_model_entry:
gguf_writer.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
gguf_writer.add_base_model_organization(key, base_model_entry["organization"])
if "url" in base_model_entry:
gguf_writer.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
gguf_writer.add_base_model_doi(key, base_model_entry["doi"])
if "uuid" in base_model_entry:
gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"])
if "repo_url" in base_model_entry:
gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"])
if self.tags is not None:
gguf_writer.add_tags(self.tags)
if self.languages is not None:
gguf_writer.add_languages(self.languages)
if self.datasets is not None:
gguf_writer.add_datasets(self.datasets)
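
To make the base_model handling concrete, this is roughly the dict shape the loop above appends to metadata.base_models for a model card containing base_model: mistralai/Mistral-7B-v0.1. The field values are hand-written approximations of what Metadata.get_model_id_components() and Metadata.id_to_title() would produce, shown for illustration only:

base_model_entry = {
    "name": "Mistral 7B v0.1",     # id_to_title(model_full_name_component), illustrative
    "organization": "Mistralai",   # id_to_title(org_component), illustrative
    "version": "v0.1",
    "repo_url": "https://huggingface.co/mistralai/Mistral-7B-v0.1",
}

set_gguf_meta_model() then writes each entry as indexed key/value pairs through add_base_model_count(), add_base_model_name(i, ...), add_base_model_organization(i, ...) and the other add_base_model_* calls shown above, so downstream tools can recover the full list from the GGUF header.
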

69
gguf-py/gguf/utility.py Normal file
View File

@ -0,0 +1,69 @@
from __future__ import annotations
from typing import Literal
def fill_templated_filename(filename: str, output_type: str | None) -> str:
# Given a file name, fill in any type templates, e.g. 'some-model-name.{ftype}.gguf'
ftype_lowercase: str = output_type.lower() if output_type is not None else ""
ftype_uppercase: str = output_type.upper() if output_type is not None else ""
return filename.format(ftype_lowercase,
outtype=ftype_lowercase, ftype=ftype_lowercase,
OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase)
def model_weight_count_rounded_notation(model_params_count: int, min_digits: int = 2) -> str:
if model_params_count > 1e12:
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9:
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6:
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
fix = max(min_digits - len(str(round(scaled_model_params)).lstrip('0')), 0)
return f"{scaled_model_params:.{fix}f}{scale_suffix}"
def size_label(total_params: int, shared_params: int, expert_params: int, expert_count: int) -> str:
if expert_count > 0:
pretty_size = model_weight_count_rounded_notation(abs(shared_params) + abs(expert_params), min_digits=2)
size_class = f"{expert_count}x{pretty_size}"
else:
size_class = model_weight_count_rounded_notation(abs(total_params), min_digits=2)
return size_class
def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str:
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None:
name = base_name.strip().replace(' ', '-').replace('/', '-')
elif model_name is not None:
name = model_name.strip().replace(' ', '-').replace('/', '-')
else:
name = "ggml-model"
parameters = f"-{size_label}" if size_label is not None else ""
finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
encoding = f"-{output_type.strip().replace(' ', '-').upper()}" if output_type is not None else ""
kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
return f"{name}{parameters}{finetune}{version}{encoding}{kind}"

View File

@ -22,6 +22,7 @@ classifiers = [
python = ">=3.8" python = ">=3.8"
numpy = ">=1.17" numpy = ">=1.17"
tqdm = ">=4.27" tqdm = ">=4.27"
pyyaml = ">=5.1"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pytest = "^5.2" pytest = "^5.2"

View File

@ -4,6 +4,7 @@ from __future__ import annotations
import logging
import argparse
import os
import re
import sys
from pathlib import Path
from typing import Any
@ -244,26 +245,58 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
else:
pretty_type = str(field.types[-1].name)
def escape_markdown_inline_code(value_string):
# Find the longest contiguous sequence of backticks in the string then
# wrap string with appropriate number of backticks required to escape it
max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0)
inline_code_marker = '`' * (max_backticks + 1)
# If the string starts or ends with a backtick, add a space at the beginning and end
if value_string.startswith('`') or value_string.endswith('`'):
value_string = f" {value_string} "
return f"{inline_code_marker}{value_string}{inline_code_marker}"
total_elements = len(field.data)
value = ""
if len(field.types) == 1:
curr_type = field.types[0]
if curr_type == GGUFValueType.STRING:
truncate_length = 60
value_string = str(bytes(field.parts[-1]), encoding='utf-8')
if len(value_string) > truncate_length:
head = escape_markdown_inline_code(value_string[:truncate_length // 2])
tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
value = "{head}...{tail}".format(head=head, tail=tail)
else:
value = escape_markdown_inline_code(value_string)
elif curr_type in reader.gguf_scalar_to_np:
value = str(field.parts[-1][0])
else:
if field.types[0] == GGUFValueType.ARRAY:
curr_type = field.types[1]
array_elements = []
if curr_type == GGUFValueType.STRING:
render_element = min(5, total_elements)
for element_pos in range(render_element):
truncate_length = 30
value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8')
if len(value_string) > truncate_length:
head = escape_markdown_inline_code(value_string[:truncate_length // 2])
tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
value = "{head}...{tail}".format(head=head, tail=tail)
else:
value = escape_markdown_inline_code(value_string)
array_elements.append(value)
elif curr_type in reader.gguf_scalar_to_np:
render_element = min(7, total_elements)
for element_pos in range(render_element):
array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0]))
value = f'[ {", ".join(array_elements).strip()}{", ..." if total_elements > len(array_elements) else ""} ]'
kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})
kv_dump_table_header_map = [
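
The escape_markdown_inline_code() helper introduced above picks a backtick fence one longer than the longest backtick run inside the value, and pads with spaces when the value starts or ends with a backtick, so arbitrary strings render safely as Markdown inline code. A standalone copy for illustration, since the helper is nested in dump_markdown_metadata() and not exported:

import re

def escape_markdown_inline_code(value_string: str) -> str:
    # fence length = longest run of backticks in the value, plus one
    max_backticks = max((len(m.group(0)) for m in re.finditer(r'`+', value_string)), default=0)
    inline_code_marker = '`' * (max_backticks + 1)
    # pad with spaces so a leading/trailing backtick is not absorbed by the fence
    if value_string.startswith('`') or value_string.endswith('`'):
        value_string = f" {value_string} "
    return f"{inline_code_marker}{value_string}{inline_code_marker}"

escape_markdown_inline_code("plain text")           # '`plain text`'
escape_markdown_inline_code("uses `code` inside")   # '``uses `code` inside``'
escape_markdown_inline_code("`starts with one`")    # '`` `starts with one` ``'

Long values are also truncated before escaping: scalar strings over 60 characters and array string elements over 30 characters are reduced to an escaped head and tail joined by '...'.
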

View File

@ -0,0 +1 @@
from .test_metadata import *

Some files were not shown because too many files have changed in this diff.