diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index 2a7da586a..61f671465 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -FROM ${BASE_CUDA_DEV_CONTAINER} as build +FROM ${BASE_CUDA_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. ARG CUDA_DOCKER_ARCH=all diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 5cbd2e7a1..680d1cb92 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 6f19afa9c..2a06f82b7 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile index bff946cbc..8eda63a89 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/llama-cli-cuda.Dockerfile @@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER # Target the CUDA runtime image ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} -FROM ${BASE_CUDA_DEV_CONTAINER} as build +FROM ${BASE_CUDA_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. ARG CUDA_DOCKER_ARCH=all @@ -25,7 +25,7 @@ ENV GGML_CUDA=1 RUN make -j$(nproc) llama-cli -FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libgomp1 diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile index bd816f9f5..2bf82bb58 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -1,6 +1,6 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ @@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-cli -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime COPY --from=build /app/build/bin/llama-cli /llama-cli diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile index caa507b08..c3d1ab067 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. 
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile index 6155d5881..9b0dad8bf 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=jammy -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget libgomp1 diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile index 38382bfc9..7f741aa46 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/llama-cli.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y build-essential git @@ -11,7 +11,7 @@ COPY . . RUN make -j$(nproc) llama-cli -FROM ubuntu:$UBUNTU_VERSION as runtime +FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libgomp1 diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index d7eaa0925..67328cf1c 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER # Target the CUDA runtime image ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} -FROM ${BASE_CUDA_DEV_CONTAINER} as build +FROM ${BASE_CUDA_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. ARG CUDA_DOCKER_ARCH=all @@ -27,7 +27,7 @@ ENV LLAMA_CURL=1 RUN make -j$(nproc) llama-server -FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index 8f8fef8c0..eb9aba618 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -1,6 +1,6 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ @@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-server -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev curl diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index af96c3325..763b4cd3f 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. 
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 49062f84b..13a61ffd8 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=jammy -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index a53a5c999..b631d5806 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y build-essential git libcurl4-openssl-dev curl @@ -13,7 +13,7 @@ ENV LLAMA_CURL=1 RUN make -j$(nproc) llama-server -FROM ubuntu:$UBUNTU_VERSION as runtime +FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1f275603a..a1e183d11 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -860,7 +860,7 @@ jobs: mkdir build cd build cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON - cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} + cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) - name: Determine tag name id: tag diff --git a/CMakeLists.txt b/CMakeLists.txt index 67dcf86d4..793709122 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) +llama_option_depr(WARNING LLAMA_CANN GGML_CANN) # # build the library diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ad516049..b688f78ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,13 +1,17 @@ -# Pull requests +# Pull requests (for contributors) -- Always squash-merge the PR before merging -- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)` - Test your changes: - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library - Execute [the full CI locally on your machine](ci/README.md) before publishing -- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. 
- - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience + - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience +- If your PR becomes stale, don't hesitate to ping the maintainers in the comments + +# Pull requests (for collaborators) + +- Squash-merge PRs +- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)` +- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules # Coding guidelines diff --git a/Makefile b/Makefile index 4584594af..52b55dd89 100644 --- a/Makefile +++ b/Makefile @@ -876,6 +876,9 @@ OBJ_GGML += \ OBJ_LLAMA = \ src/llama.o \ + src/llama-vocab.o \ + src/llama-grammar.o \ + src/llama-sampling.o \ src/unicode.o \ src/unicode-data.o @@ -1055,6 +1058,10 @@ src/unicode-data.o: \ src/llama.o: \ src/llama.cpp \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-grammar.h \ + src/llama-sampling.h \ src/unicode.h \ include/llama.h \ ggml/include/ggml-cuda.h \ @@ -1064,6 +1071,29 @@ src/llama.o: \ ggml/include/ggml-backend.h $(CXX) $(CXXFLAGS) -c $< -o $@ +src/llama-vocab.o: \ + src/llama-vocab.cpp \ + src/llama-vocab.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-grammar.o: \ + src/llama-grammar.cpp \ + src/llama-grammar.h \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-sampling.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-sampling.o: \ + src/llama-sampling.cpp \ + src/llama-sampling.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(LIB_LLAMA): \ $(OBJ_LLAMA) \ $(LIB_GGML) @@ -1439,7 +1469,7 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift tests/test-llama-grammar: tests/test-llama-grammar.cpp \ - $(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index d40a48385..1d90b47bf 100644 --- a/Package.swift +++ b/Package.swift @@ -4,6 +4,9 @@ import PackageDescription var sources = [ "src/llama.cpp", + "src/llama-vocab.cpp", + "src/llama-grammar.cpp", + "src/llama-sampling.cpp", "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", diff --git a/README.md b/README.md index 058919068..7c233b5e1 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) +[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project 
status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) diff --git a/common/ngram-cache.h b/common/ngram-cache.h index e4fa4cbd1..ab4c9b376 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -37,11 +37,18 @@ struct llama_ngram { } }; +struct llama_token_hash_function { + size_t operator()(const llama_token token) const { + // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ + return token * 11400714819323198485llu; + } +}; + struct llama_ngram_hash_function { size_t operator()(const llama_ngram & ngram) const { - size_t hash = 0; - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - hash ^= std::hash<llama_token>{}(ngram.tokens[i]); + size_t hash = llama_token_hash_function{}(ngram.tokens[0]); + for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { + hash ^= llama_token_hash_function{}(ngram.tokens[i]); } return hash; } diff --git a/common/sampling.cpp b/common/sampling.cpp index 6a483c815..079e40516 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl( llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; // Apply grammar constraints to the single token - llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar); + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array); // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY bool is_valid = single_token_data_array.data[0].logit != -INFINITY; @@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl( // apply grammar checks before sampling logic if (apply_grammar && ctx_sampling->grammar != NULL) { - llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p); } return cur_p; @@ -455,6 +455,6 @@ void llama_sampling_accept( ctx_sampling->prev.push_back(id); if (ctx_sampling->grammar != NULL && apply_grammar) { - llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); + llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id); } } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c2aba9097..dde4fa9c8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -48,34 +48,39 @@ class Model: dir_model: Path ftype: gguf.LlamaFileType + fname_out: Path is_big_endian: bool endianess: gguf.GGUFEndian use_temp_file: bool lazy: bool - model_name: str | None part_names: list[str] is_safetensors: bool hparams: dict[str, Any] block_count: int tensor_map: gguf.TensorNameMap tensor_names: set[str] | None - fname_out: Path gguf_writer: gguf.GGUFWriter + model_name: str | None + metadata_override: Path | None + dir_model_card: Path # subclasses should define this! 
model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + use_temp_file: bool = False, eager: bool = False, + metadata_override: Path | None = None, model_name: str | None = None, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + self.dir_model = dir_model self.ftype = ftype + self.fname_out = fname_out self.is_big_endian = is_big_endian self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager - self.model_name = model_name self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: @@ -84,6 +89,11 @@ class Model: self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None + self.metadata_override = metadata_override + self.model_name = model_name + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
_, first_tensor = next(self.get_tensors()) @@ -93,10 +103,8 @@ class Model: else: logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - ftype_up: str = self.ftype.name.partition("_")[2].upper() - ftype_lw: str = ftype_up.lower() - # allow templating the file name with the output ftype, useful with the "auto" ftype - self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) + + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @@ -193,7 +201,6 @@ class Model: return new_name def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: @@ -232,6 +239,10 @@ class Model: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") @@ -250,7 +261,7 @@ class Model: return False - def write_tensors(self): + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in self.get_tensors(): @@ -333,9 +344,62 @@ class Model: self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self, vocab_only: bool): + + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + + # Extract the encoding scheme from the file type name. e.g. 
'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model tokenizer") + self.set_vocab() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file(self.fname_out) + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() @@ -343,7 +407,9 @@ class Model: def write_vocab(self): if len(self.gguf_writer.tensors) != 1: raise ValueError('Splitting the vocabulary is not supported') - self.gguf_writer.write_header_to_file(self.fname_out) + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -528,6 +594,15 @@ class Model: if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": # ref: https://huggingface.co/core42/jais-13b res = "jais" + if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": + # ref: https://huggingface.co/WisdomShell/CodeShell-7B + res = "codeshell" + if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 + res = "tekken" + if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M + res = "smollm" if res is None: logger.warning("\n") @@ -668,7 +743,7 @@ class Model: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -685,7 +760,8 @@ class Model: token_id = int(token_id) token: str = token_data["content"] if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token.encode("utf-8") + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} 
-> {token!r}') if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: @@ -780,7 +856,6 @@ class GPTNeoXModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -836,7 +911,6 @@ class BloomModel(Model): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) @@ -913,7 +987,6 @@ class MPTModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_block_count(block_count) @@ -952,7 +1025,6 @@ class OrionModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -965,8 +1037,6 @@ class OrionModel(Model): raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -990,7 +1060,6 @@ class BaichuanModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -1002,8 +1071,6 @@ class BaichuanModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1117,7 +1184,6 @@ class XverseModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -1129,8 +1195,6 @@ class XverseModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") 
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1189,7 +1253,6 @@ class FalconModel(Model): if n_head_kv is None: n_head_kv = self.hparams.get("n_head_kv", 1) # old name - self.gguf_writer.add_name("Falcon") self.gguf_writer.add_context_length(2048) # not in config.json self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1234,7 +1297,6 @@ class StarCoderModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("StarCoder") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -1258,6 +1320,7 @@ class RefactModel(Model): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1269,7 +1332,6 @@ class RefactModel(Model): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("Refact") # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1324,7 +1386,6 @@ class StableLMModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1386,8 +1447,8 @@ class StableLMModel(Model): return [(new_name, data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: # flatten two `list[dict[str, Tensor]]` into a single `list[str]` @@ -1430,7 +1491,12 @@ class LlamaModel(Model): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": @@ -1503,8 +1569,8 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1567,7 +1633,6 @@ class GrokModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_name("Grok") _experts: list[dict[str, Tensor]] | None = None @@ -1616,7 +1681,6 @@ class DbrxModel(Model): def set_gguf_parameters(self): ffn_config = self.hparams["ffn_config"] attn_config = self.hparams["attn_config"] - self.gguf_writer.add_name(self.hparams["model_type"]) 
self.gguf_writer.add_block_count(self.hparams["n_layers"]) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) @@ -1685,7 +1749,6 @@ class MiniCPMModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1755,7 +1818,6 @@ class QwenModel(Model): self._set_vocab_qwen() def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1831,8 +1893,8 @@ class Qwen2MoeModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1846,7 +1908,6 @@ class GPT2Model(Model): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_ctx"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1889,7 +1950,6 @@ class Phi2Model(Model): n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_name("Phi2") self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) @@ -1951,7 +2011,7 @@ class Phi3MiniModel(Model): for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -1968,7 +2028,8 @@ class Phi3MiniModel(Model): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -1984,7 +2045,8 @@ class Phi3MiniModel(Model): token_id = int(foken_data["id"]) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2011,7 +2073,6 @@ class Phi3MiniModel(Model): orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) rope_dims = n_embd // n_head - self.gguf_writer.add_name("Phi3") self.gguf_writer.add_context_length(max_pos_embds) self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) self.gguf_writer.add_embedding_length(n_embd) @@ -2026,7 +2087,7 @@ class Phi3MiniModel(Model): # write rope scaling for long context (128k) model rope_scaling = 
self.find_hparam(['rope_scaling'], True) - if (rope_scaling is None): + if rope_scaling is None: return scale = max_pos_embds / orig_max_pos_embds @@ -2068,7 +2129,6 @@ class PlamoModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name("PLaMo") self.gguf_writer.add_context_length(4096) # not in config.json self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) @@ -2113,7 +2173,6 @@ class CodeShellModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("CodeShell") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -2226,7 +2285,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2245,7 +2305,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2272,7 +2333,6 @@ class InternLM2Model(Model): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -2432,6 +2492,7 @@ class GemmaModel(Model): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -2440,7 +2501,6 @@ class GemmaModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2481,7 +2541,6 @@ class Gemma2Model(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2556,7 +2615,6 @@ class MambaModel(Model): # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(2**20) # 
arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading @@ -2676,7 +2734,7 @@ class JinaBertV2Model(BertModel): yield name, data - def set_vocab(self, *args, **kwargs): + def set_vocab(self): tokenizer_class = 'BertTokenizer' with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: tokenizer_class = json.load(f)['tokenizer_class'] @@ -2735,7 +2793,6 @@ class OpenELMModel(Model): assert self.block_count == len(self._num_query_heads) assert self.block_count == len(self._ffn_dims) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_context_length(self.hparams["max_context_length"]) self.gguf_writer.add_embedding_length(n_embd) @@ -2825,7 +2882,7 @@ class ArcticModel(Model): added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] for token_id, token_json in added_tokens_decoder.items(): token_id = int(token_id) - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -2909,8 +2966,8 @@ class ArcticModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -2988,8 +3045,8 @@ class DeepseekV2Model(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -3074,7 +3131,7 @@ class T5Model(Model): added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -3107,7 +3164,6 @@ class T5Model(Model): self.gguf_writer.add_add_eos_token(True) def set_gguf_parameters(self): - self.gguf_writer.add_name("T5") if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") n_ctx = 512 @@ -3181,7 +3237,6 @@ class JaisModel(Model): self._set_vocab_gpt2() def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -3227,8 +3282,8 @@ class JaisModel(Model): return tensors - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) @@ -3387,7 +3442,6 @@ class ChatGLMModel(Model): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_head_kv = self.hparams.get("multi_query_group_num", n_head) @@ -3539,6 +3593,10 @@ def parse_args() -> argparse.Namespace: 
"--no-tensor-first-split", action="store_true", help="do not add tensors to the first split (disabled by default)" ) + parser.add_argument( + "--metadata", type=Path, + help="Specify the path for an authorship metadata override file" + ) return parser.parse_args() @@ -3564,7 +3622,10 @@ def split_str_to_n_bytes(split_str: str) -> int: def main() -> None: args = parse_args() - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) dir_model = args.model @@ -3588,34 +3649,30 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile else: - # output in the same directory as the model by default - fname_out = dir_model / 'ggml-model-{ftype}.gguf' + fname_out = dir_model logger.info(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) with torch.inference_mode(): + output_type = ftype_map[args.outtype] + model_architecture = hparams["architectures"][0] + try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = Model.from_model_architecture(model_architecture) except NotImplementedError: - logger.error(f"Model {hparams['architectures'][0]} is not supported") + logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, - args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, + model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out, + is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, model_name=args.model_name, + split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split) - logger.info("Set model parameters") - model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) - model_instance.set_gguf_parameters() - - logger.info("Set model tokenizer") - model_instance.set_vocab() - - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index e4165ae2d..d5a2d925e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum): # TODO: this string has to exercise as much pre-tokenizer functionality as possible # will be updated with time - contributions welcome -chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' +CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? 
We\'Ve a\'lL' if len(sys.argv) == 2: token = sys.argv[1] @@ -91,6 +91,9 @@ models = [ {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, + {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, + {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, + {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, ] @@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path): response = sess.get(url, headers=headers) response.raise_for_status() os.makedirs(os.path.dirname(save_path), exist_ok=True) - with open(save_path, 'wb') as f: - f.write(response.content) + with open(save_path, 'wb') as downloaded_file: + downloaded_file.write(response.content) logger.info(f"File {save_path} downloaded successfully") @@ -159,7 +162,7 @@ for model in models: logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") continue # Skip to the next model if the tokenizer can't be loaded - chktok = tokenizer.encode(chktxt) + chktok = tokenizer.encode(CHK_TXT) chkhsh = sha256(str(chktok).encode()).hexdigest() logger.info(f"model: {name}") @@ -191,7 +194,7 @@ src_func = f""" # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = {repr(chktxt)} + chktxt = {repr(CHK_TXT)} chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -287,7 +290,7 @@ tests = [ "333333333", "Cửa Việt", # llama-bpe fails on this " discards", - chktxt, + CHK_TXT, ] # write the tests to ./models/ggml-vocab-{name}.gguf.inp diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 95ea831a5..7b00b4398 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -132,6 +132,10 @@ class Tensor: class GGMLModel: + + file_format: GGMLFormat + format_version: int + def __init__(self): self.hyperparameters = None self.vocab = None @@ -290,7 +294,7 @@ class GGMLToGGUF: if self.vocab_override is not None: vo = self.vocab_override logger.info('* Adding vocab item(s)') - for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): + for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): tokens.append(vbytes) scores.append(score) toktypes.append(ttype) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 4bb939d45..a88d0d4a9 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -251,6 +251,10 @@ def parse_args() -> argparse.Namespace: "--verbose", action="store_true", help="increase output verbosity", ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out what will be done, without writing any new files", + ) parser.add_argument( "--base", type=Path, required=True, help="directory containing base model file", @@ -286,7 +290,7 @@ if __name__ == '__main__': fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' + fname_out = dir_lora if os.path.exists(input_model): # lazy import load_file only if lora is in safetensors 
format. @@ -310,6 +314,23 @@ if __name__ == '__main__': class LoraModel(model_class): model_arch = model_class.model_arch + lora_alpha: float + + def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs): + + super().__init__(*args, **kwargs) + + self.dir_model_card = dir_lora_model + self.lora_alpha = float(lora_alpha) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) + self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + + def set_gguf_parameters(self): + self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) + super().set_gguf_parameters() + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -350,6 +371,11 @@ if __name__ == '__main__': yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + alpha: float = lparams["lora_alpha"] + model_instance = LoraModel( dir_base_model, ftype, @@ -357,18 +383,11 @@ if __name__ == '__main__': is_big_endian=args.bigendian, use_temp_file=False, eager=args.no_lazy, - model_name=None, + dry_run=args.dry_run, + dir_lora_model=dir_lora, + lora_alpha=alpha, ) - with open(lora_config, "r") as f: - lparams: dict[str, Any] = json.load(f) - - alpha = lparams["lora_alpha"] - - model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) - model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") - model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha)) - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() logger.info(f"Model successfully exported to {model_instance.fname_out}") diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 2442e954d..53fbfb0a8 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -31,7 +31,7 @@ int main(int argc, char ** argv) { int n_parallel = params.n_parallel; // total length of the sequences including the prompt - int n_predict = 32; + int n_predict = params.n_predict; // init LLM diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c2c73e8ad..9ab9ab06e 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -24,7 +24,7 @@ from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional +from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar import numpy as np @@ -346,42 +346,6 @@ class Params: return params -@dataclass -class Metadata: - name: Optional[str] = None - author: Optional[str] = None - version: Optional[str] = None - url: Optional[str] = None - description: Optional[str] = None - license: Optional[str] = None - source_url: Optional[str] = None - source_hf_repo: Optional[str] = None - - @staticmethod - def load(metadata_path: Path) -> Metadata: - if metadata_path is None or not metadata_path.exists(): - return Metadata() - - with open(metadata_path, 'r') as file: - data = json.load(file) - - # Create a new Metadata instance - metadata = Metadata() - - # Assigning values to Metadata attributes if they exist in the JSON file - # This is based on LLM_KV_NAMES mapping in llama.cpp - metadata.name = data.get("general.name") - 
metadata.author = data.get("general.author") - metadata.version = data.get("general.version") - metadata.url = data.get("general.url") - metadata.description = data.get("general.description") - metadata.license = data.get("general.license") - metadata.source_url = data.get("general.source.url") - metadata.source_hf_repo = data.get("general.source.huggingface.repository") - - return metadata - - # # data loading # TODO: reuse (probably move to gguf.py?) @@ -806,7 +770,7 @@ class OutputFile: def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - def add_meta_model(self, params: Params, metadata: Metadata | None) -> None: + def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None: # Metadata About The Model And Its Provenence name = "LLaMA" if metadata is not None and metadata.name is not None: @@ -824,16 +788,73 @@ class OutputFile: self.gguf.add_author(metadata.author) if metadata.version is not None: self.gguf.add_version(metadata.version) - if metadata.url is not None: - self.gguf.add_url(metadata.url) + if metadata.organization is not None: + self.gguf.add_organization(metadata.organization) + + if metadata.finetune is not None: + self.gguf.add_finetune(metadata.finetune) + if metadata.basename is not None: + self.gguf.add_basename(metadata.basename) + if metadata.description is not None: self.gguf.add_description(metadata.description) + if metadata.quantized_by is not None: + self.gguf.add_quantized_by(metadata.quantized_by) + + if metadata.size_label is not None: + self.gguf.add_size_label(metadata.size_label) + if metadata.license is not None: - self.gguf.add_licence(metadata.license) + self.gguf.add_license(metadata.license) + if metadata.license_name is not None: + self.gguf.add_license_name(metadata.license_name) + if metadata.license_link is not None: + self.gguf.add_license_link(metadata.license_link) + + if metadata.url is not None: + self.gguf.add_url(metadata.url) + if metadata.doi is not None: + self.gguf.add_doi(metadata.doi) + if metadata.uuid is not None: + self.gguf.add_uuid(metadata.uuid) + if metadata.repo_url is not None: + self.gguf.add_repo_url(metadata.repo_url) + if metadata.source_url is not None: self.gguf.add_source_url(metadata.source_url) - if metadata.source_hf_repo is not None: - self.gguf.add_source_hf_repo(metadata.source_hf_repo) + if metadata.source_doi is not None: + self.gguf.add_source_doi(metadata.source_doi) + if metadata.source_uuid is not None: + self.gguf.add_source_uuid(metadata.source_uuid) + if metadata.source_repo_url is not None: + self.gguf.add_source_repo_url(metadata.source_repo_url) + + if metadata.base_models is not None: + self.gguf.add_base_model_count(len(metadata.base_models)) + for key, base_model_entry in enumerate(metadata.base_models): + if "name" in base_model_entry: + self.gguf.add_base_model_name(key, base_model_entry["name"]) + if "author" in base_model_entry: + self.gguf.add_base_model_author(key, base_model_entry["author"]) + if "version" in base_model_entry: + self.gguf.add_base_model_version(key, base_model_entry["version"]) + if "organization" in base_model_entry: + self.gguf.add_base_model_organization(key, base_model_entry["organization"]) + if "url" in base_model_entry: + self.gguf.add_base_model_url(key, base_model_entry["url"]) + if "doi" in base_model_entry: + self.gguf.add_base_model_doi(key, base_model_entry["doi"]) + if "uuid" in base_model_entry: + 
self.gguf.add_base_model_uuid(key, base_model_entry["uuid"]) + if "repo_url" in base_model_entry: + self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) + + if metadata.tags is not None: + self.gguf.add_tags(metadata.tags) + if metadata.languages is not None: + self.gguf.add_languages(metadata.languages) + if metadata.datasets is not None: + self.gguf.add_datasets(metadata.datasets) def add_meta_arch(self, params: Params) -> None: # Metadata About The Neural Architecture Itself @@ -944,7 +965,7 @@ class OutputFile: @staticmethod def write_vocab_only( fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -978,7 +999,7 @@ class OutputFile: fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, - metadata: Metadata | None = None, + metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1021,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise ValueError(f"Unexpected combination of types: {name_to_type}") -def model_parameter_count(model: LazyModel) -> int: - total_model_parameters = 0 - for i, (name, lazy_tensor) in enumerate(model.items()): - sum_weights_in_tensor = 1 +def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]: + total_params = 0 + shared_params = 0 + expert_params = 0 + + for name, lazy_tensor in tensors: + # We don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + # Got A Tensor + sum_weights_in_tensor: int = 1 + + # Tensor Volume for dim in lazy_tensor.shape: sum_weights_in_tensor *= dim - total_model_parameters += sum_weights_in_tensor - return total_model_parameters + if ".experts." in name: + if ".experts.0." 
in name: + expert_params += sum_weights_in_tensor + else: + shared_params += sum_weights_in_tensor -def model_parameter_count_rounded_notation(model_params_count: int) -> str: - if model_params_count > 1e12 : - # Trillions Of Parameters - scaled_model_params = model_params_count * 1e-12 - scale_suffix = "T" - elif model_params_count > 1e9 : - # Billions Of Parameters - scaled_model_params = model_params_count * 1e-9 - scale_suffix = "B" - elif model_params_count > 1e6 : - # Millions Of Parameters - scaled_model_params = model_params_count * 1e-6 - scale_suffix = "M" - else: - # Thousands Of Parameters - scaled_model_params = model_params_count * 1e-3 - scale_suffix = "K" + total_params += sum_weights_in_tensor - return f"{round(scaled_model_params)}{scale_suffix}" + return total_params, shared_params, expert_params def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: @@ -1231,34 +1249,24 @@ class VocabFactory: return vocab, special_vocab -def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str: - quantization = { +def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str: + name = metadata.name if metadata.name is not None else None + basename = metadata.basename if metadata.basename is not None else None + finetune = metadata.finetune if metadata.finetune is not None else None + version = metadata.version if metadata.version is not None else None + size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0) + + output_type = { GGMLFileType.AllF32: "F32", GGMLFileType.MostlyF16: "F16", GGMLFileType.MostlyQ8_0: "Q8_0", }[file_type] - parameters = model_parameter_count_rounded_notation(model_params_count) - - expert_count = "" - if params.n_experts is not None: - expert_count = f"{params.n_experts}x" - - version = "" - if metadata is not None and metadata.version is not None: - version = f"-{metadata.version}" - - name = "ggml-model" - if metadata is not None and metadata.name is not None: - name = metadata.name - elif params.path_model is not None: - name = params.path_model.name - - return f"{name}{version}-{expert_count}{parameters}-{quantization}" + return gguf.naming_convention(name, basename, finetune, version, size_label, output_type) -def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path: - default_filename = default_convention_outfile(file_type, params, model_params_count, metadata) +def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path: + default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata) ret = model_paths[0].parent / f"{default_filename}.gguf" if ret in model_paths: logger.error( @@ -1297,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - parser.add_argument("--metadata", type=Path, help="Specify the path 
for a metadata file") + parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file") parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name") + parser.add_argument("--model-name", type=str, default=None, help="name of the model") args = parser.parse_args(args_in) @@ -1310,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None: else: logging.basicConfig(level=logging.INFO) - metadata = Metadata.load(args.metadata) + model_name = args.model_name + dir_model = args.model + + metadata = gguf.Metadata.load(args.metadata, dir_model, model_name) if args.get_outfile: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) params = Params.load(model_plus) - model = convert_model_names(model_plus.model, params, args.skip_unknown) - model_params_count = model_parameter_count(model_plus.model) - ftype = pick_output_type(model, args.outtype) - print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100 + model = convert_model_names(model_plus.model, params, args.skip_unknown) + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + ftype = pick_output_type(model, args.outtype) + + if (metadata is None or metadata.name is None) and params.path_model is not None: + metadata.name = params.path_model.name + + print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100 return if args.no_vocab and args.vocab_only: raise ValueError("--vocab-only does not make sense with --no-vocab") if args.dump_single: - model_plus = lazy_load_file(args.model) + model_plus = lazy_load_file(dir_model) do_dump_model(model_plus) return if not args.vocab_only: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) else: - model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) - - model_params_count = model_parameter_count(model_plus.model) - logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})") + model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None) if args.dump: do_dump_model(model_plus) @@ -1368,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None: logger.info(f"params = {params}") model_parent_path = model_plus.paths[0].parent - vocab_path = Path(args.vocab_dir or args.model or model_parent_path) + vocab_path = Path(args.vocab_dir or dir_model or model_parent_path) vocab_factory = VocabFactory(vocab_path) vocab_types = None if args.no_vocab else args.vocab_type.split(",") vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) @@ -1399,13 +1412,21 @@ def main(args_in: list[str] | None = None) -> None: assert params is not None + if metadata.name is None and params.path_model is not None: + metadata.name = params.path_model.name + + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})") + logger.info(f"Vocab info: {vocab}") logger.info(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, 
ftype, params, model_params_count, metadata) + outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata) + + metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0) params.ftype = ftype logger.info(f"Writing {outfile}, format {ftype}") diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index dd53ba9b1..48a705e15 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st auto decoded = decode_utf8(input_str, {}); const auto & code_points = decoded.first; + const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); + llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); + size_t pos = 0; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - auto prev_stacks = grammar->stacks; - llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); - if (grammar->stacks.empty()) { + const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy + + llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); + + if (cur_stacks.empty()) { error_pos = pos; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; - grammar->stacks = prev_stacks; + cur_stacks = prev_stacks; return false; } ++pos; } - for (const auto & stack : grammar->stacks) { + for (const auto & stack : cur_stacks) { if (stack.empty()) { return true; } diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 575143771..7498f85ef 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) { struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); + if (!ctx) { + fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str()); + return false; + } + printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d641a9f12..a6497b6e0 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -23,6 +23,10 @@ #include "ggml-cuda.h" #include "ggml-sycl.h" +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -120,6 +124,17 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_CANN + uint32_t count = ggml_backend_cann_get_device_count(); + for (uint32_t i = 0; i < count; i++) { + char buf[128]; + ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } #endif // TODO: other backends return id; diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 92a6b16b1..2aafe2316 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - 
return env->NewStringUTF(""); + return nullptr; } auto new_token_chars = llama_token_to_piece(context, new_token_id); diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 2a3f9f758..58c32ca53 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -26,11 +26,12 @@ actor LlamaContext { private var context: OpaquePointer private var batch: llama_batch private var tokens_list: [llama_token] + var is_done: Bool = false /// This variable is used to store temporarily invalid cchars private var temporary_invalid_cchars: [CChar] - var n_len: Int32 = 64 + var n_len: Int32 = 1024 var n_cur: Int32 = 0 var n_decode: Int32 = 0 @@ -160,6 +161,7 @@ actor LlamaContext { if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") + is_done = true let new_token_str = String(cString: temporary_invalid_cchars + [0]) temporary_invalid_cchars.removeAll() return new_token_str diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 2c1e3f61b..b8f6a31d5 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -132,7 +132,7 @@ class LlamaState: ObservableObject { messageLog += "\(text)" Task.detached { - while await llamaContext.n_cur < llamaContext.n_len { + while await !llamaContext.is_done { let result = await llamaContext.completion_loop() await MainActor.run { self.messageLog += "\(result)" diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d6882eec3..d23e282fb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -16,6 +16,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: CLIP using Metal backend\n", __func__); #endif +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + LOG_TEE("%s: CLIP using CANN backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 0b171c872..2fe67100e 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -31,7 +31,6 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); - GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt std::vector inp; @@ -65,7 +64,7 @@ int main(int argc, char ** argv){ } const int n_input = inp.size(); - const int n_ctx = params.n_ctx; + const int n_ctx = llama_n_ctx(ctx); int n_drafted = 0; int n_accept = 0; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 80ecd925d..bb571bac4 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -39,7 +39,6 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); - GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt std::vector inp; diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py old mode 100644 new mode 100755 index 504ed98df..eb000d5cc --- a/examples/pydantic_models_to_grammar_examples.py +++ 
b/examples/pydantic_models_to_grammar_examples.py @@ -1,8 +1,15 @@ -# Function calling example using pydantic models. +#!/usr/bin/env python3 + +"""Function calling example using pydantic models.""" + from __future__ import annotations +import argparse import datetime import json +import logging +import textwrap +import sys from enum import Enum from typing import Optional, Union @@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) -# Function to get completion on the llama.cpp server with grammar. -def create_completion(prompt, grammar): +def create_completion(host, prompt, gbnf_grammar): + """Calls the /completion API on llama-server. + + See + https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints + """ + print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}") headers = {"Content-Type": "application/json"} - data = {"prompt": prompt, "grammar": grammar} - - response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data) - data = response.json() - + data = {"prompt": prompt, "grammar": gbnf_grammar} + result = requests.post(f"http://{host}/completion", headers=headers, json=data).json() - assert data.get("error") is None, data + assert result.get("error") is None, result - - print(data["content"]) - return data["content"] + logging.info("Result: %s", result) + content = result["content"] + print(f" Model: {result['model']}") + print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}") + return content # A function for the agent to send a message to the user. class SendMessageToUser(BaseModel): - """ - Send a message to the User. - """ + """Send a message to the User.""" chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") message: str = Field(..., description="Message you want to send to the user.") def run(self): - print(self.message) + print(f"SendMessageToUser: {self.message}") + + +def example_rce(host): + """Minimal test case where the LLM calls an arbitrary python function.""" + print("- example_rce") + tools = [SendMessageToUser] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + user_message = "What is 42 * 42?" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + tools_map = {tool.__name__:tool for tool in tools} + # This finds "SendMessageToUser": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + tool(**json_data["function_parameters"]).run() + return 0 # Enum for the calculator tool. @@ -46,11 +77,11 @@ class MathOperation(Enum): DIVIDE = "divide" -# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. +# Simple pydantic calculator tool for the agent that can add, subtract, + # multiply, and divide.
Docstring and description of fields will be used in +# system prompt. class Calculator(BaseModel): - """ - Perform a math operation on two numbers. - """ + """Perform a math operation on two numbers.""" number_one: Union[int, float] = Field(..., description="First number.") operation: MathOperation = Field(..., description="Math operation to perform.") number_two: Union[int, float] = Field(..., description="Second number.") @@ -68,55 +99,61 @@ class Calculator(BaseModel): raise ValueError("Unknown operation.") -# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. -# pydantic_model_list is the list of pydanitc models -# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated -# outer_object_content is the name of outer object content. -# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") -# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( - pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", - outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") +def example_calculator(host): + """Have the LLM ask to get a calculation done. -print(gbnf_grammar) -print(documentation) + Here the grammar gets generated by passing the available function models to + generate_gbnf_grammar_and_documentation function. This also generates a + documentation usable by the LLM. -system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + pydantic_model_list is the list of pydantic models outer_object_name is an + optional name for an outer object around the actual model object. Like a + "function" object with "function_parameters" which contains the actual model + object. If None, no outer object will be generated outer_object_content is + the name of outer object content. -user_message = "What is 42 * 42?" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) -# This should output something like this: -# { -# "function": "calculator", -# "function_parameters": { -# "number_one": 42, -# "operation": "multiply", -# "number_two": 42 -# } -# } -function_dictionary = json.loads(text) -if function_dictionary["function"] == "calculator": - function_parameters = {**function_dictionary["function_parameters"]} - - print(Calculator(**function_parameters).run()) - # This should output: 1764 + model_prefix is the optional prefix for models in the documentation. (Default="Output Model") + fields_prefix is the prefix for the model fields in the documentation. 
(Default="Output Fields") + """ + print("- example_calculator") + tools = [SendMessageToUser, Calculator] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + user_message1 = "What is 42 * 42?" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + expected = { + "function": "Calculator", + "function_parameters": { + "number_one": 42, + "operation": "multiply", + "number_two": 42 + } + } + if json_data != expected: + print(" Result is not as expected!") + tools_map = {tool.__name__:tool for tool in tools} + # This finds "Calculator": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + result = tool(**json_data["function_parameters"]).run() + print(f" Call {json_data['function']} gave result {result}") + return 0 -# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text. class Category(Enum): - """ - The category of the book. - """ + """The category of the book.""" Fiction = "Fiction" NonFiction = "Non-Fiction" class Book(BaseModel): - """ - Represents an entry about a book. - """ + """Represents an entry about a book.""" title: str = Field(..., description="Title of the book.") author: str = Field(..., description="Author of the book.") published_year: Optional[int] = Field(..., description="Publishing year of the book.") @@ -125,33 +162,42 @@ class Book(BaseModel): summary: str = Field(..., description="Summary of the book.") -# We need no additional parameters other than our list of pydantic models. -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) +def example_struct(host): + """A example structured output based on pydantic models. -system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + The LLM will create an entry for a Book database out of an unstructured + text. We need no additional parameters other than our list of pydantic + models. + """ + print("- example_struct") + tools = [Book] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools) + system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. 
Leighton, and Matthew Sands.""" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + # In this case, there's no function nor function_parameters. + # Here the result will vary based on the LLM used. + keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"]) + if keys != sorted(json_data.keys()): + print(f"Unexpected result: {sorted(json_data.keys())}") + return 1 + book = Book(**json_data) + print(f" As a Book object: %s" % book) + return 0 -text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands.""" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) - -json_data = json.loads(text) - -print(Book(**json_data)) -# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition. def get_current_datetime(output_format: Optional[str] = None): - """ - Get the current date and time in the given format. + """Get the current date and time in the given format. + Args: output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S' """ - if output_format is None: - output_format = '%Y-%m-%d %H:%M:%S' - return datetime.datetime.now().strftime(output_format) + return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S") -# Example function to get the weather +# Example function to get the weather. def get_current_weather(location, unit): """Get the current weather in a given location""" if "London" in location: @@ -160,68 +206,107 @@ def get_current_weather(location, unit): return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value}) elif "North Pole" in location: return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value}) - else: - return json.dumps({"location": location, "temperature": "unknown"}) + return json.dumps({"location": location, "temperature": "unknown"}) -# Here is a function definition in OpenAI style -current_weather_tool = { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", +def example_concurrent(host): + """An example for parallel function calling with a Python function, a pydantic + function model and an OpenAI like function definition. + """ + print("- example_concurrent") + # Function definition in OpenAI style. + current_weather_tool = { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, }, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + "required": ["location"], }, - "required": ["location"], }, - }, -} + } + # Convert OpenAI function definition into pydantic model. + current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) + # Add the actual function to a pydantic model. + current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) -# Convert OpenAI function definition into pydantic model -current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) -# Add the actual function to a pydantic model -current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) + # Convert normal Python function to a pydantic model. + current_datetime_model = create_dynamic_model_from_function(get_current_datetime) -# Convert normal Python function to a pydantic model -current_datetime_model = create_dynamic_model_from_function(get_current_datetime) - -tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] + tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) + system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation + text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + expected = [ + { + "function": "get_current_datetime", + "params": { + "output_format": "%Y-%m-%d %H:%M:%S" + } + }, + { + "function": "get_current_weather", + "params": { + "location": "London", + "unit": "celsius" + } + }, + { + "function": "Calculator", + "params": { + "number_one": 42, + "operation": "multiply", + "number_two": 42 + } + } + ] + res = 0 + if json_data != expected: + print(" Result is not as expected!") + print(" This can happen on highly quantized models") + res = 1 + tools_map = {tool.__name__:tool for tool in tools} + for call in json_data: + tool = tools_map.get(call["function"]) + if not tool: + print(f"Error: unknown tool {call['function']}") + return 1 + result = tool(**call["params"]).run() + print(f" Call {call['function']} returned {result}") + # Should output something like this: + # Call get_current_datetime returned 2024-07-15 09:50:38 + # Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"} + # Call Calculator returned 1764 + return res -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( - pydantic_model_list=tool_list, outer_object_name="function", - outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) - -system_message = "You are an advanced AI assistant. 
You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation +def main(): + parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__) + parser.add_argument("--host", default="localhost:8080", help="llama.cpp server") + parser.add_argument("-v", "--verbose", action="store_true", help="enables logging") + args = parser.parse_args() + logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR) + ret = 0 + # Comment out below to only run the example you want. + ret = ret or example_rce(args.host) + ret = ret or example_calculator(args.host) + ret = ret or example_struct(args.host) + ret = ret or example_concurrent(args.host) + return ret -text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) - -json_data = json.loads(text) - -print(json_data) -# Should output something like this: -# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}] - - -for call in json_data: - if call["function"] == "Calculator": - print(Calculator(**call["params"]).run()) - elif call["function"] == "get_current_datetime": - print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] - elif call["function"] == "get_current_weather": - print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] -# Should output something like this: -# 2024-01-14 13:36:06 -# {"location": "London", "temperature": "42", "unit": "celsius"} -# 1764 +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/server/README.md b/examples/server/README.md index e477d1501..ff4074517 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -444,7 +444,7 @@ node index.js `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. - `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. + `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. 
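For readers of the `n_predict` / `n_keep` documentation updated in the README hunk above, here is a minimal request sketch. It assumes a `llama-server` instance already listening on `localhost:8080` with a model loaded; the prompt and parameter values are illustrative only and are not part of this patch.

```python
#!/usr/bin/env python3
"""Minimal /completion request illustrating n_predict, n_keep and stream (illustrative sketch)."""
import requests

HOST = "localhost:8080"  # assumed llama-server address; adjust as needed

data = {
    # The prompt is evaluated into the cache; generation starts after it.
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 128,   # stop after at most 128 generated tokens (-1 = no limit)
    "n_keep": -1,       # on context overflow, retain the whole prompt (BOS excluded)
    "stream": False,    # single JSON response instead of server-sent events
}

result = requests.post(f"http://{HOST}/completion",
                       headers={"Content-Type": "application/json"},
                       json=data).json()
print(result["content"])
```

With `stream` set to `true` the server instead emits server-sent events, which is what the `completion.js` helper changed below consumes.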
diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index 987b9a3b8..36818f764 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -21,7 +21,7 @@ let generation_settings = null; // export async function* llama(prompt, params = {}, config = {}) { let controller = config.controller; - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; if (!controller) { controller = new AbortController(); @@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => { // Get the model info from the server. This is useful for getting the context window and so on. export const llamaModelInfo = async (config = {}) => { if (!generation_settings) { - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; const props = await fetch(`${api_url}/props`).then(r => r.json()); generation_settings = props.default_generation_settings; } diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index 5513e9121..c87dd8f1e 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -14,10 +14,10 @@