llama.cpp/convert-persimmon-to-gguf.py

#!/usr/bin/env python3
import torch
import os
from pprint import pprint
import sys
import argparse
from pathlib import Path
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(dct[key], torch.Tensor):
            tensors[new_prefix] = dct[key]
        elif isinstance(dct[key], dict):
            _flatten_dict(dct[key], tensors, new_prefix)
        else:
            raise ValueError(type(dct[key]))
    return None


def _get_sentencepiece_tokenizer_info(dir_model: Path):
    """Read the vocabulary from ``adept_vocab.model`` inside ``dir_model``.

    Args:
        dir_model: Directory that contains the ``adept_vocab.model``
            SentencePiece file.

    Returns:
        A ``(tokens, scores, toktypes)`` tuple of three lists, each with one
        entry per token id: the UTF-8 encoded piece, its score, and an
        integer token-type code.
    """
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
    print('gguf: adding tokens')

    # Applied in order, so a later matching check overrides an earlier one.
    # NOTE(review): the numeric codes presumably match gguf's token-type
    # values -- confirm against gguf-py before changing them.
    type_checks = (
        (tokenizer.is_unknown, 2),
        (tokenizer.is_control, 3),
        (tokenizer.is_unused, 5),
        (tokenizer.is_byte, 6),
    )

    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []

    for token_id in range(tokenizer.vocab_size()):
        encoded_piece = tokenizer.id_to_piece(token_id).encode("utf-8")
        piece_score = tokenizer.get_score(token_id)

        type_code = 1  # used when none of the checks below match
        for matches, code in type_checks:
            if matches(token_id):
                type_code = code

        tokens.append(encoded_piece)
        scores.append(piece_score)
        toktypes.append(type_code)

    return tokens, scores, toktypes


def main():
    """Convert an Adept Persimmon checkpoint into a GGUF file.

    Command-line arguments:
        --outfile: Destination path.  When omitted, the checkpoint path with
            its suffix replaced by ``.gguf`` is used.
        --ckpt-path: Persimmon checkpoint ``.pt`` file (required).
        --model-dir: Directory holding ``adept_vocab.model`` (required).
        --adept-inference-dir: Optional directory appended to ``sys.path``
            before the checkpoint is loaded.

    The checkpoint must provide an ``'args'`` entry (hyper-parameters read by
    attribute) and a ``'model'`` entry (nested dict of tensors).  All tensors
    except the rotary ``inv_freq`` buffers are written as float32.

    Exits with a non-zero status if a tensor name cannot be mapped to a GGUF
    tensor name.
    """
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
    # Both inputs are mandatory: without them the conversion cannot start, so
    # let argparse report the omission instead of failing with a traceback.
    parser.add_argument("--ckpt-path",           type=Path, required=True, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir",           type=Path, required=True, help="directory containing model e.g. 8b_chat_model_release")
    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
    args = parser.parse_args()

    # Honour the documented "default: based on input" for --outfile.
    outfile = args.outfile if args.outfile is not None else args.ckpt_path.with_suffix('.gguf')

    # NOTE(review): presumably the checkpoint pickle references classes from
    # the adept-inference package, which is why it must be importable here.
    if args.adept_inference_dir is not None:
        sys.path.append(str(args.adept_inference_dir))

    # SECURITY: torch.load unpickles the file and can therefore execute
    # arbitrary code; only convert checkpoints from a trusted source.
    # map_location="cpu" keeps GPU-saved tensors convertible with .numpy().
    persimmon_model = torch.load(args.ckpt_path, map_location="cpu")
    hparams = persimmon_model['args']
    pprint(hparams)
    tensors = {}
    _flatten_dict(persimmon_model['model'], tensors, None)

    arch = gguf.MODEL_ARCH.PERSIMMON
    gguf_writer = gguf.GGUFWriter(outfile, gguf.MODEL_ARCH_NAMES[arch])
    try:
        block_count = hparams.num_layers
        head_count = hparams.num_attention_heads
        head_count_kv = head_count
        ctx_length = hparams.seq_length
        hidden_size = hparams.hidden_size

        gguf_writer.add_name('persimmon-8b-chat')
        gguf_writer.add_context_length(ctx_length)
        gguf_writer.add_embedding_length(hidden_size)
        gguf_writer.add_block_count(block_count)
        gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
        gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        gguf_writer.add_head_count(head_count)
        gguf_writer.add_head_count_kv(head_count_kv)
        gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
        gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)

        tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
        gguf_writer.add_tokenizer_model('llama')
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        gguf_writer.add_token_types(toktypes)
        # NOTE(review): BOS and EOS share one hard-coded id; confirm it
        # matches the vocabulary being converted.
        gguf_writer.add_bos_token_id(71013)
        gguf_writer.add_eos_token_id(71013)

        tensor_map = gguf.get_tensor_name_map(arch, block_count)
        print(tensor_map)
        for name, tensor in tensors.items():
            # The rotary inverse-frequency buffers are not exported.
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = tensor.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = tensor.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            if new_name is None:
                # A string argument makes sys.exit print to stderr and use
                # exit status 1, so callers can detect the failed conversion.
                sys.exit(f"Can not map tensor '{name}'")
            print(f"{new_name}, n_dims = {data.ndim}, {old_dtype} --> {data.dtype}")
            gguf_writer.add_tensor(new_name, data)

        print("gguf: write header")
        gguf_writer.write_header_to_file()
        print("gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        print("gguf: write tensors")
        gguf_writer.write_tensors_to_file()
    finally:
        # Close the writer on failure as well as on success.
        gguf_writer.close()

    print(f"gguf: model successfully exported to '{outfile}'")
    print("")


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
python : add check-requirements.sh and GitHub workflow (#4585) * python: add check-requirements.sh and GitHub workflow This script and workflow forces package versions to remain compatible across all convert.py scripts, while allowing secondary convert scripts to import dependencies not wanted in convert.py. Move requirements into ./requirements * Fail on "==" being used for package requirements (but can be suppressed) * Enforce "compatible release" syntax instead of == * Update workflow * Add upper version bound for transformers and protobuf * improve check-requirements.sh * small syntax change * don't remove venvs if nocleanup is passed * See if this fixes docker workflow * Move check-requirements.sh into ./scripts/ --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai> 2023-12-29 15:50:29 +01:00			`#!/usr/bin/env python3`
llm : support Adept Persimmon 8B (#3410) * Produces garbage output * wip: correct tensors up to RoPE * correct tensors thru RoPE * Correct outputs through masked & softmax'd KQ * fp32 works * Rename adept->persimmon * Produces correct outputs * clean up convert scripts * remove printing logic from ggml.c * remove prints from llama.cpp & fix merge * trivial cleanups * Add offload funcs * update conversion script to directly take adept artifacts rather than .saftensors file * Fix norm eps bug * Support sqr and concat on metal, persimmon-8b-q4 runs correctly * Small changes from review * Formatting changes * Minor changes to conversion script * Remove old script * Fix editorconfig formatting * Fix build * add overlooked offload code ggml-ci 2023-10-07 09:12:43 +02:00			`import torch`
			`import os`
			`from pprint import pprint`
			`import sys`
			`import argparse`
			`from pathlib import Path`
			`from sentencepiece import SentencePieceProcessor`
			`if 'NO_LOCAL_GGUF' not in os.environ:`
gguf-py: Refactor and allow reading/modifying existing GGUF files (#3981) * gguf-py: Refactor and add file reading support * Replay changes from #3871 Credit to @cebtenzzre for that pull * Various type annotation fixes. * sort imports with isort (again) * Fix missing return statement in add_tensor * style cleanup with flake8 * fix NamedTuple and Enum usage * Fix an issue with state init in GGUFReader Move examples to an examples/ directory Clean up examples Add an example of modifying keys in a GGUF file Update documentation with info on examples Try to support people importing gguf/gguf.py directly * Damagage is not a word. * Clean up gguf-py/examples/modify_gguf.py whitespace Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Update gguf-py/examples/modify_gguf.py formatting Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Update gguf-py/gguf/gguf_reader.py type hint Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Make examples executable, formatting changes * Add more information to GGUFReader and examples comments * Include a gguf Python package version bump * Add convert-gguf-endian.py script * cleanup * gguf-py : bump minor version * Reorganize scripts * Make GGUFReader endian detection less arbitrary * Add JSON dumping support to gguf-dump.py Which I kind of regret now * A few for gguf-dump.py cleanups * Murder accidental tuple in gguf-py/scripts/gguf-dump.py Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * cleanup * constants : remove unneeded type annotations * fix python 3.8 compat * Set up gguf- scripts in pyproject.toml * And include scripts/__init__.py, derp * convert.py: We can't currently support Q8_0 on big endian. 
* gguf-py: SpecialVocab: Always try available sources for special token ids gguf-py: SpecialVocab: Try to load merges from merges.txt if not in tokenizer.json gguf-py: SpecialVocab: Add 'add_bos_token' type bools to GGUF metadata u * cleanup * Promote add_X_token to GGUF metadata for BOS and EOS --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai> Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> 2023-11-11 06:04:50 +01:00			`sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))`
llm : support Adept Persimmon 8B (#3410) * Produces garbage output * wip: correct tensors up to RoPE * correct tensors thru RoPE * Correct outputs through masked & softmax'd KQ * fp32 works * Rename adept->persimmon * Produces correct outputs * clean up convert scripts * remove printing logic from ggml.c * remove prints from llama.cpp & fix merge * trivial cleanups * Add offload funcs * update conversion script to directly take adept artifacts rather than .saftensors file * Fix norm eps bug * Support sqr and concat on metal, persimmon-8b-q4 runs correctly * Small changes from review * Formatting changes * Minor changes to conversion script * Remove old script * Fix editorconfig formatting * Fix build * add overlooked offload code ggml-ci 2023-10-07 09:12:43 +02:00			`import gguf`

ci : add flake8 to github actions (python linting) (#4129) Disabled rules: * E203 Whitespace before ':' - disabled because we often use 'C' Style where values are aligned * E211 Whitespace before '(' (E211) - disabled because we often use 'C' Style where values are aligned * E221 Multiple spaces before operator - disabled because we often use 'C' Style where values are aligned * E225 Missing whitespace around operator - disabled because it's broken so often it seems like a standard * E231 Missing whitespace after ',', ';', or ':' - disabled because we often use 'C' Style where values are aligned * E241 Multiple spaces after ',' - disabled because we often use 'C' Style where values are aligned * E251 Unexpected spaces around keyword / parameter equals - disabled because it's broken so often it seems like a standard * E261 At least two spaces before inline comment - disabled because it's broken so often it seems like a standard * E266 Too many leading '#' for block comment - sometimes used as "section" separator * E501 Line too long - disabled because it's broken so often it seems like a standard * E701 Multiple statements on one line (colon) - broken only in convert.py when defining abstract methods (we can use# noqa instead) * E704 Multiple statements on one line - broken only in convert.py when defining abstract methods (we can use# noqa instead) 2023-11-20 11:35:47 +01:00
llm : support Adept Persimmon 8B (#3410) * Produces garbage output * wip: correct tensors up to RoPE * correct tensors thru RoPE * Correct outputs through masked & softmax'd KQ * fp32 works * Rename adept->persimmon * Produces correct outputs * clean up convert scripts * remove printing logic from ggml.c * remove prints from llama.cpp & fix merge * trivial cleanups * Add offload funcs * update conversion script to directly take adept artifacts rather than .saftensors file * Fix norm eps bug * Support sqr and concat on metal, persimmon-8b-q4 runs correctly * Small changes from review * Formatting changes * Minor changes to conversion script * Remove old script * Fix editorconfig formatting * Fix build * add overlooked offload code ggml-ci 2023-10-07 09:12:43 +02:00			`def _flatten_dict(dct, tensors, prefix=None):`
			`assert isinstance(dct, dict)`
			`for key in dct.keys():`
			`new_prefix = prefix + '.' + key if prefix is not None else key`
			`if isinstance(dct[key], torch.Tensor):`
			`tensors[new_prefix] = dct[key]`
			`elif isinstance(dct[key], dict):`
			`_flatten_dict(dct[key], tensors, new_prefix)`
			`else:`
			`raise ValueError(type(dct[key]))`
			`return None`

ci : add flake8 to github actions (python linting) (#4129) Disabled rules: * E203 Whitespace before ':' - disabled because we often use 'C' Style where values are aligned * E211 Whitespace before '(' (E211) - disabled because we often use 'C' Style where values are aligned * E221 Multiple spaces before operator - disabled because we often use 'C' Style where values are aligned * E225 Missing whitespace around operator - disabled because it's broken so often it seems like a standard * E231 Missing whitespace after ',', ';', or ':' - disabled because we often use 'C' Style where values are aligned * E241 Multiple spaces after ',' - disabled because we often use 'C' Style where values are aligned * E251 Unexpected spaces around keyword / parameter equals - disabled because it's broken so often it seems like a standard * E261 At least two spaces before inline comment - disabled because it's broken so often it seems like a standard * E266 Too many leading '#' for block comment - sometimes used as "section" separator * E501 Line too long - disabled because it's broken so often it seems like a standard * E701 Multiple statements on one line (colon) - broken only in convert.py when defining abstract methods (we can use# noqa instead) * E704 Multiple statements on one line - broken only in convert.py when defining abstract methods (we can use# noqa instead) 2023-11-20 11:35:47 +01:00
llm : support Adept Persimmon 8B (#3410) * Produces garbage output * wip: correct tensors up to RoPE * correct tensors thru RoPE * Correct outputs through masked & softmax'd KQ * fp32 works * Rename adept->persimmon * Produces correct outputs * clean up convert scripts * remove printing logic from ggml.c * remove prints from llama.cpp & fix merge * trivial cleanups * Add offload funcs * update conversion script to directly take adept artifacts rather than .saftensors file * Fix norm eps bug * Support sqr and concat on metal, persimmon-8b-q4 runs correctly * Small changes from review * Formatting changes * Minor changes to conversion script * Remove old script * Fix editorconfig formatting * Fix build * add overlooked offload code ggml-ci 2023-10-07 09:12:43 +02:00			`def _get_sentencepiece_tokenizer_info(dir_model: Path):`
			`tokenizer_path = dir_model / 'adept_vocab.model'`
			`print('gguf: getting sentencepiece tokenizer from', tokenizer_path)`
			`tokenizer = SentencePieceProcessor(str(tokenizer_path))`
			`print('gguf: adding tokens')`
			`tokens: list[bytes] = []`
			`scores: list[float] = []`
			`toktypes: list[int] = []`

			`for i in range(tokenizer.vocab_size()):`
			`text: bytes`
			`score: float`

			`piece = tokenizer.id_to_piece(i)`
			`text = piece.encode("utf-8")`
			`score = tokenizer.get_score(i)`

			`toktype = 1`
			`if tokenizer.is_unknown(i):`
			`toktype = 2`
			`if tokenizer.is_control(i):`
			`toktype = 3`
			`if tokenizer.is_unused(i):`
			`toktype = 5`
			`if tokenizer.is_byte(i):`
			`toktype = 6`

			`tokens.append(text)`
			`scores.append(score)`
			`toktypes.append(toktype)`
			`pass`
			`return tokens, scores, toktypes`

ci : add flake8 to github actions (python linting) (#4129) Disabled rules: * E203 Whitespace before ':' - disabled because we often use 'C' Style where values are aligned * E211 Whitespace before '(' (E211) - disabled because we often use 'C' Style where values are aligned * E221 Multiple spaces before operator - disabled because we often use 'C' Style where values are aligned * E225 Missing whitespace around operator - disabled because it's broken so often it seems like a standard * E231 Missing whitespace after ',', ';', or ':' - disabled because we often use 'C' Style where values are aligned * E241 Multiple spaces after ',' - disabled because we often use 'C' Style where values are aligned * E251 Unexpected spaces around keyword / parameter equals - disabled because it's broken so often it seems like a standard * E261 At least two spaces before inline comment - disabled because it's broken so often it seems like a standard * E266 Too many leading '#' for block comment - sometimes used as "section" separator * E501 Line too long - disabled because it's broken so often it seems like a standard * E701 Multiple statements on one line (colon) - broken only in convert.py when defining abstract methods (we can use# noqa instead) * E704 Multiple statements on one line - broken only in convert.py when defining abstract methods (we can use# noqa instead) 2023-11-20 11:35:47 +01:00
llm : support Adept Persimmon 8B (#3410) * Produces garbage output * wip: correct tensors up to RoPE * correct tensors thru RoPE * Correct outputs through masked & softmax'd KQ * fp32 works * Rename adept->persimmon * Produces correct outputs * clean up convert scripts * remove printing logic from ggml.c * remove prints from llama.cpp & fix merge * trivial cleanups * Add offload funcs * update conversion script to directly take adept artifacts rather than .saftensors file * Fix norm eps bug * Support sqr and concat on metal, persimmon-8b-q4 runs correctly * Small changes from review * Formatting changes * Minor changes to conversion script * Remove old script * Fix editorconfig formatting * Fix build * add overlooked offload code ggml-ci 2023-10-07 09:12:43 +02:00			`def main():`
			`parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")`
			`parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")`
			`parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")`
			`parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")`
			`parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")`
			`args = parser.parse_args()`
			`sys.path.append(str(args.adept_inference_dir))`
			`persimmon_model = torch.load(args.ckpt_path)`
			`hparams = persimmon_model['args']`
			`pprint(hparams)`
			`tensors = {}`
			`_flatten_dict(persimmon_model['model'], tensors, None)`

			`arch = gguf.MODEL_ARCH.PERSIMMON`
			`gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])`

			`block_count = hparams.num_layers`
			`head_count = hparams.num_attention_heads`
			`head_count_kv = head_count`
			`ctx_length = hparams.seq_length`
			`hidden_size = hparams.hidden_size`

			`gguf_writer.add_name('persimmon-8b-chat')`
			`gguf_writer.add_context_length(ctx_length)`
			`gguf_writer.add_embedding_length(hidden_size)`
			`gguf_writer.add_block_count(block_count)`
			`gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)`
			`gguf_writer.add_rope_dimension_count(hidden_size // head_count)`
			`gguf_writer.add_head_count(head_count)`
			`gguf_writer.add_head_count_kv(head_count_kv)`
			`gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)`
			`gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)`

			`tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)`
			`gguf_writer.add_tokenizer_model('llama')`
			`gguf_writer.add_token_list(tokens)`
			`gguf_writer.add_token_scores(scores)`
			`gguf_writer.add_token_types(toktypes)`
			`gguf_writer.add_bos_token_id(71013)`
			`gguf_writer.add_eos_token_id(71013)`

			`tensor_map = gguf.get_tensor_name_map(arch, block_count)`
			`print(tensor_map)`
			`for name in tensors.keys():`
			`data = tensors[name]`
			`if name.endswith(".self_attention.rotary_emb.inv_freq"):`
			`continue`
			`old_dtype = data.dtype`
			`# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)`
			`data = data.to(torch.float32).squeeze().numpy()`
			`new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))`
			`if new_name is None:`
			`print("Can not map tensor '" + name + "'")`
			`sys.exit()`
			`n_dims = len(data.shape)`
			`print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))`
			`gguf_writer.add_tensor(new_name, data)`
			`print("gguf: write header")`
			`gguf_writer.write_header_to_file()`
			`print("gguf: write metadata")`
			`gguf_writer.write_kv_data_to_file()`
			`print("gguf: write tensors")`
			`gguf_writer.write_tensors_to_file()`

			`gguf_writer.close()`

			`print(f"gguf: model successfully exported to '{args.outfile}'")`
			`print("")`


			`if __name__ == '__main__':`
			`main()`