py : new conversion script (#545)

Current status: Working, except for the latest GPTQ-for-LLaMa format that includes `g_idx`. This turns out to require changes to GGML, so for now it only works if you use the `--outtype` option to dequantize it back to f16 (which is pointless except for debugging). I also included some cleanup for the C++ code. This script is meant to replace all the existing conversion scripts (including the ones that convert from older GGML formats), while also adding support for some new formats. Specifically, I've tested with: - [x] `LLaMA` (original) - [x] `llama-65b-4bit` - [x] `alpaca-native` - [x] `alpaca-native-4bit` - [x] LLaMA converted to 'transformers' format using `convert_llama_weights_to_hf.py` - [x] `alpaca-native` quantized with `--true-sequential --act-order --groupsize 128` (dequantized only) - [x] same as above plus `--save_safetensors` - [x] GPT4All - [x] stock unversioned ggml - [x] ggmh There's enough overlap in the logic needed to handle these different cases that it seemed best to move to a single script. I haven't tried this with Alpaca-LoRA because I don't know where to find it. Useful features: - Uses multiple threads for a speedup in some cases (though the Python GIL limits the gain, and sometimes it's disk-bound anyway). - Combines split models into a single file (both the intra-tensor split of the original and the inter-tensor split of 'transformers' format files). Single files are more convenient to work with and more friendly to future changes to use memory mapping on the C++ side. To accomplish this without increasing memory requirements, it has some custom loading code which avoids loading whole input files into memory at once. - Because of the custom loading code, it no longer depends on PyTorch, which might make installing dependencies slightly easier or faster... although it still depends on NumPy and sentencepiece, so I don't know if there's any meaningful difference. 
In any case, I also added a requirements.txt file to lock the dependency versions in case of any future breaking changes. - Type annotations checked with mypy. - Some attempts to be extra user-friendly: - The script tries to be forgiving with arguments, e.g. you can specify either the model file itself or the directory containing it. - The script doesn't depend on config.json / params.json, just in case the user downloaded files individually and doesn't have those handy. But you still need tokenizer.model and, for Alpaca, added_tokens.json. - The script tries to give a helpful error message if added_tokens.json is missing.
2024-12-25 22:08:46 +01:00 · 2023-04-14 00:03:03 -07:00 · 2023-04-14 00:03:03 -07:00 · 723dac55fa
commit 723dac55fa
parent 0f07cacb05
9 changed files with 1154 additions and 1261 deletions
--- a/README.md
+++ b/README.md
@ -192,10 +192,10 @@ ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
 # install Python dependencies
-python3 -m pip install torch numpy sentencepiece
+python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
-python3 convert-pth-to-ggml.py models/7B/ 1
+python3 convert.py models/7B/
 # quantize the model to 4-bits (using method 2 = q4_0)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@ -1,299 +0,0 @@
 # Author: github.com/ductai199x
 import argparse
 import os
 import struct
 import numpy as np
 import torch
 from numba import njit
 from tqdm.auto import tqdm
 def read_header(fin):
     """Parse the fixed-size integer header at the current position of *fin*.

     Nine ``int`` fields (36 bytes) are consumed.  The first two fields and
     the eighth (``rot``) are read but discarded.

     Returns:
         ``(hparams, ftype)`` where *hparams* maps ``vocab_size``, ``dim``,
         ``multiple_of``, ``n_heads`` and ``n_layers`` to their values and
         *ftype* is the last header field.
     """
     field_count = 9
     fields = struct.unpack("i" * field_count, fin.read(4 * field_count))
     # Field layout: [0], [1] ignored; [2:7] hyperparameters; [7] rot; [8] ftype.
     hparam_names = ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers")
     hparams = dict(zip(hparam_names, fields[2:7]))
     return hparams, fields[8]
 def read_tokens(fin, vocab_size):
     """Read *vocab_size* vocabulary entries from *fin*.

     Each entry is a 4-byte length, that many bytes of text, then a 4-byte
     float score.  Text that is not valid UTF-8 is decoded with replacement
     characters instead of raising.

     Returns:
         A list of ``(text, score)`` tuples in file order.
     """

     def next_entry():
         (n_bytes,) = struct.unpack("i", fin.read(4))
         # errors="replace" gives the strict result whenever the bytes are
         # valid UTF-8, so no separate strict attempt is needed.
         piece = fin.read(n_bytes).decode(errors="replace")
         (score,) = struct.unpack("f", fin.read(4))
         return piece, score

     return [next_entry() for _ in range(vocab_size)]
@njit
 def dequantize_weights_numba(fin_data, n_rows, n_cols):
    """Expand a buffer of 4-bit quantized blocks into a float32 matrix.

    ``fin_data`` is walked sequentially, row by row.  Each block covers
    ``qk`` (32) consecutive columns and is stored as a 4-byte float32 scale
    followed by ``qk // 2`` bytes, each packing two 4-bit values.

    NOTE(review): the ``@njit`` decorator on the preceding line applies to
    this function, so the body presumably has to stay within what numba can
    compile -- confirm before restructuring.

    Args:
        fin_data: sliceable byte buffer holding the packed blocks.
        n_rows: number of rows of the output matrix.
        n_cols: number of columns; only ``n_cols // 32`` whole blocks per row
            are decoded, so any trailing columns keep their initial zero.

    Returns:
        ``(n_rows, n_cols)`` float32 array of dequantized weights.
    """
    qk = 32  # values per quantization block
    nb = n_cols // qk  # whole blocks per row
    bs = 4 + (qk // 2)  # bytes per block (scale + packed values); not used below
    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
    data_pos = 0  # read cursor into fin_data
    for row in range(n_rows):
        for block in range(nb):
            # per-block float32 scale
            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
            data_pos += 4
            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
            data_pos += qk // 2
            for i in range(qk // 2):
                packed_value = packed_values[i]
                # low nibble is the even column, high nibble the odd column;
                # both are stored with an offset of 8 and multiplied by the scale
                v0 = np.float32((packed_value & 0b00001111) - 8) * d
                v1 = np.float32((packed_value >> 4) - 8) * d
                weights[row, block * qk + 2 * i] = v0
                weights[row, block * qk + 2 * i + 1] = v1
    return weights
 def dequantize_weights(fin, n_rows, n_cols):
     """Read one 4-bit quantized tensor from *fin* and return it as float32.

     The byte count is derived from the shape: half a byte per weight plus a
     4-byte scale for every block of 32 columns in every row.

     Args:
         fin: binary file object positioned at the start of the tensor data.
         n_rows: number of rows of the tensor.
         n_cols: number of columns of the tensor.

     Returns:
         Whatever ``dequantize_weights_numba`` returns for the bytes read.

     Raises:
         ValueError: if *fin* holds fewer bytes than the shape requires.  The
             decoding kernel walks the buffer by offset without checking its
             length, so a truncated file must be rejected here.
     """
     qk = 32
     nb = n_cols // qk
     data_size = n_rows * n_cols // 2 + n_rows * nb * 4
     fin_data = fin.read(data_size)
     if len(fin_data) != data_size:
         raise ValueError(
             f"truncated tensor data: expected {data_size} bytes, got {len(fin_data)}"
         )
     return dequantize_weights_numba(fin_data, n_rows, n_cols)
 def read_variables(fin):
     """Read every tensor record from *fin* until the end of the file.

     Each record is three ints (number of dims, name length, tensor type),
     the dims, the name, padding up to a 32-byte boundary, then the data.
     Type 0 is float32, type 1 is float16 and type 2 is 4-bit quantized data,
     which is dequantized on the fly.

     Args:
         fin: binary file object opened on a real file (``fin.name`` is used
             to size the progress bar), positioned at the first record.

     Returns:
         dict mapping tensor name to a ``torch`` tensor; float32 records stay
         float32, everything else becomes float16.

     Raises:
         ValueError: if a record carries a tensor type other than 0, 1 or 2.
     """
     model = {}
     # The context manager closes the progress bar even when a record fails.
     with tqdm(
         total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables"
     ) as pbar:
         while True:
             start_pos = fin.tell()
             try:
                 n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
             except struct.error:
                 # Fewer than 12 bytes left: no further records.
                 break
             shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
             shape = shape[::-1]  # dims are stored in reverse order
             name = fin.read(name_length).decode()
             # ensure tensor data is aligned
             fin.seek((fin.tell() + 31) & -32)
             if ftype_cur == 2:
                 # 4-bit quantized weights
                 data = dequantize_weights(fin, shape[0], shape[1]).reshape(shape)
                 torch_dtype = torch.float16
             elif ftype_cur == 0:
                 data = np.fromfile(fin, dtype=np.float32, count=np.prod(shape)).reshape(shape)
                 torch_dtype = torch.float32
             elif ftype_cur == 1:
                 data = np.fromfile(fin, dtype=np.float16, count=np.prod(shape)).reshape(shape)
                 torch_dtype = torch.float16
             else:
                 # Previously fell through with `data` unbound or left over
                 # from the preceding tensor.
                 raise ValueError(f"unsupported tensor type {ftype_cur} for tensor {name!r}")
             model[name] = torch.tensor(data, dtype=torch_dtype)
             pbar.update(fin.tell() - start_pos)
     return model
 def convert_to_hf_format(model, hparams):
     """Rename tensors to the Hugging Face LLaMA state-dict layout.

     The q/k projection weights are additionally permuted for sliced rotary
     embeddings, and one shared ``inv_freq`` tensor is attached per layer.
     This works for llama 7B, need to test with other models.

     Args:
         model: dict of tensors keyed by the original names
             (``layers.N.attention.wq.weight`` and so on).
         hparams: dict providing ``n_layers``, ``n_heads`` and ``dim``.

     Returns:
         A new dict keyed by Hugging Face names.
     """
     n_layers = hparams["n_layers"]
     n_heads = hparams["n_heads"]
     dim = hparams["dim"]
     head_dim = dim // n_heads
     exponents = torch.arange(0, head_dim, 2).float() / head_dim
     inv_freq = 1.0 / (10000.0 ** exponents)

     def permute(w):
         # permute for sliced rotary
         split = w.view(n_heads, dim // n_heads // 2, 2, dim)
         return split.transpose(1, 2).reshape(dim, dim)

     # (Hugging Face suffix, source suffix, needs permutation), in output order.
     per_layer = (
         ("self_attn.q_proj.weight", "attention.wq.weight", True),
         ("self_attn.k_proj.weight", "attention.wk.weight", True),
         ("self_attn.v_proj.weight", "attention.wv.weight", False),
         ("self_attn.o_proj.weight", "attention.wo.weight", False),
         ("mlp.gate_proj.weight", "feed_forward.w1.weight", False),
         ("mlp.down_proj.weight", "feed_forward.w2.weight", False),
         ("mlp.up_proj.weight", "feed_forward.w3.weight", False),
         ("input_layernorm.weight", "attention_norm.weight", False),
         ("post_attention_layernorm.weight", "ffn_norm.weight", False),
     )
     top_level = (
         ("model.embed_tokens.weight", "tok_embeddings.weight"),
         ("model.norm.weight", "norm.weight"),
         ("lm_head.weight", "output.weight"),
     )

     state_dict = {}
     for layer_i in range(n_layers):
         for hf_suffix, src_suffix, needs_permute in per_layer:
             tensor = model[f"layers.{layer_i}.{src_suffix}"]
             if needs_permute:
                 tensor = permute(tensor)
             state_dict[f"model.layers.{layer_i}.{hf_suffix}"] = tensor
         state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
     for hf_name, src_name in top_level:
         state_dict[hf_name] = model[src_name]
     return state_dict
 def chat(model, hparams, llama_dir):
    """Run an interactive, never-returning chat loop on the CPU.

    Args:
        model: state dict loaded strictly into a freshly built
            ``LlamaForCausalLM``.
        hparams: dict providing ``vocab_size``, ``dim``, ``n_layers`` and
            ``n_heads`` for the ``LlamaConfig``.
        llama_dir: directory passed to ``LlamaTokenizer.from_pretrained``.

    The loop reads a line with ``input()``, appends it to the running
    context, generates a continuation and uses the decoded output as the
    next context.  There is no exit condition inside the loop.
    """
    # Imported lazily so the dependency is only needed when chatting.
    from transformers import (GenerationConfig, LlamaForCausalLM,
                              LlamaTokenizer, StoppingCriteria,
                              StoppingCriteriaList)
    from transformers.models.llama.configuration_llama import LlamaConfig
    class StoppingCriteriaSub(StoppingCriteria):
        # Echoes the text generated so far and stops on token id 13.
        def __init__(self):
            super().__init__()
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
            # `tokenizer` is the enclosing function's local, bound below
            # before generation starts.
            print(tokenizer.decode(input_ids[0]), end="", flush=True)
            # NOTE(review): 13 is presumably the newline token id -- confirm.
            if input_ids[0][-1] == 13:
                return True
            return False
    config = LlamaConfig(
        vocab_size=hparams["vocab_size"],
        dim=hparams["dim"],
        num_hidden_layers=hparams["n_layers"],
        num_attention_heads=hparams["n_heads"],
    )
    llama = LlamaForCausalLM(config=config)
    llama.load_state_dict(state_dict=model, strict=True)
    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
    device = torch.device("cpu")
    llama = llama.to(device)
    # Initial prompt that seeds the dialog.
    ctx = """You are AI.
 This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
 User: Hello, AI.
 AI: Hello! How can I assist you today?
 """
    print(ctx.rstrip("\n"))
    while True:
        print("-" * 60)
        prompt = input("User: ")
        if ctx != "":
            ctx = f"{ctx}User: {prompt}\n"
        else:
            ctx = f"{prompt}\nAI:"
        # Once the context reaches 2048 characters keep only the last 1920.
        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
        print("-" * 60)
        if len(ctx.strip()) > 0:
            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
            generation_config = GenerationConfig(
                temperature=0.8,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.1764,
            )
            with torch.no_grad():
                generation_output = llama.generate(
                    input_ids=input_ids,
                    generation_config=generation_config,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_length=2048,
                    do_sample=True,
                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
                )
            # The decoded first sequence becomes the context for the next turn.
            s = generation_output.sequences[0]
            decoded = tokenizer.decode(s)
            ctx = f"{decoded}\n"
 def main():
     """Convert ggml file(s) to a PyTorch checkpoint and optionally chat.

     Every file in ``--input_dir`` whose name starts with ``--prefix`` is
     read in sorted order; hyperparameters and vocabulary come from the first
     file, and tensors from all files are merged into one dict.  The result
     is saved as ``<input_dir>/<prefix>-to-torch.pth``.

     Exits through ``parser.error`` when no input file matches the prefix.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
     )
     parser.add_argument(
         "--prefix",
         "-p",
         type=str,
         required=True,
         help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
     )
     parser.add_argument(
         "--hf",
         action="store_true",
         help="Whether to save the model in the Hugging Face format. (default: False)",
     )
     parser.add_argument(
         "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
     )
     args = parser.parse_args()
     llama_dir = os.path.abspath(f"{args.input_dir}/../")
     out_name = f"{args.prefix}-to-torch.pth"
     # The checkpoint written below also starts with the prefix; skip it so a
     # second run does not try to parse it as a ggml file.
     ggml_files = sorted(
         f"{args.input_dir}/{f}"
         for f in os.listdir(args.input_dir)
         if f.startswith(args.prefix) and f != out_name
     )
     if not ggml_files:
         parser.error(f"no files starting with {args.prefix!r} found in {args.input_dir}")
     # `with` closes each input file once its tensors have been read.
     with open(ggml_files[0], "rb") as fin:
         hparams, _ = read_header(fin)
         tokens = read_tokens(fin, hparams["vocab_size"])
         model = read_variables(fin)
     for f in tqdm(ggml_files[1:]):
         with open(f, "rb") as fin:
             # Header and vocabulary are repeated in every part; skip them.
             read_header(fin)
             read_tokens(fin, hparams["vocab_size"])
             model.update(read_variables(fin))
     if args.hf:
         model = convert_to_hf_format(model, hparams)
     pth_ckpt = {
         "state_dict": model,
         "hparams": hparams,
         "tokens": tokens,
     }
     torch.save(pth_ckpt, f"{args.input_dir}/{out_name}")
     if args.chat:
         # chat() needs Hugging Face names; convert now unless already done.
         if not args.hf:
             model = convert_to_hf_format(model, hparams)
         chat(model, hparams, llama_dir)
 if __name__ == "__main__":
    main()
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@ -1,107 +0,0 @@
 #!/usr/bin/env python3
 #
 # TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
 #
 # Original by https://github.com/eiz
 # https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
 import argparse
 import glob
 import os
 import struct
 import sys
 from sentencepiece import SentencePieceProcessor
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 def parse_args():
     """Parse ``sys.argv``: the GPT4All model path and the tokenizer path."""
     cli = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
     positionals = (
         ('gpt4all_model', 'path to gpt4all-lora-quantized.bin'),
         ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
     )
     for dest, help_text in positionals:
         cli.add_argument(dest, help=help_text)
     return cli.parse_args()
 def read_header(f_in):
     """Read the old-style header from *f_in* and return it as a tuple of ints.

     The header holds one int per ``HPARAMS`` entry plus three more.
     """
     layout = struct.Struct("i" * (len(HPARAMS) + 3))
     return layout.unpack(f_in.read(layout.size))
 def write_header(f_out, header):
     """Write *header* to *f_out* in the versioned layout.

     Args:
         f_out: binary file object to write to.
         header: 8-tuple ``(magic, vocab_size, dim, multiple_of, n_heads,
             n_layers, rot, ftype)`` as read from the old file.

     Raises:
         Exception: if the magic is not the old unversioned one.
     """
     old_magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header
     if old_magic != 0x67676d6c:  # ASCII "ggml"
         raise Exception('Invalid file magic. Must be an old style ggml file.')
     new_magic = 0x67676d66  # ASCII "ggmf"
     file_version = 1
     f_out.write(
         struct.pack(
             "9i",
             new_magic, file_version,
             vocab_size, dim, multiple_of, n_heads, n_layers,
             rot, ftype,
         )
     )
 def write_tokens(fout, tokenizer):
     """Write the tokenizer vocabulary plus a trailing ``<pad>`` entry.

     Each entry is written as length (int), text bytes, score (float).
     Unknown tokens become " \u2047 ", control tokens are empty, byte tokens
     become the single byte they name, and for all other pieces U+2581 is
     replaced by a space.  A malformed byte token prints a message and exits
     with status 1.
     """

     def text_for(token_id):
         if tokenizer.is_unknown(token_id):
             return " \u2047 ".encode()
         if tokenizer.is_control(token_id):
             return b""
         if not tokenizer.is_byte(token_id):
             return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode()
         # Byte pieces are six characters with the hex digits at [3:5].
         piece = tokenizer.id_to_piece(token_id)
         if len(piece) != 6:
             print(f"Invalid token: {piece}")
             sys.exit(1)
         return struct.pack("B", int(piece[3:-1], 16))

     for token_id in range(tokenizer.vocab_size()):
         text = text_for(token_id)
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(token_id)))
     # TODO: GPT4All - add extra <pad> token
     pad = "<pad>".encode()
     fout.write(struct.pack("i", len(pad)))
     fout.write(pad)
     fout.write(struct.pack("f", 0.0))
 def read_tokens(f_in, tokenizer):
     """Skip the vocabulary stored in *f_in*.

     Reads ``tokenizer.vocab_size()`` entries, each a 4-byte length followed
     by that many bytes, and discards them.  Returns ``None``.
     """
     for _ in range(tokenizer.vocab_size()):
         (text_len,) = struct.unpack("i", f_in.read(4))
         f_in.read(text_len)  # contents are not needed, only the new position
 def copy_all_data(f_out, f_in):
     """Copy everything left in *f_in* to *f_out* in 1 MiB chunks."""
     chunk_size = 1024 * 1024
     chunk = f_in.read(chunk_size)
     while chunk:
         f_out.write(chunk)
         chunk = f_in.read(chunk_size)
 def convert_one_file(path_in, tokenizer):
     """Upgrade the file at *path_in* in place, keeping a ``.orig`` backup.

     The converted data is first written to ``<path_in>.tmp``; afterwards the
     original is renamed to ``<path_in>.orig`` and the temporary file takes
     its place.
     """
     tmp_path = f"{path_in}.tmp"
     backup_path = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as src:
         with open(tmp_path, "wb") as dst:
             write_header(dst, read_header(src))
             # Skip the old vocabulary, then emit one rebuilt from the tokenizer.
             read_tokens(src, tokenizer)
             write_tokens(dst, tokenizer)
             copy_all_data(dst, src)
     os.rename(path_in, backup_path)
     os.rename(tmp_path, path_in)
 def main():
     """Entry point: load the tokenizer and convert the given model file."""
     cli_args = parse_args()
     sp_tokenizer = SentencePieceProcessor(cli_args.tokenizer_model)
     convert_one_file(path_in=cli_args.gpt4all_model, tokenizer=sp_tokenizer)
 if __name__ == "__main__":
    main()
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@ -1,172 +0,0 @@
 # Convert a GPTQ quantized LLaMA model to a ggml compatible file
 # Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #
 import os
 import re
 import sys
 import json
 import struct
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor
 # Expect exactly three arguments: model file, tokenizer file, output file.
 if len(sys.argv) != 4:
    print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
    sys.exit(1)
 fname_model = sys.argv[1]
 fname_tokenizer = sys.argv[2]
 dir_out = sys.argv[3]  # same argument as fname_out below; not used again in this section
 model = torch.load(fname_model, map_location="cpu")
 # Vocabulary size and embedding width come from the embedding matrix shape.
 n_vocab, n_embd = model['model.embed_tokens.weight'].shape
 # Layer count is one more than the highest "model.layers.N" index present.
 n_layer = 1 + max(int(m.group(1)) for name in model
                   if (m := re.match(r'model\.layers\.([0-9]+)', name)))
 # hardcoded:
 n_mult = 256
 # Head count is looked up by layer count; any other layer count raises KeyError.
 n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
 tokenizer = SentencePieceProcessor(fname_tokenizer)
 assert tokenizer.vocab_size() == n_vocab
 fname_out = sys.argv[3]
 # Global output file; the helper functions below write to it directly.
 fout = open(fname_out, "wb")
 fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
 fout.write(struct.pack("i", 1)) # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
 fout.write(struct.pack("i", n_head))
 fout.write(struct.pack("i", n_layer))
 fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
 # NOTE(review): last header field, presumably the file-level ftype; the
 # meaning of the value 4 is not shown here -- confirm.
 fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 # Writes one vocabulary entry per token id: length (int), text bytes, score (float).
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
        text = " \u2047 ".encode()
    elif tokenizer.is_control(i):
        # control tokens are stored with empty text
        text = b""
    elif tokenizer.is_byte(i):
        # byte pieces must be six characters long with hex digits at [3:5]
        piece = tokenizer.id_to_piece(i)
        if len(piece) != 6:
            print(f"Invalid token: {piece}")
            sys.exit(1)
        byte_value = int(piece[3:-1], 16)
        text = struct.pack("B", byte_value)
    else:
        # U+2581 in a piece is replaced by a plain space
        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", tokenizer.get_score(i)))
 def write_header(shape, dst_name, ftype_cur):
     """Write one tensor header to the global ``fout`` and align for its data.

     Args:
         shape: tensor dimensions; written in reverse order.
         dst_name: tensor name to store in the output file.
         ftype_cur: integer type code for this tensor.
     """
     name_bytes = dst_name.encode()
     n_dims = len(shape)
     fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
     fout.write(struct.pack("i" * n_dims, *reversed(shape)))
     fout.write(name_bytes)
     # ensure tensor data is aligned: round the position up to a multiple of 32
     aligned_offset = (fout.tell() + 31) & -32
     fout.seek(aligned_offset)
 def convert_non_q4(src_name, dst_name):
     """Copy an unquantized tensor from the global ``model`` to ``fout``.

     One-dimensional tensors are converted to float32 first.  The tensor's
     dtype must be float16 or float32 at write time.
     """
     tensor = model[src_name]
     shape = tensor.shape
     print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {tensor.dtype}")
     if len(shape) == 1:
         print("  Converting to float32")
         tensor = tensor.to(torch.float32)
     ftype_by_dtype = {torch.float16: 1, torch.float32: 0}
     # header
     write_header(shape, dst_name, ftype_by_dtype[tensor.dtype])
     # data
     tensor.numpy().tofile(fout)
 def convert_q4(src_name, dst_name, permute=False):
    """Repack one GPTQ 4-bit tensor from the global ``model`` and write it to ``fout``.

    Args:
        src_name: key prefix in ``model``; the ``.zeros``, ``.scales``,
            ``.bias`` and ``.qweight`` entries under it are read.
        dst_name: tensor name written to the output file.
        permute: when true, reorder rows to undo the permutation done by
            convert_llama_weights_to_hf.py (uses the global ``n_head``).

    Raises:
        AssertionError: if the bias contains any non-zero value.
    """
    zeros = model[f"{src_name}.zeros"].numpy()
    scales = model[f"{src_name}.scales"].numpy()
    bias = model[f"{src_name}.bias"].numpy()
    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
    # Q4_1 does not support bias; good thing the bias is always all zeros.
    assert not np.any(bias)
    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
    shape = (qweight.shape[0], qweight.shape[1] * 8)
    print(f"Processing Q4 variable: {src_name} with shape: {shape}")
    # The output format has the int4 weights in groups of 32 rather than 8.
    # It looks like this (order as produced by the concatenate call below):
    # For each row:
    #   For each group of 32 columns:
    #     - scale (float32, 4 bytes)
    #     - addend (float32, 4 bytes)
    #     - weights (int4 * 32, 16 bytes)
    # Note that in the input, the scales and addends are shared between all
    # the columns in a row, so we end up wasting quite a bit of memory with
    # repeated scales and addends.
    addends = -zeros # flip sign
    # Since the output format is mixed between integers and floats, we have
    # to hackily view the floats as int32s just so numpy will let us
    # concatenate them.
    addends_view = addends.view(dtype=np.int32)
    scales_view = scales.view(dtype=np.int32)
    # Split into groups of 4 columns (i.e. 32 columns of quantized data):
    grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
    # Repeat addends and scales:
    addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
    scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
    # casting='no' refuses any dtype conversion while joining the three parts
    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
    if permute:
        # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
        # This can be done after the above conversion because it doesn't affect column order/layout.
        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
                    .swapaxes(1, 2)
                    .reshape(blob.shape))
    # header
    write_header(shape, dst_name, 3) # ftype = Q4_1
    # data
    blob.tofile(fout)
 # Tensors that are not per-layer are written first, unquantized.
 convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
 convert_non_q4("model.norm.weight", "norm.weight")
 convert_non_q4("lm_head.weight", "output.weight")
 # Then each layer: seven quantized projections followed by two norm weights.
 for i in range(n_layer):
    # only the q and k projections get the row permutation
    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
    convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
    convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
    convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
    convert_q4(f"model.layers.{i}.mlp.up_proj",   f"layers.{i}.feed_forward.w3.weight")
    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
 # Close the global output file opened earlier in the script.
 fout.close()
 print(f"Done. Output file: {fname_out}")
 print()
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@ -1,274 +1,11 @@
-# Convert a LLaMA model checkpoint to a ggjt compatible file
+# Compatibility stub
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
 #
 # For each variable, write the following:
 #   - Number of dimensions (int)
 #   - Name length (int)
 #   - Dimensions (int[n_dims])
 #   - Name (char[name_length])
 #   - Data (float[n_dims])
 #
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
 import argparse
 import os
 import sys
 import json
 import struct
 import numpy as np
 import torch
-from sentencepiece import SentencePieceProcessor
+import convert
-QK = 32
+parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
-
+parser.add_argument('dir_model',  help='directory containing the model checkpoint')
-GGML_TYPE_Q4_0  = 0
+parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-GGML_TYPE_Q4_1  = 1
+args = parser.parse_args()
-GGML_TYPE_I8    = 2
+convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
 GGML_TYPE_I16   = 3
 GGML_TYPE_I32   = 4
 GGML_TYPE_F16   = 5
 GGML_TYPE_F32   = 6
 WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
 }
 GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0:  QK,
    GGML_TYPE_Q4_1:  QK,
    GGML_TYPE_I8:    1,
    GGML_TYPE_I16:   1,
    GGML_TYPE_I32:   1,
    GGML_TYPE_F16:   1,
    GGML_TYPE_F32:   1,
 }
 GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4   + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8:   1,
    GGML_TYPE_I16:  2,
    GGML_TYPE_I32:  4,
    GGML_TYPE_F16:  2,
    GGML_TYPE_F32:  4,
 }
 def ggml_nelements(shape):
     """Return the product of all extents in *shape* (1 for an empty shape)."""
     count = 1
     for extent in shape:
         count = count * extent
     return count
 def ggml_nbytes(shape, ftype):
     """Return the byte size of a tensor of *shape* stored as *ftype*.

     The element count is multiplied by the type's size and then
     floor-divided by its block size.
     """
     n_elements = ggml_nelements(shape)
     wtype = WTYPES[ftype]
     return n_elements * GGML_TYPE_SIZE[wtype] // GGML_BLCK_SIZE[wtype]
 def parse_args():
     """Parse ``sys.argv``: model directory, file type, optional vocab-only flag."""
     cli = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
     # (name, keyword arguments) for each positional, in command-line order.
     positionals = (
         ('dir_model', dict(help='directory containing the model checkpoint')),
         ('ftype', dict(help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)),
         ('vocab_only', dict(help='only write vocab to file', type=int, default=0, nargs='?')),
     )
     for dest, options in positionals:
         cli.add_argument(dest, **options)
     return cli.parse_args()
 def get_n_parts(dim):
     """Return the number of checkpoint parts for embedding width *dim*.

     Prints a message and exits with status 1 when *dim* is not one of the
     known widths.
     """
     parts_by_dim = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
     if dim not in parts_by_dim:
         print(f"Invalid dim: {dim}")
         sys.exit(1)
     n_parts = parts_by_dim[dim]
     print(f"n_parts = {n_parts}\n")
     return n_parts
 def load_hparams_and_tokenizer(dir_model):
     """Load ``params.json`` from *dir_model* and the tokenizer from its parent.

     `dir_model` is something like `models/7B` or `models/7B/`, and
     "tokenizer.model" is expected under the model's parent dir.  The parent
     is derived from the normalized path rather than by appending "..",
     because f"{dir_model}/../tokenizer.model" would not be found when
     `dir_model` is a symlink.

     Returns:
         ``(hparams, tokenizer)``; *hparams* additionally gets ``vocab_size``
         set from the tokenizer.
     """
     parent_dir = os.path.dirname(os.path.normpath(dir_model))
     with open(f"{dir_model}/params.json", "r") as params_file:
         hparams = json.load(params_file)
         print(hparams)
     tokenizer = SentencePieceProcessor(f"{parent_dir}/tokenizer.model")
     hparams["vocab_size"] = tokenizer.vocab_size()
     return hparams, tokenizer
 def write_header(fout, hparams, ftype):
     """Write the nine-int file header to *fout*.

     Fields, in order: magic, file version, ``vocab_size``, ``dim``,
     ``multiple_of``, ``n_heads``, ``n_layers``, rot (``dim // n_heads``,
     obsolete) and *ftype*.
     """
     ordered_keys = ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers")
     fields = [
         0x67676a74,  # magic: ggjt in hex
         1,  # file version
     ]
     fields.extend(hparams[key] for key in ordered_keys)
     fields.append(hparams["dim"] // hparams["n_heads"])  # rot (obsolete)
     fields.append(ftype)
     fout.write(struct.pack(f"{len(fields)}i", *fields))
 def write_tokens(fout, tokenizer):
     """Write every tokenizer entry to *fout* as length, text bytes, score.

     Unknown tokens become " \u2047 ", control tokens are empty, byte tokens
     become the single byte they name, and for all other pieces U+2581 is
     replaced by a space.  A malformed byte token prints a message and exits
     with status 1.
     """

     def text_for(token_id):
         if tokenizer.is_unknown(token_id):
             return " \u2047 ".encode()
         if tokenizer.is_control(token_id):
             return b""
         if not tokenizer.is_byte(token_id):
             return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode()
         # Byte pieces are six characters with the hex digits at [3:5].
         piece = tokenizer.id_to_piece(token_id)
         if len(piece) != 6:
             print(f"Invalid token: {piece}")
             sys.exit(1)
         return struct.pack("B", int(piece[3:-1], 16))

     for token_id in range(tokenizer.vocab_size()):
         text = text_for(token_id)
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(token_id)))
 def process_and_write_variables(fout, model, ftype, part_id, n_parts):
    """Write the tensors of one checkpoint part into the shared output file.

    Every part writes the same tensor headers at the same offsets; the data
    of a tensor that is split across parts is placed by seeking to this
    part's slice inside the full tensor.

    Args:
        fout: seekable binary output file, positioned at the first tensor.
        model: mapping of tensor name to tensor; each value must offer
            ``.numpy()`` and ``.dtype``.
        ftype: requested output type, 0 for float32, otherwise float16
            (1-D tensors are always written as float32).
        part_id: zero-based index of this part.
        n_parts: total number of parts.
    """
    for name, datao in model.items():
        # names ending in "freqs" are not written to the output
        if name.endswith("freqs"):
            continue
        # remove dimensions with a single element
        data = datao.numpy().squeeze()
        partshape = data.shape
        n_dims = len(data.shape)
        assert n_dims in (1, 2)
        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
        # coerce single-dimensional tensors from float16 to float32
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        # split_dim is only assigned for 2-D tensors; 1-D tensors never read it
        if n_dims > 1:
            split_dim = 1
            if "tok_embeddings" in name:
                split_dim = 1
            elif "layers" in name:
                if "attention.wo.weight" in name:
                    split_dim = 1
                elif "feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif "output" in name:
                split_dim = 0
        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            # the full tensor is n_parts times larger along the split dimension
            fullshape[split_dim] *= n_parts
        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        # dimensions are written in reverse order
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)
        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        # pad with zero bytes up to the next multiple of QK
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1
        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            # (only the first part writes it; later parts just skip over it)
            if part_id == 0:
                data.tofile(fout)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            data.tofile(fout)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            # each row of this part is written into its slot of the full row
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                data[row].tofile(fout)
        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
 def main():
     """Convert the checkpoint parts in the model directory to one ggml file.

     With ``vocab_only`` set, only the header and vocabulary are written to
     ``ggml-vocab.bin``.  Otherwise every ``consolidated.0N.pth`` part is
     loaded in turn and merged into ``ggml-model-<f32|f16>.bin``.
     """
     cli = parse_args()
     model_dir = cli.dir_model
     ftype = cli.ftype
     type_names = ["f32", "f16"]
     hparams, tokenizer = load_hparams_and_tokenizer(model_dir)
     print(cli)

     # if only writing vocab to file
     if cli.vocab_only:
         part_path = f"{model_dir}/consolidated.00.pth"
         vocab_path = f"{model_dir}/ggml-vocab.bin"
         print(f"Extracting only the vocab from '{part_path}'\n")
         with open(vocab_path, "wb") as vocab_file:
             write_header(vocab_file, hparams, ftype)
             write_tokens(vocab_file, tokenizer)
         print(f"Done. Output file: {vocab_path}\n")
         return

     n_parts = get_n_parts(hparams["dim"])
     out_path = f"{model_dir}/ggml-model-{type_names[ftype]}.bin"
     # we output a single file for ggml
     with open(out_path, "wb") as out_file:
         write_header(out_file, hparams, ftype)
         write_tokens(out_file, tokenizer)
         tensors_start = out_file.tell()
         # the tensors we load could be split across multiple files
         part_id = 0
         while part_id < n_parts:
             # every part starts writing again from the first tensor
             out_file.seek(tensors_start)
             print(f"Processing part {part_id+1} of {n_parts}\n")
             part_path = f"{model_dir}/consolidated.0{part_id}.pth"
             part = torch.load(part_path, map_location="cpu")
             process_and_write_variables(out_file, part, ftype, part_id, n_parts)
             del part
             part_id += 1
     print(f"Done. Output file: {out_path}\n")
 if __name__ == "__main__":
    main()
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@ -1,100 +0,0 @@
 #!/usr/bin/env python3
 # Original by https://github.com/eiz
 # https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
 import argparse
 import glob
 import os
 import struct
 import sys
 from sentencepiece import SentencePieceProcessor
 # Names of the five model hyperparameters carried in the file header.
 # read_header() only uses the length of this list. `keys` is a second name
 # bound to the same list object.
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 def parse_args():
    """Parse the command line: a model directory and a tokenizer path."""
    cli = argparse.ArgumentParser(
        description='Upgrade old ggml model files to the current format')
    positionals = (
        ('dir_model', 'directory containing ggml .bin files'),
        ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()
 def read_header(f_in):
    """Read the old-style header from `f_in` and return it as a tuple of ints.

    The header holds 3 + len(HPARAMS) native ints; write_header() unpacks
    them as magic, the hyperparameters, rot and ftype.
    """
    n_fields = 3 + len(HPARAMS)
    layout = "i" * n_fields
    raw = f_in.read(struct.calcsize(layout))
    return struct.unpack(layout, raw)
 def write_header(f_out, header):
    """Write the versioned header that replaces an old-style one.

    `header` is an 8-tuple: the old magic, five hyperparameters, `rot` and
    `ftype`. Raises Exception when the magic is not 0x67676d6c.
    """
    (old_magic, n_vocab, n_embd, n_mult, n_head, n_layer, n_rot, ftype) = header
    if old_magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')
    fields = (
        0x67676d66,  # magic of the versioned format
        1,           # file version
        n_vocab, n_embd, n_mult, n_head, n_layer, n_rot, ftype,
    )
    f_out.write(struct.pack(f"{len(fields)}i", *fields))
 def write_tokens(fout, tokenizer):
    """Write every token of `tokenizer` as: int32 length, bytes, float32 score.

    Exits the process with status 1 if a byte token's piece is not exactly
    six characters long.
    """
    for token_id in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(token_id):
            payload = " \u2047 ".encode()
        elif tokenizer.is_control(token_id):
            payload = b""
        elif tokenizer.is_byte(token_id):
            piece = tokenizer.id_to_piece(token_id)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            # characters 3..4 of the six-character piece are the hex digits
            payload = struct.pack("B", int(piece[3:-1], 16))
        else:
            piece = tokenizer.id_to_piece(token_id)
            payload = piece.replace("\u2581", " ").encode()
        length_field = struct.pack("i", len(payload))
        fout.write(length_field)
        fout.write(payload)
        score_field = struct.pack("f", tokenizer.get_score(token_id))
        fout.write(score_field)
 def read_tokens(f_in, tokenizer):
    """Skip over the token table of `f_in`; nothing is returned.

    Reads `tokenizer.vocab_size()` entries, each an int32 length followed by
    that many bytes (no score field is read).
    """
    for _ in range(tokenizer.vocab_size()):
        (n_bytes,) = struct.unpack("i", f_in.read(4))
        f_in.read(n_bytes)
 def copy_all_data(f_out, f_in):
    """Copy everything left in `f_in` to `f_out`, one MiB per read."""
    chunk_size = 1024 * 1024
    chunk = f_in.read(chunk_size)
    while chunk:
        f_out.write(chunk)
        chunk = f_in.read(chunk_size)
 def convert_one_file(path_in, tokenizer):
    """Upgrade one model file in place, keeping the original as `<path>.orig`.

    The new content is written to `<path>.tmp` first; once complete, the
    input is renamed to `.orig` and the temporary file takes its name.
    """
    tmp_path, backup_path = f"{path_in}.tmp", f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as src:
        with open(tmp_path, "wb") as dst:
            old_header = read_header(src)
            write_header(dst, old_header)
            # skip the old token table, then emit a fresh one
            read_tokens(src, tokenizer)
            write_tokens(dst, tokenizer)
            # the rest of the file is copied through unchanged
            copy_all_data(dst, src)
    os.rename(path_in, backup_path)
    os.rename(tmp_path, path_in)
 def main():
    """Upgrade every `*.bin` and `*.bin.*` file found in the model directory."""
    args = parse_args()
    model_dir = args.dir_model
    files = glob.glob(f"{model_dir}/*.bin") + glob.glob(f"{model_dir}/*.bin.*")
    tokenizer = SentencePieceProcessor(args.tokenizer_model)
    for path in files:
        convert_one_file(path, tokenizer)
 # upgrade the files only when this file is executed as a script
 if __name__ == "__main__":
    main()
--- a/convert.py
+++ b/convert.py
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@ -1,311 +0,0 @@
 # Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
 #
 # We caused a breaking change to the file format on 2023-03-30 in:
 #     https://github.com/ggerganov/llama.cpp/pull/613
 #
 # (1) If you still have the Meta LLaMA .pth files, then close this
 #     file now; you can just run `convert-pth-to-ggml.py` again to
 #     migrate to the new format. The tool is easier to use too. It
 #     isn't necessary anymore to manage split output files because
 #     the new format always combines things into a single file.
 #
 # (2) If you deleted the Meta LLaMA .pth files to save on disk
 #     space, then this tool is intended to help you.  Please check
 #     out the instructions below.
 #
 # USAGE
 #
 #     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
 #
 # PREREQUISITES
 #
 #     pip install numpy
 #     cd llama.cpp
 #     make -j4
 #
 # EXAMPLE (7B MODEL)
 #
 #     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
 #     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
 #
 #     # check that it works
 #     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
 #
 #     # you can delete the old files
 #     rm -f models/7B/ggml-model-f16.bin
 #     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
 #
 # EXAMPLE (13B MODEL)
 #
 #     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
 #     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
 #
 #     # check that it works
 #     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
 #
 #     # you can delete the old files
 #     rm -f models/13B/ggml-model-f16.bin*
 #     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
 #
 import argparse
 import os
 import sys
 import json
 import struct
 import numpy as np
 # Block length of the Q4 types (see GGML_BLCK_SIZE). copy_tensors() also
 # pads the output so tensor data starts at a multiple of QK bytes.
 QK = 32
 # numeric ids of the ggml tensor types
 GGML_TYPE_Q4_0  = 0
 GGML_TYPE_Q4_1  = 1
 GGML_TYPE_I8    = 2
 GGML_TYPE_I16   = 3
 GGML_TYPE_I32   = 4
 GGML_TYPE_F16   = 5
 GGML_TYPE_F32   = 6
 # ftype code read from a tensor header -> name printed in progress messages
 WTYPE_NAMES = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
 }
 # ftype code read from a tensor header -> ggml type id
 WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
 }
 # number of elements stored per block, by ggml type
 GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0:  QK,
    GGML_TYPE_Q4_1:  QK,
    GGML_TYPE_I8:    1,
    GGML_TYPE_I16:   1,
    GGML_TYPE_I32:   1,
    GGML_TYPE_F16:   1,
    GGML_TYPE_F32:   1,
 }
 # number of bytes per block, by ggml type
 # NOTE(review): the Q4 sizes look like 4-byte float field(s) plus QK/2 bytes
 # of packed 4-bit values -- confirm against the ggml block layout
 GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4   + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8:   1,
    GGML_TYPE_I16:  2,
    GGML_TYPE_I32:  4,
    GGML_TYPE_F16:  2,
    GGML_TYPE_F32:  4,
 }
 # file header fields in on-disk order; read_hparams()/write_hparams() pack
 # each one with struct format "i"
 HPARAMS = [
    'magic',    # int32
    'version',  # int32
    'n_vocab',  # int32
    'n_embd',   # int32
    'n_mult',   # int32
    'n_head',   # int32
    'n_layer',  # int32
    'n_rot',    # int32
    'f16',      # int32
 ]
 def read_hparams(fin):
    """Read the file header from `fin` and return it as a dict.

    Keys are the names in HPARAMS, in that order; each value is one int
    unpacked with struct format "i".
    """
    layout = "i" * len(HPARAMS)
    raw = fin.read(struct.calcsize(layout))
    values = struct.unpack(layout, raw)
    return dict(zip(HPARAMS, values))
 def write_hparams(fout, hparams):
    """Write the header fields named in HPARAMS to `fout`, one "i" int each.

    Values are taken from `hparams` in HPARAMS order; a missing name raises
    KeyError before anything is written.
    """
    ints = [hparams[h] for h in HPARAMS]
    fout.write(struct.pack("i" * len(HPARAMS), *ints))
 def read_tokens(fin, hparams):
    """Read the token table from `fin` and return a list of (bytes, score).

    `hparams['n_vocab']` entries are read; each is an int32 length, that many
    bytes of token text, and a float32 score.
    """
    vocab = []
    for _ in range(hparams['n_vocab']):
        (n_bytes,) = struct.unpack("i", fin.read(4))
        text = fin.read(n_bytes)
        (score,) = struct.unpack("f", fin.read(4))
        vocab.append((text, score))
    return vocab
 def write_tokens(fout, tokens):
    """Write each (bytes, score) pair as: int32 length, bytes, float32 score."""
    for entry in tokens:
        text, score = entry
        length_field = struct.pack("i", len(text))
        fout.write(length_field)
        fout.write(text)
        score_field = struct.pack("f", score)
        fout.write(score_field)
 def ggml_nelements(shape):
    """Return the product of all dimensions in `shape` (1 when it is empty)."""
    count = 1
    for extent in shape:
        count = count * extent
    return count
 def ggml_nbytes(shape, ftype):
    """Return the byte size of a tensor with `shape` stored as `ftype`.

    Computed as element count times bytes-per-block, floor-divided by
    elements-per-block. Raises KeyError when `ftype` is not in WTYPES.
    """
    n_elements = ggml_nelements(shape)
    ggml_type = WTYPES[ftype]
    return n_elements * GGML_TYPE_SIZE[ggml_type] // GGML_BLCK_SIZE[ggml_type]
 def copy_tensors(fin, fout, part_id, n_parts):
    """Copy every tensor of one input part into its place in the output file.

    Each input record is: three ints (n_dims, name length, ftype), n_dims
    dimension ints, the name bytes, then the tensor data. For each record a
    header carrying the full (all-parts) shape is written to `fout`, zero
    padding is added until the data offset is a multiple of QK, and this
    part's data is written at the offset that belongs to `part_id`.

    Args:
        fin: binary input positioned at the first tensor record.
        fout: binary output positioned where the tensor should be written;
            tell() and seek() are used on it.
        part_id: zero-based index of the part being read.
        n_parts: total number of parts the tensors are split across.

    Raises:
        AssertionError: if a tensor has a dimension count other than 1 or 2.
    """
    while True:
        # an empty read at a record boundary means the part is exhausted
        b = fin.read(4)
        if not b: break
        (n_dims,) = struct.unpack("i", b)
        b = fin.read(4)
        (length,) = struct.unpack("i", b)
        b = fin.read(4)
        (ftype,) = struct.unpack("i", b)
        assert n_dims in (1, 2)
        # placeholder list of the right length; every entry is overwritten
        partshape = list(range(n_dims))
        for i in range(n_dims):
            b = fin.read(4)
            partshape[i] = struct.unpack("i", b)[0]
        # the file stores dims in the reverse of the order used below
        partshape = list(reversed(partshape))
        name = fin.read(length)
        # in the input the data follows the name directly, with no padding
        data = fin.read(ggml_nbytes(partshape, ftype))
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            # names matching none of the cases below keep split_dim 1
            split_dim = 1
            if b"tok_embeddings" in name:
                split_dim = 1
            elif b"layers" in name:
                if b"attention.wo.weight" in name:
                    split_dim = 1
                elif b"feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif b"output" in name:
                split_dim = 0
        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            # the merged tensor is n_parts times larger along the split dim
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)
        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1
        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            # (only part 0 writes it; later parts just skip past it)
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            # bpr: bytes per row in this part; bytes_per_row: in the full tensor
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                # place this part's slice of the row inside the full-width row
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])
        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
 def parse_args():
    """Parse the command line: the input file path and the output file path."""
    cli = argparse.ArgumentParser(
        description='Migrate from GGML to new GGJT file format')
    positionals = (
        ('fin_path', 'your old ggml file (leave out the .1 .2 etc.)'),
        ('fout_path', 'your new ggjt file name'),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()
 def main():
    """Migrate a 'ggmf' model (one or more part files) to a single 'ggjt' file.

    Reads the header and vocabulary from the first input file, counts the
    sibling part files `<fin_path>.1`, `<fin_path>.2`, ... and writes header,
    vocabulary and the merged tensors of every part to `fout_path`.

    Exits with status 1 if a path is empty, if input and output paths are the
    same, or if the input magic is not 'ggmf'.
    """
    args = parse_args()
    # Explicit checks rather than `assert`: assertions are stripped under
    # `python -O`, and opening the output over the input would truncate it.
    if not args.fin_path or not args.fout_path:
        print("input and output paths must not be empty\n")
        sys.exit(1)
    if args.fin_path == args.fout_path:
        print(f"{args.fin_path}: input and output paths must be different\n")
        sys.exit(1)
    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)
        # Check the magic before parsing the vocabulary, so a file in another
        # layout gets this message instead of failing inside read_tokens().
        if hparams['magic'] == 0x67676a74:  # ggjt
            print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
            sys.exit(1)
        if hparams['magic'] != 0x67676d66:  # ggmf
            print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
            sys.exit(1)
        tokens = read_tokens(fin, hparams)
    hparams['magic'] = 0x67676a74  # ggjt
    # count number of multipart files by convention
    n_parts = 1
    while os.path.exists(f"{args.fin_path}.{n_parts}"):
        n_parts += 1
    # we output a single file for ggml
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            # every part rewinds to the first tensor before writing
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += f".{part_id}"
            with open(fin_path, "rb") as fin:
                # skip this part's own header and vocabulary
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)
    print(f"Done. Output file: {args.fout_path}\n")
 # run the migration only when this file is executed as a script
 if __name__ == "__main__":
    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
 numpy==1.24
 sentencepiece==0.1.97