llama.cpp/examples/llava/qwen2_vl_surgery.py

import argparse
from typing import Dict

import torch
import numpy as np
from gguf import *
from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    AutoProcessor,
    Qwen2VLConfig
)


VISION = "clip.vision"


def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def to_gguf_name(name: str) -> str:
    og = name
    name = name.replace("text_model", "t").replace("vision_model", "v")
    name = name.replace("blocks", "blk").replace("embeddings.", "")
    name = name.replace("attn.", "attn_")
    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
    name = name.replace("merger.mlp", 'mm')
    print(f"[to_gguf_name] {og} --> {name}")
    return name


def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
    vision_model = qwen2vl.visual
    tensor_map = {}
    for name, ten in vision_model.state_dict().items():
        ten = ten.numpy()
        if 'qkv' in name:
            if ten.ndim == 2: # weight
                c3, _ = ten.shape
            else:             # bias
                c3 = ten.shape[0]
            assert c3 % 3 == 0
            c = c3 // 3
            wq = ten[:c]
            wk = ten[c: c * 2]
            wv = ten[c * 2:]
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
        elif 'merger' in name:
            if name.endswith("ln_q.weight"):
                tensor_map['v.post_ln.weight'] = ten
            elif name.endswith("ln_q.bias"):
                tensor_map['v.post_ln.bias'] = ten
            else:
                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
                tensor_map[to_gguf_name(name)] = ten
        elif 'patch_embed.proj.weight' in name:
            # NOTE: split Conv3D into Conv2Ds
            c1, c2, kt, kh, kw = ten.shape
            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
        else:
            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten

    for new_name, ten in tensor_map.items():
        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
            tensor_map[new_name] = ten.astype(np.float32)
        else:
            tensor_map[new_name] = ten.astype(dtype)
    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
    return tensor_map


def main(args):
    if args.data_type == 'fp32':
        dtype = torch.float32
        np_dtype = np.float32
        ftype = 0
    elif args.data_type == 'fp16':
        dtype = torch.float32
        np_dtype = np.float16
        ftype = 1
    else:
        raise ValueError()

    model_name = args.model_name
    print("model_name: ", model_name)
    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=dtype, device_map="cpu"
    )
    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
    vcfg = cfg.vision_config

    if os.path.isdir(model_name):
        if model_name.endswith(os.sep):
            model_name = model_name[:-1]
        model_name = os.path.basename(model_name)
    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"

    fout = GGUFWriter(path=fname_out, arch="clip")
    fout.add_description("image encoder for Qwen2VL")

    fout.add_file_type(ftype)
    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_qwen2vl_merger", True)
    fout.add_string("clip.projector_type", "qwen2vl_merger")

    print(cfg.vision_config)
    if 'silu' in cfg.vision_config.hidden_act.lower():
        fout.add_bool("clip.use_silu", True)
        fout.add_bool("clip.use_gelu", False)
    elif 'gelu' in cfg.vision_config.hidden_act.lower():
        fout.add_bool("clip.use_silu", False)
        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
    else:
        raise ValueError()

    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
    for name, data in tensor_map.items():
        fout.add_tensor(name, data)

    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
    fout.add_name(model_name)
    """
    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
            it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
    """

    processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
    fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()
    print("save model as: ", fname_out)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
    args = parser.parse_args()
    main(args)
llama : add Qwen2VL support + multimodal RoPE (#10361) * Barebone Qwen2VL LLM convertor * Add Qwen2VL cli entrypoint * [WIP] add qwen2vl arch * Verify m-rope output * Add vl-rope/2d-rope support for qwen2vl ViT * update qwen2vl cli tool * update 5D tensor op workaround * [WIP] qwen2vl vision model * make batch and clip utils compatible with qwen2vl * [WIP] create inference workflow, gguf convert script but fix * correcting vision-rope behavior, add the missing last layer back to ViT * add arg parser to qwen2vl_surgery * replace variable size array with vector * cuda-gdb cmake preset * add fp32 mrope, vision rope kernel * add fp16 support for qwen2vl and m-rope * add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION` * fix rope op mode switching, out dated func args * update `llama_hparams` * update to keep up stream changes * resolve linter, test errors * add makefile entry, update speical image padding token * add mrope unit test, fix few compiler warnings * rename `mrope` related function, params * minor updates on debug util, bug fixs * add `m-rope` testcase to `test-backend-ops` * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix traililng whitespce * store `llama_hparams.rope_sections` with fixed size array * update position id tensor size check in GGML_OP_ROPE * minor updates * update `ggml_backend__supports_op` of unsupported backends remote old `rope_section` compare operator --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2024-12-14 13:43:46 +01:00			`import argparse`
			`from typing import Dict`

			`import torch`
			`import numpy as np`
			`from gguf import *`
			`from transformers import (`
			`Qwen2VLForConditionalGeneration,`
			`Qwen2VLProcessor,`
			`AutoProcessor,`
			`Qwen2VLConfig`
			`)`


			`VISION = "clip.vision"`


			`def k(raw_key: str, arch: str) -> str:`
			`return raw_key.format(arch=arch)`


			`def to_gguf_name(name: str) -> str:`
			`og = name`
			`name = name.replace("text_model", "t").replace("vision_model", "v")`
			`name = name.replace("blocks", "blk").replace("embeddings.", "")`
			`name = name.replace("attn.", "attn_")`
			`name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")`
			`# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")`
			`name = name.replace("norm1", "ln1").replace("norm2", "ln2")`
			`name = name.replace("merger.mlp", 'mm')`
			`print(f"[to_gguf_name] {og} --> {name}")`
			`return name`


			`def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:`
			`vision_model = qwen2vl.visual`
			`tensor_map = {}`
			`for name, ten in vision_model.state_dict().items():`
			`ten = ten.numpy()`
			`if 'qkv' in name:`
			`if ten.ndim == 2: # weight`
			`c3, _ = ten.shape`
			`else: # bias`
			`c3 = ten.shape[0]`
			`assert c3 % 3 == 0`
			`c = c3 // 3`
			`wq = ten[:c]`
			`wk = ten[c: c * 2]`
			`wv = ten[c * 2:]`
			`tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq`
			`tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk`
			`tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv`
			`elif 'merger' in name:`
			`if name.endswith("ln_q.weight"):`
			`tensor_map['v.post_ln.weight'] = ten`
			`elif name.endswith("ln_q.bias"):`
			`tensor_map['v.post_ln.bias'] = ten`
			`else:`
			`# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"`
			`tensor_map[to_gguf_name(name)] = ten`
			`elif 'patch_embed.proj.weight' in name:`
			`# NOTE: split Conv3D into Conv2Ds`
			`c1, c2, kt, kh, kw = ten.shape`
			`assert kt == 2, "Current implmentation only support temporal_patch_size of 2"`
			`tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]`
			`tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]`
			`else:`
			`tensor_map[to_gguf_name(f"vision_model.{name}")] = ten`

			`for new_name, ten in tensor_map.items():`
			`if ten.ndim <= 1 or new_name.endswith("_norm.weight"):`
			`tensor_map[new_name] = ten.astype(np.float32)`
			`else:`
			`tensor_map[new_name] = ten.astype(dtype)`
			`tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder`
			`return tensor_map`


			`def main(args):`
			`if args.data_type == 'fp32':`
			`dtype = torch.float32`
			`np_dtype = np.float32`
			`ftype = 0`
			`elif args.data_type == 'fp16':`
			`dtype = torch.float32`
			`np_dtype = np.float16`
			`ftype = 1`
			`else:`
			`raise ValueError()`

			`model_name = args.model_name`
			`print("model_name: ", model_name)`
			`qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(`
			`model_name, torch_dtype=dtype, device_map="cpu"`
			`)`
			`cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]`
			`vcfg = cfg.vision_config`

			`if os.path.isdir(model_name):`
			`if model_name.endswith(os.sep):`
			`model_name = model_name[:-1]`
			`model_name = os.path.basename(model_name)`
			`fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"`

			`fout = GGUFWriter(path=fname_out, arch="clip")`
			`fout.add_description("image encoder for Qwen2VL")`

			`fout.add_file_type(ftype)`
			`fout.add_bool("clip.has_text_encoder", False)`
			`fout.add_bool("clip.has_vision_encoder", True)`
			`fout.add_bool("clip.has_qwen2vl_merger", True)`
			`fout.add_string("clip.projector_type", "qwen2vl_merger")`

			`print(cfg.vision_config)`
			`if 'silu' in cfg.vision_config.hidden_act.lower():`
			`fout.add_bool("clip.use_silu", True)`
			`fout.add_bool("clip.use_gelu", False)`
			`elif 'gelu' in cfg.vision_config.hidden_act.lower():`
			`fout.add_bool("clip.use_silu", False)`
			`fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())`
			`else:`
			`raise ValueError()`

			`tensor_map = find_vision_tensors(qwen2vl, np_dtype)`
			`for name, data in tensor_map.items():`
			`fout.add_tensor(name, data)`

			`fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)`
			`fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)`
			`fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)`
			`fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)`
			`fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)`
			`fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)`
			`fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)`
			`fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder`
			`fout.add_name(model_name)`
			`"""`
			HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
			it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
			`"""`

			`processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)`
			`fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]`
			`fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]`

			`fout.write_header_to_file()`
			`fout.write_kv_data_to_file()`
			`fout.write_tensors_to_file()`
			`fout.close()`
			`print("save model as: ", fname_out)`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")`
			`parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")`
			`args = parser.parse_args()`
			`main(args)`