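# PandaGPT demo for the llama.cpp "embd-input" example: an ImageBind encoder turns
# image/audio/video/thermal inputs into embeddings, a linear projection maps them into
# the LLaMA embedding space, and MyModel streams them into a Vicuna model alongside the
# chat prompt.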
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch

# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"

sys.path.insert(0, os.path.join(panda_gpt_path, "code", "model"))
from ImageBind.models import imagebind_model
from ImageBind import data
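
# NOTE: the paths above are assumptions of this example: the PandaGPT repository is
# expected to be cloned into ./PandaGPT next to this script (so ImageBind can be imported
# from PandaGPT/code/model), and the ImageBind checkpoint to live under ./models/panda_gpt/.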

ModalityType = imagebind_model.ModalityType
max_tgt_len = 400


class PandaGPT:
    def __init__(self, args):
        self.visual_encoder, _ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
        self.visual_encoder.eval()
        self.llama_proj = nn.Linear(1024, 5120)  # ImageBind-huge embedding dim -> LLaMA-13B hidden size
        self.max_tgt_len = max_tgt_len
        self.model = MyModel(["main", *args])
        self.generated_text = ""
        self.device = "cpu"

    def load_projection(self, path):
        state = torch.load(path, map_location="cpu")
        self.llama_proj.load_state_dict({
            "weight": state["llama_proj.weight"],
            "bias": state["llama_proj.bias"]})
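
    # Multimodal inputs are spliced into the prompt between <Img> ... </Img> markers: the
    # marker text goes through MyModel.eval_string(), while the projected ImageBind
    # embeddings are fed in directly as float embeddings via MyModel.eval_float().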
    def eval_inputs(self, inputs):
        self.model.eval_string("<Img>")
        embds = self.extract_multimodal_feature(inputs)
        for i in embds:
            self.model.eval_float(i.T)
        self.model.eval_string("</Img> ")

    def chat(self, question):
        return self.chat_with_image(None, question)

    def chat_with_image(self, inputs, question):
        if self.generated_text == "":
            self.model.eval_string("###")
        self.model.eval_string(" Human: ")
        if inputs:
            self.eval_inputs(inputs)
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        ret = self.model.generate_with_print(end="###")
        self.generated_text += ret
        return ret
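
    # `inputs` is a dict keyed by "<modality>_paths", e.g.
    #   {"image_paths": ["a.png"], "audio_paths": ["b.wav"]}
    # Each recognised modality is encoded separately and the embeddings are returned as a list.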
    def extract_multimodal_feature(self, inputs):
        features = []
        for key in ["image", "audio", "video", "thermal"]:
            if key + "_paths" in inputs:
                embeds = self.encode_data(key, inputs[key + "_paths"])
                features.append(embeds)
        return features

    def encode_data(self, data_type, data_paths):
        type_map = {
            "image": ModalityType.VISION,
            "audio": ModalityType.AUDIO,
            "video": ModalityType.VISION,
            "thermal": ModalityType.THERMAL,
        }
        load_map = {
            "image": data.load_and_transform_vision_data,
            "audio": data.load_and_transform_audio_data,
            "video": data.load_and_transform_video_data,
            "thermal": data.load_and_transform_thermal_data,
        }

        load_function = load_map[data_type]
        key = type_map[data_type]

        inputs = {key: load_function(data_paths, self.device)}
        with torch.no_grad():
            embeddings = self.visual_encoder(inputs)
            embeds = embeddings[key]
            # project ImageBind embeddings into the LLaMA embedding space
            embeds = self.llama_proj(embeds).cpu().numpy()
        return embeds

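
# Demo: the paths below are placeholders from this example. They assume a quantized
# Vicuna-13B GGML model, a ggml adapter for the PandaGPT weights (passed via --lora), and
# the original adapter_model.bin, from which the llama_proj weights are loaded. Adjust
# them to your setup.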
if __name__ == "__main__":
    a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048",
                  "--lora", "./models/panda_gpt/ggml-adapter-model.bin", "--temp", "0"])
    a.load_projection("./models/panda_gpt/adapter_model.bin")
    a.chat_with_image(
        {"image_paths": ["./media/llama1-logo.png"]},
        "what is the text in the picture? 'llama' or 'lambda'?")
    a.chat("what is the color of it?")