import base64
import re
from dataclasses import dataclass
from io import BytesIO
from typing import Any, List, Optional

import torch
from PIL import Image

from extensions.multimodal.pipeline_loader import load_pipeline
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import encode, get_max_prompt_length


@dataclass
class PromptPart:
    text: str
    image: Optional[Image.Image] = None
    is_image: bool = False
    input_ids: Optional[torch.Tensor] = None
    embedding: Optional[torch.Tensor] = None


class MultimodalEmbedder:
    def __init__(self, params: dict):
        pipeline, source = load_pipeline(params)
        self.pipeline = pipeline
        logger.info(f'Multimodal: loaded pipeline {self.pipeline.name()} from pipelines/{source} ({self.pipeline.__class__.__name__})')

    def _split_prompt(self, prompt: str, load_images: bool = False) -> List[PromptPart]:
        """Split a prompt into a list of `PromptPart`s, separating image data from text.

        Also appends the pipeline's `image_start` and `image_end` tokens before and after
        each image, and optionally parses and loads the images if `load_images` is `True`.
        """
        parts: List[PromptPart] = []
        curr = 0
        while True:
            match = re.search(r'<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">', prompt[curr:])
            if match is None:
                # no more image tags, append the rest of the prompt
                if curr > 0:
                    # add the image end token after the last image
                    parts.append(PromptPart(text=self.pipeline.image_end() + prompt[curr:]))
                else:
                    parts.append(PromptPart(text=prompt))

                break

            # found an image, append the image start token to the preceding text
            if match.start() > 0:
                parts.append(PromptPart(text=prompt[curr:curr + match.start()] + self.pipeline.image_start()))
            else:
                parts.append(PromptPart(text=self.pipeline.image_start()))

            # append the image itself, decoding it only if requested
            parts.append(PromptPart(
                text=match.group(0),
                image=Image.open(BytesIO(base64.b64decode(match.group(1)))) if load_images else None,
                is_image=True
            ))
            curr += match.end()

        return parts
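
    # Illustrative example (hypothetical pipeline with image_start() == '<Img>'
    # and image_end() == '</Img>'): the prompt
    #     'Describe this: <img src="data:image/jpeg;base64,AAAA"> in detail'
    # is split into three parts:
    #     PromptPart(text='Describe this: <Img>')
    #     PromptPart(text='<img src="data:image/jpeg;base64,AAAA">', is_image=True)
    #     PromptPart(text='</Img> in detail')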

    def _len_in_tokens_prompt_parts(self, parts: List[PromptPart]) -> int:
        """Total length in tokens of all `parts`"""
        tokens = 0
        for part in parts:
            if part.is_image:
                tokens += self.pipeline.num_image_embeds()
            elif part.input_ids is not None:
                tokens += len(part.input_ids)
            else:
                tokens += len(encode(part.text)[0])

        return tokens

    def len_in_tokens(self, prompt: str) -> int:
        """Total length in tokens for a given text `prompt`"""
        parts = self._split_prompt(prompt, False)
        return self._len_in_tokens_prompt_parts(parts)

    def _encode_single_text(self, part: PromptPart, add_bos_token: bool) -> PromptPart:
        """Encode a single prompt `part` to `input_ids`. Returns a `PromptPart`"""
        if part.is_image:
            # an image occupies num_image_embeds() positions, all filled with the placeholder token id
            placeholders = torch.ones((self.pipeline.num_image_embeds())) * self.pipeline.placeholder_token_id()
            part.input_ids = placeholders.to(shared.model.device, dtype=torch.int64)
        else:
            part.input_ids = encode(part.text, add_bos_token=add_bos_token)[0].to(shared.model.device, dtype=torch.int64)

        return part
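
    # For example (hypothetical values): with num_image_embeds() == 256 and
    # placeholder_token_id() == 0, an image part gets input_ids equal to a
    # length-256 tensor of zeros, so its token count matches the number of
    # image embeddings counted in _len_in_tokens_prompt_parts above.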

    @staticmethod
    def _num_images(parts: List[PromptPart]) -> int:
        count = 0
        for part in parts:
            if part.is_image:
                count += 1

        return count

    def _encode_text(self, state, parts: List[PromptPart]) -> List[PromptPart]:
        """Encode text to token ids, and truncate the prompt if necessary.

        Chat/instruct mode should build prompts that fit in `get_max_prompt_length`, but if `max_new_tokens`
        is set such that the context plus `min_rows` does not fit, the prompt can end up too long.
        Image embeddings can't be truncated, as that breaks generation, so whole images are removed
        instead and the user is warned.
        """
        encoded: List[PromptPart] = []
        for i, part in enumerate(parts):
            # only the very first part may receive a BOS token
            encoded.append(self._encode_single_text(part, i == 0 and state['add_bos_token']))

        # truncation:
        max_len = get_max_prompt_length(state)
        removed_images = 0

        # 1. remove entire text/image blocks from the front
        while self._len_in_tokens_prompt_parts(encoded[1:]) > max_len:
            if encoded[0].is_image:
                removed_images += 1

            encoded = encoded[1:]

        # 2. check whether the first remaining part still needs to be truncated
        if self._len_in_tokens_prompt_parts(encoded) > max_len:
            if encoded[0].is_image:
                # don't truncate image embeddings, just remove the image, otherwise generation will be broken
                removed_images += 1
                encoded = encoded[1:]
            elif len(encoded) > 1 and encoded[0].text.endswith(self.pipeline.image_start()):
                # see if we can keep the image_start token
                len_image_start = len(encode(self.pipeline.image_start(), add_bos_token=state['add_bos_token'])[0])
                if self._len_in_tokens_prompt_parts(encoded[1:]) + len_image_start > max_len:
                    # we can't -> remove this text, and the image after it
                    encoded = encoded[2:]
                    removed_images += 1
                else:
                    # we can -> just truncate the text
                    trunc_len = self._len_in_tokens_prompt_parts(encoded) - max_len
                    encoded[0].input_ids = encoded[0].input_ids[trunc_len:]
            elif len(encoded) > 0:
                # only one text part left, truncate it normally
                trunc_len = self._len_in_tokens_prompt_parts(encoded) - max_len
                encoded[0].input_ids = encoded[0].input_ids[trunc_len:]

        # notify the user if any images were removed
        if removed_images > 0:
            logger.warning(f"Multimodal: removed {removed_images} image(s) from prompt. Try decreasing max_new_tokens if generation is broken")

        return encoded
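
    # Worked example (hypothetical numbers): with max_len == 2048 and encoded parts of
    # [text: 100, image: 256, text: 1900] tokens, step 1 drops the leading text (the
    # tail [image, text] is 2156 > 2048 tokens, so blocks are removed from the front),
    # and step 2 then drops the image rather than truncating its embeddings, leaving
    # only the final 1900-token text and removed_images == 1.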

    def _embed(self, parts: List[PromptPart]) -> List[PromptPart]:
        # batch all images into a single pipeline call
        image_indices = [i for i, part in enumerate(parts) if part.is_image]
        embedded = self.pipeline.embed_images([parts[i].image for i in image_indices])
        for i, embeds in zip(image_indices, embedded):
            parts[i].embedding = embeds

        # embed the text parts individually
        for i, part in enumerate(parts):
            if not part.is_image:
                parts[i].embedding = self.pipeline.embed_tokens(part.input_ids)

        return parts

    def _remove_old_images(self, parts: List[PromptPart], params: dict) -> List[PromptPart]:
        """Replace the embeddings of all but the most recent image with placeholders, unless disabled."""
        if params['add_all_images_to_prompt']:
            return parts

        already_added = False
        # walk backwards so that only the last image keeps its real embedding
        for i, part in reversed(list(enumerate(parts))):
            if part.is_image:
                if already_added:
                    parts[i].embedding = self.pipeline.placeholder_embeddings()
                else:
                    already_added = True

        return parts
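
    # For instance, with parts [image A, text, image B, text] and
    # add_all_images_to_prompt == False, image A's embedding is replaced by
    # placeholder_embeddings() while image B, the most recent, keeps its real one.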

    def forward(self, prompt: str, state: Any, params: dict):
        """Split, encode, and embed `prompt`, returning concatenated ids and embeddings."""
        prompt_parts = self._split_prompt(prompt, True)
        prompt_parts = self._encode_text(state, prompt_parts)
        prompt_parts = self._embed(prompt_parts)
        prompt_parts = self._remove_old_images(prompt_parts, params)
        embeds = tuple(part.embedding for part in prompt_parts)
        ids = tuple(part.input_ids for part in prompt_parts)
        input_embeds = torch.cat(embeds, dim=0)
        input_ids = torch.cat(ids, dim=0)
        return prompt, input_ids, input_embeds, self._num_images(prompt_parts)
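

# Minimal usage sketch (illustrative; assumes a model is already loaded in
# `modules.shared` and that `params` and `state` match what the web UI passes in):
#
#     embedder = MultimodalEmbedder(params)
#     prompt, input_ids, input_embeds, num_images = embedder.forward(prompt, state, params)
#
# `input_embeds` is then fed to the model in place of token embeddings, while
# `input_ids` carries placeholder token ids at the image positions.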