markupdm / processing_markupdm.py

Update processing_markupdm.py

0012b8d verified 11 days ago

18.1 kB

	"""Processor class for MarkupDM."""

	import math
	import re
	import shutil
	import subprocess
	import tempfile
	from pathlib import Path

	import numpy as np
	import torch
	from .fonts import FontManager
	from PIL import Image, ImageDraw
	from transformers import (
	ImageProcessingMixin,
	PreTrainedModel,
	PreTrainedTokenizerBase,
	ProcessorMixin,
	)
	from transformers.utils import logging

	logger = logging.get_logger(__name__)

	MAXIMUM_DECODE_IMAGE_SIZE = 4096
	IMG_FORMAT = "{:03d}.png"
	FONT_FORMAT = "{:03d}.ttf"


	class MarkupDMProcessor(ProcessorMixin): # type: ignore
	attributes = ["tokenizer", "image_processor"]

	# The superclass checks if the tokenizer is a subclass of `PreTrainedTokenizerBase`
	tokenizer_class = "AutoTokenizer"
	tokenizer: PreTrainedTokenizerBase

	# and the image_processor is a subclass of `ImageProcessingMixin`.
	image_processor_class = "AutoImageProcessor"
	image_processor: ImageProcessingMixin

	def __init__(
	self,
	tokenizer: PreTrainedTokenizerBase,
	image_processor: ImageProcessingMixin,
	):
	super().__init__(tokenizer, image_processor)

	# Extend the tokenizer if it has not been extended yet.
	if "<begin_of_image>" not in tokenizer.additional_special_tokens:
	self.extend_base_tokenizer(self.tokenizer)

	# Regular expressions
	boi = "<begin_of_image>"
	img_sep = "<image_sep>"
	self.re_img_size = re.compile(rf"{boi}(\d+){img_sep}(\d+){img_sep}")
	self.re_svg_width = re.compile(r'<svg[^>]\bwidth="(\d+)"[^>]>')
	self.re_svg_height = re.compile(r'<svg[^>]\bheight="(\d+)"[^>]>')

	# Font manager
	self.font_manager = None

	def extend_base_tokenizer(self, tokenizer: PreTrainedTokenizerBase) -> None:
	logger.info("Extending tokenizer...")
	tokenizer.clean_up_tokenization_spaces = False

	# Add special tokens
	additional_special_tokens = [
	"<begin_of_image>",
	"<end_of_image>",
	"<image_sep>",
	"<image_token>",
	]
	logger.info(f"Add special tokens: {additional_special_tokens}")
	tokenizer.add_special_tokens(
	{"additional_special_tokens": additional_special_tokens},
	replace_additional_special_tokens=False,
	)

	def __call__(
	self,
	svg: str \| None = None,
	images: list[Image.Image] \| None = None,
	filenames: list[str] \| None = None,
	vision_model: PreTrainedModel \| None = None,
	) -> dict:
	# Process images
	if not isinstance(images, list):
	images = [images] # type: ignore

	if len(images) > 0 and images[0] is not None:
	output = self.preprocess_images(images)
	output = self.encode_images(output, vision_model)
	else:
	output = {"width": [], "height": [], "image_ids": []}

	# Process the entire example
	output.update({"svg": svg, "filenames": filenames})
	output = self.tokenize_example(output)

	return output

	def preprocess_images(self, images: list[Image.Image]) -> dict:
	assert images is not None, "Images must be provided."
	output: dict = {"image": [], "width": [], "height": []}

	for image in images:
	processed = self.image_processor(image)
	for key, value in processed.items():
	output[key].append(value)

	# Stack tensors
	output["image"] = torch.stack(output["image"])

	return output

	def encode_images(self, example: dict, vision_model: PreTrainedModel) -> dict:
	if "images" in example and "width" not in example:
	example = self.preprocess_images(example["images"])

	assert vision_model is not None, "Vision model must be provided."
	image = example.pop("image")
	image = image.to(dtype=vision_model.dtype, device=vision_model.device)
	with torch.inference_mode():
	_, _, (_, _, image_ids) = vision_model.model.encode(image)
	example["image_ids"] = list(image_ids.view(image.size(0), -1).cpu())

	return example

	def tokenize_example(self, example: dict) -> dict:
	# Validate the input example
	for key in ["svg", "filenames", "width", "height", "image_ids"]:
	msg = f"Missing key: {key}."
	if key in ["width", "height", "image_ids"]:
	msg += " Images must be encoded first using `encode_images`."
	assert example.get(key, None) is not None, msg

	tokenizer = self.tokenizer
	bos_id = tokenizer.bos_token_id
	eos_id = tokenizer.eos_token_id
	bos_id = bos_id if bos_id is not None else eos_id
	boi_id = tokenizer.convert_tokens_to_ids("<begin_of_image>")
	eoi_id = tokenizer.convert_tokens_to_ids("<end_of_image>")
	img_sep_id = tokenizer.convert_tokens_to_ids("<image_sep>")

	# Tokenize images and build a mapping from image filenames to tokens
	name2token = {}
	for filename, image_ids, width, height in zip(
	example["filenames"],
	example["image_ids"],
	example["width"],
	example["height"],
	):
	_image_ids = (image_ids + len(tokenizer)).tolist()
	W_tokens = tokenizer.encode(str(width))
	H_tokens = tokenizer.encode(str(height))

	# Image tokens
	image_tokens = [
	boi_id,
	*W_tokens,
	img_sep_id,
	*H_tokens,
	img_sep_id,
	*_image_ids,
	eoi_id,
	]

	name2token[filename] = image_tokens

	# Tokenize SVG
	# TODO: remove bos_id as it seems to be not necessary in modern practice
	tokens = [bos_id]
	svg = example["svg"]
	while svg:
	# Find the start position of the next image filename
	start, end = len(svg), len(svg)
	for name in name2token.keys():
	_start = svg.find(name)
	if -1 < _start and _start < start:
	start = _start
	end = start + len(name)

	# Tokenize the text before the image filename
	tokens += tokenizer.encode(svg[:start])

	# Append the tokenized image
	if start < end:
	tokens += name2token[svg[start:end]]

	# Update the remaining text
	svg = svg[end:]

	tokens.append(eos_id)

	# Format output data
	input_ids = torch.tensor(tokens)
	image_mask = input_ids >= len(tokenizer)

	# Compute image position ids
	image_pos_ids = torch.zeros_like(input_ids)
	if len(example["image_ids"]) > 0:
	length = example["image_ids"][0].size(0)
	num_images = sum(image_mask) // length
	image_pos_ids[image_mask] = torch.arange(length).repeat(num_images)

	return {
	"input_ids": input_ids,
	"image_mask": image_mask,
	"image_pos_ids": image_pos_ids,
	}

	def decode(
	self,
	tokens: torch.Tensor \| np.ndarray,
	vision_model: PreTrainedModel \| None = None,
	) -> dict:
	tokenizer = self.tokenizer
	bos = tokenizer.bos_token
	eos = tokenizer.eos_token
	bos = bos if bos is not None else eos

	# Validate the input tokens
	msg = "Should be reverted from FIM format before decoding."
	for fim_type in ["prefix", "middle", "suffix"]:
	token_id = tokenizer.convert_tokens_to_ids(f"<fim_{fim_type}>")
	if token_id is None:
	token_id = tokenizer.convert_tokens_to_ids(f"<\|fim_{fim_type}\|>")
	assert token_id is not None, f"{fim_type} token not found"
	assert token_id not in tokens, msg

	tokens = torch.asarray(tokens).detach().cpu()
	assert tokens.ndim == 1, "Tokens must be 1D."
	boi_id = tokenizer.convert_tokens_to_ids("<begin_of_image>")
	eoi_id = tokenizer.convert_tokens_to_ids("<end_of_image>")

	# Decode tokens
	svg = ""
	images: list = []
	filenames: list = []
	while len(tokens) > 0:
	# Find the start position of the next image filename
	boi_idx = torch.where(tokens == boi_id)[0]
	eoi_idx = torch.where(tokens == eoi_id)[0]
	if boi_idx.size(0) > 0:
	start = int(boi_idx[0].item())
	end = int(eoi_idx[0].item()) + 1 if eoi_idx.size(0) > 0 else len(tokens)
	assert start < end, "Invalid image tokens."
	else:
	start, end = len(tokens), len(tokens)

	# Decode the tokens before the image tokens
	svg += tokenizer.decode(tokens[:start])

	# Decode the image tokens
	if start < end:
	# Extract image size
	image_tokens = tokens[start:end]
	image_text = tokenizer.decode(image_tokens)
	matched = self.re_img_size.match(image_text)
	if matched is not None:
	width, height = map(int, matched.groups())
	else:
	width = self.image_processor.size
	height = self.image_processor.size

	# Decode tokens to PIL image
	image_mask = image_tokens >= len(tokenizer)
	image_ids = image_tokens[image_mask] - len(tokenizer)
	image = self.decode_image(vision_model, image_ids, width, height)
	filename = IMG_FORMAT.format(len(images))
	svg += filename

	images.append(image)
	filenames.append(filename)

	# Update the remaining tokens
	tokens = tokens[end:]

	# Remove consecutive <bos> and <eos>
	svg = re.sub(rf"({re.escape(bos)})+", bos, svg)
	svg = re.sub(rf"({re.escape(eos)})+", eos, svg)

	# Extract the text between <bos> and <eos>
	i_bos = svg.find(bos)
	svg = svg[i_bos + len(bos) :] if i_bos > -1 else svg
	i_eos = svg.find(eos, i_bos + 1)
	svg = svg[:i_eos] if i_eos > -1 else svg

	return {"svg": svg, "images": images, "filenames": filenames}

	def decode_image(
	self,
	vision_model: PreTrainedModel \| None = None,
	image_ids: torch.Tensor \| np.ndarray \| None = None,
	width: int \| None = None,
	height: int \| None = None,
	dummy_color: tuple[int, int, int, int] = (200,) * 4,
	pad_value: int = 0,
	) -> Image.Image:
	# Prepare image size
	width = width or self.image_processor.size
	height = height or self.image_processor.size
	width, height = self.compute_safe_image_size(width, height)

	if vision_model is None and image_ids is None:
	# Return a dummy image
	return Image.new("RGBA", (width, height), dummy_color)

	# Compute required length
	assert vision_model is not None, "Vision model must be provided."
	scale_factor = 2 ** (vision_model.model.encoder.num_resolutions - 1)
	latent_size = self.image_processor.size // scale_factor
	required_length = latent_size**2

	# Pad image ids if necessary
	image_ids = torch.asarray(image_ids, device=vision_model.device)
	code_length = image_ids.shape[0] # type: ignore
	if code_length < required_length:
	pad_size = required_length - code_length
	pad = torch.full((pad_size,), pad_value).to(image_ids)
	image_ids = torch.cat([image_ids, pad])

	# Decode image
	with torch.inference_mode():
	codebook_entry = vision_model.model.quantize.get_codebook_entry(
	image_ids, (1, latent_size, latent_size, -1)
	)
	recon = vision_model.model.decode(codebook_entry)[0].float()

	# Postprocess image
	img = self.image_processor.postprocess(
	recon, self.image_processor.size, self.image_processor.size
	)

	# Mask the padded area
	if code_length < required_length:
	img = self.mask_padded_area(img, code_length, scale_factor)

	# Resize the image to the original size
	img = img.resize((width, height), resample=self.image_processor.resample)

	return img # type: ignore

	def compute_safe_image_size(self, width: int, height: int) -> tuple[int, int]:
	long_edge = max(width, height)
	if MAXIMUM_DECODE_IMAGE_SIZE < long_edge:
	scale = MAXIMUM_DECODE_IMAGE_SIZE / long_edge
	width = min(max(int(width * scale), 1), MAXIMUM_DECODE_IMAGE_SIZE)
	height = min(max(int(height * scale), 1), MAXIMUM_DECODE_IMAGE_SIZE)
	return width, height

	def mask_padded_area(
	self,
	img: Image.Image,
	code_length: int,
	scale_factor: int,
	fill: tuple[int, int, int, int] = (200, 200, 200, 255),
	) -> Image.Image:
	draw = ImageDraw.Draw(img, mode="RGBA")
	width, height = img.size
	zw = math.ceil(width / scale_factor)
	cw = code_length % zw
	ch = code_length // zw
	draw.polygon(
	[
	(cw * scale_factor, ch * scale_factor),
	(width, ch * scale_factor),
	(width, height),
	(0, height),
	(0, (ch + 1) * scale_factor),
	(cw * scale_factor, (ch + 1) * scale_factor),
	],
	fill=fill,
	)
	return img

	def set_font_manager(self, fonts_path: str \| None = None) -> None:
	self.font_manager = FontManager(fonts_path)

	def render_preprocess(self, example: dict, out_dir: str \| Path) -> None:
	msg = "Font manager is not set. Call `set_font_manager` first."
	assert self.font_manager is not None, msg

	out_dir = Path(out_dir)
	out_dir.mkdir(parents=True, exist_ok=True)
	svg = example["svg"]

	# Costruct style tag
	found = set()
	style_text = "text{dominant-baseline:text-before-edge}"
	for i, text_str in enumerate(re.findall("<text[^>]*>", svg)):
	matched = re.search('font-family="([^"]*)"', text_str)
	if matched is None:
	logger.warning(f"Font family not found in {text_str}")
	continue

	# Parse font attributes
	font_family = matched.group(1)
	is_bold = 'font-weight="bold"' in text_str
	is_italic = 'font-style="italic"' in text_str
	font_weight = "bold" if is_bold else "regular"
	if is_italic:
	font_style = "bolditalic" if is_bold else "italic"
	else:
	font_style = font_weight
	key = (font_family, font_weight, font_style)
	if key in found:
	continue

	font_bytes = self.font_manager.lookup(
	font_family=font_family,
	font_weight=font_weight,
	font_style=font_style,
	)

	# @font-face
	font_path = FONT_FORMAT.format(i)
	font_face = "@font-face{"
	font_face += f"font-family:'{font_family}';"
	font_face += f"font-weight:{font_weight};"
	font_face += f"font-style:{font_style};"
	font_face += f"src:url('{font_path}');"
	font_face += "}"
	style_text += font_face

	# Save font
	Path(f"{out_dir}/{font_path}").write_bytes(font_bytes)
	found.add(key)

	# Insert style tag
	matched = re.search("<svg[^>]*>", svg)
	assert matched is not None, "SVG tag not found"
	i = matched.span()[1]
	style = f"<style>{style_text}</style>"
	example["svg"] = svg[:i] + style + svg[i:]

	def render(self, example: dict, save_dir: str \| Path \| None = None) -> Image.Image:
	with tempfile.TemporaryDirectory() as tmp_dir:
	self.render_preprocess(example, tmp_dir)

	# Parse the SVG size
	matched = self.re_svg_width.search(example["svg"])
	assert matched is not None, "Width not found in SVG."
	width = int(matched.group(1))
	matched = self.re_svg_height.search(example["svg"])
	assert matched is not None, "Height not found in SVG."
	height = int(matched.group(1))

	# Convert SVG to HTML
	html = '<!DOCTYPE html><html><body style="margin: 0px">'
	html += f"{example['svg']}</body></html>"

	# Save HTML
	Path(f"{tmp_dir}/index.html").write_text(html, encoding="utf-8")

	# Save images
	for img, filename in zip(example["images"], example["filenames"]):
	Path(f"{tmp_dir}/{filename}").parent.mkdir(parents=True, exist_ok=True)
	img.save(f"{tmp_dir}/{filename}")

	# Take screenshot
	command = [
	"google-chrome",
	"--headless",
	"--disable-web-security",
	"--allow-running-insecure-content",
	"--no-sandbox",
	"--disable-infobars",
	"--hide-scrollbars",
	"--disable-dev-shm-usage",
	"--no-zygote",
	f"--window-size={width},{height}",
	f"--screenshot={tmp_dir}/screenshot.png",
	f"{tmp_dir}/index.html",
	]
	subprocess.run(command, check=True, stderr=subprocess.DEVNULL)

	# Load the screenshot as PIL image
	out = Image.open(f"{tmp_dir}/screenshot.png")
	size = (width, height)
	out = out.resize(size, resample=Image.Resampling.LANCZOS) # type: ignore

	# Copy the result if save_dir is specified
	if save_dir is not None:
	shutil.copytree(tmp_dir, save_dir, dirs_exist_ok=True)

	return out