| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | from typing import List, Optional, Union |
| |
|
| | import numpy as np |
| | import torch |
| | from PIL import Image, ImageOps |
| | from torchvision import transforms |
| | from transformers import BaseImageProcessor, BatchFeature, TensorType |
| |
|
| |
|
| | def _select_best_resolution( |
| | img_width: int, img_height: int, target_ratios: List[List[int]], patch_size: int |
| | ): |
| | """ |
| | Selects the best resolution from a list of possible resolutions based on the original size. |
| | |
| | Args: |
| | img_width: the original widths of images. |
| | img_height: the original heights of images. |
| | target_ratios (2d numpy array): dimension size (M,2) |
| | patch_size (int): image patch size |
| | |
| | Returns: |
| | tuple: The best fit resolution in the format (width, height). |
| | """ |
| |
|
| | aspect_ratio = img_width / img_height |
| | best_ratio_diff = float("inf") |
| | best_ratio_w, best_ratio_h = 1, 1 |
| | area = np.int32(img_height) * np.int32(img_height) |
| | for ratio in target_ratios: |
| | target_aspect_ratio = ratio[0] / ratio[1] |
| | ratio_diff = abs(aspect_ratio - target_aspect_ratio) |
| | if ratio_diff < best_ratio_diff: |
| | best_ratio_diff = ratio_diff |
| | best_ratio_w, best_ratio_h = ratio[0], ratio[1] |
| | elif ( |
| | ratio_diff == best_ratio_diff |
| | and area > 0.5 * patch_size * patch_size * ratio[0] * ratio[1] |
| | ): |
| | best_ratio_w, best_ratio_h = ratio[0], ratio[1] |
| |
|
| | return best_ratio_w, best_ratio_h |
| |
|
| |
|
| | def _split_image( |
| | image: Image.Image, |
| | split_image: bool, |
| | split_ratio: List[List[int]], |
| | patch_size: int, |
| | ) -> List[Image.Image]: |
| | """ |
| | Split image into multiple patches |
| | |
| | Args: |
| | image (PIL.Image): Input image. |
| | split_image (bool): Whether to split the image into patches. |
| | split_ratio (2d numpy array): dimension size (M,2) |
| | patch_size (int): image patch size |
| | |
| | Returns: |
| | List[PIL.Image]: List of splitted images. |
| | """ |
| | if split_image: |
| | ratio_width, ratio_height = _select_best_resolution( |
| | image.width, image.height, split_ratio, patch_size |
| | ) |
| | resize_width = patch_size * ratio_width |
| | resize_height = patch_size * ratio_height |
| | blocks = ratio_width * ratio_height |
| | resized_img = image.resize((resize_width, resize_height)) |
| | processed_images = [] |
| | for i in range(blocks): |
| | box = ( |
| | (i % (resize_width // patch_size)) * patch_size, |
| | (i // (resize_width // patch_size)) * patch_size, |
| | ((i % (resize_width // patch_size)) + 1) * patch_size, |
| | ((i // (resize_width // patch_size)) + 1) * patch_size, |
| | ) |
| | |
| | split_img = resized_img.crop(box) |
| | processed_images.append(split_img) |
| | assert len(processed_images) == blocks |
| | if len(processed_images) != 1: |
| | processed_images.insert(0, image) |
| | return processed_images |
| | else: |
| | return [image] |
| |
|
| |
|
| | def keep_ratio_resize_and_pixel_mask( |
| | img: Image.Image, max_size, min_size=336, padding_value=0 |
| | ): |
| | """ |
| | Resize an image while maintaining aspect ratio and create a pixel mask. |
| | |
| | Args: |
| | img (PIL.Image): Input image. |
| | max_size (int): Maximum size for the larger dimension of the image. |
| | min_size (int, optional): Minimum size for the smaller dimension. Defaults to 336. |
| | padding_value (int, optional): Value used for padding. Defaults to 0. |
| | |
| | Returns: |
| | tuple: A tuple containing: |
| | - PIL.Image: Resized and padded image. |
| | - torch.Tensor: Boolean pixel mask. This mask is a 2D tensor of shape (max_size, max_size) where: |
| | - True (1) values indicate pixels that belong to the original resized image. |
| | - False (0) values indicate pixels that are part of the padding. |
| | The mask helps distinguish between actual image content and padded areas in subsequent processing steps. |
| | """ |
| | img = img.convert("RGB") |
| | |
| | scale = max_size / max(img.size) |
| |
|
| | w, h = img.size |
| | if w >= h: |
| | new_size = (max_size, max(int(h * scale), min_size)) |
| | else: |
| | new_size = (max(int(w * scale), min_size), max_size) |
| |
|
| | img_resized = img.resize(new_size, resample=Image.Resampling.BICUBIC) |
| |
|
| | |
| | padding_right, padding_bottom = max_size - new_size[0], max_size - new_size[1] |
| | img_padded = ImageOps.expand( |
| | img_resized, (0, 0, padding_right, padding_bottom), fill=padding_value |
| | ) |
| |
|
| | |
| | pixel_mask = torch.zeros(max_size, max_size) |
| | pixel_mask[: new_size[1], : new_size[0]] = 1 |
| | pixel_mask = pixel_mask.bool() |
| | return img_padded, pixel_mask |
| |
|
| |
|
| | class AriaVisionProcessor(BaseImageProcessor): |
| | """ |
| | A vision processor for the Aria model that handles image preprocessing. |
| | """ |
| |
|
| | def __init__( |
| | self, |
| | max_image_size=980, |
| | min_image_size=336, |
| | image_mean=[0.5, 0.5, 0.5], |
| | image_std=[0.5, 0.5, 0.5], |
| | **kwargs, |
| | ): |
| | """ |
| | Initialize the AriaVisionProcessor. |
| | |
| | Args: |
| | max_image_size (int, optional): Maximum image size. Defaults to 980. |
| | min_image_size (int, optional): Minimum image size. Defaults to 336. |
| | mean (list, optional): Mean values for normalization. Defaults to [0.5, 0.5, 0.5]. |
| | std (list, optional): Standard deviation values for normalization. Defaults to [0.5, 0.5, 0.5]. |
| | """ |
| | super().__init__(**kwargs) |
| |
|
| | self.max_image_size = max_image_size |
| | self.min_image_size = min_image_size |
| | self.image_mean = image_mean |
| | self.image_std = image_std |
| | self.auto_map = { |
| | "AutoProcessor": "processing_aria.AriaProcessor", |
| | "AutoImageProcessor": "vision_processor.AriaVisionProcessor", |
| | } |
| |
|
| | |
| | |
| | |
| | self._transform = None |
| | self._set_processor_class("AriaProcessor") |
| |
|
| | @property |
| | def transform(self): |
| | if self._transform is None: |
| | |
| | self._transform = transforms.Compose( |
| | [ |
| | transforms.ToTensor(), |
| | transforms.Normalize(self.image_mean, self.image_std), |
| | ] |
| | ) |
| | return self._transform |
| |
|
| | def __call__( |
| | self, |
| | images: Union[Image.Image, List[Image.Image]], |
| | max_image_size: Optional[int] = 980, |
| | min_image_size: Optional[int] = 336, |
| | return_tensors: Optional[Union[str, TensorType]] = "pt", |
| | split_image: Optional[bool] = False, |
| | split_ratio: Optional[List[List[int]]] = [ |
| | [1, 2], |
| | [1, 3], |
| | [1, 4], |
| | [1, 5], |
| | [1, 6], |
| | [1, 7], |
| | [1, 8], |
| | [2, 4], |
| | [2, 3], |
| | [2, 2], |
| | [2, 1], |
| | [3, 1], |
| | [3, 2], |
| | [4, 1], |
| | [4, 2], |
| | [5, 1], |
| | [6, 1], |
| | [7, 1], |
| | [8, 1], |
| | ], |
| | ): |
| | """ |
| | Process a list of images. |
| | |
| | Args: |
| | images (list): List of PIL.Image objects. |
| | max_image_size (int, optional): Override the default max image size. Defaults to None. |
| | return_tensors (str or TensorType, optional): The type of tensor to return. Defaults to "pt". |
| | split_image (bool, optional): Whether to split the image. Defaults to False. |
| | split_ratio (list, optional): The ratio for splitting the image. Defaults to a list of common split ratios. |
| | Returns: |
| | BatchFeature: A BatchFeature object containing: |
| | - 'pixel_values': Tensor of processed image pixel values. |
| | - 'pixel_mask': Boolean pixel mask. This mask is a 2D tensor of shape (max_size, max_size) where: |
| | - True (1) values indicate pixels that belong to the original resized image. |
| | - False (0) values indicate pixels that are part of the padding. |
| | The mask helps distinguish between actual image content and padded areas in subsequent processing steps. |
| | - 'num_crops': Tensor of the number of crops for each image. |
| | """ |
| | max_size = self.max_image_size if max_image_size is None else max_image_size |
| | min_size = self.min_image_size if min_image_size is None else min_image_size |
| |
|
| | if max_size not in [490, 980]: |
| | raise ValueError("max_image_size must be either 490 or 980") |
| |
|
| | if isinstance(images, Image.Image): |
| | images = [images] |
| |
|
| | pixel_values = [] |
| | pixel_masks = [] |
| | num_crops = [] |
| |
|
| | for image in images: |
| | crop_images = _split_image(image, split_image, split_ratio, max_size) |
| | num_crops.append(torch.tensor(len(crop_images))) |
| | for crop_image in crop_images: |
| | img_padded, pixel_mask = keep_ratio_resize_and_pixel_mask( |
| | crop_image, max_size, min_size |
| | ) |
| | img_padded = self.transform(img_padded) |
| | pixel_values.append(img_padded) |
| | pixel_masks.append(pixel_mask) |
| |
|
| | return BatchFeature( |
| | data={ |
| | "pixel_values": torch.stack(pixel_values), |
| | "pixel_mask": torch.stack(pixel_masks), |
| | "num_crops": torch.stack(num_crops), |
| | }, |
| | tensor_type=return_tensors, |
| | ) |
| |
|
| | def preprocess( |
| | self, |
| | images, |
| | max_image_size=None, |
| | min_image_size=None, |
| | return_tensors: Optional[Union[str, TensorType]] = None, |
| | split_image: Optional[bool] = False, |
| | split_ratio: Optional[List[List[int]]] = [ |
| | [1, 2], |
| | [1, 3], |
| | [1, 4], |
| | [1, 5], |
| | [1, 6], |
| | [1, 7], |
| | [1, 8], |
| | [2, 4], |
| | [2, 3], |
| | [2, 2], |
| | [2, 1], |
| | [3, 1], |
| | [3, 2], |
| | [4, 1], |
| | [4, 2], |
| | [5, 1], |
| | [6, 1], |
| | [7, 1], |
| | [8, 1], |
| | ], |
| | ): |
| | return self.__call__( |
| | images, |
| | max_image_size=max_image_size, |
| | min_image_size=min_image_size, |
| | return_tensors=return_tensors, |
| | split_image=split_image, |
| | split_ratio=split_ratio, |
| | ) |
| |
|