"""
Based upon ImageCaptionLoader in LangChain version: langchain/document_loaders/image_captions.py

But accepts a preloaded model to avoid slowness in use and CUDA forking issues.

Loader that uses Pix2Struct models to caption images.
"""

from typing import List, Union, Any, Tuple

from langchain.docstore.document import Document
from langchain.document_loaders import ImageCaptionLoader

from utils import get_device, clear_torch_cache
from PIL import Image

class H2OPix2StructLoader(ImageCaptionLoader):
    """Loader that captions images using a Pix2Struct model"""

    def __init__(self, path_images: Union[str, List[str]] = None, model_type="google/pix2struct-textcaps-base",
                 max_new_tokens=50):
        super().__init__(path_images)
        self._pix2struct_model = None
        self._pix2struct_processor = None
        self._model_type = model_type
        self._max_new_tokens = max_new_tokens

    def set_context(self):
        # Pick the torch device: use CUDA when a GPU is available, otherwise fall back to CPU
        if get_device() == 'cuda':
            import torch
            n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
            if n_gpus > 0:
                self.context_class = torch.device
                self.device = 'cuda'
            else:
                self.device = 'cpu'
        else:
            self.device = 'cpu'

    def load_model(self):
        try:
            from transformers import AutoProcessor, Pix2StructForConditionalGeneration
        except ImportError:
            raise ValueError(
                "`transformers` package not found, please install with "
                "`pip install transformers`."
            )
        if self._pix2struct_model:
            # Model already loaded (e.g. previously moved to CPU by unload_model); just move it back
            self._pix2struct_model = self._pix2struct_model.to(self.device)
            return self
        self.set_context()
        self._pix2struct_processor = AutoProcessor.from_pretrained(self._model_type)
        self._pix2struct_model = Pix2StructForConditionalGeneration.from_pretrained(self._model_type).to(self.device)
        return self

    def unload_model(self):
        # Move the model off the GPU and free cached CUDA memory
        if hasattr(self._pix2struct_model, 'cpu'):
            self._pix2struct_model.cpu()
            clear_torch_cache()

    def set_image_paths(self, path_images: Union[str, List[str]]):
        """
        Load from a list of image files
        """
        if isinstance(path_images, str):
            self.image_paths = [path_images]
        else:
            self.image_paths = path_images

    def load(self, prompt=None) -> List[Document]:
        if self._pix2struct_model is None:
            self.load_model()
        results = []
        for path_image in self.image_paths:
            caption, metadata = self._get_captions_and_metadata(
                processor=self._pix2struct_processor, model=self._pix2struct_model, path_image=path_image
            )
            doc = Document(page_content=caption, metadata=metadata)
            results.append(doc)

        return results

    def _get_captions_and_metadata(
            self, processor: Any, model: Any, path_image: str) -> Tuple[str, dict]:
        """
        Helper function for getting the captions and metadata of an image
        """
        try:
            image = Image.open(path_image)
        except Exception:
            raise ValueError(f"Could not get image data for {path_image}")
        inputs = processor(images=image, return_tensors="pt")
        inputs = inputs.to(self.device)
        generated_ids = model.generate(**inputs, max_new_tokens=self._max_new_tokens)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        metadata: dict = {"image_path": path_image}
        return generated_text, metadata
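

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the loader API). The image path
    # "example.png" is a hypothetical placeholder; substitute a real local image.
    loader = H2OPix2StructLoader(path_images=["example.png"])
    for doc in loader.load():
        print(doc.metadata["image_path"], "->", doc.page_content)
    # Free GPU memory once captions have been generated
    loader.unload_model()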