update lightweight code
- app.py +10 -8
- app_w_sam.py +0 -139
- models/__pycache__/blip2_model.cpython-38.pyc +0 -0
- models/__pycache__/controlnet_model.cpython-38.pyc +0 -0
- models/__pycache__/gpt_model.cpython-38.pyc +0 -0
- models/__pycache__/grit_model.cpython-38.pyc +0 -0
- models/__pycache__/image_text_transformation.cpython-38.pyc +0 -0
- models/__pycache__/region_semantic.cpython-38.pyc +0 -0
- models/blip2_model.py +15 -10
- models/controlnet_model.py +4 -12
- models/gpt_model.py +1 -1
- models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc +0 -0
- models/grit_src/image_dense_captions.py +2 -0
- models/image_text_transformation.py +6 -7
- models/region_semantic.py +32 -10
- models/segment_models/__pycache__/edit_anything_model.cpython-38.pyc +0 -0
- models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc +0 -0
- models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc +0 -0
- models/segment_models/edit_anything_model.py +62 -0
- models/segment_models/semantic_segment_anything_model.py +2 -0
- models/segment_models/semgent_anything_model.py +11 -2
- pretrained_models/sam_vit_b_01ec64.pth +3 -0
- requirements.txt +1 -0
- utils/__pycache__/util.cpython-38.pyc +0 -0
- utils/image_dense_captions.py +0 -108
- utils/util.py +14 -1
app.py CHANGED
@@ -12,10 +12,13 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')
 parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
 parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
-parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=
-parser.add_argument('--
-parser.add_argument('--
-parser.add_argument('--
+parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=True, help='Set this flag to True if you want to use semantic segmentation')
+parser.add_argument('--sam_arch', choices=['vit_b', 'vit_l', 'vit_h'], dest='sam_arch', default='vit_b', help='vit_b is the default model (fast but not accurate), vit_l and vit_h are larger models')
+parser.add_argument('--captioner_base_model', choices=['blip', 'blip2'], dest='captioner_base_model', default='blip', help='blip2 requires 15G GPU memory, blip requires 6G GPU memory')
+parser.add_argument('--region_classify_model', choices=['ssa', 'edit_anything'], dest='region_classify_model', default='edit_anything', help='Select the region classification model: edit anything is ten times faster than ssa, but less accurate.')
+parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
+parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
+parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended. Make sue this model and image_caption model on same device.')
 parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, <6G GPU is not recommended>')
 
 args = parser.parse_args()
@@ -49,8 +52,7 @@ def process_image(image_src, options=None, processor=None):
     print(options)
     if options is None:
         options = []
-
-    processor.args.semantic_segment = False
+    processor.args.semantic_segment = "Semantic Segment" in options
     image_generation_status = "Image Generation" in options
     image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)
     if image_generation_status:
@@ -96,7 +98,7 @@ processor = ImageTextTransformation(args)
 
 # Create Gradio input and output components
 image_input = gr.inputs.Image(type='filepath', label="Input Image")
-
+semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
 image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)
 
 
@@ -120,7 +122,7 @@ interface = gr.Interface(
     inputs=[image_input,
             gr.CheckboxGroup(
                 label="Options",
-                choices=["Image Generation"],
+                choices=["Image Generation", "Semantic Segment"],
             ),
     ],
     outputs=gr.outputs.HTML(),
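Note: the new flags all land on the shared argparse namespace that ImageTextTransformation reads. As a rough, hedged sketch (not part of the commit; devices and the image path are illustrative), the same pipeline can be driven outside Gradio like this:

import argparse
from models.image_text_transformation import ImageTextTransformation

# Mirror the defaults added above; adjust devices to your hardware.
args = argparse.Namespace(
    gpt_version='gpt-3.5-turbo', image_caption=True, dense_caption=True,
    semantic_segment=True, sam_arch='vit_b', captioner_base_model='blip',
    region_classify_model='edit_anything', image_caption_device='cuda',
    dense_caption_device='cuda', semantic_segment_device='cuda',
    contolnet_device='cpu')

processor = ImageTextTransformation(args)
image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text("examples/test_4.jpg")
print(gen_text)
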
app_w_sam.py DELETED
@@ -1,139 +0,0 @@
-import gradio as gr
-import cv2
-import numpy as np
-from PIL import Image
-import base64
-from io import BytesIO
-from models.image_text_transformation import ImageTextTransformation
-import argparse
-import torch
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')
-parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
-parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
-parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=True, help='Set this flag to True if you want to use semantic segmentation')
-parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
-parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
-parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
-parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, <6G GPU is not recommended>')
-
-args = parser.parse_args()
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# device = "cpu"
-
-if device == "cuda":
-    args.image_caption_device = "cpu"
-    args.dense_caption_device = "cuda"
-    args.semantic_segment_device = "cuda"
-    args.contolnet_device = "cuda"
-else:
-    args.image_caption_device = "cpu"
-    args.dense_caption_device = "cpu"
-    args.semantic_segment_device = "cpu"
-    args.contolnet_device = "cpu"
-
-def pil_image_to_base64(image):
-    buffered = BytesIO()
-    image.save(buffered, format="JPEG")
-    img_str = base64.b64encode(buffered.getvalue()).decode()
-    return img_str
-
-def add_logo():
-    with open("examples/logo.png", "rb") as f:
-        logo_base64 = base64.b64encode(f.read()).decode()
-    return logo_base64
-
-def process_image(image_src, options=None, processor=None):
-    print(options)
-    if options is None:
-        options = []
-    processor.args.semantic_segment = "Semantic Segment" in options
-    image_generation_status = "Image Generation" in options
-    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)
-    if image_generation_status:
-        gen_image = processor.text_to_image(gen_text)
-        gen_image_str = pil_image_to_base64(gen_image)
-    # Combine the outputs into a single HTML output
-    custom_output = f'''
-    <h2>Image->Text:</h2>
-    <div style="display: flex; flex-wrap: wrap;">
-        <div style="flex: 1;">
-            <h3>Image Caption</h3>
-            <p>{image_caption}</p>
-        </div>
-        <div style="flex: 1;">
-            <h3>Dense Caption</h3>
-            <p>{dense_caption}</p>
-        </div>
-        <div style="flex: 1;">
-            <h3>Region Semantic</h3>
-            <p>{region_semantic}</p>
-        </div>
-    </div>
-    <div style="display: flex; flex-wrap: wrap;">
-        <div style="flex: 1;">
-            <h3>GPT4 Reasoning:</h3>
-            <p>{gen_text}</p>
-        </div>
-    </div>
-    '''
-    if image_generation_status:
-        custom_output += f'''
-        <h2>Text->Image:</h2>
-        <div style="display: flex; flex-wrap: wrap;">
-            <div style="flex: 1;">
-                <h3>Generated Image</h3>
-                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
-            </div>
-        </div>
-        '''
-    return custom_output
-
-processor = ImageTextTransformation(args)
-
-# Create Gradio input and output components
-image_input = gr.inputs.Image(type='filepath', label="Input Image")
-semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
-image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)
-
-
-extra_title = r'![vistors](https://visitor-badge.glitch.me/badge?page_id=fingerrec.Image2Paragraph)' + '\n' + \
-              r'[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-md-dark.svg)](https://huggingface.co/spaces/Awiny/Image2Paragraph?duplicate=true)' + '\n\n'
-
-
-
-logo_base64 = add_logo()
-# Create the title with the logo
-title_with_logo = \
-    f'<img src="data:image/jpeg;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'
-
-examples = [
-    ["examples/test_4.jpg"],
-]
-
-# Create Gradio interface
-interface = gr.Interface(
-    fn=lambda image, options: process_image(image, options, processor),
-    inputs=[image_input,
-            gr.CheckboxGroup(
-                label="Options",
-                choices=["Image Generation", "Semantic Segment"],
-            ),
-    ],
-    outputs=gr.outputs.HTML(),
-    title=title_with_logo,
-    examples=examples,
-    description=extra_title +"""
-    Image.txt. This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
-    \n Github: https://github.com/showlab/Image2Paragraph
-    \n Twitter: https://twitter.com/awinyimgprocess/status/1646225454599372800?s=46&t=HvOe9T2n35iFuCHP5aIHpQ
-    \n Since GPU is expensive, we use CPU for demo and not include semantic segment anything. Run code local with gpu or google colab we provided for fast speed.
-    \n Ttext2image model is controlnet ( very slow in cpu(~2m)), which used canny edge as reference.
-    \n To speed up, we generate image with small size 384, run the code local for high-quality sample.
-    """
-)
-
-# Launch the interface
-interface.launch()
models/__pycache__/blip2_model.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/blip2_model.cpython-38.pyc and b/models/__pycache__/blip2_model.cpython-38.pyc differ

models/__pycache__/controlnet_model.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/controlnet_model.cpython-38.pyc and b/models/__pycache__/controlnet_model.cpython-38.pyc differ

models/__pycache__/gpt_model.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/gpt_model.cpython-38.pyc and b/models/__pycache__/gpt_model.cpython-38.pyc differ

models/__pycache__/grit_model.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/grit_model.cpython-38.pyc and b/models/__pycache__/grit_model.cpython-38.pyc differ

models/__pycache__/image_text_transformation.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/image_text_transformation.cpython-38.pyc and b/models/__pycache__/image_text_transformation.cpython-38.pyc differ

models/__pycache__/region_semantic.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/region_semantic.cpython-38.pyc and b/models/__pycache__/region_semantic.cpython-38.pyc differ
models/blip2_model.py CHANGED
@@ -6,28 +6,33 @@ from utils.util import resize_long_edge
 
 
 class ImageCaptioning:
-    def __init__(self, device):
+    def __init__(self, device, captioner_base_model='blip'):
         self.device = device
+        self.captioner_base_model = captioner_base_model
         self.processor, self.model = self.initialize_model()
 
-    def initialize_model(self):
+    def initialize_model(self,):
         if self.device == 'cpu':
             self.data_type = torch.float32
         else:
             self.data_type = torch.float16
-
-
-
-
-
-
-
+        if self.captioner_base_model == 'blip2':
+            processor = Blip2Processor.from_pretrained("pretrained_models/blip2-opt-2.7b")
+            model = Blip2ForConditionalGeneration.from_pretrained(
+                "pretrained_models/blip2-opt-2.7b", torch_dtype=self.data_type
+            )
+        # for gpu with small memory
+        elif self.captioner_base_model == 'blip':
+            processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=self.data_type)
+        else:
+            raise ValueError('arch not supported')
         model.to(self.device)
         return processor, model
 
     def image_caption(self, image_src):
         image = Image.open(image_src)
-        image = resize_long_edge(image)
+        image = resize_long_edge(image, 384)
         inputs = self.processor(images=image, return_tensors="pt").to(self.device, self.data_type)
         generated_ids = self.model.generate(**inputs)
         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
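A minimal usage sketch of the captioner after this change (the image path is illustrative; 'blip' is the lighter default and pulls Salesforce/blip-image-captioning-base from the Hub):

from models.blip2_model import ImageCaptioning

captioner = ImageCaptioning(device='cuda', captioner_base_model='blip')
# The input is resized to a 384-pixel long edge before being fed to the model.
print(captioner.image_caption('examples/test_4.jpg'))
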
models/controlnet_model.py CHANGED
@@ -15,29 +15,21 @@ class TextToImage:
         self.model = self.initialize_model()
 
     def initialize_model(self):
-        if self.device == 'cpu':
-            self.data_type = torch.float32
-        else:
-            self.data_type = torch.float16
         controlnet = ControlNetModel.from_pretrained(
             "fusing/stable-diffusion-v1-5-controlnet-canny",
-            torch_dtype=
-
-        ).to(self.device)
+            torch_dtype=torch.float16,
+        )
         pipeline = StableDiffusionControlNetPipeline.from_pretrained(
-            # "pretrained_models/stable-diffusion-v1-5",
             "runwayml/stable-diffusion-v1-5",
             controlnet=controlnet,
             safety_checker=None,
-            torch_dtype=
-            map_location=self.device,  # Add this line
+            torch_dtype=torch.float16,
         )
         pipeline.scheduler = UniPCMultistepScheduler.from_config(
             pipeline.scheduler.config
         )
+        pipeline.enable_model_cpu_offload()
         pipeline.to(self.device)
-        if self.device != 'cpu':
-            pipeline.enable_model_cpu_offload()
         return pipeline
 
     @staticmethod
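For context, a minimal sketch of how a canny ControlNet pipeline like the one configured above is typically driven with diffusers (this call site is not part of the diff; the prompt and input image are illustrative):

import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, UniPCMultistepScheduler

controlnet = ControlNetModel.from_pretrained(
    "fusing/stable-diffusion-v1-5-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet,
    safety_checker=None, torch_dtype=torch.float16)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()  # keeps peak GPU memory low, as in the updated initialize_model

# Canny edges of the reference image condition the generation.
edges = cv2.Canny(cv2.imread("examples/test_4.jpg"), 100, 200)
canny_image = Image.fromarray(np.stack([edges] * 3, axis=-1))
out = pipe("a paragraph-style scene description", image=canny_image, num_inference_steps=20).images[0]
out.save("controlnet_out.jpg")
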
models/gpt_model.py CHANGED
@@ -17,7 +17,7 @@ class ImageToText:
         Use nouns rather than coordinates to show position information of each object.
         No more than 7 sentences.
         Only use one paragraph.
-        Describe position
+        Describe position of each object.
         Do not appear number.
         """
         template = f"{prompt_prefix_1}{prompt_prefix_2}{{width}}X{{height}}{prompt_prefix_3}{{caption}}{prompt_prefix_4}{{dense_caption}}{prompt_prefix_5}{{region_semantic}}{prompt_suffix}"
models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc CHANGED
Binary files a/models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc and b/models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc differ
models/grit_src/image_dense_captions.py CHANGED
@@ -16,6 +16,7 @@ from models.grit_src.grit.config import add_grit_config
 
 from models.grit_src.grit.predictor import VisualizationDemo
 import json
+from utils.util import resize_long_edge_cv2
 
 
 # constants
@@ -62,6 +63,7 @@ def image_caption_api(image_src, device):
     demo = VisualizationDemo(cfg)
     if image_src:
         img = read_image(image_src, format="BGR")
+        img = resize_long_edge_cv2(img, 384)
         predictions, visualized_output = demo.run_on_image(img)
     new_caption = dense_pred_to_caption(predictions)
     return new_caption
models/image_text_transformation.py CHANGED
@@ -3,13 +3,12 @@ from models.grit_model import DenseCaptioning
 from models.gpt_model import ImageToText
 from models.controlnet_model import TextToImage
 from models.region_semantic import RegionSemantic
-from utils.util import read_image_width_height, display_images_and_text
+from utils.util import read_image_width_height, display_images_and_text, resize_long_edge
 import argparse
 from PIL import Image
 import base64
 from io import BytesIO
 import os
-from utils.util import resize_long_edge
 
 def pil_image_to_base64(image):
     buffered = BytesIO()
@@ -27,23 +26,23 @@ class ImageTextTransformation:
 
     def init_models(self):
         openai_key = os.environ['OPENAI_KEY']
+        print(self.args)
         print('\033[1;34m' + "Welcome to the Image2Paragraph toolbox...".center(50, '-') + '\033[0m')
         print('\033[1;33m' + "Initializing models...".center(50, '-') + '\033[0m')
         print('\033[1;31m' + "This is time-consuming, please wait...".center(50, '-') + '\033[0m')
-        self.image_caption_model = ImageCaptioning(device=self.args.image_caption_device)
+        self.image_caption_model = ImageCaptioning(device=self.args.image_caption_device, captioner_base_model=self.args.captioner_base_model)
         self.dense_caption_model = DenseCaptioning(device=self.args.dense_caption_device)
         self.gpt_model = ImageToText(openai_key)
         self.controlnet_model = TextToImage(device=self.args.contolnet_device)
-
-        if self.args.semantic_segment:
-            self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device)
+        self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device, image_caption_model=self.image_caption_model, region_classify_model=self.args.region_classify_model, sam_arch=self.args.sam_arch)
         print('\033[1;32m' + "Model initialization finished!".center(50, '-') + '\033[0m')
 
 
     def image_to_text(self, img_src):
         # the information to generate paragraph based on the context
         self.ref_image = Image.open(img_src)
-
+        # resize image to long edge 384
+        self.ref_image = resize_long_edge(self.ref_image, 384)
         width, height = read_image_width_height(img_src)
         print(self.args)
         if self.args.image_caption:
models/region_semantic.py CHANGED
@@ -1,17 +1,27 @@
 from models.segment_models.semgent_anything_model import SegmentAnything
 from models.segment_models.semantic_segment_anything_model import SemanticSegment
+from models.segment_models.edit_anything_model import EditAnything
 
 
 class RegionSemantic():
-    def __init__(self, device):
+    def __init__(self, device, image_caption_model, region_classify_model='edit_anything', sam_arch='vit_b'):
         self.device = device
+        self.sam_arch = sam_arch
+        self.image_caption_model = image_caption_model
+        self.region_classify_model = region_classify_model
         self.init_models()
 
     def init_models(self):
-        self.segment_model = SegmentAnything(self.device)
-        self.
-
-
+        self.segment_model = SegmentAnything(self.device, arch=self.sam_arch)
+        if self.region_classify_model == 'ssa':
+            self.semantic_segment_model = SemanticSegment(self.device)
+        elif self.region_classify_model == 'edit_anything':
+            self.edit_anything_model = EditAnything(self.image_caption_model)
+            print('initalize edit anything model')
+        else:
+            raise ValueError("semantic_class_model must be 'ssa' or 'edit_anything'")
+
+    def semantic_prompt_gen(self, anns, topk=5):
         """
         fliter too small objects and objects with low stability score
         anns: [{'class_name': 'person', 'bbox': [0.0, 0.0, 0.0, 0.0], 'size': [0, 0], 'stability_score': 0.0}, ...]
@@ -19,20 +29,32 @@ class RegionSemantic():
         """
         # Sort annotations by area in descending order
         sorted_annotations = sorted(anns, key=lambda x: x['area'], reverse=True)
+        anns_len = len(sorted_annotations)
         # Select the top 10 largest regions
-        top_10_largest_regions = sorted_annotations[:
+        top_10_largest_regions = sorted_annotations[:min(anns_len, topk)]
         semantic_prompt = ""
-        print('\033[1;35m' + '*' * 100 + '\033[0m')
-        print("\nStep3, Semantic Prompt:")
        for region in top_10_largest_regions:
             semantic_prompt += region['class_name'] + ': ' + str(region['bbox']) + "; "
         print(semantic_prompt)
         print('\033[1;35m' + '*' * 100 + '\033[0m')
         return semantic_prompt
 
-    def region_semantic(self, img_src):
+    def region_semantic(self, img_src, region_classify_model='edit_anything'):
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
+        print("\nStep3, Semantic Prompt:")
+        print('extract region segmentation with SAM model....\n')
         anns = self.segment_model.generate_mask(img_src)
-
+        print('finished...\n')
+        if region_classify_model == 'ssa':
+            print('generate region supervision with blip2 model....\n')
+            anns_w_class = self.semantic_segment_model.semantic_class_w_mask(img_src, anns)
+            print('finished...\n')
+        elif region_classify_model == 'edit_anything':
+            print('generate region supervision with edit anything model....\n')
+            anns_w_class = self.edit_anything_model.semantic_class_w_mask(img_src, anns)
+            print('finished...\n')
+        else:
+            raise ValueError("semantic_class_model must be 'ssa' or 'edit_anything'")
         return self.semantic_prompt_gen(anns_w_class)
 
     def region_semantic_debug(self, img_src):
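For reference, a self-contained sketch of what the prompt construction above produces on toy annotations (class names, boxes, and areas are made up for illustration):

def semantic_prompt_gen(anns, topk=5):
    # Largest regions first, keep at most topk, then concatenate "class: [bbox]; ".
    sorted_annotations = sorted(anns, key=lambda x: x['area'], reverse=True)
    prompt = ""
    for region in sorted_annotations[:min(len(sorted_annotations), topk)]:
        prompt += region['class_name'] + ': ' + str(region['bbox']) + "; "
    return prompt

anns = [
    {'class_name': 'grass', 'bbox': [0, 0, 640, 480], 'area': 200000},
    {'class_name': 'a dog', 'bbox': [10, 20, 200, 150], 'area': 30000},
]
print(semantic_prompt_gen(anns))
# grass: [0, 0, 640, 480]; a dog: [10, 20, 200, 150];
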
models/segment_models/__pycache__/edit_anything_model.cpython-38.pyc ADDED
Binary file (3.62 kB).

models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc CHANGED
Binary files a/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc and b/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc differ

models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc CHANGED
Binary files a/models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc and b/models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc differ
models/segment_models/edit_anything_model.py ADDED
@@ -0,0 +1,62 @@
+import cv2
+import torch
+import mmcv
+import numpy as np
+from PIL import Image
+from utils.util import resize_long_edge
+from concurrent.futures import ThreadPoolExecutor
+import time
+
+class EditAnything:
+    def __init__(self, image_caption_model):
+        self.device = image_caption_model.device
+        self.data_type = image_caption_model.data_type
+        self.image_caption_model = image_caption_model
+
+    def region_classify_w_blip2(self, images):
+        inputs = self.image_caption_model.processor(images=images, return_tensors="pt").to(self.device, self.data_type)
+        generated_ids = self.image_caption_model.model.generate(**inputs)
+        generated_texts = self.image_caption_model.processor.batch_decode(generated_ids, skip_special_tokens=True)
+        return [text.strip() for text in generated_texts]
+
+    def process_ann(self, ann, image, target_size=(224, 224)):
+        start_time = time.time()
+        m = ann['segmentation']
+        m_3c = m[:, :, np.newaxis]
+        m_3c = np.concatenate((m_3c, m_3c, m_3c), axis=2)
+        bbox = ann['bbox']
+        region = mmcv.imcrop(image * m_3c, np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]), scale=1)
+        resized_region = mmcv.imresize(region, target_size)
+        end_time = time.time()
+        print("process_ann took {:.2f} seconds".format(end_time - start_time))
+        return resized_region, ann
+
+    def region_level_semantic_api(self, image, anns, topk=5):
+        """
+        rank regions by area, and classify each region with blip2, parallel processing for speed up
+        Args:
+            image: numpy array
+            topk: int
+        Returns:
+            topk_region_w_class_label: list of dict with key 'class_label'
+        """
+        start_time = time.time()
+        if len(anns) == 0:
+            return []
+        sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+        topk_anns = sorted_anns[:min(topk, len(sorted_anns))]
+        with ThreadPoolExecutor() as executor:
+            regions_and_anns = list(executor.map(lambda ann: self.process_ann(ann, image), topk_anns))
+        regions = [region for region, _ in regions_and_anns]
+        region_class_labels = self.region_classify_w_blip2(regions)
+        for (region, ann), class_label in zip(regions_and_anns, region_class_labels):
+            ann['class_name'] = class_label
+        end_time = time.time()
+        print("region_level_semantic_api took {:.2f} seconds".format(end_time - start_time))
+
+        return [ann for _, ann in regions_and_anns]
+
+    def semantic_class_w_mask(self, img_src, anns):
+        image = Image.open(img_src)
+        image = resize_long_edge(image, 384)
+        return self.region_level_semantic_api(image, anns)
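The per-region crop in process_ann works on a boolean SAM mask; a small standalone illustration of the same masking-and-cropping idea in plain NumPy (shapes and bbox are toy values, no mmcv needed):

import numpy as np

image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)  # H x W x 3 toy image
mask = np.zeros((480, 640), dtype=bool)
mask[100:250, 200:400] = True                                     # pretend SAM segmented this region
bbox = [200, 100, 200, 150]                                       # x, y, w, h, as in SAM's output

masked = image * mask[:, :, np.newaxis]                           # zero out everything outside the mask
x, y, w, h = bbox
region = masked[y:y + h, x:x + w]                                 # crop to the region's bounding box
print(region.shape)  # (150, 200, 3)
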
models/segment_models/semantic_segment_anything_model.py CHANGED
@@ -10,6 +10,7 @@ from PIL import Image
 import pycocotools.mask as maskUtils
 from models.segment_models.configs.ade20k_id2label import CONFIG as CONFIG_ADE20K_ID2LABEL
 from models.segment_models.configs.coco_id2label import CONFIG as CONFIG_COCO_ID2LABEL
+from utils.util import resize_long_edge, resize_long_edge_cv2
 # from mmdet.core.visualization.image import imshow_det_bboxes # comment this line if you don't use mmdet
 
 nlp = spacy.load('en_core_web_sm')
@@ -113,6 +114,7 @@ class SemanticSegment():
         :return: dict('segmentation', 'area', 'bbox', 'predicted_iou', 'point_coords', 'stability_score', 'crop_box', "class_name", "class_proposals"})
         """
         img = mmcv.imread(img_src)
+        img = resize_long_edge_cv2(img, 384)
         oneformer_coco_seg = self.oneformer_segmentation(Image.fromarray(img), self.oneformer_coco_processor, self.oneformer_coco_model)
         oneformer_ade20k_seg = self.oneformer_segmentation(Image.fromarray(img), self.oneformer_ade20k_processor, self.oneformer_ade20k_model)
         bitmasks, class_names = [], []
models/segment_models/semgent_anything_model.py CHANGED
@@ -1,10 +1,18 @@
 import cv2
 from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
-import
+from utils.util import resize_long_edge_cv2
 
 class SegmentAnything:
-    def __init__(self, device, arch="
+    def __init__(self, device, arch="vit_b"):
         self.device = device
+        if arch=='vit_b':
+            pretrained_weights="pretrained_models/sam_vit_b_01ec64.pth"
+        elif arch=='vit_l':
+            pretrained_weights="pretrained_models/sam_vit_l_0e2f7b.pth"
+        elif arch=='vit_h':
+            pretrained_weights="pretrained_models/sam_vit_h_0e2f7b.pth"
+        else:
+            raise ValueError(f"arch {arch} not supported")
         self.model = self.initialize_model(arch, pretrained_weights)
 
     def initialize_model(self, arch, pretrained_weights):
@@ -16,5 +24,6 @@ class SegmentAnything:
     def generate_mask(self, img_src):
         image = cv2.imread(img_src)
         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        image = resize_long_edge_cv2(image, 384)
         anns = self.model.generate(image)
         return anns
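Roughly how the updated class and the new EditAnything model are wired together downstream (a hedged sketch; device and image path are illustrative, and the vit_b checkpoint added in this commit must exist under pretrained_models/):

from models.blip2_model import ImageCaptioning
from models.segment_models.semgent_anything_model import SegmentAnything
from models.segment_models.edit_anything_model import EditAnything

captioner = ImageCaptioning(device='cuda', captioner_base_model='blip')
segmenter = SegmentAnything('cuda', arch='vit_b')        # vit_l / vit_h need their own checkpoints
edit_anything = EditAnything(captioner)                  # reuses the captioner's weights to label each region

anns = segmenter.generate_mask('examples/test_4.jpg')    # list of dicts with 'segmentation', 'bbox', 'area', ...
anns_w_class = edit_anything.semantic_class_w_mask('examples/test_4.jpg', anns)
print([ann['class_name'] for ann in anns_w_class])
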
pretrained_models/sam_vit_b_01ec64.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2df62732614e57411cdcf32a23ffdf28910380d03139ee0f4fcbe91eb8c912
+size 375042383
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+# This file only test on Linux
 --extra-index-url https://download.pytorch.org/whl
 torch==1.9.0+cu111
 torchvision==0.10.0+cu111
utils/__pycache__/util.cpython-38.pyc CHANGED
Binary files a/utils/__pycache__/util.cpython-38.pyc and b/utils/__pycache__/util.cpython-38.pyc differ
utils/image_dense_captions.py DELETED
@@ -1,108 +0,0 @@
-import argparse
-import multiprocessing as mp
-import os
-import time
-import cv2
-import tqdm
-import sys
-
-from detectron2.config import get_cfg
-from detectron2.data.detection_utils import read_image
-from detectron2.utils.logger import setup_logger
-
-sys.path.insert(0, 'third_party/CenterNet2/projects/CenterNet2/')
-from centernet.config import add_centernet_config
-from grit.config import add_grit_config
-
-from grit.predictor import VisualizationDemo
-import json
-
-
-# constants
-WINDOW_NAME = "GRiT"
-
-
-def dense_pred_to_caption(predictions):
-    boxes = predictions["instances"].pred_boxes if predictions["instances"].has("pred_boxes") else None
-    object_description = predictions["instances"].pred_object_descriptions.data
-    new_caption = ""
-    for i in range(len(object_description)):
-        new_caption += (object_description[i] + ": " + str([int(a) for a in boxes[i].tensor.cpu().detach().numpy()[0]])) + "; "
-    return new_caption
-
-def setup_cfg(args):
-    cfg = get_cfg()
-    if args.cpu:
-        cfg.MODEL.DEVICE="cpu"
-    add_centernet_config(cfg)
-    add_grit_config(cfg)
-    cfg.merge_from_file(args.config_file)
-    cfg.merge_from_list(args.opts)
-    # Set score_threshold for builtin models
-    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
-    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
-    if args.test_task:
-        cfg.MODEL.TEST_TASK = args.test_task
-    cfg.MODEL.BEAM_SIZE = 1
-    cfg.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False
-    cfg.USE_ACT_CHECKPOINT = False
-    cfg.freeze()
-    return cfg
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
-    parser.add_argument(
-        "--config-file",
-        default="",
-        metavar="FILE",
-        help="path to config file",
-    )
-    parser.add_argument("--cpu", action='store_true', help="Use CPU only.")
-    parser.add_argument(
-        "--image_src",
-        default="../examples/1.jpg",
-        help="Input json file include 'image' and 'caption'; "
-    )
-    # "/home/aiops/wangjp/Code/LLP/annotation/coco_karpathy_test_dense_caption.json", "/home/aiops/wangjp/Code/LLP/annotation/coco_karpathy_train_dense_caption.json"
-    parser.add_argument(
-        "--confidence-threshold",
-        type=float,
-        default=0.5,
-        help="Minimum score for instance predictions to be shown",
-    )
-    parser.add_argument(
-        "--test-task",
-        type=str,
-        default='',
-        help="Choose a task to have GRiT perform",
-    )
-    parser.add_argument(
-        "--opts",
-        help="Modify config options using the command-line 'KEY VALUE' pairs",
-        default=[],
-        nargs=argparse.REMAINDER,
-    )
-    return parser
-
-
-if __name__ == "__main__":
-    mp.set_start_method("spawn", force=True)
-    args = get_parser().parse_args()
-    setup_logger(name="fvcore")
-    logger = setup_logger()
-    logger.info("Arguments: " + str(args))
-
-    cfg = setup_cfg(args)
-    demo = VisualizationDemo(cfg)
-    if args.image_src:
-        img = read_image(args.image_src, format="BGR")
-        start_time = time.time()
-        predictions, visualized_output = demo.run_on_image(img)
-        new_caption = dense_pred_to_caption(predictions)
-        print(new_caption)
-
-    output_file = os.path.expanduser("~/grit_output.txt")
-    with open(output_file, 'w') as f:
-        f.write(new_caption)
-    # sys.exit(new_caption)
utils/util.py CHANGED
@@ -14,7 +14,6 @@ def read_image_width_height(image_path):
     width, height = image.size
     return width, height
 
-
 def resize_long_edge(image, target_size=384):
     # Calculate the aspect ratio
     width, height = image.size
@@ -32,6 +31,20 @@ def resize_long_edge(image, target_size=384):
     resized_image = image.resize((new_width, new_height), Image.ANTIALIAS)
     return resized_image
 
+def resize_long_edge_cv2(image, target_size=384):
+    height, width = image.shape[:2]
+    aspect_ratio = float(width) / float(height)
+
+    if height > width:
+        new_height = target_size
+        new_width = int(target_size * aspect_ratio)
+    else:
+        new_width = target_size
+        new_height = int(target_size / aspect_ratio)
+
+    resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
+    return resized_image
+
 def display_images_and_text(source_image_path, generated_image, generated_paragraph, outfile_name):
     source_image = Image.open(source_image_path)
     # Create a new image that can fit the images and the text
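A quick check of the new cv2 resize helper's behaviour (any local image works; the path is illustrative):

import cv2
from utils.util import resize_long_edge_cv2

image = cv2.imread('examples/test_4.jpg')
resized = resize_long_edge_cv2(image, target_size=384)
# The longer side becomes 384 pixels and the aspect ratio is preserved.
print(image.shape, '->', resized.shape)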