SkalskiP committed
Commit baea9b2
1 Parent(s): 76abf0b

initial commit with Florence-2

Files changed (6)
  1. .gitignore +2 -0
  2. app.py +74 -0
  3. requirements-local.txt +9 -0
  4. requirements.txt +8 -0
  5. utils/__init__.py +0 -0
  6. utils/florence.py +55 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ /venv
+ /.idea
app.py ADDED
@@ -0,0 +1,74 @@
+ from typing import Tuple
+
+ import gradio as gr
+ import supervision as sv
+ import torch
+ from PIL import Image
+
+ from utils.florence import load_model, run_inference, FLORENCE_DETAILED_CAPTION_TASK, \
+     FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK
+
+ MARKDOWN = """
+ # Florence-2 + SAM2 🔥
+ """
+
+ DEVICE = torch.device("cuda")
+
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_model(device=DEVICE)
+ BOX_ANNOTATOR = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+ LABEL_ANNOTATOR = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+
+ def process(
+     image_input,
+ ) -> Tuple[Image.Image, str]:
+     _, result = run_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_DETAILED_CAPTION_TASK
+     )
+     caption = result[FLORENCE_DETAILED_CAPTION_TASK]
+     _, result = run_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
+         text=caption
+     )
+     detections = sv.Detections.from_lmm(
+         lmm=sv.LMM.FLORENCE_2,
+         result=result,
+         resolution_wh=image_input.size
+     )
+
+     output_image = image_input.copy()
+     output_image = BOX_ANNOTATOR.annotate(output_image, detections)
+     output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
+     return output_image, caption
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(
+                 type='pil', label='Upload image')
+             submit_button_component = gr.Button(value='Submit', variant='primary')
+
+         with gr.Column():
+             image_output_component = gr.Image(type='pil', label='Image output')
+             text_output_component = gr.Textbox(label='Caption output')
+
+     submit_button_component.click(
+         fn=process,
+         inputs=[image_input_component],
+         outputs=[
+             image_output_component,
+             text_output_component
+         ]
+     )
+
+ demo.launch(debug=False, show_error=True, max_threads=1)
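
Note: app.py pins DEVICE to torch.device("cuda"), so the demo expects GPU hardware, as it gets on the Space. For a purely local dry run without CUDA, a minimal fallback sketch is shown below; this is a local-only tweak and not part of this commit.

import torch

# Sketch only, not part of this commit: use CUDA when available, otherwise CPU.
# Florence-2-base should still run on CPU, just noticeably slower.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")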
requirements-local.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ einops
+ spaces
+ timm
+ transformers
+ samv2
+ gradio
+ supervision
+ opencv-python
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ einops
+ spaces
+ timm
+ transformers
+ samv2
+ gradio
+ supervision
+ opencv-python
utils/__init__.py ADDED
File without changes
utils/florence.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ from typing import Union, Any, Tuple, Dict
+ from unittest.mock import patch
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from transformers.dynamic_module_utils import get_imports
+
+ FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+ FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
+
+
+ def load_model(
+     device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
+ ) -> Tuple[Any, Any]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         model = AutoModelForCausalLM.from_pretrained(
+             checkpoint, trust_remote_code=True).to(device).eval()
+         processor = AutoProcessor.from_pretrained(
+             checkpoint, trust_remote_code=True)
+         return model, processor
+
+
+ def run_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image.Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
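
For reference, a minimal usage sketch of utils/florence.py on its own, mirroring the calls made in app.py. The file path example.jpg is a placeholder and a CUDA device is assumed; neither is part of this commit.

import torch
from PIL import Image

from utils.florence import load_model, run_inference, FLORENCE_DETAILED_CAPTION_TASK

device = torch.device("cuda")  # assumed; swap for "cpu" if no GPU is present
model, processor = load_model(device=device)  # microsoft/Florence-2-base by default

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
_, response = run_inference(
    model=model,
    processor=processor,
    device=device,
    image=image,
    task=FLORENCE_DETAILED_CAPTION_TASK,
)
print(response[FLORENCE_DETAILED_CAPTION_TASK])  # the detailed caption string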