Sujit Pal committed
Commit 6d88167
Parent(s): c0c0d12

fix: changes based on evaluation
- app.py +11 -4
- dashboard_featurefinder.py +85 -47
- dashboard_image2image.py +121 -36
- dashboard_text2image.py +39 -11
- demo-images/Acopulco-Bay.jpg +0 -0
- demo-images/Eagle-Bay-Coastline.jpg +0 -0
- demo-images/Forest-with-River.jpg +0 -0
- demo-images/Highway-through-Forest.jpg +0 -0
- demo-images/Multistoreyed-Buildings.jpg +0 -0
- demo-images/St-Tropez-Port.jpg +0 -0
- demo-images/Street-View-Malayasia.jpg +0 -0
- requirements.txt +1 -0
app.py
CHANGED
@@ -5,12 +5,19 @@ import dashboard_featurefinder
 import streamlit as st
 
 PAGES = {
-    "… (truncated in the source view)
-    "… (truncated in the source view)
-    "Feature in Image": dashboard_featurefinder,
+    "Retrieve Images given Text": dashboard_text2image,
+    "Retrieve Images given Image": dashboard_image2image,
+    "Find Feature in Image": dashboard_featurefinder,
 }
-st.sidebar.title("… (truncated in the source view)
+st.sidebar.title("CLIP-RSICD")
 
+st.sidebar.markdown("""
+    The CLIP model from OpenAI is trained in a self-supervised manner using
+    contrastive learning to project images and caption text onto a common
+    embedding space. We have fine-tuned the model (see [Model card](https://huggingface.co/flax-community/clip-rsicd-v2))
+    using the [RSICD dataset](https://github.com/201528014227051/RSICD_optimal).
+    Click here for [more information about our project](https://github.com/arampacha/CLIP-rsicd).
+""")
 selection = st.sidebar.radio("Go to", list(PAGES.keys()))
 page = PAGES[selection]
 page.app()
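Note: the PAGES dispatch above expects each dashboard module to expose a module-level app() callable. A minimal sketch of a compatible page module (illustrative only, not part of this commit):

import streamlit as st


def app():
    # called by app.py when this page is selected in the sidebar radio
    st.title("Example Page")
    st.write("Page content goes here.")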
dashboard_featurefinder.py
CHANGED
@@ -4,6 +4,7 @@ import matplotlib.pyplot as plt
 import nmslib
 import numpy as np
 import os
+import requests
 import streamlit as st
 
 from tempfile import NamedTemporaryFile
@@ -61,17 +62,38 @@ def get_image_ranks(probs):
     return ranks
 
 
+def download_and_prepare_image(image_url):
+    """
+    Take input image and resize it to 672x896
+    """
+    try:
+        image_raw = requests.get(image_url, stream=True,).raw
+        image = Image.open(image_raw).convert("RGB")
+        width, height = image.size
+        # print("WID,HGT:", width, height)
+        if width < 224 or height < 224:
+            return None
+        # take the short edge and reduce to 672
+        if width < height:
+            resize_factor = 672 / width
+            image = image.resize((672, int(height * resize_factor)))
+            image = image.crop((0, 0, 672, 896))
+        else:
+            resize_factor = 672 / height
+            image = image.resize((int(width * resize_factor), 896))
+            image = image.crop((0, 0, 896, 672))
+        return np.asarray(image)
+    except Exception as e:
+        # print(e)
+        return None
+
+
 def app():
     model, processor = utils.load_model(MODEL_PATH, BASELINE_MODEL)
 
     st.title("Find Features in Images")
     st.markdown("""
-    The CLIP model from OpenAI is trained in a self-supervised manner using
-    contrastive learning to project images and caption text onto a common
-    embedding space. We have fine-tuned the model (see [Model card](https://huggingface.co/flax-community/clip-rsicd-v2))
-    using the RSICD dataset (10k images and ~50k captions from the remote
-    sensing domain). Click here for [more information about our project](https://github.com/arampacha/CLIP-rsicd).
-
     This demo shows the ability of the model to find specific features
     (specified as text queries) in the image. As an example, say you wish to
    find the parts of the following image that contain a `beach`, `houses`,
@@ -92,46 +114,62 @@ def app():
     for features that you can ask the model to identify.
     """)
     # buf = st.file_uploader("Upload Image for Analysis", type=["png", "jpg"])
-    image_file = st.selectbox(
-    … (10 removed lines truncated in the source view)
+    image_file = st.selectbox(
+        "Sample Image File",
+        options=[
+            "-- select one --",
+            "St-Tropez-Port.jpg",
+            "Acopulco-Bay.jpg",
+            "Highway-through-Forest.jpg",
+            "Forest-with-River.jpg",
+            "Eagle-Bay-Coastline.jpg",
+            "Multistoreyed-Buildings.jpg",
+            "Street-View-Malayasia.jpg",
+        ])
+    image_url = st.text_input(
+        "OR provide an image URL",
+        value="https://static.eos.com/wp-content/uploads/2019/04/Main.jpg")
+    searched_feature = st.text_input("Feature to find", value="beach")
 
     if st.button("Find"):
-        … (30 removed lines truncated in the source view)
+        # print("image_file:", image_file)
+        # print("image_url:", image_url)
+        if image_file.startswith("--"):
+            image = download_and_prepare_image(image_url)
+        else:
+            image = plt.imread(os.path.join("demo-images", image_file))
+
+        if image is None:
+            st.error("Image could not be downloaded, please try another one")
+        else:
+            st.image(image, caption="Input Image")
+            st.markdown("---")
+            # print("image.shape:", image.shape)
+            num_rows, num_cols, patches = split_image(image)
+            # print("num_rows, num_cols, num(patches:", num_rows, num_cols, len(patches), patches[0].shape)
+            image_preprocessor = Compose([
+                ToPILImage(),
+                Resize(224)
+            ])
+            num_rows, num_cols, patches = split_image(image)
+            patch_probs = get_patch_probabilities(
+                patches,
+                searched_feature,
+                image_preprocessor,
+                model,
+                processor)
+            patch_ranks = get_image_ranks(patch_probs)
+            pid = 0
+            for i in range(num_rows):
+                cols = st.beta_columns(num_cols)
+                for col in cols:
+                    caption = "#{:d} p({:s})={:.3f}".format(
+                        patch_ranks[pid] + 1, searched_feature, patch_probs[pid])
+                    col.image(patches[pid], caption=caption)
+                    pid += 1
+            # row_patches = patches[i * num_cols : (i + 1) * num_cols]
+            # row_probs = patch_probs[i * num_cols : (i + 1) * num_cols]
+            # row_ranks = patch_ranks[i * num_cols : (i + 1) * num_cols]
+            # captions = ["p({:s})={:.3f}, rank={:d}".format(searched_feature, p, r + 1)
+            #             for p, r in zip(row_probs, row_ranks)]
+            # st.image(row_patches, caption=captions)
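Note: download_and_prepare_image() above resizes the input so it divides evenly into 224x224 tiles (672 and 896 are 3 and 4 multiples of 224), which is presumably what split_image() — referenced in the diff but not changed by this commit — produces for get_patch_probabilities(). A hypothetical sketch of such a patch splitter, for illustration only:

import numpy as np

def split_image(image, patch_size=224):
    # hypothetical sketch of the split_image() helper referenced in the diff
    # (its actual body is not part of this commit): tile the prepared
    # 672x896 or 896x672 array into non-overlapping 224x224 patches,
    # returning the grid shape and the patches in row-major order
    num_rows = image.shape[0] // patch_size
    num_cols = image.shape[1] // patch_size
    patches = []
    for r in range(num_rows):
        for c in range(num_cols):
            patches.append(image[r * patch_size:(r + 1) * patch_size,
                                 c * patch_size:(c + 1) * patch_size])
    return num_rows, num_cols, patches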
dashboard_image2image.py
CHANGED
@@ -2,6 +2,7 @@ import matplotlib.pyplot as plt
 import nmslib
 import numpy as np
 import os
+import requests
 import streamlit as st
 
 from PIL import Image
@@ -33,25 +34,48 @@ def load_example_images():
             example_images[image_class].append(image_name)
         else:
             example_images[image_class] = [image_name]
-    … (removed line truncated in the source view)
+    example_image_list = sorted([v[np.random.randint(0, len(v))]
+                                 for k, v in example_images.items()][0:10])
+    return example_image_list
+
 
+def get_image_thumbnail(image_filename):
+    image = Image.open(os.path.join(IMAGES_DIR, image_filename))
+    image = image.resize((100, 100))
+    return image
+
+
+def download_and_prepare_image(image_url):
+    try:
+        image_raw = requests.get(image_url, stream=True,).raw
+        image = Image.open(image_raw).convert("RGB")
+        width, height = image.size
+        # print("width, height:", width, height)
+        resize_mult = width / 224 if width < height else height / 224
+        # print("resize_mult:", resize_mult)
+        # print("resize:", width // resize_mult, height // resize_mult)
+        image = image.resize((int(width // resize_mult),
+                              int(height // resize_mult)))
+        width, height = image.size
+        left = int((width - 224) // 2)
+        top = int((height - 224) // 2)
+        right = int((width + 224) // 2)
+        bottom = int((height + 224) // 2)
+        # print("LTRB:", left, top, right, bottom)
+        image = image.crop((left, top, right, bottom))
+        return image
+    except Exception as e:
+        # print(e)
+        return None
 
 def app():
     filenames, index = utils.load_index(IMAGE_VECTOR_FILE)
     model, processor = utils.load_model(MODEL_PATH, BASELINE_MODEL)
 
-    … (removed line truncated in the source view)
-    example_image_list = sorted([v[np.random.randint(0, len(v))]
-                                 for k, v in example_images.items()][0:10])
+    example_image_list = load_example_images()
 
-    st.title("… (truncated in the source view)
+    st.title("Retrieve Images given Images")
     st.markdown("""
-    The CLIP model from OpenAI is trained in a self-supervised manner using
-    contrastive learning to project images and caption text onto a common
-    embedding space. We have fine-tuned the model (see [Model card](https://huggingface.co/flax-community/clip-rsicd-v2))
-    using the RSICD dataset (10k images and ~50k captions from the remote
-    sensing domain). Click here for [more information about our project](https://github.com/arampacha/CLIP-rsicd).
-
     This demo shows the image to image retrieval capabilities of this model, i.e.,
     given an image file name as a query, we use our fine-tuned CLIP model
     to project the query image to the image/caption embedding space and search
@@ -60,31 +84,92 @@ def app():
     Our fine-tuned CLIP model was previously used to generate image vectors for
     our demo, and NMSLib was used for fast vector access.
 
-    Here are some randomly generated image files from our corpus
-    … (3 removed lines truncated in the source view)
+    Here are some randomly generated image files from our corpus, that you can
+    find similar images for by selecting the button below it. Alternatively you
+    can upload your own image from the Internet.
+    """)
+
+    suggest_idx = -1
+    col0, col1, col2, col3, col4 = st.beta_columns(5)
+    col0.image(get_image_thumbnail(example_image_list[0]))
+    col1.image(get_image_thumbnail(example_image_list[1]))
+    col2.image(get_image_thumbnail(example_image_list[2]))
+    col3.image(get_image_thumbnail(example_image_list[3]))
+    col4.image(get_image_thumbnail(example_image_list[4]))
+    col0t, col1t, col2t, col3t, col4t = st.beta_columns(5)
+    with col0t:
+        if st.button("Image-1"):
+            suggest_idx = 0
+    with col1t:
+        if st.button("Image-2"):
+            suggest_idx = 1
+    with col2t:
+        if st.button("Image-3"):
+            suggest_idx = 2
+    with col3t:
+        if st.button("Image-4"):
+            suggest_idx = 3
+    with col4t:
+        if st.button("Image-5"):
+            suggest_idx = 4
+    col5, col6, col7, col8, col9 = st.beta_columns(5)
+    col5.image(get_image_thumbnail(example_image_list[5]))
+    col6.image(get_image_thumbnail(example_image_list[6]))
+    col7.image(get_image_thumbnail(example_image_list[7]))
+    col8.image(get_image_thumbnail(example_image_list[8]))
+    col9.image(get_image_thumbnail(example_image_list[9]))
+    col5t, col6t, col7t, col8t, col9t = st.beta_columns(5)
+    with col5t:
+        if st.button("Image-6"):
+            suggest_idx = 5
+    with col6t:
+        if st.button("Image-7"):
+            suggest_idx = 6
+    with col7t:
+        if st.button("Image-8"):
+            suggest_idx = 7
+    with col8t:
+        if st.button("Image-9"):
+            suggest_idx = 8
+    with col9t:
+        if st.button("Image-10"):
+            suggest_idx = 9
 
-    … (removed line truncated in the source view)
+    image_url = st.text_input(
+        "OR provide an image URL",
+        value="https://media.wired.com/photos/5a8c80647b7bd44d86b88077/master/w_2240,c_limit/Satellite-FINAL.jpg")
+
     submit_button = st.button("Find Similar")
+
+    if submit_button or suggest_idx > -1:
+        image_name = None
+        if suggest_idx > -1:
+            image_name = example_image_list[suggest_idx]
+            image = Image.fromarray(plt.imread(os.path.join(IMAGES_DIR, image_name)))
+        else:
+            image = download_and_prepare_image(image_url)
+        st.image(image, caption="Input Image")
+        st.markdown("---")
 
-    … (20 removed lines truncated in the source view)
+        if image is None:
+            st.error("Image could not be downloaded, please try another one!")
+        else:
+            inputs = processor(images=image, return_tensors="jax", padding=True)
+            query_vec = model.get_image_features(**inputs)
+            query_vec = np.asarray(query_vec)
+            ids, distances = index.knnQuery(query_vec, k=11)
+            result_filenames = [filenames[id] for id in ids]
+            images, captions = [], []
+            for result_filename, score in zip(result_filenames, distances):
+                if image_name is not None and result_filename == image_name:
+                    continue
+                images.append(
+                    plt.imread(os.path.join(IMAGES_DIR, result_filename)))
+                captions.append("{:s} (score: {:.3f})".format(result_filename, 1.0 - score))
+            images = images[0:10]
+            captions = captions[0:10]
+            st.image(images[0:3], caption=captions[0:3])
+            st.image(images[3:6], caption=captions[3:6])
+            st.image(images[6:9], caption=captions[6:9])
+            st.image(images[9:], caption=captions[9:])
+        suggest_idx = -1
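Note: the index returned by utils.load_index() is queried above with nmslib's knnQuery(), which returns neighbor ids and distances. A hedged sketch of how such an index over precomputed CLIP image vectors might be built and queried (the repo's actual utils.load_index() may differ, e.g. in the distance space used):

import nmslib
import numpy as np

def build_image_index(image_vectors):
    # sketch under the assumption of a cosine-distance HNSW index;
    # image_vectors is a (num_images, embed_dim) float32 array of CLIP features
    index = nmslib.init(method="hnsw", space="cosinesimil")
    index.addDataPointBatch(image_vectors)
    index.createIndex({"post": 2}, print_progress=False)
    return index

def find_similar(index, filenames, query_vec, k=11):
    # knnQuery returns ids into filenames and distances (1 - cosine similarity),
    # which is why the dashboard displays 1.0 - score as the similarity
    ids, distances = index.knnQuery(query_vec, k=k)
    return [(filenames[i], 1.0 - d) for i, d in zip(ids, distances)]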
dashboard_text2image.py
CHANGED
@@ -24,14 +24,8 @@ def app():
     filenames, index = utils.load_index(IMAGE_VECTOR_FILE)
     model, processor = utils.load_model(MODEL_PATH, BASELINE_MODEL)
 
-    st.title("… (truncated in the source view)
+    st.title("Retrieve Images given Text")
     st.markdown("""
-    The CLIP model from OpenAI is trained in a self-supervised manner using
-    contrastive learning to project images and caption text onto a common
-    embedding space. We have fine-tuned the model (see [Model card](https://huggingface.co/flax-community/clip-rsicd-v2))
-    using the RSICD dataset (10k images and ~50k captions from the remote
-    sensing domain). Click here for [more information about our project](https://github.com/arampacha/CLIP-rsicd).
-
     This demo shows the image to text retrieval capabilities of this model, i.e.,
     given a text query, we use our fine-tuned CLIP model to project the text query
     to the image/caption embedding space and search for nearby images (by
@@ -40,12 +34,45 @@ def app():
     Our fine-tuned CLIP model was previously used to generate image vectors for
     our demo, and NMSLib was used for fast vector access.
 
-    Some suggested queries to start you off with -- `ships`, `school house`,
-    `military installations`, `mountains`, `beaches`, `airports`, `lakes`, etc.
     """)
+    suggested_query = [
+        "ships",
+        "school house",
+        "military installation",
+        "mountains",
+        "beaches",
+        "airports",
+        "lakes"
+    ]
+    st.text("Some suggested queries to start you off with...")
+    col0, col1, col2, col3, col4, col5, col6 = st.beta_columns(7)
+    # [1, 1.1, 1.3, 1.1, 1, 1, 1])
+    suggest_idx = -1
+    with col0:
+        if st.button(suggested_query[0]):
+            suggest_idx = 0
+    with col1:
+        if st.button(suggested_query[1]):
+            suggest_idx = 1
+    with col2:
+        if st.button(suggested_query[2]):
+            suggest_idx = 2
+    with col3:
+        if st.button(suggested_query[3]):
+            suggest_idx = 3
+    with col4:
+        if st.button(suggested_query[4]):
+            suggest_idx = 4
+    with col5:
+        if st.button(suggested_query[5]):
+            suggest_idx = 5
+    with col6:
+        if st.button(suggested_query[6]):
+            suggest_idx = 6
+    query = st.text_input("OR enter a text Query:")
+    query = suggested_query[suggest_idx] if suggest_idx > -1 else query
 
-    … (removed line truncated in the source view)
-    if st.button("Query"):
+    if st.button("Query") or suggest_idx > -1:
         inputs = processor(text=[query], images=None, return_tensors="jax", padding=True)
         query_vec = model.get_text_features(**inputs)
         query_vec = np.asarray(query_vec)
@@ -60,3 +87,4 @@ def app():
         st.image(images[3:6], caption=captions[3:6])
         st.image(images[6:9], caption=captions[6:9])
         st.image(images[9:], caption=captions[9:])
+        suggest_idx = -1
demo-images/Acopulco-Bay.jpg
CHANGED
demo-images/Eagle-Bay-Coastline.jpg
CHANGED
demo-images/Forest-with-River.jpg
CHANGED
demo-images/Highway-through-Forest.jpg
CHANGED
demo-images/Multistoreyed-Buildings.jpg
CHANGED
demo-images/St-Tropez-Port.jpg
CHANGED
demo-images/Street-View-Malayasia.jpg
CHANGED
requirements.txt
CHANGED
@@ -6,3 +6,4 @@ jaxlib
 flax
 torch==1.9.0
 torchvision==0.10.0
+requests