|
import numpy as np |
|
from tensorflow.keras.applications import ResNet50 |
|
from tensorflow.keras.preprocessing import image |
|
from tensorflow.keras.applications.resnet50 import preprocess_input |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import os |
|
|
|
|
|
# Shared feature extractor used by the functions below: ResNet50 built with
# ImageNet weights, without its top layers, and with 'avg' pooling.
model = ResNet50(
    include_top=False,
    pooling='avg',
    weights='imagenet',
)
|
|
|
|
|
|
|
def extract_features(img_path, model):
    """Return a flat feature vector for the image stored at *img_path*.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, passed through ``preprocess_input`` and then through
    ``model.predict``. The prediction is flattened before it is returned.

    Args:
        img_path: Path of the image file to load.
        model: Object exposing ``predict`` that maps the preprocessed batch
            to features.

    Returns:
        The flattened output of ``model.predict`` for this single image.
    """
    pil_img = image.load_img(img_path, target_size=(224, 224))
    pixels = image.img_to_array(pil_img)
    # Add the batch axis in front: predict() is fed a batch of one image.
    batch = preprocess_input(pixels[np.newaxis, ...])
    return model.predict(batch).flatten()
|
|
|
|
|
|
|
def find_duplicates(image_dir, threshold=0.9):
    """Find images in *image_dir* whose features are near-identical.

    Every regular file directly inside *image_dir* is passed to
    ``extract_features``. For each pair of images whose cosine similarity
    is strictly greater than *threshold*, the file that comes later in
    sorted name order is reported as the duplicate.

    Args:
        image_dir: Directory to scan (not recursive).
        threshold: Similarity above which two images count as duplicates.

    Returns:
        A ``(count, duplicates)`` tuple where ``duplicates`` is the set of
        flagged file names and ``count`` is its size.

    Raises:
        OSError: If *image_dir* itself cannot be listed.
    """
    import warnings  # local import so the block does not depend on file-level changes

    file_list = []
    feature_list = []
    # sorted(): os.listdir returns entries in arbitrary order, and which
    # member of a similar pair is reported depends on that order.
    for img_file in sorted(os.listdir(image_dir)):
        img_path = os.path.join(image_dir, img_file)
        if not os.path.isfile(img_path):
            # Sub-directories and other non-files cannot be loaded as images.
            continue
        try:
            features = extract_features(img_path, model)
        except OSError as exc:
            # Unreadable or non-image files (presumably reported by the image
            # loader as OSError subclasses) should not abort the whole scan.
            warnings.warn(f"Skipping unreadable image {img_path!r}: {exc}")
            continue
        file_list.append(img_file)
        feature_list.append(features)

    num_images = len(file_list)
    if num_images < 2:
        # No pairs to compare; also avoids handing an empty matrix to sklearn.
        return 0, set()

    # One call yields the full pairwise matrix instead of one call per pair.
    similarity_matrix = cosine_similarity(np.vstack(feature_list))

    # Look only at the strict upper triangle (i < j) so each pair is seen
    # once and the self-similarity diagonal is ignored.
    rows, cols = np.triu_indices(num_images, k=1)
    flagged = cols[similarity_matrix[rows, cols] > threshold]
    duplicates = {file_list[j] for j in flagged}

    return len(duplicates), duplicates
|
|
|
|
|
if __name__ == "__main__":
    # Command line: [image_dir] [threshold]; both are optional and fall
    # back to './images' and 0.9 respectively.
    import sys

    cli_args = sys.argv[1:]
    image_dir = cli_args[0] if cli_args else './images'
    threshold = 0.9 if len(cli_args) < 2 else float(cli_args[1])

    count, duplicates = find_duplicates(image_dir, threshold)

    # Report the number of flagged files, then one file name per line.
    print(f"Duplicate Images Count: {count}")
    for duplicate in duplicates:
        print(duplicate)
|
|