"""Find near-duplicate images in a directory using ResNet50 feature embeddings.

Each image is embedded into a 2048-d vector (ResNet50, avg-pooled, no top);
images whose pairwise cosine similarity exceeds a threshold are reported as
duplicates.
"""

import os
import sys

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image

# Only files with these extensions are embedded; anything else in the
# directory (subdirs, .DS_Store, etc.) is skipped instead of crashing load_img.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}

# Pre-trained ResNet50 without the classification head; pooling='avg' yields
# one fixed-length feature vector per image.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')


def extract_features(img_path, model):
    """Return a flat 1-D feature vector for the image at ``img_path``.

    Args:
        img_path: Path to an image file readable by Keras ``load_img``.
        model: A Keras model producing one feature row per input image.

    Returns:
        1-D ``np.ndarray`` — the flattened model output for this image.
    """
    img = image.load_img(img_path, target_size=(224, 224))  # ResNet50 input size
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)  # add batch dimension
    img_data = preprocess_input(img_data)
    # verbose=0: suppress the per-call progress bar predict() prints by default
    features = model.predict(img_data, verbose=0)
    return features.flatten()


def find_duplicates(image_dir, threshold=0.9):
    """Count and list near-duplicate images in ``image_dir``.

    Args:
        image_dir: Directory to scan (non-recursive).
        threshold: Cosine-similarity cutoff above which a pair counts as
            duplicate.

    Returns:
        Tuple ``(count, duplicates)`` where ``duplicates`` is the set of
        filenames considered duplicates of an earlier-listed file.
    """
    image_features = {}
    for img_file in sorted(os.listdir(image_dir)):
        img_path = os.path.join(image_dir, img_file)
        # Skip subdirectories and non-image files instead of crashing on them.
        if not os.path.isfile(img_path):
            continue
        if os.path.splitext(img_file)[1].lower() not in IMAGE_EXTENSIONS:
            continue
        image_features[img_file] = extract_features(img_path, model)

    file_list = list(image_features.keys())
    num_images = len(file_list)
    if num_images < 2:
        # Zero or one image: nothing can be a duplicate.
        return 0, set()

    # One vectorized call computes the full pairwise similarity matrix —
    # replaces O(n^2) separate Python-level cosine_similarity calls.
    feature_matrix = np.stack(list(image_features.values()))
    similarity_matrix = cosine_similarity(feature_matrix)

    duplicates = set()
    for i in range(num_images):
        for j in range(i + 1, num_images):
            if similarity_matrix[i][j] > threshold:
                duplicates.add(file_list[j])

    return len(duplicates), duplicates


if __name__ == "__main__":
    image_dir = sys.argv[1] if len(sys.argv) > 1 else './images'
    threshold = float(sys.argv[2]) if len(sys.argv) > 2 else 0.9
    count, duplicates = find_duplicates(image_dir, threshold)
    print(f"Duplicate Images Count: {count}")
    for duplicate in duplicates:
        print(duplicate)