Dev Jethava committed on
Commit
1a7eb25
1 Parent(s): 1b2db0a

Add duplicate detector script

Browse files
Files changed (2) hide show
  1. .idea/.gitignore +8 -0
  2. duplicate_detector.py +64 -0
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
duplicate_detector.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import os

# Third-party
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image

# Pre-trained ResNet50 without the classification head; global average
# pooling turns each image into one fixed-length feature vector.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
10
+
11
+
12
# Function to extract feature vector from an image
def extract_features(img_path, model):
    """Return a flat ResNet50 feature vector for the image at *img_path*."""
    # Resize on load to ResNet50's expected 224x224 input.
    pil_img = image.load_img(img_path, target_size=(224, 224))
    # Add a leading batch axis so the model sees shape (1, 224, 224, 3).
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    # Apply the channel preprocessing ResNet50 was trained with,
    # then flatten the pooled output to a 1-D vector.
    return model.predict(preprocess_input(batch)).flatten()
20
+
21
+
22
# Function to find and count duplicates
def find_duplicates(image_dir, threshold=0.9):
    """Find near-duplicate images in a directory.

    Extracts a ResNet50 feature vector for every file in *image_dir* and
    compares all pairs by cosine similarity. Any file whose similarity to
    an earlier file exceeds *threshold* is flagged as a duplicate.

    Args:
        image_dir: Path to a directory containing image files.
        threshold: Cosine-similarity cutoff above which two images are
            considered duplicates (default 0.9).

    Returns:
        Tuple ``(count, duplicates)`` — the number of duplicate files and
        the set of their file names.
    """
    image_features = {}
    for img_file in os.listdir(image_dir):
        img_path = os.path.join(image_dir, img_file)
        # Skip subdirectories and other non-file entries; feeding them to
        # the image loader would raise and abort the whole scan.
        if not os.path.isfile(img_path):
            continue
        image_features[img_file] = extract_features(img_path, model)

    file_list = list(image_features.keys())
    num_images = len(file_list)
    # Fewer than two images -> no pairs to compare.
    if num_images < 2:
        return 0, set()

    # One vectorized call yields the full symmetric pairwise matrix,
    # replacing O(n^2) single-pair cosine_similarity calls.
    feature_matrix = np.stack(list(image_features.values()))
    similarity_matrix = cosine_similarity(feature_matrix)

    duplicates = set()
    for i in range(num_images):
        for j in range(i + 1, num_images):
            if similarity_matrix[i][j] > threshold:
                # Keep the earlier file; mark the later one as duplicate.
                duplicates.add(file_list[j])

    return len(duplicates), duplicates
53
+
54
+
55
if __name__ == "__main__":
    import sys

    # Optional CLI arguments: image directory, then similarity threshold.
    cli_args = sys.argv[1:]
    image_dir = cli_args[0] if cli_args else './images'
    threshold = float(cli_args[1]) if len(cli_args) > 1 else 0.9

    count, duplicates = find_duplicates(image_dir, threshold)
    print(f"Duplicate Images Count: {count}")
    for duplicate in duplicates:
        print(duplicate)