Synced repo using 'sync_with_huggingface' Github Action
Browse files- iscc_sct/cli.py +15 -1
- iscc_sct/demo.py +243 -101
- iscc_sct/models.py +1 -1
- iscc_sct/utils.py +61 -0
- poetry.lock +4 -4
- pyproject.toml +3 -2
- space.yml +1 -1
- tests/test_demo.py +0 -79
- tests/test_utils.py +129 -0
iscc_sct/cli.py
CHANGED
@@ -9,7 +9,10 @@ from charset_normalizer import from_bytes
|
|
9 |
def main():
|
10 |
parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
|
11 |
parser.add_argument(
|
12 |
-
"path",
|
|
|
|
|
|
|
13 |
)
|
14 |
parser.add_argument(
|
15 |
"-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
|
@@ -27,6 +30,17 @@ def main():
|
|
27 |
if not args.debug:
|
28 |
logger.remove()
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
for path in glob.glob(args.path):
|
31 |
path = Path(path)
|
32 |
if path.is_file():
|
|
|
9 |
def main():
|
10 |
parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
|
11 |
parser.add_argument(
|
12 |
+
"path",
|
13 |
+
type=str,
|
14 |
+
help="Path to text files (supports glob patterns) or 'gui' to launch Gradio demo.",
|
15 |
+
nargs="?",
|
16 |
)
|
17 |
parser.add_argument(
|
18 |
"-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
|
|
|
30 |
if not args.debug:
|
31 |
logger.remove()
|
32 |
|
33 |
+
if args.path == "gui": # pragma: no cover
|
34 |
+
try:
|
35 |
+
from iscc_sct.demo import demo
|
36 |
+
|
37 |
+
demo.launch(inbrowser=True)
|
38 |
+
except ImportError:
|
39 |
+
print(
|
40 |
+
"Error: Gradio is not installed. Please install it with 'pip install gradio' to use the GUI."
|
41 |
+
)
|
42 |
+
return
|
43 |
+
|
44 |
for path in glob.glob(args.path):
|
45 |
path = Path(path)
|
46 |
if path.is_file():
|
iscc_sct/demo.py
CHANGED
@@ -7,6 +7,18 @@ import gradio as gr
|
|
7 |
import iscc_sct as sct
|
8 |
import textwrap
|
9 |
import yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
|
12 |
newline_symbols = {
|
@@ -56,9 +68,21 @@ def compute_iscc_code(text1, text2, bit_length):
|
|
56 |
return code1["iscc"], code2["iscc"], similarity
|
57 |
|
58 |
|
|
|
|
|
|
|
59 |
def compare_codes(code_a, code_b, bits):
|
60 |
-
if
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
|
64 |
def truncate_text(text, max_length=70):
|
@@ -89,9 +113,10 @@ def generate_similarity_bar(similarity):
|
|
89 |
"transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
|
90 |
)
|
91 |
|
|
|
|
|
92 |
bar_html = f"""
|
93 |
-
<
|
94 |
-
<div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
|
95 |
<div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
|
96 |
<span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
|
97 |
</div>
|
@@ -101,18 +126,17 @@ def generate_similarity_bar(similarity):
|
|
101 |
|
102 |
|
103 |
def load_samples():
|
104 |
-
with open("
|
105 |
return yaml.safe_load(file)["samples"]
|
106 |
|
107 |
|
108 |
samples = load_samples()
|
109 |
|
110 |
-
custom_css = """
|
111 |
-
"""
|
112 |
|
113 |
iscc_theme = gr.themes.Default(
|
114 |
-
font=[gr.themes.GoogleFont("Readex Pro")],
|
115 |
font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
|
|
|
116 |
radius_size=gr.themes.sizes.radius_none,
|
117 |
)
|
118 |
|
@@ -120,7 +144,7 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
120 |
with gr.Row(variant="panel"):
|
121 |
gr.Markdown(
|
122 |
"""
|
123 |
-
##
|
124 |
Demo of cross-lingual Semantic Text-Code (proof of concept)
|
125 |
""",
|
126 |
)
|
@@ -146,7 +170,7 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
146 |
lines=12,
|
147 |
max_lines=12,
|
148 |
)
|
149 |
-
out_code_a = gr.Textbox(label="ISCC
|
150 |
with gr.Column(variant="panel"):
|
151 |
in_text_b = gr.TextArea(
|
152 |
label="Text B",
|
@@ -154,35 +178,64 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
154 |
lines=12,
|
155 |
max_lines=12,
|
156 |
)
|
157 |
-
out_code_b = gr.Textbox(label="ISCC
|
158 |
|
159 |
with gr.Row(variant="panel"):
|
160 |
with gr.Column(variant="panel"):
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
with gr.Row(variant="panel"):
|
164 |
-
|
165 |
-
label="ISCC Bit-Length",
|
166 |
-
info="NUMBER OF BITS FOR OUTPUT ISCC",
|
167 |
-
minimum=64,
|
168 |
-
maximum=256,
|
169 |
-
step=32,
|
170 |
-
value=64,
|
171 |
-
)
|
172 |
|
173 |
-
with gr.
|
174 |
-
with gr.
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
def update_sample_text(choice, group):
|
188 |
if choice == "None":
|
@@ -200,86 +253,151 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
200 |
outputs=[in_text_b],
|
201 |
)
|
202 |
|
203 |
-
def
|
204 |
-
log.debug(f"{
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
}
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
iscc = sct.Metadata(**result).to_object_format()
|
218 |
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
overlaps = iscc.get_overlaps()
|
223 |
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
|
228 |
-
|
229 |
-
|
230 |
-
content = content[len(overlaps[i - 1]) :]
|
231 |
|
232 |
-
|
233 |
-
|
234 |
-
|
|
|
235 |
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
if overlap:
|
242 |
-
highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap"))
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
out_chunks_func: gr.HighlightedText(value=highlighted_chunks, elem_id="chunked-text"),
|
247 |
-
}
|
248 |
|
249 |
-
|
250 |
-
code_a = sct.gen_text_code_semantic(text_a, bits=nbits)["iscc"] if text_a else None
|
251 |
-
code_b = sct.gen_text_code_semantic(text_b, bits=nbits)["iscc"] if text_b else None
|
252 |
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
|
258 |
return (
|
259 |
-
|
260 |
-
|
|
|
|
|
261 |
similarity,
|
|
|
262 |
)
|
263 |
|
264 |
in_text_a.change(
|
265 |
-
|
266 |
-
inputs=[in_text_a, in_iscc_bits],
|
267 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
show_progress="full",
|
269 |
trigger_mode="always_last",
|
270 |
)
|
|
|
271 |
in_text_b.change(
|
272 |
-
|
273 |
-
inputs=[in_text_b, in_iscc_bits],
|
274 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
show_progress="full",
|
276 |
trigger_mode="always_last",
|
277 |
)
|
278 |
|
279 |
in_iscc_bits.change(
|
280 |
-
|
281 |
-
inputs=[in_text_a, in_text_b, in_iscc_bits],
|
282 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
show_progress="full",
|
284 |
)
|
285 |
|
@@ -292,12 +410,12 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
292 |
|
293 |
def reset_all():
|
294 |
return (
|
295 |
-
gr.Slider(value=
|
296 |
gr.Dropdown(
|
297 |
-
value="None", choices=["None"] + [
|
298 |
), # Reset sample dropdown A
|
299 |
gr.Dropdown(
|
300 |
-
value="None", choices=["None"] + [
|
301 |
), # Reset sample dropdown B
|
302 |
gr.TextArea(value=""), # Reset Text A
|
303 |
gr.TextArea(value=""), # Reset Text B
|
@@ -308,9 +426,6 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
308 |
gr.HighlightedText(value=[]), # Reset Chunked Text B
|
309 |
)
|
310 |
|
311 |
-
with gr.Row(variant="panel"):
|
312 |
-
reset_button = gr.Button("Reset All")
|
313 |
-
|
314 |
reset_button.click(
|
315 |
reset_all,
|
316 |
outputs=[
|
@@ -334,31 +449,58 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
334 |
## Understanding ISCC Semantic Text-Codes
|
335 |
|
336 |
### What is an ISCC Semantic Text-Code?
|
337 |
-
An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of
|
338 |
-
not just the exact words.
|
|
|
339 |
|
340 |
### How does it work?
|
341 |
1. **Input**: You provide a text in any language.
|
342 |
-
2. **Processing**:
|
343 |
-
3. **Output**: A unique
|
344 |
|
345 |
### What can it do?
|
346 |
- **Cross-language matching**: It can recognize similar content across different languages.
|
347 |
- **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
|
348 |
-
- **Content identification**: It can help identify texts with similar content, even if the wording
|
|
|
349 |
|
350 |
### How to use this demo:
|
351 |
1. **Enter text**: Type or paste text into either or both text boxes.
|
352 |
-
2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more
|
|
|
353 |
3. **View results**: See the generated ISCC code for each text.
|
354 |
-
4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
|
356 |
### Why is this useful?
|
357 |
- **Content creators**: Find similar content across languages.
|
358 |
- **Researchers**: Quickly compare documents or find related texts in different languages.
|
359 |
- **Publishers**: Identify potential translations or similar works efficiently.
|
360 |
|
361 |
-
This technology opens up new possibilities for understanding and managing text content across
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
362 |
"""
|
363 |
)
|
364 |
|
|
|
7 |
import iscc_sct as sct
|
8 |
import textwrap
|
9 |
import yaml
|
10 |
+
import pathlib
|
11 |
+
|
12 |
+
|
13 |
+
HERE = pathlib.Path(__file__).parent.absolute()
|
14 |
+
|
15 |
+
|
16 |
+
custom_css = """
|
17 |
+
.simbar {
|
18 |
+
background: white;
|
19 |
+
min-height: 30px;
|
20 |
+
}
|
21 |
+
"""
|
22 |
|
23 |
|
24 |
newline_symbols = {
|
|
|
68 |
return code1["iscc"], code2["iscc"], similarity
|
69 |
|
70 |
|
71 |
+
import binascii
|
72 |
+
|
73 |
+
|
74 |
def compare_codes(code_a, code_b, bits):
    """
    Render the similarity bar for two ISCC codes.

    Each code may be a plain string or an object that exposes the code through a ``value``
    attribute.

    :param code_a: First ISCC code (string or object with ``value``).
    :param code_b: Second ISCC code (string or object with ``value``).
    :param bits: Bit-length used to convert the Hamming distance into a similarity.
    :return: HTML produced by ``generate_similarity_bar``, or None when either code is
        missing/empty or decoding raises ``binascii.Error``.
    """
    if not (code_a and code_b):
        return None

    # Unwrap objects that carry the code in ``.value``; stringify anything else.
    text_a = code_a.value if hasattr(code_a, "value") else str(code_a)
    text_b = code_b.value if hasattr(code_b, "value") else str(code_b)
    if not text_a or not text_b:
        return None

    try:
        hamming = sct.iscc_distance(text_a, text_b)
        return generate_similarity_bar(hamming_to_cosine(hamming, bits))
    except binascii.Error:
        # Invalid ISCC code format
        return None
|
86 |
|
87 |
|
88 |
def truncate_text(text, max_length=70):
|
|
|
113 |
"transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
|
114 |
)
|
115 |
|
116 |
+
tooltip = "Similarity based on ISCC code comparison, not direct text comparison."
|
117 |
+
|
118 |
bar_html = f"""
|
119 |
+
<div title="{tooltip}" style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
|
|
|
120 |
<div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
|
121 |
<span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
|
122 |
</div>
|
|
|
126 |
|
127 |
|
128 |
def load_samples():
    """Read ``samples.yml`` next to this module and return its ``samples`` entry."""
    samples_path = HERE / "samples.yml"
    with samples_path.open("r", encoding="utf-8") as file:
        document = yaml.safe_load(file)
    return document["samples"]
|
131 |
|
132 |
|
133 |
samples = load_samples()
|
134 |
|
|
|
|
|
135 |
|
136 |
iscc_theme = gr.themes.Default(
|
137 |
+
font=[gr.themes.GoogleFont("Readex Pro Light")],
|
138 |
font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
|
139 |
+
text_size=gr.themes.sizes.text_lg,
|
140 |
radius_size=gr.themes.sizes.radius_none,
|
141 |
)
|
142 |
|
|
|
144 |
with gr.Row(variant="panel"):
|
145 |
gr.Markdown(
|
146 |
"""
|
147 |
+
## 🔮️ ISCC - Semantic-Code Text
|
148 |
Demo of cross-lingual Semantic Text-Code (proof of concept)
|
149 |
""",
|
150 |
)
|
|
|
170 |
lines=12,
|
171 |
max_lines=12,
|
172 |
)
|
173 |
+
out_code_a = gr.Textbox(label="ISCC-SCT for Text A")
|
174 |
with gr.Column(variant="panel"):
|
175 |
in_text_b = gr.TextArea(
|
176 |
label="Text B",
|
|
|
178 |
lines=12,
|
179 |
max_lines=12,
|
180 |
)
|
181 |
+
out_code_b = gr.Textbox(label="ISCC-SCT for Text B")
|
182 |
|
183 |
with gr.Row(variant="panel"):
|
184 |
with gr.Column(variant="panel"):
|
185 |
+
out_similarity_title = gr.Markdown("### ISCC-based Semantic Similarity")
|
186 |
+
with gr.Row(elem_classes="simbar"):
|
187 |
+
out_similarity = gr.HTML()
|
188 |
+
gr.Markdown(
|
189 |
+
"**NOTE:** Similarity is calculated based on the generated ISCC-SCT, not the original text."
|
190 |
+
)
|
191 |
|
192 |
with gr.Row(variant="panel"):
|
193 |
+
reset_button = gr.Button("Reset All")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
+
with gr.Accordion(label="🔍 Explore Details & Advanced Options", open=False):
|
196 |
+
with gr.Row(variant="panel"):
|
197 |
+
with gr.Column(variant="panel"):
|
198 |
+
in_iscc_bits = gr.Slider(
|
199 |
+
label="ISCC Bit-Length",
|
200 |
+
info="NUMBER OF BITS FOR OUTPUT ISCC",
|
201 |
+
minimum=64,
|
202 |
+
maximum=256,
|
203 |
+
step=32,
|
204 |
+
value=sct.sct_opts.bits,
|
205 |
+
)
|
206 |
+
with gr.Column(variant="panel"):
|
207 |
+
in_max_tokens = gr.Slider(
|
208 |
+
label="Max Tokens",
|
209 |
+
info="MAXIMUM NUMBER OF TOKENS PER CHUNK",
|
210 |
+
minimum=49,
|
211 |
+
maximum=sct.sct_opts.max_tokens,
|
212 |
+
step=1,
|
213 |
+
value=127,
|
214 |
+
)
|
215 |
+
|
216 |
+
with gr.Row(variant="panel"):
|
217 |
+
with gr.Column(variant="panel"):
|
218 |
+
out_chunks_a = gr.HighlightedText(
|
219 |
+
label="Chunked Text A",
|
220 |
+
interactive=False,
|
221 |
+
elem_id="chunked-text-a",
|
222 |
+
)
|
223 |
+
with gr.Column(variant="panel"):
|
224 |
+
out_chunks_b = gr.HighlightedText(
|
225 |
+
label="Chunked Text B",
|
226 |
+
interactive=False,
|
227 |
+
elem_id="chunked-text-b",
|
228 |
+
)
|
229 |
+
|
230 |
+
with gr.Row(variant="panel"):
|
231 |
+
with gr.Column(variant="panel"):
|
232 |
+
gr.Markdown("### Granular Matches")
|
233 |
+
in_granular_matches = gr.Dataframe(
|
234 |
+
headers=["Chunk A", "Similarity", "Chunk B"],
|
235 |
+
column_widths=["45%", "10%", "45%"],
|
236 |
+
wrap=True,
|
237 |
+
elem_classes="granular-matches",
|
238 |
+
)
|
239 |
|
240 |
def update_sample_text(choice, group):
|
241 |
if choice == "None":
|
|
|
253 |
outputs=[in_text_b],
|
254 |
)
|
255 |
|
256 |
+
def process_and_calculate(text_a, text_b, nbits, max_tokens):
    """
    Compute codes, chunk views, overall similarity and granular matches for both texts.

    :param text_a: Content of the "Text A" input (may be empty).
    :param text_b: Content of the "Text B" input (may be empty).
    :param nbits: Bit-length passed to the code generator and to the similarity conversion.
    :param max_tokens: Maximum number of tokens per chunk passed to the code generator.
    :return: Tuple of updates in the order out_code_a, out_chunks_a, out_code_b,
        out_chunks_b, out_similarity, in_granular_matches.
    """
    log.debug(f"Processing text_a: {text_a[:20]}, text_b: {text_b[:20]}")

    # Explicit suffix -> (code box, chunk view) mapping. A globals() name lookup would
    # silently yield None keys if the components were not module-level globals.
    outputs_by_suffix = {
        "a": (out_code_a, out_chunks_a),
        "b": (out_code_b, out_chunks_b),
    }

    def process_single_text(text, suffix):
        """Build the component updates (and metadata, if any) for one input text."""
        out_code_func, out_chunks_func = outputs_by_suffix[suffix]

        if not text:
            # Empty input clears both outputs; no "metadata" entry is produced.
            return {
                out_code_func: gr.Textbox(value=None),
                out_chunks_func: gr.HighlightedText(
                    value=None, elem_id=f"chunked-text-{suffix}"
                ),
            }

        result = sct.gen_text_code_semantic(
            text,
            bits=nbits,
            simprints=True,
            offsets=True,
            sizes=True,
            contents=True,
            max_tokens=max_tokens,
        )
        iscc = sct.Metadata(**result).to_object_format()

        # Generate chunked text with simprints and overlaps
        features = iscc.features[0]
        highlighted_chunks = []
        overlaps = iscc.get_overlaps()

        for i, feature in enumerate(features.simprints):
            feature: sct.Feature
            content = feature.content

            # Remove leading overlap
            if i > 0 and overlaps[i - 1]:
                content = content[len(overlaps[i - 1]) :]

            # Remove trailing overlap
            if i < len(overlaps) and overlaps[i]:
                content = content[: -len(overlaps[i])]

            label = f"{feature.size}:{feature.simprint}"
            highlighted_chunks.append((no_nl_inner(content), label))

            # Show the text shared with the next chunk as its own "overlap" segment.
            if i < len(overlaps):
                overlap = overlaps[i]
                if overlap:
                    highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap"))

        return {
            out_code_func: gr.Textbox(value=iscc.iscc),
            out_chunks_func: gr.HighlightedText(
                value=highlighted_chunks, elem_id=f"chunked-text-{suffix}"
            ),
            "metadata": iscc,
        }

    result_a = process_single_text(text_a, "a")
    result_b = process_single_text(text_b, "b")

    code_a = result_a[out_code_a] if text_a else None
    code_b = result_b[out_code_b] if text_b else None

    # Fall back to the similarity component itself when no comparison is possible.
    similarity = compare_codes(code_a, code_b, nbits) or out_similarity

    granular_matches = []
    if text_a and text_b:
        # "metadata" only exists for non-empty texts, hence the guard above.
        matches = sct.granular_similarity(
            result_a["metadata"], result_b["metadata"], threshold=80
        )
        for match in matches:
            granular_matches.append(
                [
                    match[0].content,
                    f"{match[1]}%",
                    match[2].content,
                ]
            )

    return (
        result_a[out_code_a],
        result_a[out_chunks_a],
        result_b[out_code_b],
        result_b[out_chunks_b],
        similarity,
        gr.Dataframe(value=granular_matches),
    )
|
345 |
|
346 |
in_text_a.change(
|
347 |
+
process_and_calculate,
|
348 |
+
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
|
349 |
+
outputs=[
|
350 |
+
out_code_a,
|
351 |
+
out_chunks_a,
|
352 |
+
out_code_b,
|
353 |
+
out_chunks_b,
|
354 |
+
out_similarity,
|
355 |
+
in_granular_matches,
|
356 |
+
],
|
357 |
show_progress="full",
|
358 |
trigger_mode="always_last",
|
359 |
)
|
360 |
+
|
361 |
in_text_b.change(
|
362 |
+
process_and_calculate,
|
363 |
+
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
|
364 |
+
outputs=[
|
365 |
+
out_code_a,
|
366 |
+
out_chunks_a,
|
367 |
+
out_code_b,
|
368 |
+
out_chunks_b,
|
369 |
+
out_similarity,
|
370 |
+
in_granular_matches,
|
371 |
+
],
|
372 |
show_progress="full",
|
373 |
trigger_mode="always_last",
|
374 |
)
|
375 |
|
376 |
in_iscc_bits.change(
|
377 |
+
process_and_calculate,
|
378 |
+
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
|
379 |
+
outputs=[
|
380 |
+
out_code_a,
|
381 |
+
out_chunks_a,
|
382 |
+
out_code_b,
|
383 |
+
out_chunks_b,
|
384 |
+
out_similarity,
|
385 |
+
in_granular_matches,
|
386 |
+
],
|
387 |
+
show_progress="full",
|
388 |
+
)
|
389 |
+
|
390 |
+
in_max_tokens.change(
|
391 |
+
process_and_calculate,
|
392 |
+
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
|
393 |
+
outputs=[
|
394 |
+
out_code_a,
|
395 |
+
out_chunks_a,
|
396 |
+
out_code_b,
|
397 |
+
out_chunks_b,
|
398 |
+
out_similarity,
|
399 |
+
in_granular_matches,
|
400 |
+
],
|
401 |
show_progress="full",
|
402 |
)
|
403 |
|
|
|
410 |
|
411 |
def reset_all():
|
412 |
return (
|
413 |
+
gr.Slider(value=64), # Reset ISCC Bit-Length
|
414 |
gr.Dropdown(
|
415 |
+
value="None", choices=["None"] + [lang for lang in samples["a"]]
|
416 |
), # Reset sample dropdown A
|
417 |
gr.Dropdown(
|
418 |
+
value="None", choices=["None"] + [lang for lang in samples["b"]]
|
419 |
), # Reset sample dropdown B
|
420 |
gr.TextArea(value=""), # Reset Text A
|
421 |
gr.TextArea(value=""), # Reset Text B
|
|
|
426 |
gr.HighlightedText(value=[]), # Reset Chunked Text B
|
427 |
)
|
428 |
|
|
|
|
|
|
|
429 |
reset_button.click(
|
430 |
reset_all,
|
431 |
outputs=[
|
|
|
449 |
## Understanding ISCC Semantic Text-Codes
|
450 |
|
451 |
### What is an ISCC Semantic Text-Code?
|
452 |
+
An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of
|
453 |
+
the text, not just the exact words. Technically it is an ISCC-encoded, binarized multi-lingual
|
454 |
+
document-embedding.
|
455 |
|
456 |
### How does it work?
|
457 |
1. **Input**: You provide a text in any language.
|
458 |
+
2. **Processing**: Vector embeddings are created for individual chunks of the text.
|
459 |
+
3. **Output**: A unique ISCC-UNIT is generated that represents the entire text's content.
|
460 |
|
461 |
### What can it do?
|
462 |
- **Cross-language matching**: It can recognize similar content across different languages.
|
463 |
- **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
|
464 |
+
- **Content identification**: It can help identify texts with similar content, even if the wording
|
465 |
+
is different.
|
466 |
|
467 |
### How to use this demo:
|
468 |
1. **Enter text**: Type or paste text into either or both text boxes.
|
469 |
+
2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more
|
470 |
+
detailed).
|
471 |
3. **View results**: See the generated ISCC code for each text.
|
472 |
+
4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning, based on
|
473 |
+
their ISCC codes.
|
474 |
+
|
475 |
+
### Important Note:
|
476 |
+
The similarity shown is calculated by comparing the ISCC codes, not the original texts. This
|
477 |
+
allows for efficient and privacy-preserving comparisons, as only the codes need to be shared
|
478 |
+
or stored.
|
479 |
|
480 |
### Why is this useful?
|
481 |
- **Content creators**: Find similar content across languages.
|
482 |
- **Researchers**: Quickly compare documents or find related texts in different languages.
|
483 |
- **Publishers**: Identify potential translations or similar works efficiently.
|
484 |
|
485 |
+
This technology opens up new possibilities for understanding and managing text content across
|
486 |
+
language barriers!
|
487 |
+
|
488 |
+
### Explore Details & Advanced Options
|
489 |
+
|
490 |
+
The "Explore Details & Advanced Options" section provides additional tools and information:
|
491 |
+
|
492 |
+
1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
|
493 |
+
comparisons but may be more sensitive to minor differences.
|
494 |
+
|
495 |
+
2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
|
496 |
+
for processing.
|
497 |
+
|
498 |
+
3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
|
499 |
+
color-coded and labeled with its size and simprint (a similarity preserving fingerprint).
|
500 |
+
|
501 |
+
4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
|
502 |
+
This table shows which specific parts of the texts are most similar, along with their approximate
|
503 |
+
cosine similarity (scaled -100% to +100%).
|
504 |
"""
|
505 |
)
|
506 |
|
iscc_sct/models.py
CHANGED
@@ -70,7 +70,7 @@ The `FeatureSet` model unifies these two formats by allowing either structure to
|
|
70 |
To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
|
71 |
"""
|
72 |
|
73 |
-
from typing import List, Optional,
|
74 |
from pydantic import BaseModel
|
75 |
|
76 |
|
|
|
70 |
To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
|
71 |
"""
|
72 |
|
73 |
+
from typing import List, Optional, Union
|
74 |
from pydantic import BaseModel
|
75 |
|
76 |
|
iscc_sct/utils.py
CHANGED
@@ -8,6 +8,8 @@ from pathlib import Path
|
|
8 |
from urllib.request import urlretrieve
|
9 |
from blake3 import blake3
|
10 |
from platformdirs import PlatformDirs
|
|
|
|
|
11 |
|
12 |
|
13 |
APP_NAME = "iscc-sct"
|
@@ -21,8 +23,12 @@ __all__ = [
|
|
21 |
"get_model",
|
22 |
"encode_base32",
|
23 |
"encode_base64",
|
|
|
|
|
24 |
"hamming_distance",
|
25 |
"iscc_distance",
|
|
|
|
|
26 |
"MODEL_PATH",
|
27 |
]
|
28 |
|
@@ -176,3 +182,58 @@ def iscc_distance(iscc1, iscc2):
|
|
176 |
|
177 |
# Calculate and return the Hamming distance
|
178 |
return hamming_distance(content1, content2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
from urllib.request import urlretrieve
|
9 |
from blake3 import blake3
|
10 |
from platformdirs import PlatformDirs
|
11 |
+
from typing import List, Tuple
|
12 |
+
from iscc_sct.models import Metadata, Feature
|
13 |
|
14 |
|
15 |
APP_NAME = "iscc-sct"
|
|
|
23 |
"get_model",
|
24 |
"encode_base32",
|
25 |
"encode_base64",
|
26 |
+
"decode_base32",
|
27 |
+
"decode_base64",
|
28 |
"hamming_distance",
|
29 |
"iscc_distance",
|
30 |
+
"cosine_similarity",
|
31 |
+
"granular_similarity",
|
32 |
"MODEL_PATH",
|
33 |
]
|
34 |
|
|
|
182 |
|
183 |
# Calculate and return the Hamming distance
|
184 |
return hamming_distance(content1, content2)
|
185 |
+
|
186 |
+
|
187 |
+
def cosine_similarity(a, b):
    # type: (bytes, bytes) -> int
    """
    Calculate the approximate cosine similarity based on Hamming distance for two bytes inputs.

    A Hamming distance of d over n bits maps linearly to 1 - 2 * d / n. The result is scaled
    to percent and truncated toward zero.

    :param a: The first bytes object.
    :param b: The second bytes object.
    :return: The approximate cosine similarity between the two inputs, scaled from -100 to +100.
    :raise ValueError: If a and b are not the same length, or if they are empty.
    """
    if len(a) != len(b):
        raise ValueError("The lengths of the two bytes objects must be the same")
    if len(a) == 0:
        # Zero bits would otherwise surface as an opaque ZeroDivisionError below.
        raise ValueError("The two bytes objects must not be empty")

    distance = hamming_distance(a, b)
    total_bits = len(a) * 8

    # Exact integer form of 100 * (1 - 2 * distance / total_bits); avoids float rounding
    # artefacts for bit lengths that are not a power of two.
    scaled = 100 * (total_bits - 2 * distance)
    # Truncate toward zero (floor division alone would round negatives away from zero).
    percent = scaled // total_bits if scaled >= 0 else -(-scaled // total_bits)
    return max(min(int(percent), 100), -100)
|
204 |
+
|
205 |
+
|
206 |
+
def granular_similarity(metadata_a, metadata_b, threshold=80):
    # type: (Metadata, Metadata, int) -> List[Tuple[Feature, int, Feature]]
    """
    Compare simprints from two Metadata objects and return matching pairs that reach a
    similarity threshold. Only the most similar pair for each simprint_a is included; on a
    tie the first simprint_b encountered wins.

    :param metadata_a: The first Metadata object.
    :param metadata_b: The second Metadata object.
    :param threshold: The similarity threshold at or above which simprints are considered a
        match.
    :return: A list of (simprint_a, similarity, simprint_b) tuples.
    """
    metadata_a = metadata_a.to_object_format()
    metadata_b = metadata_b.to_object_format()

    # Decode every simprint of B once up front instead of once per simprint of A.
    candidates_b = []
    for feature_set_b in metadata_b.features:
        for simprint_b in feature_set_b.simprints:
            candidates_b.append((simprint_b, decode_base64(simprint_b.simprint)))

    matches = []

    for feature_set_a in metadata_a.features:
        for simprint_a in feature_set_a.simprints:
            digest_a = decode_base64(simprint_a.simprint)
            best_match = None
            # Starting one below the threshold lets a similarity equal to it qualify.
            best_similarity = threshold - 1

            for simprint_b, digest_b in candidates_b:
                similarity = cosine_similarity(digest_a, digest_b)
                if similarity > best_similarity:
                    best_similarity = similarity
                    best_match = (simprint_a, similarity, simprint_b)

            if best_match:
                matches.append(best_match)

    return matches
|
poetry.lock
CHANGED
@@ -696,13 +696,13 @@ socks = ["socksio (==1.*)"]
|
|
696 |
|
697 |
[[package]]
|
698 |
name = "huggingface-hub"
|
699 |
-
version = "0.24.
|
700 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
701 |
optional = false
|
702 |
python-versions = ">=3.8.0"
|
703 |
files = [
|
704 |
-
{file = "huggingface_hub-0.24.
|
705 |
-
{file = "huggingface_hub-0.24.
|
706 |
]
|
707 |
|
708 |
[package.dependencies]
|
@@ -2796,4 +2796,4 @@ gpu = ["onnxruntime-gpu"]
|
|
2796 |
[metadata]
|
2797 |
lock-version = "2.0"
|
2798 |
python-versions = ">=3.9,<3.13"
|
2799 |
-
content-hash = "
|
|
|
696 |
|
697 |
[[package]]
|
698 |
name = "huggingface-hub"
|
699 |
+
version = "0.24.6"
|
700 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
701 |
optional = false
|
702 |
python-versions = ">=3.8.0"
|
703 |
files = [
|
704 |
+
{file = "huggingface_hub-0.24.6-py3-none-any.whl", hash = "sha256:a990f3232aa985fe749bc9474060cbad75e8b2f115f6665a9fda5b9c97818970"},
|
705 |
+
{file = "huggingface_hub-0.24.6.tar.gz", hash = "sha256:cc2579e761d070713eaa9c323e3debe39d5b464ae3a7261c39a9195b27bb8000"},
|
706 |
]
|
707 |
|
708 |
[package.dependencies]
|
|
|
2796 |
[metadata]
|
2797 |
lock-version = "2.0"
|
2798 |
python-versions = ">=3.9,<3.13"
|
2799 |
+
content-hash = "bf76c08f3c1285eb61f541a9cd654079dc15cc4bd77dd8994a815864e7e8c4a0"
|
pyproject.toml
CHANGED
@@ -57,10 +57,11 @@ tokenizers = "*"
|
|
57 |
pydantic-settings = "*"
|
58 |
charset-normalizer = "*"
|
59 |
numpy = "<2.0.0"
|
60 |
-
pybase64 = "
|
61 |
certifi = ">=2024.07.04"
|
62 |
gradio = { version = "*", optional = true }
|
63 |
-
pyyaml = "
|
|
|
64 |
|
65 |
|
66 |
[tool.poetry.extras]
|
|
|
57 |
pydantic-settings = "*"
|
58 |
charset-normalizer = "*"
|
59 |
numpy = "<2.0.0"
|
60 |
+
pybase64 = "*"
|
61 |
certifi = ">=2024.07.04"
|
62 |
gradio = { version = "*", optional = true }
|
63 |
+
pyyaml = "*"
|
64 |
+
pydantic = "*"
|
65 |
|
66 |
|
67 |
[tool.poetry.extras]
|
space.yml
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
title: ISCC-LAB - Semantic-Code Text
|
2 |
-
emoji:
|
3 |
colorFrom: red
|
4 |
colorTo: blue
|
5 |
sdk: gradio
|
|
|
1 |
title: ISCC-LAB - Semantic-Code Text
|
2 |
+
emoji: 🔮
|
3 |
colorFrom: red
|
4 |
colorTo: blue
|
5 |
sdk: gradio
|
tests/test_demo.py
CHANGED
@@ -3,7 +3,6 @@ from iscc_sct.demo import (
|
|
3 |
compare_codes,
|
4 |
hamming_to_cosine,
|
5 |
generate_similarity_bar,
|
6 |
-
recalculate_iscc,
|
7 |
)
|
8 |
|
9 |
|
@@ -45,81 +44,3 @@ def test_generate_similarity_bar():
|
|
45 |
result = generate_similarity_bar(-0.5)
|
46 |
assert "-50.00%" in result
|
47 |
assert "red" in result
|
48 |
-
|
49 |
-
|
50 |
-
from unittest.mock import patch, MagicMock
|
51 |
-
import gradio as gr
|
52 |
-
from iscc_sct.demo import process_text
|
53 |
-
|
54 |
-
|
55 |
-
def test_process_text():
|
56 |
-
# Test with valid input
|
57 |
-
result = process_text("Hello, world!", 64, "a")
|
58 |
-
assert isinstance(result, dict)
|
59 |
-
assert len(result) == 2
|
60 |
-
key, value = next(iter(result.items()))
|
61 |
-
assert isinstance(key, gr.components.Textbox)
|
62 |
-
assert isinstance(value, gr.components.Textbox)
|
63 |
-
assert value.value == "ISCC:CAA7GY4JTDI3XZYV"
|
64 |
-
|
65 |
-
# Test with empty input
|
66 |
-
result = process_text("", 64, "b")
|
67 |
-
assert isinstance(result, dict)
|
68 |
-
assert len(result) == 2
|
69 |
-
for key, value in result.items():
|
70 |
-
assert isinstance(key, (gr.components.Textbox, gr.components.HighlightedText))
|
71 |
-
assert value.value is None
|
72 |
-
|
73 |
-
# Test with different suffix
|
74 |
-
result = process_text("Test", 64, "b")
|
75 |
-
assert len(result) == 2
|
76 |
-
key, value = next(iter(result.items()))
|
77 |
-
assert isinstance(key, gr.components.Textbox)
|
78 |
-
assert isinstance(value, gr.components.Textbox)
|
79 |
-
|
80 |
-
|
81 |
-
@patch("iscc_sct.demo.sct.gen_text_code_semantic")
|
82 |
-
@patch("iscc_sct.demo.compare_codes")
|
83 |
-
def test_recalculate_iscc(mock_compare_codes, mock_gen_text_code):
|
84 |
-
mock_gen_text_code.side_effect = lambda text, bits: {"iscc": f"ISCC:{text[:4].upper()}{bits}"}
|
85 |
-
mock_compare_codes.return_value = "<similarity_html>"
|
86 |
-
|
87 |
-
# Test with both texts non-empty
|
88 |
-
result = recalculate_iscc("Hello", "World", 64)
|
89 |
-
assert len(result) == 3
|
90 |
-
assert isinstance(result[0], gr.components.Textbox)
|
91 |
-
assert result[0].value == "ISCC:HELL64"
|
92 |
-
assert isinstance(result[1], gr.components.Textbox)
|
93 |
-
assert result[1].value == "ISCC:WORL64"
|
94 |
-
assert result[2] == "<similarity_html>"
|
95 |
-
|
96 |
-
# Test with first text empty
|
97 |
-
result = recalculate_iscc("", "World", 128)
|
98 |
-
assert len(result) == 3
|
99 |
-
assert isinstance(result[0], gr.components.Textbox)
|
100 |
-
assert result[0].value is None
|
101 |
-
assert isinstance(result[1], gr.components.Textbox)
|
102 |
-
assert result[1].value == "ISCC:WORL128"
|
103 |
-
assert result[2] is None
|
104 |
-
|
105 |
-
# Test with second text empty
|
106 |
-
result = recalculate_iscc("Hello", "", 256)
|
107 |
-
assert len(result) == 3
|
108 |
-
assert isinstance(result[0], gr.components.Textbox)
|
109 |
-
assert result[0].value == "ISCC:HELL256"
|
110 |
-
assert isinstance(result[1], gr.components.Textbox)
|
111 |
-
assert result[1].value is None
|
112 |
-
assert result[2] is None
|
113 |
-
|
114 |
-
# Test with both texts empty
|
115 |
-
result = recalculate_iscc("", "", 64)
|
116 |
-
assert len(result) == 3
|
117 |
-
assert isinstance(result[0], gr.components.Textbox)
|
118 |
-
assert result[0].value is None
|
119 |
-
assert isinstance(result[1], gr.components.Textbox)
|
120 |
-
assert result[1].value is None
|
121 |
-
assert result[2] is None
|
122 |
-
|
123 |
-
# Verify function calls
|
124 |
-
assert mock_gen_text_code.call_count == 4
|
125 |
-
assert mock_compare_codes.call_count == 1
|
|
|
3 |
compare_codes,
|
4 |
hamming_to_cosine,
|
5 |
generate_similarity_bar,
|
|
|
6 |
)
|
7 |
|
8 |
|
|
|
44 |
result = generate_similarity_bar(-0.5)
|
45 |
assert "-50.00%" in result
|
46 |
assert "red" in result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_utils.py
CHANGED
@@ -90,3 +90,132 @@ def test_iscc_distance_different_lengths():
|
|
90 |
iscc2 = sct.create("Hello", bits=96).iscc
|
91 |
with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
|
92 |
utils.iscc_distance(iscc1, iscc2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
iscc2 = sct.create("Hello", bits=96).iscc
|
91 |
with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
|
92 |
utils.iscc_distance(iscc1, iscc2)
|
93 |
+
|
94 |
+
|
95 |
+
def test_cosine_similarity_identical():
|
96 |
+
a = b"\x00\x00\x00\x00"
|
97 |
+
b = b"\x00\x00\x00\x00"
|
98 |
+
assert utils.cosine_similarity(a, b) == 100
|
99 |
+
|
100 |
+
|
101 |
+
def test_cosine_similarity_opposite():
|
102 |
+
a = b"\x00\x00\x00\x00"
|
103 |
+
b = b"\xff\xff\xff\xff"
|
104 |
+
assert utils.cosine_similarity(a, b) == -100
|
105 |
+
|
106 |
+
|
107 |
+
def test_cosine_similarity_half_similar():
|
108 |
+
a = b"\x00\x00\xff\xff"
|
109 |
+
b = b"\x00\x00\x00\x00"
|
110 |
+
assert utils.cosine_similarity(a, b) == 0
|
111 |
+
|
112 |
+
|
113 |
+
def test_cosine_similarity_quarter_similar():
|
114 |
+
a = b"\x00\xff\xff\xff"
|
115 |
+
b = b"\x00\x00\x00\x00"
|
116 |
+
assert utils.cosine_similarity(a, b) == -50
|
117 |
+
|
118 |
+
|
119 |
+
def test_cosine_similarity_three_quarter_similar():
|
120 |
+
a = b"\x00\x00\x00\xff"
|
121 |
+
b = b"\x00\x00\x00\x00"
|
122 |
+
assert utils.cosine_similarity(a, b) == 50
|
123 |
+
|
124 |
+
|
125 |
+
def test_cosine_similarity_different_lengths():
|
126 |
+
a = b"\x00\x00\x00"
|
127 |
+
b = b"\x00\x00\x00\x00"
|
128 |
+
with pytest.raises(ValueError, match="The lengths of the two bytes objects must be the same"):
|
129 |
+
utils.cosine_similarity(a, b)
|
130 |
+
|
131 |
+
|
132 |
+
def test_granular_similarity():
|
133 |
+
from iscc_sct.models import Metadata, FeatureSet, Feature
|
134 |
+
|
135 |
+
# Create two Metadata objects with some matching and non-matching simprints
|
136 |
+
metadata_a = Metadata(
|
137 |
+
iscc="ISCC:KACYPXW563EDNM",
|
138 |
+
features=[
|
139 |
+
FeatureSet(
|
140 |
+
simprints=[
|
141 |
+
Feature(simprint="AAECAwQFBgc"), # Will match
|
142 |
+
Feature(simprint="CAkKCwwNDg8"), # Will not match
|
143 |
+
]
|
144 |
+
)
|
145 |
+
],
|
146 |
+
)
|
147 |
+
|
148 |
+
metadata_b = Metadata(
|
149 |
+
iscc="ISCC:KACYPXW563EDNM",
|
150 |
+
features=[
|
151 |
+
FeatureSet(
|
152 |
+
simprints=[
|
153 |
+
Feature(simprint="AAECAwQFBgc"), # Will match
|
154 |
+
Feature(simprint="EBESExQVFhc"), # Will not match
|
155 |
+
]
|
156 |
+
)
|
157 |
+
],
|
158 |
+
)
|
159 |
+
|
160 |
+
# Test with default threshold
|
161 |
+
matches = utils.granular_similarity(metadata_a, metadata_b)
|
162 |
+
assert len(matches) == 1
|
163 |
+
assert matches[0][0].simprint == "AAECAwQFBgc"
|
164 |
+
assert matches[0][1] == 100
|
165 |
+
assert matches[0][2].simprint == "AAECAwQFBgc"
|
166 |
+
|
167 |
+
# Test with lower threshold
|
168 |
+
matches = utils.granular_similarity(metadata_a, metadata_b, threshold=0)
|
169 |
+
assert len(matches) == 2 # All combinations should match
|
170 |
+
|
171 |
+
# Test with higher threshold
|
172 |
+
matches = utils.granular_similarity(metadata_a, metadata_b, threshold=101)
|
173 |
+
assert len(matches) == 0 # No matches should be found
|
174 |
+
|
175 |
+
|
176 |
+
def test_granular_similarity_no_matches():
|
177 |
+
from iscc_sct.models import Metadata, FeatureSet, Feature
|
178 |
+
|
179 |
+
metadata_a = Metadata(
|
180 |
+
iscc="ISCC:KACYPXW563EDNM",
|
181 |
+
features=[FeatureSet(simprints=[Feature(simprint="AAECAwQFBgc")])],
|
182 |
+
)
|
183 |
+
|
184 |
+
metadata_b = Metadata(
|
185 |
+
iscc="ISCC:KACYPXW563EDNM",
|
186 |
+
features=[FeatureSet(simprints=[Feature(simprint="CAkKCwwNDg8")])],
|
187 |
+
)
|
188 |
+
|
189 |
+
matches = utils.granular_similarity(metadata_a, metadata_b)
|
190 |
+
assert len(matches) == 0
|
191 |
+
|
192 |
+
|
193 |
+
def test_granular_similarity_multiple_matches():
|
194 |
+
from iscc_sct.models import Metadata, FeatureSet, Feature
|
195 |
+
|
196 |
+
metadata_a = Metadata(
|
197 |
+
iscc="ISCC:KACYPXW563EDNM",
|
198 |
+
features=[
|
199 |
+
FeatureSet(
|
200 |
+
simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="CAkKCwwNDg8")]
|
201 |
+
),
|
202 |
+
FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
|
203 |
+
],
|
204 |
+
)
|
205 |
+
|
206 |
+
metadata_b = Metadata(
|
207 |
+
iscc="ISCC:KACYPXW563EDNM",
|
208 |
+
features=[
|
209 |
+
FeatureSet(
|
210 |
+
simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="GBkaGxwdHh8")]
|
211 |
+
),
|
212 |
+
FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
|
213 |
+
],
|
214 |
+
)
|
215 |
+
|
216 |
+
matches = utils.granular_similarity(metadata_a, metadata_b)
|
217 |
+
assert len(matches) == 2
|
218 |
+
assert {(match[0].simprint, match[2].simprint) for match in matches} == {
|
219 |
+
("AAECAwQFBgc", "AAECAwQFBgc"),
|
220 |
+
("EBESExQVFhc", "EBESExQVFhc"),
|
221 |
+
}
|