titusz commited on
Commit
8c51bed
1 Parent(s): 73ab668

Synced repo using 'sync_with_huggingface' Github Action

Browse files
iscc_sct/cli.py CHANGED
@@ -9,7 +9,10 @@ from charset_normalizer import from_bytes
9
  def main():
10
  parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
11
  parser.add_argument(
12
- "path", type=str, help="Path to text files (supports glob patterns).", nargs="?"
 
 
 
13
  )
14
  parser.add_argument(
15
  "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
@@ -27,6 +30,17 @@ def main():
27
  if not args.debug:
28
  logger.remove()
29
 
 
 
 
 
 
 
 
 
 
 
 
30
  for path in glob.glob(args.path):
31
  path = Path(path)
32
  if path.is_file():
 
9
  def main():
10
  parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
11
  parser.add_argument(
12
+ "path",
13
+ type=str,
14
+ help="Path to text files (supports glob patterns) or 'gui' to launch Gradio demo.",
15
+ nargs="?",
16
  )
17
  parser.add_argument(
18
  "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
 
30
  if not args.debug:
31
  logger.remove()
32
 
33
+ if args.path == "gui": # pragma: no cover
34
+ try:
35
+ from iscc_sct.demo import demo
36
+
37
+ demo.launch(inbrowser=True)
38
+ except ImportError:
39
+ print(
40
+ "Error: Gradio is not installed. Please install it with 'pip install gradio' to use the GUI."
41
+ )
42
+ return
43
+
44
  for path in glob.glob(args.path):
45
  path = Path(path)
46
  if path.is_file():
iscc_sct/demo.py CHANGED
@@ -7,6 +7,18 @@ import gradio as gr
7
  import iscc_sct as sct
8
  import textwrap
9
  import yaml
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  newline_symbols = {
@@ -56,9 +68,21 @@ def compute_iscc_code(text1, text2, bit_length):
56
  return code1["iscc"], code2["iscc"], similarity
57
 
58
 
 
 
 
59
  def compare_codes(code_a, code_b, bits):
60
- if all([code_a, code_b]):
61
- return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  def truncate_text(text, max_length=70):
@@ -89,9 +113,10 @@ def generate_similarity_bar(similarity):
89
  "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
90
  )
91
 
 
 
92
  bar_html = f"""
93
- <h3>Semantic Similarity</h3>
94
- <div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
95
  <div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
96
  <span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
97
  </div>
@@ -101,18 +126,17 @@ def generate_similarity_bar(similarity):
101
 
102
 
103
  def load_samples():
104
- with open("iscc_sct/samples.yml", "r", encoding="utf-8") as file:
105
  return yaml.safe_load(file)["samples"]
106
 
107
 
108
  samples = load_samples()
109
 
110
- custom_css = """
111
- """
112
 
113
  iscc_theme = gr.themes.Default(
114
- font=[gr.themes.GoogleFont("Readex Pro")],
115
  font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
 
116
  radius_size=gr.themes.sizes.radius_none,
117
  )
118
 
@@ -120,7 +144,7 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
120
  with gr.Row(variant="panel"):
121
  gr.Markdown(
122
  """
123
- ## ✂️ ISCC Semantic Text-Code
124
  Demo of cross-lingual Semantic Text-Code (proof of concept)
125
  """,
126
  )
@@ -146,7 +170,7 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
146
  lines=12,
147
  max_lines=12,
148
  )
149
- out_code_a = gr.Textbox(label="ISCC Code for Text A")
150
  with gr.Column(variant="panel"):
151
  in_text_b = gr.TextArea(
152
  label="Text B",
@@ -154,35 +178,64 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
154
  lines=12,
155
  max_lines=12,
156
  )
157
- out_code_b = gr.Textbox(label="ISCC Code for Text B")
158
 
159
  with gr.Row(variant="panel"):
160
  with gr.Column(variant="panel"):
161
- out_similarity = gr.HTML(label="Similarity")
 
 
 
 
 
162
 
163
  with gr.Row(variant="panel"):
164
- in_iscc_bits = gr.Slider(
165
- label="ISCC Bit-Length",
166
- info="NUMBER OF BITS FOR OUTPUT ISCC",
167
- minimum=64,
168
- maximum=256,
169
- step=32,
170
- value=64,
171
- )
172
 
173
- with gr.Row(variant="panel"):
174
- with gr.Column(variant="panel"):
175
- out_chunks_a = gr.HighlightedText(
176
- label="Chunked Text A",
177
- interactive=False,
178
- elem_id="chunked-text-a",
179
- )
180
- with gr.Column(variant="panel"):
181
- out_chunks_b = gr.HighlightedText(
182
- label="Chunked Text B",
183
- interactive=False,
184
- elem_id="chunked-text-b",
185
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  def update_sample_text(choice, group):
188
  if choice == "None":
@@ -200,86 +253,151 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
200
  outputs=[in_text_b],
201
  )
202
 
203
- def process_text(text, nbits, suffix):
204
- log.debug(f"{text[:20]}")
205
- out_code_func = globals().get(f"out_code_{suffix}")
206
- out_chunks_func = globals().get(f"out_chunks_{suffix}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- if not text:
209
- return {
210
- out_code_func: gr.Textbox(value=None),
211
- out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
212
- }
213
 
214
- result = sct.gen_text_code_semantic(
215
- text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
216
- )
217
- iscc = sct.Metadata(**result).to_object_format()
218
 
219
- # Generate chunked text with simprints and overlaps
220
- features = iscc.features[0]
221
- highlighted_chunks = []
222
- overlaps = iscc.get_overlaps()
223
 
224
- for i, feature in enumerate(features.simprints):
225
- feature: sct.Feature
226
- content = feature.content
227
 
228
- # Remove leading overlap
229
- if i > 0 and overlaps[i - 1]:
230
- content = content[len(overlaps[i - 1]) :]
231
 
232
- # Remove trailing overlap
233
- if i < len(overlaps) and overlaps[i]:
234
- content = content[: -len(overlaps[i])]
 
235
 
236
- label = f"{feature.size}:{feature.simprint}"
237
- highlighted_chunks.append((no_nl_inner(content), label))
 
 
 
 
 
238
 
239
- if i < len(overlaps):
240
- overlap = overlaps[i]
241
- if overlap:
242
- highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap"))
243
 
244
- return {
245
- out_code_func: gr.Textbox(value=iscc.iscc),
246
- out_chunks_func: gr.HighlightedText(value=highlighted_chunks, elem_id="chunked-text"),
247
- }
248
 
249
- def recalculate_iscc(text_a, text_b, nbits):
250
- code_a = sct.gen_text_code_semantic(text_a, bits=nbits)["iscc"] if text_a else None
251
- code_b = sct.gen_text_code_semantic(text_b, bits=nbits)["iscc"] if text_b else None
252
 
253
- if code_a and code_b:
254
- similarity = compare_codes(code_a, code_b, nbits)
255
- else:
256
- similarity = None
 
 
 
 
 
 
 
 
 
257
 
258
  return (
259
- gr.Textbox(value=code_a) if code_a else gr.Textbox(),
260
- gr.Textbox(value=code_b) if code_b else gr.Textbox(),
 
 
261
  similarity,
 
262
  )
263
 
264
  in_text_a.change(
265
- lambda text, nbits: process_text(text, nbits, "a"),
266
- inputs=[in_text_a, in_iscc_bits],
267
- outputs=[out_code_a, out_chunks_a],
 
 
 
 
 
 
 
268
  show_progress="full",
269
  trigger_mode="always_last",
270
  )
 
271
  in_text_b.change(
272
- lambda text, nbits: process_text(text, nbits, "b"),
273
- inputs=[in_text_b, in_iscc_bits],
274
- outputs=[out_code_b, out_chunks_b],
 
 
 
 
 
 
 
275
  show_progress="full",
276
  trigger_mode="always_last",
277
  )
278
 
279
  in_iscc_bits.change(
280
- recalculate_iscc,
281
- inputs=[in_text_a, in_text_b, in_iscc_bits],
282
- outputs=[out_code_a, out_code_b, out_similarity],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  show_progress="full",
284
  )
285
 
@@ -292,12 +410,12 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
292
 
293
  def reset_all():
294
  return (
295
- gr.Slider(value=128), # Reset ISCC Bit-Length
296
  gr.Dropdown(
297
- value="None", choices=["None"] + [f"a:{lang}" for lang in samples["a"]]
298
  ), # Reset sample dropdown A
299
  gr.Dropdown(
300
- value="None", choices=["None"] + [f"b:{lang}" for lang in samples["b"]]
301
  ), # Reset sample dropdown B
302
  gr.TextArea(value=""), # Reset Text A
303
  gr.TextArea(value=""), # Reset Text B
@@ -308,9 +426,6 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
308
  gr.HighlightedText(value=[]), # Reset Chunked Text B
309
  )
310
 
311
- with gr.Row(variant="panel"):
312
- reset_button = gr.Button("Reset All")
313
-
314
  reset_button.click(
315
  reset_all,
316
  outputs=[
@@ -334,31 +449,58 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
334
  ## Understanding ISCC Semantic Text-Codes
335
 
336
  ### What is an ISCC Semantic Text-Code?
337
- An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of the text,
338
- not just the exact words.
 
339
 
340
  ### How does it work?
341
  1. **Input**: You provide a text in any language.
342
- 2. **Processing**: Our system analyzes the meaning of the text.
343
- 3. **Output**: A unique code is generated that represents the text's content.
344
 
345
  ### What can it do?
346
  - **Cross-language matching**: It can recognize similar content across different languages.
347
  - **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
348
- - **Content identification**: It can help identify texts with similar content, even if the wording is different.
 
349
 
350
  ### How to use this demo:
351
  1. **Enter text**: Type or paste text into either or both text boxes.
352
- 2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more detailed).
 
353
  3. **View results**: See the generated ISCC code for each text.
354
- 4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning.
 
 
 
 
 
 
355
 
356
  ### Why is this useful?
357
  - **Content creators**: Find similar content across languages.
358
  - **Researchers**: Quickly compare documents or find related texts in different languages.
359
  - **Publishers**: Identify potential translations or similar works efficiently.
360
 
361
- This technology opens up new possibilities for understanding and managing text content across language barriers!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  """
363
  )
364
 
 
7
  import iscc_sct as sct
8
  import textwrap
9
  import yaml
10
+ import pathlib
11
+
12
+
13
+ HERE = pathlib.Path(__file__).parent.absolute()
14
+
15
+
16
+ custom_css = """
17
+ .simbar {
18
+ background: white;
19
+ min-height: 30px;
20
+ }
21
+ """
22
 
23
 
24
  newline_symbols = {
 
68
  return code1["iscc"], code2["iscc"], similarity
69
 
70
 
71
+ import binascii
72
+
73
+
74
  def compare_codes(code_a, code_b, bits):
75
+ if code_a and code_b:
76
+ code_a_str = code_a.value if hasattr(code_a, "value") else str(code_a)
77
+ code_b_str = code_b.value if hasattr(code_b, "value") else str(code_b)
78
+ if code_a_str and code_b_str:
79
+ try:
80
+ distance = sct.iscc_distance(code_a_str, code_b_str)
81
+ return generate_similarity_bar(hamming_to_cosine(distance, bits))
82
+ except binascii.Error:
83
+ # Invalid ISCC code format
84
+ return None
85
+ return None
86
 
87
 
88
  def truncate_text(text, max_length=70):
 
113
  "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
114
  )
115
 
116
+ tooltip = "Similarity based on ISCC code comparison, not direct text comparison."
117
+
118
  bar_html = f"""
119
+ <div title="{tooltip}" style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
 
120
  <div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
121
  <span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
122
  </div>
 
126
 
127
 
128
  def load_samples():
129
+ with open(HERE / "samples.yml", "r", encoding="utf-8") as file:
130
  return yaml.safe_load(file)["samples"]
131
 
132
 
133
  samples = load_samples()
134
 
 
 
135
 
136
  iscc_theme = gr.themes.Default(
137
+ font=[gr.themes.GoogleFont("Readex Pro Light")],
138
  font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
139
+ text_size=gr.themes.sizes.text_lg,
140
  radius_size=gr.themes.sizes.radius_none,
141
  )
142
 
 
144
  with gr.Row(variant="panel"):
145
  gr.Markdown(
146
  """
147
+ ## 🔮️ ISCC - Semantic-Code Text
148
  Demo of cross-lingual Semantic Text-Code (proof of concept)
149
  """,
150
  )
 
170
  lines=12,
171
  max_lines=12,
172
  )
173
+ out_code_a = gr.Textbox(label="ISCC-SCT for Text A")
174
  with gr.Column(variant="panel"):
175
  in_text_b = gr.TextArea(
176
  label="Text B",
 
178
  lines=12,
179
  max_lines=12,
180
  )
181
+ out_code_b = gr.Textbox(label="ISCC-SCT for Text B")
182
 
183
  with gr.Row(variant="panel"):
184
  with gr.Column(variant="panel"):
185
+ out_similarity_title = gr.Markdown("### ISCC-based Semantic Similarity")
186
+ with gr.Row(elem_classes="simbar"):
187
+ out_similarity = gr.HTML()
188
+ gr.Markdown(
189
+ "**NOTE:** Similarity is calculated based on the generated ISCC-SCT, not the original text."
190
+ )
191
 
192
  with gr.Row(variant="panel"):
193
+ reset_button = gr.Button("Reset All")
 
 
 
 
 
 
 
194
 
195
+ with gr.Accordion(label="🔍 Explore Details & Advanced Options", open=False):
196
+ with gr.Row(variant="panel"):
197
+ with gr.Column(variant="panel"):
198
+ in_iscc_bits = gr.Slider(
199
+ label="ISCC Bit-Length",
200
+ info="NUMBER OF BITS FOR OUTPUT ISCC",
201
+ minimum=64,
202
+ maximum=256,
203
+ step=32,
204
+ value=sct.sct_opts.bits,
205
+ )
206
+ with gr.Column(variant="panel"):
207
+ in_max_tokens = gr.Slider(
208
+ label="Max Tokens",
209
+ info="MAXIMUM NUMBER OF TOKENS PER CHUNK",
210
+ minimum=49,
211
+ maximum=sct.sct_opts.max_tokens,
212
+ step=1,
213
+ value=127,
214
+ )
215
+
216
+ with gr.Row(variant="panel"):
217
+ with gr.Column(variant="panel"):
218
+ out_chunks_a = gr.HighlightedText(
219
+ label="Chunked Text A",
220
+ interactive=False,
221
+ elem_id="chunked-text-a",
222
+ )
223
+ with gr.Column(variant="panel"):
224
+ out_chunks_b = gr.HighlightedText(
225
+ label="Chunked Text B",
226
+ interactive=False,
227
+ elem_id="chunked-text-b",
228
+ )
229
+
230
+ with gr.Row(variant="panel"):
231
+ with gr.Column(variant="panel"):
232
+ gr.Markdown("### Granular Matches")
233
+ in_granular_matches = gr.Dataframe(
234
+ headers=["Chunk A", "Similarity", "Chunk B"],
235
+ column_widths=["45%", "10%", "45%"],
236
+ wrap=True,
237
+ elem_classes="granular-matches",
238
+ )
239
 
240
  def update_sample_text(choice, group):
241
  if choice == "None":
 
253
  outputs=[in_text_b],
254
  )
255
 
256
+ def process_and_calculate(text_a, text_b, nbits, max_tokens):
257
+ log.debug(f"Processing text_a: {text_a[:20]}, text_b: {text_b[:20]}")
258
+
259
+ def process_single_text(text, suffix):
260
+ out_code_func = globals().get(f"out_code_{suffix}")
261
+ out_chunks_func = globals().get(f"out_chunks_{suffix}")
262
+
263
+ if not text:
264
+ return {
265
+ out_code_func: gr.Textbox(value=None),
266
+ out_chunks_func: gr.HighlightedText(
267
+ value=None, elem_id=f"chunked-text-{suffix}"
268
+ ),
269
+ }
270
+
271
+ result = sct.gen_text_code_semantic(
272
+ text,
273
+ bits=nbits,
274
+ simprints=True,
275
+ offsets=True,
276
+ sizes=True,
277
+ contents=True,
278
+ max_tokens=max_tokens,
279
+ )
280
+ iscc = sct.Metadata(**result).to_object_format()
281
 
282
+ # Generate chunked text with simprints and overlaps
283
+ features = iscc.features[0]
284
+ highlighted_chunks = []
285
+ overlaps = iscc.get_overlaps()
 
286
 
287
+ for i, feature in enumerate(features.simprints):
288
+ feature: sct.Feature
289
+ content = feature.content
 
290
 
291
+ # Remove leading overlap
292
+ if i > 0 and overlaps[i - 1]:
293
+ content = content[len(overlaps[i - 1]) :]
 
294
 
295
+ # Remove trailing overlap
296
+ if i < len(overlaps) and overlaps[i]:
297
+ content = content[: -len(overlaps[i])]
298
 
299
+ label = f"{feature.size}:{feature.simprint}"
300
+ highlighted_chunks.append((no_nl_inner(content), label))
 
301
 
302
+ if i < len(overlaps):
303
+ overlap = overlaps[i]
304
+ if overlap:
305
+ highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap"))
306
 
307
+ return {
308
+ out_code_func: gr.Textbox(value=iscc.iscc),
309
+ out_chunks_func: gr.HighlightedText(
310
+ value=highlighted_chunks, elem_id=f"chunked-text-{suffix}"
311
+ ),
312
+ "metadata": iscc,
313
+ }
314
 
315
+ result_a = process_single_text(text_a, "a")
316
+ result_b = process_single_text(text_b, "b")
 
 
317
 
318
+ code_a = result_a[out_code_a] if text_a else None
319
+ code_b = result_b[out_code_b] if text_b else None
 
 
320
 
321
+ similarity = compare_codes(code_a, code_b, nbits) or out_similarity
 
 
322
 
323
+ granular_matches = []
324
+ if text_a and text_b:
325
+ matches = sct.granular_similarity(
326
+ result_a["metadata"], result_b["metadata"], threshold=80
327
+ )
328
+ for match in matches:
329
+ granular_matches.append(
330
+ [
331
+ match[0].content,
332
+ f"{match[1]}%",
333
+ match[2].content,
334
+ ]
335
+ )
336
 
337
  return (
338
+ result_a[out_code_a],
339
+ result_a[out_chunks_a],
340
+ result_b[out_code_b],
341
+ result_b[out_chunks_b],
342
  similarity,
343
+ gr.Dataframe(value=granular_matches),
344
  )
345
 
346
  in_text_a.change(
347
+ process_and_calculate,
348
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
349
+ outputs=[
350
+ out_code_a,
351
+ out_chunks_a,
352
+ out_code_b,
353
+ out_chunks_b,
354
+ out_similarity,
355
+ in_granular_matches,
356
+ ],
357
  show_progress="full",
358
  trigger_mode="always_last",
359
  )
360
+
361
  in_text_b.change(
362
+ process_and_calculate,
363
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
364
+ outputs=[
365
+ out_code_a,
366
+ out_chunks_a,
367
+ out_code_b,
368
+ out_chunks_b,
369
+ out_similarity,
370
+ in_granular_matches,
371
+ ],
372
  show_progress="full",
373
  trigger_mode="always_last",
374
  )
375
 
376
  in_iscc_bits.change(
377
+ process_and_calculate,
378
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
379
+ outputs=[
380
+ out_code_a,
381
+ out_chunks_a,
382
+ out_code_b,
383
+ out_chunks_b,
384
+ out_similarity,
385
+ in_granular_matches,
386
+ ],
387
+ show_progress="full",
388
+ )
389
+
390
+ in_max_tokens.change(
391
+ process_and_calculate,
392
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
393
+ outputs=[
394
+ out_code_a,
395
+ out_chunks_a,
396
+ out_code_b,
397
+ out_chunks_b,
398
+ out_similarity,
399
+ in_granular_matches,
400
+ ],
401
  show_progress="full",
402
  )
403
 
 
410
 
411
  def reset_all():
412
  return (
413
+ gr.Slider(value=64), # Reset ISCC Bit-Length
414
  gr.Dropdown(
415
+ value="None", choices=["None"] + [lang for lang in samples["a"]]
416
  ), # Reset sample dropdown A
417
  gr.Dropdown(
418
+ value="None", choices=["None"] + [lang for lang in samples["b"]]
419
  ), # Reset sample dropdown B
420
  gr.TextArea(value=""), # Reset Text A
421
  gr.TextArea(value=""), # Reset Text B
 
426
  gr.HighlightedText(value=[]), # Reset Chunked Text B
427
  )
428
 
 
 
 
429
  reset_button.click(
430
  reset_all,
431
  outputs=[
 
449
  ## Understanding ISCC Semantic Text-Codes
450
 
451
  ### What is an ISCC Semantic Text-Code?
452
+ An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of
453
+ the text, not just the exact words. Technically it is am ISCC-encoded, binarized multi-lingual
454
+ document-embedding.
455
 
456
  ### How does it work?
457
  1. **Input**: You provide a text in any language.
458
+ 2. **Processing**: Vector embeddings are created for individual chunks of the text.
459
+ 3. **Output**: A unique ISCC-UNIT is generated that represents the entire text's content.
460
 
461
  ### What can it do?
462
  - **Cross-language matching**: It can recognize similar content across different languages.
463
  - **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
464
+ - **Content identification**: It can help identify texts with similar content, even if the wording
465
+ is different.
466
 
467
  ### How to use this demo:
468
  1. **Enter text**: Type or paste text into either or both text boxes.
469
+ 2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more
470
+ detailed).
471
  3. **View results**: See the generated ISCC code for each text.
472
+ 4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning, based on
473
+ their ISCC codes.
474
+
475
+ ### Important Note:
476
+ The similarity shown is calculated by comparing the ISCC codes, not the original texts. This
477
+ allows for efficient and privacy-preserving comparisons, as only the codes need to be shared
478
+ or stored.
479
 
480
  ### Why is this useful?
481
  - **Content creators**: Find similar content across languages.
482
  - **Researchers**: Quickly compare documents or find related texts in different languages.
483
  - **Publishers**: Identify potential translations or similar works efficiently.
484
 
485
+ This technology opens up new possibilities for understanding and managing text content across
486
+ language barriers!
487
+
488
+ ### Explore Details & Advanced Options
489
+
490
+ The "Explore Details & Advanced Options" section provides additional tools and information:
491
+
492
+ 1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
493
+ comparisons but may be more sensitive to minor differences.
494
+
495
+ 2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
496
+ for processing.
497
+
498
+ 3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
499
+ color-coded and labeled with its size and simprint (a similarity preserving fingerprint).
500
+
501
+ 4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
502
+ This table shows which specific parts of the texts are most similar, along with their approximate
503
+ cosine similarity (scaled -100% to +100%).
504
  """
505
  )
506
 
iscc_sct/models.py CHANGED
@@ -70,7 +70,7 @@ The `FeatureSet` model unifies these two formats by allowing either structure to
70
  To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
71
  """
72
 
73
- from typing import List, Optional, Dict, Any, Union
74
  from pydantic import BaseModel
75
 
76
 
 
70
  To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
71
  """
72
 
73
+ from typing import List, Optional, Union
74
  from pydantic import BaseModel
75
 
76
 
iscc_sct/utils.py CHANGED
@@ -8,6 +8,8 @@ from pathlib import Path
8
  from urllib.request import urlretrieve
9
  from blake3 import blake3
10
  from platformdirs import PlatformDirs
 
 
11
 
12
 
13
  APP_NAME = "iscc-sct"
@@ -21,8 +23,12 @@ __all__ = [
21
  "get_model",
22
  "encode_base32",
23
  "encode_base64",
 
 
24
  "hamming_distance",
25
  "iscc_distance",
 
 
26
  "MODEL_PATH",
27
  ]
28
 
@@ -176,3 +182,58 @@ def iscc_distance(iscc1, iscc2):
176
 
177
  # Calculate and return the Hamming distance
178
  return hamming_distance(content1, content2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  from urllib.request import urlretrieve
9
  from blake3 import blake3
10
  from platformdirs import PlatformDirs
11
+ from typing import List, Tuple
12
+ from iscc_sct.models import Metadata, Feature
13
 
14
 
15
  APP_NAME = "iscc-sct"
 
23
  "get_model",
24
  "encode_base32",
25
  "encode_base64",
26
+ "decode_base32",
27
+ "decode_base64",
28
  "hamming_distance",
29
  "iscc_distance",
30
+ "cosine_similarity",
31
+ "granular_similarity",
32
  "MODEL_PATH",
33
  ]
34
 
 
182
 
183
  # Calculate and return the Hamming distance
184
  return hamming_distance(content1, content2)
185
+
186
+
187
+ def cosine_similarity(a, b):
188
+ # type: (bytes, bytes) -> int
189
+ """
190
+ Calculate the approximate cosine similarity based on Hamming distance for two bytes inputs.
191
+
192
+ :param a: The first bytes object.
193
+ :param b: The second bytes object.
194
+ :return: The approximate cosine similarity between the two inputs, scaled from -100 to +100.
195
+ :raise ValueError: If a and b are not the same length.
196
+ """
197
+ if len(a) != len(b):
198
+ raise ValueError("The lengths of the two bytes objects must be the same")
199
+
200
+ distance = hamming_distance(a, b)
201
+ total_bits = len(a) * 8
202
+ similarity = 1 - (2 * distance / total_bits)
203
+ return max(min(int(similarity * 100), 100), -100)
204
+
205
+
206
+ def granular_similarity(metadata_a, metadata_b, threshold=80):
207
+ # type: (Metadata, Metadata, int) -> List[Tuple[Feature, int, Feature]]
208
+ """
209
+ Compare simprints from two Metadata objects and return matching pairs above a similarity
210
+ threshold. Only the most similar pair for each simprint_a is included.
211
+
212
+ :param metadata_a: The first Metadata object.
213
+ :param metadata_b: The second Metadata object.
214
+ :param threshold: The similarity threshold (0-100) above which simprints are considered a match.
215
+ :return: A list of tuples containing matching simprints and their similarity.
216
+ """
217
+ metadata_a = metadata_a.to_object_format()
218
+ metadata_b = metadata_b.to_object_format()
219
+
220
+ matches = []
221
+
222
+ for feature_set_a in metadata_a.features:
223
+ for simprint_a in feature_set_a.simprints:
224
+ best_match = None
225
+ best_similarity = threshold - 1
226
+
227
+ for feature_set_b in metadata_b.features:
228
+ for simprint_b in feature_set_b.simprints:
229
+ similarity = cosine_similarity(
230
+ decode_base64(simprint_a.simprint), decode_base64(simprint_b.simprint)
231
+ )
232
+ if similarity > best_similarity:
233
+ best_similarity = similarity
234
+ best_match = (simprint_a, similarity, simprint_b)
235
+
236
+ if best_match:
237
+ matches.append(best_match)
238
+
239
+ return matches
poetry.lock CHANGED
@@ -696,13 +696,13 @@ socks = ["socksio (==1.*)"]
696
 
697
  [[package]]
698
  name = "huggingface-hub"
699
- version = "0.24.5"
700
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
701
  optional = false
702
  python-versions = ">=3.8.0"
703
  files = [
704
- {file = "huggingface_hub-0.24.5-py3-none-any.whl", hash = "sha256:d93fb63b1f1a919a22ce91a14518974e81fc4610bf344dfe7572343ce8d3aced"},
705
- {file = "huggingface_hub-0.24.5.tar.gz", hash = "sha256:7b45d6744dd53ce9cbf9880957de00e9d10a9ae837f1c9b7255fc8fa4e8264f3"},
706
  ]
707
 
708
  [package.dependencies]
@@ -2796,4 +2796,4 @@ gpu = ["onnxruntime-gpu"]
2796
  [metadata]
2797
  lock-version = "2.0"
2798
  python-versions = ">=3.9,<3.13"
2799
- content-hash = "e4a4f012af4c1e60326f792c8801857dbf9298d8992fdd83d3b8f0688d4c04ea"
 
696
 
697
  [[package]]
698
  name = "huggingface-hub"
699
+ version = "0.24.6"
700
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
701
  optional = false
702
  python-versions = ">=3.8.0"
703
  files = [
704
+ {file = "huggingface_hub-0.24.6-py3-none-any.whl", hash = "sha256:a990f3232aa985fe749bc9474060cbad75e8b2f115f6665a9fda5b9c97818970"},
705
+ {file = "huggingface_hub-0.24.6.tar.gz", hash = "sha256:cc2579e761d070713eaa9c323e3debe39d5b464ae3a7261c39a9195b27bb8000"},
706
  ]
707
 
708
  [package.dependencies]
 
2796
  [metadata]
2797
  lock-version = "2.0"
2798
  python-versions = ">=3.9,<3.13"
2799
+ content-hash = "bf76c08f3c1285eb61f541a9cd654079dc15cc4bd77dd8994a815864e7e8c4a0"
pyproject.toml CHANGED
@@ -57,10 +57,11 @@ tokenizers = "*"
57
  pydantic-settings = "*"
58
  charset-normalizer = "*"
59
  numpy = "<2.0.0"
60
- pybase64 = "^1.4.0"
61
  certifi = ">=2024.07.04"
62
  gradio = { version = "*", optional = true }
63
- pyyaml = "^6.0.2"
 
64
 
65
 
66
  [tool.poetry.extras]
 
57
  pydantic-settings = "*"
58
  charset-normalizer = "*"
59
  numpy = "<2.0.0"
60
+ pybase64 = "*"
61
  certifi = ">=2024.07.04"
62
  gradio = { version = "*", optional = true }
63
+ pyyaml = "*"
64
+ pydantic = "*"
65
 
66
 
67
  [tool.poetry.extras]
space.yml CHANGED
@@ -1,5 +1,5 @@
1
  title: ISCC-LAB - Semantic-Code Text
2
- emoji: ▶️
3
  colorFrom: red
4
  colorTo: blue
5
  sdk: gradio
 
1
  title: ISCC-LAB - Semantic-Code Text
2
+ emoji: 🔮
3
  colorFrom: red
4
  colorTo: blue
5
  sdk: gradio
tests/test_demo.py CHANGED
@@ -3,7 +3,6 @@ from iscc_sct.demo import (
3
  compare_codes,
4
  hamming_to_cosine,
5
  generate_similarity_bar,
6
- recalculate_iscc,
7
  )
8
 
9
 
@@ -45,81 +44,3 @@ def test_generate_similarity_bar():
45
  result = generate_similarity_bar(-0.5)
46
  assert "-50.00%" in result
47
  assert "red" in result
48
-
49
-
50
- from unittest.mock import patch, MagicMock
51
- import gradio as gr
52
- from iscc_sct.demo import process_text
53
-
54
-
55
- def test_process_text():
56
- # Test with valid input
57
- result = process_text("Hello, world!", 64, "a")
58
- assert isinstance(result, dict)
59
- assert len(result) == 2
60
- key, value = next(iter(result.items()))
61
- assert isinstance(key, gr.components.Textbox)
62
- assert isinstance(value, gr.components.Textbox)
63
- assert value.value == "ISCC:CAA7GY4JTDI3XZYV"
64
-
65
- # Test with empty input
66
- result = process_text("", 64, "b")
67
- assert isinstance(result, dict)
68
- assert len(result) == 2
69
- for key, value in result.items():
70
- assert isinstance(key, (gr.components.Textbox, gr.components.HighlightedText))
71
- assert value.value is None
72
-
73
- # Test with different suffix
74
- result = process_text("Test", 64, "b")
75
- assert len(result) == 2
76
- key, value = next(iter(result.items()))
77
- assert isinstance(key, gr.components.Textbox)
78
- assert isinstance(value, gr.components.Textbox)
79
-
80
-
81
- @patch("iscc_sct.demo.sct.gen_text_code_semantic")
82
- @patch("iscc_sct.demo.compare_codes")
83
- def test_recalculate_iscc(mock_compare_codes, mock_gen_text_code):
84
- mock_gen_text_code.side_effect = lambda text, bits: {"iscc": f"ISCC:{text[:4].upper()}{bits}"}
85
- mock_compare_codes.return_value = "<similarity_html>"
86
-
87
- # Test with both texts non-empty
88
- result = recalculate_iscc("Hello", "World", 64)
89
- assert len(result) == 3
90
- assert isinstance(result[0], gr.components.Textbox)
91
- assert result[0].value == "ISCC:HELL64"
92
- assert isinstance(result[1], gr.components.Textbox)
93
- assert result[1].value == "ISCC:WORL64"
94
- assert result[2] == "<similarity_html>"
95
-
96
- # Test with first text empty
97
- result = recalculate_iscc("", "World", 128)
98
- assert len(result) == 3
99
- assert isinstance(result[0], gr.components.Textbox)
100
- assert result[0].value is None
101
- assert isinstance(result[1], gr.components.Textbox)
102
- assert result[1].value == "ISCC:WORL128"
103
- assert result[2] is None
104
-
105
- # Test with second text empty
106
- result = recalculate_iscc("Hello", "", 256)
107
- assert len(result) == 3
108
- assert isinstance(result[0], gr.components.Textbox)
109
- assert result[0].value == "ISCC:HELL256"
110
- assert isinstance(result[1], gr.components.Textbox)
111
- assert result[1].value is None
112
- assert result[2] is None
113
-
114
- # Test with both texts empty
115
- result = recalculate_iscc("", "", 64)
116
- assert len(result) == 3
117
- assert isinstance(result[0], gr.components.Textbox)
118
- assert result[0].value is None
119
- assert isinstance(result[1], gr.components.Textbox)
120
- assert result[1].value is None
121
- assert result[2] is None
122
-
123
- # Verify function calls
124
- assert mock_gen_text_code.call_count == 4
125
- assert mock_compare_codes.call_count == 1
 
3
  compare_codes,
4
  hamming_to_cosine,
5
  generate_similarity_bar,
 
6
  )
7
 
8
 
 
44
  result = generate_similarity_bar(-0.5)
45
  assert "-50.00%" in result
46
  assert "red" in result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_utils.py CHANGED
@@ -90,3 +90,132 @@ def test_iscc_distance_different_lengths():
90
  iscc2 = sct.create("Hello", bits=96).iscc
91
  with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
92
  utils.iscc_distance(iscc1, iscc2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  iscc2 = sct.create("Hello", bits=96).iscc
91
  with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
92
  utils.iscc_distance(iscc1, iscc2)
93
+
94
+
95
def test_cosine_similarity_identical():
    # Identical digests -> zero differing bits -> maximum score of 100.
    digest = b"\x00\x00\x00\x00"
    assert utils.cosine_similarity(digest, bytes(digest)) == 100
99
+
100
+
101
def test_cosine_similarity_opposite():
    # Every one of the 32 bits differs -> minimum score of -100.
    zeros = b"\x00" * 4
    ones = b"\xff" * 4
    assert utils.cosine_similarity(zeros, ones) == -100
105
+
106
+
107
def test_cosine_similarity_half_similar():
    # Half the bits (16 of 32) differ -> score of exactly 0.
    left = b"\x00\x00" + b"\xff\xff"
    right = bytes(4)
    assert utils.cosine_similarity(left, right) == 0
111
+
112
+
113
def test_cosine_similarity_quarter_similar():
    # 24 of 32 bits differ -> -50 on the [-100, 100] scale.
    left = b"\x00" + b"\xff" * 3
    right = bytes(4)
    assert utils.cosine_similarity(left, right) == -50
117
+
118
+
119
def test_cosine_similarity_three_quarter_similar():
    # Only the last byte differs (8 of 32 bits) -> +50.
    left = b"\x00" * 3 + b"\xff"
    right = bytes(4)
    assert utils.cosine_similarity(left, right) == 50
123
+
124
+
125
def test_cosine_similarity_different_lengths():
    # Inputs of unequal length must be rejected with a ValueError.
    shorter = bytes(3)
    longer = bytes(4)
    with pytest.raises(ValueError, match="The lengths of the two bytes objects must be the same"):
        utils.cosine_similarity(shorter, longer)
130
+
131
+
132
def test_granular_similarity():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    shared_simprint = "AAECAwQFBgc"  # present in both metadata objects

    metadata_a = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[
                    Feature(simprint=shared_simprint),
                    Feature(simprint="CAkKCwwNDg8"),  # only in A
                ]
            )
        ],
    )
    metadata_b = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[
                    Feature(simprint=shared_simprint),
                    Feature(simprint="EBESExQVFhc"),  # only in B
                ]
            )
        ],
    )

    # Default threshold: only the identical simprint pair is reported.
    matches = utils.granular_similarity(metadata_a, metadata_b)
    assert len(matches) == 1
    first = matches[0]
    assert first[0].simprint == shared_simprint
    assert first[1] == 100
    assert first[2].simprint == shared_simprint

    # Threshold 0: every simprint in A finds some counterpart in B.
    assert len(utils.granular_similarity(metadata_a, metadata_b, threshold=0)) == 2

    # Threshold above the maximum possible score: nothing qualifies.
    assert len(utils.granular_similarity(metadata_a, metadata_b, threshold=101)) == 0
174
+
175
+
176
def test_granular_similarity_no_matches():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    def build(simprint):
        # Metadata wrapping exactly one simprint feature.
        return Metadata(
            iscc="ISCC:KACYPXW563EDNM",
            features=[FeatureSet(simprints=[Feature(simprint=simprint)])],
        )

    # Completely different simprints -> no match pairs at default threshold.
    result = utils.granular_similarity(build("AAECAwQFBgc"), build("CAkKCwwNDg8"))
    assert len(result) == 0
191
+
192
+
193
def test_granular_similarity_multiple_matches():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    iscc = "ISCC:KACYPXW563EDNM"

    # Two feature sets per side; "AAECAwQFBgc" and "EBESExQVFhc" exist on
    # both sides, the remaining simprints are unique to one side.
    side_a = [
        FeatureSet(
            simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="CAkKCwwNDg8")]
        ),
        FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
    ]
    side_b = [
        FeatureSet(
            simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="GBkaGxwdHh8")]
        ),
        FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
    ]

    matches = utils.granular_similarity(Metadata(iscc=iscc, features=side_a), Metadata(iscc=iscc, features=side_b))

    assert len(matches) == 2
    pairings = {(match[0].simprint, match[2].simprint) for match in matches}
    assert pairings == {
        ("AAECAwQFBgc", "AAECAwQFBgc"),
        ("EBESExQVFhc", "EBESExQVFhc"),
    }