titusz commited on
Commit
8c51bed
1 Parent(s): 73ab668

Synced repo using 'sync_with_huggingface' Github Action

Browse files
iscc_sct/cli.py CHANGED
@@ -9,7 +9,10 @@ from charset_normalizer import from_bytes
9
  def main():
10
  parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
11
  parser.add_argument(
12
- "path", type=str, help="Path to text files (supports glob patterns).", nargs="?"
 
 
 
13
  )
14
  parser.add_argument(
15
  "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
@@ -27,6 +30,17 @@ def main():
27
  if not args.debug:
28
  logger.remove()
29
 
 
 
 
 
 
 
 
 
 
 
 
30
  for path in glob.glob(args.path):
31
  path = Path(path)
32
  if path.is_file():
 
9
  def main():
10
  parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
11
  parser.add_argument(
12
+ "path",
13
+ type=str,
14
+ help="Path to text files (supports glob patterns) or 'gui' to launch Gradio demo.",
15
+ nargs="?",
16
  )
17
  parser.add_argument(
18
  "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
 
30
  if not args.debug:
31
  logger.remove()
32
 
33
+ if args.path == "gui": # pragma: no cover
34
+ try:
35
+ from iscc_sct.demo import demo
36
+
37
+ demo.launch(inbrowser=True)
38
+ except ImportError:
39
+ print(
40
+ "Error: Gradio is not installed. Please install it with 'pip install gradio' to use the GUI."
41
+ )
42
+ return
43
+
44
  for path in glob.glob(args.path):
45
  path = Path(path)
46
  if path.is_file():
iscc_sct/demo.py CHANGED
@@ -7,6 +7,18 @@ import gradio as gr
7
  import iscc_sct as sct
8
  import textwrap
9
  import yaml
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  newline_symbols = {
@@ -56,9 +68,21 @@ def compute_iscc_code(text1, text2, bit_length):
56
  return code1["iscc"], code2["iscc"], similarity
57
 
58
 
 
 
 
59
  def compare_codes(code_a, code_b, bits):
60
- if all([code_a, code_b]):
61
- return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  def truncate_text(text, max_length=70):
@@ -89,9 +113,10 @@ def generate_similarity_bar(similarity):
89
  "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
90
  )
91
 
 
 
92
  bar_html = f"""
93
- <h3>Semantic Similarity</h3>
94
- <div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
95
  <div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
96
  <span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
97
  </div>
@@ -101,18 +126,17 @@ def generate_similarity_bar(similarity):
101
 
102
 
103
  def load_samples():
104
- with open("iscc_sct/samples.yml", "r", encoding="utf-8") as file:
105
  return yaml.safe_load(file)["samples"]
106
 
107
 
108
  samples = load_samples()
109
 
110
- custom_css = """
111
- """
112
 
113
  iscc_theme = gr.themes.Default(
114
- font=[gr.themes.GoogleFont("Readex Pro")],
115
  font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
 
116
  radius_size=gr.themes.sizes.radius_none,
117
  )
118
 
@@ -120,7 +144,7 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
120
  with gr.Row(variant="panel"):
121
  gr.Markdown(
122
  """
123
- ## ✂️ ISCC Semantic Text-Code
124
  Demo of cross-lingual Semantic Text-Code (proof of concept)
125
  """,
126
  )
@@ -146,7 +170,7 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
146
  lines=12,
147
  max_lines=12,
148
  )
149
- out_code_a = gr.Textbox(label="ISCC Code for Text A")
150
  with gr.Column(variant="panel"):
151
  in_text_b = gr.TextArea(
152
  label="Text B",
@@ -154,35 +178,64 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
154
  lines=12,
155
  max_lines=12,
156
  )
157
- out_code_b = gr.Textbox(label="ISCC Code for Text B")
158
 
159
  with gr.Row(variant="panel"):
160
  with gr.Column(variant="panel"):
161
- out_similarity = gr.HTML(label="Similarity")
 
 
 
 
 
162
 
163
  with gr.Row(variant="panel"):
164
- in_iscc_bits = gr.Slider(
165
- label="ISCC Bit-Length",
166
- info="NUMBER OF BITS FOR OUTPUT ISCC",
167
- minimum=64,
168
- maximum=256,
169
- step=32,
170
- value=64,
171
- )
172
 
173
- with gr.Row(variant="panel"):
174
- with gr.Column(variant="panel"):
175
- out_chunks_a = gr.HighlightedText(
176
- label="Chunked Text A",
177
- interactive=False,
178
- elem_id="chunked-text-a",
179
- )
180
- with gr.Column(variant="panel"):
181
- out_chunks_b = gr.HighlightedText(
182
- label="Chunked Text B",
183
- interactive=False,
184
- elem_id="chunked-text-b",
185
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  def update_sample_text(choice, group):
188
  if choice == "None":
@@ -200,86 +253,151 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
200
  outputs=[in_text_b],
201
  )
202
 
203
- def process_text(text, nbits, suffix):
204
- log.debug(f"{text[:20]}")
205
- out_code_func = globals().get(f"out_code_{suffix}")
206
- out_chunks_func = globals().get(f"out_chunks_{suffix}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- if not text:
209
- return {
210
- out_code_func: gr.Textbox(value=None),
211
- out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
212
- }
213
 
214
- result = sct.gen_text_code_semantic(
215
- text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
216
- )
217
- iscc = sct.Metadata(**result).to_object_format()
218
 
219
- # Generate chunked text with simprints and overlaps
220
- features = iscc.features[0]
221
- highlighted_chunks = []
222
- overlaps = iscc.get_overlaps()
223
 
224
- for i, feature in enumerate(features.simprints):
225
- feature: sct.Feature
226
- content = feature.content
227
 
228
- # Remove leading overlap
229
- if i > 0 and overlaps[i - 1]:
230
- content = content[len(overlaps[i - 1]) :]
231
 
232
- # Remove trailing overlap
233
- if i < len(overlaps) and overlaps[i]:
234
- content = content[: -len(overlaps[i])]
 
235
 
236
- label = f"{feature.size}:{feature.simprint}"
237
- highlighted_chunks.append((no_nl_inner(content), label))
 
 
 
 
 
238
 
239
- if i < len(overlaps):
240
- overlap = overlaps[i]
241
- if overlap:
242
- highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap"))
243
 
244
- return {
245
- out_code_func: gr.Textbox(value=iscc.iscc),
246
- out_chunks_func: gr.HighlightedText(value=highlighted_chunks, elem_id="chunked-text"),
247
- }
248
 
249
- def recalculate_iscc(text_a, text_b, nbits):
250
- code_a = sct.gen_text_code_semantic(text_a, bits=nbits)["iscc"] if text_a else None
251
- code_b = sct.gen_text_code_semantic(text_b, bits=nbits)["iscc"] if text_b else None
252
 
253
- if code_a and code_b:
254
- similarity = compare_codes(code_a, code_b, nbits)
255
- else:
256
- similarity = None
 
 
 
 
 
 
 
 
 
257
 
258
  return (
259
- gr.Textbox(value=code_a) if code_a else gr.Textbox(),
260
- gr.Textbox(value=code_b) if code_b else gr.Textbox(),
 
 
261
  similarity,
 
262
  )
263
 
264
  in_text_a.change(
265
- lambda text, nbits: process_text(text, nbits, "a"),
266
- inputs=[in_text_a, in_iscc_bits],
267
- outputs=[out_code_a, out_chunks_a],
 
 
 
 
 
 
 
268
  show_progress="full",
269
  trigger_mode="always_last",
270
  )
 
271
  in_text_b.change(
272
- lambda text, nbits: process_text(text, nbits, "b"),
273
- inputs=[in_text_b, in_iscc_bits],
274
- outputs=[out_code_b, out_chunks_b],
 
 
 
 
 
 
 
275
  show_progress="full",
276
  trigger_mode="always_last",
277
  )
278
 
279
  in_iscc_bits.change(
280
- recalculate_iscc,
281
- inputs=[in_text_a, in_text_b, in_iscc_bits],
282
- outputs=[out_code_a, out_code_b, out_similarity],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  show_progress="full",
284
  )
285
 
@@ -292,12 +410,12 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
292
 
293
  def reset_all():
294
  return (
295
- gr.Slider(value=128), # Reset ISCC Bit-Length
296
  gr.Dropdown(
297
- value="None", choices=["None"] + [f"a:{lang}" for lang in samples["a"]]
298
  ), # Reset sample dropdown A
299
  gr.Dropdown(
300
- value="None", choices=["None"] + [f"b:{lang}" for lang in samples["b"]]
301
  ), # Reset sample dropdown B
302
  gr.TextArea(value=""), # Reset Text A
303
  gr.TextArea(value=""), # Reset Text B
@@ -308,9 +426,6 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
308
  gr.HighlightedText(value=[]), # Reset Chunked Text B
309
  )
310
 
311
- with gr.Row(variant="panel"):
312
- reset_button = gr.Button("Reset All")
313
-
314
  reset_button.click(
315
  reset_all,
316
  outputs=[
@@ -334,31 +449,58 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
334
  ## Understanding ISCC Semantic Text-Codes
335
 
336
  ### What is an ISCC Semantic Text-Code?
337
- An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of the text,
338
- not just the exact words.
 
339
 
340
  ### How does it work?
341
  1. **Input**: You provide a text in any language.
342
- 2. **Processing**: Our system analyzes the meaning of the text.
343
- 3. **Output**: A unique code is generated that represents the text's content.
344
 
345
  ### What can it do?
346
  - **Cross-language matching**: It can recognize similar content across different languages.
347
  - **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
348
- - **Content identification**: It can help identify texts with similar content, even if the wording is different.
 
349
 
350
  ### How to use this demo:
351
  1. **Enter text**: Type or paste text into either or both text boxes.
352
- 2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more detailed).
 
353
  3. **View results**: See the generated ISCC code for each text.
354
- 4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning.
 
 
 
 
 
 
355
 
356
  ### Why is this useful?
357
  - **Content creators**: Find similar content across languages.
358
  - **Researchers**: Quickly compare documents or find related texts in different languages.
359
  - **Publishers**: Identify potential translations or similar works efficiently.
360
 
361
- This technology opens up new possibilities for understanding and managing text content across language barriers!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  """
363
  )
364
 
 
7
  import iscc_sct as sct
8
  import textwrap
9
  import yaml
10
+ import pathlib
11
+
12
+
13
+ HERE = pathlib.Path(__file__).parent.absolute()
14
+
15
+
16
+ custom_css = """
17
+ .simbar {
18
+ background: white;
19
+ min-height: 30px;
20
+ }
21
+ """
22
 
23
 
24
  newline_symbols = {
 
68
  return code1["iscc"], code2["iscc"], similarity
69
 
70
 
71
+ import binascii
72
+
73
+
74
  def compare_codes(code_a, code_b, bits):
75
+ if code_a and code_b:
76
+ code_a_str = code_a.value if hasattr(code_a, "value") else str(code_a)
77
+ code_b_str = code_b.value if hasattr(code_b, "value") else str(code_b)
78
+ if code_a_str and code_b_str:
79
+ try:
80
+ distance = sct.iscc_distance(code_a_str, code_b_str)
81
+ return generate_similarity_bar(hamming_to_cosine(distance, bits))
82
+ except binascii.Error:
83
+ # Invalid ISCC code format
84
+ return None
85
+ return None
86
 
87
 
88
  def truncate_text(text, max_length=70):
 
113
  "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
114
  )
115
 
116
+ tooltip = "Similarity based on ISCC code comparison, not direct text comparison."
117
+
118
  bar_html = f"""
119
+ <div title="{tooltip}" style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
 
120
  <div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
121
  <span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
122
  </div>
 
126
 
127
 
128
  def load_samples():
129
+ with open(HERE / "samples.yml", "r", encoding="utf-8") as file:
130
  return yaml.safe_load(file)["samples"]
131
 
132
 
133
  samples = load_samples()
134
 
 
 
135
 
136
  iscc_theme = gr.themes.Default(
137
+ font=[gr.themes.GoogleFont("Readex Pro Light")],
138
  font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
139
+ text_size=gr.themes.sizes.text_lg,
140
  radius_size=gr.themes.sizes.radius_none,
141
  )
142
 
 
144
  with gr.Row(variant="panel"):
145
  gr.Markdown(
146
  """
147
+ ## 🔮️ ISCC - Semantic-Code Text
148
  Demo of cross-lingual Semantic Text-Code (proof of concept)
149
  """,
150
  )
 
170
  lines=12,
171
  max_lines=12,
172
  )
173
+ out_code_a = gr.Textbox(label="ISCC-SCT for Text A")
174
  with gr.Column(variant="panel"):
175
  in_text_b = gr.TextArea(
176
  label="Text B",
 
178
  lines=12,
179
  max_lines=12,
180
  )
181
+ out_code_b = gr.Textbox(label="ISCC-SCT for Text B")
182
 
183
  with gr.Row(variant="panel"):
184
  with gr.Column(variant="panel"):
185
+ out_similarity_title = gr.Markdown("### ISCC-based Semantic Similarity")
186
+ with gr.Row(elem_classes="simbar"):
187
+ out_similarity = gr.HTML()
188
+ gr.Markdown(
189
+ "**NOTE:** Similarity is calculated based on the generated ISCC-SCT, not the original text."
190
+ )
191
 
192
  with gr.Row(variant="panel"):
193
+ reset_button = gr.Button("Reset All")
 
 
 
 
 
 
 
194
 
195
+ with gr.Accordion(label="🔍 Explore Details & Advanced Options", open=False):
196
+ with gr.Row(variant="panel"):
197
+ with gr.Column(variant="panel"):
198
+ in_iscc_bits = gr.Slider(
199
+ label="ISCC Bit-Length",
200
+ info="NUMBER OF BITS FOR OUTPUT ISCC",
201
+ minimum=64,
202
+ maximum=256,
203
+ step=32,
204
+ value=sct.sct_opts.bits,
205
+ )
206
+ with gr.Column(variant="panel"):
207
+ in_max_tokens = gr.Slider(
208
+ label="Max Tokens",
209
+ info="MAXIMUM NUMBER OF TOKENS PER CHUNK",
210
+ minimum=49,
211
+ maximum=sct.sct_opts.max_tokens,
212
+ step=1,
213
+ value=127,
214
+ )
215
+
216
+ with gr.Row(variant="panel"):
217
+ with gr.Column(variant="panel"):
218
+ out_chunks_a = gr.HighlightedText(
219
+ label="Chunked Text A",
220
+ interactive=False,
221
+ elem_id="chunked-text-a",
222
+ )
223
+ with gr.Column(variant="panel"):
224
+ out_chunks_b = gr.HighlightedText(
225
+ label="Chunked Text B",
226
+ interactive=False,
227
+ elem_id="chunked-text-b",
228
+ )
229
+
230
+ with gr.Row(variant="panel"):
231
+ with gr.Column(variant="panel"):
232
+ gr.Markdown("### Granular Matches")
233
+ in_granular_matches = gr.Dataframe(
234
+ headers=["Chunk A", "Similarity", "Chunk B"],
235
+ column_widths=["45%", "10%", "45%"],
236
+ wrap=True,
237
+ elem_classes="granular-matches",
238
+ )
239
 
240
  def update_sample_text(choice, group):
241
  if choice == "None":
 
253
  outputs=[in_text_b],
254
  )
255
 
256
+ def process_and_calculate(text_a, text_b, nbits, max_tokens):
257
+ log.debug(f"Processing text_a: {text_a[:20]}, text_b: {text_b[:20]}")
258
+
259
+ def process_single_text(text, suffix):
260
+ out_code_func = globals().get(f"out_code_{suffix}")
261
+ out_chunks_func = globals().get(f"out_chunks_{suffix}")
262
+
263
+ if not text:
264
+ return {
265
+ out_code_func: gr.Textbox(value=None),
266
+ out_chunks_func: gr.HighlightedText(
267
+ value=None, elem_id=f"chunked-text-{suffix}"
268
+ ),
269
+ }
270
+
271
+ result = sct.gen_text_code_semantic(
272
+ text,
273
+ bits=nbits,
274
+ simprints=True,
275
+ offsets=True,
276
+ sizes=True,
277
+ contents=True,
278
+ max_tokens=max_tokens,
279
+ )
280
+ iscc = sct.Metadata(**result).to_object_format()
281
 
282
+ # Generate chunked text with simprints and overlaps
283
+ features = iscc.features[0]
284
+ highlighted_chunks = []
285
+ overlaps = iscc.get_overlaps()
 
286
 
287
+ for i, feature in enumerate(features.simprints):
288
+ feature: sct.Feature
289
+ content = feature.content
 
290
 
291
+ # Remove leading overlap
292
+ if i > 0 and overlaps[i - 1]:
293
+ content = content[len(overlaps[i - 1]) :]
 
294
 
295
+ # Remove trailing overlap
296
+ if i < len(overlaps) and overlaps[i]:
297
+ content = content[: -len(overlaps[i])]
298
 
299
+ label = f"{feature.size}:{feature.simprint}"
300
+ highlighted_chunks.append((no_nl_inner(content), label))
 
301
 
302
+ if i < len(overlaps):
303
+ overlap = overlaps[i]
304
+ if overlap:
305
+ highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap"))
306
 
307
+ return {
308
+ out_code_func: gr.Textbox(value=iscc.iscc),
309
+ out_chunks_func: gr.HighlightedText(
310
+ value=highlighted_chunks, elem_id=f"chunked-text-{suffix}"
311
+ ),
312
+ "metadata": iscc,
313
+ }
314
 
315
+ result_a = process_single_text(text_a, "a")
316
+ result_b = process_single_text(text_b, "b")
 
 
317
 
318
+ code_a = result_a[out_code_a] if text_a else None
319
+ code_b = result_b[out_code_b] if text_b else None
 
 
320
 
321
+ similarity = compare_codes(code_a, code_b, nbits) or out_similarity
 
 
322
 
323
+ granular_matches = []
324
+ if text_a and text_b:
325
+ matches = sct.granular_similarity(
326
+ result_a["metadata"], result_b["metadata"], threshold=80
327
+ )
328
+ for match in matches:
329
+ granular_matches.append(
330
+ [
331
+ match[0].content,
332
+ f"{match[1]}%",
333
+ match[2].content,
334
+ ]
335
+ )
336
 
337
  return (
338
+ result_a[out_code_a],
339
+ result_a[out_chunks_a],
340
+ result_b[out_code_b],
341
+ result_b[out_chunks_b],
342
  similarity,
343
+ gr.Dataframe(value=granular_matches),
344
  )
345
 
346
  in_text_a.change(
347
+ process_and_calculate,
348
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
349
+ outputs=[
350
+ out_code_a,
351
+ out_chunks_a,
352
+ out_code_b,
353
+ out_chunks_b,
354
+ out_similarity,
355
+ in_granular_matches,
356
+ ],
357
  show_progress="full",
358
  trigger_mode="always_last",
359
  )
360
+
361
  in_text_b.change(
362
+ process_and_calculate,
363
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
364
+ outputs=[
365
+ out_code_a,
366
+ out_chunks_a,
367
+ out_code_b,
368
+ out_chunks_b,
369
+ out_similarity,
370
+ in_granular_matches,
371
+ ],
372
  show_progress="full",
373
  trigger_mode="always_last",
374
  )
375
 
376
  in_iscc_bits.change(
377
+ process_and_calculate,
378
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
379
+ outputs=[
380
+ out_code_a,
381
+ out_chunks_a,
382
+ out_code_b,
383
+ out_chunks_b,
384
+ out_similarity,
385
+ in_granular_matches,
386
+ ],
387
+ show_progress="full",
388
+ )
389
+
390
+ in_max_tokens.change(
391
+ process_and_calculate,
392
+ inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
393
+ outputs=[
394
+ out_code_a,
395
+ out_chunks_a,
396
+ out_code_b,
397
+ out_chunks_b,
398
+ out_similarity,
399
+ in_granular_matches,
400
+ ],
401
  show_progress="full",
402
  )
403
 
 
410
 
411
  def reset_all():
412
  return (
413
+ gr.Slider(value=64), # Reset ISCC Bit-Length
414
  gr.Dropdown(
415
+ value="None", choices=["None"] + [lang for lang in samples["a"]]
416
  ), # Reset sample dropdown A
417
  gr.Dropdown(
418
+ value="None", choices=["None"] + [lang for lang in samples["b"]]
419
  ), # Reset sample dropdown B
420
  gr.TextArea(value=""), # Reset Text A
421
  gr.TextArea(value=""), # Reset Text B
 
426
  gr.HighlightedText(value=[]), # Reset Chunked Text B
427
  )
428
 
 
 
 
429
  reset_button.click(
430
  reset_all,
431
  outputs=[
 
449
  ## Understanding ISCC Semantic Text-Codes
450
 
451
  ### What is an ISCC Semantic Text-Code?
452
+ An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of
453
+ the text, not just the exact words. Technically it is am ISCC-encoded, binarized multi-lingual
454
+ document-embedding.
455
 
456
  ### How does it work?
457
  1. **Input**: You provide a text in any language.
458
+ 2. **Processing**: Vector embeddings are created for individual chunks of the text.
459
+ 3. **Output**: A unique ISCC-UNIT is generated that represents the entire text's content.
460
 
461
  ### What can it do?
462
  - **Cross-language matching**: It can recognize similar content across different languages.
463
  - **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
464
+ - **Content identification**: It can help identify texts with similar content, even if the wording
465
+ is different.
466
 
467
  ### How to use this demo:
468
  1. **Enter text**: Type or paste text into either or both text boxes.
469
+ 2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more
470
+ detailed).
471
  3. **View results**: See the generated ISCC code for each text.
472
+ 4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning, based on
473
+ their ISCC codes.
474
+
475
+ ### Important Note:
476
+ The similarity shown is calculated by comparing the ISCC codes, not the original texts. This
477
+ allows for efficient and privacy-preserving comparisons, as only the codes need to be shared
478
+ or stored.
479
 
480
  ### Why is this useful?
481
  - **Content creators**: Find similar content across languages.
482
  - **Researchers**: Quickly compare documents or find related texts in different languages.
483
  - **Publishers**: Identify potential translations or similar works efficiently.
484
 
485
+ This technology opens up new possibilities for understanding and managing text content across
486
+ language barriers!
487
+
488
+ ### Explore Details & Advanced Options
489
+
490
+ The "Explore Details & Advanced Options" section provides additional tools and information:
491
+
492
+ 1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
493
+ comparisons but may be more sensitive to minor differences.
494
+
495
+ 2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
496
+ for processing.
497
+
498
+ 3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
499
+ color-coded and labeled with its size and simprint (a similarity preserving fingerprint).
500
+
501
+ 4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
502
+ This table shows which specific parts of the texts are most similar, along with their approximate
503
+ cosine similarity (scaled -100% to +100%).
504
  """
505
  )
506
 
iscc_sct/models.py CHANGED
@@ -70,7 +70,7 @@ The `FeatureSet` model unifies these two formats by allowing either structure to
70
  To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
71
  """
72
 
73
- from typing import List, Optional, Dict, Any, Union
74
  from pydantic import BaseModel
75
 
76
 
 
70
  To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
71
  """
72
 
73
+ from typing import List, Optional, Union
74
  from pydantic import BaseModel
75
 
76
 
iscc_sct/utils.py CHANGED
@@ -8,6 +8,8 @@ from pathlib import Path
8
  from urllib.request import urlretrieve
9
  from blake3 import blake3
10
  from platformdirs import PlatformDirs
 
 
11
 
12
 
13
  APP_NAME = "iscc-sct"
@@ -21,8 +23,12 @@ __all__ = [
21
  "get_model",
22
  "encode_base32",
23
  "encode_base64",
 
 
24
  "hamming_distance",
25
  "iscc_distance",
 
 
26
  "MODEL_PATH",
27
  ]
28
 
@@ -176,3 +182,58 @@ def iscc_distance(iscc1, iscc2):
176
 
177
  # Calculate and return the Hamming distance
178
  return hamming_distance(content1, content2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  from urllib.request import urlretrieve
9
  from blake3 import blake3
10
  from platformdirs import PlatformDirs
11
+ from typing import List, Tuple
12
+ from iscc_sct.models import Metadata, Feature
13
 
14
 
15
  APP_NAME = "iscc-sct"
 
23
  "get_model",
24
  "encode_base32",
25
  "encode_base64",
26
+ "decode_base32",
27
+ "decode_base64",
28
  "hamming_distance",
29
  "iscc_distance",
30
+ "cosine_similarity",
31
+ "granular_similarity",
32
  "MODEL_PATH",
33
  ]
34
 
 
182
 
183
  # Calculate and return the Hamming distance
184
  return hamming_distance(content1, content2)
185
+
186
+
187
+ def cosine_similarity(a, b):
188
+ # type: (bytes, bytes) -> int
189
+ """
190
+ Calculate the approximate cosine similarity based on Hamming distance for two bytes inputs.
191
+
192
+ :param a: The first bytes object.
193
+ :param b: The second bytes object.
194
+ :return: The approximate cosine similarity between the two inputs, scaled from -100 to +100.
195
+ :raise ValueError: If a and b are not the same length.
196
+ """
197
+ if len(a) != len(b):
198
+ raise ValueError("The lengths of the two bytes objects must be the same")
199
+
200
+ distance = hamming_distance(a, b)
201
+ total_bits = len(a) * 8
202
+ similarity = 1 - (2 * distance / total_bits)
203
+ return max(min(int(similarity * 100), 100), -100)
204
+
205
+
206
+ def granular_similarity(metadata_a, metadata_b, threshold=80):
207
+ # type: (Metadata, Metadata, int) -> List[Tuple[Feature, int, Feature]]
208
+ """
209
+ Compare simprints from two Metadata objects and return matching pairs above a similarity
210
+ threshold. Only the most similar pair for each simprint_a is included.
211
+
212
+ :param metadata_a: The first Metadata object.
213
+ :param metadata_b: The second Metadata object.
214
+ :param threshold: The similarity threshold (0-100) above which simprints are considered a match.
215
+ :return: A list of tuples containing matching simprints and their similarity.
216
+ """
217
+ metadata_a = metadata_a.to_object_format()
218
+ metadata_b = metadata_b.to_object_format()
219
+
220
+ matches = []
221
+
222
+ for feature_set_a in metadata_a.features:
223
+ for simprint_a in feature_set_a.simprints:
224
+ best_match = None
225
+ best_similarity = threshold - 1
226
+
227
+ for feature_set_b in metadata_b.features:
228
+ for simprint_b in feature_set_b.simprints:
229
+ similarity = cosine_similarity(
230
+ decode_base64(simprint_a.simprint), decode_base64(simprint_b.simprint)
231
+ )
232
+ if similarity > best_similarity:
233
+ best_similarity = similarity
234
+ best_match = (simprint_a, similarity, simprint_b)
235
+
236
+ if best_match:
237
+ matches.append(best_match)
238
+
239
+ return matches
poetry.lock CHANGED
@@ -696,13 +696,13 @@ socks = ["socksio (==1.*)"]
696
 
697
  [[package]]
698
  name = "huggingface-hub"
699
- version = "0.24.5"
700
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
701
  optional = false
702
  python-versions = ">=3.8.0"
703
  files = [
704
- {file = "huggingface_hub-0.24.5-py3-none-any.whl", hash = "sha256:d93fb63b1f1a919a22ce91a14518974e81fc4610bf344dfe7572343ce8d3aced"},
705
- {file = "huggingface_hub-0.24.5.tar.gz", hash = "sha256:7b45d6744dd53ce9cbf9880957de00e9d10a9ae837f1c9b7255fc8fa4e8264f3"},
706
  ]
707
 
708
  [package.dependencies]
@@ -2796,4 +2796,4 @@ gpu = ["onnxruntime-gpu"]
2796
  [metadata]
2797
  lock-version = "2.0"
2798
  python-versions = ">=3.9,<3.13"
2799
- content-hash = "e4a4f012af4c1e60326f792c8801857dbf9298d8992fdd83d3b8f0688d4c04ea"
 
696
 
697
  [[package]]
698
  name = "huggingface-hub"
699
+ version = "0.24.6"
700
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
701
  optional = false
702
  python-versions = ">=3.8.0"
703
  files = [
704
+ {file = "huggingface_hub-0.24.6-py3-none-any.whl", hash = "sha256:a990f3232aa985fe749bc9474060cbad75e8b2f115f6665a9fda5b9c97818970"},
705
+ {file = "huggingface_hub-0.24.6.tar.gz", hash = "sha256:cc2579e761d070713eaa9c323e3debe39d5b464ae3a7261c39a9195b27bb8000"},
706
  ]
707
 
708
  [package.dependencies]
 
2796
  [metadata]
2797
  lock-version = "2.0"
2798
  python-versions = ">=3.9,<3.13"
2799
+ content-hash = "bf76c08f3c1285eb61f541a9cd654079dc15cc4bd77dd8994a815864e7e8c4a0"
pyproject.toml CHANGED
@@ -57,10 +57,11 @@ tokenizers = "*"
57
  pydantic-settings = "*"
58
  charset-normalizer = "*"
59
  numpy = "<2.0.0"
60
- pybase64 = "^1.4.0"
61
  certifi = ">=2024.07.04"
62
  gradio = { version = "*", optional = true }
63
- pyyaml = "^6.0.2"
 
64
 
65
 
66
  [tool.poetry.extras]
 
57
  pydantic-settings = "*"
58
  charset-normalizer = "*"
59
  numpy = "<2.0.0"
60
+ pybase64 = "*"
61
  certifi = ">=2024.07.04"
62
  gradio = { version = "*", optional = true }
63
+ pyyaml = "*"
64
+ pydantic = "*"
65
 
66
 
67
  [tool.poetry.extras]
space.yml CHANGED
@@ -1,5 +1,5 @@
1
  title: ISCC-LAB - Semantic-Code Text
2
- emoji: ▶️
3
  colorFrom: red
4
  colorTo: blue
5
  sdk: gradio
 
1
  title: ISCC-LAB - Semantic-Code Text
2
+ emoji: 🔮
3
  colorFrom: red
4
  colorTo: blue
5
  sdk: gradio
tests/test_demo.py CHANGED
@@ -3,7 +3,6 @@ from iscc_sct.demo import (
3
  compare_codes,
4
  hamming_to_cosine,
5
  generate_similarity_bar,
6
- recalculate_iscc,
7
  )
8
 
9
 
@@ -45,81 +44,3 @@ def test_generate_similarity_bar():
45
  result = generate_similarity_bar(-0.5)
46
  assert "-50.00%" in result
47
  assert "red" in result
48
-
49
-
50
- from unittest.mock import patch, MagicMock
51
- import gradio as gr
52
- from iscc_sct.demo import process_text
53
-
54
-
55
- def test_process_text():
56
- # Test with valid input
57
- result = process_text("Hello, world!", 64, "a")
58
- assert isinstance(result, dict)
59
- assert len(result) == 2
60
- key, value = next(iter(result.items()))
61
- assert isinstance(key, gr.components.Textbox)
62
- assert isinstance(value, gr.components.Textbox)
63
- assert value.value == "ISCC:CAA7GY4JTDI3XZYV"
64
-
65
- # Test with empty input
66
- result = process_text("", 64, "b")
67
- assert isinstance(result, dict)
68
- assert len(result) == 2
69
- for key, value in result.items():
70
- assert isinstance(key, (gr.components.Textbox, gr.components.HighlightedText))
71
- assert value.value is None
72
-
73
- # Test with different suffix
74
- result = process_text("Test", 64, "b")
75
- assert len(result) == 2
76
- key, value = next(iter(result.items()))
77
- assert isinstance(key, gr.components.Textbox)
78
- assert isinstance(value, gr.components.Textbox)
79
-
80
-
81
- @patch("iscc_sct.demo.sct.gen_text_code_semantic")
82
- @patch("iscc_sct.demo.compare_codes")
83
- def test_recalculate_iscc(mock_compare_codes, mock_gen_text_code):
84
- mock_gen_text_code.side_effect = lambda text, bits: {"iscc": f"ISCC:{text[:4].upper()}{bits}"}
85
- mock_compare_codes.return_value = "<similarity_html>"
86
-
87
- # Test with both texts non-empty
88
- result = recalculate_iscc("Hello", "World", 64)
89
- assert len(result) == 3
90
- assert isinstance(result[0], gr.components.Textbox)
91
- assert result[0].value == "ISCC:HELL64"
92
- assert isinstance(result[1], gr.components.Textbox)
93
- assert result[1].value == "ISCC:WORL64"
94
- assert result[2] == "<similarity_html>"
95
-
96
- # Test with first text empty
97
- result = recalculate_iscc("", "World", 128)
98
- assert len(result) == 3
99
- assert isinstance(result[0], gr.components.Textbox)
100
- assert result[0].value is None
101
- assert isinstance(result[1], gr.components.Textbox)
102
- assert result[1].value == "ISCC:WORL128"
103
- assert result[2] is None
104
-
105
- # Test with second text empty
106
- result = recalculate_iscc("Hello", "", 256)
107
- assert len(result) == 3
108
- assert isinstance(result[0], gr.components.Textbox)
109
- assert result[0].value == "ISCC:HELL256"
110
- assert isinstance(result[1], gr.components.Textbox)
111
- assert result[1].value is None
112
- assert result[2] is None
113
-
114
- # Test with both texts empty
115
- result = recalculate_iscc("", "", 64)
116
- assert len(result) == 3
117
- assert isinstance(result[0], gr.components.Textbox)
118
- assert result[0].value is None
119
- assert isinstance(result[1], gr.components.Textbox)
120
- assert result[1].value is None
121
- assert result[2] is None
122
-
123
- # Verify function calls
124
- assert mock_gen_text_code.call_count == 4
125
- assert mock_compare_codes.call_count == 1
 
3
  compare_codes,
4
  hamming_to_cosine,
5
  generate_similarity_bar,
 
6
  )
7
 
8
 
 
44
  result = generate_similarity_bar(-0.5)
45
  assert "-50.00%" in result
46
  assert "red" in result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_utils.py CHANGED
@@ -90,3 +90,132 @@ def test_iscc_distance_different_lengths():
90
  iscc2 = sct.create("Hello", bits=96).iscc
91
  with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
92
  utils.iscc_distance(iscc1, iscc2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  iscc2 = sct.create("Hello", bits=96).iscc
91
  with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
92
  utils.iscc_distance(iscc1, iscc2)
93
+
94
+
95
def test_cosine_similarity_identical():
    # Identical digests -> zero differing bits -> maximum score of 100.
    digest = b"\x00\x00\x00\x00"
    assert utils.cosine_similarity(digest, bytes(digest)) == 100
99
+
100
+
101
def test_cosine_similarity_opposite():
    # Every one of the 32 bits differs -> minimum score of -100.
    zeros = b"\x00" * 4
    ones = b"\xff" * 4
    assert utils.cosine_similarity(zeros, ones) == -100
105
+
106
+
107
def test_cosine_similarity_half_similar():
    # Half the bits (16 of 32) differ -> score of exactly 0.
    left = b"\x00\x00" + b"\xff\xff"
    right = bytes(4)
    assert utils.cosine_similarity(left, right) == 0
111
+
112
+
113
def test_cosine_similarity_quarter_similar():
    # 24 of 32 bits differ -> -50 on the [-100, 100] scale.
    left = b"\x00" + b"\xff" * 3
    right = bytes(4)
    assert utils.cosine_similarity(left, right) == -50
117
+
118
+
119
def test_cosine_similarity_three_quarter_similar():
    # Only the last byte differs (8 of 32 bits) -> +50.
    left = b"\x00" * 3 + b"\xff"
    right = bytes(4)
    assert utils.cosine_similarity(left, right) == 50
123
+
124
+
125
def test_cosine_similarity_different_lengths():
    # Inputs of unequal length must be rejected with a ValueError.
    shorter = bytes(3)
    longer = bytes(4)
    with pytest.raises(ValueError, match="The lengths of the two bytes objects must be the same"):
        utils.cosine_similarity(shorter, longer)
130
+
131
+
132
def test_granular_similarity():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    shared_simprint = "AAECAwQFBgc"  # present in both metadata objects

    metadata_a = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[
                    Feature(simprint=shared_simprint),
                    Feature(simprint="CAkKCwwNDg8"),  # only in A
                ]
            )
        ],
    )
    metadata_b = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[
                    Feature(simprint=shared_simprint),
                    Feature(simprint="EBESExQVFhc"),  # only in B
                ]
            )
        ],
    )

    # Default threshold: only the identical simprint pair is reported.
    matches = utils.granular_similarity(metadata_a, metadata_b)
    assert len(matches) == 1
    first = matches[0]
    assert first[0].simprint == shared_simprint
    assert first[1] == 100
    assert first[2].simprint == shared_simprint

    # Threshold 0: every simprint in A finds some counterpart in B.
    assert len(utils.granular_similarity(metadata_a, metadata_b, threshold=0)) == 2

    # Threshold above the maximum possible score: nothing qualifies.
    assert len(utils.granular_similarity(metadata_a, metadata_b, threshold=101)) == 0
174
+
175
+
176
def test_granular_similarity_no_matches():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    def build(simprint):
        # Metadata wrapping exactly one simprint feature.
        return Metadata(
            iscc="ISCC:KACYPXW563EDNM",
            features=[FeatureSet(simprints=[Feature(simprint=simprint)])],
        )

    # Completely different simprints -> no match pairs at default threshold.
    result = utils.granular_similarity(build("AAECAwQFBgc"), build("CAkKCwwNDg8"))
    assert len(result) == 0
191
+
192
+
193
def test_granular_similarity_multiple_matches():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    iscc = "ISCC:KACYPXW563EDNM"

    # Two feature sets per side; "AAECAwQFBgc" and "EBESExQVFhc" exist on
    # both sides, the remaining simprints are unique to one side.
    side_a = [
        FeatureSet(
            simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="CAkKCwwNDg8")]
        ),
        FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
    ]
    side_b = [
        FeatureSet(
            simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="GBkaGxwdHh8")]
        ),
        FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
    ]

    matches = utils.granular_similarity(Metadata(iscc=iscc, features=side_a), Metadata(iscc=iscc, features=side_b))

    assert len(matches) == 2
    pairings = {(match[0].simprint, match[2].simprint) for match in matches}
    assert pairings == {
        ("AAECAwQFBgc", "AAECAwQFBgc"),
        ("EBESExQVFhc", "EBESExQVFhc"),
    }