import streamlit as st
import os
import pandas as pd
import json
from html import escape
import difflib
def generate_diff_html_word_level(text1, text2):
    """
    Build an HTML rendering of the word-level diff between two texts.

    Both inputs are split on whitespace (so any run of spaces, tabs or
    newlines becomes a single space in the output) and compared with
    ``difflib.SequenceMatcher``. Unchanged words are emitted as plain escaped
    text, words present only in ``text1`` are wrapped in a "deleted"
    ``<span>`` and words present only in ``text2`` in an "inserted"
    ``<span>``.

    Args:
        text1: The original text.
        text2: The altered text.

    Returns:
        An HTML string: one ``<div>`` holding the escaped words with removed
        and added runs highlighted. Two empty inputs give ``<div></div>``.
    """
    # NOTE(review): the highlight markup had been stripped down to empty
    # strings; the styles and the <div> wrapper below are a reconstruction,
    # adjust the colours as needed.
    deleted_style = "background-color: #ffd7d5; text-decoration: line-through;"
    inserted_style = "background-color: #d4f8d4;"

    def highlight(words, style):
        # Escape before wrapping: the result is meant to be rendered as raw
        # HTML, so the words themselves must never be interpreted as markup.
        text = escape(' '.join(words))
        return f'<span style="{style}">{text}</span>'

    words1 = text1.split()
    words2 = text2.split()
    matcher = difflib.SequenceMatcher(None, words1, words2)

    pieces = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            pieces.append(escape(' '.join(words1[i1:i2])))
            continue
        # 'replace' contributes both halves: the removed words, then the
        # added ones.
        if tag in ('replace', 'delete'):
            pieces.append(highlight(words1[i1:i2], deleted_style))
        if tag in ('replace', 'insert'):
            pieces.append(highlight(words2[j1:j2], inserted_style))

    # Butt adjacent highlighted spans together so a replacement renders as one
    # continuous change instead of two fragments separated by a plain space.
    final_html = ' '.join(pieces).replace('</span> <span', '</span><span')
    return f'<div>{final_html}</div>'
# Presumably selects the pure-Python protobuf implementation -- confirm.
# NOTE(review): this assignment runs after `import streamlit` at the top of the
# file; if the variable is only read when protobuf is first imported, setting it
# here may be too late -- verify, and move it above the imports if so.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
# Use Streamlit's wide page layout for the whole app.
st.set_page_config(layout="wide")
@st.cache_data
def convert_df(df):
    """Serialize ``df`` to CSV text (index column omitted) and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False, quotechar='"')
    return csv_text.encode('utf-8')
@st.cache_data
def load_narratives_data():
    """
    Load every record from ``narratives.jsonl`` into a DataFrame.

    The file is read as JSON Lines (one JSON value per line) from the current
    working directory. Blank lines are skipped.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which makes loading non-ASCII text
    # machine-dependent.
    with open("narratives.jsonl", "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)
# Fetch the narratives table through the cached loader.
narratives_df = load_narratives_data()

# Derive a 'language' column from the code embedded in each id
# ('-rus-', '-zho-' or '-fas-').
narratives_df['language'] = narratives_df['id'].str.extract('-(rus|zho|fas)-')

# Two-column page body (widths 1:3): col1 hosts navigation, col2 the details.
col1, col2 = st.columns([1, 3], gap="large")

with st.sidebar:
    st.title("Options")
with col1:
    st.title("Narratives")

    # Language filter: "All" keeps every row, otherwise keep only the rows
    # whose extracted language code matches the selection.
    selected_language = st.selectbox(
        "Select language",
        ["All", "rus", "zho", "fas"]
    )
    if selected_language != "All":
        filtered_df = narratives_df[narratives_df['language'] == selected_language]
    else:
        filtered_df = narratives_df
    narrative_ids = filtered_df["id"].tolist()

    # The navigation widgets store a *position* in narrative_ids. When the
    # filter changes, that list changes too, so a remembered position would
    # point at a different narrative (or past the end of the list). Fall back
    # to the overview whenever the filter differs from the previous run. This
    # must happen before the widgets below are created, because widget state
    # can only be written before the widget is instantiated in a run.
    if st.session_state.get("_last_language_filter") != selected_language:
        st.session_state["_last_language_filter"] = selected_language
        st.session_state.narrative_number = -1
        st.session_state.narrative_index = -1
        st.session_state.selectbox_narrative = "Overview"

    container_for_nav = st.container()

    def sync_from_drop():
        """Mirror the ID dropdown's selection into the index state and the number input."""
        if st.session_state.selectbox_narrative == "Overview":
            st.session_state.narrative_index = -1
        else:
            st.session_state.narrative_index = narrative_ids.index(st.session_state.selectbox_narrative)
        # The detail column reads the number input's value, so the dropdown
        # has to drive that widget as well; otherwise picking an ID would not
        # change what is displayed.
        st.session_state.narrative_number = st.session_state.narrative_index

    def sync_from_number():
        """Mirror the number input's value into the index state and the ID dropdown."""
        st.session_state.narrative_index = st.session_state.narrative_number
        if st.session_state.narrative_number == -1:
            st.session_state.selectbox_narrative = "Overview"
        else:
            st.session_state.selectbox_narrative = narrative_ids[st.session_state.narrative_number]

    # -1 stands for the overview page; 0..len-1 index into narrative_ids.
    narrative_number = container_for_nav.number_input(
        min_value=-1, step=1, max_value=len(narrative_ids) - 1,
        on_change=sync_from_number,
        label=f"Select narrative by index (up to **{len(narrative_ids) - 1}**)",
        key="narrative_number"
    )
    selectbox_narrative = container_for_nav.selectbox(
        "Select narrative by ID",
        ["Overview"] + narrative_ids,
        on_change=sync_from_drop,
        key="selectbox_narrative"
    )
    st.divider()
with col2:
    # The number input is the source of truth for what is shown:
    # -1 means the overview, anything >= 0 is a row position in filtered_df.
    narrative_index = narrative_number
    if narrative_index >= 0:
        narrative = filtered_df.iloc[narrative_index]
        # NOTE(review): this literal was split across two lines with its
        # markup missing (a syntax error); rebuilt as a single-line heading.
        # Restore any styling attributes the heading originally carried.
        st.markdown("<h1>Editor</h1>", unsafe_allow_html=True)
        container = st.container()
        container.subheader(f"Narrative ID: {narrative['id']}")
        container.divider()

        # Word-level diff between the two English versions, rendered as HTML.
        container.subheader("Diff: Original English vs Altered English")
        processed_diff = generate_diff_html_word_level(narrative['original_english'], narrative['altered_english'])
        with container.container(border=True):
            st.markdown(processed_diff, unsafe_allow_html=True)
        container.divider()

        # Editable copy of the narrative's 'original' field.
        container.subheader("Original Text")
        original_input = container.text_area("Edit the original text", value=narrative['original'], height=300)
    else:
        # Overview page: summary of the currently filtered set.
        st.title("Overview")
        st.write(f"Total number of narratives: {len(filtered_df)}")
        if selected_language != "All":
            st.write(f"Selected language: {selected_language}")
        st.write("Select a narrative from the sidebar to view and edit its details.")