import difflib
import json
import os
from html import escape

import pandas as pd
import streamlit as st

# NOTE(review): the HTML markup in this file's string literals had been
# stripped, leaving the diff unmarked and the file unparseable. The tags and
# colors below are a reconstruction -- confirm against the intended styling.
_REMOVED_OPEN = '<span style="background-color: #ffcccc; text-decoration: line-through;">'
_ADDED_OPEN = '<span style="background-color: #ccffcc;">'
_SPAN_CLOSE = '</span>'


def generate_diff_html_word_level(text1, text2):
    """Return an HTML fragment showing the word-level diff of two texts.

    Both texts are split on whitespace, so any run of whitespace (including
    newlines) is rendered as a single space in the output. Words found only
    in ``text1`` are wrapped in the "removed" span, words found only in
    ``text2`` in the "added" span, and common words are emitted unmarked.
    All words are HTML-escaped before being placed in the markup.

    Args:
        text1: The original text.
        text2: The altered text.

    Returns:
        A string of HTML suitable for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    words1 = text1.split()
    words2 = text2.split()

    def removed(words):
        return _REMOVED_OPEN + escape(' '.join(words)) + _SPAN_CLOSE

    def added(words):
        return _ADDED_OPEN + escape(' '.join(words)) + _SPAN_CLOSE

    pieces = []
    matcher = difflib.SequenceMatcher(None, words1, words2)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            # Show the old wording first, then what replaced it.
            pieces.append(removed(words1[i1:i2]))
            pieces.append(added(words2[j1:j2]))
        elif tag == 'delete':
            pieces.append(removed(words1[i1:i2]))
        elif tag == 'insert':
            pieces.append(added(words2[j1:j2]))
        elif tag == 'equal':
            pieces.append(escape(' '.join(words1[i1:i2])))

    final_html = ' '.join(pieces)
    return f'<div style="white-space: pre-wrap;">{final_html}</div>'


# NOTE(review): this is set after the imports above have already run; if it
# is meant to affect how protobuf is loaded during import, it may need to be
# set earlier -- confirm.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

st.set_page_config(layout="wide")


@st.cache_data
def convert_df(df):
    """Serialize ``df`` to UTF-8 encoded CSV bytes without the index column."""
    return df.to_csv(index=False, quotechar='"').encode('utf-8')


@st.cache_data
def load_narratives_data():
    """Load ``narratives.jsonl`` (one JSON object per line) into a DataFrame.

    The file is read as UTF-8 explicitly so the result does not depend on
    the platform's default encoding; blank lines are skipped instead of
    being passed to ``json.loads``.
    """
    with open("narratives.jsonl", "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)


narratives_df = load_narratives_data()

# Extract language from id
narratives_df['language'] = narratives_df['id'].str.extract('-(rus|zho|fas)-')

col1, col2 = st.columns([1, 3], gap="large")

with st.sidebar:
    st.title("Options")

with col1:
    st.title("Narratives")

    # Add language filter
    selected_language = st.selectbox(
        "Select language",
        ["All", "rus", "zho", "fas"]
    )

    if selected_language != "All":
        filtered_df = narratives_df[narratives_df['language'] == selected_language]
    else:
        filtered_df = narratives_df

    narrative_ids = filtered_df["id"].tolist()

    container_for_nav = st.container()

    def sync_from_drop():
        """Mirror the ID dropdown's selection into the index widget.

        "Overview" maps to index -1. The index is written to the
        ``narrative_number`` widget key as well as ``narrative_index``,
        because the page content below is driven by the number input;
        without that the dropdown selection would not change the view.
        """
        chosen = st.session_state.selectbox_narrative
        index = -1 if chosen == "Overview" else narrative_ids.index(chosen)
        st.session_state.narrative_index = index
        st.session_state.narrative_number = index

    def sync_from_number():
        """Mirror the index widget's value into the ID dropdown."""
        index = st.session_state.narrative_number
        st.session_state.narrative_index = index
        if index == -1:
            st.session_state.selectbox_narrative = "Overview"
        else:
            st.session_state.selectbox_narrative = narrative_ids[index]

    narrative_number = container_for_nav.number_input(
        min_value=-1,
        step=1,
        max_value=len(narrative_ids) - 1,
        on_change=sync_from_number,
        label=f"Select narrative by index (up to **{len(narrative_ids) - 1}**)",
        key="narrative_number"
    )

    selectbox_narrative = container_for_nav.selectbox(
        "Select narrative by ID",
        ["Overview"] + narrative_ids,
        on_change=sync_from_drop,
        key="selectbox_narrative"
    )

    st.divider()

with col2:
    narrative_index = narrative_number

    if narrative_index >= 0:
        narrative = filtered_df.iloc[narrative_index]

        # NOTE(review): the heading's original markup was lost; a centered
        # <h1> is assumed here -- confirm.
        st.markdown(
            "<h1 style='text-align: center;'>Editor</h1>",
            unsafe_allow_html=True
        )

        container = st.container()
        container.subheader(f"Narrative ID: {narrative['id']}")
        container.divider()

        container.subheader("Diff: Original English vs Altered English")
        processed_diff = generate_diff_html_word_level(
            narrative['original_english'].strip(),
            narrative['altered_english'].strip()
        )
        with container.container(border=True):
            st.markdown(processed_diff, unsafe_allow_html=True)
        container.divider()

        container.subheader("Original Text")
        original_input = container.text_area(
            "Edit the original text",
            value=narrative['original'].strip(),
            height=300
        )
    else:
        # Index -1 is the "Overview" entry.
        st.title("Overview")
        st.write(f"Total number of narratives: {len(filtered_df)}")
        if selected_language != "All":
            st.write(f"Selected language: {selected_language}")
        st.write("Select a narrative from the sidebar to view and edit its details.")