import pandas as pd
import streamlit as st

from utils import df_to_html, render_svg, combine_json_files, render_metadata, color_mapping

data = combine_json_files('./languages')


@st.cache_data
def render_home_table():
    """Renders the home table of all indexed languages."""
    # Compute the number of unique sites and links per language
    for key in data.keys():
        data[key]['Number of Sites'] = len(data[key].get('Sites', []))
        data[key]['Number of Links'] = sum(len(url_data['Links']) for url_data in data[key].get('Sites', []))

    # Convert the dict to a DataFrame, one row per language
    df_data = pd.DataFrame(data).transpose()
    df_data['ISO Code'] = df_data.index
    df_data['Number of Sites'] = df_data['Number of Sites'].astype(str)  # Convert to string
    df_data['ISO Code'] = df_data['ISO Code'].astype(str)  # Convert to string

    # Turn the counts into in-app links. NOTE: the HTML anchor markup was lost
    # from this copy of the file ('{}'.format(...) silently dropped its extra
    # arguments); the hrefs below are reconstructed from the query-param
    # routing in main() and may differ from the original markup.
    df_data['Number of Sites'] = df_data.apply(
        lambda row: '<a href="?isocode={}&site=True" target="_self">{}</a>'.format(
            row['ISO Code'], row['Number of Sites']),
        axis=1)
    df_data['Number of Links'] = df_data.apply(
        lambda row: '<a href="?isocode={}&links=True" target="_self">{}</a>'.format(
            row['ISO Code'], row['Number of Links']),
        axis=1)

    # Aggregate support across the three reference datasets into one emoji flag
    df_data['Support by MADLAD400, FLORES200, GLOT500'] = df_data.apply(
        lambda row: color_mapping([row['Supported by allenai/MADLAD-400']
                                   + row['Supported by facebook/flores']
                                   + row['Supported by cis-lmu/Glot500']]),
        axis=1)
    df_data['Color_Order'] = pd.Categorical(
        df_data['Support by MADLAD400, FLORES200, GLOT500'],
        categories=['🟥', '🟧', '🟨', '🟩'],
        ordered=True)

    # Sort by Color_Order (least supported first), then ISO Code
    df_data = df_data.sort_values(by=['Color_Order', 'ISO Code'])

    # Drop languages that are already fully supported (🟩)
    df_data = df_data[df_data['Support by MADLAD400, FLORES200, GLOT500'] != '🟩']

    # Display the table
    df_data = df_data[['ISO Code', 'Language Name', 'Family', 'Subgrouping',
                       'Number of Sites', 'Number of Links', 'Number of Speakers',
                       'Support by MADLAD400, FLORES200, GLOT500']]
    st.write(df_to_html(df_data), unsafe_allow_html=True)


@st.cache_data
def render_site_table(isocode):
    """Renders the table of indexed sites for one language."""
    # Back link to the home table (anchor reconstructed, see note above)
    back_text = '<a href="/" target="_self">[Back]</a>'
    st.markdown(back_text, unsafe_allow_html=True)

    # One row per indexed site, sorted by link count
    urls = data[isocode].get('Sites', [])
    df_urls = pd.DataFrame(urls)
    df_urls['Number of Links'] = df_urls['Links'].apply(len)
    df_urls = df_urls.sort_values(by='Number of Links', ascending=False)
    df_urls = df_urls.reset_index(drop=True)
    df_urls['Number of Links'] = df_urls.apply(
        lambda row: '<a href="?isocode={}&siteurl={}" target="_self">{}</a>'.format(
            isocode, row['Site URL'], row['Number of Links']),
        axis=1)
    df_urls['Site URL'] = df_urls['Site URL'].apply(
        lambda url: f'<a href="{url}">{url}</a>' if url != 'misc' else url)
    df_urls['Language Name'] = data[isocode]['Language Name']
    df_urls['ISO Code'] = isocode

    # Display the table
    df_urls = df_urls[['ISO Code', 'Site URL', 'Category', 'Number of Links',
                       'Possible Parallel Languages', 'Confidence']]
    st.write(df_to_html(df_urls), unsafe_allow_html=True)


@st.cache_data
def render_siteurl_table(isocode, url):
    """Renders the raw link list for a single site of one language."""
    # Back link to the language's site table (anchor reconstructed; the
    # original string had a '{}' placeholder consumed by .format(isocode))
    back_text = '<a href="?isocode={}&site=True" target="_self">[Back]</a>'.format(isocode)
    st.markdown(back_text, unsafe_allow_html=True)

    # Find the selected domain and dump its links
    urls = data[isocode].get('Sites', [])
    selected_domain = next((d for d in urls if 'Site URL' in d and d['Site URL'] == url), None)
    if selected_domain:
        st.write({'Language Name': data[isocode]['Language Name'],
                  'ISO Code': isocode,
                  'Site URL': url,
                  'Links': selected_domain['Links']})


@st.cache_data
def render_links_table(isocode):
    """Renders every indexed link for one language, grouped by site."""
    # Back link to the home table (anchor reconstructed)
    back_text = '<a href="/" target="_self">[Back]</a>'
    st.markdown(back_text, unsafe_allow_html=True)

    # Output all links, grouped by site
    urls = data[isocode].get('Sites', [])
    lang_name = data[isocode]['Language Name']
    all_urls = [{'Site URL': du['Site URL'], 'Links': du['Links']} for du in urls]
    st.write({'Language Name': lang_name, 'ISO Code': isocode, 'URLs': all_urls})


# Show logo
with open("assets/glotweb_logo.svg") as f:
    render_svg(f.read())
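# For reference, a minimal sketch of the per-language record that the render
# functions above assume. The field names are taken from the accesses in this
# file; the ISO code and all values are illustrative placeholders, and the
# 0/1 support flags (summed to 0-3 before color_mapping) are an assumption:
#
# data = {
#     'abc': {
#         'Language Name': 'Example Language',
#         'Family': 'Example Family',
#         'Subgrouping': 'Example Subgroup',
#         'Number of Speakers': 12345,
#         'Supported by allenai/MADLAD-400': 0,
#         'Supported by facebook/flores': 0,
#         'Supported by cis-lmu/Glot500': 1,
#         'Sites': [
#             {'Site URL': 'https://example.com',
#              'Category': 'news',
#              'Confidence': 0.9,
#              'Possible Parallel Languages': ['eng'],
#              'Links': ['https://example.com/page1']},
#         ],
#     },
# }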
def main():
    params = st.query_params
    if 'isocode' in params:
        if 'siteurl' in params:
            render_siteurl_table(params['isocode'], params['siteurl'])
        if 'site' in params:
            render_site_table(params['isocode'])
        if 'links' in params:
            render_links_table(params['isocode'])
    else:
        # Show home
        render_metadata()
        st.markdown(
            "**GlotWeb** is an indexing service for low-resource languages. "
            "It indexes **non-religious** sites or links written in each language. "
            "This list can be used to create raw-text or parallel corpora and to "
            "study low-resource languages on the web.\n")
        render_home_table()
        st.markdown(
            "\n\nWe compare the level of support for these languages in the three "
            "major low-resource datasets "
            "([MADLAD400](https://huggingface.co/datasets/allenai/MADLAD-400), "
            "[FLORES200](https://huggingface.co/datasets/facebook/flores), "
            "[GLOT500](https://huggingface.co/datasets/cis-lmu/Glot500)) "
            "(🟥 0/3 < 🟧 1/3 < 🟨 2/3 < 🟩 3/3). Note that for some of these "
            "languages, the support in these datasets may consist only of "
            "religious texts.",
            unsafe_allow_html=True)


main()
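# To run the app locally, use the standard Streamlit invocation (the filename
# is whatever this module is saved as; app.py is assumed here):
#
#   streamlit run app.py
#
# Views are addressed through query parameters, matching the routing in
# main(), e.g. ?isocode=abc&site=True for a language's site table. Only the
# presence of the 'site' and 'links' keys is checked, not their values;
# 'isocode' and 'siteurl' values select the language and site to render.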