import pandas as pd
import streamlit as st
from utils import df_to_html, render_svg, combine_json_files, render_metadata, color_mapping
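# All language records live under ./languages as JSON; combine them into one dict keyed by ISO code.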
data = combine_json_files('./languages')
@st.cache_data
def render_home_table():
"""Renders home table."""
# Compute number of unique domains/urls
for key in data.keys():
data[key]['Number of Sites'] = len(data[key].get('Sites', []))
data[key]["Number of Links"] = sum(len(url_data["Links"]) for url_data in data[key].get('Sites', []))
    # Convert the dict of languages into a dataframe, one row per ISO code
    df_data = pd.DataFrame(data).transpose()
    df_data['ISO Code'] = df_data.index
    df_data['Number of Sites'] = df_data['Number of Sites'].astype(str)
    df_data['ISO Code'] = df_data['ISO Code'].astype(str)
    # Turn the counts into in-app links matching the query-parameter routing in main().
    # The "=1" values are placeholders: main() only checks whether the key is present.
    df_data['Number of Sites'] = df_data.apply(
        lambda row: '<a href="/?isocode={}&site=1" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Sites']), axis=1)
    df_data['Number of Links'] = df_data.apply(
        lambda row: '<a href="/?isocode={}&links=1" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Links']), axis=1)
    # Combine the three support flags into a single color code
    df_data['Support by MADLAD400, FLORES200, GLOT500'] = df_data.apply(
        lambda row: color_mapping([row['Supported by allenai/MADLAD-400']
                                   + row['Supported by facebook/flores']
                                   + row['Supported by cis-lmu/Glot500']]), axis=1)
    df_data['Color_Order'] = pd.Categorical(
        df_data['Support by MADLAD400, FLORES200, GLOT500'],
        categories=['🟥', '🟧', '🟨', '🟩'], ordered=True)
    # Sort by support level, then by ISO code
    df_data = df_data.sort_values(by=['Color_Order', 'ISO Code'])
    # Drop fully supported (🟩) languages from the home table
    df_data = df_data[df_data['Support by MADLAD400, FLORES200, GLOT500'] != '🟩']
    # Select and display the columns shown on the home page
    df_data = df_data[['ISO Code', 'Language Name', 'Family', 'Subgrouping', 'Number of Sites',
                       'Number of Links', 'Number of Speakers', 'Support by MADLAD400, FLORES200, GLOT500']]
    st.write(df_to_html(df_data), unsafe_allow_html=True)
@st.cache_data
def render_site_table(isocode):
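    """Render the table of indexed sites for one language."""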
    # Back link to the home table
    back_text = '<a href="/" target="_self">Back</a>'
    st.markdown(back_text, unsafe_allow_html=True)
    # Build one row per site for this language
    urls = data[isocode].get('Sites', [])
    df_urls = pd.DataFrame(urls)
    df_urls['Number of Links'] = df_urls['Links'].apply(len)
    df_urls = df_urls.sort_values(by='Number of Links', ascending=False)
    df_urls = df_urls.reset_index(drop=True)
    # Link each count to the per-site view (handled by render_siteurl_table via main())
    df_urls['Number of Links'] = df_urls.apply(
        lambda row: '<a href="/?isocode={}&siteurl={}" target="_self">{}</a>'.format(
            isocode, row['Site URL'], row['Number of Links']), axis=1)
    # Link the site URL itself to the external site; 'misc' is a placeholder, not a URL
    df_urls['Site URL'] = df_urls['Site URL'].apply(
        lambda url: f'<a href="{url}" target="_blank">{url}</a>' if url != 'misc' else url)
    df_urls['Language Name'] = data[isocode]['Language Name']
    df_urls['ISO Code'] = isocode
    # Select and display the columns shown in the site table
    df_urls = df_urls[['ISO Code', 'Site URL', 'Category', 'Number of Links',
                       'Possible Parallel Languages', 'Confidence']]
    st.write(df_to_html(df_urls), unsafe_allow_html=True)
@st.cache_data
def render_siteurl_table(isocode, url):
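    """Render the links of a single site for one language."""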
    # Back link to this language's site table
    back_text = '<a href="/?isocode={}&site=1" target="_self">Back</a>'.format(isocode)
    st.markdown(back_text, unsafe_allow_html=True)
    # Find the selected site and show its links
    urls = data[isocode].get('Sites', [])
    selected_domain = next((d for d in urls if 'Site URL' in d and d['Site URL'] == url), None)
    if selected_domain:
        st.write({'Language Name': data[isocode]['Language Name'], 'ISO Code': isocode,
                  'Site URL': url, 'Links': selected_domain['Links']})
@st.cache_data
def render_links_table(isocode):
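    """Render every indexed link for one language, grouped by site."""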
    # Back link to the home table
    back_text = '<a href="/" target="_self">Back</a>'
    st.markdown(back_text, unsafe_allow_html=True)
    # Collect all links, grouped by site
    urls = data[isocode].get('Sites', [])
    lang_name = data[isocode]['Language Name']
    all_urls = [{'Site URL': du['Site URL'], 'Links': du['Links']} for du in urls]
    st.write({'Language Name': lang_name, 'ISO Code': isocode, 'URLs': all_urls})
# Show the logo
with open('assets/glotweb_logo.svg') as f:
    render_svg(f.read())
def main():
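    """Route to a view based on the URL query parameters."""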
    params = st.query_params
    if 'isocode' in params:
        # Most specific view first: one site, then the site table, then all links
        if 'siteurl' in params:
            render_siteurl_table(params['isocode'], params['siteurl'])
        elif 'site' in params:
            render_site_table(params['isocode'])
        elif 'links' in params:
            render_links_table(params['isocode'])
else:
# show home
render_metadata()
st.markdown("**GlotWeb** is an indexing service for low-resource languages. It indexes **non-religous** sites or links written in each language. This list can be used to create raw text or parallel corpora and to study low-resource languages on the web.\n")
render_home_table()
st.markdown("\n\nWe compare the level of support for these languages in the three big datasets ([MADLAD400](https://huggingface.co/datasets/allenai/MADLAD-400), [FLORES200](https://huggingface.co/datasets/facebook/flores), [GLOT500](https://huggingface.co/datasets/cis-lmu/Glot500)) of low-resource languages (🟥 0/3 < 🟧 1/3 < 🟨 2/3 < 🟩 3/3). Although the support in these datasets for some of these languages could be just the religious texts.", unsafe_allow_html=True)
if __name__ == '__main__':
    main()
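# Run locally with Streamlit's CLI, e.g.: streamlit run app.py
# (assuming this file is named app.py)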