Spaces:
Runtime error
Runtime error
File size: 5,891 Bytes
a013c5c c95a8ea db6d318 0db47ae 2b29a37 145304e c95a8ea 31730d4 2ffcbe6 31730d4 c95a8ea 31730d4 aa00be2 24c5e5a dbbbbd3 24c5e5a 31730d4 24c5e5a 905833b 24c5e5a 8f05d7c d067d75 8f05d7c 651c679 8f05d7c 2ffcbe6 743f5fb 24c5e5a c95a8ea 31730d4 aa00be2 c95a8ea 7c012d1 c95a8ea 79fe79b 731f6bd 49de2b2 0db47ae 49de2b2 f6ae8b8 0db47ae 0e45975 c95a8ea 058de97 726b8f4 731f6bd 726b8f4 058de97 726b8f4 058de97 aa00be2 c95a8ea 1651e6e b33e08e 3b621ef 811d8a2 817d838 c95a8ea 8582a7d 058de97 817d838 8582a7d 46c6f99 31730d4 49de2b2 31730d4 383b08c 0ba8db9 aa00be2 0ba8db9 4045b93 e020bf9 2b29a37 f79bf66 2b29a37 44f705d aa00be2 2bd2657 b33e08e 44f705d 2bd2657 1526a67 410f826 a013c5c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import streamlit as st
import epitran
import langcodes
from langcodes import LanguageTagError
from pathlib import Path
from operator import itemgetter
from collections import defaultdict
# TODO: reverse transliterate?
@st.cache
def get_lang_description_from_mapping_name(string_to_check, add_original_code=True, add_iso_url=False):
    """Build a human-readable description of an Epitran mapping name.

    Args:
        string_to_check: Mapping name such as ``"swa-Latn"`` or
            ``"tur-Latn-red"``.
        add_original_code: When true, prefix the description with the
            original mapping name followed by ``": "``.
        add_iso_url: When true, render the language name as a markdown link
            to its ISO 639-3 page on iso639-3.sil.org.

    Returns:
        A description string. If no language can be resolved from the
        mapping name, the mapping name itself is returned so that callers
        displaying the result (e.g. a selectbox ``format_func``) always get
        a usable string.
    """
    # "generic" is not a language code, so it is described by hand.
    if string_to_check == "generic-Latn":
        return "Generic Latin Script text"

    lang = get_langcode_lang_from_mapping_name(string_to_check)
    if not lang:
        # Nothing to describe; fall back to the raw name instead of None.
        return string_to_check

    items = []
    for key, value in lang.describe().items():
        if key == "language" and add_iso_url:
            iso_code = lang.to_alpha3()
            value = f"[{value}](https://iso639-3.sil.org/code/{iso_code})"
        items.append(f"{key}: {value}")
    description = ", ".join(items)

    # Extra markers that may appear in the mapping name after the
    # language and script codes.
    notes = {
        "-red": " (reduced mode)",
        "-suf": " (Based on data with suffixes attached)",
        "-nosuf": " (Based on data with suffixes removed)",
        "-np": " (naively assume phonemic orthography)",
    }
    for key, note in notes.items():
        if key in string_to_check:
            description += note

    if add_original_code:
        description = f"{string_to_check}: {description}"
    return description
@st.cache
def get_langcode_lang_from_mapping_name(string_to_check):
    """Resolve the language/script part of a mapping name via ``langcodes``.

    Only the first two dash-separated parts of the name are used; any
    further parts (e.g. ``-red``, ``-nosuf``) are ignored.

    Args:
        string_to_check: Mapping name such as ``"swa-Latn"`` or
            ``"tur-Latn-red"``.

    Returns:
        The object returned by ``langcodes.get`` for ``"<lang>-<script>"``,
        or ``None`` when the name is shorter than two characters or is not
        a parseable language tag.
    """
    if len(string_to_check) < 2:
        return None

    # first two are ISO 639-3 language, and ISO 15924 script
    iso_lang_and_iso_script = string_to_check.split("-")[:2]
    tag = "-".join(iso_lang_and_iso_script)
    try:
        return langcodes.get(tag)
    except LanguageTagError:
        # Callers already treat a falsy result as "unknown language".
        return None
@st.cache
def get_valid_epitran_mappings_list():
    """Return the mapping names offered to the user.

    The names are the file stems found in the ``data/map`` directory of the
    installed ``epitran`` package, plus the two Chinese codes (which are
    appended by hand), minus a fixed list of mappings known to cause
    problems.
    """
    map_dir = Path(epitran.__path__[0]) / "data" / "map"
    candidates = [entry.stem for entry in map_dir.glob("*.*")]

    # The Chinese mappings are added by hand: simplified, then traditional.
    candidates += ["cmn-Hans", "cmn-Hant"]

    # https://github.com/dmort27/epitran/issues/98
    excluded = {
        'generic-Latn',
        'tur-Latn-bab',
        'ood-Latn-sax',
        'vie-Latn-so',
        'vie-Latn-ce',
        'vie-Latn-no',
        'kaz-Cyrl-bab',
    }
    return [name for name in candidates if name not in excluded]
def get_epitran(selected_mapping):
    """Create an ``epitran.Epitran`` transliterator for *selected_mapping*.

    For the two Chinese mappings an info message is shown and
    ``epitran.download.cedict()`` is called before the transliterator is
    constructed.
    """
    needs_cedict = selected_mapping in ("cmn-Hans", "cmn-Hant")
    if needs_cedict:
        st.info("Chinese requires a special dictionary. Downloading now")
        epitran.download.cedict()
    return epitran.Epitran(selected_mapping)
if __name__ == "__main__":
    # Page header and credits.
    st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")
    st.write("Epitran is a library and tool for transliterating orthographic text as IPA (International Phonetic Alphabet), by Mortensen, David R. and Dalmia, Siddharth and Littell, Patrick. [Click here to visit their repository!](https://github.com/dmort27/epitran)")
    # The link needs an explicit scheme, otherwise markdown treats it as a
    # path relative to the app's own URL.
    st.write("I, [Colin Leong](https://cdleong.github.io) did not create Epitran, but I have created this web app (kindly hosted by Hugging Face) to make it convenient to use: simply type your text in the box below!")
    st.write("**Feedback:** Provide feedback regarding this web app at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")

    # Language/script selection.
    valid_epitran_mappings = get_valid_epitran_mappings_list()
    st.write(f"It supports converting many writing systems to IPA symbols, including approximately {len(valid_epitran_mappings)} languages/scripts, listed below:")

    # Preselect Swahili when available; otherwise fall back to the first
    # entry rather than crashing with ValueError.
    try:
        index_of_desired_default = valid_epitran_mappings.index("swa-Latn")
    except ValueError:
        index_of_desired_default = 0

    selected_mapping = st.selectbox(
        "Select input language/script:",
        valid_epitran_mappings,
        index=index_of_desired_default,
        format_func=get_lang_description_from_mapping_name,
    )

    description = get_lang_description_from_mapping_name(selected_mapping, add_iso_url=True)
    st.write(f"Selected input language/script: {description}")

    st.info("attempting to instantiate epitran transliterator for your language/script")
    epi = get_epitran(str(selected_mapping))

    # Example input per mapping; any mapping without an entry gets a generic
    # prompt via the defaultdict factory.
    examples = defaultdict(lambda: 'Try typing some words in the language you chose, and they will be transliterated.')
    examples['cmn-Hans'] = '太初有道,道与神同在,道就是神'  # https://www.biblegateway.com/passage/?search=John+1&version=CUVS
    examples['cmn-Hant'] = '太初有道,道與神同在,道就是神。'  # https://www.biblegateway.com/passage/?search=John+1&version=CUV
    examples['swa-Latn'] = 'Mwanzoni Kabla ulimwengu haujaumbwa alikuwepo Neno Huyo Neno alikuwa pamoja na Mungu, na Neno alikuwa Mungu.'  # https://www.biblegateway.com/passage/?search=John+1&version=SNT
    examples['ara-Arab'] = 'فِي الْبَدْءِ كَانَ الْكَلِمَةُ، وَالْكَلِمَةُ كَانَ عِنْدَ اللهِ. وَكَانَ الْكَلِمَةُ اللهُ.'  # https://www.biblegateway.com/passage/?search=John+1&version=NAV
    examples['urd-Arab'] = 'دُنیا کی ابتدا ء سے پہلے کلام وہاں تھا کلام خدا کے ساتھ تھا اور کلام خدا تھا۔'  # https://www.biblegateway.com/passage/?search=John+1&version=ERV-UR

    # Input and transliteration.
    st.write("### Input text below")
    input_text = st.text_area(label="Whatever you type here will be transliterated!", value=examples[selected_mapping])

    st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")
    transliteration = epi.transliterate(input_text)

    output = {
        "original": input_text,
        "transliteration": transliteration,
    }
    st.write(output)
|