Spaces:

cdleong
/

phonemize-text

Runtime error

App Files Files Community

phonemize-text / app.py

cdleong

link

3b621ef almost 3 years ago

raw

history blame contribute delete

5.89 kB

	import streamlit as st
	import epitran
	import langcodes
	from langcodes import LanguageTagError
	from pathlib import Path
	from operator import itemgetter
	from collections import defaultdict
	# TODO: reverse transliterate?


	@st.cache
	def get_lang_description_from_mapping_name(string_to_check, add_original_code=True, add_iso_url=False):
	    """Build a human-readable description for an Epitran mapping name.

	    Args:
	        string_to_check: Mapping name such as ``"swa-Latn"``, optionally
	            followed by a variant suffix (``-red``, ``-suf``, ``-nosuf``,
	            ``-np``).
	        add_original_code: If true, prefix the description with the raw
	            mapping name and a colon.
	        add_iso_url: If true, render the language name as a markdown link
	            to its ISO 639-3 page.

	    Returns:
	        The description string. ``"generic-Latn"`` always maps to a fixed
	        label. When no language can be resolved for the name, returns the
	        raw mapping name if ``add_original_code`` is true, else ``None``.
	    """
	    if string_to_check == "generic-Latn":
	        return "Generic Latin Script text"

	    lang = get_langcode_lang_from_mapping_name(string_to_check)
	    if not lang:
	        # Without this guard the code below would concatenate a str with
	        # None and raise TypeError; fall back to the bare mapping name.
	        return string_to_check if add_original_code else None

	    items = []
	    for key, value in lang.describe().items():
	        if key == "language" and add_iso_url:
	            iso_code = lang.to_alpha3()
	            value = f"[{value}](https://iso639-3.sil.org/code/{iso_code})"
	        items.append(f"{key}: {value}")
	    description = ", ".join(items)

	    # Variant suffixes in the mapping name and the note each one adds.
	    # Every note starts with a space so it does not run into the text
	    # before it.
	    notes = {
	        "-red": " (reduced mode)",
	        "-suf": " (Based on data with suffixes attached)",
	        "-nosuf": " (Based on data with suffixes removed)",
	        "-np": " (naively assume phonemic orthography)",
	    }
	    description += "".join(
	        note for key, note in notes.items() if key in string_to_check
	    )

	    if add_original_code:
	        description = f"{string_to_check}: {description}"
	    return description


	@st.cache
	def get_langcode_lang_from_mapping_name(string_to_check):
	    """Resolve an Epitran mapping name to a ``langcodes`` language object.

	    Only the first two dash-separated parts of the name (ISO 639-3 language
	    and ISO 15924 script) are used; any further variant suffix is ignored.

	    Args:
	        string_to_check: Mapping name such as ``"swa-Latn"``.

	    Returns:
	        The object returned by ``langcodes.get`` for the language/script
	        pair, or ``None`` if the name is shorter than two characters or
	        cannot be parsed as a language tag.
	    """
	    if len(string_to_check) < 2:
	        return None

	    # first two are ISO 639-3 language, and ISO 15924 script
	    iso_lang_and_iso_script = string_to_check.split("-")[:2]
	    try:
	        return langcodes.get("-".join(iso_lang_and_iso_script))
	    except LanguageTagError:
	        # An unparseable tag is reported the same way as "no language"
	        # so callers can fall back instead of crashing the app.
	        return None




	@st.cache
	def get_valid_epitran_mappings_list():
	    """List the Epitran mapping names offered by the app.

	    The names are the file stems found in the ``data/map`` directory of
	    the installed ``epitran`` package, plus the two Chinese mappings,
	    minus a set of mappings known to be problematic.

	    Returns:
	        A sorted list of unique mapping-name strings.
	    """
	    map_path = Path(epitran.__path__[0]) / "data" / "map"

	    # The pattern "." does not select the files inside the directory;
	    # match the mapping tables themselves.
	    # NOTE(review): assumes the tables are the *.csv files in data/map.
	    valid_mappings = {map_file.stem for map_file in map_path.glob("*.csv")}

	    # Chinese is a special case and is added by hand.
	    valid_mappings.update(("cmn-Hans", "cmn-Hant"))

	    # https://github.com/dmort27/epitran/issues/98
	    problem_mappings = {
	        "generic-Latn",
	        "tur-Latn-bab",
	        "ood-Latn-sax",
	        "vie-Latn-so",
	        "vie-Latn-ce",
	        "vie-Latn-no",
	        "kaz-Cyrl-bab",
	    }

	    # Sort so the list order does not depend on filesystem order.
	    return sorted(valid_mappings - problem_mappings)


	def get_epitran(selected_mapping):
	    """Create an Epitran transliterator for the given mapping name.

	    For the two Chinese mappings an info message is shown and
	    ``epitran.download.cedict()`` is called before the transliterator is
	    constructed.

	    Args:
	        selected_mapping: Mapping name such as ``"swa-Latn"``.

	    Returns:
	        The ``epitran.Epitran`` instance for ``selected_mapping``.
	    """
	    needs_cedict = selected_mapping in ("cmn-Hans", "cmn-Hant")
	    if needs_cedict:
	        st.info("Chinese requires a special dictionary. Downloading now")
	        epitran.download.cedict()

	    return epitran.Epitran(selected_mapping)





	if __name__ == "__main__":
	    # Streamlit page: pick a language/script, type text, see its IPA
	    # transliteration produced by Epitran.

	    st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")

	    st.write("Epitran is a library and tool for transliterating orthographic text as IPA (International Phonetic Alphabet), by Mortensen, David R. and Dalmia, Siddharth and Littell, Patrick. [Click here to visit their repository!](https://github.com/dmort27/epitran)")
	    # The link needs an explicit scheme; without it markdown treats the
	    # target as a path relative to the app.
	    st.write("I, [Colin Leong](https://cdleong.github.io) did not create Epitran, but I have created this web app (kindly hosted by Hugging Face) to make it convenient to use: simply type your text in the box below!")
	    st.write("Feedback: Provide feedback regarding this web app at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")

	    valid_epitran_mappings = get_valid_epitran_mappings_list()

	    st.write(f"It supports converting many writing systems to IPA symbols, including approximately {len(valid_epitran_mappings)} languages/scripts, listed below:")

	    # Preselect Swahili when available; fall back to the first entry
	    # rather than raising ValueError if that mapping is missing.
	    desired_default = "swa-Latn"
	    if desired_default in valid_epitran_mappings:
	        index_of_desired_default = valid_epitran_mappings.index(desired_default)
	    else:
	        index_of_desired_default = 0

	    selected_mapping = st.selectbox(
	        "Select input language/script:",
	        valid_epitran_mappings,
	        index=index_of_desired_default,
	        format_func=get_lang_description_from_mapping_name,
	    )

	    description = get_lang_description_from_mapping_name(selected_mapping, add_iso_url=True)
	    st.write(f"Selected input language/script: {description}")

	    st.info("attempting to instantiate epitran transliterator for your language/script")
	    epi = get_epitran(str(selected_mapping))

	    # Sample text per mapping; any mapping without an entry gets a
	    # generic prompt instead.
	    examples = defaultdict(lambda: 'Try typing some words in the language you chose, and they will be transliterated.')
	    examples['cmn-Hans'] = '太初有道，道与神同在，道就是神' # https://www.biblegateway.com/passage/?search=John+1&version=CUVS
	    examples['cmn-Hant'] = '太初有道，道與神同在，道就是神。' # https://www.biblegateway.com/passage/?search=John+1&version=CUV
	    examples['swa-Latn'] = 'Mwanzoni Kabla ulimwengu haujaumbwa alikuwepo Neno Huyo Neno alikuwa pamoja na Mungu, na Neno alikuwa Mungu.' # https://www.biblegateway.com/passage/?search=John+1&version=SNT
	    examples['ara-Arab'] = 'فِي الْبَدْءِ كَانَ الْكَلِمَةُ، وَالْكَلِمَةُ كَانَ عِنْدَ اللهِ. وَكَانَ الْكَلِمَةُ اللهُ.' # https://www.biblegateway.com/passage/?search=John+1&version=NAV
	    examples['urd-Arab'] = 'دُنیا کی ابتدا ء سے پہلے کلام وہاں تھا کلام خدا کے ساتھ تھا اور کلام خدا تھا۔' # https://www.biblegateway.com/passage/?search=John+1&version=ERV-UR

	    st.write("### Input text below")
	    input_text = st.text_area(label="Whatever you type here will be transliterated!", value=examples[selected_mapping])

	    st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")
	    transliteration = epi.transliterate(input_text)

	    output = {
	        "original": input_text,
	        "transliteration": transliteration,
	    }

	    st.write(output)