Spaces:

project-ori
/

AronaTTS

Runtime error

App Files Files Community

ORI-Muchim commited on Jun 24, 2023

Commit

937a9da

•

1 Parent(s): 5c3c029

Upload 5 files

Browse files

Files changed (5) hide show

text/LICENSE +19 -0
text/__init__.py +54 -0
text/cleaners.py +17 -0
text/japanese.py +153 -0
text/symbols.py +25 -0

text/LICENSE ADDED Viewed

	@@ -0,0 +1,19 @@

+Copyright (c) 2017 Keith Ito
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

text/__init__.py ADDED Viewed

	@@ -0,0 +1,54 @@

+""" from https://github.com/keithito/tacotron """
+from text import cleaners
+from text.symbols import symbols
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+def text_to_sequence(text, cleaner_names):
+  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+      cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+      List of integers corresponding to the symbols in the text
+  '''
+  sequence = []
+  clean_text = _clean_text(text, cleaner_names)
+  for symbol in clean_text:
+    symbol_id = _symbol_to_id[symbol]
+    sequence += [symbol_id]
+  return sequence
+def cleaned_text_to_sequence(cleaned_text):
+  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+    Returns:
+      List of integers corresponding to the symbols in the text
+  '''
+  sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
+  return sequence
+def sequence_to_text(sequence):
+  '''Converts a sequence of IDs back to a string'''
+  result = ''
+  for symbol_id in sequence:
+    s = _id_to_symbol[symbol_id]
+    result += s
+  return result
+def _clean_text(text, cleaner_names):
+  for name in cleaner_names:
+    cleaner = getattr(cleaners, name)
+    if not cleaner:
+      raise Exception('Unknown cleaner: %s' % name)
+    text = cleaner(text)
+  return text

text/cleaners.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import re
+from text.japanese import japanese_to_romaji_with_accent
+from text.symbols import symbols
+_cleaner_cleans = re.compile('['+'^'.join(symbols)+']')
+def japanese_cleaners(text):
+    text = japanese_to_romaji_with_accent(text)
+    text = re.sub(r'([A-Za-z])$', r'\1.', text)
+    return text
+def japanese_cleaners2(text):
+    text = japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
+    text = ''.join(_cleaner_cleans.findall(text))#.replace('_', '').replace(' ', '')
+    return text

text/japanese.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import re
+from unidecode import unidecode
+import pyopenjtalk
+# Regular expression matching Japanese without punctuation marks:
+_japanese_characters = re.compile(
+    r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+# Regular expression matching non-Japanese characters or punctuation marks:
+_japanese_marks = re.compile(
+    r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+# List of (symbol, Japanese) pairs for marks:
+_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('％', 'パーセント')
+]]
+# List of (romaji, ipa) pairs for marks:
+_romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('ts', 'ʦ'),
+    ('u', 'ɯ'),
+    ('j', 'ʥ'),
+    ('y', 'j'),
+    ('ni', 'n^i'),
+    ('nj', 'n^'),
+    ('hi', 'çi'),
+    ('hj', 'ç'),
+    ('f', 'ɸ'),
+    ('I', 'i*'),
+    ('U', 'ɯ*'),
+    ('r', 'ɾ')
+]]
+# List of (romaji, ipa2) pairs for marks:
+_romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('u', 'ɯ'),
+    ('ʧ', 'tʃ'),
+    ('j', 'dʑ'),
+    ('y', 'j'),
+    ('ni', 'n^i'),
+    ('nj', 'n^'),
+    ('hi', 'çi'),
+    ('hj', 'ç'),
+    ('f', 'ɸ'),
+    ('I', 'i*'),
+    ('U', 'ɯ*'),
+    ('r', 'ɾ')
+]]
+# List of (consonant, sokuon) pairs:
+_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
+    (r'Q([↑↓]*[kg])', r'k#\1'),
+    (r'Q([↑↓]*[tdjʧ])', r't#\1'),
+    (r'Q([↑↓]*[sʃ])', r's\1'),
+    (r'Q([↑↓]*[pb])', r'p#\1')
+]]
+# List of (consonant, hatsuon) pairs:
+_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
+    (r'N([↑↓]*[pbm])', r'm\1'),
+    (r'N([↑↓]*[ʧʥj])', r'n^\1'),
+    (r'N([↑↓]*[tdn])', r'n\1'),
+    (r'N([↑↓]*[kg])', r'ŋ\1')
+]]
+def symbols_to_japanese(text):
+    for regex, replacement in _symbols_to_japanese:
+        text = re.sub(regex, replacement, text)
+    return text
+def japanese_to_romaji_with_accent(text):
+    '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
+    text = symbols_to_japanese(text)
+    sentences = re.split(_japanese_marks, text)
+    marks = re.findall(_japanese_marks, text)
+    text = ''
+    for i, sentence in enumerate(sentences):
+        if re.match(_japanese_characters, sentence):
+            if text != '':
+                text += ' '
+            labels = pyopenjtalk.extract_fullcontext(sentence)
+            for n, label in enumerate(labels):
+                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
+                if phoneme not in ['sil', 'pau']:
+                    text += phoneme.replace('ch', 'ʧ').replace('sh',
+                                                               'ʃ').replace('cl', 'Q')
+                else:
+                    continue
+                # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
+                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
+                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
+                a3 = int(re.search(r"\+(\d+)/", label).group(1))
+                if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
+                    a2_next = -1
+                else:
+                    a2_next = int(
+                        re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
+                # Accent phrase boundary
+                if a3 == 1 and a2_next == 1:
+                    text += ' '
+                # Falling
+                elif a1 == 0 and a2_next == a2 + 1:
+                    text += '↓'
+                # Rising
+                elif a2 == 1 and a2_next == 2:
+                    text += '↑'
+        if i < len(marks):
+            text += unidecode(marks[i]).replace(' ', '')
+    return text
+def get_real_sokuon(text):
+    for regex, replacement in _real_sokuon:
+        text = re.sub(regex, replacement, text)
+    return text
+def get_real_hatsuon(text):
+    for regex, replacement in _real_hatsuon:
+        text = re.sub(regex, replacement, text)
+    return text
+def japanese_to_ipa(text):
+    text = japanese_to_romaji_with_accent(text).replace('...', '…')
+    text = re.sub(
+        r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
+    text = get_real_sokuon(text)
+    text = get_real_hatsuon(text)
+    for regex, replacement in _romaji_to_ipa:
+        text = re.sub(regex, replacement, text)
+    return text
+def japanese_to_ipa2(text):
+    text = japanese_to_romaji_with_accent(text).replace('...', '…')
+    text = get_real_sokuon(text)
+    text = get_real_hatsuon(text)
+    for regex, replacement in _romaji_to_ipa2:
+        text = re.sub(regex, replacement, text)
+    return text
+def japanese_to_ipa3(text):
+    text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace(
+        'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a')
+    text = re.sub(
+        r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
+    text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
+    return text

text/symbols.py ADDED Viewed

	@@ -0,0 +1,25 @@

+""" from https://github.com/keithito/tacotron """
+'''
+Defines the set of symbols used in text input to the model.
+'''
+'''# korean_cleaners
+_pad        = '_'
+_punctuation = ',.!?…~'
+_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
+'''
+# japanese_cleaners2
+_pad        = '_'
+_punctuation = ',.!?-~…'
+_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
+# Export all symbols:
+symbols = [_pad] + list(_punctuation) + list(_letters)
+# Special symbol ids
+SPACE_ID = symbols.index(" ")