|
import textwrap |
|
import re |
|
|
|
from src.utils import flatten_list, have_emoji, have_langid |
|
|
|
|
|
def setup_nltk():
    """Download the NLTK ``punkt`` resource by calling ``nltk.download``."""
    # Imported lazily so nltk is only loaded when this setup step is requested.
    from nltk import download as fetch_resource

    fetch_resource("punkt")
|
|
|
|
|
|
|
|
|
|
|
# Field names of the sentence state, in the positional order that
# unpack_state() returns them and pack_state() expects them.
sentence_keys = ["sentence_list", "index"]
|
|
|
|
|
def init_sentence_state():
    """Create a fresh sentence state.

    Returns:
        A new dict with an empty ``sentence_list`` and ``index`` set to 0.
    """
    return {"sentence_list": [], "index": 0}
|
|
|
|
|
def unpack_state(sentence_state):
    """Return the state's values as a tuple ordered like ``sentence_keys``.

    Args:
        sentence_state: Mapping that contains every name in ``sentence_keys``.

    Returns:
        Tuple with one value per entry of ``sentence_keys``.

    Raises:
        KeyError: If the state lacks one of the names in ``sentence_keys``.
    """
    return tuple(sentence_state[field] for field in sentence_keys)
|
|
|
|
|
def pack_state(sentence_state, *args):
    """Write positional values back into the sentence state, in place.

    Args:
        sentence_state: Dict to update; it is mutated and also returned.
        *args: New values, given in the same order as ``sentence_keys``.
            Values beyond ``len(sentence_keys)`` are ignored.

    Returns:
        The same ``sentence_state`` object, after the update.

    Raises:
        IndexError: If fewer values than ``sentence_keys`` entries are given.
    """
    # Every field is overwritten the same way regardless of its current type,
    # so no per-type branching is needed.
    for position, key in enumerate(sentence_keys):
        sentence_state[key] = args[position]
    return sentence_state
|
|
|
|
|
def split_sentences(sentence, n=250):
    """
    Split text at whitespace into chunks of roughly at most ``n`` characters.

    Whitespace runs are kept inside the chunks, with one exception: a
    whitespace run that is exactly one newline ends the current chunk and is
    itself dropped (this emits an empty chunk if nothing was buffered).

    The limit is only checked before adding a word, so a chunk can exceed
    ``n`` by trailing whitespace, and a single word longer than ``n`` is
    emitted whole rather than being cut.

    # 250 due to [!] Warning: The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.

    Args:
        sentence: Text to split.
        n: Character budget per chunk.

    Returns:
        List of chunk strings in their original order.
    """
    # The capturing group makes re.split return the whitespace runs as items too.
    tokens = re.split(r'(\s+)', sentence)
    sentences = []
    current_sentence = []
    current_length = 0

    for token in tokens:
        if not token:
            # re.split yields '' when the text is empty or starts/ends with whitespace.
            continue

        if token == '\n':
            # A lone newline is a hard break: flush whatever is buffered.
            sentences.append("".join(current_sentence))
            current_sentence = []
            current_length = 0
            continue

        if token.isspace() or current_length + len(token) <= n:
            # Whitespace is always kept; words are kept while they still fit.
            current_sentence.append(token)
            current_length += len(token)
            continue

        # The word does not fit into the current chunk.
        if current_sentence:
            sentences.append("".join(current_sentence))
            current_sentence = [token]
            current_length = len(token)
        else:
            # Nothing buffered, so the word alone exceeds n: emit it as its own chunk.
            sentences.append(token)

    if current_sentence:
        sentences.append("".join(current_sentence))

    return sentences
|
|
|
|
|
def _get_sentences(response, verbose=False, min_start=15, max_length=250):
    """Break ``response`` into sentence chunks.

    The first ``min_start`` characters are withheld from ``nltk.sent_tokenize``
    and glued back onto the first chunk afterwards, so no sentence boundary is
    ever placed inside them. Each tokenized sentence is further divided by
    ``split_sentences`` using ``max_length``, and whitespace-only chunks are
    dropped.

    Args:
        response: Text to split.
        verbose: Accepted but not used by this function.
        min_start: Number of leading characters kept out of tokenization.
        max_length: Character budget handed to ``split_sentences``.

    Returns:
        List of chunks. When ``min_start > 0`` and no chunk survives, the list
        holds just the withheld prefix.
    """
    import nltk

    tokenized = nltk.sent_tokenize(response[min_start:])
    chunks = flatten_list([split_sentences(piece, max_length) for piece in tokenized])
    kept = [chunk for chunk in chunks if chunk.strip()]

    if min_start <= 0:
        return kept

    # Put the withheld prefix back in front of the first chunk.
    prefix = response[:min_start]
    if kept:
        kept[0] = prefix + kept[0]
    else:
        kept.append(prefix)
    return kept
|
|
|
|
|
def get_sentence(response, sentence_state, is_final=False, verbose=False):
    """Pull the next sentence out of a (possibly still growing) response.

    Only the part of ``response`` after the state's ``index`` is examined.
    A sentence is handed out once at least two sentences are detected in
    that part; with ``is_final`` set, everything that remains is handed out
    as one sentence instead.

    Args:
        response: Full response text received so far.
        sentence_state: State dict as created by ``init_sentence_state``.
        is_final: Whether ``response`` will not grow any further.
        verbose: Forwarded to ``_get_sentences`` and ``clean_sentence``.

    Returns:
        Tuple ``(sentence, sentence_state, flag)``. ``sentence`` is the
        cleaned text, or ``None`` when nothing is ready yet. ``flag`` is
        ``False`` only when a sentence was consumed while at least one more
        was detected behind it, and ``True`` otherwise.
    """
    sentence_list, index = unpack_state(sentence_state)
    remaining = response[index:]
    # NOTE(review): the 15-character guard on the very first call presumably
    # keeps the first sentence long enough for detect_language -- confirm.
    candidates = _get_sentences(remaining, min_start=0 if index != 0 else 15, verbose=verbose)

    if len(candidates) < 2:
        if not is_final:
            # Not enough text yet to know that a sentence is complete.
            return None, pack_state(sentence_state, sentence_list, index), True
        # Final call: flush everything that is left as one sentence.
        leftover = ' '.join(candidates)
        spoken = clean_sentence(leftover, verbose=verbose)
        sentence_list.append(leftover)
        return spoken, pack_state(sentence_state, sentence_list, index), True

    # The first candidate is complete; move the index past the raw text.
    # Cleaning happens only on the returned copy so offsets stay valid.
    first = candidates[0]
    index += remaining.index(first) + len(first)
    sentence_list.append(first)
    spoken = clean_sentence(first, verbose=verbose)
    return spoken, pack_state(sentence_state, sentence_list, index), False
|
|
|
|
|
# A sentence that is only a list marker is spoken as the matching word.
_SPOKEN_LIST_MARKERS = {
    '1.': 'One',
    '2.': 'Two',
    '3.': 'Three',
    '4.': 'Four',
    '5.': 'Five',
    '6.': 'Six',
    '7.': 'Seven',
    '8.': 'Eight',
    '9.': 'Nine',
    '10.': 'Ten',
}

# Punctuation that is stripped from the start of a sentence.
_LEADING_PUNCTUATION = ('.', '?', '!', ',')
_LEADING_PUNCTUATION_WITH_SPACE = ('. ', '? ', '! ', ', ')


def clean_sentence(sentence, verbose=False):
    """Rewrite a sentence so it is suitable for speech synthesis.

    Steps, in order: remove fenced code, inline code and parenthesised text;
    remove leftover code fences, ellipses and parentheses; expand "Dr. " and
    " w/ "; spell out the H2O.ai name phonetically; remove emoji characters
    when the ``emoji`` package is available; read decimal points as "dot";
    strip surrounding whitespace and leading punctuation; and replace a
    sentence that is only a list marker ("1." to "10.") by the number word.

    Args:
        sentence: Text to clean; ``None`` and ``''`` are accepted.
        verbose: When true, print the result (or a note that it is empty).

    Returns:
        The cleaned sentence, or ``''`` if nothing is left to speak.
    """
    if not sentence:
        if verbose:
            print("empty sentence")
        return ''

    # Drop fenced code, inline code and parenthesised asides (may span lines).
    sentence = re.sub(r"```.*?```", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"`.*?`", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"\(.*?\)", "", sentence, flags=re.DOTALL)

    # Remove markers that were left over because they were unbalanced.
    sentence = sentence.replace("```", "")
    sentence = sentence.replace("...", " ")
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")

    # Expand abbreviations.
    sentence = sentence.replace("Dr. ", "Doctor ")
    sentence = sentence.replace(" w/ ", " with ")

    # Phonetic spelling of the company name, for the handled capitalisations.
    for spelling in ('H2O.ai', 'H2O.AI', 'h2o.ai'):
        sentence = sentence.replace(spelling, "aych two oh ae eye.")

    if have_emoji:
        import emoji
        sentence = ''.join(char for char in sentence if not emoji.is_emoji(char))

    # Read decimal points aloud: "3.14" -> "3 dot 14".
    sentence = re.sub(r'(\d+)\.(\d+)', r"\1 dot \2", sentence)

    sentence = sentence.strip()

    # Two separate checks on purpose: after removing "<mark><space>" the text
    # may still begin with a bare punctuation mark.
    if sentence.startswith(_LEADING_PUNCTUATION_WITH_SPACE):
        sentence = sentence[2:]
    if sentence.startswith(_LEADING_PUNCTUATION):
        sentence = sentence[1:]

    sentence = _SPOKEN_LIST_MARKERS.get(sentence, sentence)

    if not sentence:
        if verbose:
            print("EMPTY SENTENCE after processing")
        return ''

    if verbose:
        print("Sentence for speech: %s" % sentence)

    return sentence
|
|
|
|
|
def detect_language(prompt, supported_languages, verbose=False):
    """Choose the language code to use for ``prompt``.

    Returns "en" when ``langid`` is unavailable (``have_langid`` is falsy),
    when the prompt has 15 characters or fewer, or when the predicted language
    is not in ``supported_languages``. A prediction of "zh" is treated as
    "zh-cn" before the support check.

    Args:
        prompt: Text whose language should be detected.
        supported_languages: Container of accepted language codes.
        verbose: When true, print which language was chosen.

    Returns:
        The language code to use.
    """
    if not have_langid:
        return "en"

    import langid

    if len(prompt) <= 15:
        # Too short to classify; fall back to English.
        if verbose:
            print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
        return "en"

    # langid.classify returns a tuple whose first item is the language code.
    language_predicted = langid.classify(prompt)[0].strip()
    if language_predicted == "zh":
        language_predicted = "zh-cn"

    if language_predicted in supported_languages:
        language = language_predicted
    else:
        print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
        language = "en"

    if verbose:
        print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")
    return language
|
|