Spaces:

lenML
/

ChatTTS-Forge

Running on Zero

App Files Files Community

zhzluke96 committed on Jun 6

Commit

f34bda5

•

1 Parent(s): b44532e

update

Browse files

Files changed (4) hide show

modules/api/impl/openai_api.py +50 -4
modules/normalization.py +21 -2
modules/utils/zh_normalization/num.py +15 -6
webui.py +3 -0

modules/api/impl/openai_api.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from fastapi import HTTPException, Body
 from fastapi.responses import StreamingResponse
 import io
@@ -14,7 +14,7 @@ from modules.normalization import text_normalize
 from modules import generate_audio as generate
-from typing import Literal
 import pyrubberband as pyrb
 from modules.api import utils as api_utils
@@ -106,8 +106,29 @@ async def openai_speech_api(
         raise HTTPException(status_code=500, detail=str(e))
-def setup(api_manager: APIManager):
-    api_manager.post(
         "/v1/audio/speech",
         response_class=FileResponse,
         description="""
@@ -122,3 +143,28 @@ openai api document:
 > model 可填任意值
         """,
     )(openai_speech_api)

+from fastapi import File, Form, HTTPException, Body, UploadFile
 from fastapi.responses import StreamingResponse
 import io
 from modules import generate_audio as generate
+from typing import List, Literal, Optional, Union
 import pyrubberband as pyrb
 from modules.api import utils as api_utils
         raise HTTPException(status_code=500, detail=str(e))
+class TranscribeSegment(BaseModel):
+    id: int
+    seek: float
+    start: float
+    end: float
+    text: str
+    tokens: List[int]
+    temperature: float
+    avg_logprob: float
+    compression_ratio: float
+    no_speech_prob: float
+class TranscriptionsVerboseResponse(BaseModel):
+    task: str
+    language: str
+    duration: float
+    text: str
+    segments: List[TranscribeSegment]
+def setup(app: APIManager):
+    app.post(
         "/v1/audio/speech",
         response_class=FileResponse,
         description="""
 > model 可填任意值
         """,
     )(openai_speech_api)
+    @app.post(
+        "/v1/audio/transcriptions",
+        response_class=TranscriptionsVerboseResponse,
+        description="WIP",
+    )
+    async def transcribe(
+        file: UploadFile = File(...),
+        model: str = Form(...),
+        language: Optional[str] = Form(None),
+        prompt: Optional[str] = Form(None),
+        response_format: str = Form("json"),
+        temperature: float = Form(0),
+        timestamp_granularities: List[str] = Form(["segment"]),
+    ):
+        # TODO: Implement transcribe
+        return {
+            "file": file.filename,
+            "model": model,
+            "language": language,
+            "prompt": prompt,
+            "response_format": response_format,
+            "temperature": temperature,
+            "timestamp_granularities": timestamp_granularities,
+        }

modules/normalization.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from modules.utils.zh_normalization.text_normlization import *
 import emojiswitch
 from modules.utils.markdown import markdown_to_text
@@ -5,12 +6,28 @@ from modules import models
 import re
 def is_chinese(text):
     # 中文字符的 Unicode 范围是 \u4e00-\u9fff
     chinese_pattern = re.compile(r"[\u4e00-\u9fff]")
     return bool(chinese_pattern.search(text))
 post_normalize_pipeline = []
 pre_normalize_pipeline = []
@@ -123,7 +140,7 @@ def apply_character_map(text):
 @post_normalize()
 def apply_emoji_map(text):
-    lang = "zh" if is_chinese(text) else "en"
     return emojiswitch.demojize(text, delimiters=("", ""), lang=lang)
@@ -144,6 +161,8 @@ def replace_unk_tokens(text):
     """
     chat_tts = models.load_chat_tts()
     if "tokenizer" not in chat_tts.pretrain_models:
         return text
     tokenizer = chat_tts.pretrain_models["tokenizer"]
     vocab = tokenizer.get_vocab()
@@ -223,7 +242,7 @@ def sentence_normalize(sentence_text: str):
     pattern = re.compile(r"(\[.+?\])|([^[]+)")
     def normalize_part(part):
-        sentences = tx.normalize(part) if is_chinese(part) else [part]
         dest_text = ""
         for sentence in sentences:
             sentence = apply_post_normalize(sentence)

+from functools import lru_cache
 from modules.utils.zh_normalization.text_normlization import *
 import emojiswitch
 from modules.utils.markdown import markdown_to_text
 import re
+@lru_cache(maxsize=64)
 def is_chinese(text):
     # 中文字符的 Unicode 范围是 \u4e00-\u9fff
     chinese_pattern = re.compile(r"[\u4e00-\u9fff]")
     return bool(chinese_pattern.search(text))
+@lru_cache(maxsize=64)
+def is_eng(text):
+    eng_pattern = re.compile(r"[a-zA-Z]")
+    return bool(eng_pattern.search(text))
+@lru_cache(maxsize=64)
+def guess_lang(text):
+    if is_chinese(text):
+        return "zh"
+    if is_eng(text):
+        return "en"
+    return "zh"
 post_normalize_pipeline = []
 pre_normalize_pipeline = []
 @post_normalize()
 def apply_emoji_map(text):
+    lang = guess_lang(text)
     return emojiswitch.demojize(text, delimiters=("", ""), lang=lang)
     """
     chat_tts = models.load_chat_tts()
     if "tokenizer" not in chat_tts.pretrain_models:
+        # 这个地方只有在 huggingface spaces 中才会触发
+        # 因为 huggingface 自动处理模型卸载加载，所以如果拿不到就算了...
         return text
     tokenizer = chat_tts.pretrain_models["tokenizer"]
     vocab = tokenizer.get_vocab()
     pattern = re.compile(r"(\[.+?\])|([^[]+)")
     def normalize_part(part):
+        sentences = tx.normalize(part) if guess_lang(part) == "zh" else [part]
         dest_text = ""
         for sentence in sentences:
             sentence = apply_post_normalize(sentence)

modules/utils/zh_normalization/num.py CHANGED Viewed

@@ -144,13 +144,22 @@ def replace_number(match) -> str:
     sign = match.group(1)
     number = match.group(2)
     pure_decimal = match.group(5)
-    if pure_decimal:
-        result = num2str(pure_decimal)
-    else:
-        sign: str = "负" if sign else ""
-        number: str = num2str(number)
-        result = f"{sign}{number}"
     return result
 # 范围表达式

     sign = match.group(1)
     number = match.group(2)
     pure_decimal = match.group(5)
+    # TODO 也许可以把 num2str 完全替换成 cn2an
+    import cn2an
+    text = pure_decimal if pure_decimal else f"{sign}{number}"
+    try:
+        result = cn2an.an2cn(text, "low")
+    except ValueError:
+        if pure_decimal:
+            result = num2str(pure_decimal)
+        else:
+            sign: str = "负" if sign else ""
+            number: str = num2str(number)
+            result = f"{sign}{number}"
     return result
 # 范围表达式

webui.py CHANGED Viewed

@@ -45,6 +45,9 @@ from modules import refiner, config
 from modules.utils import env, audio
 from modules.SentenceSplitter import SentenceSplitter
 torch._dynamo.config.cache_size_limit = 64
 torch._dynamo.config.suppress_errors = True
 torch.set_float32_matmul_precision("high")

 from modules.utils import env, audio
 from modules.SentenceSplitter import SentenceSplitter
+# fix: when a system proxy is enabled on Windows, these local addresses must bypass it
+os.environ["NO_PROXY"] = "localhost,127.0.0.1,0.0.0.0"
 torch._dynamo.config.cache_size_limit = 64
 torch._dynamo.config.suppress_errors = True
 torch.set_float32_matmul_precision("high")