Sarvm-audio-search-and-language-translation

Runtime error

App Files Files Community

Vaibhav Srivastav committed on Nov 30, 2023

Commit

255495b

•

0 Parent(s):

Squash for release.

Browse files

Files changed (17) hide show

.gitattributes +37 -0
.gitignore +162 -0
.pre-commit-config.yaml +55 -0
.vscode/settings.json +26 -0
Dockerfile +61 -0
README.md +10 -0
app.py +291 -0
assets/Excited-Es.wav +0 -0
assets/FastTalking-En.wav +0 -0
assets/Sad-Es.wav +0 -0
assets/Whisper-Fr.wav +0 -0
assets/sample_input.mp3 +3 -0
assets/sample_input_2.mp3 +3 -0
requirements.txt +3 -0
style.css +10 -0
utils.py +206 -0
whl/seamless_communication-1.0.0-py3-none-any.whl +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.whl filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,162 @@

+gradio_cached_examples/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,55 @@

+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-executables-have-shebangs
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: check-toml
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/myint/docformatter
+    rev: v1.7.5
+    hooks:
+      - id: docformatter
+        args: ["--in-place"]
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black"]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.7.0
+    hooks:
+      - id: mypy
+        args: ["--ignore-missing-imports"]
+        additional_dependencies:
+          ["types-python-slugify", "types-requests", "types-PyYAML"]
+  - repo: https://github.com/psf/black
+    rev: 23.11.0
+    hooks:
+      - id: black
+        language_version: python3.10
+        args: ["--line-length", "119"]
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.6.1
+    hooks:
+      - id: nbstripout
+        args:
+          [
+            "--extra-keys",
+            "metadata.interpreter metadata.kernelspec cell.metadata.pycharm",
+          ]
+  - repo: https://github.com/nbQA-dev/nbQA
+    rev: 1.7.0
+    hooks:
+      - id: nbqa-black
+      - id: nbqa-pyupgrade
+        args: ["--py37-plus"]
+      - id: nbqa-isort
+        args: ["--float-to-top"]

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+    "editor.formatOnSave": true,
+    "files.insertFinalNewline": false,
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter",
+        "editor.formatOnType": true,
+        "editor.codeActionsOnSave": {
+            "source.organizeImports": true
+        }
+    },
+    "[jupyter]": {
+        "files.insertFinalNewline": false
+    },
+    "black-formatter.args": [
+        "--line-length=119"
+    ],
+    "isort.args": ["--profile", "black"],
+    "flake8.args": [
+        "--max-line-length=119"
+    ],
+    "ruff.lint.args": [
+        "--line-length=119"
+    ],
+    "notebook.output.scrolling": true,
+    "notebook.formatOnCellExecution": true
+}

Dockerfile ADDED Viewed

	@@ -0,0 +1,61 @@

+# CUDA 12.1 + cuDNN 8 development image on Ubuntu 22.04.
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+ENV DEBIAN_FRONTEND=noninteractive
+# System packages: VCS/download tools, the libraries pyenv needs to build
+# CPython, ffmpeg for Gradio, and libsndfile for fairseq2.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+    git \
+    git-lfs \
+    wget \
+    curl \
+    # python build dependencies \
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    libxml2-dev \
+    libxmlsec1-dev \
+    libffi-dev \
+    liblzma-dev \
+    # gradio dependencies \
+    ffmpeg \
+    # fairseq2 dependencies \
+    libsndfile-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Run everything below as a non-root user with UID 1000.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:${PATH}
+WORKDIR ${HOME}/app
+# Install Python through pyenv inside the user's home directory.
+RUN curl https://pyenv.run | bash
+ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ARG PYTHON_VERSION=3.10.13
+# --no-cache-dir on every pip call keeps pip's download cache out of the image layers.
+RUN pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION} && \
+    pyenv rehash && \
+    pip install --no-cache-dir -U pip setuptools wheel && \
+    pip install --no-cache-dir "huggingface-hub==0.19.3" "hf-transfer==0.1.4"
+COPY --chown=1000 . ${HOME}/app
+# Install the app requirements, the fairseq2 pre-release built for PyTorch 2.1 / CUDA 12.1,
+# and the seamless_communication wheel shipped in the repository.
+RUN pip install --no-cache-dir -r ${HOME}/app/requirements.txt && \
+    pip install --no-cache-dir fairseq2 --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/pt2.1.0/cu121 && \
+    pip install --no-cache-dir ${HOME}/app/whl/seamless_communication-1.0.0-py3-none-any.whl
+# Runtime configuration for Python, the Hugging Face Hub client, Gradio and tqdm.
+ENV PYTHONPATH=${HOME}/app \
+    PYTHONUNBUFFERED=1 \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    TQDM_POSITION=-1 \
+    TQDM_MININTERVAL=1 \
+    SYSTEM=spaces
+CMD ["python", "app.py"]

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: Seamless Expressive
+emoji: 🏃
+colorFrom: red
+colorTo: blue
+sdk: docker
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,291 @@

+#!/usr/bin/env python
+import os
+import pathlib
+import tempfile
+import gradio as gr
+import torch
+import torchaudio
+from fairseq2.assets import InProcAssetMetadataProvider, asset_store
+from fairseq2.data import Collater, SequenceData, VocabularyInfo
+from fairseq2.data.audio import (
+    AudioDecoder,
+    WaveformToFbankConverter,
+    WaveformToFbankOutput,
+)
+from seamless_communication.inference import SequenceGeneratorOptions
+from fairseq2.generation import NGramRepeatBlockProcessor
+from fairseq2.memory import MemoryBlock
+from fairseq2.typing import DataType, Device
+from huggingface_hub import snapshot_download
+from seamless_communication.inference import BatchedSpeechOutput, Translator, SequenceGeneratorOptions
+from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
+from seamless_communication.models.unity import (
+    UnitTokenizer,
+    load_gcmvn_stats,
+    load_unity_text_tokenizer,
+    load_unity_unit_tokenizer,
+)
+from torch.nn import Module
+from seamless_communication.cli.expressivity.evaluate.pretssel_inference_helper import PretsselGenerator
+from utils import LANGUAGE_CODE_TO_NAME
+# Markdown shown at the top of the demo page.
+DESCRIPTION = """\
+# Seamless Expressive
+[SeamlessExpressive](https://github.com/facebookresearch/seamless_communication) is a speech-to-speech translation model that captures certain underexplored aspects of prosody such as speech rate and pauses, while preserving the style of one's voice and high content translation quality.
+"""
+# Example outputs are cached only when CACHE_EXAMPLES=1 is set AND a CUDA device is available.
+CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()
+# Directory holding the model checkpoints; overridable through the CHECKPOINTS_PATH environment variable.
+CHECKPOINTS_PATH = pathlib.Path(os.getenv("CHECKPOINTS_PATH", "/home/user/app/models"))
+# Both model repositories are downloaded into the same directory, and only when
+# that directory does not exist yet (an existing directory is never refreshed).
+if not CHECKPOINTS_PATH.exists():
+    snapshot_download(repo_id="facebook/seamless-expressive", repo_type="model", local_dir=CHECKPOINTS_PATH)
+    snapshot_download(repo_id="facebook/seamless-m4t-v2-large", repo_type="model", local_dir=CHECKPOINTS_PATH)
+# Ensure that we do not have any other environment resolvers and always return
+# "demo" for demo purposes.
+asset_store.env_resolvers.clear()
+asset_store.env_resolvers.append(lambda: "demo")
+# Construct an `InProcAssetMetadataProvider` with environment-specific metadata
+# that just overrides the regular metadata for "demo" environment. Note the "@demo" suffix.
+# Each entry points the named asset at files inside CHECKPOINTS_PATH.
+demo_metadata = [
+    {
+        "name": "seamless_expressivity@demo",
+        "checkpoint": f"file://{CHECKPOINTS_PATH}/m2m_expressive_unity.pt",
+        "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model",
+    },
+    {
+        "name": "vocoder_pretssel@demo",
+        "checkpoint": f"file://{CHECKPOINTS_PATH}/pretssel_melhifigan_wm-final.pt",
+    },
+    {
+        "name": "seamlessM4T_v2_large@demo",
+        "checkpoint": f"file://{CHECKPOINTS_PATH}/seamlessM4T_v2_large.pt",
+        "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model",
+    },
+]
+asset_store.metadata_providers.append(InProcAssetMetadataProvider(demo_metadata))
+# Inverse lookup: display name -> language code.
+LANGUAGE_NAME_TO_CODE = {v: k for k, v in LANGUAGE_CODE_TO_NAME.items()}
+# Half precision on the first CUDA device when available, full precision on CPU otherwise.
+if torch.cuda.is_available():
+    device = torch.device("cuda:0")
+    dtype = torch.float16
+else:
+    device = torch.device("cpu")
+    dtype = torch.float32
+# Asset names; they match the "<name>@demo" entries registered with the asset store.
+MODEL_NAME = "seamless_expressivity"
+VOCODER_NAME = "vocoder_pretssel"
+# used for ASR for toxicity
+m4t_translator = Translator(
+    model_name_or_card="seamlessM4T_v2_large",
+    vocoder_name_or_card=None,
+    device=device,
+    dtype=dtype,
+)
+unit_tokenizer = load_unity_unit_tokenizer(MODEL_NAME)
+# Global mean/std statistics loaded for the vocoder, moved to the inference
+# device and dtype; used by normalize_fbank to build the "gcmvn_fbank" feature.
+_gcmvn_mean, _gcmvn_std = load_gcmvn_stats(VOCODER_NAME)
+gcmvn_mean = torch.tensor(_gcmvn_mean, device=device, dtype=dtype)
+gcmvn_std = torch.tensor(_gcmvn_std, device=device, dtype=dtype)
+# Expressive translation model; no vocoder is attached here because speech is
+# synthesized separately by pretssel_generator.
+translator = Translator(
+    MODEL_NAME,
+    vocoder_name_or_card=None,
+    device=device,
+    dtype=dtype,
+    apply_mintox=False,
+)
+# Text generation options for the expressive model.
+text_generation_opts = SequenceGeneratorOptions(
+    beam_size=5,
+    unk_penalty=torch.inf,
+    soft_max_seq_len=(0, 200),
+    step_processor=NGramRepeatBlockProcessor(
+        ngram_size=10,
+    ),
+)
+# Text generation options for the M4T transcription pass; identical to the
+# above except for soft_max_seq_len, which is (1, 200) instead of (0, 200).
+m4t_text_generation_opts = SequenceGeneratorOptions(
+    beam_size=5,
+    unk_penalty=torch.inf,
+    soft_max_seq_len=(1, 200),
+    step_processor=NGramRepeatBlockProcessor(
+        ngram_size=10,
+    ),
+)
+# Turns the unit output of `translator` into waveforms (see run()).
+pretssel_generator = PretsselGenerator(
+    VOCODER_NAME,
+    vocab_info=unit_tokenizer.vocab_info,
+    device=device,
+    dtype=dtype,
+)
+# Audio is always decoded as float32, regardless of the model dtype.
+decode_audio = AudioDecoder(dtype=torch.float32, device=device)
+# 80-bin filterbank extraction; standardization is disabled here because
+# normalize_fbank applies its own normalization afterwards.
+convert_to_fbank = WaveformToFbankConverter(
+    num_mel_bins=80,
+    waveform_scale=2**15,
+    channel_last=True,
+    standardize=False,
+    device=device,
+    dtype=dtype,
+)
+def normalize_fbank(data: WaveformToFbankOutput) -> WaveformToFbankOutput:
+    fbank = data["fbank"]
+    std, mean = torch.std_mean(fbank, dim=0)
+    data["fbank"] = fbank.subtract(mean).divide(std)
+    data["gcmvn_fbank"] = fbank.subtract(gcmvn_mean).divide(gcmvn_std)
+    return data
+# Collater configured with a pad value of 0.
+collate = Collater(pad_value=0, pad_to_multiple=1)
+# Sample rate that input audio is resampled to by preprocess_audio.
+AUDIO_SAMPLE_RATE = 16000
+# Inputs longer than this are truncated by preprocess_audio.
+MAX_INPUT_AUDIO_LENGTH = 10  # in seconds
+def remove_prosody_tokens_from_text(text):
+    # filter out prosody tokens, there is only emphasis '*', and pause '='
+    text = text.replace("*", "").replace("=", "")
+    text = " ".join(text.split())
+    return text
+def preprocess_audio(input_audio_path: str) -> None:
+    arr, org_sr = torchaudio.load(input_audio_path)
+    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+    if new_arr.shape[1] > max_length:
+        new_arr = new_arr[:, :max_length]
+        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+    torchaudio.save(input_audio_path, new_arr, sample_rate=AUDIO_SAMPLE_RATE)
+def run(
+    input_audio_path: str,
+    source_language: str,
+    target_language: str,
+) -> tuple[str, str]:
+    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
+    preprocess_audio(input_audio_path)
+    with pathlib.Path(input_audio_path).open("rb") as fb:
+        block = MemoryBlock(fb.read())
+        example = decode_audio(block)
+    example = convert_to_fbank(example)
+    example = normalize_fbank(example)
+    example = collate(example)
+    # get transcription for mintox
+    source_sentences, _ = m4t_translator.predict(
+        input=example["fbank"],
+        task_str="S2TT",  # get source text
+        tgt_lang=source_language_code,
+        text_generation_opts=m4t_text_generation_opts,
+    )
+    source_text = str(source_sentences[0])
+    prosody_encoder_input = example["gcmvn_fbank"]
+    text_output, unit_output = translator.predict(
+        example["fbank"],
+        "S2ST",
+        tgt_lang=target_language_code,
+        src_lang=source_language_code,
+        text_generation_opts=text_generation_opts,
+        unit_generation_ngram_filtering=False,
+        duration_factor=1.0,
+        prosody_encoder_input=prosody_encoder_input,
+        src_text=source_text,  # for mintox check
+    )
+    speech_output = pretssel_generator.predict(
+        unit_output.units,
+        tgt_lang=target_language_code,
+        prosody_encoder_input=prosody_encoder_input,
+    )
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        torchaudio.save(
+            f.name,
+            speech_output.audio_wavs[0][0].to(torch.float32).cpu(),
+            sample_rate=speech_output.sample_rate,
+        )
+    text_out = remove_prosody_tokens_from_text(str(text_output[0]))
+    return f.name, text_out
+TARGET_LANGUAGE_NAMES = [
+    "English",
+    "French",
+    "German",
+    "Spanish",
+]
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_id="duplicate-button",
+        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+    )
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                input_audio = gr.Audio(label="Input speech", type="filepath")
+                source_language = gr.Dropdown(
+                    label="Source language",
+                    choices=TARGET_LANGUAGE_NAMES,
+                    value="English",
+                )
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=TARGET_LANGUAGE_NAMES,
+                    value="French",
+                )
+            btn = gr.Button()
+        with gr.Column():
+            with gr.Group():
+                output_audio = gr.Audio(label="Translated speech")
+                output_text = gr.Textbox(label="Translated text")
+    gr.Examples(
+        examples=[
+            ["assets/Excited-Es.wav", "English", "Spanish"],
+            ["assets/FastTalking-En.wav", "French", "English"],
+            ["assets/Sad-Es.wav", "English", "Spanish"],
+        ],
+        inputs=[input_audio, source_language, target_language],
+        outputs=[output_audio, output_text],
+        fn=run,
+        cache_examples=CACHE_EXAMPLES,
+        api_name=False,
+    )
+    btn.click(
+        fn=run,
+        inputs=[input_audio, source_language, target_language],
+        outputs=[output_audio, output_text],
+        api_name="run",
+    )
+if __name__ == "__main__":
+    demo.queue(max_size=50).launch()

assets/Excited-Es.wav ADDED Viewed

Binary file (788 kB). View file

assets/FastTalking-En.wav ADDED Viewed

Binary file (788 kB). View file

assets/Sad-Es.wav ADDED Viewed

Binary file (788 kB). View file

assets/Whisper-Fr.wav ADDED Viewed

Binary file (788 kB). View file

assets/sample_input.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:982369687f05bf8fcd6923c4ffcccda0fcce92f44eceae5a9d00a431f07ea87b
+size 10272

assets/sample_input_2.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a505a4641e3f5f0ddec9508832793aa20e63d2545530b66bc04a9bd19a742e6
+size 30624

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio
+torch
+torchaudio

style.css ADDED Viewed

	@@ -0,0 +1,10 @@

+h1 {
+  text-align: center;
+}
+#duplicate-button {
+  margin: auto;
+  color: #fff;
+  background: #1565c0;
+  border-radius: 100vh;
+}

utils.py ADDED Viewed

	@@ -0,0 +1,206 @@

+# import torch
+# import torchaudio
+# from fairseq2.assets import InProcAssetMetadataProvider, asset_store
+# from fairseq2.data import Collater, SequenceData
+# from fairseq2.data.audio import (
+#     AudioDecoder,
+#     WaveformToFbankConverter,
+#     WaveformToFbankOutput,
+# )
+# from fairseq2.generation import SequenceGeneratorOptions
+# from fairseq2.memory import MemoryBlock
+# from fairseq2.typing import DataType, Device
+# from huggingface_hub import snapshot_download
+# from seamless_communication.inference import BatchedSpeechOutput, Translator
+# from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
+# from seamless_communication.models.unity import (
+#     UnitTokenizer,
+#     load_gcmvn_stats,
+#     load_unity_text_tokenizer,
+#     load_unity_unit_tokenizer,
+# )
+# from torch.nn import Module
+# class PretsselGenerator(Module):
+#     def __init__(
+#         self,
+#         pretssel_name_or_card: str,
+#         unit_tokenizer: UnitTokenizer,
+#         device: Device,
+#         dtype: DataType = torch.float16,
+#     ):
+#         super().__init__()
+#         # Load the model.
+#         if device == torch.device("cpu"):
+#             dtype = torch.float32
+#         self.device = device
+#         self.dtype = dtype
+#         self.pretssel_model = load_pretssel_vocoder_model(
+#             pretssel_name_or_card,
+#             device=device,
+#             dtype=dtype,
+#         )
+#         self.pretssel_model.eval()
+#         vocoder_model_card = asset_store.retrieve_card(pretssel_name_or_card)
+#         self.output_sample_rate = vocoder_model_card.field("sample_rate").as_(int)
+#         self.unit_tokenizer = unit_tokenizer
+#         self.unit_collate = Collater(pad_value=unit_tokenizer.vocab_info.pad_idx)
+#         self.duration_collate = Collater(pad_value=0)
+#     @torch.inference_mode()
+#     def predict(
+#         self,
+#         units: list[list[int]],
+#         tgt_lang: str,
+#         prosody_encoder_input: SequenceData,
+#     ) -> BatchedSpeechOutput:
+#         audio_wavs = []
+#         unit_eos_token = torch.tensor(
+#             [self.unit_tokenizer.vocab_info.eos_idx],
+#             device=self.device,
+#         )
+#         prosody_input_seqs = prosody_encoder_input["seqs"]
+#         prosody_input_lens = prosody_encoder_input["seq_lens"]
+#         for i, u in enumerate(units):
+#             unit = torch.tensor(u).to(unit_eos_token)
+#             # adjust the control symbols for the embedding
+#             unit += 4
+#             unit = torch.cat([unit, unit_eos_token], dim=0)
+#             unit, duration = torch.unique_consecutive(unit, return_counts=True)
+#             # adjust for the last eos token
+#             duration[-1] = 0
+#             duration *= 2
+#             prosody_input_seq = prosody_input_seqs[i][: prosody_input_lens[i]]
+#             audio_wav = self.pretssel_model(
+#                 unit,
+#                 tgt_lang,
+#                 prosody_input_seq,
+#                 durations=duration.unsqueeze(0),
+#             )
+#             audio_wavs.append(audio_wav)
+#         return BatchedSpeechOutput(
+#             units=units,
+#             audio_wavs=audio_wavs,
+#             sample_rate=self.output_sample_rate,
+#         )
+# Maps three-letter language codes to their English display names.
+# app.py inverts this table to turn the names chosen in the UI back into codes.
+LANGUAGE_CODE_TO_NAME = {
+    "afr": "Afrikaans",
+    "amh": "Amharic",
+    "arb": "Modern Standard Arabic",
+    "ary": "Moroccan Arabic",
+    "arz": "Egyptian Arabic",
+    "asm": "Assamese",
+    "ast": "Asturian",
+    "azj": "North Azerbaijani",
+    "bel": "Belarusian",
+    "ben": "Bengali",
+    "bos": "Bosnian",
+    "bul": "Bulgarian",
+    "cat": "Catalan",
+    "ceb": "Cebuano",
+    "ces": "Czech",
+    "ckb": "Central Kurdish",
+    "cmn": "Mandarin Chinese",
+    "cym": "Welsh",
+    "dan": "Danish",
+    "deu": "German",
+    "ell": "Greek",
+    "eng": "English",
+    "est": "Estonian",
+    "eus": "Basque",
+    "fin": "Finnish",
+    "fra": "French",
+    "gaz": "West Central Oromo",
+    "gle": "Irish",
+    "glg": "Galician",
+    "guj": "Gujarati",
+    "heb": "Hebrew",
+    "hin": "Hindi",
+    "hrv": "Croatian",
+    "hun": "Hungarian",
+    "hye": "Armenian",
+    "ibo": "Igbo",
+    "ind": "Indonesian",
+    "isl": "Icelandic",
+    "ita": "Italian",
+    "jav": "Javanese",
+    "jpn": "Japanese",
+    "kam": "Kamba",
+    "kan": "Kannada",
+    "kat": "Georgian",
+    "kaz": "Kazakh",
+    "kea": "Kabuverdianu",
+    "khk": "Halh Mongolian",
+    "khm": "Khmer",
+    "kir": "Kyrgyz",
+    "kor": "Korean",
+    "lao": "Lao",
+    "lit": "Lithuanian",
+    "ltz": "Luxembourgish",
+    "lug": "Ganda",
+    "luo": "Luo",
+    "lvs": "Standard Latvian",
+    "mai": "Maithili",
+    "mal": "Malayalam",
+    "mar": "Marathi",
+    "mkd": "Macedonian",
+    "mlt": "Maltese",
+    "mni": "Meitei",
+    "mya": "Burmese",
+    "nld": "Dutch",
+    "nno": "Norwegian Nynorsk",
+    "nob": "Norwegian Bokm\u00e5l",
+    "npi": "Nepali",
+    "nya": "Nyanja",
+    "oci": "Occitan",
+    "ory": "Odia",
+    "pan": "Punjabi",
+    "pbt": "Southern Pashto",
+    "pes": "Western Persian",
+    "pol": "Polish",
+    "por": "Portuguese",
+    "ron": "Romanian",
+    "rus": "Russian",
+    "slk": "Slovak",
+    "slv": "Slovenian",
+    "sna": "Shona",
+    "snd": "Sindhi",
+    "som": "Somali",
+    "spa": "Spanish",
+    "srp": "Serbian",
+    "swe": "Swedish",
+    "swh": "Swahili",
+    "tam": "Tamil",
+    "tel": "Telugu",
+    "tgk": "Tajik",
+    "tgl": "Tagalog",
+    "tha": "Thai",
+    "tur": "Turkish",
+    "ukr": "Ukrainian",
+    "urd": "Urdu",
+    "uzn": "Northern Uzbek",
+    "vie": "Vietnamese",
+    "xho": "Xhosa",
+    "yor": "Yoruba",
+    "yue": "Cantonese",
+    "zlm": "Colloquial Malay",
+    "zsm": "Standard Malay",
+    "zul": "Zulu",
+}

whl/seamless_communication-1.0.0-py3-none-any.whl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1df10e0c85ee0ffbc9f2e1bf8896850a52c551383df0332a94d26d9d39770c85
+size 201552