seamless-streaming

Running on T4

App Files Community

Anna Sun committed on Nov 27, 2023

Commit

c1e0588

•

1 Parent(s): 7fb1760

Model fixes

Browse files

Files changed (3) hide show

app.py +102 -146
models/vad_s2st_sc_24khz_main.yaml +24 -0
simuleval_transcoder.py +18 -49

app.py CHANGED Viewed

@@ -1,24 +1,15 @@
 from __future__ import annotations
-import os
 import gradio as gr
 import numpy as np
-import torch
-import torchaudio
-import sys
-from sample_wav import sample_wav
-np.set_printoptions(threshold=sys.maxsize)
-from simuleval_transcoder import *
-from pydub import AudioSegment
 import time
-from time import sleep
-from seamless_communication.cli.streaming.agents.tt_waitk_unity_s2t_m4t import (
-    TestTimeWaitKUnityS2TM4T,
-)
 language_code_to_name = {
     "cmn": "Mandarin Chinese",
@@ -32,7 +23,17 @@ LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
 DEFAULT_TARGET_LANGUAGE = "English"
-# TODO: Update this so it takes in target langs from input, refactor sample rate
 transcoder = SimulevalTranscoder(
     sample_rate=48_000,
     debug=False,
@@ -41,93 +42,97 @@ transcoder = SimulevalTranscoder(
 def start_recording():
     logger.debug(f"start_recording: starting transcoder")
     transcoder.start()
-def translate_audio_segment(audio):
-    logger.debug(f"translate_audio_segment: incoming audio")
-    sample_rate, data = audio
-    # print(sample_rate)
-    # print("--------- start \n")
-    # # print(data)
-    # def map(x):
-    #     return x
-    # print(data.tolist())
-    # print("--------- end \n")
     transcoder.process_incoming_bytes(data.tobytes(), 'eng', sample_rate)
     speech_and_text_output =  transcoder.get_buffered_output()
     if speech_and_text_output is None:
         logger.debug("No output from transcoder.get_buffered_output()")
-        return None, None
-    logger.debug(f"We DID get output from the transcoder! {speech_and_text_output}")
     text = None
     speech = None
     if speech_and_text_output.speech_samples:
-        speech = (speech_and_text_output.speech_samples, speech_and_text_output.speech_sample_rate)
     if speech_and_text_output.text:
         text = speech_and_text_output.text
         if speech_and_text_output.final:
             text += "\n"
-    return speech, text
-def dummy_ouput():
-    np.array()
-def streaming_input_callback(
-    audio_file, translated_audio_bytes_state, translated_text_state
-):
-    translated_wav_segment, translated_text = translate_audio_segment(audio_file)
-    logger.debug(f'translated_audio_bytes_state {translated_audio_bytes_state}')
-    logger.debug(f'translated_wav_segment {translated_wav_segment}')
-    # TODO: accumulate each segment to provide a continuous audio segment
-    # TEMP
-    translated_wav_segment = (46_000, sample_wav())
-    if translated_wav_segment is not None:
-        sample_rate, audio_bytes = translated_wav_segment
-        # TODO: convert to 16 bit int
-        # audio_np_array = np.frombuffer(audio_bytes, dtype=np.float32, count=3)
-        audio_np_array = audio_bytes
-        # combine translated wav
-        if type(translated_audio_bytes_state) is not tuple:
-            translated_audio_bytes_state = (sample_rate, audio_np_array)
-            # translated_audio_bytes_state = np.array([])
         else:
-            translated_audio_bytes_state = (translated_audio_bytes_state[0], np.append(translated_audio_bytes_state[1], translated_wav_segment[1]))
-    if translated_text is not None:
-        translated_text_state += " | " + str(translated_text)
-    # most_recent_input_audio_segment = (most_recent_input_audio_segment[0], np.append(most_recent_input_audio_segment[1], audio_file[1]))
-    # Not necessary but for readability.
-    most_recent_input_audio_segment = audio_file
-    translated_wav_segment = translated_wav_segment
-    output_translation_combined = translated_audio_bytes_state
-    stream_output_text = translated_text_state
-    return [
-        most_recent_input_audio_segment,
-        translated_wav_segment,
-        output_translation_combined,
-        stream_output_text,
-        translated_audio_bytes_state,
-        translated_text_state,
-    ]
 def clear():
     logger.debug(f"Clearing State")
@@ -138,105 +143,56 @@ def blocks():
     with gr.Blocks() as demo:
         with gr.Row():
-            # Hook this up once supported
             target_language = gr.Dropdown(
                 label="Target language",
                 choices=S2ST_TARGET_LANGUAGE_NAMES,
                 value=DEFAULT_TARGET_LANGUAGE,
             )
-        translated_audio_bytes_state = gr.State(None)
         translated_text_state = gr.State("")
         input_audio = gr.Audio(
             label="Input Audio",
-            # source="microphone", # gradio==3.41.0
-            sources=["microphone"], # new gradio seems to call this less often...
             streaming=True,
         )
-        # input_audio = gr.Audio(
-        #     label="Input Audio",
-        #     type="filepath",
-        #     source="microphone",
-        #     streaming=True,
-        # )
-        most_recent_input_audio_segment = gr.Audio(
-            label="Recent Input Audio Segment segments",
-            # format="bytes",
-            streaming=True
-        )
-        # Force translate
-        stream_as_bytes_btn = gr.Button("Force translate most recent recording segment (ask for model output)")
         output_translation_segment = gr.Audio(
             label="Translated audio segment",
-            autoplay=False,
-            streaming=True,
-            type="numpy",
-        )
-        output_translation_combined = gr.Audio(
-            label="Translated audio combined",
-            autoplay=False,
             streaming=True,
-            type="numpy",
         )
-        # Could add output text segment
         stream_output_text = gr.Textbox(label="Translated text")
-        stream_as_bytes_btn.click(
-            streaming_input_callback,
-            [input_audio, translated_audio_bytes_state, translated_text_state],
-            [
-                most_recent_input_audio_segment,
-                output_translation_segment,
-                output_translation_combined,
-                stream_output_text,
-                translated_audio_bytes_state,
-                translated_text_state,
-            ],
         )
-        # input_audio.change(
-        #     streaming_input_callback,
-        #     [input_audio, translated_audio_bytes_state, translated_text_state],
-        #     [
-        #         most_recent_input_audio_segment,
-        #         output_translation_segment,
-        #         output_translation_combined,
-        #         stream_output_text,
-        #         translated_audio_bytes_state,
-        #         translated_text_state,
-        #     ],
-        # )
-        input_audio.stream(
             streaming_input_callback,
-            [input_audio, translated_audio_bytes_state, translated_text_state],
             [
-                most_recent_input_audio_segment,
                 output_translation_segment,
-                output_translation_combined,
                 stream_output_text,
-                translated_audio_bytes_state,
                 translated_text_state,
             ],
         )
-        input_audio.start_recording(
-            start_recording,
-        )
-        input_audio.clear(
-            clear, None, [translated_audio_bytes_state, translated_text_state]
         )
-        input_audio.start_recording(
-            clear, None, [translated_audio_bytes_state, translated_text_state]
         )
-    demo.queue().launch()
 blocks()

 from __future__ import annotations
 import gradio as gr
 import numpy as np
+import asyncio
+from simuleval_transcoder import SimulevalTranscoder, logger
 import time
+from simuleval.utils.agent import build_system_from_dir
+import torch
 language_code_to_name = {
     "cmn": "Mandarin Chinese",
 DEFAULT_TARGET_LANGUAGE = "English"
+def build_agent(model_path, config_name=None):
+    # Build the agent with simuleval's build_system_from_dir(model_path, config_name=...)
+    # and move it to the selected torch device before returning it.
+    agent = build_system_from_dir(
+        model_path, config_name=config_name,
+    )
+    # Use CUDA when torch reports it as available; otherwise fall back to CPU.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # NOTE(review): fp16=True is passed even when the CPU fallback is chosen —
+    # confirm the agent supports half precision on CPU.
+    agent.to(device, fp16=True)
+    return agent
+agent = build_agent("models", "vad_s2st_sc_24khz_main.yaml")
 transcoder = SimulevalTranscoder(
     sample_rate=48_000,
     debug=False,
 def start_recording():
+    # Reset the transcoder's states, start it (start() launches a thread running
+    # process_pipeline_loop), then clear the close flag.
     logger.debug(f"start_recording: starting transcoder")
+    transcoder.reset_states()
     transcoder.start()
+    # NOTE(review): close is cleared only after start(); the pipeline loop appears to
+    # return immediately when close is already set (e.g. after stop_recording), so a
+    # restart may race with the new thread — confirm whether reset_states() clears close.
+    transcoder.close = False
+def stop_recording():
+    # Set the transcoder's close flag. The generator callbacks in this file loop on
+    # `while not transcoder.close`, so they stop once this is True.
+    transcoder.close = True
+class MyState:
+    # Holder for a queue of incoming audio chunks plus a close flag; a single
+    # module-level instance `s` is created right after this class.
+    def __init__(self):
+        # NOTE(review): asyncio.Queue is not thread-safe and is only used here through
+        # the synchronous *_nowait()/empty() calls — confirm all access happens on one
+        # thread, or consider queue.Queue.
+        self.queue = asyncio.Queue()
+        # Not read anywhere in the visible part of this diff (the loops poll
+        # transcoder.close instead).
+        self.close = False
+s = MyState()
+def process_incoming_bytes(audio):
+    # Handler registered on input_audio.stream: forwards one incoming chunk to the
+    # transcoder and also queues the raw chunk on s.queue.
+    logger.debug(f"process_bytes: incoming audio")
+    # `audio` is unpacked as a (sample_rate, data) pair; data must provide .tobytes().
+    sample_rate, data = audio
+    # NOTE(review): the second argument is the hard-coded literal 'eng' (presumably a
+    # language code — confirm against SimulevalTranscoder.process_incoming_bytes).
     transcoder.process_incoming_bytes(data.tobytes(), 'eng', sample_rate)
+    # NOTE(review): the only visible consumer of s.queue is streaming_callback_dummy,
+    # which is commented out in the event wiring — this queue may grow without bound.
+    s.queue.put_nowait(audio)
+def get_buffered_output():
+    # Poll the transcoder once and return a (speech, text, final) triple.
+    #   speech: (speech_sample_rate, speech_samples) tuple, or None when there are no samples
+    #   text:   the output text (with "\n" appended when the output is final), or None
+    #   final:  the output's `final` attribute
+    # Returns (None, None, None) when the transcoder has no buffered output.
     speech_and_text_output =  transcoder.get_buffered_output()
     if speech_and_text_output is None:
         logger.debug("No output from transcoder.get_buffered_output()")
+        return None, None, None
+    logger.debug(f"We DID get output from the transcoder!")
     text = None
     speech = None
     if speech_and_text_output.speech_samples:
+        # Sample rate comes first, then the samples.
+        speech = (speech_and_text_output.speech_sample_rate, speech_and_text_output.speech_samples)
     if speech_and_text_output.text:
         text = speech_and_text_output.text
         if speech_and_text_output.final:
             text += "\n"
+    return speech, text, speech_and_text_output.final
+def streaming_input_callback():
+    # Generator chained after start_recording in the Gradio event wiring. It polls
+    # get_buffered_output() until transcoder.close is set and yields
+    # [audio segment, accumulated text, accumulated text] for the three outputs
+    # (output_translation_segment, stream_output_text, translated_text_state).
+    # NOTE(review): `final` is assigned here and in the loop but never otherwise used.
+    final = False
+    # Give up after this many seconds of consecutive empty polls.
+    max_wait_s = 15
+    wait_s = 0
+    translated_text_state = ""
+    while not transcoder.close:
+        translated_wav_segment, translated_text, final = get_buffered_output()
+        if translated_wav_segment is None and translated_text is None:
+            # Nothing available yet: sleep, count the idle time, and close the
+            # transcoder once the idle budget is exhausted (which ends this loop).
+            time.sleep(0.3)
+            wait_s += 0.3
+            if wait_s >= max_wait_s:
+                transcoder.close = True
+            continue
+        # Got output, so restart the idle timer.
+        wait_s = 0
+        if translated_wav_segment is not None:
+            sample_rate, audio_bytes = translated_wav_segment
+            print("output sample rate", sample_rate)
+            # Re-pack as (sample_rate, numpy array) for the audio output.
+            translated_wav_segment = sample_rate, np.array(audio_bytes)
         else:
+            # Text-only update: yield an empty bytes object in the audio slot.
+            translated_wav_segment = bytes()
+        if translated_text is not None:
+            # " | " is prepended before every segment, including the first one.
+            translated_text_state += " | " + str(translated_text)
+        stream_output_text = translated_text_state
+        if translated_text is not None:
+            print("translated:", translated_text_state)
+        yield [
+            translated_wav_segment,
+            stream_output_text,
+            translated_text_state,
+        ]
+def streaming_callback_dummy():
+    # Debug generator that yields the queued input chunks from s.queue back unchanged
+    # until transcoder.close is set. Its use in the event wiring is currently
+    # commented out.
+    while not transcoder.close:
+        if s.queue.empty():
+            # No chunk queued: yield an empty bytes object, then wait before polling again.
+            print("empty")
+            yield bytes()
+            time.sleep(0.3)
+        else:
+            print("audio")
+            audio = s.queue.get_nowait()
+            s.queue.task_done()
+            yield audio
 def clear():
     logger.debug(f"Clearing State")
     with gr.Blocks() as demo:
         with gr.Row():
+            # TODO: add target language switching
             target_language = gr.Dropdown(
                 label="Target language",
                 choices=S2ST_TARGET_LANGUAGE_NAMES,
                 value=DEFAULT_TARGET_LANGUAGE,
             )
         translated_text_state = gr.State("")
         input_audio = gr.Audio(
             label="Input Audio",
+            sources=["microphone"],
             streaming=True,
         )
         output_translation_segment = gr.Audio(
             label="Translated audio segment",
+            autoplay=True,
             streaming=True,
         )
+        # Output text segment
         stream_output_text = gr.Textbox(label="Translated text")
+        input_audio.clear(
+            clear, None, [output_translation_segment, translated_text_state]
         )
+        input_audio.start_recording(
+            clear, None, [output_translation_segment, translated_text_state]
+        ).then(
+            start_recording
+        ).then(
+            # streaming_callback_dummy,  # TODO: autoplay works fine with streaming_callback_dummy
+            # None,
+            # output_translation_segment
             streaming_input_callback,
+            None,
             [
                 output_translation_segment,
                 stream_output_text,
                 translated_text_state,
             ],
         )
+        input_audio.stop_recording(
+            stop_recording
         )
+        input_audio.stream(
+            process_incoming_bytes, [input_audio], None
         )
+    demo.launch(server_port=6010)
 blocks()

models/vad_s2st_sc_24khz_main.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+agent_class: seamless_communication.streaming.agents.mma_m4t_s2st.SeamlessS2STJointVADAgent
+# checkpoint: checkpoint_best.pt
+monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+unity_model_name: seamless_streaming_unity
+sentencepiece_model: spm_256k_nllb100.model
+task: s2st
+tgt_lang: "eng"
+min_unit_chunk_size: 50
+decision_threshold: 0.7
+no_early_stop: True
+block_ngrams: True
+vocoder_name: vocoder_pretssel
+wav2vec_yaml: wav2vec.yaml
+# min_starting_wait: 12
+# min_starting_wait_w2vbert: 192
+config_yaml: cfg_fbank_u2t.yaml
+vocoder_sample_rate: 24000
+upstream_idx: 1
+detokenize_only: True
+device: cuda:0
+max_len_a: 0
+max_len_b: 1000

simuleval_transcoder.py CHANGED Viewed

@@ -20,13 +20,6 @@ import time
 import random
 import colorlog
-# Sanity check that pipeline is loadable
-from seamless_communication.cli.streaming.agents.tt_waitk_unity_s2t_m4t import (
-    # TestTimeWaitKUnityS2TM4T,
-    TestTimeWaitKUnityS2TM4TVAD
-)
-from simuleval.utils.agent import build_system_args
 MODEL_SAMPLE_RATE = 16_000
@@ -49,35 +42,6 @@ logger.addHandler(handler)
 logger.setLevel(logging.DEBUG)
-# TODO: Integrate this better so target lang and others can be changed. Also currently dependent on devserver internals
-def build_agent():
-    config = {
-        'dataloader': 'fairseq2_s2t',
-        'data_file': '/large_experiments/seamless/ust/abinesh/data/s2st50_manifests/50-10/simuleval/dev_mtedx_filt_50-10_debug.tsv',
-        'model_name': 'seamlessM4T_v2_large',
-        'device': 'cuda:0',
-        'source_segment_size': 320,
-        'waitk_lagging': 7,
-        'fixed_pre_decision_ratio': 2,
-        'init_target_tokens': '</s> __eng__',
-        'max_len_a': 0,
-        'max_len_b': 200,
-        'agent_class': 'seamless_communication.cli.streaming.agents.tt_waitk_unity_s2t_m4t.TestTimeWaitKUnityS2TM4TVAD',
-        'task': 's2st',
-        'tgt_lang': 'eng',
-        'latency_metrics': 'StartOffset EndOffset AL',
-        'output': 'TestTimeWaitKUnityS2TM4TVAD-wait7-debug'
-    }
-    agent , _ = build_system_args(config)
-    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # agent.to(device, fp16=True)
-    logger.info(
-        f"Successfully built simuleval agent"
-    )
-    return agent
 class SpeechAndTextOutput:
     def __init__(
         self,
@@ -150,7 +114,7 @@ class OutputSegments:
                 for segment in segment_list:
                     speech_out += segment.content
                 output.speech_samples = speech_out
-                output.speech_sample_rate = MODEL_SAMPLE_RATE
             elif isinstance(segment_list[0], EmptySegment):
                 continue
             else:
@@ -212,8 +176,9 @@ def convert_waveform(
     return waveform, sample_rate
 class SimulevalTranscoder:
-    def __init__(self, sample_rate, debug, buffer_limit):
-        self.agent = build_agent()
         self.input_queue = asyncio.Queue()
         self.output_queue = asyncio.Queue()
         self.states = self.agent.build_states()
@@ -289,6 +254,7 @@ class SimulevalTranscoder:
         )
         # # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
         self.input_queue.put_nowait(segment)
     def get_input_segment(self):
         if self.input_queue.empty():
@@ -340,10 +306,11 @@ class SimulevalTranscoder:
                 self.first_input_ts = self.get_states_root().first_input_ts
             if not output_segment.is_empty:
                 self.output_queue.put_nowait(output_segment)
             if output_segment.finished:
-                self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
                 self.reset_states()
@@ -360,17 +327,19 @@ class SimulevalTranscoder:
         if self.close:
             return  # closes the thread
-        self.debug_log("processing_pipeline")
         while not self.close:
             input_segment = self.get_input_segment()
             if input_segment is None:
-                # if self.get_states_root().is_fresh_state:  # TODO: this is hacky
-                #     time.sleep(0.3)
-                # else:
-                time.sleep(0.03)
                 continue
             self.process_pipeline_impl(input_segment)
-        self.debug_log("finished processing_pipeline")
     def process_pipeline_once(self):
         if self.close:
@@ -392,7 +361,7 @@ class SimulevalTranscoder:
         return output_chunk
     def start(self):
-        self.debug_log("starting transcoder in a thread")
         threading.Thread(target=self.process_pipeline_loop).start()
     def first_translation_time(self):
@@ -400,7 +369,7 @@ class SimulevalTranscoder:
     def get_buffered_output(self) -> SpeechAndTextOutput:
         now = time.time() * 1000
-        self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
         while not self.output_queue.empty():
             tmp_out = self.get_output_segment()
             if tmp_out and tmp_out.compute_length(self.g2p) > 0:
@@ -452,4 +421,4 @@ class SimulevalTranscoder:
         self.output_buffer.append(segment.segments)
     def _compute_phoneme_count(self, string: str) -> int:
-        return len([x for x in self.g2p(string) if x != " "])

 import random
 import colorlog
 MODEL_SAMPLE_RATE = 16_000
 logger.setLevel(logging.DEBUG)
 class SpeechAndTextOutput:
     def __init__(
         self,
                 for segment in segment_list:
                     speech_out += segment.content
                 output.speech_samples = speech_out
+                output.speech_sample_rate = segment.sample_rate
             elif isinstance(segment_list[0], EmptySegment):
                 continue
             else:
     return waveform, sample_rate
 class SimulevalTranscoder:
+    def __init__(self, agent, sample_rate, debug, buffer_limit):
+        # agent is stateless
+        self.agent = agent
         self.input_queue = asyncio.Queue()
         self.output_queue = asyncio.Queue()
         self.states = self.agent.build_states()
         )
         # # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
         self.input_queue.put_nowait(segment)
+        print("process_incoming: put input_queue")
     def get_input_segment(self):
         if self.input_queue.empty():
                 self.first_input_ts = self.get_states_root().first_input_ts
             if not output_segment.is_empty:
+                print("PUT IN OUTPUT QUEUE")
                 self.output_queue.put_nowait(output_segment)
             if output_segment.finished:
+                print("OUTPUT SEGMENT IS FINISHED. Resetting states.")
                 self.reset_states()
         if self.close:
             return  # closes the thread
+        print("processing_pipeline")
         while not self.close:
             input_segment = self.get_input_segment()
             if input_segment is None:
+                if self.get_states_root().is_fresh_state:  # TODO: this is hacky
+                    time.sleep(0.3)
+                    print("loop: input_queue empty")
+                else:
+                    time.sleep(0.03)
                 continue
+            print("loop: got input_segment")
             self.process_pipeline_impl(input_segment)
+        print("finished processing_pipeline")
     def process_pipeline_once(self):
         if self.close:
         return output_chunk
     def start(self):
+        print("starting transcoder in a thread")
         threading.Thread(target=self.process_pipeline_loop).start()
     def first_translation_time(self):
     def get_buffered_output(self) -> SpeechAndTextOutput:
         now = time.time() * 1000
+        print(f"get_buffered_output queue size: {self.output_queue.qsize()}")
         while not self.output_queue.empty():
             tmp_out = self.get_output_segment()
             if tmp_out and tmp_out.compute_length(self.g2p) > 0:
         self.output_buffer.append(segment.segments)
     def _compute_phoneme_count(self, string: str) -> int:
+        return len([x for x in self.g2p(string) if x != " "])