MahmoudAshraf
/

mms-300m-1130-forced-aligner

Automatic Speech Recognition

forced-alignment

Inference Endpoints

Model card | Files and versions | Community

MahmoudAshraf committed on Jun 17

Commit

4f7a07c

•

1 Parent(s): 4d9d54e

update python usage instructions

Files changed (1) hide show

README.md +9 -12

README.md CHANGED Viewed

@@ -177,6 +177,7 @@ pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
 ## Usage
 ```python
 from ctc_forced_aligner import (
     load_audio,
     load_alignment_model,
@@ -190,35 +191,32 @@ from ctc_forced_aligner import (
 audio_path = "your/audio/path"
 text_path = "your/text/path"
 language = "iso" # ISO-639-3 Language code
-audio_waveform = load_audio(audio_path, model.dtype, model.device)
-emissions, stride = generate_emissions(
-    model, audio_waveform, args.window_size, args.context_size, args.batch_size
-)
-with open(text_path, "r") as f:
-    lines = f.readlines()
-text = "".join(line for line in lines).replace("\n", " ").strip()
 alignment_model, alignment_tokenizer, alignment_dictionary = load_alignment_model(
     device,
     dtype=torch.float16 if device == "cuda" else torch.float32,
 )
 emissions, stride = generate_emissions(
     alignment_model, audio_waveform, batch_size=batch_size
 )
 tokens_starred, text_starred = preprocess_text(
     text,
     romanize=True,
     language=language,
 )
 segments, scores, blank_id = get_alignments(
     emissions,
     tokens_starred,
@@ -228,5 +226,4 @@ segments, scores, blank_id = get_alignments(
 spans = get_spans(tokens_starred, segments, alignment_tokenizer.decode(blank_id))
 word_timestamps = postprocess_results(text_starred, spans, stride, scores)
 ```

 ## Usage
 ```python
+import torch
 from ctc_forced_aligner import (
     load_audio,
     load_alignment_model,
 audio_path = "your/audio/path"
 text_path = "your/text/path"
 language = "iso" # ISO-639-3 Language code
+device = "cuda" if torch.cuda.is_available() else "cpu"
+batch_size = 16
 alignment_model, alignment_tokenizer, alignment_dictionary = load_alignment_model(
     device,
     dtype=torch.float16 if device == "cuda" else torch.float32,
 )
+audio_waveform = load_audio(audio_path, alignment_model.dtype, alignment_model.device)
+with open(text_path, "r") as f:
+    lines = f.readlines()
+text = "".join(line for line in lines).replace("\n", " ").strip()
 emissions, stride = generate_emissions(
     alignment_model, audio_waveform, batch_size=batch_size
 )
 tokens_starred, text_starred = preprocess_text(
     text,
     romanize=True,
     language=language,
 )
 segments, scores, blank_id = get_alignments(
     emissions,
     tokens_starred,
 spans = get_spans(tokens_starred, segments, alignment_tokenizer.decode(blank_id))
 word_timestamps = postprocess_results(text_starred, spans, stride, scores)
 ```