Yehor
/

hubert-uk

Yehor committed 29 days ago

Commit

8133fdc

•

1 Parent(s): 1f30811

Update run_demo.py

Files changed (1) hide show

run_demo.py CHANGED Viewed

@@ -12,7 +12,7 @@ device = "cuda:0"  # cuda:0, or cpu
 torch_dtype = torch.float16
 sampling_rate = 16_000
-model_name = "Yehor/mHuBERT-147-uk"
 testset_file = "examples.csv"
 # Load the test dataset
@@ -29,23 +29,22 @@ asr_model = HubertForCTC.from_pretrained(
 processor = Wav2Vec2Processor.from_pretrained(model_name)
-# A util function to make batches
 def make_batches(iterable, n=1):
     lx = len(iterable)
     for ndx in range(0, lx, n):
         yield iterable[ndx : min(ndx + n, lx)]
-# Temporary variables
 predictions_all = []
 references_all = []
-# Inference in the batched mode
 for batch in make_batches(samples, batch_size):
     paths = [it["path"] for it in batch]
     references = [it["text"] for it in batch]
-    # Extract audio
     audio_inputs = []
     for path in paths:
         audio_input, sampling_rate = torchaudio.load(path, backend="sox")
@@ -53,7 +52,7 @@ for batch in make_batches(samples, batch_size):
         audio_inputs.append(audio_input)
-    # Transcribe the audio
     inputs = processor(audio_inputs, sampling_rate=16_000, padding=True).input_values
     features = torch.tensor(np.array(inputs), dtype=torch_dtype).to(device)

 torch_dtype = torch.float16
 sampling_rate = 16_000
+model_name = "Yehor/hubert-uk"
 testset_file = "examples.csv"
 # Load the test dataset
 processor = Wav2Vec2Processor.from_pretrained(model_name)
+# A func to make batches
 def make_batches(iterable, n=1):
     lx = len(iterable)
     for ndx in range(0, lx, n):
         yield iterable[ndx : min(ndx + n, lx)]
 predictions_all = []
 references_all = []
+# Batched inference
 for batch in make_batches(samples, batch_size):
     paths = [it["path"] for it in batch]
     references = [it["text"] for it in batch]
+    # Extract audio features
     audio_inputs = []
     for path in paths:
         audio_input, sampling_rate = torchaudio.load(path, backend="sox")
         audio_inputs.append(audio_input)
+    # Transcribe
     inputs = processor(audio_inputs, sampling_rate=16_000, padding=True).input_values
     features = torch.tensor(np.array(inputs), dtype=torch_dtype).to(device)