wsntxxn committed
Commit 1d09ca2
1 Parent(s): 90bb951

Update README.md

Files changed (1)
  1. README.md +8 -0
README.md CHANGED
@@ -22,7 +22,10 @@ pip install numpy torch torchaudio einops transformers efficientnet_pytorch
 import torch
 from transformers import AutoModel, PreTrainedTokenizerFast
 import torchaudio
+
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 # use the model trained on AudioCaps
 model = AutoModel.from_pretrained(
     "wsntxxn/effb2-trm-audiocaps-captioning",
@@ -31,6 +34,7 @@ model = AutoModel.from_pretrained(
 tokenizer = PreTrainedTokenizerFast.from_pretrained(
     "wsntxxn/audiocaps-simple-tokenizer"
 )
+
 # inference on a single audio clip
 wav, sr = torchaudio.load("/path/to/file.wav")
 wav = torchaudio.functional.resample(wav, sr, model.config.sample_rate)
@@ -43,14 +47,18 @@ with torch.no_grad():
     )
 caption = tokenizer.decode(word_idxs[0], skip_special_tokens=True)
 print(caption)
+
 # inference on a batch
 wav1, sr1 = torchaudio.load("/path/to/file1.wav")
 wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
 wav1 = wav1.mean(0) if wav1.size(0) > 1 else wav1[0]
+
 wav2, sr2 = torchaudio.load("/path/to/file2.wav")
 wav2 = torchaudio.functional.resample(wav2, sr2, model.config.sample_rate)
 wav2 = wav2.mean(0) if wav2.size(0) > 1 else wav2[0]
+
 wav_batch = torch.nn.utils.rnn.pad_sequence([wav1, wav2], batch_first=True)
+
 with torch.no_grad():
     word_idxs = model(
         audio=wav_batch,
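
The hunks above only change whitespace and show fragments of the README's usage example; in particular, the single-clip `model(...)` call and the tail of the batch call fall outside the diff context. Below is a minimal sketch of the single-clip pipeline assembled from the visible lines. The `trust_remote_code=True` flag, the `.to(device)` moves, and the `audio_length` keyword are assumptions filled in for illustration, not text taken from this commit.

```python
# Minimal sketch assembled from the diff context above.
# Assumptions (not shown in this commit): trust_remote_code=True,
# moving tensors to `device`, and an `audio_length` keyword on the
# forward call; check the full README for the exact signature.
import torch
import torchaudio
from transformers import AutoModel, PreTrainedTokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model trained on AudioCaps, with its matching tokenizer
model = AutoModel.from_pretrained(
    "wsntxxn/effb2-trm-audiocaps-captioning",
    trust_remote_code=True,  # assumption: custom model code hosted on the Hub
).to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "wsntxxn/audiocaps-simple-tokenizer"
)

# load a clip, resample to the model's rate, and mix down to mono
wav, sr = torchaudio.load("/path/to/file.wav")
wav = torchaudio.functional.resample(wav, sr, model.config.sample_rate)
wav = wav.mean(0) if wav.size(0) > 1 else wav[0]

with torch.no_grad():
    word_idxs = model(
        audio=wav.unsqueeze(0).to(device),
        audio_length=[wav.size(0)],  # assumption: true clip length in samples
    )
caption = tokenizer.decode(word_idxs[0], skip_special_tokens=True)
print(caption)
```

For the batch path shown in the last hunk, `pad_sequence` zero-pads the clips to a common length, so the model presumably also needs each clip's true length (as in the hypothetical `audio_length` above) to ignore the padding; the exact keyword should be verified against the full README.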