voidful committed on
Commit
45970cf
1 Parent(s): a494388

add README

Browse files
Files changed (1) hide show
  1. README.md +83 -0
README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - librispeech
5
+ tags:
6
+ - audio
7
+ - automatic-speech-recognition
8
+ - speech
9
+ - asr
10
+ - hubert
11
+ license: apache-2.0
12
+ metrics:
13
+ - wer
14
+ - cer
15
+ ---
16
+
17
+ # voidful/asr_hubert_cluster_bart_base
18
+
19
+
20
+ ## Usage
21
+ Download the k-means model and a sample audio file:
22
+ ```shell
23
+ wget https://raw.githubusercontent.com/voidful/hubert-cluster-code/main/km_feat_100_layer_20
24
+ wget https://cdn-media.huggingface.co/speech_samples/sample1.flac
25
+ ```
26
+
27
+ Compute HuBERT k-means cluster codes:
28
+ ```python
29
+ import joblib
30
+ import torch
31
+ from transformers import Wav2Vec2FeatureExtractor, HubertModel
32
+ import soundfile as sf
33
+
34
+
35
class HubertCode:
    """Convert an audio file into a sequence of HuBERT k-means cluster ids.

    Features are taken from one hidden layer of a HuBERT model and each frame
    is assigned the id of its nearest k-means centroid (squared Euclidean
    distance).
    """

    def __init__(self, hubert_model, km_path, km_layer):
        """Load the feature extractor, the HuBERT model and the k-means model.

        Args:
            hubert_model: Model name or path handed to ``from_pretrained``.
            km_path: Path of the k-means model file, loaded with ``joblib``.
            km_layer: Index into the model's ``hidden_states`` tuple selecting
                the layer whose features are clustered.
        """
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model)
        self.model = HubertModel.from_pretrained(hubert_model)
        # NOTE(review): joblib.load unpickles the file, so only load k-means
        # files from a source you trust.
        self.km_model = joblib.load(km_path)
        self.km_layer = km_layer

        # Centroids are transposed so that ``features @ C`` yields one column
        # per cluster; Cnorm holds each centroid's squared norm as a row.
        self.C_np = self.km_model.cluster_centers_.transpose()
        self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)

        self.C = torch.from_numpy(self.C_np)
        self.Cnorm = torch.from_numpy(self.Cnorm_np)
        if torch.cuda.is_available():
            self.C = self.C.cuda()
            self.Cnorm = self.Cnorm.cuda()
            self.model = self.model.cuda()

    def __call__(self, filepath, sampling_rate=None):
        """Return the cluster id of every feature frame of an audio file.

        Args:
            filepath: Audio file to read with ``soundfile``.
            sampling_rate: Ignored; kept only for backward compatibility. The
                rate stored in the file is what is passed to the feature
                extractor.

        Returns:
            A NumPy array with one cluster id per feature frame.
        """
        speech, sr = sf.read(filepath)
        if speech.ndim > 1:
            # Multi-channel files are read as (frames, channels); average the
            # channels so the feature extractor receives a single mono signal
            # instead of treating the 2-D array as a batch.
            speech = speech.mean(axis=1)

        input_values = self.processor(
            speech, return_tensors="pt", sampling_rate=sr
        ).input_values
        if torch.cuda.is_available():
            input_values = input_values.cuda()

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            hidden_states = self.model(
                input_values, output_hidden_states=True
            ).hidden_states
            # Drop only the batch axis (assumes a single-item batch); a bare
            # squeeze() would also drop the frame axis for one-frame inputs.
            x = hidden_states[self.km_layer].squeeze(0)

            # Match the centroid dtype to the features so matmul cannot fail
            # on a float64/float32 mismatch; a no-op when they already agree.
            centers = self.C.to(x.dtype)
            center_norms = self.Cnorm.to(x.dtype)

            # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2 for every frame/centroid.
            dist = (
                x.pow(2).sum(1, keepdim=True)
                - 2 * torch.matmul(x, centers)
                + center_norms
            )
            return dist.argmin(dim=1).cpu().numpy()
64
+ ```
65
+ Encode the sample audio into cluster ids:
66
+ ```python
67
+ hc = HubertCode("facebook/hubert-large-ll60k", './km_feat_100_layer_20', 20)
68
+ voice_ids = hc('./sample1.flac')
69
+ ```
70
+ Load the BART tokenizer and model:
71
+ ````python
72
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
73
+ tokenizer = AutoTokenizer.from_pretrained("voidful/asr_hubert_cluster_bart_base")
74
+ model = AutoModelForSeq2SeqLM.from_pretrained("voidful/asr_hubert_cluster_bart_base")
75
+ ````
76
+ Generate the transcription:
77
+ ```python
78
# Write each cluster id as a ":vtok<id>:" string and concatenate them into
# the text that is fed to the tokenizer.
voice_text = "".join(f":vtok{i}:" for i in voice_ids)
encoded_ids = tokenizer(voice_text, return_tensors='pt').input_ids

# Generate up to 1024 tokens and print the decoded text of the first sequence.
gen_output = model.generate(input_ids=encoded_ids, max_length=1024)
decoded_text = tokenizer.decode(gen_output[0], skip_special_tokens=True)
print(decoded_text)
80
+ ```
81
+
82
+ ## Result
83
+ `going along slushy country roads and speaking to damp audience in drifty school rooms day after day for a fortnight he'll have to put in an appearance at some place of worship on sunday morning and he can come to ask immediately afterwards`