Gizachew committed
Commit 566ae0a
1 Parent(s): b194bfd

Upload 11 files

a5-03-02-01-29.wav ADDED
Binary file (96 kB).
 
app.py ADDED
@@ -0,0 +1,54 @@
+ import tempfile
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ import gradio as gr
+ from transformers import Wav2Vec2FeatureExtractor, AutoConfig
+ from models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification
+
+ # Load model and feature extractor
+ config = AutoConfig.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
+ model = Wav2Vec2ForSpeechClassification.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
+ sampling_rate = feature_extractor.sampling_rate
+
+ # Define inputs and outputs for the Gradio interface
+ audio_input = gr.Audio(label="Upload file", type="filepath")
+ text_output = gr.TextArea(label="Emotion Prediction Output", text_align="right", rtl=True, type="text")
+
+ def SER(audio):
+     with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
+         # Copy the contents of the uploaded audio file to the temporary file
+         temp_audio_file.write(open(audio, "rb").read())
+         temp_audio_file.flush()
+         # Load the audio file using torchaudio
+         speech_array, _sampling_rate = torchaudio.load(temp_audio_file.name)
+         resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)  # resample to the feature extractor's rate
+         speech = resampler(speech_array).squeeze().numpy()
+         inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+         inputs = {key: inputs[key] for key in inputs}  # unpack the BatchFeature into a plain dict
+
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+         # Get the highest score and its corresponding label
+         max_index = scores.argmax()
+         label = config.id2label[max_index]
+         score = scores[max_index]
+
+         # Format the output string
+         output = f"{label}: {score * 100:.1f}%"
+
+         return output
+
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=SER,
+     inputs=audio_input,
+     outputs=text_output
+ )
+
+ # Launch the Gradio app
+ iface.launch(share=True)
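
Note: the commit also uploads several sample clips (listed below as binary files). A natural way to surface them in the UI, shown here only as a hedged sketch rather than part of the committed app.py, is to pass them to gr.Interface as one-click examples:

    # Hypothetical variant of the Interface construction above; the bundled
    # wav files from this commit are offered as clickable examples in the UI.
    iface = gr.Interface(
        fn=SER,
        inputs=audio_input,
        outputs=text_output,
        examples=[
            "a5-03-02-01-29.wav",
            "f2-01-01-02-50.wav",
            "h3-02-01-01-19.wav",
            "n1-02-01-01-47.wav",
            "s4-02-01-02-28.wav",
        ],
    )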
collator.py ADDED
@@ -0,0 +1,35 @@
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Union
+ import torch
+
+ import transformers
+ from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
+
+
+ @dataclass
+ class DataCollatorCTCWithPadding:
+
+     feature_extractor: Wav2Vec2FeatureExtractor
+     padding: Union[bool, str] = True
+     max_length: Optional[int] = None
+     max_length_labels: Optional[int] = None
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [feature["labels"] for feature in features]
+
+         d_type = torch.long if isinstance(label_features[0], int) else torch.float
+
+         batch = self.feature_extractor.pad(
+             input_features,
+             padding=self.padding,
+             max_length=self.max_length,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+
+         batch["labels"] = torch.tensor(label_features, dtype=d_type)
+
+         return batch
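
For context, DataCollatorCTCWithPadding expects each training example to be a dict with raw input_values (a list or 1-D tensor of floats) and an integer labels entry; it pads the audio to a common length and stacks the labels. A minimal sketch with made-up examples (the checkpoint name is reused from app.py):

    # Hypothetical usage; the two dummy utterances have different lengths,
    # so the collator pads them to the longer one.
    from transformers import Wav2Vec2FeatureExtractor
    from collator import DataCollatorCTCWithPadding

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
    data_collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

    examples = [
        {"input_values": [0.01] * 16000, "labels": 2},
        {"input_values": [0.02] * 12000, "labels": 0},
    ]
    batch = data_collator(examples)
    print(batch["input_values"].shape)  # torch.Size([2, 16000])
    print(batch["labels"])              # tensor([2, 0])

In training it is passed to a Trainer via the data_collator argument (see trainer.py below).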
f2-01-01-02-50.wav ADDED
Binary file (125 kB).
 
h3-02-01-01-19.wav ADDED
Binary file (92.8 kB).
 
modeling_outputs.py ADDED
@@ -0,0 +1,12 @@
+ from dataclasses import dataclass
+ from typing import Optional, Tuple
+ import torch
+ from transformers.file_utils import ModelOutput
+
+
+ @dataclass
+ class SpeechClassifierOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
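
SpeechClassifierOutput inherits ModelOutput's dual dict/tuple behaviour, so its fields can be read by attribute, by key, or by position; a small illustration with a dummy tensor (None fields are skipped when indexing positionally):

    import torch
    from modeling_outputs import SpeechClassifierOutput

    out = SpeechClassifierOutput(logits=torch.zeros(1, 4))
    print(out.logits.shape)     # torch.Size([1, 4])
    print(out["logits"].shape)  # same tensor via dict-style access
    print(out[0].shape)         # loss is None, so index 0 is the logits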
models.py ADDED
@@ -0,0 +1,222 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
+     Wav2Vec2PreTrainedModel,
+     Wav2Vec2Model
+ )
+ from transformers.models.hubert.modeling_hubert import (
+     HubertPreTrainedModel,
+     HubertModel
+ )
+
+ from modeling_outputs import SpeechClassifierOutput
+
+
+ class Wav2Vec2ClassificationHead(nn.Module):
+     """Head for wav2vec classification task."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.pooling_mode = config.pooling_mode
+         self.config = config
+
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.classifier = Wav2Vec2ClassificationHead(config)
+
+         self.init_weights()
+
+     def freeze_feature_extractor(self):
+         self.wav2vec2.feature_extractor._freeze_parameters()
+
+     def merged_strategy(
+             self,
+             hidden_states,
+             mode="mean"
+     ):
+         if mode == "mean":
+             outputs = torch.mean(hidden_states, dim=1)
+         elif mode == "sum":
+             outputs = torch.sum(hidden_states, dim=1)
+         elif mode == "max":
+             outputs = torch.max(hidden_states, dim=1)[0]
+         else:
+             raise Exception(
+                 "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+
+         return outputs
+
+     def forward(
+             self,
+             input_values,
+             attention_mask=None,
+             output_attentions=None,
+             output_hidden_states=None,
+             return_dict=None,
+             labels=None,
+     ):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         outputs = self.wav2vec2(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = outputs[0]
+         hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SpeechClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class HubertClassificationHead(nn.Module):
+     """Head for hubert classification task."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class HubertForSpeechClassification(HubertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.pooling_mode = config.pooling_mode
+         self.config = config
+
+         self.hubert = HubertModel(config)
+         self.classifier = HubertClassificationHead(config)
+
+         self.init_weights()
+
+     def freeze_feature_extractor(self):
+         self.hubert.feature_extractor._freeze_parameters()
+
+     def merged_strategy(
+             self,
+             hidden_states,
+             mode="mean"
+     ):
+         if mode == "mean":
+             outputs = torch.mean(hidden_states, dim=1)
+         elif mode == "sum":
+             outputs = torch.sum(hidden_states, dim=1)
+         elif mode == "max":
+             outputs = torch.max(hidden_states, dim=1)[0]
+         else:
+             raise Exception(
+                 "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+
+         return outputs
+
+     def forward(
+             self,
+             input_values,
+             attention_mask=None,
+             output_attentions=None,
+             output_hidden_states=None,
+             return_dict=None,
+             labels=None,
+     ):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         outputs = self.hubert(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = outputs[0]
+         hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SpeechClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
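
As a sketch of how Wav2Vec2ForSpeechClassification is driven outside the Gradio app (same checkpoint as app.py; the wav path is one of the samples uploaded in this commit):

    import torch
    import torchaudio
    from transformers import AutoConfig, Wav2Vec2FeatureExtractor
    from models import Wav2Vec2ForSpeechClassification

    model_id = "Gizachew/wev2vec-large960-agu-amharic"
    config = AutoConfig.from_pretrained(model_id)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
    model = Wav2Vec2ForSpeechClassification.from_pretrained(model_id).eval()

    # Load a bundled sample and resample it to the feature extractor's rate.
    waveform, sr = torchaudio.load("f2-01-01-02-50.wav")
    waveform = torchaudio.functional.resample(waveform, sr, feature_extractor.sampling_rate)

    inputs = feature_extractor(waveform.squeeze().numpy(),
                               sampling_rate=feature_extractor.sampling_rate,
                               return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits  # SpeechClassifierOutput.logits, shape [1, num_labels]
    probs = torch.softmax(logits, dim=-1)[0]
    print({config.id2label[i]: round(p.item(), 3) for i, p in enumerate(probs)})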
n1-02-01-01-47.wav ADDED
Binary file (96 kB).
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ torch
+ torchaudio
s4-02-01-02-28.wav ADDED
Binary file (96 kB).
 
trainer.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Any, Dict, Union
+
+ import torch
+ from packaging import version
+ from torch import nn
+
+ from transformers import (
+     Trainer,
+     is_apex_available,
+ )
+
+ if is_apex_available():
+     from apex import amp
+
+ if version.parse(torch.__version__) >= version.parse("1.6"):
+     _is_native_amp_available = True
+     from torch.cuda.amp import autocast
+
+
+ class CTCTrainer(Trainer):
+     def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
+         model.train()
+         inputs = self._prepare_inputs(inputs)
+
+         if self.use_amp:
+             with autocast():
+                 loss = self.compute_loss(model, inputs)
+         else:
+             loss = self.compute_loss(model, inputs)
+
+         if self.args.gradient_accumulation_steps > 1:
+             loss = loss / self.args.gradient_accumulation_steps
+
+         if self.use_amp:
+             self.scaler.scale(loss).backward()
+         elif self.use_apex:
+             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                 scaled_loss.backward()
+         elif self.deepspeed:
+             self.deepspeed.backward(loss)
+         else:
+             loss.backward()
+
+         return loss.detach()
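
CTCTrainer only overrides training_step (making the AMP/apex/DeepSpeed loss scaling explicit); everything else is inherited from transformers.Trainer. A rough wiring sketch, assuming the older transformers release this file targets (where Trainer still exposes use_amp and use_apex) and using a tiny in-memory placeholder dataset just to show the expected example format:

    from transformers import TrainingArguments, Wav2Vec2FeatureExtractor
    from models import Wav2Vec2ForSpeechClassification
    from collator import DataCollatorCTCWithPadding
    from trainer import CTCTrainer

    model_id = "Gizachew/wev2vec-large960-agu-amharic"
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
    model = Wav2Vec2ForSpeechClassification.from_pretrained(model_id)
    model.freeze_feature_extractor()  # keep the convolutional front end fixed

    # Placeholder examples; real training data would come from a processed dataset.
    train_dataset = [
        {"input_values": [0.0] * 16000, "labels": 0},
        {"input_values": [0.1] * 12000, "labels": 1},
    ]

    training_args = TrainingArguments(
        output_dir="./ser-checkpoints",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        learning_rate=1e-4,
    )

    trainer = CTCTrainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True),
        train_dataset=train_dataset,
        tokenizer=feature_extractor,
    )
    trainer.train()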