Gizachew committed
Commit 566ae0a
1 Parent(s): b194bfd

Upload 11 files

a5-03-02-01-29.wav ADDED
Binary file (96 kB).
 
app.py ADDED
@@ -0,0 +1,54 @@
+ import tempfile
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ import gradio as gr
+ from transformers import Wav2Vec2FeatureExtractor, AutoConfig
+ from models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification
+
+ # Load model and feature extractor
+ config = AutoConfig.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
+ model = Wav2Vec2ForSpeechClassification.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
+ sampling_rate = feature_extractor.sampling_rate
+
+ # Define inputs and outputs for the Gradio interface
+ audio_input = gr.Audio(label="Upload file", type="filepath")
+ text_output = gr.TextArea(label="Emotion Prediction Output", text_align="right", rtl=True, type="text")
+
+ def SER(audio):
+     with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
+         # Copy the contents of the uploaded audio file to the temporary file
+         temp_audio_file.write(open(audio, "rb").read())
+         temp_audio_file.flush()
+         # Load the audio file using torchaudio
+         speech_array, _sampling_rate = torchaudio.load(temp_audio_file.name)
+         resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)  # resample to the feature extractor's rate
+         speech = resampler(speech_array).squeeze().numpy()
+         inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+         inputs = {key: inputs[key] for key in inputs}  # unpack the BatchFeature into a plain dict
+
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+         # Get the highest score and its corresponding label
+         max_index = scores.argmax()
+         label = config.id2label[max_index]
+         score = scores[max_index]
+
+         # Format the output string
+         output = f"{label}: {score * 100:.1f}%"
+
+         return output
+
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=SER,
+     inputs=audio_input,
+     outputs=text_output
+ )
+
+ # Launch the Gradio app
+ iface.launch(share=True)
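
Note: the commit also uploads several sample clips (listed below as binary files). A natural way to surface them in the UI, shown here only as a hedged sketch rather than part of the committed app.py, is to pass them to gr.Interface as one-click examples:

    # Hypothetical variant of the Interface construction above; the bundled
    # wav files from this commit are offered as clickable examples in the UI.
    iface = gr.Interface(
        fn=SER,
        inputs=audio_input,
        outputs=text_output,
        examples=[
            "a5-03-02-01-29.wav",
            "f2-01-01-02-50.wav",
            "h3-02-01-01-19.wav",
            "n1-02-01-01-47.wav",
            "s4-02-01-02-28.wav",
        ],
    )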
collator.py ADDED
@@ -0,0 +1,35 @@
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Union
+ import torch
+
+ import transformers
+ from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
+
+
+ @dataclass
+ class DataCollatorCTCWithPadding:
+
+     feature_extractor: Wav2Vec2FeatureExtractor
+     padding: Union[bool, str] = True
+     max_length: Optional[int] = None
+     max_length_labels: Optional[int] = None
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [feature["labels"] for feature in features]
+
+         d_type = torch.long if isinstance(label_features[0], int) else torch.float
+
+         batch = self.feature_extractor.pad(
+             input_features,
+             padding=self.padding,
+             max_length=self.max_length,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+
+         batch["labels"] = torch.tensor(label_features, dtype=d_type)
+
+         return batch
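
For context, DataCollatorCTCWithPadding expects each training example to be a dict with raw input_values (a list or 1-D tensor of floats) and an integer labels entry; it pads the audio to a common length and stacks the labels. A minimal sketch with made-up examples (the checkpoint name is reused from app.py):

    # Hypothetical usage; the two dummy utterances have different lengths,
    # so the collator pads them to the longer one.
    from transformers import Wav2Vec2FeatureExtractor
    from collator import DataCollatorCTCWithPadding

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("Gizachew/wev2vec-large960-agu-amharic")
    data_collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

    examples = [
        {"input_values": [0.01] * 16000, "labels": 2},
        {"input_values": [0.02] * 12000, "labels": 0},
    ]
    batch = data_collator(examples)
    print(batch["input_values"].shape)  # torch.Size([2, 16000])
    print(batch["labels"])              # tensor([2, 0])

In training it is passed to a Trainer via the data_collator argument (see trainer.py below).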
f2-01-01-02-50.wav ADDED
Binary file (125 kB).
 
h3-02-01-01-19.wav ADDED
Binary file (92.8 kB).
 
modeling_outputs.py ADDED
@@ -0,0 +1,12 @@
+ from dataclasses import dataclass
+ from typing import Optional, Tuple
+ import torch
+ from transformers.file_utils import ModelOutput
+
+
+ @dataclass
+ class SpeechClassifierOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
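
SpeechClassifierOutput inherits ModelOutput's dual dict/tuple behaviour, so its fields can be read by attribute, by key, or by position; a small illustration with a dummy tensor (None fields are skipped when indexing positionally):

    import torch
    from modeling_outputs import SpeechClassifierOutput

    out = SpeechClassifierOutput(logits=torch.zeros(1, 4))
    print(out.logits.shape)     # torch.Size([1, 4])
    print(out["logits"].shape)  # same tensor via dict-style access
    print(out[0].shape)         # loss is None, so index 0 is the logits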
models.py ADDED
@@ -0,0 +1,222 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
+     Wav2Vec2PreTrainedModel,
+     Wav2Vec2Model
+ )
+ from transformers.models.hubert.modeling_hubert import (
+     HubertPreTrainedModel,
+     HubertModel
+ )
+
+ from modeling_outputs import SpeechClassifierOutput
+
+
+ class Wav2Vec2ClassificationHead(nn.Module):
+     """Head for wav2vec classification task."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.pooling_mode = config.pooling_mode
+         self.config = config
+
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.classifier = Wav2Vec2ClassificationHead(config)
+
+         self.init_weights()
+
+     def freeze_feature_extractor(self):
+         self.wav2vec2.feature_extractor._freeze_parameters()
+
+     def merged_strategy(
+             self,
+             hidden_states,
+             mode="mean"
+     ):
+         if mode == "mean":
+             outputs = torch.mean(hidden_states, dim=1)
+         elif mode == "sum":
+             outputs = torch.sum(hidden_states, dim=1)
+         elif mode == "max":
+             outputs = torch.max(hidden_states, dim=1)[0]
+         else:
+             raise Exception(
+                 "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+
+         return outputs
+
+     def forward(
+             self,
+             input_values,
+             attention_mask=None,
+             output_attentions=None,
+             output_hidden_states=None,
+             return_dict=None,
+             labels=None,
+     ):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         outputs = self.wav2vec2(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = outputs[0]
+         hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SpeechClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class HubertClassificationHead(nn.Module):
+     """Head for hubert classification task."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class HubertForSpeechClassification(HubertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.pooling_mode = config.pooling_mode
+         self.config = config
+
+         self.hubert = HubertModel(config)
+         self.classifier = HubertClassificationHead(config)
+
+         self.init_weights()
+
+     def freeze_feature_extractor(self):
+         self.hubert.feature_extractor._freeze_parameters()
+
+     def merged_strategy(
+             self,
+             hidden_states,
+             mode="mean"
+     ):
+         if mode == "mean":
+             outputs = torch.mean(hidden_states, dim=1)
+         elif mode == "sum":
+             outputs = torch.sum(hidden_states, dim=1)
+         elif mode == "max":
+             outputs = torch.max(hidden_states, dim=1)[0]
+         else:
+             raise Exception(
+                 "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+
+         return outputs
+
+     def forward(
+             self,
+             input_values,
+             attention_mask=None,
+             output_attentions=None,
+             output_hidden_states=None,
+             return_dict=None,
+             labels=None,
+     ):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         outputs = self.hubert(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = outputs[0]
+         hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SpeechClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
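
As a sketch of how Wav2Vec2ForSpeechClassification is driven outside the Gradio app (same checkpoint as app.py; the wav path is one of the samples uploaded in this commit):

    import torch
    import torchaudio
    from transformers import AutoConfig, Wav2Vec2FeatureExtractor
    from models import Wav2Vec2ForSpeechClassification

    model_id = "Gizachew/wev2vec-large960-agu-amharic"
    config = AutoConfig.from_pretrained(model_id)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
    model = Wav2Vec2ForSpeechClassification.from_pretrained(model_id).eval()

    # Load a bundled sample and resample it to the feature extractor's rate.
    waveform, sr = torchaudio.load("f2-01-01-02-50.wav")
    waveform = torchaudio.functional.resample(waveform, sr, feature_extractor.sampling_rate)

    inputs = feature_extractor(waveform.squeeze().numpy(),
                               sampling_rate=feature_extractor.sampling_rate,
                               return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits  # SpeechClassifierOutput.logits, shape [1, num_labels]
    probs = torch.softmax(logits, dim=-1)[0]
    print({config.id2label[i]: round(p.item(), 3) for i, p in enumerate(probs)})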
n1-02-01-01-47.wav ADDED
Binary file (96 kB).
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ torch
+ torchaudio
s4-02-01-02-28.wav ADDED
Binary file (96 kB).
 
trainer.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Any, Dict, Union
+
+ import torch
+ from packaging import version
+ from torch import nn
+
+ from transformers import (
+     Trainer,
+     is_apex_available,
+ )
+
+ if is_apex_available():
+     from apex import amp
+
+ if version.parse(torch.__version__) >= version.parse("1.6"):
+     _is_native_amp_available = True
+     from torch.cuda.amp import autocast
+
+
+ class CTCTrainer(Trainer):
+     def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
+         model.train()
+         inputs = self._prepare_inputs(inputs)
+
+         if self.use_amp:
+             with autocast():
+                 loss = self.compute_loss(model, inputs)
+         else:
+             loss = self.compute_loss(model, inputs)
+
+         if self.args.gradient_accumulation_steps > 1:
+             loss = loss / self.args.gradient_accumulation_steps
+
+         if self.use_amp:
+             self.scaler.scale(loss).backward()
+         elif self.use_apex:
+             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                 scaled_loss.backward()
+         elif self.deepspeed:
+             self.deepspeed.backward(loss)
+         else:
+             loss.backward()
+
+         return loss.detach()
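
CTCTrainer only overrides training_step (making the AMP/apex/DeepSpeed loss scaling explicit); everything else is inherited from transformers.Trainer. A rough wiring sketch, assuming the older transformers release this file targets (where Trainer still exposes use_amp and use_apex) and using a tiny in-memory placeholder dataset just to show the expected example format:

    from transformers import TrainingArguments, Wav2Vec2FeatureExtractor
    from models import Wav2Vec2ForSpeechClassification
    from collator import DataCollatorCTCWithPadding
    from trainer import CTCTrainer

    model_id = "Gizachew/wev2vec-large960-agu-amharic"
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
    model = Wav2Vec2ForSpeechClassification.from_pretrained(model_id)
    model.freeze_feature_extractor()  # keep the convolutional front end fixed

    # Placeholder examples; real training data would come from a processed dataset.
    train_dataset = [
        {"input_values": [0.0] * 16000, "labels": 0},
        {"input_values": [0.1] * 12000, "labels": 1},
    ]

    training_args = TrainingArguments(
        output_dir="./ser-checkpoints",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        learning_rate=1e-4,
    )

    trainer = CTCTrainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True),
        train_dataset=train_dataset,
        tokenizer=feature_extractor,
    )
    trainer.train()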