AlexandraDolidze committed on
Commit 3d3ef8a
1 Parent(s): f2c3ec9

Upload 2 files

Files changed (2)
  1. app.py +23 -13
  2. hf_inference.py +7 -7
app.py CHANGED
@@ -1,18 +1,28 @@
  import streamlit as st
- import hf_inference
+ from hf_inference import infer_multimodal_model
 
- # dictionary of model paths
- models_dict = ['text_model_path' : text_model_path,
-                'video_model_path' : video_model_path,
-                'audio_model_path': audio_model_path]
+ paths = {
+     'text_model_path': 'files/bert-large-uncased_none_seed-42.pt',
+     'video_model_path': 'files/XCLIP_Augmented.pt',
+     'audio_model_path': 'files/1d_cnn_with_opensmile.pt',
+     'multimodal_model_path': 'files/multimodal_model_with_early_fusion.pt'
+ }
 
- st.title("Multimodal ERC project")
+ label2emoji = {'anger': '😠', 'disgust': '🤢', 'fear': '😨', 'joy': '😄', 'neutral': '😶', 'sadness': '😔', 'surprise': '😯'}
 
- uploaded_file = st.file_uploader("Choose a video")
- input_text = st.text_area("Please, write transcript", '''That's obligatory.''')
+ uploaded_video = st.file_uploader('Upload your video')
+ text = st.text_input('Enter your text')
+ if uploaded_video is not None and text:
+     bytes_data = uploaded_video.getvalue()
+     video_path = 'input_video.mp4'
+     with open(video_path, 'wb') as f:
+         f.write(bytes_data)
 
- if uploaded_file is not None & input_text != '''That's obligatory.''':
-     output_emotion = infer_multimodal_model(input_text, uploaded_file, models_dict)
-     # feed the video and text into inference
-     # get the emotion output
-     st.write(f"We think that's {output_emotion}")
+     st.divider()
+     st.subheader('Input Video')
+     st.video(bytes_data)
+     st.subheader('Input Text')
+     st.write(text)
+     label = infer_multimodal_model(text=text, video_path=video_path, model_pathes=paths)
+     st.subheader('Video Emotion')
+     st.write(f'{label} {label2emoji[label] * 3}')
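The reworked app.py writes the uploaded bytes to input_video.mp4 before calling infer_multimodal_model, because the downstream feature extraction (e.g. smile.process_files in hf_inference.py) reads from a file path rather than from the in-memory upload. A minimal sketch of the same save-then-infer flow, assuming a per-session temporary file instead of the fixed filename (the tempfile usage is illustrative and not part of this commit):

import tempfile

import streamlit as st

uploaded_video = st.file_uploader('Upload your video')
if uploaded_video is not None:
    # Persist the upload so downstream readers that expect a filesystem path
    # can open it; a NamedTemporaryFile avoids clashes between concurrent sessions.
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp:
        tmp.write(uploaded_video.getvalue())
        video_path = tmp.name
    st.video(uploaded_video.getvalue())
    # video_path can then be handed to infer_multimodal_model(...) as above.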
hf_inference.py CHANGED
@@ -195,26 +195,26 @@ def prepare_models(num_labels: int,
                     text_model_path: str,
                     video_model_path: str,
                     audio_model_path: str,
-                    device: str='cuda'):
+                    device: str='cpu'):
      # TEXT
      text_model_name = 'bert-large-uncased'
      text_base_model = AutoModelForSequenceClassification.from_pretrained(
          text_model_name,
          num_labels=num_labels
      )
-     state_dict = torch.load(text_model_path)
+     state_dict = torch.load(text_model_path, map_location=torch.device('cpu'))
      text_base_model.load_state_dict(state_dict, strict=False)
      text_model = TextClassificationModel(text_base_model, device=device)
 
      # VIDEO
      video_base_model = XCLIPClassificationModel(num_labels)
-     state_dict = torch.load(video_model_path)
+     state_dict = torch.load(video_model_path, map_location=torch.device('cpu'))
      video_base_model.load_state_dict(state_dict, strict=False)
      video_model = VideoClassificationModel(video_base_model, device=device)
 
      # AUDIO
      audio_base_model = ConvNet(num_labels)
-     checkpoint = torch.load(audio_model_path)
+     checkpoint = torch.load(audio_model_path, map_location=torch.device('cpu'))
      audio_base_model.load_state_dict(checkpoint['model_state_dict'])
      audio_model = AudioClassificationModel(audio_base_model, device=device)
 
@@ -274,7 +274,7 @@ def prepare_data_input(text: str,
          verbose=True,
      )
      audio_features = smile.process_files([video_path])
-     redundant_feat = open('redundant_feat.txt').read().split(',')
+     redundant_feat = open('files/redundant_feat.txt').read().split(',')
      audio_features.drop(columns=redundant_feat, inplace=True)
      # TEXT
      text_model_name = 'bert-large-uncased'
@@ -304,9 +304,9 @@ def infer_multimodal_model(text: str,
          input_size=4885,
          hidden_size=512
      )
-     checkpoint = torch.load(model_pathes['multimodal_model_path'])
+     checkpoint = torch.load(model_pathes['multimodal_model_path'], map_location=torch.device('cpu'))
      multi_model.load_state_dict(checkpoint)
-     device = 'cuda'
+     device = 'cpu'
      final_model = MainModel(multi_model, device=device)
      batch = prepare_data_input(text, video_path)
      label = final_model(batch).detach().cpu().tolist()
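The recurring edit in hf_inference.py targets running on a CPU-only host: torch.load restores tensors onto the device they were saved from, so checkpoints written on CUDA fail to deserialize on a machine without a GPU unless map_location remaps them. A minimal, self-contained sketch of the pattern (the Linear model and checkpoint name are placeholders for illustration, not files from this repo):

import torch
import torch.nn as nn

# Placeholder model standing in for the text/video/audio/multimodal checkpoints.
model = nn.Linear(10, 2)
torch.save(model.state_dict(), 'example_model.pt')

# map_location moves every stored tensor onto the CPU during deserialization,
# so the same call works whether or not torch.cuda.is_available().
state_dict = torch.load('example_model.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)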