Spaces: AlexandraDolidze (status: Runtime error)

Commit 3d3ef8a • committed by AlexandraDolidze • 1 parent: f2c3ec9

Upload 2 files

Files changed:
- app.py: +23, -13
- hf_inference.py: +7, -7
app.py
CHANGED
@@ -1,18 +1,28 @@
 import streamlit as st
-import
+from hf_inference import infer_multimodal_model
 
-
-
-
-
+paths = {
+    'text_model_path': 'files/bert-large-uncased_none_seed-42.pt',
+    'video_model_path': 'files/XCLIP_Augmented.pt',
+    'audio_model_path': 'files/1d_cnn_with_opensmile.pt',
+    'multimodal_model_path': 'files/multimodal_model_with_early_fusion.pt'
+}
 
-
+label2emoji = {'anger': '😠', 'disgust': '🤢', 'fear': '😨', 'joy': '😄', 'neutral': '😶', 'sadness': '😔', 'surprise': '😯'}
 
-
-
+uploaded_video = st.file_uploader('Upload your video')
+text = st.text_input('Enter your text')
+if uploaded_video is not None and text:
+    bytes_data = uploaded_video.getvalue()
+    video_path = 'input_video.mp4'
+    with open(video_path, 'wb') as f:
+        f.write(bytes_data)
 
-
-
-
-
-st.write(
+    st.divider()
+    st.subheader('Input Video')
+    st.video(bytes_data)
+    st.subheader('Input Text')
+    st.write(text)
+    label = infer_multimodal_model(text=text, video_path=video_path, model_pathes=paths)
+    st.subheader('Video Emotion')
+    st.write(f'{label} {label2emoji[label] * 3}')
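One hedged note on the new final line: `label2emoji[label]` raises a KeyError if the model ever returns a label outside the seven listed keys. A minimal defensive drop-in for that last line, assuming the same `label` and `label2emoji` as above (the '❓' fallback is illustrative, not part of the commit):

# Hypothetical variant of the last line of app.py: fall back to a
# placeholder emoji instead of raising KeyError on an unexpected label.
emoji = label2emoji.get(label, '❓')
st.write(f'{label} {emoji * 3}')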
hf_inference.py
CHANGED
@@ -195,26 +195,26 @@ def prepare_models(num_labels: int,
                     text_model_path: str,
                     video_model_path: str,
                     audio_model_path: str,
-                    device: str='
+                    device: str='cpu'):
     # TEXT
     text_model_name = 'bert-large-uncased'
     text_base_model = AutoModelForSequenceClassification.from_pretrained(
         text_model_name,
         num_labels=num_labels
     )
-    state_dict = torch.load(text_model_path)
+    state_dict = torch.load(text_model_path, map_location=torch.device('cpu'))
     text_base_model.load_state_dict(state_dict, strict=False)
     text_model = TextClassificationModel(text_base_model, device=device)
 
     # VIDEO
     video_base_model = XCLIPClassificationModel(num_labels)
-    state_dict = torch.load(video_model_path)
+    state_dict = torch.load(video_model_path, map_location=torch.device('cpu'))
     video_base_model.load_state_dict(state_dict, strict=False)
     video_model = VideoClassificationModel(video_base_model, device=device)
 
     # AUDIO
     audio_base_model = ConvNet(num_labels)
-    checkpoint = torch.load(audio_model_path)
+    checkpoint = torch.load(audio_model_path, map_location=torch.device('cpu'))
     audio_base_model.load_state_dict(checkpoint['model_state_dict'])
     audio_model = AudioClassificationModel(audio_base_model, device=device)
 
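For readers unfamiliar with the fix: the three `map_location` arguments are what let checkpoints that were presumably saved on a CUDA machine deserialize on the CPU-only Space hardware. A self-contained sketch of the pattern, using stock PyTorch only (the `nn.Linear` model and `demo.pt` path are placeholders, not from this repo):

import torch
import torch.nn as nn

# Stand-in model and checkpoint; the real files are the .pt paths above.
model = nn.Linear(16, 7)
torch.save(model.state_dict(), 'demo.pt')

# map_location remaps every storage in the checkpoint onto the CPU, so the
# load succeeds even when torch.cuda.is_available() is False.
state_dict = torch.load('demo.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)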
@@ -274,7 +274,7 @@ def prepare_data_input(text: str,
         verbose=True,
     )
     audio_features = smile.process_files([video_path])
-    redundant_feat = open('redundant_feat.txt').read().split(',')
+    redundant_feat = open('files/redundant_feat.txt').read().split(',')
     audio_features.drop(columns=redundant_feat, inplace=True)
     # TEXT
     text_model_name = 'bert-large-uncased'
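Beyond the corrected `files/` prefix, the bare `open(...).read()` on that line leaves the file handle for the garbage collector to close. A behavior-preserving sketch with a context manager (the added `strip()` is an assumption about a possible trailing newline in the file):

# Hypothetical tidier read of the redundant-feature list.
with open('files/redundant_feat.txt') as f:
    redundant_feat = f.read().strip().split(',')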
@@ -304,9 +304,9 @@ def infer_multimodal_model(text: str,
         input_size=4885,
         hidden_size=512
     )
-    checkpoint = torch.load(model_pathes['multimodal_model_path'])
+    checkpoint = torch.load(model_pathes['multimodal_model_path'], map_location=torch.device('cpu'))
     multi_model.load_state_dict(checkpoint)
-    device = '
+    device = 'cpu'
     final_model = MainModel(multi_model, device=device)
     batch = prepare_data_input(text, video_path)
     label = final_model(batch).detach().cpu().tolist()
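The commit pins `device = 'cpu'`, which matches the Space's hardware but gives up GPU execution elsewhere. A sketch of a drop-in alternative inside `infer_multimodal_model`, assuming stock PyTorch (nothing here is in the commit):

import torch

# Prefer CUDA when present; fall back to CPU otherwise. map_location then
# targets whichever device was selected.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
checkpoint = torch.load(model_pathes['multimodal_model_path'],
                        map_location=torch.device(device))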
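Putting the two files together, the flow after this commit is: app.py writes the uploaded video to disk, then calls `infer_multimodal_model` with the checkpoint paths. A sketch of the same call outside Streamlit, assuming the checkpoints exist under `files/` and that `sample.mp4` is a local test clip (both placeholders):

from hf_inference import infer_multimodal_model

paths = {
    'text_model_path': 'files/bert-large-uncased_none_seed-42.pt',
    'video_model_path': 'files/XCLIP_Augmented.pt',
    'audio_model_path': 'files/1d_cnn_with_opensmile.pt',
    'multimodal_model_path': 'files/multimodal_model_with_early_fusion.pt'
}

# app.py indexes label2emoji with the result, so it is presumably one of:
# anger, disgust, fear, joy, neutral, sadness, surprise.
label = infer_multimodal_model(text='I am so happy today!',
                               video_path='sample.mp4',
                               model_pathes=paths)
print(label)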