AlexandraDolidze committed on
Commit 3d3ef8a
1 Parent(s): f2c3ec9

Upload 2 files

Files changed (2)
  1. app.py +23 -13
  2. hf_inference.py +7 -7
app.py CHANGED
@@ -1,18 +1,28 @@
  import streamlit as st
- import hf_inference
+ from hf_inference import infer_multimodal_model
 
- # dictionary of model paths
- models_dict = ['text_model_path' : text_model_path,
-                'video_model_path' : video_model_path,
-                'audio_model_path': audio_model_path]
+ paths = {
+     'text_model_path': 'files/bert-large-uncased_none_seed-42.pt',
+     'video_model_path': 'files/XCLIP_Augmented.pt',
+     'audio_model_path': 'files/1d_cnn_with_opensmile.pt',
+     'multimodal_model_path': 'files/multimodal_model_with_early_fusion.pt'
+ }
 
- st.title("Multimodal ERC project")
+ label2emoji = {'anger': '😠', 'disgust': '🤢', 'fear': '😨', 'joy': '😄', 'neutral': '😶', 'sadness': '😔', 'surprise': '😯'}
 
- uploaded_file = st.file_uploader("Choose a video")
- input_text = st.text_area("Please, write transcript", '''That's obligatory.''')
+ uploaded_video = st.file_uploader('Upload your video')
+ text = st.text_input('Enter your text')
+ if uploaded_video is not None and text:
+     bytes_data = uploaded_video.getvalue()
+     video_path = 'input_video.mp4'
+     with open(video_path, 'wb') as f:
+         f.write(bytes_data)
 
- if uploaded_file is not None & input_text != '''That's obligatory.''':
-     output_emotion = infer_multimodal_model(input_text, uploaded_file, models_dict)
-     # feed the video and text into inference
-     # get the emotion output
-     st.write(f"We think that's {output_emotion}")
+     st.divider()
+     st.subheader('Input Video')
+     st.video(bytes_data)
+     st.subheader('Input Text')
+     st.write(text)
+     label = infer_multimodal_model(text=text, video_path=video_path, model_pathes=paths)
+     st.subheader('Video Emotion')
+     st.write(f'{label} {label2emoji[label] * 3}')
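The reworked app.py writes the uploaded bytes to input_video.mp4 before calling infer_multimodal_model, because the downstream feature extraction (e.g. smile.process_files in hf_inference.py) reads from a file path rather than from the in-memory upload. A minimal sketch of the same save-then-infer flow, assuming a per-session temporary file instead of the fixed filename (the tempfile usage is illustrative and not part of this commit):

import tempfile

import streamlit as st

uploaded_video = st.file_uploader('Upload your video')
if uploaded_video is not None:
    # Persist the upload so downstream readers that expect a filesystem path
    # can open it; a NamedTemporaryFile avoids clashes between concurrent sessions.
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp:
        tmp.write(uploaded_video.getvalue())
        video_path = tmp.name
    st.video(uploaded_video.getvalue())
    # video_path can then be handed to infer_multimodal_model(...) as above.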
hf_inference.py CHANGED
@@ -195,26 +195,26 @@ def prepare_models(num_labels: int,
                     text_model_path: str,
                     video_model_path: str,
                     audio_model_path: str,
-                    device: str='cuda'):
+                    device: str='cpu'):
      # TEXT
      text_model_name = 'bert-large-uncased'
      text_base_model = AutoModelForSequenceClassification.from_pretrained(
          text_model_name,
          num_labels=num_labels
      )
-     state_dict = torch.load(text_model_path)
+     state_dict = torch.load(text_model_path, map_location=torch.device('cpu'))
      text_base_model.load_state_dict(state_dict, strict=False)
      text_model = TextClassificationModel(text_base_model, device=device)
 
      # VIDEO
      video_base_model = XCLIPClassificationModel(num_labels)
-     state_dict = torch.load(video_model_path)
+     state_dict = torch.load(video_model_path, map_location=torch.device('cpu'))
      video_base_model.load_state_dict(state_dict, strict=False)
      video_model = VideoClassificationModel(video_base_model, device=device)
 
      # AUDIO
      audio_base_model = ConvNet(num_labels)
-     checkpoint = torch.load(audio_model_path)
+     checkpoint = torch.load(audio_model_path, map_location=torch.device('cpu'))
      audio_base_model.load_state_dict(checkpoint['model_state_dict'])
      audio_model = AudioClassificationModel(audio_base_model, device=device)
 
@@ -274,7 +274,7 @@ def prepare_data_input(text: str,
          verbose=True,
      )
      audio_features = smile.process_files([video_path])
-     redundant_feat = open('redundant_feat.txt').read().split(',')
+     redundant_feat = open('files/redundant_feat.txt').read().split(',')
      audio_features.drop(columns=redundant_feat, inplace=True)
      # TEXT
      text_model_name = 'bert-large-uncased'
@@ -304,9 +304,9 @@ def infer_multimodal_model(text: str,
          input_size=4885,
          hidden_size=512
      )
-     checkpoint = torch.load(model_pathes['multimodal_model_path'])
+     checkpoint = torch.load(model_pathes['multimodal_model_path'], map_location=torch.device('cpu'))
      multi_model.load_state_dict(checkpoint)
-     device = 'cuda'
+     device = 'cpu'
      final_model = MainModel(multi_model, device=device)
      batch = prepare_data_input(text, video_path)
      label = final_model(batch).detach().cpu().tolist()
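The recurring edit in hf_inference.py targets running on a CPU-only host: torch.load restores tensors onto the device they were saved from, so checkpoints written on CUDA fail to deserialize on a machine without a GPU unless map_location remaps them. A minimal, self-contained sketch of the pattern (the Linear model and checkpoint name are placeholders for illustration, not files from this repo):

import torch
import torch.nn as nn

# Placeholder model standing in for the text/video/audio/multimodal checkpoints.
model = nn.Linear(10, 2)
torch.save(model.state_dict(), 'example_model.pt')

# map_location moves every stored tensor onto the CPU during deserialization,
# so the same call works whether or not torch.cuda.is_available().
state_dict = torch.load('example_model.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)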