"""Streamlit page for multimodal emotion prediction.

The page takes an uploaded video and a line of text, passes both to
``infer_multimodal_model`` together with the checkpoint paths below, and
displays the returned label followed by a matching emoji.
"""

import contextlib
import os
import tempfile

import streamlit as st

from hf_inference import infer_multimodal_model

# Checkpoint locations handed to infer_multimodal_model on every prediction.
paths = {
    'text_model_path': 'bert-large-uncased_none_seed-42.pt',
    'video_model_path': 'XCLIP_Augmented.pt',
    'audio_model_path': '1d_cnn_with_opensmile.pt',
    'multimodal_model_path': 'files/multimodal_model_with_early_fusion.pt',
}

# Emoji shown next to each label; presumably these seven keys are the full
# set of labels the model can return -- TODO confirm against hf_inference.
label2emoji = {
    'anger': '😠',
    'disgust': '🤢',
    'fear': '😨',
    'joy': '😄',
    'neutral': '😶',
    'sadness': '😔',
    'surprise': '😯',
}

uploaded_video = st.file_uploader('Upload your video')
text = st.text_input('Enter your text')

# Run only once both inputs are present (an empty text field is falsy).
if uploaded_video is not None and text:
    bytes_data = uploaded_video.getvalue()

    st.divider()
    st.subheader('Input Video')
    st.video(bytes_data)
    st.subheader('Input Text')
    st.write(text)

    # The inference helper takes a filesystem path, so the upload has to be
    # written to disk. A unique temp file per run is used instead of one
    # fixed name so concurrent sessions cannot overwrite each other's video.
    # delete=False lets the file be closed and then reopened by path, which
    # an open NamedTemporaryFile does not allow on Windows.
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
        f.write(bytes_data)
        video_path = f.name

    try:
        # NOTE: 'model_pathes' (sic) is the keyword the helper is called with;
        # keep the spelling in sync with hf_inference.
        label = infer_multimodal_model(
            text=text,
            video_path=video_path,
            model_pathes=paths,
        )
    finally:
        # Best-effort cleanup: a failed delete must not hide the prediction
        # or mask an inference error.
        with contextlib.suppress(OSError):
            os.remove(video_path)

    # Fall back to no emoji instead of raising KeyError on an unlisted label.
    emoji = label2emoji.get(label, '')

    st.subheader('Video Emotion')
    st.write(f'{label} {emoji * 3}')