# NOTE(review): the three lines that stood here ("File size: 6,044 Bytes", a
# git-blame hash row, and a line-number row) were paste/extraction residue,
# not source code; they have been commented out to keep the file parseable.
import os
import streamlit as st
import whisperx
import torch
from utils import convert_segments_object_to_text, check_password, convert_segments_object_to_text_simple
from gigiachat_requests import get_access_token, get_completion_from_gigachat, get_number_of_tokens, process_transcribation_with_gigachat
from openai_requests import get_completion_from_openai, process_transcribation_with_assistant
from qwen import respond
# Streamlit app: upload an audio file, transcribe and diarize it with WhisperX,
# then optionally post-process the transcript and summarize it with the chosen
# LLM (GigaChat / OpenAI / Qwen). Everything is gated behind a password check.
if check_password():
    st.title('Audio Transcription App')
    st.sidebar.title("Settings")

    # Runtime configuration from the environment. int() will raise loudly at
    # startup if BATCH_SIZE is unset — that is preferable to a late failure
    # deep inside transcription.
    device = os.getenv('DEVICE')
    batch_size = int(os.getenv('BATCH_SIZE'))
    compute_type = os.getenv('COMPUTE_TYPE')
    initial_base_prompt = os.getenv('BASE_PROMPT')
    # NOTE(review): the env var name is misspelled ("PROCCESS"); kept as-is
    # because the deployment environment presumably uses the same spelling.
    initial_processing_prompt = os.getenv('PROCCESS_PROMPT')

    min_speakers = st.sidebar.number_input("Минимальное количество спикеров", min_value=1, value=2)
    max_speakers = st.sidebar.number_input("Максимальное количество спикеров", min_value=1, value=2)

    # LLM vendor selection drives which concrete models are offered.
    llm = st.sidebar.selectbox("Производитель LLM", ["Сбер", "OpenAI", "Qwen"], index=0)
    if llm == "Сбер":
        options = ["GigaChat-Plus", "GigaChat", "GigaChat-Pro"]
    elif llm == "OpenAI":
        options = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4", "gpt-3.5-turbo"]
    elif llm == "Qwen":
        options = ["Qwen/Qwen2-7B-Instruct"]
    else:
        options = []  # defensive: unreachable with the selectbox values above
    llm_model = st.sidebar.selectbox("Модель", options, index=0)

    base_prompt = st.sidebar.text_area("Промпт для резюмирования", value=initial_base_prompt)
    enable_processing = st.sidebar.checkbox("Добавить обработку транскрибации", value=False)
    processing_prompt = st.sidebar.text_area("Промпт для обработки транскрибации", value=initial_processing_prompt)

    uploaded_file = st.file_uploader("Загрузите аудиофайл", type=["mp4", "wav", "m4a"])
    if uploaded_file is not None:
        file_name = uploaded_file.name
        # A new file invalidates any transcript cached in the session.
        if 'file_name' not in st.session_state or st.session_state.file_name != file_name:
            st.session_state.transcript = ''
            st.session_state.file_name = file_name

        st.audio(uploaded_file)

        if 'transcript' not in st.session_state or st.session_state.transcript == '':
            # whisperx.load_audio needs a real path, so persist the upload to a
            # temp file (with its original extension) and clean it up afterwards.
            file_extension = file_name.split(".")[-1]
            temp_file_path = f"temp_file.{file_extension}"
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            try:
                with st.spinner('Транскрибируем...'):
                    # 1) Transcribe with Whisper (Russian is assumed).
                    model = whisperx.load_model(os.getenv('WHISPER_MODEL_SIZE'), device, compute_type=compute_type)
                    audio = whisperx.load_audio(temp_file_path)
                    result = model.transcribe(audio, batch_size=batch_size, language="ru")
                    print('Transcribed, now aligning')
                    # 2) Word-level alignment (needed for speaker assignment).
                    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
                    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
                    print('Aligned, now diarizing')
                    # 3) Speaker diarization + merge speakers into the segments.
                    diarize_model = whisperx.DiarizationPipeline(use_auth_token=st.secrets["HF_TOKEN"], device=device)
                    diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
                    result_diar = whisperx.assign_word_speakers(diarize_segments, result)
                    transcript = convert_segments_object_to_text_simple(result_diar)
                    st.session_state.transcript = transcript
            finally:
                # Don't leave the uploaded audio lying in the working directory.
                if os.path.exists(temp_file_path):
                    os.remove(temp_file_path)
        else:
            transcript = st.session_state.transcript

        st.write("Результат транскрибации:")
        st.text(transcript)

        # GigaChat is the only backend that needs an explicit access token;
        # both GigaChat code paths below are guarded by the same condition.
        if llm == 'Сбер':
            access_token = get_access_token()

        # Optional clean-up pass over the raw transcript before summarizing.
        if enable_processing:
            with st.spinner('Обрабатываем транскрибацию...'):
                if llm == 'Сбер':
                    number_of_tokens = get_number_of_tokens(transcript, access_token, llm_model)
                    print('Количество токенов в транскрибации: ' + str(number_of_tokens))
                    # +1000 gives the model headroom to answer beyond the input size.
                    transcript = process_transcribation_with_gigachat(processing_prompt, transcript, number_of_tokens + 1000, access_token, llm_model)
                elif llm == 'OpenAI':
                    transcript = process_transcribation_with_assistant(processing_prompt, transcript)
                else:
                    st.write("На данный момент обработка транскрибации не поддерживается этой моделью.")

        with st.spinner('Резюмируем...'):
            if llm == 'Сбер':
                summary_answer = get_completion_from_gigachat(base_prompt + transcript, 1024, access_token, llm_model)
            elif llm == 'OpenAI':
                summary_answer = get_completion_from_openai(base_prompt + transcript, llm_model, 1024)
            else:
                # Qwen — made the final `else` (was `elif llm == 'Qwen'`) so
                # summary_answer is always bound and the st.text below cannot
                # raise NameError; the selectbox restricts llm to these three.
                summary_answer = respond(base_prompt + transcript)

        st.write("Результат резюмирования:")
        st.text(summary_answer)