# Speech-to-text tool: downloads a YouTube video's audio track and
# transcribes it with WhisperX (transcription, alignment, diarization).
import os
import subprocess

import whisperx
from pydub import AudioSegment
from pytube import YouTube
class SpeechToText:
    def __init__(
        self,
        video_url,
        audio_format='mp3',
        device='cuda',
        batch_size=16,
        compute_type="float16",
        hf_api_key=None
    ):
        """
        Transcribe the audio of a YouTube video with WhisperX.

        Example usage:
            video_url = "url"
            speech_to_text = SpeechToText(video_url)
            transcription = speech_to_text.transcribe_youtube_video()
            print(transcription)
        """
        self.video_url = video_url
        self.audio_format = audio_format
        self.device = device
        self.batch_size = batch_size
        self.compute_type = compute_type
        self.hf_api_key = hf_api_key
    def install(self):
        # Install the runtime dependencies via pip; fail loudly if any install errors out.
        subprocess.run(["pip", "install", "whisperx"], check=True)
        subprocess.run(["pip", "install", "pytube"], check=True)
        subprocess.run(["pip", "install", "pydub"], check=True)
    def download_youtube_video(self):
        audio_file = f'video.{self.audio_format}'

        # Download the audio-only stream (mp4 container, so pydub can read it)
        yt = YouTube(self.video_url)
        yt_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        yt_stream.download(filename='video.mp4')

        # Convert the downloaded audio to the requested format
        video = AudioSegment.from_file("video.mp4", format="mp4")
        video.export(audio_file, format=self.audio_format)
        os.remove("video.mp4")
        return audio_file
    def transcribe_youtube_video(self):
        # Download the audio, then run the full WhisperX pipeline on it.
        # (Previously this method duplicated transcribe() with hardcoded
        # device/batch_size/compute_type values; it now delegates instead.)
        audio_file = self.download_youtube_video()
        return self.transcribe(audio_file)
    def transcribe(self, audio_file):
        # 1. Transcribe with WhisperX (batched)
        model = whisperx.load_model(
            "large-v2",
            self.device,
            compute_type=self.compute_type
        )
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(
            audio,
            batch_size=self.batch_size
        )

        # 2. Align the Whisper output for accurate word-level timestamps
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"],
            device=self.device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            self.device,
            return_char_alignments=False
        )

        # 3. Assign speaker labels (the diarization output was previously
        # discarded; it must be fed back into the aligned result)
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=self.hf_api_key,
            device=self.device
        )
        diarize_segments = diarize_model(audio)
        result = whisperx.assign_word_speakers(diarize_segments, result)

        try:
            segments = result["segments"]
            transcription = " ".join(segment['text'] for segment in segments)
            return transcription
        except KeyError:
            print("The key 'segments' is not found in the result.")
            return None