seewav-gui

Runtime error

App Files Files Community

seewav-gui / app.py

ysharma HF staff

Update app.py

752d508 verified 7 months ago

raw

history blame contribute delete

9.74 kB

	# Thank you to the authors of seewav for dedicating it to the public domain.
	# This program is also dedicated to the public domain.
	# You may use it, at your choice, under the Unlicense, CC0, or WTFPL license.
	# Enjoy!

	# Mostly from: https://github.com/adefossez/seewav
	# Original author: adefossez


	import math
	import tempfile
	from pathlib import Path
	import subprocess
	import cairo
	import numpy as np
	import gradio as gr
	from pydub import AudioSegment


	def read_audio(audio, seek=None, duration=None):
	    """
	    Decode the `audio` file, starting at `seek` (or 0) seconds and keeping
	    `duration` (or all remaining) seconds.

	    Returns a `(wav, samplerate)` tuple where `wav` is `float32[channels, samples]`
	    and `samplerate` is the frame rate reported by the decoder.
	    """
	    audio_segment = AudioSegment.from_file(audio)
	    channels = audio_segment.channels
	    samplerate = audio_segment.frame_rate

	    # Slicing an AudioSegment is expressed in milliseconds.
	    if seek is not None:
	        audio_segment = audio_segment[int(seek * 1000):]
	    if duration is not None:
	        audio_segment = audio_segment[:int(duration * 1000)]

	    samples = audio_segment.get_array_of_samples()
	    wav = np.array(samples, dtype=np.float32)
	    # Multi-channel samples come back interleaved (L, R, L, R, ...), so each
	    # row of the reshaped array is one frame; transpose to get one row per
	    # channel. Reshaping directly to (channels, -1) would instead cut the
	    # interleaved stream in halves and mix up channels and time.
	    return wav.reshape(-1, channels).T, samplerate


	def sigmoid(x):
	    """Logistic function `1 / (1 + e^-x)`, applied element-wise by numpy."""
	    decay = np.exp(-x)
	    return 1 / (1 + decay)


	def envelope(wav, window, stride):
	    """
	    Compute the envelope of the waveform `wav` (float[samples]).

	    The signal is zero-padded by `window // 2` on both sides, then the positive
	    part of every `window`-sample frame (taken every `stride` samples) is
	    averaged. The pooled values are finally squashed with a sigmoid.
	    """
	    padded = np.pad(wav, window // 2)
	    starts = range(0, len(padded) - window, stride)
	    pooled = np.array(
	        [np.maximum(padded[begin:begin + window], 0).mean() for begin in starts]
	    )
	    # Some form of audio compressor based on the sigmoid.
	    return 1.9 * (sigmoid(2.5 * pooled) - 0.5)


	def draw_env(envs, out, fg_colors, bg_color, size):
	    """
	    Internal function: render one video frame with cairo and write it to `out`
	    as a png.

	    `envs` holds one envelope per wave (float[bars] each); every entry is drawn
	    as a vertical bar and the waves are stacked vertically. `fg_colors[i]` is
	    the rgb color of the i-th wave, `bg_color` the rgb background and `size`
	    the `(width, height)` of the image in pixels.
	    """
	    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
	    ctx = cairo.Context(surface)
	    # Scale by the pixel size so that all coordinates below live in [0, 1].
	    ctx.scale(*size)

	    # Paint the background over the whole unit square.
	    ctx.set_source_rgb(*bg_color)
	    ctx.rectangle(0, 0, 1, 1)
	    ctx.fill()

	    n_waves = len(envs)  # number of waves, stacked vertically
	    n_steps = len(envs[0])  # number of bars per wave
	    pad_ratio = 0.1  # spacing ratio between 2 bars
	    bar_width = 1.0 / (n_steps * (1 + 2 * pad_ratio))
	    gap = pad_ratio * bar_width
	    pitch = 2 * gap + bar_width  # horizontal distance between two bars

	    ctx.set_line_width(bar_width)
	    for step in range(n_steps):
	        x = gap + step * pitch
	        for wave, env in enumerate(envs):
	            # (semi-)height of the bar, shrunk as n_waves waves share the height
	            half = 0.5 * env[step] / n_waves
	            midrule = (1 + 2 * wave) / (2 * n_waves)  # center line of this wave
	            color = fg_colors[wave]

	            # Upper half of the bar, fully opaque.
	            ctx.set_source_rgb(*color)
	            ctx.move_to(x, midrule - half)
	            ctx.line_to(x, midrule)
	            ctx.stroke()

	            # Lower half: slightly shorter and drawn with 0.8 alpha.
	            ctx.set_source_rgba(*color, 0.8)
	            ctx.move_to(x, midrule)
	            ctx.line_to(x, midrule + 0.9 * half)
	            ctx.stroke()

	    surface.write_to_png(out)


	def interpole(x1, y1, x2, y2, x):
	    """Linearly interpolate at `x` along the line through (x1, y1) and (x2, y2)."""
	    rise = y2 - y1
	    run = x2 - x1
	    return y1 + rise * (x - x1) / run


	def visualize(
	    progress,
	    audio,
	    tmp,
	    out,
	    seek=None,
	    duration=None,
	    rate=60,
	    bars=50,
	    speed=4,
	    time=0.4,
	    oversample=3,
	    fg_color=(0.2, 0.2, 0.2),
	    fg_color2=(0.5, 0.3, 0.6),
	    bg_color=(1, 1, 1),
	    size=(400, 400),
	    stereo=False,
	):
	    """
	    Generate the visualisation for the `audio` file, using the `tmp` folder for the
	    intermediate frames and saving the final video in `out`. Returns `out`.

	    `progress` is called with the range of frame indices and must return an
	    iterable over it (e.g. a tqdm-like wrapper).
	    `audio` is the path of the audio file.
	    `tmp` and `out` are `pathlib.Path` objects.
	    `seek` and `duration` give the extract location in seconds, if any.
	    `rate` is the framerate of the output video.
	    `bars` is the number of bars in the animation.
	    `speed` is the base speed of transition. Depending on volume, actual speed will vary
	        between 0.5 and 2 times it.
	    `time` amount of audio shown at once on a frame.
	    `oversample` higher values will lead to more frequent changes.
	    `fg_color` is the rgb color to use for the foreground.
	    `fg_color2` is the rgb color to use for the second wav if stereo is set.
	    `bg_color` is the rgb color to use for the background.
	    `size` is the `(width, height)` in pixels to generate.
	    `stereo` is whether to create 2 waves.

	    Raises `gr.Error` when the audio cannot be read, is empty, or when `stereo`
	    is requested for a file that does not have exactly two channels.
	    `subprocess.CalledProcessError` is raised if ffmpeg fails.
	    """
	    try:
	        wav, sr = read_audio(audio, seek=seek, duration=duration)
	    except (IOError, ValueError) as err:
	        raise gr.Error(str(err)) from err
	    if wav.size == 0:
	        raise gr.Error("The selected audio extract contains no samples")

	    # UI widgets may hand over floats; slicing, padding and cairo need ints.
	    bars = int(bars)
	    size = (int(size[0]), int(size[1]))

	    # wavs is a list of wav over channels
	    if stereo:
	        # Explicit check instead of `assert`, which is stripped under `-O`.
	        if wav.shape[0] != 2:
	            raise gr.Error("stereo requires stereo audio file")
	        wavs = [wav[0], wav[1]]
	    else:
	        wavs = [wav.mean(0)]

	    # Normalise each channel to unit standard deviation. A perfectly silent
	    # channel has std == 0; leave it untouched rather than producing NaNs.
	    for i, channel in enumerate(wavs):
	        scale = channel.std()
	        if scale > 0:
	            wavs[i] = channel / scale

	    # With many bars or a low sample rate the integer division can reach 0,
	    # which would break `range()` in `envelope` and the division below.
	    window = max(1, int(sr * time / bars))
	    stride = max(1, int(window / oversample))

	    # envs is a list of env over channels. The padding lets the first frames
	    # ramp in and guarantees two full windows of `bars` values at the end.
	    envs = []
	    for channel in wavs:
	        env = envelope(channel, window, stride)
	        env = np.pad(env, (bars // 2, 2 * bars))
	        envs.append(env)

	    clip_seconds = len(wavs[0]) / sr
	    frames = int(rate * clip_seconds)
	    smooth = np.hanning(bars)

	    gr.Info("Generating the frames...")
	    for idx in progress(range(frames)):
	        pos = (((idx / rate)) * sr) / stride / bars
	        off = int(pos)
	        loc = pos - off
	        denvs = []
	        for env in envs:
	            env1 = env[off * bars : (off + 1) * bars]
	            env2 = env[(off + 1) * bars : (off + 2) * bars]

	            # we want loud parts to be updated faster
	            maxvol = math.log10(1e-4 + env2.max()) * 10
	            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
	            # Cross-fade from the current set of bars to the next one.
	            w = sigmoid(speed * speedup * (loc - 0.5))
	            denv = (1 - w) * env1 + w * env2
	            denv *= smooth
	            denvs.append(denv)
	        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)

	    # Apply the same extract to the audio track so that sound and picture
	    # stay in sync; placed before `-i` these are ffmpeg input options.
	    audio_args = []
	    if seek is not None:
	        audio_args += ["-ss", str(seek)]
	    if duration is not None:
	        audio_args += ["-t", str(duration)]
	    # ffmpeg runs with `cwd=tmp`, so a relative path must be made absolute.
	    audio_args += ["-i", str(Path(audio).resolve())]

	    gr.Info("Encoding the animation video...")
	    subprocess.run([
	        "ffmpeg", "-y", "-loglevel", "panic", "-r",
	        str(rate), "-f", "image2", "-s", f"{size[0]}x{size[1]}", "-i", "%06d.png",
	    ] + audio_args + [
	        "-c:a", "aac", "-vcodec", "libx264", "-crf", "10", "-pix_fmt", "yuv420p",
	        out.resolve()
	    ], check=True, cwd=tmp)
	    return out



	def parse_color(colorstr):
	    """
	    Parse a comma separated `r,g,b` string and return a 3-tuple of floats.

	    Raises `gr.Error` when the string does not contain exactly three values
	    that can be converted to float.
	    """
	    try:
	        # Unpacking inside the `try` also rejects a wrong number of components.
	        r, g, b = (float(part) for part in colorstr.split(","))
	    except ValueError as err:
	        raise gr.Error(
	            "Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order"
	        ) from err
	    return r, g, b


	def hex_to_rgb(hex_color):
	    """
	    Convert a color string to an `(r, g, b)` tuple of floats in [0, 1].

	    Accepted forms:
	    - `#RRGGBB` (the leading `#` is optional; anything after the first six
	      digits, such as an alpha pair, is ignored),
	    - the shorthand `#RGB` / `#RGBA`, where every digit is doubled,
	    - CSS-like `rgb(r, g, b)` / `rgba(r, g, b, a)` with 0-255 components; the
	      alpha component is ignored.
	      NOTE(review): some Gradio versions reportedly return this form from the
	      color picker - confirm against the installed version.

	    Raises `ValueError` if the string cannot be interpreted.
	    """
	    text = hex_color.strip()

	    if text.lower().startswith(("rgb(", "rgba(")):
	        inner = text[text.index("(") + 1:].rstrip(")")
	        channels = [float(part) for part in inner.split(",")[:3]]
	        if len(channels) != 3:
	            raise ValueError(f"invalid color: {hex_color!r}")
	        # Clamp so that out-of-range components still give a valid color.
	        return tuple(min(max(value / 255.0, 0.0), 1.0) for value in channels)

	    digits = text.lstrip('#')
	    if len(digits) in (3, 4):
	        # Shorthand notation: "0af" means "00aaff".
	        digits = "".join(digit * 2 for digit in digits)
	    if len(digits) < 6:
	        raise ValueError(f"invalid color: {hex_color!r}")
	    r = int(digits[0:2], 16) / 255.0
	    g = int(digits[2:4], 16) / 255.0
	    b = int(digits[4:6], 16) / 255.0
	    return (r, g, b)

	def do_viz(
	    inp_aud,
	    inp_bgcolor,
	    inp_color1,
	    inp_nbars,
	    inp_vidw,
	    inp_vidh,
	    progress=gr.Progress(),
	):
	    """
	    Gradio callback: render the waveform video for the uploaded audio clip.

	    `inp_aud` is the path of the audio file, `inp_bgcolor` / `inp_color1` are
	    the background and waveform colors as produced by the color pickers,
	    `inp_nbars` is the number of bars and `inp_vidw` / `inp_vidh` the video
	    size in pixels. `progress` is used to report frame generation progress.

	    Returns the path of the generated `.mp4` file. The file is created with
	    `delete=False`, so it outlives this call. Raises `gr.Error` when no audio
	    was provided.
	    """
	    if not inp_aud:
	        raise gr.Error("Please provide an audio clip first")

	    # Widget values may arrive as floats. The encoder is asked for yuv420p
	    # output, which requires even dimensions, so round odd sizes down.
	    width = max(2, int(inp_vidw) // 2 * 2)
	    height = max(2, int(inp_vidh) // 2 * 2)

	    # Reserve the output path, then close the handle before ffmpeg overwrites
	    # the file: an open handle would block the write on some platforms.
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
	        out_path = Path(handle.name)

	    # The frames directory is removed as soon as the video has been encoded.
	    with tempfile.TemporaryDirectory() as tmp:
	        return visualize(
	            progress.tqdm,
	            inp_aud,
	            Path(tmp),
	            out_path,
	            bars=int(inp_nbars),
	            fg_color=hex_to_rgb(inp_color1),
	            bg_color=hex_to_rgb(inp_bgcolor),
	            size=(width, height),
	        )


	import gradio as gr

	# Markdown shown at the top of the page.
	ABOUT = """
	# seewav GUI

	> Have an audio clip but need a video (e.g. for X/Twitter)?

	Convert audio into a nice video!

	An online graphical user interface for [seewav](https://github.com/adefossez/seewav).

	Enjoy!
	"""

	# Page layout: inputs in the left column, resulting video in the right one.
	with gr.Blocks() as demo:
	    gr.Markdown(ABOUT)
	    with gr.Row():
	        with gr.Column():
	            inp_aud = gr.Audio(type='filepath')
	            # Color settings.
	            with gr.Group():
	                inp_color1 = gr.ColorPicker(
	                    value="#00237E",
	                    label="Color",
	                    info="Color of the top waveform",
	                    interactive=True,
	                )
	                inp_bgcolor = gr.ColorPicker(
	                    value="#000000",
	                    label="Background Color",
	                    info="Color of the background",
	                    interactive=True,
	                )
	            # Less frequently used settings, collapsed by default.
	            with gr.Accordion("Advanced Configuration", open=False):
	                inp_nbars = gr.Slider(
	                    minimum=5,
	                    maximum=1500,
	                    value=50,
	                    label="Num. Bars",
	                    interactive=True,
	                )
	                inp_vidw = gr.Slider(
	                    minimum=100,
	                    maximum=3000,
	                    value=400,
	                    label="Video Width",
	                    interactive=True,
	                )
	                inp_vidh = gr.Slider(
	                    minimum=100,
	                    maximum=3000,
	                    value=400,
	                    label="Video Height",
	                    interactive=True,
	                )
	            inp_go = gr.Button("Visualize", variant="primary")
	        with gr.Column():
	            out_vid = gr.Video(interactive=False)

	    # The order of `inputs` must match the positional parameters of `do_viz`.
	    inp_go.click(
	        do_viz,
	        inputs=[
	            inp_aud,
	            inp_bgcolor,
	            inp_color1,
	            inp_nbars,
	            inp_vidw,
	            inp_vidh,
	        ],
	        outputs=[out_vid],
	    )

	# Enable the request queue, then start the server.
	demo.queue(
	    api_open=True,
	    default_concurrency_limit=20,
	).launch(show_api=True)