# image-to-audio / ref /text-to-audio.py
# thivav's picture
# image-to-audio init commit
# c826025
# raw
# history blame
# 755 Bytes
# text-to-audio using suno/bark
import scipy
import scipy.io.wavfile
import torch
from transformers import AutoProcessor
from transformers import BarkModel
# Load the processor and the model from the SAME checkpoint so the text
# preprocessing always matches the weights being run (previously the weights
# came from "suno/bark-small" while the processor came from "suno/bark").
checkpoint = "suno/bark-small"

# Prefer the first CUDA device when one is available; otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("device: ", device)

processor = AutoProcessor.from_pretrained(checkpoint)
model = BarkModel.from_pretrained(checkpoint).to(device)

# prepare the inputs
text_prompt = "You are a story teller. You can generate a story based on a simple narrative, the story be no more than 20 words."
# The processed inputs must live on the same device as the model.
inputs = processor(text_prompt).to(device)

# generate speech
speech_output = model.generate(**inputs)
sampling_rate = model.generation_config.sample_rate

# Take the first (only) item of the generated batch and bring it back to host
# memory as a NumPy array, which is what the WAV writer expects.
audio = speech_output[0].cpu().numpy()
scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=audio)