Spaces:
Runtime error
Runtime error
Merge remote-tracking branch 'upstream/main'
Browse files- README.md +1 -1
- app.py +260 -107
- app_batched.py +4 -2
README.md
CHANGED
@@ -5,7 +5,7 @@ tags:
|
|
5 |
- music generation
|
6 |
- language models
|
7 |
- LLMs
|
8 |
-
app_file:
|
9 |
emoji: 🎵
|
10 |
colorFrom: white
|
11 |
colorTo: blue
|
|
|
5 |
- music generation
|
6 |
- language models
|
7 |
- LLMs
|
8 |
+
app_file: app.py
|
9 |
emoji: 🎵
|
10 |
colorFrom: white
|
11 |
colorTo: blue
|
app.py
CHANGED
@@ -7,14 +7,18 @@ LICENSE file in the root directory of this source tree.
|
|
7 |
"""
|
8 |
|
9 |
from tempfile import NamedTemporaryFile
|
|
|
10 |
import torch
|
|
|
11 |
import gradio as gr
|
|
|
12 |
from audiocraft.models import MusicGen
|
13 |
-
|
14 |
from audiocraft.data.audio import audio_write
|
15 |
|
|
|
16 |
|
17 |
MODEL = None
|
|
|
18 |
|
19 |
|
20 |
def load_model(version):
|
@@ -22,14 +26,18 @@ def load_model(version):
|
|
22 |
return MusicGen.get_pretrained(version)
|
23 |
|
24 |
|
25 |
-
def predict(
|
|
|
|
|
26 |
global MODEL
|
27 |
topk = int(topk)
|
28 |
-
if MODEL is None
|
29 |
-
MODEL = load_model(
|
30 |
|
31 |
if duration > MODEL.lm.cfg.dataset.segment_duration:
|
32 |
raise gr.Error("MusicGen currently supports durations of up to 30 seconds!")
|
|
|
|
|
33 |
MODEL.set_generation_params(
|
34 |
use_sampling=True,
|
35 |
top_k=topk,
|
@@ -39,120 +47,265 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
|
|
39 |
duration=duration,
|
40 |
)
|
41 |
|
42 |
-
if
|
43 |
-
|
44 |
-
|
45 |
if melody.dim() == 2:
|
46 |
melody = melody[None]
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
else:
|
55 |
output = MODEL.generate(descriptions=[text], progress=False)
|
56 |
|
57 |
output = output.detach().cpu().float()[0]
|
58 |
with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
|
59 |
-
audio_write(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
waveform_video = gr.make_waveform(file.name)
|
61 |
-
return waveform_video
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
"melody"
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
],
|
127 |
-
[
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
],
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
)
|
136 |
-
|
137 |
-
"""
|
138 |
-
### More details
|
139 |
-
|
140 |
-
The model will generate a short music extract based on the description you provided.
|
141 |
-
You can generate up to 30 seconds of audio.
|
142 |
-
|
143 |
-
We present 4 model variations:
|
144 |
-
1. Melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
|
145 |
-
2. Small -- a 300M transformer decoder conditioned on text only.
|
146 |
-
3. Medium -- a 1.5B transformer decoder conditioned on text only.
|
147 |
-
4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
|
148 |
-
|
149 |
-
When using `melody`, you can optionally provide a reference audio from
|
150 |
-
which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
|
151 |
-
|
152 |
-
You can also use your own GPU or a Google Colab by following the instructions on our repo.
|
153 |
-
See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
|
154 |
-
for more details.
|
155 |
-
"""
|
156 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
"""
|
8 |
|
9 |
from tempfile import NamedTemporaryFile
|
10 |
+
import argparse
|
11 |
import torch
|
12 |
+
import torchaudio
|
13 |
import gradio as gr
|
14 |
+
import os
|
15 |
from audiocraft.models import MusicGen
|
|
|
16 |
from audiocraft.data.audio import audio_write
|
17 |
|
18 |
+
from share_btn import community_icon_html, loading_icon_html, share_js, css
|
19 |
|
20 |
MODEL = None
|
21 |
+
IS_SHARED_SPACE = "radames/MusicGen-Continuation" in os.environ.get("SPACE_ID", "")
|
22 |
|
23 |
|
24 |
def load_model(version):
|
|
|
26 |
return MusicGen.get_pretrained(version)
|
27 |
|
28 |
|
29 |
+
def predict(
|
30 |
+
text, melody_input, duration, continuation, topk, topp, temperature, cfg_coef
|
31 |
+
):
|
32 |
global MODEL
|
33 |
topk = int(topk)
|
34 |
+
if MODEL is None:
|
35 |
+
MODEL = load_model("melody")
|
36 |
|
37 |
if duration > MODEL.lm.cfg.dataset.segment_duration:
|
38 |
raise gr.Error("MusicGen currently supports durations of up to 30 seconds!")
|
39 |
+
if continuation >= duration:
|
40 |
+
raise gr.Error("The continuation setting can't be higher or equal to duration!")
|
41 |
MODEL.set_generation_params(
|
42 |
use_sampling=True,
|
43 |
top_k=topk,
|
|
|
47 |
duration=duration,
|
48 |
)
|
49 |
|
50 |
+
if melody_input:
|
51 |
+
melody, sr = torchaudio.load(melody_input)
|
52 |
+
# sr, melody = melody_input[0], torch.from_numpy(melody_input[1]).to(MODEL.device).float().t().unsqueeze(0)
|
53 |
if melody.dim() == 2:
|
54 |
melody = melody[None]
|
55 |
+
if continuation:
|
56 |
+
prompt_waveform = melody[..., -int(sr * continuation) :]
|
57 |
+
output = MODEL.generate_continuation(
|
58 |
+
prompt=prompt_waveform,
|
59 |
+
prompt_sample_rate=sr,
|
60 |
+
descriptions=[text],
|
61 |
+
progress=True,
|
62 |
+
)
|
63 |
+
else:
|
64 |
+
melody_wavform = melody[
|
65 |
+
..., : int(sr * MODEL.lm.cfg.dataset.segment_duration)
|
66 |
+
]
|
67 |
+
output = MODEL.generate_with_chroma(
|
68 |
+
descriptions=[text],
|
69 |
+
melody_wavs=melody_wavform,
|
70 |
+
melody_sample_rate=sr,
|
71 |
+
progress=True,
|
72 |
+
)
|
73 |
else:
|
74 |
output = MODEL.generate(descriptions=[text], progress=False)
|
75 |
|
76 |
output = output.detach().cpu().float()[0]
|
77 |
with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
|
78 |
+
audio_write(
|
79 |
+
file.name,
|
80 |
+
output,
|
81 |
+
MODEL.sample_rate,
|
82 |
+
strategy="loudness",
|
83 |
+
loudness_headroom_db=16,
|
84 |
+
loudness_compressor=True,
|
85 |
+
add_suffix=False,
|
86 |
+
)
|
87 |
waveform_video = gr.make_waveform(file.name)
|
88 |
+
return waveform_video, melody_input
|
89 |
|
90 |
+
|
91 |
+
def ui(**kwargs):
|
92 |
+
def toggle(choice):
|
93 |
+
if choice == "mic":
|
94 |
+
return gr.update(source="microphone", value=None, label="Microphone")
|
95 |
+
else:
|
96 |
+
return gr.update(source="upload", value=None, label="File")
|
97 |
+
|
98 |
+
with gr.Blocks(css=css) as interface:
|
99 |
+
gr.Markdown(
|
100 |
+
"""
|
101 |
+
# MusicGen
|
102 |
+
This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
|
103 |
+
presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
|
104 |
+
"""
|
105 |
+
)
|
106 |
+
if IS_SHARED_SPACE:
|
107 |
+
gr.Markdown(
|
108 |
+
"""
|
109 |
+
⚠ This Space doesn't work in this shared UI ⚠
|
110 |
+
|
111 |
+
<a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
|
112 |
+
<img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
|
113 |
+
to use it privately, or use the <a href="https://huggingface.co/spaces/facebook/MusicGen">public demo</a>
|
114 |
+
"""
|
115 |
+
)
|
116 |
+
with gr.Row():
|
117 |
+
with gr.Column():
|
118 |
+
with gr.Row():
|
119 |
+
text = gr.Text(
|
120 |
+
label="Describe your music",
|
121 |
+
lines=2,
|
122 |
+
interactive=True,
|
123 |
+
elem_id="text-input",
|
124 |
+
)
|
125 |
+
with gr.Column():
|
126 |
+
radio = gr.Radio(
|
127 |
+
["file", "mic"],
|
128 |
+
value="file",
|
129 |
+
label="Melody Condition (optional) File or Mic",
|
130 |
+
)
|
131 |
+
melody = gr.Audio(
|
132 |
+
source="upload",
|
133 |
+
type="filepath",
|
134 |
+
label="File",
|
135 |
+
interactive=True,
|
136 |
+
elem_id="melody-input",
|
137 |
+
)
|
138 |
+
with gr.Row():
|
139 |
+
submit = gr.Button("Submit")
|
140 |
+
# with gr.Row():
|
141 |
+
# model = gr.Radio(
|
142 |
+
# ["melody", "medium", "small", "large"],
|
143 |
+
# label="Model",
|
144 |
+
# value="melody",
|
145 |
+
# interactive=True,
|
146 |
+
# )
|
147 |
+
with gr.Row():
|
148 |
+
duration = gr.Slider(
|
149 |
+
minimum=1,
|
150 |
+
maximum=30,
|
151 |
+
value=10,
|
152 |
+
label="Duration",
|
153 |
+
interactive=True,
|
154 |
+
)
|
155 |
+
with gr.Row():
|
156 |
+
continuation = gr.Slider(
|
157 |
+
minimum=0,
|
158 |
+
maximum=30,
|
159 |
+
value=0,
|
160 |
+
label="Continue from the end duration",
|
161 |
+
interactive=True,
|
162 |
+
)
|
163 |
+
with gr.Row():
|
164 |
+
topk = gr.Number(label="Top-k", value=250, interactive=True)
|
165 |
+
topp = gr.Number(label="Top-p", value=0, interactive=True)
|
166 |
+
temperature = gr.Number(
|
167 |
+
label="Temperature", value=1.0, interactive=True
|
168 |
+
)
|
169 |
+
cfg_coef = gr.Number(
|
170 |
+
label="Classifier Free Guidance", value=3.0, interactive=True
|
171 |
+
)
|
172 |
+
with gr.Column():
|
173 |
+
output = gr.Video(label="Generated Music", elem_id="generated-video")
|
174 |
+
output_melody = gr.Audio(label="Melody ", elem_id="melody-output")
|
175 |
+
with gr.Row(visible=False) as share_row:
|
176 |
+
with gr.Group(elem_id="share-btn-container"):
|
177 |
+
community_icon = gr.HTML(community_icon_html)
|
178 |
+
loading_icon = gr.HTML(loading_icon_html)
|
179 |
+
share_button = gr.Button(
|
180 |
+
"Share to community", elem_id="share-btn"
|
181 |
+
)
|
182 |
+
share_button.click(None, [], [], _js=share_js)
|
183 |
+
submit.click(
|
184 |
+
lambda x: gr.update(visible=False),
|
185 |
+
None,
|
186 |
+
[share_row],
|
187 |
+
queue=False,
|
188 |
+
show_progress=False,
|
189 |
+
).then(
|
190 |
+
predict,
|
191 |
+
inputs=[
|
192 |
+
text,
|
193 |
+
melody,
|
194 |
+
duration,
|
195 |
+
continuation,
|
196 |
+
topk,
|
197 |
+
topp,
|
198 |
+
temperature,
|
199 |
+
cfg_coef,
|
200 |
],
|
201 |
+
outputs=[output, output_melody],
|
202 |
+
).then(
|
203 |
+
lambda x: gr.update(visible=True),
|
204 |
+
None,
|
205 |
+
[share_row],
|
206 |
+
queue=False,
|
207 |
+
show_progress=False,
|
208 |
+
)
|
209 |
+
radio.change(toggle, radio, [melody], queue=False, show_progress=False)
|
210 |
+
gr.Examples(
|
211 |
+
fn=predict,
|
212 |
+
examples=[
|
213 |
+
[
|
214 |
+
"An 80s driving pop song with heavy drums and synth pads in the background",
|
215 |
+
"./assets/bach.mp3",
|
216 |
+
],
|
217 |
+
[
|
218 |
+
"A cheerful country song with acoustic guitars",
|
219 |
+
"./assets/bolero_ravel.mp3",
|
220 |
+
],
|
221 |
+
["90s rock song with electric guitar and heavy drums", None, "medium"],
|
222 |
+
[
|
223 |
+
"a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
|
224 |
+
"./assets/bach.mp3",
|
225 |
+
],
|
226 |
+
[
|
227 |
+
"lofi slow bpm electro chill with organic samples",
|
228 |
+
None,
|
229 |
+
],
|
230 |
],
|
231 |
+
inputs=[text, melody],
|
232 |
+
outputs=[output],
|
233 |
+
)
|
234 |
+
gr.Markdown(
|
235 |
+
"""
|
236 |
+
### More details
|
237 |
+
|
238 |
+
The model will generate a short music extract based on the description you provided.
|
239 |
+
You can generate up to 30 seconds of audio.
|
240 |
+
|
241 |
+
We present 4 model variations:
|
242 |
+
1. Melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
|
243 |
+
2. Small -- a 300M transformer decoder conditioned on text only.
|
244 |
+
3. Medium -- a 1.5B transformer decoder conditioned on text only.
|
245 |
+
4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
|
246 |
+
|
247 |
+
When using `melody`, you can optionally provide a reference audio from
|
248 |
+
which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
|
249 |
+
|
250 |
+
You can also use your own GPU or a Google Colab by following the instructions on our repo.
|
251 |
+
See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
|
252 |
+
for more details.
|
253 |
+
"""
|
254 |
+
)
|
255 |
+
|
256 |
+
# Show the interface
|
257 |
+
launch_kwargs = {}
|
258 |
+
username = kwargs.get("username")
|
259 |
+
password = kwargs.get("password")
|
260 |
+
server_port = kwargs.get("server_port", 0)
|
261 |
+
inbrowser = kwargs.get("inbrowser", False)
|
262 |
+
share = kwargs.get("share", False)
|
263 |
+
server_name = kwargs.get("listen")
|
264 |
+
|
265 |
+
launch_kwargs["server_name"] = server_name
|
266 |
+
|
267 |
+
if username and password:
|
268 |
+
launch_kwargs["auth"] = (username, password)
|
269 |
+
if server_port > 0:
|
270 |
+
launch_kwargs["server_port"] = server_port
|
271 |
+
if inbrowser:
|
272 |
+
launch_kwargs["inbrowser"] = inbrowser
|
273 |
+
if share:
|
274 |
+
launch_kwargs["share"] = share
|
275 |
+
|
276 |
+
interface.queue().launch(**launch_kwargs, max_threads=1)
|
277 |
+
|
278 |
+
|
279 |
+
if __name__ == "__main__":
|
280 |
+
parser = argparse.ArgumentParser()
|
281 |
+
parser.add_argument(
|
282 |
+
"--listen",
|
283 |
+
type=str,
|
284 |
+
default="127.0.0.1",
|
285 |
+
help="IP to listen on for connections to Gradio",
|
286 |
+
)
|
287 |
+
parser.add_argument(
|
288 |
+
"--username", type=str, default="", help="Username for authentication"
|
289 |
)
|
290 |
+
parser.add_argument(
|
291 |
+
"--password", type=str, default="", help="Password for authentication"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
)
|
293 |
+
parser.add_argument(
|
294 |
+
"--server_port",
|
295 |
+
type=int,
|
296 |
+
default=0,
|
297 |
+
help="Port to run the server listener on",
|
298 |
+
)
|
299 |
+
parser.add_argument("--inbrowser", action="store_true", help="Open in browser")
|
300 |
+
parser.add_argument("--share", action="store_true", help="Share the gradio UI")
|
301 |
+
|
302 |
+
args = parser.parse_args()
|
303 |
|
304 |
+
ui(
|
305 |
+
username=args.username,
|
306 |
+
password=args.password,
|
307 |
+
inbrowser=args.inbrowser,
|
308 |
+
server_port=args.server_port,
|
309 |
+
share=args.share,
|
310 |
+
listen=args.listen,
|
311 |
+
)
|
app_batched.py
CHANGED
@@ -67,10 +67,13 @@ def predict(texts, melodies):
|
|
67 |
output,
|
68 |
MODEL.sample_rate,
|
69 |
strategy="loudness",
|
|
|
|
|
70 |
add_suffix=False,
|
71 |
)
|
72 |
waveform_video = gr.make_waveform(file.name)
|
73 |
out_files.append(waveform_video)
|
|
|
74 |
return [out_files, melodies]
|
75 |
|
76 |
|
@@ -189,5 +192,4 @@ with gr.Blocks(css=css) as demo:
|
|
189 |
for more details.
|
190 |
"""
|
191 |
)
|
192 |
-
|
193 |
-
demo.queue(max_size=15).launch()
|
|
|
67 |
output,
|
68 |
MODEL.sample_rate,
|
69 |
strategy="loudness",
|
70 |
+
loudness_headroom_db=16,
|
71 |
+
loudness_compressor=True,
|
72 |
add_suffix=False,
|
73 |
)
|
74 |
waveform_video = gr.make_waveform(file.name)
|
75 |
out_files.append(waveform_video)
|
76 |
+
|
77 |
return [out_files, melodies]
|
78 |
|
79 |
|
|
|
192 |
for more details.
|
193 |
"""
|
194 |
)
|
195 |
+
demo.queue(max_size=60).launch()
|
|