Qwen3 TTS 12Hz Base Streaming 0.6B
High-fidelity voice-cloning TTS model from the Qwen family
Model details
Example usage
First, send an initial request with a voice name that will refer to the clone. The reference audio should be around 10-20 s in duration:

python call.py --text "Hey my name is Alex, I'm your helpful assistant" --ref-audio alex.m4a --ref-text transcript.txt --voice-name Alex
Once this initial request has completed, the clone is stored under the specified voice name, and the reference audio/text no longer need to be passed:

python call.py --text "Hey my name is Alex, I'm your helpful assistant" --voice-name Alex
1"""WebSocket client for streaming text-input TTS with Base (voice cloning) only.
2
3Connects to the /v1/audio/speech/stream endpoint, sends text incrementally
4(simulating real-time STT output), and saves a single audio file per stream.
5
6Voice cloning (Base task):
7 Provide --ref-audio (local file) and --ref-text to clone a voice.
8 Set --voice-name to cache the clone server-side so subsequent sessions
9 skip the expensive embedding extraction.
10
11Usage:
12 # Voice cloning (first time: uploads + caches)
13 python call.py \
14 --text "Hello world. How are you?" \
15 --ref-audio /path/to/reference.wav \
16 --ref-text /path/to/transcript.txt \
17 --voice-name my_voice
18
19 # Voice cloning (subsequent: uses cached voice, no ref-audio needed)
20 python call.py \
21 --text "Hello world. How are you?" \
22 --voice-name my_voice
23
24 # Simulate STT: send text word-by-word with delay
25 python call.py \
26 --text "Hello world. How are you? I am fine." \
27 --voice-name my_voice \
28 --simulate-stt --stt-delay 0.1
29
30 # Play audio as it streams (requires: pip install sounddevice numpy)
31 python call.py --text "Hello world." --voice-name my_voice --play
32
33Requirements:
34 pip install websockets
35 pip install sounddevice numpy # optional, for --play
36"""
37
38import argparse
39import asyncio
40import base64
41import json
42import mimetypes
43import os
44import time
45import wave
46
47try:
48 import websockets
49except ImportError:
50 print("Please install websockets: pip install websockets")
51 raise SystemExit(1)
52
53# Optional: for --play (streaming playback)
54try:
55 import numpy as np
56 import sounddevice as sd
57
58 HAS_PLAYBACK = True
59except ImportError:
60 HAS_PLAYBACK = False
61
62
63def _write_wav(path: str, pcm_data: bytes, sample_rate: int, channels: int) -> None:
64 """Write raw PCM-16LE bytes to a WAV file."""
65 with wave.open(path, "wb") as wf:
66 wf.setnchannels(channels)
67 wf.setsampwidth(2) # 16-bit = 2 bytes per sample
68 wf.setframerate(sample_rate)
69 wf.writeframes(pcm_data)
70
71
72def _encode_audio_file(path: str) -> str:
73 """Read a local audio file and return a base64 data URI."""
74 mime_type, _ = mimetypes.guess_type(path)
75 if mime_type is None:
76 ext = os.path.splitext(path)[1].lower()
77 mime_map = {
78 ".wav": "audio/wav",
79 ".mp3": "audio/mpeg",
80 ".m4a": "audio/mp4",
81 ".flac": "audio/flac",
82 ".ogg": "audio/ogg",
83 ".aac": "audio/aac",
84 ".webm": "audio/webm",
85 }
86 mime_type = mime_map.get(ext, "audio/wav")
87
88 with open(path, "rb") as f:
89 audio_bytes = f.read()
90
91 b64 = base64.b64encode(audio_bytes).decode("utf-8")
92 return f"data:{mime_type};base64,{b64}"
93
94
async def _delete_voice(url: str, voice_name: str) -> None:
    """Ask the server, over a fresh WebSocket, to drop the cached voice *voice_name*."""
    async with websockets.connect(url) as ws:
        request = {"type": "voice.delete", "voice_name": voice_name}
        await ws.send(json.dumps(request))
        response = json.loads(await ws.recv())
        kind = response.get("type")
        if kind == "voice.deleted":
            print(f"Voice '{voice_name}' deleted successfully.")
        elif kind == "error":
            print(f"Error: {response.get('message')}")
        else:
            print(f"Unexpected response: {response}")
107
108
async def stream_tts(
    url: str,
    text: str,
    config: dict,
    output_file: str,
    simulate_stt: bool = False,
    stt_delay: float = 0.1,
    play: bool = False,
) -> None:
    """Connect to the streaming TTS endpoint and process audio responses.

    Sends the session config, then the text (all at once or word-by-word to
    simulate real-time STT), and concurrently receives binary PCM frames
    interleaved with JSON control messages. All PCM received during the
    session is concatenated and written to a single WAV file; optionally the
    chunks are also played back as they arrive.

    Args:
        url: WebSocket endpoint URL (authorized via BASETEN_API_KEY env var).
        text: Text to synthesize; a trailing "." is appended if it lacks
            ending punctuation.
        config: Session config dict merged into the "session.config" message.
        output_file: Path of the WAV file to write; parent dirs are created.
        simulate_stt: If True, send the text one word at a time.
        stt_delay: Delay in seconds between words when simulating STT.
        play: If True, play audio as it streams (needs sounddevice + numpy).
    """
    if play and not HAS_PLAYBACK:
        print("Error: --play requires sounddevice and numpy. pip install sounddevice numpy")
        raise SystemExit(1)

    # Make sure the output directory exists before the stream starts.
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

    async with websockets.connect(
        url,
        # Baseten deployments authenticate with an "Api-Key <key>" header.
        additional_headers={"Authorization": f"Api-Key {os.getenv('BASETEN_API_KEY')}"},
    ) as ws:
        # 1. Send session config
        config_msg = {"type": "session.config", **config}
        # t_request anchors all latency metrics (TTFA, total time, RTF).
        t_request = time.perf_counter()
        await ws.send(json.dumps(config_msg))

        # Ensure text ends with punctuation to prevent cutoff
        def ensure_ending_punctuation(t: str) -> str:
            t = t.strip()
            # Accepts both ASCII and CJK full-width sentence terminators.
            if t and t[-1] not in ".!?;:…。!?":
                return t + "."
            return t

        text_to_send = ensure_ending_punctuation(text)

        # 2. Send text (either all at once or word-by-word)
        async def send_text() -> None:
            if simulate_stt:
                words = text_to_send.split(" ")
                for i, word in enumerate(words):
                    # Re-attach the separating space except after the last word.
                    chunk = word + (" " if i < len(words) - 1 else "")
                    await ws.send(
                        json.dumps(
                            {
                                "type": "input.text",
                                "text": chunk,
                            }
                        )
                    )
                    await asyncio.sleep(stt_delay)
            else:
                await ws.send(
                    json.dumps(
                        {
                            "type": "input.text",
                            "text": text_to_send,
                        }
                    )
                )

            # 3. Signal end of input
            await ws.send(json.dumps({"type": "input.done"}))

        # Run sender and receiver concurrently
        sender_task = asyncio.create_task(send_text())

        sentence_count = 0  # local count of audio.done messages (server also reports its own total)
        ttfa: float | None = None  # time-to-first-audio, set on first binary frame
        sample_rate: int = 24000  # default until an audio.done message reports the real rate
        interrupted = False
        play_stream = None  # sd.OutputStream, created lazily on first audio chunk

        # Accumulate all PCM chunks across all sentences into a single buffer.
        all_pcm: list[bytes] = []

        try:
            while True:
                message = await ws.recv()

                # Binary frames carry raw audio; text frames carry JSON control
                # messages. Assumed PCM-16LE mono — matches the int16 playback
                # and the channels=1 WAV written below.
                if isinstance(message, bytes):
                    if ttfa is None:
                        ttfa = time.perf_counter() - t_request
                    all_pcm.append(message)
                    if play:
                        if play_stream is None:
                            # NOTE(review): the stream is opened with the
                            # default 24000 Hz; sample_rate is only updated by
                            # later audio.done messages — confirm the server
                            # always streams at 24 kHz when playback is used.
                            play_stream = sd.OutputStream(
                                samplerate=sample_rate,
                                channels=1,
                                dtype="int16",
                                blocksize=2048,
                            )
                            play_stream.start()
                        buf = np.frombuffer(message, dtype=np.int16)
                        # Blocking write runs in a thread so the event loop
                        # keeps receiving frames during playback.
                        await asyncio.to_thread(play_stream.write, buf)
                else:
                    msg = json.loads(message)
                    msg_type = msg.get("type")

                    if msg_type == "voice.registered":
                        # Voice clone cached server-side; nothing to do here.
                        pass

                    elif msg_type == "audio.start":
                        # Marks the start of a sentence's audio; no action needed.
                        pass

                    elif msg_type == "audio.done":
                        sample_rate = msg.get("sample_rate", 24000)
                        sentence_count += 1

                    elif msg_type == "session.done":
                        # Final message: stop playback, write the WAV, report stats.
                        if play_stream is not None:
                            await asyncio.to_thread(play_stream.stop)
                            play_stream.close()
                            play_stream = None
                        t_total = time.perf_counter() - t_request
                        pcm_data = b"".join(all_pcm)
                        _write_wav(
                            output_file, pcm_data, sample_rate=sample_rate, channels=1
                        )
                        # Duration in seconds: 2 bytes per 16-bit mono sample.
                        audio_duration = (
                            len(pcm_data) / (sample_rate * 2) if pcm_data else 0
                        )
                        print(
                            f"\nSession complete: {msg['total_sentences']} sentence(s) generated"
                        )
                        print(
                            f" Saved {output_file} ({len(pcm_data)} PCM bytes, {audio_duration:.2f}s)"
                        )
                        if ttfa is not None:
                            print(f" TTFA: {ttfa * 1000:.1f} ms")
                        print(f" Total time: {t_total * 1000:.1f} ms")
                        if audio_duration > 0:
                            # RTF: wall-clock time relative to audio duration.
                            print(f" RTF: {t_total / audio_duration:.2f}x")
                        break
                    elif msg_type == "error":
                        # Report server-side errors but keep the stream alive.
                        print(f" ERROR: {msg['message']}")

        except (asyncio.CancelledError, KeyboardInterrupt):
            interrupted = True

        finally:
            # Always stop the sender; awaiting it surfaces the CancelledError.
            sender_task.cancel()
            try:
                await sender_task
            except asyncio.CancelledError:
                pass

            # Best-effort teardown of the playback stream (already None if the
            # session completed normally).
            if play_stream is not None:
                try:
                    await asyncio.to_thread(play_stream.stop)
                    play_stream.close()
                except Exception:
                    pass

            # On interrupt, salvage whatever audio has arrived so far.
            if interrupted:
                t_total = time.perf_counter() - t_request
                print(f"\nInterrupted after {t_total * 1000:.0f} ms")
                pcm_data = b"".join(all_pcm)
                if pcm_data:
                    _write_wav(
                        output_file, pcm_data, sample_rate=sample_rate, channels=1
                    )
                    audio_duration = len(pcm_data) / (sample_rate * 2)
                    print(
                        f" Saved partial audio: {output_file} "
                        f"({len(pcm_data)} PCM bytes, {audio_duration:.2f}s)"
                    )
                else:
                    print(" No audio received yet.")
                await ws.close(code=1000, reason="Client interrupted")
277
278
def main():
    """Parse CLI arguments and run the streaming TTS client.

    Supports two modes:
      * ``--delete-voice NAME`` — one-shot deletion of a cached voice clone.
      * synthesis — requires ``--text``; optionally clones a voice from
        ``--ref-audio``/``--ref-text`` and caches it under ``--voice-name``.

    Raises:
        SystemExit: On argument errors (e.g. missing --text for synthesis)
            or a missing reference-audio file.
    """
    parser = argparse.ArgumentParser(description="Streaming text-input TTS client")
    parser.add_argument(
        "--url",
        default="wss://model-<model_id>.api.baseten.co/deployment/<deployment_id>/websocket",
        help="WebSocket endpoint URL",
    )
    parser.add_argument(
        "--text",
        required=False,
        help="Text to synthesize",
    )
    parser.add_argument(
        "--output",
        default="streaming_tts_output.wav",
        help="Output WAV file path (default: streaming_tts_output.wav)",
    )

    # Session config options (Base task only)
    parser.add_argument("--model", default=None, help="Model name")
    parser.add_argument("--language", default="Auto", help="Language")
    parser.add_argument(
        "--response-format",
        default="wav",
        choices=["wav", "pcm", "flac", "mp3", "aac", "opus"],
        help="Audio format",
    )
    parser.add_argument(
        "--speed", type=float, default=1.0, help="Playback speed (0.25-4.0)"
    )
    parser.add_argument("--max-new-tokens", type=int, default=None, help="Max tokens")

    # Voice cloning options (Base task)
    parser.add_argument(
        "--ref-audio",
        default=None,
        help="Path to local reference audio file for voice cloning",
    )
    parser.add_argument(
        "--ref-text",
        default=None,
        help="Transcript of reference audio (enables ICL mode). "
        "Can be inline text or a path to a .txt file.",
    )
    parser.add_argument(
        "--voice-name",
        default=None,
        help="Name for caching the voice clone server-side. "
        "On first use provide --ref-audio too; subsequent calls reuse the cache.",
    )
    parser.add_argument(
        "--x-vector-only-mode",
        action="store_true",
        default=False,
        help="Speaker embedding only mode (no ICL). Enables per-sentence "
        "streaming for cached voices, giving much lower TTFA at the cost "
        "of slightly reduced voice similarity.",
    )
    parser.add_argument(
        "--delete-voice",
        default=None,
        metavar="NAME",
        help="Delete a cached voice clone by name and exit.",
    )

    # STT simulation
    parser.add_argument(
        "--simulate-stt",
        action="store_true",
        help="Simulate STT by sending text word-by-word",
    )
    parser.add_argument(
        "--stt-delay",
        type=float,
        default=0.1,
        help="Delay between words in STT simulation (seconds)",
    )
    parser.add_argument(
        "--play",
        action="store_true",
        help="Play audio as it streams (requires: pip install sounddevice numpy)",
    )

    args = parser.parse_args()

    # Handle --delete-voice as a one-shot command
    if args.delete_voice:
        asyncio.run(_delete_voice(args.url, args.delete_voice))
        return

    # --text is only optional at parse time so --delete-voice can run alone.
    # Fail fast here instead of crashing later inside stream_tts with
    # AttributeError on None.strip().
    if not args.text:
        parser.error("--text is required unless --delete-voice is used")

    # If ref-text looks like a file path, read its contents
    if args.ref_text and os.path.isfile(args.ref_text):
        with open(args.ref_text) as f:
            args.ref_text = f.read().strip()
        print(
            f"Read ref_text from file: {args.ref_text[:80]}{'...' if len(args.ref_text) > 80 else ''}"
        )

    # Encode local reference audio as base64 data URI
    ref_audio_data_uri = None
    if args.ref_audio:
        if not os.path.isfile(args.ref_audio):
            print(f"Error: reference audio file not found: {args.ref_audio}")
            raise SystemExit(1)
        ref_audio_data_uri = _encode_audio_file(args.ref_audio)
        size_kb = os.path.getsize(args.ref_audio) / 1024
        print(f"Encoded reference audio: {args.ref_audio} ({size_kb:.1f} KB)")

    # Build session config (Base task only; only include non-None values).
    # The keys below already match argparse's attribute names (underscores).
    config: dict = {"task_type": "Base"}
    for key in [
        "model",
        "language",
        "response_format",
        "speed",
        "max_new_tokens",
        "ref_text",
        "voice_name",
    ]:
        val = getattr(args, key, None)
        if val is not None:
            config[key] = val

    if ref_audio_data_uri is not None:
        config["ref_audio"] = ref_audio_data_uri

    if args.x_vector_only_mode:
        config["x_vector_only_mode"] = True

    try:
        asyncio.run(
            stream_tts(
                url=args.url,
                text=args.text,
                config=config,
                output_file=args.output,
                simulate_stt=args.simulate_stt,
                stt_delay=args.stt_delay,
                play=args.play,
            )
        )
    except KeyboardInterrupt:
        # Ctrl-C is handled (partial audio saved) inside stream_tts.
        pass
422
423
# Standard script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
Binary WebSocket frames carry the raw audio payload, conceptually:

{
    "bytes": "pcm16 bytes"
}