text to speech
Qwen3 TTS 12Hz Base Streaming 0.6B
High-fidelity voice-cloning TTS model from the Qwen family
Model details
Example usage
First, send an initial request with a voice name that will refer to the clone. The reference audio should be around 10-20s in duration. python call.py voices add --name alex --ref-audio "https://example.com/reference.wav" --ref-text "Transcript of the reference audio."
Once this initial request has been completed, the clone is stored under the specified voice name and the reference audio/text no longer need to be passed: python call.py --text "Hello world." --voice alex --stream-audio
Input
1#!/usr/bin/env python3
2"""WebSocket client for Qwen3-TTS Base deployment on Baseten.
3
4All operations go over a single WebSocket connection:
5 1. Streaming TTS (session.config → input.text → input.done → audio)
6 2. Voice management (voice.list / voice.add / voice.remove)
7
8Set BASETEN_API_KEY in your environment before running.
9
10Examples:
11 # Streaming TTS with a built-in voice
12 python call.py --text "Hello! How are you today?"
13
14 # With a specific voice
15 python call.py --text "Hello world." --voice my_voice
16
17 # Stream PCM for lower latency
18 python call.py --text "Hello world." --voice my_voice --stream-audio
19
20 # Simulate real-time STT drip-feed
21 python call.py --text "Pack my box with five dozen liquor jugs." \
22 --simulate-stt --stt-delay 0.08
23
24 # Upload a voice (local file)
25 python call.py voices add --name my_voice \
26 --ref-audio ./reference.wav \
27 --ref-text "Transcript of the reference audio."
28
29 # Upload a voice (from URL)
30 python call.py voices add --name my_voice \
31 --ref-audio "https://example.com/reference.wav" \
32 --ref-text "Transcript of the reference audio."
33
34 # List voices
35 python call.py voices list
36
37 # Remove a voice
38 python call.py voices remove --name my_voice
39
40Requirements:
41 pip install websockets soundfile numpy requests
42"""
43
44import argparse
45import asyncio
46import base64
47import io
48import json
49import os
50import struct
51import sys
52import time
53
54import numpy as np
55import requests
56
57try:
58 import websockets
59except ImportError:
60 websockets = None
61
62try:
63 import soundfile as sf
64except ImportError:
65 sf = None
66
67WS_URL = "wss://model-wx412j6q.api.baseten.co/deployment/wgl225g/websocket"
68SAMPLE_RATE = 24000
69
70
71def _resolve_text(value: str | None) -> str | None:
72 """If *value* is a path to an existing .txt file, return its contents."""
73 if value and value.endswith(".txt") and os.path.isfile(value):
74 with open(value, "r", encoding="utf-8") as f:
75 return f.read().strip()
76 return value
77
78
79def _api_key() -> str:
80 key = os.getenv("BASETEN_API_KEY")
81 if not key:
82 sys.exit("Error: BASETEN_API_KEY environment variable is not set")
83 return key
84
85
def _auth_headers() -> dict:
    """Build the Authorization header mapping for Baseten requests."""
    return {"Authorization": f"Api-Key {_api_key()}"}
88
89
async def _ws_connect():
    """Open an authenticated WebSocket to the deployment.

    Exits with an install hint when the optional ``websockets`` dependency
    is missing (it is imported as ``None`` at module top in that case).
    """
    if websockets is None:
        sys.exit("Missing dependency: pip install websockets")
    headers = _auth_headers()
    return await websockets.connect(
        WS_URL,
        max_size=16 * 1024 * 1024,  # allow large audio frames
        additional_headers=headers,
        open_timeout=30,
    )
99
100
101# ── Voice management (over WebSocket) ───────────────────────────────────────
102
async def ws_voice_list() -> None:
    """List built-in and uploaded voices over a single WebSocket round trip.

    Sends a ``voice.list`` message and pretty-prints the ``voices`` and
    ``uploaded_voices`` arrays of the response.
    """
    ws = await _ws_connect()
    print(f"[ws] Connected to {WS_URL} ...")
    try:
        await ws.send(json.dumps({"type": "voice.list"}))
        resp = json.loads(await ws.recv())

        if resp.get("type") == "error":
            print(f"[error] {resp['message']}")
            return

        builtin = resp.get("voices", [])
        uploaded = resp.get("uploaded_voices", [])

        if builtin:
            print("Built-in voices:")
            for name in builtin:
                print(f" {name}")
        if uploaded:
            # No placeholders below, so plain strings (not f-strings) are used
            # where possible (ruff F541).
            print("\nUploaded voices:")
            print(f" {'Name':<20} {'Source':<10} {'Ref Text'}")
            print(f" {'─' * 60}")
            for v in uploaded:
                # Truncate ref_text to keep the table readable.
                print(f" {v.get('name', '?'):<20} "
                      f"{v.get('embedding_source', ''):<10} "
                      f"{v.get('ref_text', '')[:40]}")
        if not builtin and not uploaded:
            print("No voices found.")
    finally:
        await ws.close()
133
134
async def ws_voice_add(
    name: str,
    ref_audio: str,
    ref_text: str | None = None,
    consent: str = "user_consent",
) -> None:
    """Upload a reference voice over the WebSocket as base64-encoded audio.

    *ref_audio* may be a local file path or an http(s) URL; *ref_text* may be
    a transcript string or a path to a .txt file containing it.
    """
    ref_text = _resolve_text(ref_text)

    if ref_audio.startswith(("http://", "https://")):
        print(f"[voices] Downloading {ref_audio}...")
        response = requests.get(ref_audio, timeout=60)
        response.raise_for_status()
        audio_bytes = response.content
    else:
        with open(ref_audio, "rb") as audio_file:
            audio_bytes = audio_file.read()

    payload: dict = {
        "type": "voice.add",
        "name": name,
        "consent": consent,
        "audio_data": base64.b64encode(audio_bytes).decode(),
        "audio_format": "wav",
    }
    if ref_text:
        payload["ref_text"] = ref_text

    print(f"[voices] Uploading '{name}' ({len(audio_bytes):,} bytes)...")
    ws = await _ws_connect()
    try:
        await ws.send(json.dumps(payload))
        resp = json.loads(await ws.recv())

        if resp.get("type") == "error":
            print(f"[error] {resp['message']}")
        elif resp.get("success"):
            print(f"[voices] Created: {json.dumps(resp.get('voice', {}), indent=2)}")
        else:
            print(f"[error] {resp.get('error', 'unknown error')}")
    finally:
        await ws.close()
175
176
async def ws_voice_remove(name: str) -> None:
    """Delete the uploaded voice called *name* over the WebSocket."""
    ws = await _ws_connect()
    try:
        request = {"type": "voice.remove", "name": name}
        await ws.send(json.dumps(request))
        resp = json.loads(await ws.recv())

        if resp.get("type") == "error":
            print(f"[error] {resp['message']}")
            return
        if resp.get("success"):
            print(f"[voices] Removed '{name}'")
            return
        print(f"[error] {resp.get('error', f'Voice {name!r} not found')}")
    finally:
        await ws.close()
191
192
193# ── WebSocket streaming TTS ─────────────────────────────────────────────────
194
def _print_word_alignment(word_alignment: dict) -> None:
    """Print one '[start–end] word' line per word in an alignment payload."""
    words = word_alignment.get("words", [])
    starts = word_alignment.get("word_start_times_seconds", [])
    ends = word_alignment.get("word_end_times_seconds", [])
    for w, s, e in zip(words, starts, ends):
        print(f" [{s:.3f}–{e:.3f}] {w}")


async def ws_stream(
    text: str,
    config: dict,
    output: str,
    simulate_stt: bool = False,
    stt_delay: float = 0.1,
) -> None:
    """Run one streaming TTS session and save the combined audio.

    Protocol: send ``session.config``, then ``input.text`` chunk(s), then
    ``input.done``; concurrently receive binary audio frames interleaved with
    JSON control messages until ``session.done``.

    Args:
        text: Full text to synthesize.
        config: session.config payload (format, voice, timestamps, ...).
        output: Output audio path (a .wav suffix is added if needed).
        simulate_stt: If True, drip-feed the text word by word.
        stt_delay: Delay between simulated STT words, in seconds.
    """
    all_pcm: list[np.ndarray] = []
    sentence_count = 0
    first_audio_time = None
    t0 = time.perf_counter()

    # PCM mode: binary frames are raw int16 samples. Otherwise each sentence
    # arrives as a complete encoded file that must be decoded via soundfile.
    is_pcm = config.get("response_format", "wav") == "pcm" or config.get("stream_audio", False)

    print(f"[ws] Connecting to {WS_URL} ...")

    try:
        ws = await _ws_connect()
    except websockets.exceptions.InvalidStatus as e:
        # Surface handshake failure details (auth errors, wrong URL, ...).
        print(f"[error] WebSocket handshake failed: HTTP {e.response.status_code}")
        for name, value in e.response.headers.raw_items():
            print(f" {name}: {value}")
        body = getattr(e.response, "body", None)
        if body:
            print(f" body: {body.decode(errors='replace')[:500]}")
        raise
    except Exception as e:
        print(f"[error] WebSocket connection failed: {type(e).__name__}: {e}")
        raise

    print(f"[ws] Connected (protocol={ws.protocol})")

    try:
        config_msg = {"type": "session.config", **config}
        await ws.send(json.dumps(config_msg))
        print(f"[ws] task_type=Base "
              f"format={config.get('response_format', 'wav')} "
              f"stream_audio={config.get('stream_audio', False)} "
              f"split={config.get('split_granularity', 'sentence')}")

        async def send_text():
            # Runs concurrently with the receive loop below.
            if simulate_stt:
                words = text.split(" ")
                for i, word in enumerate(words):
                    chunk = word + (" " if i < len(words) - 1 else "")
                    await ws.send(json.dumps({"type": "input.text", "text": chunk}))
                    await asyncio.sleep(stt_delay)
                print(f"[input] Sent {len(words)} words (simulated STT, {stt_delay}s delay)")
            else:
                await ws.send(json.dumps({"type": "input.text", "text": text}))
                print(f"[input] Sent {len(text)} chars")
            await ws.send(json.dumps({"type": "input.done"}))

        sender = asyncio.create_task(send_text())

        total_bytes = 0
        current_wav_chunks: list[bytes] = []

        try:
            while True:
                msg = await ws.recv()

                if isinstance(msg, bytes):
                    if first_audio_time is None:
                        first_audio_time = time.perf_counter()
                    total_bytes += len(msg)
                    if is_pcm:
                        # Drop a trailing odd byte so int16 framing stays valid.
                        usable = len(msg) - (len(msg) % 2)
                        if usable > 0:
                            all_pcm.append(np.frombuffer(msg[:usable], dtype=np.int16))
                    else:
                        current_wav_chunks.append(msg)
                    continue

                data = json.loads(msg)
                mtype = data.get("type")

                if mtype == "audio.start":
                    current_wav_chunks = []
                    sentence_count += 1

                elif mtype == "audio.done":
                    if data.get("error", False):
                        # .get: sentence_index may be absent on error payloads.
                        print(f"[error] Generation failed for sentence {data.get('sentence_index', '?')}")
                    elif not is_pcm and current_wav_chunks:
                        raw = b"".join(current_wav_chunks)
                        if sf is None:
                            # soundfile is optional at import time but required
                            # to decode non-PCM sentence audio.
                            print("[error] Failed to decode sentence audio: pip install soundfile")
                        else:
                            try:
                                pcm_arr, _ = sf.read(io.BytesIO(raw))
                                if pcm_arr.ndim > 1:
                                    pcm_arr = pcm_arr[:, 0]  # keep first channel
                                all_pcm.append((np.clip(pcm_arr, -1, 1) * 32767).astype(np.int16))
                            except Exception as e:
                                print(f"[error] Failed to decode sentence audio: {e}")
                    ts_info = data.get("timestamp_info")
                    if ts_info:
                        _print_word_alignment(ts_info.get("word_alignment", {}))
                    current_wav_chunks = []

                elif mtype == "audio.timestamps":
                    _print_word_alignment(data.get("word_alignment", {}))

                elif mtype == "session.done":
                    break

                elif mtype == "error":
                    print(f"[error] {data['message']}")

        finally:
            # Stop the drip-feed task if the receive loop exits first.
            sender.cancel()
            try:
                await sender
            except asyncio.CancelledError:
                pass
    finally:
        await ws.close()

    elapsed = time.perf_counter() - t0
    ttfa = (first_audio_time - t0) if first_audio_time else None
    _save_audio(all_pcm, output, sentence_count, total_bytes, elapsed, ttfa)
324
325
326# ── Helpers ──────────────────────────────────────────────────────────────────
327
def _save_audio(
    all_pcm: list,
    output: str,
    sentence_count: int,
    total_bytes: int,
    elapsed: float,
    ttfa: float | None,
) -> None:
    """Concatenate PCM chunks, write them to *output*, and print a summary."""
    if not all_pcm:
        print("\nNo audio received.")
        return

    combined = np.concatenate(all_pcm)
    audio_duration = len(combined) / SAMPLE_RATE

    # Default to WAV when no recognised audio extension was given.
    if not output.lower().endswith((".wav", ".flac", ".mp3", ".ogg")):
        output += ".wav"

    if sf is None:
        # Fallback: minimal hand-rolled WAV writer.
        _write_wav(output, combined)
    else:
        sf.write(output, combined.astype(np.float32) / 32767.0, SAMPLE_RATE)

    divider = "─" * 50
    print(f"\n{divider}")
    print(f" Output: {output}")
    print(f" Size: {total_bytes:,} bytes")
    print(f" Duration: {audio_duration:.2f}s")
    print(f" Wall time: {elapsed:.2f}s")
    if ttfa is not None:
        print(f" TTFA: {ttfa * 1000:.0f}ms")
    if audio_duration > 0:
        rtf = elapsed / audio_duration
        print(f" RTF: {rtf:.3f}x ({1/rtf:.1f}x realtime)")
    print(f"{divider}")
362
363
364def _write_wav(path: str, pcm: np.ndarray) -> None:
365 data = pcm.astype(np.int16).tobytes()
366 with open(path, "wb") as f:
367 f.write(b"RIFF")
368 f.write(struct.pack("<I", 36 + len(data)))
369 f.write(b"WAVE")
370 f.write(b"fmt ")
371 f.write(struct.pack("<IHHIIHH", 16, 1, 1, SAMPLE_RATE,
372 SAMPLE_RATE * 2, 2, 16))
373 f.write(b"data")
374 f.write(struct.pack("<I", len(data)))
375 f.write(data)
376
377
378# ── CLI ──────────────────────────────────────────────────────────────────────
379
def main():
    """Parse CLI arguments and dispatch to streaming TTS or voice management."""
    parser = argparse.ArgumentParser(
        description="WebSocket client for Qwen3-TTS Base on Baseten",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="mode")

    # ── TTS args (top-level) ─────────────────────────────────────
    parser.add_argument("--text", help="Text to synthesize")
    parser.add_argument("--output", "-o", default="output.wav")
    parser.add_argument("--voice", default=None, help="Speaker name")
    parser.add_argument("--response-format", default="wav",
                        choices=["wav", "pcm", "flac", "mp3", "aac", "opus"])
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--stream-audio", action="store_true",
                        help="Progressive PCM streaming (lower latency)")
    parser.add_argument("--split-granularity", default="sentence",
                        choices=["sentence", "clause"])
    parser.add_argument("--language", default=None)
    parser.add_argument("--ref-audio", default=None,
                        help="Reference audio URL for voice cloning")
    parser.add_argument("--ref-text", default=None,
                        help="Reference audio transcript (or path to a .txt file)")
    parser.add_argument("--initial-codec-chunk-frames", type=int, default=None,
                        help="Initial chunk size override (larger = better quality, higher TTFA)")
    parser.add_argument("--x-vector-only", action="store_true",
                        help="Use speaker embedding only, skip in-context learning")
    parser.add_argument("--timestamps", default=None,
                        choices=["sync", "async"],
                        help="Enable word-level timestamps (sync: in audio.done, async: separate messages)")
    parser.add_argument("--simulate-stt", action="store_true")
    parser.add_argument("--stt-delay", type=float, default=0.1)

    # ── voices ───────────────────────────────────────────────────
    voices = subparsers.add_parser("voices", help="Voice management (over WebSocket)")
    voices_sub = voices.add_subparsers(dest="action", required=True)

    voices_sub.add_parser("list", help="List voices")

    add_cmd = voices_sub.add_parser("add", help="Upload a voice (base64 audio)")
    add_cmd.add_argument("--name", required=True)
    add_cmd.add_argument("--ref-audio", required=True,
                         help="Local WAV file or URL to reference audio")
    add_cmd.add_argument("--ref-text", default=None,
                         help="Transcript of the audio, or path to a .txt file")
    add_cmd.add_argument("--consent", default="user_consent")

    remove_cmd = voices_sub.add_parser("remove", help="Remove a voice")
    remove_cmd.add_argument("--name", required=True)

    args = parser.parse_args()

    if args.mode == "voices":
        # Each voice subcommand is a single request/response session.
        if args.action == "list":
            asyncio.run(ws_voice_list())
        elif args.action == "add":
            asyncio.run(ws_voice_add(
                args.name,
                args.ref_audio,
                ref_text=args.ref_text,
                consent=args.consent,
            ))
        elif args.action == "remove":
            asyncio.run(ws_voice_remove(args.name))
        return

    if not args.text:
        parser.error("--text is required for TTS")

    # Build the session.config payload. Insertion order here is the key order
    # of the JSON sent to the server.
    config: dict = {
        "task_type": "Base",
        "response_format": args.response_format,
        "speed": args.speed,
        "split_granularity": args.split_granularity,
    }
    if args.stream_audio:
        # Streaming implies raw PCM regardless of --response-format.
        config["stream_audio"] = True
        config["response_format"] = "pcm"
    if args.initial_codec_chunk_frames is not None:
        config["initial_codec_chunk_frames"] = args.initial_codec_chunk_frames
    if args.x_vector_only:
        config["x_vector_only_mode"] = True
    if args.timestamps:
        config["timestamp_type"] = "word"
        config["timestamp_transport_strategy"] = args.timestamps
    if args.voice:
        config["voice"] = args.voice
    if args.language:
        config["language"] = args.language
    if args.ref_audio:
        config["ref_audio"] = args.ref_audio
    ref_text = _resolve_text(args.ref_text)
    if ref_text:
        config["ref_text"] = ref_text

    asyncio.run(ws_stream(
        text=args.text,
        config=config,
        output=args.output,
        simulate_stt=args.simulate_stt,
        stt_delay=args.stt_delay,
    ))
481
482
483if __name__ == "__main__":
    main()

JSON output
1{
2 "bytes": "pcm16 bytes"
3}