text to speech
Qwen3 TTS 12Hz Base Streaming 0.6B
High-fidelity voice-cloning TTS model from the Qwen family
Model details
Example usage
First, send an initial request with a voice name that will refer to the clone. The reference audio should be around 10-20s in duration. python call.py voices add --name alex --ref-audio "https://example.com/reference.wav" --ref-text "Transcript of the reference audio."
Once this initial request has been completed, the clone is stored under the specified voice name and the reference audio/text no longer need to be passed: python call.py --text "Hello world." --voice alex --stream-audio
Input
1#!/usr/bin/env python3
2"""WebSocket client for Qwen3-TTS Base deployment on Baseten.
3
4All operations go over a single WebSocket connection:
5 1. Streaming TTS (session.config → input.text → input.done → audio)
6 2. Voice management (voice.list / voice.add / voice.remove)
7
8Set BASETEN_API_KEY in your environment before running.
9
10Examples:
11 # Streaming TTS with a built-in voice
12 python call.py --text "Hello! How are you today?"
13
14 # With a specific voice
15 python call.py --text "Hello world." --voice my_voice
16
17 # Stream PCM for lower latency
18 python call.py --text "Hello world." --voice my_voice --stream-audio
19
20 # Simulate real-time STT drip-feed
21 python call.py --text "Pack my box with five dozen liquor jugs." \
22 --simulate-stt --stt-delay 0.08
23
24 # Upload a voice (local file)
25 python call.py voices add --name my_voice \
26 --ref-audio ./reference.wav \
27 --ref-text "Transcript of the reference audio."
28
29 # Upload a voice (from URL)
30 python call.py voices add --name my_voice \
31 --ref-audio "https://example.com/reference.wav" \
32 --ref-text "Transcript of the reference audio."
33
34 # List voices
35 python call.py voices list
36
37 # Remove a voice
38 python call.py voices remove --name my_voice
39
40Requirements:
41 pip install websockets soundfile numpy requests
42"""
43
44import argparse
45import asyncio
46import base64
47import io
48import json
49import os
50import struct
51import sys
52import time
53
54import numpy as np
55import requests
56
57try:
58 import websockets
59except ImportError:
60 websockets = None
61
62try:
63 import soundfile as sf
64except ImportError:
65 sf = None
66
67WS_URL = "wss://model-wx412j6q.api.baseten.co/deployment/wgl225g/websocket"
68SAMPLE_RATE = 24000
69
70
71def _resolve_text(value: str | None) -> str | None:
72 """If *value* is a path to an existing .txt file, return its contents."""
73 if value and value.endswith(".txt") and os.path.isfile(value):
74 with open(value, "r", encoding="utf-8") as f:
75 return f.read().strip()
76 return value
77
78
79def _api_key() -> str:
80 key = os.getenv("BASETEN_API_KEY")
81 if not key:
82 sys.exit("Error: BASETEN_API_KEY environment variable is not set")
83 return key
84
85
def _auth_headers() -> dict:
    """Build the Authorization header mapping for Baseten requests."""
    return {"Authorization": f"Api-Key {_api_key()}"}
88
89
async def _ws_connect():
    """Open an authenticated WebSocket to the deployment.

    Exits with an install hint when the optional ``websockets`` dependency
    is missing (it is imported as ``None`` at module top in that case).
    """
    if websockets is None:
        sys.exit("Missing dependency: pip install websockets")
    headers = _auth_headers()
    return await websockets.connect(
        WS_URL,
        max_size=16 * 1024 * 1024,  # allow large audio frames
        additional_headers=headers,
        open_timeout=30,
    )
99
100
101# ── Voice management (over WebSocket) ───────────────────────────────────────
102
async def ws_voice_list() -> None:
    """List built-in and uploaded voices over a single WebSocket round trip.

    Sends a ``voice.list`` message and pretty-prints the ``voices`` and
    ``uploaded_voices`` arrays of the response.
    """
    ws = await _ws_connect()
    print(f"[ws] Connected to {WS_URL} ...")
    try:
        await ws.send(json.dumps({"type": "voice.list"}))
        resp = json.loads(await ws.recv())

        if resp.get("type") == "error":
            print(f"[error] {resp['message']}")
            return

        builtin = resp.get("voices", [])
        uploaded = resp.get("uploaded_voices", [])

        if builtin:
            print("Built-in voices:")
            for name in builtin:
                print(f" {name}")
        if uploaded:
            # No placeholders below, so plain strings (not f-strings) are used
            # where possible (ruff F541).
            print("\nUploaded voices:")
            print(f" {'Name':<20} {'Source':<10} {'Ref Text'}")
            print(f" {'─' * 60}")
            for v in uploaded:
                # Truncate ref_text to keep the table readable.
                print(f" {v.get('name', '?'):<20} "
                      f"{v.get('embedding_source', ''):<10} "
                      f"{v.get('ref_text', '')[:40]}")
        if not builtin and not uploaded:
            print("No voices found.")
    finally:
        await ws.close()
133
134
async def ws_voice_add(
    name: str,
    ref_audio: str,
    ref_text: str | None = None,
    consent: str = "user_consent",
) -> None:
    """Upload a reference voice over the WebSocket as base64-encoded audio.

    *ref_audio* may be a local file path or an http(s) URL; *ref_text* may be
    a transcript string or a path to a .txt file containing it.
    """
    ref_text = _resolve_text(ref_text)

    if ref_audio.startswith(("http://", "https://")):
        print(f"[voices] Downloading {ref_audio}...")
        response = requests.get(ref_audio, timeout=60)
        response.raise_for_status()
        audio_bytes = response.content
    else:
        with open(ref_audio, "rb") as audio_file:
            audio_bytes = audio_file.read()

    payload: dict = {
        "type": "voice.add",
        "name": name,
        "consent": consent,
        "audio_data": base64.b64encode(audio_bytes).decode(),
        "audio_format": "wav",
    }
    if ref_text:
        payload["ref_text"] = ref_text

    print(f"[voices] Uploading '{name}' ({len(audio_bytes):,} bytes)...")
    ws = await _ws_connect()
    try:
        await ws.send(json.dumps(payload))
        resp = json.loads(await ws.recv())

        if resp.get("type") == "error":
            print(f"[error] {resp['message']}")
        elif resp.get("success"):
            print(f"[voices] Created: {json.dumps(resp.get('voice', {}), indent=2)}")
        else:
            print(f"[error] {resp.get('error', 'unknown error')}")
    finally:
        await ws.close()
175
176
async def ws_voice_remove(name: str) -> None:
    """Delete the uploaded voice called *name* over the WebSocket."""
    ws = await _ws_connect()
    try:
        request = {"type": "voice.remove", "name": name}
        await ws.send(json.dumps(request))
        resp = json.loads(await ws.recv())

        if resp.get("type") == "error":
            print(f"[error] {resp['message']}")
            return
        if resp.get("success"):
            print(f"[voices] Removed '{name}'")
            return
        print(f"[error] {resp.get('error', f'Voice {name!r} not found')}")
    finally:
        await ws.close()
191
192
193# ── WebSocket streaming TTS ─────────────────────────────────────────────────
194
def _print_word_alignment(word_alignment: dict) -> None:
    """Print one '[start–end] word' line per word in an alignment payload."""
    words = word_alignment.get("words", [])
    starts = word_alignment.get("word_start_times_seconds", [])
    ends = word_alignment.get("word_end_times_seconds", [])
    for w, s, e in zip(words, starts, ends):
        print(f" [{s:.3f}–{e:.3f}] {w}")


async def ws_stream(
    text: str,
    config: dict,
    output: str,
    simulate_stt: bool = False,
    stt_delay: float = 0.1,
) -> None:
    """Run one streaming TTS session and save the combined audio.

    Protocol: send ``session.config``, then ``input.text`` chunk(s), then
    ``input.done``; concurrently receive binary audio frames interleaved with
    JSON control messages until ``session.done``.

    Args:
        text: Full text to synthesize.
        config: session.config payload (format, voice, timestamps, ...).
        output: Output audio path (a .wav suffix is added if needed).
        simulate_stt: If True, drip-feed the text word by word.
        stt_delay: Delay between simulated STT words, in seconds.
    """
    all_pcm: list[np.ndarray] = []
    sentence_count = 0
    first_audio_time = None
    t0 = time.perf_counter()

    # PCM mode: binary frames are raw int16 samples. Otherwise each sentence
    # arrives as a complete encoded file that must be decoded via soundfile.
    is_pcm = config.get("response_format", "wav") == "pcm" or config.get("stream_audio", False)

    print(f"[ws] Connecting to {WS_URL} ...")

    try:
        ws = await _ws_connect()
    except websockets.exceptions.InvalidStatus as e:
        # Surface handshake failure details (auth errors, wrong URL, ...).
        print(f"[error] WebSocket handshake failed: HTTP {e.response.status_code}")
        for name, value in e.response.headers.raw_items():
            print(f" {name}: {value}")
        body = getattr(e.response, "body", None)
        if body:
            print(f" body: {body.decode(errors='replace')[:500]}")
        raise
    except Exception as e:
        print(f"[error] WebSocket connection failed: {type(e).__name__}: {e}")
        raise

    print(f"[ws] Connected (protocol={ws.protocol})")

    try:
        config_msg = {"type": "session.config", **config}
        await ws.send(json.dumps(config_msg))
        print(f"[ws] task_type=Base "
              f"format={config.get('response_format', 'wav')} "
              f"stream_audio={config.get('stream_audio', False)} "
              f"split={config.get('split_granularity', 'sentence')}")

        async def send_text():
            # Runs concurrently with the receive loop below.
            if simulate_stt:
                words = text.split(" ")
                for i, word in enumerate(words):
                    chunk = word + (" " if i < len(words) - 1 else "")
                    await ws.send(json.dumps({"type": "input.text", "text": chunk}))
                    await asyncio.sleep(stt_delay)
                print(f"[input] Sent {len(words)} words (simulated STT, {stt_delay}s delay)")
            else:
                await ws.send(json.dumps({"type": "input.text", "text": text}))
                print(f"[input] Sent {len(text)} chars")
            await ws.send(json.dumps({"type": "input.done"}))

        sender = asyncio.create_task(send_text())

        total_bytes = 0
        current_wav_chunks: list[bytes] = []

        try:
            while True:
                msg = await ws.recv()

                if isinstance(msg, bytes):
                    if first_audio_time is None:
                        first_audio_time = time.perf_counter()
                    total_bytes += len(msg)
                    if is_pcm:
                        # Drop a trailing odd byte so int16 framing stays valid.
                        usable = len(msg) - (len(msg) % 2)
                        if usable > 0:
                            all_pcm.append(np.frombuffer(msg[:usable], dtype=np.int16))
                    else:
                        current_wav_chunks.append(msg)
                    continue

                data = json.loads(msg)
                mtype = data.get("type")

                if mtype == "audio.start":
                    current_wav_chunks = []
                    sentence_count += 1

                elif mtype == "audio.done":
                    if data.get("error", False):
                        # .get: sentence_index may be absent on error payloads.
                        print(f"[error] Generation failed for sentence {data.get('sentence_index', '?')}")
                    elif not is_pcm and current_wav_chunks:
                        raw = b"".join(current_wav_chunks)
                        if sf is None:
                            # soundfile is optional at import time but required
                            # to decode non-PCM sentence audio.
                            print("[error] Failed to decode sentence audio: pip install soundfile")
                        else:
                            try:
                                pcm_arr, _ = sf.read(io.BytesIO(raw))
                                if pcm_arr.ndim > 1:
                                    pcm_arr = pcm_arr[:, 0]  # keep first channel
                                all_pcm.append((np.clip(pcm_arr, -1, 1) * 32767).astype(np.int16))
                            except Exception as e:
                                print(f"[error] Failed to decode sentence audio: {e}")
                    ts_info = data.get("timestamp_info")
                    if ts_info:
                        _print_word_alignment(ts_info.get("word_alignment", {}))
                    current_wav_chunks = []

                elif mtype == "audio.timestamps":
                    _print_word_alignment(data.get("word_alignment", {}))

                elif mtype == "session.done":
                    break

                elif mtype == "error":
                    print(f"[error] {data['message']}")

        finally:
            # Stop the drip-feed task if the receive loop exits first.
            sender.cancel()
            try:
                await sender
            except asyncio.CancelledError:
                pass
    finally:
        await ws.close()

    elapsed = time.perf_counter() - t0
    ttfa = (first_audio_time - t0) if first_audio_time else None
    _save_audio(all_pcm, output, sentence_count, total_bytes, elapsed, ttfa)
324
325
326# ── Helpers ──────────────────────────────────────────────────────────────────
327
def _save_audio(
    all_pcm: list,
    output: str,
    sentence_count: int,
    total_bytes: int,
    elapsed: float,
    ttfa: float | None,
) -> None:
    """Concatenate PCM chunks, write them to *output*, and print a summary."""
    if not all_pcm:
        print("\nNo audio received.")
        return

    combined = np.concatenate(all_pcm)
    audio_duration = len(combined) / SAMPLE_RATE

    # Default to WAV when no recognised audio extension was given.
    if not output.lower().endswith((".wav", ".flac", ".mp3", ".ogg")):
        output += ".wav"

    if sf is None:
        # Fallback: minimal hand-rolled WAV writer.
        _write_wav(output, combined)
    else:
        sf.write(output, combined.astype(np.float32) / 32767.0, SAMPLE_RATE)

    divider = "─" * 50
    print(f"\n{divider}")
    print(f" Output: {output}")
    print(f" Size: {total_bytes:,} bytes")
    print(f" Duration: {audio_duration:.2f}s")
    print(f" Wall time: {elapsed:.2f}s")
    if ttfa is not None:
        print(f" TTFA: {ttfa * 1000:.0f}ms")
    if audio_duration > 0:
        rtf = elapsed / audio_duration
        print(f" RTF: {rtf:.3f}x ({1/rtf:.1f}x realtime)")
    print(f"{divider}")
362
363
364def _write_wav(path: str, pcm: np.ndarray) -> None:
365 data = pcm.astype(np.int16).tobytes()
366 with open(path, "wb") as f:
367 f.write(b"RIFF")
368 f.write(struct.pack("<I", 36 + len(data)))
369 f.write(b"WAVE")
370 f.write(b"fmt ")
371 f.write(struct.pack("<IHHIIHH", 16, 1, 1, SAMPLE_RATE,
372 SAMPLE_RATE * 2, 2, 16))
373 f.write(b"data")
374 f.write(struct.pack("<I", len(data)))
375 f.write(data)
376
377
378# ── CLI ──────────────────────────────────────────────────────────────────────
379
def main():
    """Parse CLI arguments and dispatch to streaming TTS or voice management."""
    parser = argparse.ArgumentParser(
        description="WebSocket client for Qwen3-TTS Base on Baseten",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="mode")

    # ── TTS args (top-level) ─────────────────────────────────────
    parser.add_argument("--text", help="Text to synthesize")
    parser.add_argument("--output", "-o", default="output.wav")
    parser.add_argument("--voice", default=None, help="Speaker name")
    parser.add_argument("--response-format", default="wav",
                        choices=["wav", "pcm", "flac", "mp3", "aac", "opus"])
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--stream-audio", action="store_true",
                        help="Progressive PCM streaming (lower latency)")
    parser.add_argument("--split-granularity", default="sentence",
                        choices=["sentence", "clause"])
    parser.add_argument("--language", default=None)
    parser.add_argument("--ref-audio", default=None,
                        help="Reference audio URL for voice cloning")
    parser.add_argument("--ref-text", default=None,
                        help="Reference audio transcript (or path to a .txt file)")
    parser.add_argument("--initial-codec-chunk-frames", type=int, default=None,
                        help="Initial chunk size override (larger = better quality, higher TTFA)")
    parser.add_argument("--x-vector-only", action="store_true",
                        help="Use speaker embedding only, skip in-context learning")
    parser.add_argument("--timestamps", default=None,
                        choices=["sync", "async"],
                        help="Enable word-level timestamps (sync: in audio.done, async: separate messages)")
    parser.add_argument("--simulate-stt", action="store_true")
    parser.add_argument("--stt-delay", type=float, default=0.1)

    # ── voices ───────────────────────────────────────────────────
    voices = subparsers.add_parser("voices", help="Voice management (over WebSocket)")
    voices_sub = voices.add_subparsers(dest="action", required=True)

    voices_sub.add_parser("list", help="List voices")

    add_cmd = voices_sub.add_parser("add", help="Upload a voice (base64 audio)")
    add_cmd.add_argument("--name", required=True)
    add_cmd.add_argument("--ref-audio", required=True,
                         help="Local WAV file or URL to reference audio")
    add_cmd.add_argument("--ref-text", default=None,
                         help="Transcript of the audio, or path to a .txt file")
    add_cmd.add_argument("--consent", default="user_consent")

    remove_cmd = voices_sub.add_parser("remove", help="Remove a voice")
    remove_cmd.add_argument("--name", required=True)

    args = parser.parse_args()

    if args.mode == "voices":
        # Each voice subcommand is a single request/response session.
        if args.action == "list":
            asyncio.run(ws_voice_list())
        elif args.action == "add":
            asyncio.run(ws_voice_add(
                args.name,
                args.ref_audio,
                ref_text=args.ref_text,
                consent=args.consent,
            ))
        elif args.action == "remove":
            asyncio.run(ws_voice_remove(args.name))
        return

    if not args.text:
        parser.error("--text is required for TTS")

    # Build the session.config payload. Insertion order here is the key order
    # of the JSON sent to the server.
    config: dict = {
        "task_type": "Base",
        "response_format": args.response_format,
        "speed": args.speed,
        "split_granularity": args.split_granularity,
    }
    if args.stream_audio:
        # Streaming implies raw PCM regardless of --response-format.
        config["stream_audio"] = True
        config["response_format"] = "pcm"
    if args.initial_codec_chunk_frames is not None:
        config["initial_codec_chunk_frames"] = args.initial_codec_chunk_frames
    if args.x_vector_only:
        config["x_vector_only_mode"] = True
    if args.timestamps:
        config["timestamp_type"] = "word"
        config["timestamp_transport_strategy"] = args.timestamps
    if args.voice:
        config["voice"] = args.voice
    if args.language:
        config["language"] = args.language
    if args.ref_audio:
        config["ref_audio"] = args.ref_audio
    ref_text = _resolve_text(args.ref_text)
    if ref_text:
        config["ref_text"] = ref_text

    asyncio.run(ws_stream(
        text=args.text,
        config=config,
        output=args.output,
        simulate_stt=args.simulate_stt,
        stt_delay=args.stt_delay,
    ))
481
482
483if __name__ == "__main__":
    main()

JSON output
1{
2 "bytes": "pcm16 bytes"
3}