"Inference Engineering" is now available. Get your copy here
text to speech

Qwen LogoQwen3 TTS 12Hz Base Streaming 0.6B

High-fidelity voice-cloning TTS model from the Qwen family

Model details

Example usage

First, send an initial request with a voice name that will identify the clone. The reference audio should be around 10-20 s in duration.
python call.py --text "Hey my name is Alex, I'm your helpful assistant" --ref-audio alex.m4a --ref-text transcript.txt --voice-name Alex

Once this initial request has completed, the clone is stored under the specified voice name, and the reference audio/text no longer needs to be passed.
python call.py --text "Hey my name is Alex, I'm your helpful assistant" --voice-name Alex

Input

1"""WebSocket client for streaming text-input TTS with Base (voice cloning) only.
2
3Connects to the /v1/audio/speech/stream endpoint, sends text incrementally
4(simulating real-time STT output), and saves a single audio file per stream.
5
6Voice cloning (Base task):
7    Provide --ref-audio (local file) and --ref-text to clone a voice.
8    Set --voice-name to cache the clone server-side so subsequent sessions
9    skip the expensive embedding extraction.
10
11Usage:
12    # Voice cloning (first time: uploads + caches)
13    python call.py \
14        --text "Hello world. How are you?" \
15        --ref-audio /path/to/reference.wav \
16        --ref-text /path/to/transcript.txt \
17        --voice-name my_voice
18
19    # Voice cloning (subsequent: uses cached voice, no ref-audio needed)
20    python call.py \
21        --text "Hello world. How are you?" \
22        --voice-name my_voice
23
24    # Simulate STT: send text word-by-word with delay
25    python call.py \
26        --text "Hello world. How are you? I am fine." \
27        --voice-name my_voice \
28        --simulate-stt --stt-delay 0.1
29
30    # Play audio as it streams (requires: pip install sounddevice numpy)
31    python call.py --text "Hello world." --voice-name my_voice --play
32
33Requirements:
34    pip install websockets
35    pip install sounddevice numpy  # optional, for --play
36"""
37
38import argparse
39import asyncio
40import base64
41import json
42import mimetypes
43import os
44import time
45import wave
46
47try:
48    import websockets
49except ImportError:
50    print("Please install websockets: pip install websockets")
51    raise SystemExit(1)
52
53# Optional: for --play (streaming playback)
54try:
55    import numpy as np
56    import sounddevice as sd
57
58    HAS_PLAYBACK = True
59except ImportError:
60    HAS_PLAYBACK = False
61
62
63def _write_wav(path: str, pcm_data: bytes, sample_rate: int, channels: int) -> None:
64    """Write raw PCM-16LE bytes to a WAV file."""
65    with wave.open(path, "wb") as wf:
66        wf.setnchannels(channels)
67        wf.setsampwidth(2)  # 16-bit = 2 bytes per sample
68        wf.setframerate(sample_rate)
69        wf.writeframes(pcm_data)
70
71
72def _encode_audio_file(path: str) -> str:
73    """Read a local audio file and return a base64 data URI."""
74    mime_type, _ = mimetypes.guess_type(path)
75    if mime_type is None:
76        ext = os.path.splitext(path)[1].lower()
77        mime_map = {
78            ".wav": "audio/wav",
79            ".mp3": "audio/mpeg",
80            ".m4a": "audio/mp4",
81            ".flac": "audio/flac",
82            ".ogg": "audio/ogg",
83            ".aac": "audio/aac",
84            ".webm": "audio/webm",
85        }
86        mime_type = mime_map.get(ext, "audio/wav")
87
88    with open(path, "rb") as f:
89        audio_bytes = f.read()
90
91    b64 = base64.b64encode(audio_bytes).decode("utf-8")
92    return f"data:{mime_type};base64,{b64}"
93
94
async def _delete_voice(url: str, voice_name: str) -> None:
    """Ask the server to drop the cached voice clone *voice_name*.

    Opens a short-lived WebSocket session, issues a single ``voice.delete``
    command, and prints the outcome of the one reply message.
    """
    async with websockets.connect(url) as ws:
        request = {"type": "voice.delete", "voice_name": voice_name}
        await ws.send(json.dumps(request))
        reply = json.loads(await ws.recv())
        reply_type = reply.get("type")
        if reply_type == "voice.deleted":
            print(f"Voice '{voice_name}' deleted successfully.")
        elif reply_type == "error":
            print(f"Error: {reply.get('message')}")
        else:
            print(f"Unexpected response: {reply}")
107
108
async def stream_tts(
    url: str,
    text: str,
    config: dict,
    output_file: str,
    simulate_stt: bool = False,
    stt_delay: float = 0.1,
    play: bool = False,
) -> None:
    """Connect to the streaming TTS endpoint and process audio responses.

    Sends a ``session.config`` message, then the text (all at once, or
    word-by-word when *simulate_stt* is set), then ``input.done``.  Binary
    WebSocket frames are collected as raw PCM chunks; JSON frames carry
    control messages (``audio.start``/``audio.done``/``session.done``/
    ``error``).  All PCM is concatenated and written to *output_file* as a
    single mono 16-bit WAV.

    Args:
        url: WebSocket endpoint URL.
        text: Text to synthesize; a period is appended if it does not end
            with sentence punctuation.
        config: Extra session fields merged into the ``session.config``
            message (task type, voice name, reference audio, ...).
        output_file: Path of the WAV file to write.
        simulate_stt: Send the text word-by-word to mimic live STT output.
        stt_delay: Seconds to sleep between words when *simulate_stt*.
        play: Play chunks as they arrive via sounddevice (optional deps).

    Raises:
        SystemExit: If *play* is requested but sounddevice/numpy are missing.
    """
    if play and not HAS_PLAYBACK:
        print("Error: --play requires sounddevice and numpy. pip install sounddevice numpy")
        raise SystemExit(1)

    # Make sure the output directory exists before the session starts.
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

    async with websockets.connect(
        url,
        additional_headers={"Authorization": f"Api-Key {os.getenv('BASETEN_API_KEY')}"},
    ) as ws:
        # 1. Send session config
        config_msg = {"type": "session.config", **config}
        t_request = time.perf_counter()  # reference point for TTFA / total time
        await ws.send(json.dumps(config_msg))

        # Ensure text ends with punctuation to prevent cutoff
        def ensure_ending_punctuation(t: str) -> str:
            t = t.strip()
            if t and t[-1] not in ".!?;:…。!?":
                return t + "."
            return t

        text_to_send = ensure_ending_punctuation(text)

        # 2. Send text (either all at once or word-by-word)
        async def send_text():
            if simulate_stt:
                words = text_to_send.split(" ")
                for i, word in enumerate(words):
                    # Re-attach the separating space except after the last word.
                    chunk = word + (" " if i < len(words) - 1 else "")
                    await ws.send(
                        json.dumps(
                            {
                                "type": "input.text",
                                "text": chunk,
                            }
                        )
                    )
                    await asyncio.sleep(stt_delay)
            else:
                await ws.send(
                    json.dumps(
                        {
                            "type": "input.text",
                            "text": text_to_send,
                        }
                    )
                )

            # 3. Signal end of input
            await ws.send(json.dumps({"type": "input.done"}))

        # Run sender and receiver concurrently
        sender_task = asyncio.create_task(send_text())

        sentence_count = 0  # sentences completed, per audio.done messages
        ttfa: float | None = None  # time-to-first-audio, set on first binary frame
        sample_rate: int = 24000  # default until audio.done reports the real rate
        interrupted = False
        play_stream = None  # lazily-created sounddevice OutputStream when --play

        # Accumulate all PCM chunks across all sentences into a single buffer.
        all_pcm: list[bytes] = []

        try:
            while True:
                message = await ws.recv()

                if isinstance(message, bytes):
                    # Binary frame: raw PCM chunk (treated as 16-bit mono,
                    # matching how it is played and written out below).
                    if ttfa is None:
                        ttfa = time.perf_counter() - t_request
                    all_pcm.append(message)
                    if play:
                        if play_stream is None:
                            play_stream = sd.OutputStream(
                                samplerate=sample_rate,
                                channels=1,
                                dtype="int16",
                                blocksize=2048,
                            )
                            play_stream.start()
                        buf = np.frombuffer(message, dtype=np.int16)
                        # write() blocks until the device consumes the chunk,
                        # so run it in a thread to keep the event loop free.
                        await asyncio.to_thread(play_stream.write, buf)
                else:
                    # Text frame: JSON control message.
                    msg = json.loads(message)
                    msg_type = msg.get("type")

                    if msg_type == "voice.registered":
                        pass

                    elif msg_type == "audio.start":
                        pass

                    elif msg_type == "audio.done":
                        sample_rate = msg.get("sample_rate", 24000)
                        sentence_count += 1

                    elif msg_type == "session.done":
                        # Final message: stop playback, write the WAV, report stats.
                        if play_stream is not None:
                            await asyncio.to_thread(play_stream.stop)
                            play_stream.close()
                            play_stream = None
                        t_total = time.perf_counter() - t_request
                        pcm_data = b"".join(all_pcm)
                        _write_wav(
                            output_file, pcm_data, sample_rate=sample_rate, channels=1
                        )
                        # 2 bytes per 16-bit mono sample.
                        audio_duration = (
                            len(pcm_data) / (sample_rate * 2) if pcm_data else 0
                        )
                        print(
                            f"\nSession complete: {msg['total_sentences']} sentence(s) generated"
                        )
                        print(
                            f"  Saved {output_file} ({len(pcm_data)} PCM bytes, {audio_duration:.2f}s)"
                        )
                        if ttfa is not None:
                            print(f"  TTFA:       {ttfa * 1000:.1f} ms")
                        print(f"  Total time: {t_total * 1000:.1f} ms")
                        if audio_duration > 0:
                            print(f"  RTF:        {t_total / audio_duration:.2f}x")
                        break
                    elif msg_type == "error":
                        # NOTE(review): errors are printed but the loop keeps
                        # waiting — presumably the server still sends
                        # session.done (or closes the socket); confirm against
                        # the server implementation.
                        print(f"  ERROR: {msg['message']}")

        except (asyncio.CancelledError, KeyboardInterrupt):
            interrupted = True

        finally:
            # Always cancel the sender; it may still be sleeping in STT mode.
            sender_task.cancel()
            try:
                await sender_task
            except asyncio.CancelledError:
                pass

            if play_stream is not None:
                try:
                    await asyncio.to_thread(play_stream.stop)
                    play_stream.close()
                except Exception:
                    pass

            if interrupted:
                # Best effort: salvage whatever audio arrived before the interrupt.
                t_total = time.perf_counter() - t_request
                print(f"\nInterrupted after {t_total * 1000:.0f} ms")
                pcm_data = b"".join(all_pcm)
                if pcm_data:
                    _write_wav(
                        output_file, pcm_data, sample_rate=sample_rate, channels=1
                    )
                    audio_duration = len(pcm_data) / (sample_rate * 2)
                    print(
                        f"  Saved partial audio: {output_file} "
                        f"({len(pcm_data)} PCM bytes, {audio_duration:.2f}s)"
                    )
                else:
                    print("  No audio received yet.")
                await ws.close(code=1000, reason="Client interrupted")
277
278
def main():
    """Parse CLI arguments, build the session config, and run the client.

    Handles three modes: one-shot voice deletion (``--delete-voice``),
    voice-cloning synthesis (``--ref-audio``/``--ref-text``/``--voice-name``),
    and synthesis from a previously cached voice.
    """
    parser = argparse.ArgumentParser(description="Streaming text-input TTS client")
    parser.add_argument(
        "--url",
        default="wss://model-<model_id>.api.baseten.co/deployment/<deployment_id>/websocket",
        help="WebSocket endpoint URL",
    )
    parser.add_argument(
        "--text",
        required=False,
        help="Text to synthesize",
    )
    parser.add_argument(
        "--output",
        default="streaming_tts_output.wav",
        help="Output WAV file path (default: streaming_tts_output.wav)",
    )

    # Session config options (Base task only)
    parser.add_argument("--model", default=None, help="Model name")
    parser.add_argument("--language", default="Auto", help="Language")
    parser.add_argument(
        "--response-format",
        default="wav",
        choices=["wav", "pcm", "flac", "mp3", "aac", "opus"],
        help="Audio format",
    )
    parser.add_argument(
        "--speed", type=float, default=1.0, help="Playback speed (0.25-4.0)"
    )
    parser.add_argument("--max-new-tokens", type=int, default=None, help="Max tokens")

    # Voice cloning options (Base task)
    parser.add_argument(
        "--ref-audio",
        default=None,
        help="Path to local reference audio file for voice cloning",
    )
    parser.add_argument(
        "--ref-text",
        default=None,
        help="Transcript of reference audio (enables ICL mode). "
        "Can be inline text or a path to a .txt file.",
    )
    parser.add_argument(
        "--voice-name",
        default=None,
        help="Name for caching the voice clone server-side. "
        "On first use provide --ref-audio too; subsequent calls reuse the cache.",
    )
    parser.add_argument(
        "--x-vector-only-mode",
        action="store_true",
        default=False,
        help="Speaker embedding only mode (no ICL). Enables per-sentence "
        "streaming for cached voices, giving much lower TTFA at the cost "
        "of slightly reduced voice similarity.",
    )
    parser.add_argument(
        "--delete-voice",
        default=None,
        metavar="NAME",
        help="Delete a cached voice clone by name and exit.",
    )

    # STT simulation
    parser.add_argument(
        "--simulate-stt",
        action="store_true",
        help="Simulate STT by sending text word-by-word",
    )
    parser.add_argument(
        "--stt-delay",
        type=float,
        default=0.1,
        help="Delay between words in STT simulation (seconds)",
    )
    parser.add_argument(
        "--play",
        action="store_true",
        help="Play audio as it streams (requires: pip install sounddevice numpy)",
    )

    args = parser.parse_args()

    # Handle --delete-voice as a one-shot command
    if args.delete_voice:
        asyncio.run(_delete_voice(args.url, args.delete_voice))
        return

    # Fix: --text is optional at the argparse level (so --delete-voice works
    # alone), but synthesis requires it; previously a missing --text crashed
    # later inside stream_tts with AttributeError on None.strip().
    if not args.text:
        parser.error("--text is required unless --delete-voice is used")

    # If ref-text looks like a file path, read its contents
    if args.ref_text and os.path.isfile(args.ref_text):
        with open(args.ref_text) as f:
            args.ref_text = f.read().strip()
        print(
            f"Read ref_text from file: {args.ref_text[:80]}{'...' if len(args.ref_text) > 80 else ''}"
        )

    # Encode local reference audio as base64 data URI
    ref_audio_data_uri = None
    if args.ref_audio:
        if not os.path.isfile(args.ref_audio):
            print(f"Error: reference audio file not found: {args.ref_audio}")
            raise SystemExit(1)
        ref_audio_data_uri = _encode_audio_file(args.ref_audio)
        size_kb = os.path.getsize(args.ref_audio) / 1024
        print(f"Encoded reference audio: {args.ref_audio} ({size_kb:.1f} KB)")

    # Build session config (Base task only; only include non-None values)
    config: dict = {"task_type": "Base"}
    for key in [
        "model",
        "language",
        "response_format",
        "speed",
        "max_new_tokens",
        "ref_text",
        "voice_name",
    ]:
        # Keys above already use underscores, so read the args attribute directly.
        val = getattr(args, key, None)
        if val is not None:
            config[key] = val

    if ref_audio_data_uri is not None:
        config["ref_audio"] = ref_audio_data_uri

    if args.x_vector_only_mode:
        config["x_vector_only_mode"] = True

    try:
        asyncio.run(
            stream_tts(
                url=args.url,
                text=args.text,
                config=config,
                output_file=args.output,
                simulate_stt=args.simulate_stt,
                stt_delay=args.stt_delay,
                play=args.play,
            )
        )
    except KeyboardInterrupt:
        pass


if __name__ == "__main__":
    main()
426
JSON output
1{
2    "bytes": "pcm16 bytes"
3}

🔥 Trending models