text to speech

VoxCPM2

High-fidelity, tokenizer-free voice-cloning model from the VoxCPM family

Model details

Developed by
OpenBMB
Use case
text to speech
Size
2B
Optimization
vLLM
Hardware
H100
License
Apache 2.0

Example usage

Input

r"""
Demonstrate VoxCPM2 voice caching over /v1/audio/speech.

The first request sends ``voice=<name>`` plus ``ref_audio``. The server stores
that reference clip as an uploaded voice, then generates via the named-voice
path and warms VoxCPM2's ref_audio_feat cache. Later requests send only the
same ``voice`` name and should hit the cache.

The API key is taken from ``--api-key`` or, by default, from the
``BASETEN_API_KEY_INTERNAL`` environment variable.

Usage:
    pip install httpx

    python speech_voice_cache_client.py \
        --ref-audio /path/to/reference.wav \
        --voice-name demo_voice \
        --text "This first HTTP request should warm the voice cache." \
        --text "This second HTTP request should reuse the cached voice." \
        --stream
"""

from __future__ import annotations

import argparse
import base64
import mimetypes
import os
import time
from pathlib import Path

import httpx

# Default value for --api-base; "/v1/audio/speech" is appended to it per request.
DEFAULT_API_BASE = "https://model-qrj02553.api.baseten.co/deployment/wp5d088/sync"
32
33
def encode_audio_to_data_url(ref_audio_path: str) -> str:
    """Read a local audio file and return it as a base64 ``data:`` URL.

    The MIME type is guessed from the file name; when the extension is not
    recognised the type falls back to ``audio/wav``.

    Args:
        ref_audio_path: Path of the reference audio file on disk.

    Returns:
        A string of the form ``data:<mime-type>;base64,<payload>``.

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular file.
    """
    audio_path = Path(ref_audio_path)
    # is_file() rather than exists(): a directory would pass exists() and only
    # fail later inside open() with a less helpful error.
    if not audio_path.is_file():
        raise FileNotFoundError(f"Reference audio not found: {audio_path}")

    mime_type, _ = mimetypes.guess_type(audio_path.name)
    # Base64 output is pure ASCII, so the decode can never fail.
    encoded = base64.b64encode(audio_path.read_bytes()).decode("ascii")
    return f"data:{mime_type or 'audio/wav'};base64,{encoded}"
43
44
def post_speech(
    *,
    client: httpx.Client,
    api_base: str,
    api_key: str,
    model: str,
    text: str,
    voice_name: str,
    output_path: Path,
    response_format: str,
    stream: bool,
    ref_audio: str | None = None,
    ref_text: str | None = None,
) -> float:
    """POST one request to ``<api_base>/v1/audio/speech`` and save the audio.

    Args:
        client: HTTP client used to issue the request.
        api_base: Base URL; a trailing ``/`` is stripped before the path is added.
        api_key: Sent as ``Authorization: Bearer <api_key>``.
        model: Value of the ``model`` field in the JSON payload.
        text: Text to synthesize (``input`` field).
        voice_name: Value of the ``voice`` field.
        output_path: Destination file; parent directories are created.
        response_format: Value of the ``response_format`` field.
        stream: When true, the body is written chunk by chunk as it arrives;
            the flag is also forwarded in the payload.
        ref_audio: Optional ``ref_audio`` field; omitted from the payload when None.
        ref_text: Optional ``ref_text`` field; omitted from the payload when None.

    Returns:
        Elapsed seconds. In streaming mode the clock stops after the last
        chunk is written; otherwise it stops once the response is received,
        before the file is written.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
    """
    optional_fields = {"ref_audio": ref_audio, "ref_text": ref_text}
    body: dict[str, object] = {
        "model": model,
        "input": text,
        "voice": voice_name,
        "response_format": response_format,
        "stream": stream,
        **{name: value for name, value in optional_fields.items() if value is not None},
    }

    endpoint = f"{api_base.rstrip('/')}/v1/audio/speech"
    auth_headers = {"Authorization": f"Bearer {api_key}"}
    output_path.parent.mkdir(parents=True, exist_ok=True)
    started_at = time.perf_counter()

    if not stream:
        reply = client.post(endpoint, json=body, headers=auth_headers)
        duration = time.perf_counter() - started_at

        if reply.status_code != 200:
            raise RuntimeError(f"Speech request failed ({reply.status_code}): {reply.text[:500]}")

        output_path.write_bytes(reply.content)
        print(f"  saved {output_path} ({len(reply.content):,} bytes) in {duration:.2f}s")
        return duration

    total_bytes = 0
    with client.stream("POST", endpoint, json=body, headers=auth_headers) as reply:
        if reply.status_code != 200:
            # A streamed body must be read explicitly before it can be decoded.
            detail = reply.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"Speech request failed ({reply.status_code}): {detail[:500]}")

        with output_path.open("wb") as sink:
            for piece in reply.iter_bytes():
                if piece:
                    sink.write(piece)
                    total_bytes += len(piece)

    duration = time.perf_counter() - started_at
    print(f"  streamed {output_path} ({total_bytes:,} bytes) in {duration:.2f}s")
    return duration
110
111
def main() -> None:
    """Parse the command line and send one speech request per ``--text`` value.

    Only the first request carries ``ref_audio`` / ``ref_text``; every later
    request sends just the voice name. Each response is written to
    ``<output-dir>/request_NN.<response-format>``.

    Raises:
        ValueError: If neither ``--ref-audio`` nor ``--skip-upload`` is given,
            or if no API key is available from ``--api-key`` or the
            ``BASETEN_API_KEY_INTERNAL`` environment variable.
    """
    parser = argparse.ArgumentParser(description="VoxCPM2 /v1/audio/speech voice-cache demo client")
    parser.add_argument("--api-base", default=DEFAULT_API_BASE)
    parser.add_argument("--api-key", default=os.getenv('BASETEN_API_KEY_INTERNAL'))
    parser.add_argument("--model", default="voxcpm2")
    parser.add_argument("--voice-name", default="voxcpm_http_cache_demo")
    parser.add_argument("--ref-audio", help="Local reference audio sent only on the first request")
    parser.add_argument("--ref-text", default=None, help="Optional reference transcript for first request")
    parser.add_argument(
        "--skip-upload",
        action="store_true",
        help="Assume --voice-name already exists; send no ref_audio on any request",
    )
    parser.add_argument(
        "--text",
        action="append",
        default=[],
        help="Text to synthesize. Repeat to make multiple cached requests.",
    )
    parser.add_argument("--output-dir", default="voxcpm2_speech_cache_output")
    parser.add_argument("--response-format", default="wav", choices=["wav", "pcm", "flac", "mp3", "aac", "opus"])
    parser.add_argument("--stream", action="store_true", help="Send stream=True and write the streamed response to disk")
    args = parser.parse_args()

    if not args.skip_upload and not args.ref_audio:
        raise ValueError("Pass --ref-audio for the first request, or --skip-upload for an existing voice.")
    # Fail early: without a key the request would go out as "Bearer None" and
    # only surface as an opaque server-side rejection.
    if not args.api_key:
        raise ValueError("Pass --api-key or set the BASETEN_API_KEY_INTERNAL environment variable.")

    texts = args.text or [
        "This first HTTP speech request should upload the voice and warm the cache.",
        "This second HTTP speech request uses only the voice name and should hit the cache.",
    ]

    ref_audio_data_url = None if args.skip_upload else encode_audio_to_data_url(args.ref_audio)
    output_dir = Path(args.output_dir)

    print("HTTP /v1/audio/speech VoxCPM2 cache demo")
    if ref_audio_data_url is None:
        print(f"Using existing voice {args.voice_name!r}; no ref_audio will be sent.")
    else:
        print(f"Request 1 will send voice={args.voice_name!r} plus ref_audio.")
    if args.stream:
        print("Streaming enabled; requests will send stream=True and write chunks as they arrive.")
    print("Later requests send only the voice name. Watch server logs for VoxCPM2 cache STORE/HIT.")

    # One client for all requests so the connection is reused between them.
    with httpx.Client(timeout=300) as client:
        for idx, text in enumerate(texts, start=1):
            is_first = idx == 1
            print(f"\nRequest {idx}/{len(texts)}")
            print("  expected cache behavior:", "miss + store" if is_first else "hit")
            post_speech(
                client=client,
                api_base=args.api_base,
                api_key=args.api_key,
                model=args.model,
                text=text,
                voice_name=args.voice_name,
                output_path=output_dir / f"request_{idx:02d}.{args.response_format}",
                response_format=args.response_format,
                stream=args.stream,
                # Reference material accompanies the first request only.
                ref_audio=ref_audio_data_url if is_first else None,
                ref_text=args.ref_text if is_first else None,
            )
173
174
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
177
178