text to speech
VoxCPM2
High-fidelity, tokenizer-free voice-cloning model from the VoxCPM family
Model details
View repository
Example usage
Input
1"""
2Demonstrate VoxCPM2 voice caching over /v1/audio/speech.
3
4The first request sends ``voice=<name>`` plus ``ref_audio``. The server stores
5that reference clip as an uploaded voice, then generates via the named-voice
6path and warms VoxCPM2's ref_audio_feat cache. Later requests send only the
7same ``voice`` name and should hit the cache.
8
9Usage:
10 pip install httpx
11
12 python speech_voice_cache_client.py \
13 --ref-audio /path/to/reference.wav \
14 --voice-name demo_voice \
15 --text "This first HTTP request should warm the voice cache." \
16 --text "This second HTTP request should reuse the cached voice." \
17 --stream
18"""
19
20from __future__ import annotations
21
22import argparse
23import base64
24import mimetypes
25import os
26import time
27from pathlib import Path
28
29import httpx
30
# Base URL used when --api-base is not given; post_speech() appends
# "/v1/audio/speech" to it after stripping any trailing slash.
DEFAULT_API_BASE = "https://model-qrj02553.api.baseten.co/deployment/wp5d088/sync"
32
33
def encode_audio_to_data_url(ref_audio_path: str) -> str:
    """Read a local audio file and return it as a base64 ``data:`` URL.

    The MIME type is guessed from the file name and falls back to
    ``audio/wav`` when the extension is not recognised.

    Args:
        ref_audio_path: Path to the reference audio clip on local disk.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular
            file (for example, a directory).
    """
    audio_path = Path(ref_audio_path)
    # is_file() rather than exists(): a directory "exists" but cannot be read
    # as audio, and would otherwise fail later with an unrelated OS error.
    if not audio_path.is_file():
        raise FileNotFoundError(f"Reference audio not found: {audio_path}")

    mime_type = mimetypes.guess_type(audio_path.name)[0] or "audio/wav"
    encoded = base64.b64encode(audio_path.read_bytes()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"
43
44
def post_speech(
    *,
    client: httpx.Client,
    api_base: str,
    api_key: str | None,
    model: str,
    text: str,
    voice_name: str,
    output_path: Path,
    response_format: str,
    stream: bool,
    ref_audio: str | None = None,
    ref_text: str | None = None,
) -> float:
    """POST one synthesis request to ``/v1/audio/speech`` and save the audio.

    Args:
        client: HTTP client used to send the request.
        api_base: Server base URL; a trailing slash is tolerated.
        api_key: Bearer token. When empty or ``None`` no ``Authorization``
            header is sent.
        model: Value sent as ``model`` in the JSON payload.
        text: Text to synthesize, sent as ``input``.
        voice_name: Value sent as ``voice``.
        output_path: Destination file; parent directories are created.
        response_format: Value sent as ``response_format``.
        stream: Sent as ``stream``; when true the response body is written to
            disk chunk by chunk instead of being buffered in memory.
        ref_audio: Optional reference audio, added to the payload only when
            not ``None``.
        ref_text: Optional reference transcript, added to the payload only
            when not ``None``.

    Returns:
        Elapsed seconds. In streaming mode the timer stops after the last
        chunk is written; otherwise it stops as soon as the response has been
        received, before the file is written.

    Raises:
        RuntimeError: If the server answers with a status other than 200. The
            message carries the status and the first 500 characters of the
            body.
    """
    payload: dict[str, object] = {
        "model": model,
        "input": text,
        "voice": voice_name,
        "response_format": response_format,
        "stream": stream,
    }
    if ref_audio is not None:
        payload["ref_audio"] = ref_audio
    if ref_text is not None:
        payload["ref_text"] = ref_text

    # Only send credentials when a key was actually supplied; formatting a
    # missing key would otherwise put the literal "Bearer None" on the wire.
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    url = f"{api_base.rstrip('/')}/v1/audio/speech"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    start = time.perf_counter()

    if stream:
        bytes_written = 0
        with client.stream("POST", url, json=payload, headers=headers) as response:
            if response.status_code != 200:
                # The body of a streamed response must be read explicitly
                # before it can be included in the error message.
                error_body = response.read().decode("utf-8", errors="replace")
                raise RuntimeError(f"Speech request failed ({response.status_code}): {error_body[:500]}")

            with output_path.open("wb") as output_file:
                for chunk in response.iter_bytes():
                    if not chunk:
                        continue
                    output_file.write(chunk)
                    bytes_written += len(chunk)

        elapsed = time.perf_counter() - start
        print(f" streamed {output_path} ({bytes_written:,} bytes) in {elapsed:.2f}s")
        return elapsed

    response = client.post(url, json=payload, headers=headers)
    elapsed = time.perf_counter() - start

    if response.status_code != 200:
        raise RuntimeError(f"Speech request failed ({response.status_code}): {response.text[:500]}")

    output_path.write_bytes(response.content)
    print(f" saved {output_path} ({len(response.content):,} bytes) in {elapsed:.2f}s")
    return elapsed
110
111
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the command-line interface of the demo client."""
    cli = argparse.ArgumentParser(description="VoxCPM2 /v1/audio/speech voice-cache demo client")
    cli.add_argument("--api-base", default=DEFAULT_API_BASE)
    cli.add_argument("--api-key", default=os.getenv("BASETEN_API_KEY_INTERNAL"))
    cli.add_argument("--model", default="voxcpm2")
    cli.add_argument("--voice-name", default="voxcpm_http_cache_demo")
    cli.add_argument("--ref-audio", help="Local reference audio sent only on the first request")
    cli.add_argument("--ref-text", default=None, help="Optional reference transcript for first request")
    cli.add_argument(
        "--skip-upload",
        action="store_true",
        help="Assume --voice-name already exists; send no ref_audio on any request",
    )
    cli.add_argument(
        "--text",
        action="append",
        default=[],
        help="Text to synthesize. Repeat to make multiple cached requests.",
    )
    cli.add_argument("--output-dir", default="voxcpm2_speech_cache_output")
    cli.add_argument("--response-format", default="wav", choices=["wav", "pcm", "flac", "mp3", "aac", "opus"])
    cli.add_argument("--stream", action="store_true", help="Send stream=True and write the streamed response to disk")
    return cli


def main() -> None:
    """Parse the command line and issue one speech request per ``--text``.

    Only the first request carries the reference audio and transcript (and
    only when ``--skip-upload`` is absent); every later request sends just the
    voice name. Each response is saved as ``request_NN.<format>`` inside the
    output directory.

    Raises:
        ValueError: If neither ``--ref-audio`` nor ``--skip-upload`` is given.
    """
    args = _build_parser().parse_args()

    sends_reference = not args.skip_upload
    if sends_reference and not args.ref_audio:
        raise ValueError("Pass --ref-audio for the first request, or --skip-upload for an existing voice.")

    # Fall back to a two-sentence script so a warm-up and a cached request
    # both happen when no --text was supplied.
    texts = args.text or [
        "This first HTTP speech request should upload the voice and warm the cache.",
        "This second HTTP speech request uses only the voice name and should hit the cache.",
    ]

    ref_audio_data_url = encode_audio_to_data_url(args.ref_audio) if sends_reference else None
    output_dir = Path(args.output_dir)

    print("HTTP /v1/audio/speech VoxCPM2 cache demo")
    if ref_audio_data_url is not None:
        print(f"Request 1 will send voice={args.voice_name!r} plus ref_audio.")
    else:
        print(f"Using existing voice {args.voice_name!r}; no ref_audio will be sent.")
    if args.stream:
        print("Streaming enabled; requests will send stream=True and write chunks as they arrive.")
    print("Later requests send only the voice name. Watch server logs for VoxCPM2 cache STORE/HIT.")

    total = len(texts)
    with httpx.Client(timeout=300) as client:
        for idx, text in enumerate(texts, start=1):
            is_first = idx == 1
            print(f"\nRequest {idx}/{total}")
            print(" expected cache behavior:", "miss + store" if is_first else "hit")
            post_speech(
                client=client,
                api_base=args.api_base,
                api_key=args.api_key,
                model=args.model,
                text=text,
                voice_name=args.voice_name,
                output_path=output_dir / f"request_{idx:02d}.{args.response_format}",
                response_format=args.response_format,
                stream=args.stream,
                # The reference clip and transcript accompany the first request only.
                ref_audio=ref_audio_data_url if is_first else None,
                ref_text=args.ref_text if is_first else None,
            )


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
177
178JSON output
1null