Orpheus TTS
An incredibly lifelike speech synthesis model by Canopy Labs.
Deploy Orpheus TTS behind an API endpoint in seconds.
Example usage
Orpheus TTS must generate ~83 tokens/second for real-time streaming. This implementation supports streaming and, on an H100 MIG GPU, can produce:
16 concurrent real-time streams with variable traffic
24 concurrent real-time streams with consistent traffic
128 concurrent non-real-time generations for cost-efficient batching
1import requests
2import pyaudio
3import wave
4import time
5import os
6
7
# ——— Request parameters ———
# Baseten deployment ID for the Orpheus TTS model; interpolated into the
# model endpoint URL below.
orpheus_model_id: str = ""  # Paste your model ID here
# Read at import time; raises KeyError if BASETEN_API_KEY is not set,
# failing fast before any request is attempted.
api_key: str = os.environ["BASETEN_API_KEY"]
# Upper bound on generated tokens per request; the model produces ~83
# tokens per second of audio, so 2000 tokens is roughly 24 s of speech.
max_tokens: int = 2000
12
def stream_audio(text: str, voice: str, timeout: float = 300.0) -> None:
    """Synthesize *text* with Orpheus TTS and play it live on the speakers.

    Args:
        text: Prompt to synthesize.
        voice: Voice name to use (e.g. ``"tara"``).
        timeout: Seconds to wait on the HTTP connect/read before aborting;
            prevents a hung connection from blocking forever.

    Raises:
        KeyError: At import time if ``BASETEN_API_KEY`` is unset.
        requests.HTTPError: If the API returns an error status.
    """
    # Issue the request first so we do not hold the audio device open
    # while waiting on the network, and leak nothing if the call fails.
    resp = requests.post(
        f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict",
        headers={"Authorization": f"Api-Key {api_key}"},
        json={"voice": voice, "prompt": text, "max_tokens": max_tokens},
        stream=True,
        timeout=timeout,
    )
    resp.raise_for_status()

    # 16-bit mono PCM at 24 kHz — must match the model's output format.
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
    try:
        for chunk in resp.iter_content(chunk_size=4096):
            if chunk:  # skip keep-alive chunks
                stream.write(chunk)
    finally:
        # Release the audio device even if playback raises mid-stream.
        stream.stop_stream()
        stream.close()
        p.terminate()
        resp.close()
35
36
def save_audio(
    text: str,
    voice: str,
    output_path: str = "output.wav",
    timeout: float = 300.0,
) -> None:
    """Synthesize *text* and save it as a 24 kHz, mono, 16-bit WAV file.

    Prints a one-line summary of the generated duration, chunk count, and
    wall-clock time.

    Args:
        text: Prompt to synthesize.
        voice: Voice name to use (e.g. ``"tara"``).
        output_path: Destination WAV file path.
        timeout: Seconds to wait on the HTTP connect/read before aborting;
            prevents a hung connection from blocking forever.

    Raises:
        requests.HTTPError: If the API returns an error status.
    """
    start_time = time.monotonic()

    resp = requests.post(
        f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict",
        headers={"Authorization": f"Api-Key {api_key}"},
        json={"voice": voice, "prompt": text, "max_tokens": max_tokens},
        stream=False,
        timeout=timeout,
    )
    resp.raise_for_status()

    # Output format: 16-bit (2-byte) mono PCM at 24 kHz.
    sample_rate = 24000
    sample_width = 2
    channels = 1

    total_bytes = 0
    chunk_counter = 0

    with wave.open(output_path, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)

        for chunk in resp.iter_content(chunk_size=4096):
            if not chunk:
                continue
            chunk_counter += 1
            total_bytes += len(chunk)
            wf.writeframes(chunk)

    # Convert bytes -> frames once at the end: per-chunk integer division
    # would under-count frames whenever a chunk boundary splits a sample.
    duration = total_bytes / (sample_width * channels) / sample_rate

    elapsed = time.monotonic() - start_time
    print(f"Generated {duration:.2f}s of audio in {chunk_counter} chunks in {elapsed:.2f}s.")
72
73
def _demo() -> None:
    """Run both example flows: live playback first, then save to disk."""
    selected_voice = "tara"

    sample_text = """
    Nothing beside remains. Round the decay of that colossal wreck, boundless and bare,
    The lone and level sands stretch far away.
    """

    print("🔊 Streaming live:")
    stream_audio(sample_text, selected_voice)

    print("\n💾 Saving to output.wav:")
    save_audio(sample_text, selected_voice)

    print("Done!")


if __name__ == "__main__":
    _demo()
1{}