Whisper Large V3 Turbo (Streaming)
Streaming version of the Whisper Large V3 Turbo implementation.
Model details
Example usage
Our live transcription is ideal for use cases like live note-taking applications, content captioning, customer support, and any real-time, voice-powered apps. Learn more about our optimized Whisper transcription pipeline in our launch blog.
Features include:
Configurable update cadence for delivering partial transcriptions
Consistent real-time latency under high volume of concurrent audio streams
Automatic language detection for multilingual transcription
Recommended setups for different use cases:
Balanced
GPU type: H100 MIG
Concurrency target: <= 40
Highly latency-sensitive
GPU type: H100
Concurrency target: <= 20
Try the example code below, or check out our API documentation for more detailed information.
"""
Streaming ASR — Minimal Example

Stream audio from your laptop's microphone to the Baseten Streaming ASR
WebSocket endpoint and print transcription results in real time.

Requirements:
    pip install websockets sounddevice numpy

Usage:
    export BASETEN_API_KEY="your-api-key"
    python call.py --model-id <YOUR_MODEL_ID>
"""

import argparse
import asyncio
import json
import os

import numpy as np
import sounddevice as sd
import websockets
23
# Audio capture configuration.
SAMPLE_RATE = 16000  # Hz — must match streaming_params.sample_rate below
CHUNK_SIZE = 512  # frames delivered per microphone callback
CHANNELS = 1  # mono capture

# Auth header for every request; requires BASETEN_API_KEY in the environment
# (raises KeyError at import time if unset, which surfaces the problem early).
headers = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}

# Metadata sent as the first message — must match StreamingWhisperInput schema
metadata = {
    "whisper_params": {
        "audio_language": "en",
    },
    "streaming_params": {
        # Raw 16-bit little-endian PCM, matching the conversion done in
        # the microphone callback before chunks are queued for sending.
        "encoding": "pcm_s16le",
        "sample_rate": SAMPLE_RATE,
        "enable_partial_transcripts": True,
    },
    "streaming_vad_config": {
        "threshold": 0.5,
        "min_silence_duration_ms": 300,
        "speech_pad_ms": 30,
    },
}
47
48
async def stream_microphone_audio(ws_url):
    """Stream live microphone audio to *ws_url* and print transcriptions.

    Connects to the WebSocket endpoint, sends the JSON ``metadata``
    handshake as the first frame, then runs two concurrent tasks until
    cancelled (Ctrl+C): one forwarding raw PCM chunks from the mic, one
    printing ``transcription`` / ``end_audio`` messages as they arrive.
    """
    loop = asyncio.get_running_loop()
    async with websockets.connect(ws_url, additional_headers=headers) as ws:
        print("Connected to server")

        # The server expects the JSON metadata as the very first message.
        await ws.send(json.dumps(metadata))
        print("Sent metadata")

        send_queue = asyncio.Queue()

        def audio_callback(indata, frames, time_info, status):
            # Runs on the PortAudio capture thread — hand chunks to the
            # event loop via call_soon_threadsafe instead of touching the
            # asyncio queue directly (asyncio objects are not thread-safe).
            if status:
                print(f"Audio warning: {status}")
            # float32 in [-1, 1] → 16-bit signed PCM bytes, matching the
            # "pcm_s16le" encoding declared in the metadata.
            int16_data = (indata * 32767).astype(np.int16).tobytes()
            loop.call_soon_threadsafe(send_queue.put_nowait, int16_data)

        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            blocksize=CHUNK_SIZE,
            channels=CHANNELS,
            dtype="float32",
            callback=audio_callback,
        ):
            print("Streaming mic audio… (Ctrl+C to stop)\n")

            async def send_audio():
                # Forward queued PCM chunks to the server as binary frames.
                while True:
                    chunk = await send_queue.get()
                    await ws.send(chunk)

            async def receive_messages():
                # Print every server message; partial transcripts are
                # superseded by the final transcript for the same segment.
                while True:
                    response = await ws.recv()
                    try:
                        message = json.loads(response)
                        msg_type = message.get("type")

                        if msg_type == "transcription":
                            is_final = message.get("is_final", False)
                            segments = message.get("segments", [])
                            text = " ".join(
                                seg.get("text", "") for seg in segments
                            ).strip()

                            if is_final:
                                print(f"[final] {text}")
                            else:
                                print(f"[partial] {text}")

                        elif msg_type == "end_audio":
                            print("[end_audio]", message.get("body"))

                        else:
                            print(f"[{msg_type}]", message)

                    except Exception as e:
                        # Best-effort logging of unparseable frames; keep
                        # the receive loop alive.
                        print("Parse error:", e, "| raw:", response[:200])

            await asyncio.gather(send_audio(), receive_messages())
108
109
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", required=True, help="Baseten model ID")
    args = parser.parse_args()

    # WebSocket endpoint of the model's production deployment.
    ws_url = f"wss://model-{args.model_id}.api.baseten.co/environments/production/websocket"
    print(f"Connecting to: {ws_url}")

    asyncio.run(stream_microphone_audio(ws_url))
{
  "segments": [
    {
      "start_time": 3.488,
      "end_time": 6.784,
      "text": "Tell me what an oak tree looks like.",
      "log_prob": -1.5155935287475586,
      "word_timestamps": [],
      "speaker": null,
      "speaker_confidence": null,
      "possible_hallucination": false,
      "beam_results": [],
      "language_code": "en",
      "language_prob": null
    }
  ],
  "language_code": "en",
  "language_prob": null,
  "diarization": [],
  "timing_info": null,
  "enable_chunk_level_language_detection": false,
  "audio_length_sec": 3.296,
  "is_final": true,
  "transcription_num": 2,
  "next_partial": null,
  "pipeline_latency": null,
  "type": "transcription"
}