Whisper Large V3 (Streaming)
The most performant Whisper Large V3 model server for real-time transcription
Example usage
Our live transcription is ideal for use cases like live note-taking applications, content captioning, customer support, and other real-time, voice-powered apps. Learn more about our optimized Whisper transcription pipeline in our launch blog.
Features include:
Configurable update cadence for delivered partial transcriptions (session configuration is sketched after this list)
Consistent real-time latency under high volume of concurrent audio streams
Automatic language detection for multilingual transcription
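These behaviors are configured by the first JSON message sent over the WebSocket. Below is a minimal sketch reusing the field names from the full example further down; the values are illustrative, and the deployment's StreamingWhisperInput schema is the source of truth:

# Session metadata: the first message sent over the WebSocket.
# Field names follow the full example below; values are illustrative.
metadata = {
    "whisper_params": {"audio_language": "en"},
    "streaming_params": {
        "encoding": "pcm_s16le",             # raw 16-bit little-endian PCM
        "sample_rate": 16000,
        "enable_partial_transcripts": True,  # False delivers final transcripts only
    },
    "streaming_vad_config": {
        "threshold": 0.5,                    # voice-activity-detection sensitivity
        "min_silence_duration_ms": 300,      # silence that closes a segment
        "speech_pad_ms": 30,                 # padding kept around detected speech
    },
}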
Recommended setups for different use cases:

Balanced: H100 MIG GPU, concurrency target of <= 40 streams
Highly latency-sensitive: H100 GPU, concurrency target of <= 20 streams
Try the example code below, or check out our API documentation for more detailed information.
1"""
2Streaming ASR — Minimal Example
3
4Stream audio from your laptop's microphone to the Baseten Streaming ASR
5WebSocket endpoint and print transcription results in real time.
6
7Requirements:
8 pip install websockets sounddevice numpy
9
10Usage:
11 export BASETEN_API_KEY="your-api-key"
12 python call.py --model-id <YOUR_MODEL_ID>
13"""
14
15import argparse
16import asyncio
17import json
18import os
19
20import numpy as np
21import sounddevice as sd
22import websockets
23
24# Audio config
25SAMPLE_RATE = 16000
26CHUNK_SIZE = 512
27CHANNELS = 1
28
29headers = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}
30
31# Metadata sent as the first message — must match StreamingWhisperInput schema
32metadata = {
33 "whisper_params": {
34 "audio_language": "en",
35 },
36 "streaming_params": {
37 "encoding": "pcm_s16le",
38 "sample_rate": SAMPLE_RATE,
39 "enable_partial_transcripts": True,
40 },
41 "streaming_vad_config": {
42 "threshold": 0.5,
43 "min_silence_duration_ms": 300,
44 "speech_pad_ms": 30,
45 },
46}
47
48
49async def stream_microphone_audio(ws_url):
50 loop = asyncio.get_running_loop()
51 async with websockets.connect(ws_url, additional_headers=headers) as ws:
52 print("Connected to server")
53
54 await ws.send(json.dumps(metadata))
55 print("Sent metadata")
56
57 send_queue = asyncio.Queue()
58
59 def audio_callback(indata, frames, time_info, status):
60 if status:
61 print(f"Audio warning: {status}")
62 int16_data = (indata * 32767).astype(np.int16).tobytes()
63 loop.call_soon_threadsafe(send_queue.put_nowait, int16_data)
64
65 with sd.InputStream(
66 samplerate=SAMPLE_RATE,
67 blocksize=CHUNK_SIZE,
68 channels=CHANNELS,
69 dtype="float32",
70 callback=audio_callback,
71 ):
72 print("Streaming mic audio… (Ctrl+C to stop)\n")
73
74 async def send_audio():
75 while True:
76 chunk = await send_queue.get()
77 await ws.send(chunk)
78
79 async def receive_messages():
80 while True:
81 response = await ws.recv()
82 try:
83 message = json.loads(response)
84 msg_type = message.get("type")
85
86 if msg_type == "transcription":
87 is_final = message.get("is_final", False)
88 segments = message.get("segments", [])
89 text = " ".join(
90 seg.get("text", "") for seg in segments
91 ).strip()
92
93 if is_final:
94 print(f"[final] {text}")
95 else:
96 print(f"[partial] {text}")
97
98 elif msg_type == "end_audio":
99 print("[end_audio]", message.get("body"))
100
101 else:
102 print(f"[{msg_type}]", message)
103
104 except Exception as e:
105 print("Parse error:", e, "| raw:", response[:200])
106
107 await asyncio.gather(send_audio(), receive_messages())
108
109
110if __name__ == "__main__":
111 parser = argparse.ArgumentParser()
112 parser.add_argument("--model-id", required=True, help="Baseten model ID")
113 args = parser.parse_args()
114
115 ws_url = f"wss://model-{args.model_id}.api.baseten.co/environments/production/websocket"
116 print(f"Connecting to: {ws_url}")

    asyncio.run(stream_microphone_audio(ws_url))

A transcription message from the server looks like this:

{
2 "segments": [
3 {
4 "start_time": 3.488,
5 "end_time": 6.784,
6 "text": "Tell me what an oak tree looks like.",
7 "log_prob": -1.5155935287475586,
8 "word_timestamps": [],
9 "speaker": null,
10 "speaker_confidence": null,
11 "possible_hallucination": false,
12 "beam_results": [],
13 "language_code": "en",
14 "language_prob": null
15 }
16 ],
17 "language_code": "en",
18 "language_prob": null,
19 "diarization": [],
20 "timing_info": null,
21 "enable_chunk_level_language_detection": false,
22 "audio_length_sec": 3.296,
23 "is_final": true,
24 "transcription_num": 2,
25 "next_partial": null,
26 "pipeline_latency": null,
27 "type": "transcription"
28}
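The message carries per-segment timing alongside the text. Here is a minimal sketch of a segment-level handler over this response shape (the helper name is hypothetical; it could replace the transcription branch in the receive loop above):

def print_segments(message: dict) -> None:
    # Print per-segment timing and text from a parsed transcription message,
    # flagging segments the server marked as possible hallucinations.
    tag = "final" if message.get("is_final") else "partial"
    for seg in message.get("segments", []):
        start = seg.get("start_time") or 0.0
        end = seg.get("end_time") or 0.0
        text = seg.get("text", "").strip()
        note = " (possible hallucination)" if seg.get("possible_hallucination") else ""
        print(f"[{tag}] {start:.2f}s-{end:.2f}s {text}{note}")

For the message above, this prints: [final] 3.49s-6.78s Tell me what an oak tree looks like.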