Whisper Large V3 (Streaming)
The most performant Whisper Large V3 model server for real-time transcription
Example usage
Our live transcription is ideal for use cases like live note-taking applications, content captioning, customer support, and other real-time, voice-powered apps. Learn more about our optimized Whisper transcription pipeline in our launch blog.
Features include:
Configurable update cadence for delivered partial transcriptions (session configuration is sketched after this list)
Consistent real-time latency under high volume of concurrent audio streams
Automatic language detection for multilingual transcription
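These behaviors are configured by the first JSON message sent over the WebSocket. Below is a minimal sketch reusing the field names from the full example further down; the values are illustrative, and the deployment's StreamingWhisperInput schema is the source of truth:

# Session metadata: the first message sent over the WebSocket.
# Field names follow the full example below; values are illustrative.
metadata = {
    "whisper_params": {"audio_language": "en"},
    "streaming_params": {
        "encoding": "pcm_s16le",             # raw 16-bit little-endian PCM
        "sample_rate": 16000,
        "enable_partial_transcripts": True,  # False delivers final transcripts only
    },
    "streaming_vad_config": {
        "threshold": 0.5,                    # voice-activity-detection sensitivity
        "min_silence_duration_ms": 300,      # silence that closes a segment
        "speech_pad_ms": 30,                 # padding kept around detected speech
    },
}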
Recommended setups for different use cases:

Balanced: H100 MIG GPU, concurrency target of <= 40 streams
Highly latency-sensitive: H100 GPU, concurrency target of <= 20 streams
Try the example code below, or check out our API documentation for more detailed information.
1"""
2Streaming ASR — Minimal Example
3
4Stream audio from your laptop's microphone to the Baseten Streaming ASR
5WebSocket endpoint and print transcription results in real time.
6
7Requirements:
8 pip install websockets sounddevice numpy
9
10Usage:
11 export BASETEN_API_KEY="your-api-key"
12 python call.py --model-id <YOUR_MODEL_ID>
13"""
14
15import argparse
16import asyncio
17import json
18import os
19
20import numpy as np
21import sounddevice as sd
22import websockets
23
24# Audio config
25SAMPLE_RATE = 16000
26CHUNK_SIZE = 512
27CHANNELS = 1
28
29headers = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}
30
31# Metadata sent as the first message — must match StreamingWhisperInput schema
32metadata = {
33 "whisper_params": {
34 "audio_language": "en",
35 },
36 "streaming_params": {
37 "encoding": "pcm_s16le",
38 "sample_rate": SAMPLE_RATE,
39 "enable_partial_transcripts": True,
40 },
41 "streaming_vad_config": {
42 "threshold": 0.5,
43 "min_silence_duration_ms": 300,
44 "speech_pad_ms": 30,
45 },
46}
47
48
49async def stream_microphone_audio(ws_url):
50 loop = asyncio.get_running_loop()
51 async with websockets.connect(ws_url, additional_headers=headers) as ws:
52 print("Connected to server")
53
54 await ws.send(json.dumps(metadata))
55 print("Sent metadata")
56
57 send_queue = asyncio.Queue()
58
59 def audio_callback(indata, frames, time_info, status):
60 if status:
61 print(f"Audio warning: {status}")
62 int16_data = (indata * 32767).astype(np.int16).tobytes()
63 loop.call_soon_threadsafe(send_queue.put_nowait, int16_data)
64
65 with sd.InputStream(
66 samplerate=SAMPLE_RATE,
67 blocksize=CHUNK_SIZE,
68 channels=CHANNELS,
69 dtype="float32",
70 callback=audio_callback,
71 ):
72 print("Streaming mic audio… (Ctrl+C to stop)\n")
73
74 async def send_audio():
75 while True:
76 chunk = await send_queue.get()
77 await ws.send(chunk)
78
79 async def receive_messages():
80 while True:
81 response = await ws.recv()
82 try:
83 message = json.loads(response)
84 msg_type = message.get("type")
85
86 if msg_type == "transcription":
87 is_final = message.get("is_final", False)
88 segments = message.get("segments", [])
89 text = " ".join(
90 seg.get("text", "") for seg in segments
91 ).strip()
92
93 if is_final:
94 print(f"[final] {text}")
95 else:
96 print(f"[partial] {text}")
97
98 elif msg_type == "end_audio":
99 print("[end_audio]", message.get("body"))
100
101 else:
102 print(f"[{msg_type}]", message)
103
104 except Exception as e:
105 print("Parse error:", e, "| raw:", response[:200])
106
107 await asyncio.gather(send_audio(), receive_messages())
108
109
110if __name__ == "__main__":
111 parser = argparse.ArgumentParser()
112 parser.add_argument("--model-id", required=True, help="Baseten model ID")
113 args = parser.parse_args()
114
115 ws_url = f"wss://model-{args.model_id}.api.baseten.co/environments/production/websocket"
116 print(f"Connecting to: {ws_url}")

    asyncio.run(stream_microphone_audio(ws_url))

A transcription message from the server looks like this:

{
2 "segments": [
3 {
4 "start_time": 3.488,
5 "end_time": 6.784,
6 "text": "Tell me what an oak tree looks like.",
7 "log_prob": -1.5155935287475586,
8 "word_timestamps": [],
9 "speaker": null,
10 "speaker_confidence": null,
11 "possible_hallucination": false,
12 "beam_results": [],
13 "language_code": "en",
14 "language_prob": null
15 }
16 ],
17 "language_code": "en",
18 "language_prob": null,
19 "diarization": [],
20 "timing_info": null,
21 "enable_chunk_level_language_detection": false,
22 "audio_length_sec": 3.296,
23 "is_final": true,
24 "transcription_num": 2,
25 "next_partial": null,
26 "pipeline_latency": null,
27 "type": "transcription"
28}
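The message carries per-segment timing alongside the text. Here is a minimal sketch of a segment-level handler over this response shape (the helper name is hypothetical; it could replace the transcription branch in the receive loop above):

def print_segments(message: dict) -> None:
    # Print per-segment timing and text from a parsed transcription message,
    # flagging segments the server marked as possible hallucinations.
    tag = "final" if message.get("is_final") else "partial"
    for seg in message.get("segments", []):
        start = seg.get("start_time") or 0.0
        end = seg.get("end_time") or 0.0
        text = seg.get("text", "").strip()
        note = " (possible hallucination)" if seg.get("possible_hallucination") else ""
        print(f"[{tag}] {start:.2f}s-{end:.2f}s {text}{note}")

For the message above, this prints: [final] 3.49s-6.78s Tell me what an oak tree looks like.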