Our Series E: we raised $300M at a $5B valuation to power a multi-model future. READ
transcription

OpenAI logo — Whisper Large V3 Turbo (Streaming)

Streaming version of the Whisper Large V3 Turbo implementation.

Model details

View repository

Example usage

Our live transcription is ideal for use cases like live note-taking applications, content captioning, customer support, and any real-time, voice-powered apps. Learn more about our optimized Whisper transcription pipeline in our launch blog.

Features include:

  • Configurable update cadence for delivered partial transcriptions

  • Consistent real-time latency under high volume of concurrent audio streams

  • Automatic language detection for multilingual transcription

Recommended setups for different use cases: 

Try the example code below, or check out our API documentation for more detailed information.

Input
1"""
2Streaming ASR — Minimal Example
3
4Stream audio from your laptop's microphone to the Baseten Streaming ASR
5WebSocket endpoint and print transcription results in real time.
6
7Requirements:
8    pip install websockets sounddevice numpy
9
10Usage:
11    export BASETEN_API_KEY="your-api-key"
12    python call.py --model-id <YOUR_MODEL_ID>
13"""
14
15import argparse
16import asyncio
17import json
18import os
19
20import numpy as np
21import sounddevice as sd
22import websockets
23
# Audio capture configuration for the microphone input stream.
SAMPLE_RATE = 16000  # Hz — must match streaming_params["sample_rate"] sent in metadata
CHUNK_SIZE = 512     # frames per sounddevice callback block
CHANNELS = 1         # mono capture

# Auth header for the Baseten WebSocket endpoint.
# NOTE: raises KeyError at import time if BASETEN_API_KEY is not set.
headers = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}
30
# Metadata sent as the first message — must match StreamingWhisperInput schema
metadata = {
    "whisper_params": {
        # Fixed transcription language; skips automatic language detection.
        "audio_language": "en",
    },
    "streaming_params": {
        # Raw 16-bit little-endian PCM — matches the int16 conversion done
        # in the audio callback before each chunk is sent.
        "encoding": "pcm_s16le",
        "sample_rate": SAMPLE_RATE,
        # Request interim (is_final == False) transcripts in addition to
        # final ones.
        "enable_partial_transcripts": True,
    },
    # Voice-activity-detection tuning. NOTE(review): semantics presumably
    # follow the server's VAD — verify values against the model docs.
    "streaming_vad_config": {
        "threshold": 0.5,               # speech-probability cutoff
        "min_silence_duration_ms": 300, # silence required to close a segment
        "speech_pad_ms": 30,            # padding retained around speech
    },
}
47
48
async def stream_microphone_audio(ws_url):
    """Stream microphone audio to a Baseten streaming-ASR WebSocket.

    Sends the module-level ``metadata`` as the first (JSON) message, then
    forwards raw pcm_s16le chunks captured from the default input device
    while concurrently printing partial/final transcription messages as
    they arrive.

    Args:
        ws_url: Fully-qualified ``wss://`` endpoint of the deployed model.

    Runs until cancelled (e.g. Ctrl+C); does not return normally.
    """
    loop = asyncio.get_running_loop()
    async with websockets.connect(ws_url, additional_headers=headers) as ws:
        print("Connected to server")

        await ws.send(json.dumps(metadata))
        print("Sent metadata")

        # Chunks are produced on sounddevice's callback thread and consumed
        # on the asyncio loop, so hand them over through a queue.
        send_queue = asyncio.Queue()

        def audio_callback(indata, frames, time_info, status):
            # Runs on the PortAudio thread — no ws access / awaiting here.
            if status:
                print(f"Audio warning: {status}")
            # Clip before scaling: float32 samples can slightly exceed
            # [-1.0, 1.0] on some input backends, and an unclipped cast to
            # int16 would wrap around and inject loud glitches.
            clipped = np.clip(indata, -1.0, 1.0)
            int16_data = (clipped * 32767).astype(np.int16).tobytes()
            # Queue.put_nowait is not thread-safe on its own; schedule it
            # on the event loop instead.
            loop.call_soon_threadsafe(send_queue.put_nowait, int16_data)

        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            blocksize=CHUNK_SIZE,
            channels=CHANNELS,
            dtype="float32",
            callback=audio_callback,
        ):
            print("Streaming mic audio… (Ctrl+C to stop)\n")

            async def send_audio():
                # Forward every captured chunk to the server, in order.
                while True:
                    chunk = await send_queue.get()
                    await ws.send(chunk)

            async def receive_messages():
                # Print server messages as they arrive.
                while True:
                    response = await ws.recv()
                    # Keep the try body minimal: only the parse can
                    # legitimately fail here.
                    try:
                        message = json.loads(response)
                    except (json.JSONDecodeError, UnicodeDecodeError) as e:
                        print("Parse error:", e, "| raw:", response[:200])
                        continue
                    if not isinstance(message, dict):
                        # Valid JSON but not an object — report and move on
                        # rather than crashing on .get below.
                        print("Unexpected message:", response[:200])
                        continue

                    msg_type = message.get("type")

                    if msg_type == "transcription":
                        is_final = message.get("is_final", False)
                        segments = message.get("segments", [])
                        text = " ".join(
                            seg.get("text", "") for seg in segments
                        ).strip()

                        if is_final:
                            print(f"[final]   {text}")
                        else:
                            print(f"[partial] {text}")

                    elif msg_type == "end_audio":
                        print("[end_audio]", message.get("body"))

                    else:
                        print(f"[{msg_type}]", message)

            # Run producer and consumer concurrently until cancelled.
            await asyncio.gather(send_audio(), receive_messages())
108
109
if __name__ == "__main__":
    # Parse the target deployment, build the endpoint URL, and start the
    # streaming session.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-id", required=True, help="Baseten model ID")
    options = cli.parse_args()

    ws_url = f"wss://model-{options.model_id}.api.baseten.co/environments/production/websocket"
    print(f"Connecting to: {ws_url}")

    asyncio.run(stream_microphone_audio(ws_url))
119
JSON output
1{
2    "segments": [
3        {
4            "start_time": 3.488,
5            "end_time": 6.784,
6            "text": "Tell me what an oak tree looks like.",
7            "log_prob": -1.5155935287475586,
8            "word_timestamps": [],
9            "speaker": null,
10            "speaker_confidence": null,
11            "possible_hallucination": false,
12            "beam_results": [],
13            "language_code": "en",
14            "language_prob": null
15        }
16    ],
17    "language_code": "en",
18    "language_prob": null,
19    "diarization": [],
20    "timing_info": null,
21    "enable_chunk_level_language_detection": false,
22    "audio_length_sec": 3.296,
23    "is_final": true,
24    "transcription_num": 2,
25    "next_partial": null,
26    "pipeline_latency": null,
27    "type": "transcription"
28}

transcription models

See all
OpenAI logo
Transcription

Whisper Large V3 (Streaming)

V3 - H100 MIG 40GB
Mistral AI logo
Transcription

Voxtral Mini 4B Realtime 2602

2602 - Mini - H100 MIG 40GB
OpenAI logo
Transcription

Whisper Large V3

V3 - H100 MIG 40GB

OpenAI models

See all
OpenAI logo
Transcription

Whisper Large V3 (Streaming)

V3 - H100 MIG 40GB
OpenAI logo
Transcription

Whisper Large V3

V3 - H100 MIG 40GB
OpenAI logo
Transcription

Whisper Large V3 Turbo

V3 - Turbo - H100 MIG 40GB

🔥 Trending models