Whisper Large V3 Turbo (Streaming)
Streaming version of the Whisper Large V3 Turbo implementation.
Model details
Example usage
Our live transcription is ideal for use cases like live note-taking applications, content captioning, customer support, and any real-time, voice-powered apps. Learn more about our optimized Whisper transcription pipeline in our launch blog.
Features include:
Configurable update cadence for delivering partial transcriptions
Consistent real-time latency under high volume of concurrent audio streams
Automatic language detection for multilingual transcription
Recommended setups for different use cases:
Balanced
GPU type: H100 MIG
Concurrency target: <= 40
Highly latency-sensitive
GPU type: H100
Concurrency target: <= 20
Try the example code below, or check out our API documentation for more detailed information.
"""
Streaming ASR — Minimal Example

Stream audio from your laptop's microphone to the Baseten Streaming ASR
WebSocket endpoint and print transcription results in real time.

Requirements:
    pip install websockets sounddevice numpy

Usage:
    export BASETEN_API_KEY="your-api-key"
    python call.py --model-id <YOUR_MODEL_ID>
"""

import argparse
import asyncio
import json
import os

import numpy as np
import sounddevice as sd
import websockets
23
# Audio capture configuration.
SAMPLE_RATE = 16000  # Hz — must match streaming_params.sample_rate below
CHUNK_SIZE = 512  # frames delivered per microphone callback
CHANNELS = 1  # mono capture

# Auth header for every request; requires BASETEN_API_KEY in the environment
# (raises KeyError at import time if unset, which surfaces the problem early).
headers = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}

# Metadata sent as the first message — must match StreamingWhisperInput schema
metadata = {
    "whisper_params": {
        "audio_language": "en",
    },
    "streaming_params": {
        # Raw 16-bit little-endian PCM, matching the conversion done in
        # the microphone callback before chunks are queued for sending.
        "encoding": "pcm_s16le",
        "sample_rate": SAMPLE_RATE,
        "enable_partial_transcripts": True,
    },
    "streaming_vad_config": {
        "threshold": 0.5,
        "min_silence_duration_ms": 300,
        "speech_pad_ms": 30,
    },
}
47
48
async def stream_microphone_audio(ws_url):
    """Stream live microphone audio to *ws_url* and print transcriptions.

    Connects to the WebSocket endpoint, sends the JSON ``metadata``
    handshake as the first frame, then runs two concurrent tasks until
    cancelled (Ctrl+C): one forwarding raw PCM chunks from the mic, one
    printing ``transcription`` / ``end_audio`` messages as they arrive.
    """
    loop = asyncio.get_running_loop()
    async with websockets.connect(ws_url, additional_headers=headers) as ws:
        print("Connected to server")

        # The server expects the JSON metadata as the very first message.
        await ws.send(json.dumps(metadata))
        print("Sent metadata")

        send_queue = asyncio.Queue()

        def audio_callback(indata, frames, time_info, status):
            # Runs on the PortAudio capture thread — hand chunks to the
            # event loop via call_soon_threadsafe instead of touching the
            # asyncio queue directly (asyncio objects are not thread-safe).
            if status:
                print(f"Audio warning: {status}")
            # float32 in [-1, 1] → 16-bit signed PCM bytes, matching the
            # "pcm_s16le" encoding declared in the metadata.
            int16_data = (indata * 32767).astype(np.int16).tobytes()
            loop.call_soon_threadsafe(send_queue.put_nowait, int16_data)

        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            blocksize=CHUNK_SIZE,
            channels=CHANNELS,
            dtype="float32",
            callback=audio_callback,
        ):
            print("Streaming mic audio… (Ctrl+C to stop)\n")

            async def send_audio():
                # Forward queued PCM chunks to the server as binary frames.
                while True:
                    chunk = await send_queue.get()
                    await ws.send(chunk)

            async def receive_messages():
                # Print every server message; partial transcripts are
                # superseded by the final transcript for the same segment.
                while True:
                    response = await ws.recv()
                    try:
                        message = json.loads(response)
                        msg_type = message.get("type")

                        if msg_type == "transcription":
                            is_final = message.get("is_final", False)
                            segments = message.get("segments", [])
                            text = " ".join(
                                seg.get("text", "") for seg in segments
                            ).strip()

                            if is_final:
                                print(f"[final] {text}")
                            else:
                                print(f"[partial] {text}")

                        elif msg_type == "end_audio":
                            print("[end_audio]", message.get("body"))

                        else:
                            print(f"[{msg_type}]", message)

                    except Exception as e:
                        # Best-effort logging of unparseable frames; keep
                        # the receive loop alive.
                        print("Parse error:", e, "| raw:", response[:200])

            await asyncio.gather(send_audio(), receive_messages())
108
109
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", required=True, help="Baseten model ID")
    args = parser.parse_args()

    # WebSocket endpoint of the model's production deployment.
    ws_url = f"wss://model-{args.model_id}.api.baseten.co/environments/production/websocket"
    print(f"Connecting to: {ws_url}")

    asyncio.run(stream_microphone_audio(ws_url))
{
  "segments": [
    {
      "start_time": 3.488,
      "end_time": 6.784,
      "text": "Tell me what an oak tree looks like.",
      "log_prob": -1.5155935287475586,
      "word_timestamps": [],
      "speaker": null,
      "speaker_confidence": null,
      "possible_hallucination": false,
      "beam_results": [],
      "language_code": "en",
      "language_prob": null
    }
  ],
  "language_code": "en",
  "language_prob": null,
  "diarization": [],
  "timing_info": null,
  "enable_chunk_level_language_detection": false,
  "audio_length_sec": 3.296,
  "is_final": true,
  "transcription_num": 2,
  "next_partial": null,
  "pipeline_latency": null,
  "type": "transcription"
}