Introducing Baseten Loops: A Training SDK for Frontier RL. Learn more here
image generation

NVIDIA Cosmos 3 Nano (8B)

NVIDIA Cosmos 3 Nano (8B) is a world foundation model with six omni modes: text2image, text2video, image2video, forward_dynamics, inverse_dynamics, and policy. Runs on a single H100.

Model details

Example usage

text2image — render a single still image from a text prompt (num_frames: 1). Returns a base64 JPEG.

Input
# text2image example: POST a prompt to the deployment's production predict
# endpoint and save the returned media as vision.<format> in the current
# directory. Requires the BASETEN_API_KEY environment variable.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "text2image",
        "prompt": "A medium shot of a modern robotics research laboratory with white walls and a gray floor.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 1,
        "seed": 0,
    },
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on a stalled connection. The read timeout is deliberately
    # generous because generation can be slow; tune it for your workload.
    timeout=(10, 1800),
)
# Fail loudly on a non-2xx response instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
JSON output
{
    "name": "<sample-name>",
    "model_mode": "text2image",
    "format": "jpg",
    "data": "<base64-encoded media>"
}

text2video (default mode) — generate a video clip from a text prompt. Returns a base64 MP4.

Input
# text2video example: POST a prompt to the deployment's production predict
# endpoint and save the returned media as vision.<format> in the current
# directory. Requires the BASETEN_API_KEY environment variable.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "text2video",
        "prompt": "A bustling city street at night, neon signs reflecting on wet pavement, light rain falling.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 189,
        "fps": 24,
        "seed": 0,
    },
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on a stalled connection. The read timeout is deliberately
    # generous because video generation can be slow; tune it for your workload.
    timeout=(10, 1800),
)
# Fail loudly on a non-2xx response instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
JSON output
{
    "name": "<sample-name>",
    "model_mode": "text2video",
    "format": "mp4",
    "data": "<base64-encoded media>"
}

image2video — animate a still image conditioned on a text prompt. Pass the first frame as vision_path (URL or container path). Returns a base64 MP4.

Input
# image2video example: POST a first-frame image reference (vision_path) and a
# prompt to the deployment's production predict endpoint, then save the
# returned media as vision.<format> in the current directory.
# Requires the BASETEN_API_KEY environment variable.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "image2video",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_153.jpg",
        "prompt": "A robotic arm picks up the red spherical object and places it on a lower shelf, completing a smooth deliberate manipulation.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 189,
        "fps": 24,
        "seed": 0,
    },
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on a stalled connection. The read timeout is deliberately
    # generous because video generation can be slow; tune it for your workload.
    timeout=(10, 1800),
)
# Fail loudly on a non-2xx response instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
JSON output
{
    "name": "<sample-name>",
    "model_mode": "image2video",
    "format": "mp4",
    "data": "<base64-encoded media>"
}

forward_dynamics — predict the future video rollout from an initial observation plus an action sequence (vision_path + action_path). Returns a base64 MP4.

Input
# forward_dynamics example: POST an initial observation (vision_path), an
# action sequence (action_path) and a prompt to the deployment's production
# predict endpoint, then save the returned media as vision.<format> in the
# current directory. Requires the BASETEN_API_KEY environment variable.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "forward_dynamics",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
        "action_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.json",
        "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 5,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "action_chunk_size": 16,
        "domain_name": "bridge_orig_lerobot",
    },
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on a stalled connection. The read timeout is deliberately
    # generous because video generation can be slow; tune it for your workload.
    timeout=(10, 1800),
)
# Fail loudly on a non-2xx response instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
JSON output
{
    "name": "<sample-name>",
    "model_mode": "forward_dynamics",
    "format": "mp4",
    "data": "<base64-encoded media>"
}

inverse_dynamics — infer the action sequence that produced an observed video. Returns the action tensor under sample_outputs.

Input
# inverse_dynamics example: POST an observed video (vision_path) and a prompt
# to the deployment's production predict endpoint, then read the predicted
# actions out of `sample_outputs` and print a short summary.
# Requires the BASETEN_API_KEY environment variable.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "inverse_dynamics",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
        "prompt": "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 10,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "raw_action_dim": 9,
        "action_chunk_size": 60,
        "domain_name": "av",
    },
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on a stalled connection. The read timeout is deliberately
    # generous because inference can be slow; tune it for your workload.
    timeout=(10, 1800),
)
# Fail loudly on a non-2xx response instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Action modes return the predicted action tensor under sample_outputs.
actions = out["sample_outputs"]["outputs"][0]["content"]["action"]
print(f"{len(actions)} action steps")
# Only show the first step when there is one, so an empty result does not
# raise IndexError after the count has already been printed.
if actions:
    print(actions[0])
JSON output
{
    "name": "<sample-name>",
    "model_mode": "inverse_dynamics",
    "format": "mp4",
    "data": "<base64 of input video echoed back>",
    "sample_outputs": {
        "status": "success",
        "outputs": [
            {
                "content": {
                    "action": [
                        [
                            "..."
                        ],
                        [
                            "..."
                        ]
                    ]
                },
                "files": [
                    "..."
                ]
            }
        ]
    }
}

policy — predict the actions needed to reach a goal (described in the prompt) from a starting observation. Returns the action tensor under sample_outputs.

Input
# policy example: POST a starting observation (vision_path) and a goal prompt
# to the deployment's production predict endpoint, then read the predicted
# actions out of `sample_outputs` and print a short summary.
# Requires the BASETEN_API_KEY environment variable.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "policy",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
        "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 5,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "action_chunk_size": 16,
        "raw_action_dim": 10,
        "domain_name": "bridge_orig_lerobot",
    },
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on a stalled connection. The read timeout is deliberately
    # generous because inference can be slow; tune it for your workload.
    timeout=(10, 1800),
)
# Fail loudly on a non-2xx response instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Action modes return the predicted action tensor under sample_outputs.
actions = out["sample_outputs"]["outputs"][0]["content"]["action"]
print(f"{len(actions)} action steps")
# Only show the first step when there is one, so an empty result does not
# raise IndexError after the count has already been printed.
if actions:
    print(actions[0])
JSON output
{
    "name": "<sample-name>",
    "model_mode": "policy",
    "format": "mp4",
    "data": "<base64 of input video echoed back>",
    "sample_outputs": {
        "status": "success",
        "outputs": [
            {
                "content": {
                    "action": [
                        [
                            "..."
                        ],
                        [
                            "..."
                        ]
                    ]
                },
                "files": [
                    "..."
                ]
            }
        ]
    }
}

🔥 Trending models