image generation

NVIDIA Cosmos 3 Nano (8B)

NVIDIA Cosmos 3 Nano (8B) is a world foundation model with six omni modes: text2image, text2video, image2video, forward_dynamics, inverse_dynamics, and policy. It runs on a single H100.

Deploy now

Model details

Developed by
NVIDIA
Model family
Nemotron
Use case
image generation
Version
V1
Variant
Nano
Size
8B
Hardware
H100
License
NVIDIA AI Foundation Models Community License Agreement

Example usage

text2image — render a single still image from a text prompt (num_frames: 1). Returns a base64 JPEG.

Input

# text2image: render a single still image from a text prompt and save the
# decoded result to the current directory.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection. Raise the read value if your
# deployment needs longer to respond.
REQUEST_TIMEOUT = (10, 900)

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "text2image",
        "prompt": "A medium shot of a modern robotics research laboratory with white walls and a gray floor.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 1,  # one frame -> a still image
        "seed": 0,
    },
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on HTTP errors instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
out_path = f"vision.{out['format']}"
with open(out_path, "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved {out_path}")

JSON output

{
    "name": "<sample-name>",
    "model_mode": "text2image",
    "format": "jpg",
    "data": "<base64-encoded media>"
}

text2video (default mode) — generate a video clip from a text prompt. Returns a base64 MP4.

Input

# text2video: generate a video clip from a text prompt and save the decoded
# result to the current directory.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection. Raise the read value if your
# deployment needs longer to respond.
REQUEST_TIMEOUT = (10, 900)

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "text2video",
        "prompt": "A bustling city street at night, neon signs reflecting on wet pavement, light rain falling.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 189,
        "fps": 24,
        "seed": 0,
    },
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on HTTP errors instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
out_path = f"vision.{out['format']}"
with open(out_path, "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved {out_path}")

JSON output

{
    "name": "<sample-name>",
    "model_mode": "text2video",
    "format": "mp4",
    "data": "<base64-encoded media>"
}

image2video — animate a still image conditioned on a text prompt. For best results, pass a structured prompt (scene, subject, action, motion, camera, temporal_sequence, constraints) JSON-encoded as a string in the prompt field, per NVIDIA's recommendation. Provide the first frame as vision_path (URL or container path). Returns a base64 MP4.

Input

# image2video: animate a still image (given as `vision_path`) conditioned on a
# structured text prompt, and save the decoded video to the current directory.
import base64
import json
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection. Raise the read value if your
# deployment needs longer to respond.
REQUEST_TIMEOUT = (10, 900)

# Cosmos 3 prefers a *structured* prompt for image2video. Describe the scene,
# subject, action, motion, camera, temporal sequence, and constraints, then
# pass it JSON-encoded as a string in the `prompt` field.
prompt = {
    "scene": "A modern robotics lab with a robotic arm, red spherical object, metal shelf, white walls, and gray floor.",
    "subject": "The robotic arm",
    "action": "picks up the red spherical object and places it on the lower shelf",
    "motion": {
        "type": "smooth deliberate manipulation",
        "speed": "slow and controlled",
        "trajectory": "arm moves from tabletop to lower shelf",
    },
    "camera": {
        "movement": "locked-off static camera",
        "framing": "medium shot",
        "preserve_composition": True,
    },
    "temporal_sequence": [
        "The gripper approaches the red sphere",
        "The gripper closes around the sphere",
        "The arm lifts the sphere",
        "The arm moves to the lower shelf",
        "The gripper releases the sphere",
    ],
    "constraints": [
        "Preserve the identity and layout of the input image",
        "Do not introduce new objects",
        "Keep lighting and background consistent",
        "Avoid fast or jerky motion",
    ],
}

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "image2video",
        # First frame of the clip: a URL here (a container path is also accepted).
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_153.jpg",
        # The structured prompt travels as a JSON string, not a nested object.
        "prompt": json.dumps(prompt),
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 189,
        "fps": 24,
        "seed": 0,
    },
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on HTTP errors instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
out_path = f"vision.{out['format']}"
with open(out_path, "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved {out_path}")

JSON output

{
    "name": "<sample-name>",
    "model_mode": "image2video",
    "format": "mp4",
    "data": "<base64-encoded media>"
}

forward_dynamics — predict the future video rollout from an initial observation plus an action sequence (vision_path + action_path). Returns a base64 MP4.

Input

# forward_dynamics: predict the future video rollout from an initial
# observation (`vision_path`) plus an action sequence (`action_path`), and save
# the decoded video to the current directory.
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection. Raise the read value if your
# deployment needs longer to respond.
REQUEST_TIMEOUT = (10, 900)

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "forward_dynamics",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
        "action_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.json",
        "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 5,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "action_chunk_size": 16,
        "domain_name": "bridge_orig_lerobot",
    },
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on HTTP errors instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
out_path = f"vision.{out['format']}"
with open(out_path, "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved {out_path}")

JSON output

{
    "name": "<sample-name>",
    "model_mode": "forward_dynamics",
    "format": "mp4",
    "data": "<base64-encoded media>"
}

inverse_dynamics — infer the action sequence that produced an observed video. Returns the action tensor under sample_outputs.

Input

# inverse_dynamics: infer the action sequence that produced an observed video
# (`vision_path`) and print a summary of the returned actions.
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection. Raise the read value if your
# deployment needs longer to respond.
REQUEST_TIMEOUT = (10, 900)

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "inverse_dynamics",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
        "prompt": "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 10,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "raw_action_dim": 9,
        "action_chunk_size": 60,
        "domain_name": "av",
    },
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on HTTP errors instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Action modes return the predicted action tensor under sample_outputs.
# No media is decoded here, so base64 is not needed in this example.
actions = out["sample_outputs"]["outputs"][0]["content"]["action"]
print(f"{len(actions)} action steps")
print(actions[0])

JSON output

{
    "name": "<sample-name>",
    "model_mode": "inverse_dynamics",
    "format": "mp4",
    "data": "<base64 of input video echoed back>",
    "sample_outputs": {
        "status": "success",
        "outputs": [
            {
                "content": {
                    "action": [
                        [
                            "..."
                        ],
                        [
                            "..."
                        ]
                    ]
                },
                "files": [
                    "..."
                ]
            }
        ]
    }
}

policy — predict the actions needed to reach a goal (described in the prompt) from a starting observation. Returns the action tensor under sample_outputs.

Input

# policy: predict the actions needed to reach the goal described in the prompt
# from a starting observation (`vision_path`), and print a summary of them.
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection. Raise the read value if your
# deployment needs longer to respond.
REQUEST_TIMEOUT = (10, 900)

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "policy",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
        "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 5,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "action_chunk_size": 16,
        "raw_action_dim": 10,
        "domain_name": "bridge_orig_lerobot",
    },
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on HTTP errors instead of trying to decode an error body.
resp.raise_for_status()
out = resp.json()

# Action modes return the predicted action tensor under sample_outputs.
# No media is decoded here, so base64 is not needed in this example.
actions = out["sample_outputs"]["outputs"][0]["content"]["action"]
print(f"{len(actions)} action steps")
print(actions[0])

JSON output

{
    "name": "<sample-name>",
    "model_mode": "policy",
    "format": "mp4",
    "data": "<base64 of input video echoed back>",
    "sample_outputs": {
        "status": "success",
        "outputs": [
            {
                "content": {
                    "action": [
                        [
                            "..."
                        ],
                        [
                            "..."
                        ]
                    ]
                },
                "files": [
                    "..."
                ]
            }
        ]
    }
}