NVIDIA Cosmos 3 Nano (8B)
NVIDIA Cosmos 3 Nano (8B) is a world foundation model with six omni modes: text2image, text2video, image2video, forward_dynamics, inverse_dynamics, and policy. Runs on a single H100 GPU.
Model details
Example usage
text2image — render a single still image from a text prompt (num_frames: 1). Returns a base64 JPEG.
```python
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "text2image",
        "prompt": "A medium shot of a modern robotics research laboratory with white walls and a gray floor.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 1,
        "seed": 0
    },
)
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
```

```json
{
  "name": "<sample-name>",
  "model_mode": "text2image",
  "format": "jpg",
  "data": "<base64-encoded media>"
}
```

text2video (default mode) — generate a video clip from a text prompt. Returns a base64 MP4.
```python
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "text2video",
        "prompt": "A bustling city street at night, neon signs reflecting on wet pavement, light rain falling.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 189,
        "fps": 24,
        "seed": 0
    },
)
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
```

```json
{
  "name": "<sample-name>",
  "model_mode": "text2video",
  "format": "mp4",
  "data": "<base64-encoded media>"
}
```

image2video — animate a still image conditioned on a text prompt. Pass the first frame as vision_path (URL or container path). Returns a base64 MP4.
```python
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "image2video",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_153.jpg",
        "prompt": "A robotic arm picks up the red spherical object and places it on a lower shelf, completing a smooth deliberate manipulation.",
        "resolution": "720",
        "aspect_ratio": "16,9",
        "num_frames": 189,
        "fps": 24,
        "seed": 0
    },
)
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
```

```json
{
  "name": "<sample-name>",
  "model_mode": "image2video",
  "format": "mp4",
  "data": "<base64-encoded media>"
}
```

forward_dynamics — predict the future video rollout from an initial observation plus an action sequence (vision_path + action_path). Returns a base64 MP4.
```python
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "forward_dynamics",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
        "action_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.json",
        "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 5,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "action_chunk_size": 16,
        "domain_name": "bridge_orig_lerobot"
    },
)
resp.raise_for_status()
out = resp.json()

# Vision modes return base64-encoded media in `data`.
with open(f"vision.{out['format']}", "wb") as f:
    f.write(base64.b64decode(out["data"]))
print(f"saved vision.{out['format']}")
```

```json
{
  "name": "<sample-name>",
  "model_mode": "forward_dynamics",
  "format": "mp4",
  "data": "<base64-encoded media>"
}
```

inverse_dynamics — infer the action sequence that produced an observed video. Returns the action tensor under sample_outputs.
```python
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "inverse_dynamics",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
        "prompt": "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 10,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "raw_action_dim": 9,
        "action_chunk_size": 60,
        "domain_name": "av"
    },
)
resp.raise_for_status()
out = resp.json()

# Action modes return the predicted action tensor under sample_outputs.
actions = out["sample_outputs"]["outputs"][0]["content"]["action"]
print(f"{len(actions)} action steps")
print(actions[0])
```

```json
{
  "name": "<sample-name>",
  "model_mode": "inverse_dynamics",
  "format": "mp4",
  "data": "<base64 of input video echoed back>",
  "sample_outputs": {
    "status": "success",
    "outputs": [
      {
        "content": {
          "action": [
            [
              "..."
            ],
            [
              "..."
            ]
          ]
        },
        "files": [
          "..."
        ]
      }
    ]
  }
}
```

policy — predict the actions needed to reach a goal (described in the prompt) from a starting observation. Returns the action tensor under sample_outputs.
```python
import base64
import os

import requests

# Find MODEL_ID in the Baseten dashboard: Models -> your deployment -> API.
MODEL_ID = "YOUR_MODEL_ID"
BASETEN_API_KEY = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{MODEL_ID}.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "model_mode": "policy",
        "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
        "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
        "image_size": 480,
        "fps": 5,
        "num_steps": 30,
        "guidance": 1.0,
        "shift": 5.0,
        "seed": 0,
        "action_chunk_size": 16,
        "raw_action_dim": 10,
        "domain_name": "bridge_orig_lerobot"
    },
)
resp.raise_for_status()
out = resp.json()

# Action modes return the predicted action tensor under sample_outputs.
actions = out["sample_outputs"]["outputs"][0]["content"]["action"]
print(f"{len(actions)} action steps")
print(actions[0])
```

```json
{
  "name": "<sample-name>",
  "model_mode": "policy",
  "format": "mp4",
  "data": "<base64 of input video echoed back>",
  "sample_outputs": {
    "status": "success",
    "outputs": [
      {
        "content": {
          "action": [
            [
              "..."
            ],
            [
              "..."
            ]
          ]
        },
        "files": [
          "..."
        ]
      }
    ]
  }
}
```