EmbeddingGemma

A 300-million-parameter embedding model by Google with leading size-adjusted performance
Model details

EmbeddingGemma is a 300-million-parameter text embedding model from Google, based on the Gemma 3 architecture. On the MTEB benchmark it offers the highest quality of any embedding model under half a billion parameters, supports over 100 languages, and accepts text input of up to 2K tokens.

Example usage
Input
import os
from baseten_performance_client import (
    PerformanceClient, OpenAIEmbeddingsResponse,
)

api_key = os.environ.get("BASETEN_API_KEY")
model_id = "abcd1234"
base_url = f"https://model-{model_id}.api.baseten.co/environments/production/sync"

client = PerformanceClient(base_url=base_url, api_key=api_key)

# EmbeddingGemma expects each input to carry a task-specific prompt prefix.
prompts = {
    "query": "task: search result | query: ",
    "document": "title: none | text: ",
    "BitextMining": "task: search result | query: ",
    "Clustering": "task: clustering | query: ",
    "Classification": "task: classification | query: ",
    "InstructionRetrieval": "task: code retrieval | query: ",
    "MultilabelClassification": "task: classification | query: ",
    "PairClassification": "task: sentence similarity | query: ",
    "Reranking": "task: search result | query: ",
    "Retrieval": "task: search result | query: ",
    "Retrieval-query": "task: search result | query: ",
    "Retrieval-document": "title: none | text: ",
    "STS": "task: sentence similarity | query: ",
    "Summarization": "task: summarization | query: ",
}

def get_detailed_instruct(query: str, task: str) -> str:
    """Prepend the prompt prefix for the given task to the query text."""
    task_str = prompts[task]
    return f'{task_str}{query}'

# Embed one retrieval query and two candidate documents, each with the
# prompt prefix that matches its role.
texts = [
    get_detailed_instruct("Which planet is known as the Red Planet?", "Retrieval-query"),
    get_detailed_instruct("Mars, known for its reddish appearance, is often referred to as the Red Planet.", "Retrieval-document"),
    get_detailed_instruct("Jupiter, the largest planet in our solar system, has a prominent red spot.", "Retrieval-document"),
]

response: OpenAIEmbeddingsResponse = client.embed(
    input=texts,
    model="embeddinggemma",
    batch_size=32,
    max_concurrent_requests=128,
)
array = response.numpy()
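The returned array holds one embedding per row, in the same order as texts. As a minimal follow-up sketch (assuming the returned vectors are not already L2-normalized), the two documents can be ranked against the query by cosine similarity:

import numpy as np

# Row 0 is the query; rows 1 and 2 are the documents (order matches texts).
normed = array / np.linalg.norm(array, axis=1, keepdims=True)  # L2-normalize each row
scores = normed[1:] @ normed[0]  # cosine similarity of each document vs. the query
best = int(scores.argmax())      # index of the best-matching document
print(scores, best)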
JSON output
{
  "data": [
    {
      "embedding": [
        0
      ],
      "index": 0,
      "object": "embedding"
    }
  ],
  "model": "embeddinggemma",
  "object": "list",
  "usage": {
    "prompt_tokens": 512,
    "total_tokens": 512
  }
}
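The same response shape can also be fetched without the performance client. Below is a minimal sketch using plain HTTP, assuming the deployment exposes an OpenAI-compatible /v1/embeddings route under the base URL used above:

import os
import requests

model_id = "abcd1234"
url = f"https://model-{model_id}.api.baseten.co/environments/production/sync/v1/embeddings"

resp = requests.post(
    url,
    headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
    json={
        "input": ["task: search result | query: Which planet is known as the Red Planet?"],
        "model": "embeddinggemma",
    },
)
print(resp.json())  # parses into the structure shown above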