Gemini 模型從一開始就建構於多模態的基礎上,因此可執行各種圖像處理和電腦視覺工作,包括但不限於生成圖像說明、分類和回答圖像問題,不必訓練專門的機器學習模型。
將圖片傳送給 Gemini
您可以透過下列兩種方式,將圖片做為 Gemini 的輸入內容:
- 傳遞內嵌圖片資料:適合較小的檔案 (包括提示在內,要求總大小小於 20 MB)。
- 使用 File API 上傳圖片:建議用於較大的檔案,或在多項要求中重複使用圖片。
傳遞內嵌圖片資料
您可以在對 generateContent
的要求中傳遞內嵌圖片資料。您可以提供 Base64 編碼字串形式的圖片資料,也可以直接讀取本機檔案 (視語言而定)。
以下範例說明如何從本機檔案讀取圖片,並傳遞至 generateContent
API 進行處理。
Python
from google import genai
from google.genai import types

# Create the API client; the original snippet used `client` without defining it.
client = genai.Client()

# Read the local image as raw bytes.
with open('path/to/small-sample.jpg', 'rb') as f:
    image_bytes = f.read()

# Send the image inline, followed by the text prompt.
response = client.models.generate_content(
    model='gemini-2.5-flash',
    contents=[
        types.Part.from_bytes(
            data=image_bytes,
            mime_type='image/jpeg',
        ),
        'Caption this image.'
    ]
)

print(response.text)
JavaScript
import { GoogleGenAI } from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({});

// Read the local image directly as a Base64 string.
const imageBase64 = fs.readFileSync("path/to/small-sample.jpg", {
  encoding: "base64",
});

// The image part comes first, then the text prompt.
const imagePart = {
  inlineData: {
    mimeType: "image/jpeg",
    data: imageBase64,
  },
};
const promptPart = { text: "Caption this image." };

const response = await ai.models.generateContent({
  model: "gemini-2.5-flash",
  contents: [imagePart, promptPart],
});
console.log(response.text);
Go
bytes, _ := os.ReadFile("path/to/small-sample.jpg") parts := []*genai.Part{ genai.NewPartFromBytes(bytes, "image/jpeg"), genai.NewPartFromText("Caption this image."), } contents := []*genai.Content{ genai.NewContentFromParts(parts, genai.RoleUser), } result, _ := client.Models.GenerateContent( ctx, "gemini-2.5-flash", contents, nil, ) fmt.Println(result.Text())
REST
# Send a local image to generateContent as inline Base64 data.
IMG_PATH="/path/to/your/image1.jpg"

# Pick the base64 flag for this platform: FreeBSD's base64 takes the file via
# --input, other implementations get -w0.
if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi

# The request body must be valid JSON: no trailing comma after the last part.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {
          "inline_data": {
            "mime_type":"image/jpeg",
            "data": "'"$(base64 $B64FLAGS "$IMG_PATH")"'"
          }
        },
        {"text": "Caption this image."}
      ]
    }]
  }' 2> /dev/null
您也可以從網址擷取圖片、轉換為位元組,然後傳遞至 generateContent
,如以下範例所示。
Python
from google import genai
from google.genai import types

import requests

image_path = "https://goo.gle/instrument-img"

# Download the image and fail fast on an HTTP error, so that an error page is
# never forwarded to the model as if it were image bytes.
image_response = requests.get(image_path)
image_response.raise_for_status()
image_bytes = image_response.content

image = types.Part.from_bytes(
    data=image_bytes, mime_type="image/jpeg"
)

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=["What is this image?", image],
)

print(response.text)
JavaScript
import { GoogleGenAI } from "@google/genai";

/**
 * Downloads an image from a URL, Base64-encodes it, and asks the model to
 * caption it.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the image download returns a non-2xx HTTP status.
 */
async function main() {
  const ai = new GoogleGenAI({});

  const imageUrl = "https://goo.gle/instrument-img";

  const imageResponse = await fetch(imageUrl);
  // fetch() only rejects on network failure; check the status explicitly so an
  // error page is never sent to the model as image data.
  if (!imageResponse.ok) {
    throw new Error(
      `Failed to download ${imageUrl}: HTTP ${imageResponse.status}`,
    );
  }
  const imageArrayBuffer = await imageResponse.arrayBuffer();
  const base64ImageData = Buffer.from(imageArrayBuffer).toString("base64");

  const result = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: [
      {
        inlineData: {
          mimeType: "image/jpeg",
          data: base64ImageData,
        },
      },
      { text: "Caption this image." },
    ],
  });
  console.log(result.text);
}

// Await the entry point (as the other JavaScript samples in this guide do) so
// the promise is not left floating.
await main();
Go
package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main() { ctx := context.Background() client, err := genai.NewClient(ctx, nil) if err != nil { log.Fatal(err) } // Download the image. imageResp, _ := http.Get("https://goo.gle/instrument-img") imageBytes, _ := io.ReadAll(imageResp.Body) parts := []*genai.Part{ genai.NewPartFromBytes(imageBytes, "image/jpeg"), genai.NewPartFromText("Caption this image."), } contents := []*genai.Content{ genai.NewContentFromParts(parts, genai.RoleUser), } result, _ := client.Models.GenerateContent( ctx, "gemini-2.5-flash", contents, nil, ) fmt.Println(result.Text()) }
REST
# Fetch an image from a URL and send it to generateContent as inline Base64 data.
IMG_URL="https://goo.gle/instrument-img"

# Take the MIME type from the first Content-Type response header; sed strips
# the trailing carriage return.
MIME_TYPE=$(curl -sIL "$IMG_URL" | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1)
# Fall back to image/jpeg when no header was found or it is not an image/* type.
if [[ -z "$MIME_TYPE" || ! "$MIME_TYPE" == image/* ]]; then
  MIME_TYPE="image/jpeg"
fi

# Base64-encode the downloaded bytes; the flags differ per platform.
# Check for macOS
if [[ "$(uname)" == "Darwin" ]]; then
  IMAGE_B64=$(curl -sL "$IMG_URL" | base64 -b 0)
elif [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  IMAGE_B64=$(curl -sL "$IMG_URL" | base64)
else
  IMAGE_B64=$(curl -sL "$IMG_URL" | base64 -w0)
fi

# The detected MIME type and the Base64 payload are spliced into the JSON body.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"'"$MIME_TYPE"'", "data": "'"$IMAGE_B64"'" } }, {"text": "Caption this image."} ] }] }' 2> /dev/null
使用 File API 上傳圖片
如要上傳大型檔案或重複使用同一張圖片,請使用 Files API。下列程式碼會上傳圖片檔案,然後在呼叫 generateContent
時使用該檔案。如需更多資訊和範例,請參閱 Files API 指南。
Python
from google import genai

client = genai.Client()

# Upload the image through the Files API; the returned file object is passed
# directly in `contents`.
uploaded_image = client.files.upload(file="path/to/sample.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[uploaded_image, "Caption this image."],
)

print(response.text)
JavaScript
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

/**
 * Uploads an image through the Files API, then asks the model to caption it
 * by referencing the uploaded file's URI.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const uploaded = await ai.files.upload({
    file: "path/to/sample.jpg",
    config: { mimeType: "image/jpeg" },
  });

  // Reference the uploaded file by URI instead of inlining its bytes.
  const imagePart = createPartFromUri(uploaded.uri, uploaded.mimeType);

  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: createUserContent([imagePart, "Caption this image."]),
  });
  console.log(response.text);
}

await main();
Go
package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main() { ctx := context.Background() client, err := genai.NewClient(ctx, nil) if err != nil { log.Fatal(err) } uploadedFile, _ := client.Files.UploadFromPath(ctx, "path/to/sample.jpg", nil) parts := []*genai.Part{ genai.NewPartFromText("Caption this image."), genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType), } contents := []*genai.Content{ genai.NewContentFromParts(parts, genai.RoleUser), } result, _ := client.Models.GenerateContent( ctx, "gemini-2.5-flash", contents, nil, ) fmt.Println(result.Text()) }
REST
# Upload an image with the resumable upload protocol, then reference it by URI
# in a generateContent request.
IMAGE_PATH="path/to/sample.jpg"
MIME_TYPE=$(file -b --mime-type "${IMAGE_PATH}")
NUM_BYTES=$(wc -c < "${IMAGE_PATH}")
DISPLAY_NAME=IMAGE

tmp_header_file=upload-header.tmp

# Initial resumable request defining metadata.
# The upload URL is returned in the response headers; dump them to the temp
# file. -D uses ${tmp_header_file} so the file written here is the same one
# that is read and removed below.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D "${tmp_header_file}" \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"

# Upload the actual bytes.
curl "${upload_url}" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${IMAGE_PATH}" 2> /dev/null > file_info.json

# -r prints the raw URI without JSON quotes; the quotes are added back in the
# request body below.
file_uri=$(jq -r ".file.uri" file_info.json)
echo file_uri=$file_uri

# Now generate content using that file
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"file_data":{"mime_type": "'"${MIME_TYPE}"'", "file_uri": "'"${file_uri}"'"}},
        {"text": "Caption this image."}]
    }]
  }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json
使用多張圖片撰寫提示
您可以在 contents
陣列中加入多個圖片 Part
物件,在單一提示中提供多張圖片。這些可以是內嵌資料 (本機檔案或網址) 和 File API 參照的組合。
Python
from google import genai
from google.genai import types

client = genai.Client()

# First image: uploaded through the Files API and passed by reference.
image1_path = "path/to/image1.jpg"
first_image = client.files.upload(file=image1_path)

# Second image: read from disk and sent inline as raw bytes.
image2_path = "path/to/image2.png"
with open(image2_path, 'rb') as image_file:
    second_image_bytes = image_file.read()
second_image = types.Part.from_bytes(
    data=second_image_bytes,
    mime_type='image/png'
)

# One prompt carrying the question followed by both images.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        "What is different between these two images?",
        first_image,
        second_image,
    ]
)

print(response.text)
JavaScript
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({});

/**
 * Sends one prompt with two images: the first uploaded through the Files API
 * and referenced by URI, the second inlined as Base64 data.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // First image: upload and reference by URI.
  const firstImagePath = "path/to/image1.jpg";
  const firstImage = await ai.files.upload({
    file: firstImagePath,
    config: { mimeType: "image/jpeg" },
  });
  const firstImagePart = createPartFromUri(firstImage.uri, firstImage.mimeType);

  // Second image: read from disk as Base64 and send inline.
  const secondImagePath = "path/to/image2.png";
  const secondImagePart = {
    inlineData: {
      mimeType: "image/png",
      data: fs.readFileSync(secondImagePath, { encoding: "base64" }),
    },
  };

  // The question first, then both images.
  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: createUserContent([
      "What is different between these two images?",
      firstImagePart,
      secondImagePart,
    ]),
  });
  console.log(response.text);
}

await main();
Go
// Upload the first image image1Path := "path/to/image1.jpg" uploadedFile, _ := client.Files.UploadFromPath(ctx, image1Path, nil) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes, _ := os.ReadFile(image2Path) parts := []*genai.Part{ genai.NewPartFromText("What is different between these two images?"), genai.NewPartFromBytes(imgBytes, "image/jpeg"), genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType), } contents := []*genai.Content{ genai.NewContentFromParts(parts, genai.RoleUser), } result, _ := client.Models.GenerateContent( ctx, "gemini-2.5-flash", contents, nil, ) fmt.Println(result.Text())
REST
# Upload the first image
IMAGE1_PATH="path/to/image1.jpg"
MIME1_TYPE=$(file -b --mime-type "${IMAGE1_PATH}")
NUM1_BYTES=$(wc -c < "${IMAGE1_PATH}")
DISPLAY_NAME1=IMAGE1

tmp_header_file1=upload-header1.tmp

# Start the resumable upload. -D uses ${tmp_header_file1} so the header file
# written here is the same one that is read and removed below.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D "${tmp_header_file1}" \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM1_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME1_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME1}'}}" 2> /dev/null

upload_url1=$(grep -i "x-goog-upload-url: " "${tmp_header_file1}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file1}"

# Upload the actual bytes. The API key header is sent here too, matching the
# single-image File API sample in this guide.
curl "${upload_url1}" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H "Content-Length: ${NUM1_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${IMAGE1_PATH}" 2> /dev/null > file_info1.json

# -r prints the raw URI without JSON quotes; the quotes are added back in the
# request body below.
file1_uri=$(jq -r ".file.uri" file_info1.json)
echo file1_uri=$file1_uri

# Prepare the second image (inline)
IMAGE2_PATH="path/to/image2.png"
MIME2_TYPE=$(file -b --mime-type "${IMAGE2_PATH}")

# Pick the base64 flag for this platform: FreeBSD's base64 takes the file via
# --input, other implementations get -w0.
if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi
IMAGE2_BASE64=$(base64 $B64FLAGS "$IMAGE2_PATH")

# Now generate content using both images
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"text": "What is different between these two images?"},
        {"file_data":{"mime_type": "'"${MIME1_TYPE}"'", "file_uri": "'"${file1_uri}"'"}},
        {
          "inline_data": {
            "mime_type":"'"${MIME2_TYPE}"'",
            "data": "'"$IMAGE2_BASE64"'"
          }
        }
      ]
    }]
  }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json
物件偵測
從 Gemini 2.0 開始,模型經過進一步訓練,可偵測圖片中的物件並取得定界框座標。模型傳回的座標是相對於圖片尺寸、正規化至 [0, 1000] 範圍的值;您需要依原始圖片的寬度和高度,將這些座標換算回實際的像素座標。
Python
from google import genai
from google.genai import types
from PIL import Image
import json

client = genai.Client()
prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000."

image = Image.open("/path/to/image.png")
width, height = image.size

# Request a JSON response so the text can be parsed directly.
config = types.GenerateContentConfig(
    response_mime_type="application/json"
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image, prompt],
    config=config,
)

bounding_boxes = json.loads(response.text)

# Scale each normalized [ymin, xmin, ymax, xmax] box (0-1000) to absolute
# pixels and reorder it to [x1, y1, x2, y2].
converted_bounding_boxes = [
    [
        int(entry["box_2d"][1] / 1000 * width),
        int(entry["box_2d"][0] / 1000 * height),
        int(entry["box_2d"][3] / 1000 * width),
        int(entry["box_2d"][2] / 1000 * height),
    ]
    for entry in bounding_boxes
]

print("Image size: ", width, height)
print("Bounding boxes:", converted_bounding_boxes)
如需更多範例,請參閱 Gemini 教戰手冊中的下列筆記本:
區隔
從 Gemini 2.5 開始,模型不僅能偵測項目,還能將項目區隔開來,並提供輪廓遮罩。
模型會預測 JSON 清單,其中每個項目代表一個區隔遮罩。每個項目都有定界框 (「box_2d
」),格式為 [y0, x0, y1, x1]
,其中包含介於 0 到 1000 之間的標準化座標、識別物件的標籤 (「label
」),以及定界框內的區隔遮罩 (以 Base64 編碼的 PNG,是值介於 0 到 255 之間的機率圖)。遮罩必須調整大小,使其與定界框的尺寸相符,然後以可信度門檻 (中點為 127) 進行二元化。
Python
from google import genai
from google.genai import types
from PIL import Image, ImageDraw
import io
import base64
import json
import numpy as np
import os

client = genai.Client()


def parse_json(json_output: str):
    """Strip a Markdown ```json fence from a model response.

    Returns the text between the opening "```json" line and the next "```".
    Input without a "```json" line is returned unchanged.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line == "```json":
            # Drop everything up to and including the opening fence.
            json_output = "\n".join(lines[i+1:])
            # Drop the closing fence and anything after it. The original
            # assigned this to an unused variable, so the fence was returned.
            json_output = json_output.split("```")[0]
            break  # Only the first fenced block is used.
    return json_output


def extract_segmentation_masks(image_path: str, output_dir: str = "segmentation_outputs"):
    """Request segmentation masks for an image and save each mask and overlay.

    Args:
        image_path: Path of the image to segment.
        output_dir: Directory that receives "<label>_<i>_mask.png" and
            "<label>_<i>_overlay.png" files; created if it does not exist.

    Entries with an empty bounding box, or whose mask is not a
    "data:image/png;base64," string, are skipped.
    """
    # Load and resize image
    im = Image.open(image_path)
    im.thumbnail([1024, 1024], Image.Resampling.LANCZOS)

    prompt = """
    Give the segmentation masks for the wooden and glass items.
    Output a JSON list of segmentation masks where each entry contains the 2D
    bounding box in the key "box_2d", the segmentation mask in key "mask", and
    the text label in the key "label". Use descriptive labels.
    """

    config = types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0)  # set thinking_budget to 0 for better results in object detection
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[prompt, im],  # Pillow images can be directly passed as inputs (which will be converted by the SDK)
        config=config
    )

    # Parse JSON response
    items = json.loads(parse_json(response.text))

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Process each mask
    for i, item in enumerate(items):
        # Scale the normalized [y0, x0, y1, x1] box (0-1000) to pixels of the
        # resized image.
        box = item["box_2d"]
        y0 = int(box[0] / 1000 * im.size[1])
        x0 = int(box[1] / 1000 * im.size[0])
        y1 = int(box[2] / 1000 * im.size[1])
        x1 = int(box[3] / 1000 * im.size[0])

        # Skip invalid boxes
        if y0 >= y1 or x0 >= x1:
            continue

        # Process mask
        png_str = item["mask"]
        if not png_str.startswith("data:image/png;base64,"):
            continue

        # Remove prefix
        png_str = png_str.removeprefix("data:image/png;base64,")
        mask_data = base64.b64decode(png_str)
        mask = Image.open(io.BytesIO(mask_data))

        # Resize mask to match bounding box
        mask = mask.resize((x1 - x0, y1 - y0), Image.Resampling.BILINEAR)

        # Convert mask to numpy array for processing
        mask_array = np.array(mask)

        # Create overlay for this mask
        overlay = Image.new('RGBA', im.size, (0, 0, 0, 0))
        overlay_draw = ImageDraw.Draw(overlay)

        # Paint every pixel whose mask value is above the 127 midpoint.
        color = (255, 255, 255, 200)
        for y in range(y0, y1):
            for x in range(x0, x1):
                if mask_array[y - y0, x - x0] > 127:  # Threshold for mask
                    overlay_draw.point((x, y), fill=color)

        # Save individual mask and its overlay
        mask_filename = f"{item['label']}_{i}_mask.png"
        overlay_filename = f"{item['label']}_{i}_overlay.png"

        mask.save(os.path.join(output_dir, mask_filename))

        # Create and save overlay
        composite = Image.alpha_composite(im.convert('RGBA'), overlay)
        composite.save(os.path.join(output_dir, overlay_filename))
        print(f"Saved mask and overlay for {item['label']} to {output_dir}")


# Example usage
if __name__ == "__main__":
    extract_segmentation_masks("path/to/image.png")
如需更詳細的範例,請參閱 Gemini 教戰手冊 (Cookbook) 中的區隔範例。

支援的圖片格式
Gemini 支援下列圖片格式的 MIME 類型:
- PNG -
image/png
- JPEG -
image/jpeg
- WebP -
image/webp
- HEIC -
image/heic
- HEIF -
image/heif
功能
所有 Gemini 模型版本都是多模態,可用於各種圖像處理和電腦視覺工作,包括但不限於圖像說明、圖像問題與回答、圖像分類、物件偵測和區隔 (分割)。
視品質和效能需求而定,Gemini 可減少使用專業機器學習模型的需求。
除了通用功能外,部分後續模型版本還經過特別訓練,可提升特定工作的準確度:
限制和重要技術資訊
檔案限制
Gemini 2.5 Pro/Flash、2.0 Flash、1.5 Pro 和 1.5 Flash 支援每個要求最多 3,600 個圖片檔案。
權杖計算
- Gemini 1.5 Flash 和 Gemini 1.5 Pro:如果兩個維度都 <= 384 像素,則計為 258 個權杖。較大的圖片會切割成圖塊 (圖塊最小 256 像素、最大 768 像素,並調整為 768x768),每個圖塊計為 258 個權杖。
- Gemini 2.0 Flash 和 Gemini 2.5 Flash/Pro:如果兩個維度都 <= 384 像素,則計為 258 個權杖。較大的圖片會分割成 768x768 像素的圖塊,每個圖塊計為 258 個權杖。
提示與最佳做法
- 確認圖片已正確旋轉。
- 使用清晰的圖片,不要模糊不清。
- 使用含有文字的單一圖片時,請將文字提示詞放在
contents
陣列的圖片部分之後。
後續步驟
本指南說明如何上傳圖片檔案,並從圖片輸入內容生成文字輸出內容。如要進一步瞭解相關內容,請參閱下列資源: