What you'll build
An image classifier that analyzes photos and categorizes them automatically using multimodal AI.
You pass it an image (from file or URL) and it tells you what it is: "cat, 95% confidence, tabby cat sleeping on a sofa". You can define your own categories to classify products, documents, or whatever you need.
When finished, you'll have a Python script that uses Gemini Vision to analyze images. It returns the category, confidence level, and a description. Useful for organizing photos, moderating content, or automating inventory.
The prompt to start
Create an image classifier in Python that:
- Receives an image (path or URL)
- Uses Gemini Vision to analyze it
- Returns: category, confidence, description
- Supports custom categories
What the AI will create
import google.generativeai as genai
from PIL import Image
import requests
from io import BytesIO
import os
import json
# Configure the Gemini client once, when this module is first executed.
# The key is read from the GEMINI_API_KEY environment variable; indexing
# os.environ raises KeyError if the variable is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
# Module-level model handle used for the generate_content call in classify().
# NOTE(review): confirm "gemini-1.5-flash" is still an available model name.
model = genai.GenerativeModel("gemini-1.5-flash")
def load_image(source: str) -> Image.Image:
    """Load an image from a local path or an HTTP(S) URL.

    Args:
        source: Filesystem path, or a URL starting with ``http://`` or
            ``https://``.

    Returns:
        The opened PIL image.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        OSError: If the path cannot be opened or the data is not an image
            that Pillow can read.
    """
    # Match the full scheme so a local file such as "http_cache/cat.jpg" is
    # not mistaken for a URL.
    if source.startswith(("http://", "https://")):
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(source, timeout=30)
        # Fail here on 4xx/5xx instead of handing an error page to Pillow.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    return Image.open(source)
def _build_prompt(categories: "list[str] | None") -> str:
    """Build the instruction text, constrained to *categories* when given."""
    if categories:
        cat_list = ", ".join(categories)
        instruction = (
            "Analyze this image and classify it in ONE of these categories: "
            f"{cat_list}"
        )
        category_hint = "chosen category"
    else:
        instruction = "Analyze this image."
        category_hint = "general category"
    # Both variants share the same JSON schema; only the first line and the
    # hint for the "category" field differ.
    return (
        f"{instruction}\n"
        "Respond in JSON:\n"
        "{\n"
        f'"category": "{category_hint}",\n'
        '"confidence": 0.0 to 1.0,\n'
        '"description": "brief description",\n'
        '"objects": ["detected objects"]\n'
        "}"
    )


def _parse_json_reply(text: str) -> dict:
    """Parse the JSON object in a model reply, ignoring any Markdown fence."""
    payload = text.strip()
    # str.strip("```json") removes a *set of characters* from both ends, not
    # the literal fence, so it is fragile. Slice out the outermost {...} span
    # instead; this also tolerates text around the fenced block.
    start = payload.find("{")
    end = payload.rfind("}")
    if start != -1 and end > start:
        payload = payload[start:end + 1]
    # If no braces were found, json.loads raises JSONDecodeError as before.
    return json.loads(payload)


def classify(
    image_source: str,
    categories: list[str] = None
) -> dict:
    """Classify an image with the Gemini model.

    Args:
        image_source: Local path or HTTP(S) URL, passed to ``load_image``.
        categories: Optional labels the model must choose from. When omitted
            or empty, the model is asked for a general category.

    Returns:
        The JSON object parsed from the model's reply. The prompt asks for
        the keys ``category``, ``confidence``, ``description`` and
        ``objects``, but the reply is not validated against that schema.

    Raises:
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    image = load_image(image_source)
    prompt = _build_prompt(categories)
    response = model.generate_content([prompt, image])
    return _parse_json_reply(response.text)
# Usage examples
# Free-form classification: no category list, so the model picks its own label.
result = classify("cat.jpg")
print(f"Category: {result['category']}")
print(f"Confidence: {result['confidence']:.0%}")  # e.g. 0.95 prints as "95%"
print(f"Description: {result['description']}")

# With custom categories: the model is asked to pick exactly one of these.
result = classify("photo.jpg", ["food", "landscape", "person", "animal", "object"])
Variant: Batch classification
def classify_batch(images: list[str], categories: list[str] = None) -> list[dict]:
    """Classify several images, collecting failures instead of raising.

    Args:
        images: Paths or URLs, each passed to ``classify``.
        categories: Optional labels forwarded to ``classify``. Defaults to
            ``None`` (general classification), matching ``classify``.

    Returns:
        One dict per input, in input order. On success it is the
        ``classify`` result with an added ``"source"`` key; on failure it is
        ``{"source": ..., "error": ...}``.
    """
    results = []
    for img in images:
        try:
            result = classify(img, categories)
            result["source"] = img
        except Exception as e:
            # Deliberate best-effort: one bad image must not abort the batch.
            # Some exceptions have an empty message; fall back to the class
            # name so the "error" value is never an empty (falsy) string.
            results.append({"source": img, "error": str(e) or type(e).__name__})
        else:
            results.append(result)
    return results
Next level
→ Voice Assistant