What you'll build
A web application that understands text, images, and audio, all in one AI-powered interface.
You type a question, upload a photo, or attach an audio clip, and the AI understands and responds. You can chat in plain text, ask "What's in this image?" about an uploaded photo, or send audio for the app to work with.
When you're finished, you'll have a Streamlit app that uses Gemini to process multiple input types, with conversation history and a clean interface you can deploy in minutes.
The prompt to start
Create a multimodal Streamlit app that:
- Accepts text, image, or audio as input
- Uses Gemini to process all types
- Shows formatted response
- Saves conversation history
What the AI will create
import streamlit as st
import google.generativeai as genai
from PIL import Image
import os

genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

st.set_page_config(page_title="Multimodal App", page_icon="🎨")
st.title("🎨 Multimodal AI App")

# History in session state
if "messages" not in st.session_state:
    st.session_state.messages = []

# Show history
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.write(msg["content"])
        if msg.get("image"):
            st.image(msg["image"], width=300)

# Tabs for different inputs
tab1, tab2, tab3 = st.tabs(["💬 Text", "🖼️ Image", "🎤 Audio"])

with tab1:
    text_input = st.text_area("Write your message:")
    if st.button("Send text"):
        if text_input:
            st.session_state.messages.append({
                "role": "user",
                "content": text_input
            })
            response = model.generate_content(text_input)
            st.session_state.messages.append({
                "role": "assistant",
                "content": response.text
            })
            st.rerun()

with tab2:
    uploaded_image = st.file_uploader(
        "Upload an image:",
        type=["png", "jpg", "jpeg"]
    )
    image_prompt = st.text_input("What do you want to know about the image?")
    if st.button("Analyze image"):
        if uploaded_image and image_prompt:
            image = Image.open(uploaded_image)
            st.session_state.messages.append({
                "role": "user",
                "content": image_prompt,
                "image": image
            })
            response = model.generate_content([image_prompt, image])
            st.session_state.messages.append({
                "role": "assistant",
                "content": response.text
            })
            st.rerun()

with tab3:
    audio_file = st.file_uploader(
        "Upload audio:",
        type=["mp3", "wav", "m4a"]
    )
    if st.button("Transcribe"):
        if audio_file:
            st.info("Transcription with Whisper/Gemini coming soon")

# Clear button
if st.button("🗑️ Clear history"):
    st.session_state.messages = []
    st.rerun()
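The audio tab above is only a placeholder. If you want Gemini to handle the upload right away, one option is to write it to a temporary file and send it through the library's file upload helper. This is a sketch of mine, not part of the generated app; transcribe_with_gemini and the prompt wording are hypothetical choices:

import tempfile
import google.generativeai as genai

def transcribe_with_gemini(audio_file, model):
    # Write the Streamlit upload to disk so it can go through the Files API
    suffix = "." + audio_file.name.split(".")[-1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(audio_file.getvalue())
        tmp_path = tmp.name
    # Upload the audio and ask the model for a transcript
    uploaded = genai.upload_file(tmp_path)
    response = model.generate_content(["Transcribe this audio.", uploaded])
    return response.text

Inside tab3 you would then swap the st.info(...) line for something like st.write(transcribe_with_gemini(audio_file, model)).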
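Also note that the history here is only displayed: each generate_content call sees just the latest input, so the model itself has no memory of earlier turns. If you want real multi-turn context, a minimal sketch is to keep a chat session in session state (ask_with_memory is a name I'm inventing for illustration):

# Keep a single chat session across reruns so Gemini sees prior turns
if "chat" not in st.session_state:
    st.session_state.chat = model.start_chat(history=[])

def ask_with_memory(user_text):
    # send_message records both the question and the answer in the chat history
    response = st.session_state.chat.send_message(user_text)
    return response.text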
Run
pip install streamlit google-generativeai pillow
streamlit run app.py
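The app reads its key from the GEMINI_API_KEY environment variable, so set it before launching (the value below is just a placeholder for your own key):

export GEMINI_API_KEY="your-api-key"   # macOS/Linux
# Windows PowerShell: $env:GEMINI_API_KEY="your-api-key"
streamlit run app.py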