from pathlib import Path

from agno.agent import Agent, RunOutput
from agno.media import Image
from agno.models.openai import OpenAIResponses
from agno.utils.audio import write_audio_to_file
from rich import print
from rich.text import Text

# Directory of this script (useful for locating local assets)
cwd = Path(__file__).parent.resolve()

# Agent that looks at the image and writes a short story
image_agent = Agent(model=OpenAIResponses(id="gpt-5.2"))

image_path = Path(__file__).parent.joinpath("sample.jpg")

# Generate a three-sentence story from the image
image_story: RunOutput = image_agent.run(
    "Write a 3 sentence fiction story about the image",
    images=[Image(filepath=image_path)],
)
formatted_text = Text.from_markup(
    f":sparkles: [bold magenta]Story:[/bold magenta] {image_story.content} :sparkles:"
)
print(formatted_text)

# Agent with audio output enabled to narrate the story
audio_agent = Agent(
    model=OpenAIResponses(
        id="gpt-5.2-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "sage", "format": "wav"},
    ),
)

audio_story: RunOutput = audio_agent.run(
    f"Narrate the story with flair: {image_story.content}"
)

# Save the narrated audio as a WAV file
if audio_story.response_audio is not None:
    write_audio_to_file(
        audio=audio_story.response_audio.content, filename="tmp/sample_story.wav"
    )
Set up your virtual environment
uv venv --python 3.12
source .venv/bin/activate
Install dependencies
uv pip install -U agno rich
Export your OpenAI API key
export OPENAI_API_KEY="your_openai_api_key_here"
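The script above reads sample.jpg from the directory it lives in and writes the narration to tmp/sample_story.wav. As an optional pre-flight check (a minimal sketch; the variable names are illustrative, and the paths are the ones used in the script), you can confirm the image is in place and create the output directory:
from pathlib import Path

# Paths assumed by the script above; adjust if you renamed anything
script_dir = Path(".")
sample_image = script_dir / "sample.jpg"
output_dir = script_dir / "tmp"

if not sample_image.exists():
    raise SystemExit("Place a sample.jpg next to image_to_audio.py before running it.")

# Make sure the folder for tmp/sample_story.wav exists
output_dir.mkdir(parents=True, exist_ok=True)
print("Pre-flight check passed.")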
Run Agent
python image_to_audio.py
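After the run completes, tmp/sample_story.wav should contain the narration. As an optional sanity check using only the standard library (a sketch assuming the output path from the script above), you can inspect the WAV file before playing it:
import wave
from pathlib import Path

audio_path = Path("tmp/sample_story.wav")

if audio_path.exists():
    # Read basic WAV metadata to confirm the file was written
    with wave.open(str(audio_path), "rb") as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
        duration = frames / float(rate)
    print(f"Narration saved: {duration:.1f}s at {rate} Hz")
else:
    print("No audio file found; check that the model returned audio output.")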