Here is a script that takes a text prompt, a camera to take a snapshot with, and a media player to play back the response:
# Script: snapshot a camera, send the image plus a text prompt to the
# Google Generative AI integration, speak the answer on a media player,
# and hand the generated content back to the caller.
alias: Gemini Pro Vision

# Inputs supplied by the caller; each is available as a template
# variable of the same name inside `sequence`.
fields:
  # Free-form (multi-line) text sent to the model.
  prompt:
    selector:
      text:
        multiline: true
    name: prompt
  # Media player entity that plays the spoken response.
  media_player:
    selector:
      entity:
        filter:
          - domain: media_player
    name: media player
  # Camera entity the snapshot is taken from.
  camera:
    selector:
      entity:
        filter:
          - domain: camera
    name: camera

sequence:
  # 1. Save a still image from the selected camera.
  - service: camera.snapshot
    data:
      entity_id: "{{ camera }}"
      filename: /media/snapshot.jpg

  # 2. Send the prompt and the snapshot just written to the model;
  #    the service response is stored in `content`.
  - service: google_generative_ai_conversation.generate_content
    data:
      prompt: "{{ prompt }}"
      image_filename: /media/snapshot.jpg
    response_variable: content

  # 3. Speak the generated text on the selected media player.
  - service: tts.speak
    target:
      entity_id: tts.piper
    data:
      # Templates must be quoted: a bare leading `{` is read by YAML
      # as the start of a flow mapping, not as a string.
      media_player_entity_id: "{{ media_player }}"
      message: "{{ content.text }}"
      cache: false

  # 4. Re-assign `content` as a script variable (kept from the
  #    original; presumably so it is in scope for the `stop` step
  #    below — TODO confirm whether this step is still required).
  - variables:
      content: "{{ content }}"

  # 5. End the script and return `content` as its response.
  - stop: end
    response_variable: content

mode: single
icon: mdi:message-image