Gemini Pro API

Here is a script that takes a text prompt, a camera entity from which to take a snapshot, and a media player entity on which to play back the spoken response:

# Script: take a snapshot from the selected camera, send it together with the
# text prompt to the generative-content service, speak the returned text on the
# selected media player, and hand the response back to the caller.
alias: Gemini Pro Vision
# Inputs supplied by the caller when the script is run.
fields:
  # Free-form text sent as the prompt; the selector allows multiple lines.
  prompt:
    selector:
      text:
        multiline: true
    name: prompt
  # Entity (restricted to the media_player domain) that plays the spoken reply.
  media_player:
    selector:
      entity:
        filter:
          - domain: media_player
    name: media player
  # Entity (restricted to the camera domain) that the snapshot is taken from.
  camera:
    selector:
      entity:
        filter:
          - domain: camera
    name: camera
sequence:
  # 1. Write a still image from the chosen camera to a fixed path.
  - service: camera.snapshot
    data:
      entity_id: "{{ camera }}"
      filename: /media/snapshot.jpg
  # 2. Send the prompt plus the snapshot written in step 1; the service
  #    response is stored in the `content` variable.
  - service: google_generative_ai_conversation.generate_content
    data:
      prompt: "{{ prompt }}"
      image_filename: /media/snapshot.jpg
    response_variable: content
  # 3. Speak the text of the response on the chosen media player.
  - service: tts.speak
    target:
      entity_id: tts.piper
    data:
      # The template must be quoted: an unquoted value starting with `{` is
      # parsed by YAML as a flow mapping, not as a template string.
      media_player_entity_id: "{{ media_player }}"
      message: "{{ content.text }}"
      cache: false
  # 4. Re-declare `content` as a script variable before returning it.
  #    NOTE(review): this looks redundant with the response_variable set in
  #    step 2 — confirm before removing.
  - variables:
      content: "{{ content }}"
  # 5. End the script and return `content` as the script's response.
  - stop: end
    response_variable: content
mode: single
icon: mdi:message-image