Need a working Voice Assistant

Hi,
I am trying to establish a couple of local voice assistants in my home.
Until now we have had a Google Nest or Nest Hub in every room to communicate with the smart home.
I recently tinkered with an M5Stack and ordered a couple of ESP32-S3 N16R8 boards, MAX98357A amplifiers and INMP441 microphones.

The goal is to use them to control the lights, shutters and local functions.

So far I have built three devices and they work more or less OK.

However, some things are missing: I would like to have a chime when it detects the wake word, and I want to be able to make announcements to the devices.

I have put together the following YAML, but I am not able to play the WAV file or send announcements via Home Assistant.

# Build-time substitutions, referenced further down as ${name}.
substitutions:
  # Chime sound file; it lives in the esphome/sounds folder.
  timer_alarm_sound: 'sounds/bell.wav'
  
  
# Core node settings plus the LED state shown right after boot.
esphome:
  name: oscar-3
  friendly_name: 'Oscar Nr. 3'
  # Limit the build to two parallel compile processes.
  compile_process_limit: 2
  platformio_options:
    board_build.flash_mode: dio
  on_boot:
    # Red scan animation on the strip as soon as the node has booted.
    - light.turn_on:
        id: led_strip
        effect: 'Scan Effect With Custom Values'
        brightness: 60%
        red: 100%
        green: 0%
        blue: 0%

# Target board and ESP-IDF framework configuration.
esp32:
  board: esp32-s3-devkitc-1
  framework:
    type: esp-idf
    # Extra sdkconfig entries handed to the ESP-IDF build.
    sdkconfig_options:
      CONFIG_AUDIO_BOARD_CUSTOM: 'y'
      CONFIG_ESP32S3_DATA_CACHE_64KB: 'y'
      CONFIG_ESP32S3_DATA_CACHE_LINE_64B: 'y'
      CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240: 'y'
   
# External PSRAM. Use "octal" for N16R8 modules and "quad" for N8R2 modules.
psram:
  speed: 80MHz
  mode: octal

# Enable logging (component defaults, no options set).
logger:

# Enable the Home Assistant API. The encryption key is read from secrets.yaml.
api:
  encryption:
    key: !secret api_key
# Over-the-air updates. The password is read from secrets.yaml instead of
# being written into the config in plain text.
# `ota_password` must hold the password the device currently runs with,
# otherwise the next OTA upload is rejected.
ota:
  - platform: esphome
    password: !secret ota_password


# Wi-Fi station; credentials come from secrets.yaml.
wifi:
  ssid: !secret wifi_ssid
  password: !secret wifi_password
  # Transmit power is set to 8.5 dB.
  output_power: 8.5dB

  # Fallback hotspot (captive portal) opened when the Wi-Fi connection fails.
  ap:
    ssid: 'Oscar-3 Fallback Hotspot'
    password: !secret fallback_ap_pw

# Captive portal for the fallback hotspot.
captive_portal:


# Two separate I2S buses: one for the microphone, one for the speaker.
i2s_audio:
  # Microphone bus
  - id: i2s_in
    i2s_bclk_pin: GPIO2   # SCK
    i2s_lrclk_pin: GPIO3  # WS

  # Speaker bus
  - id: i2s_speaker
    i2s_bclk_pin: GPIO7   # BCLK
    i2s_lrclk_pin: GPIO6  # LRC

# External (non-PDM) I2S microphone on the i2s_in bus, left channel.
microphone:
  - platform: i2s_audio
    id: va_mic
    i2s_audio_id: i2s_in
    i2s_din_pin: GPIO4  # SD
    adc_type: external
    pdm: false
    channel: left
    bits_per_sample: 32bit


# GPIO output on GPIO8, the same pin the speaker uses as its I2S data pin
# (which is why allow_other_uses is set on both).
# NOTE(review): set_low_speaker is not referenced anywhere in this config;
# confirm whether this output is still needed.
output:
  - platform: gpio
    pin:
      number: GPIO8
      allow_other_uses: true
    id: set_low_speaker

# Mono I2S speaker on the i2s_speaker bus, driving an external DAC/amplifier.
speaker:
  - platform: i2s_audio
    id: va_speaker
    i2s_audio_id: i2s_speaker
    dac_type: external
    channel: mono
    i2s_dout_pin:
      # DIN pin of the MAX98357A audio amplifier; the pin is also claimed by
      # the set_low_speaker output, hence allow_other_uses.
      number: GPIO8
      allow_other_uses: true
    
#media_player:
#  - platform: i2s_audio
#    name: "esp_speaker"
#    id: va_speaker
#    i2s_audio_id: i2s_in
#    dac_type: external
#    i2s_dout_pin:
#      number: GPIO8  #  DIN Pin of the MAX98357A Audio Amplifier
#      allow_other_uses: true
#    mode: mono


# Voice assistant pipeline: uses va_mic as input and va_speaker as output and
# mirrors every pipeline stage on the LED strip.
voice_assistant:
  id: va
  microphone: va_mic
  speaker: va_speaker
  noise_suppression_level: 2
  auto_gain: 31dBFS
  volume_multiplier: 4.0
  # Starts disabled; the use_wake_word switch enables it at runtime.
  use_wake_word: false

  # On an error, toggle the wake-word switch off and on again (1 s apart) so
  # that its on_turn_on automation restarts listening.
  on_error:
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          - switch.turn_off: use_wake_word
          - delay: 1sec
          - switch.turn_on: use_wake_word

  # Wake word detected: fast white pulse.
  on_wake_word_detected:
    - light.turn_on:
        id: led_strip
        effect: "Fast Pulse"
        red: 100%
        green: 100%
        blue: 100%
        brightness: 80%

  # Listening to what is said after the wake word.
  on_listening:
    # Play the chime whenever the "Wake up bell" switch is enabled.
    # The switch is a user setting and is deliberately NOT toggled here: the
    # earlier version turned it off after playing, which (through the switch's
    # on_turn_off) cleared wake_up_bell_active, so the chime could only ever
    # play once after boot.
    # NOTE(review): the speaker receives the raw bytes of the file; confirm the
    # WAV's sample rate/bit depth matches what the speaker expects.
    - if:
        condition:
          switch.is_on: wake_up_bell
        then:
          - lambda: |-
              id(va_speaker).play(id(timer_finished_wave_file), sizeof(id(timer_finished_wave_file)));
    - delay: 50ms
    - light.turn_on:
        id: led_strip
        effect: "Slow Pulse"
        red: 0%
        green: 0%
        blue: 100%
        brightness: 80%

  # Voice activity detection starts speech-to-text processing: green scan.
  on_stt_vad_start:
    - light.turn_on:
        id: led_strip
        effect: "Scan Effect With Custom Values"
        red: 0%
        green: 100%
        blue: 0%
        brightness: 80%

  # Voice activity detection ended: LEDs off.
  on_stt_vad_end:
    - light.turn_off: led_strip

  # Start of the speech output: blue fireworks.
  on_tts_stream_start:
    - light.turn_on:
        id: led_strip
        effect: "Fireworks Effect With Custom Values"
        red: 0%
        green: 0%
        blue: 100%
        brightness: 80%

  # End of the speech output: LEDs off.
  on_tts_stream_end:
    - light.turn_off: led_strip

  # Speech-to-text finished: LEDs off.
  on_stt_end:
    - light.turn_off: led_strip

#  on_end:
#    - light.turn_off: led_strip

  # API client connected: clear the boot animation and, if the wake word is
  # enabled, start continuous listening.
  on_client_connected:
    - delay: 50ms
    - light.turn_off: led_strip
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          - voice_assistant.start_continuous:

  # API client gone: stop the assistant if the wake word is enabled.
  on_client_disconnected:
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          - voice_assistant.stop:
  

# API connection status sensor; it also starts/stops wake-word listening
# together with the connection.
binary_sensor:
  - platform: status
    id: api_connection
    name: 'API Connection'
    filters:
      # Report "on" only after the state has been on for one second.
      - delayed_on: 1s
    # Connected: start continuous listening when the wake word is enabled.
    on_press:
      - if:
          condition:
            switch.is_on: use_wake_word
          then:
            - voice_assistant.start_continuous:
    # Disconnected: stop the assistant when the wake word is enabled.
    on_release:
      - if:
          condition:
            switch.is_on: use_wake_word
          then:
            - voice_assistant.stop:

# Configuration switches exposed to Home Assistant.
switch:
  # Enables/disables wake-word operation of the voice assistant.
  - platform: template
    id: use_wake_word
    name: 'Use wake word'
    entity_category: config
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON
    on_turn_on:
      - lambda: |-
          id(va).set_use_wake_word(true);
      # Start continuous listening unless the assistant is already running.
      - if:
          condition:
            not:
              - voice_assistant.is_running
          then:
            - voice_assistant.start_continuous
    on_turn_off:
      - voice_assistant.stop
      - lambda: |-
          id(va).set_use_wake_word(false);

  # Wake-up bell; its state is copied into the wake_up_bell_active global.
  - platform: template
    id: wake_up_bell
    name: 'Wake up bell'
    icon: mdi:bell
    entity_category: config
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON
    on_turn_on:
      - lambda: |-
          id(wake_up_bell_active) = true;
    on_turn_off:
      - lambda: |-
          id(wake_up_bell_active) = false;

# Four-LED WS2812 strip driven through the ESP32 RMT peripheral.
light:
  - platform: esp32_rmt_led_strip
    id: led_strip
    name: 'Led Strip'
    chipset: ws2812
    rgb_order: GRB
    num_leds: 4
    pin: GPIO09
    # The named effects are selected by the automations in this config.
    effects:
      - pulse:
      - pulse:
          name: 'Fast Pulse'
          min_brightness: 0%
          max_brightness: 100%
          transition_length: 0.5s
          update_interval: 0.5s
      - pulse:
          name: 'Slow Pulse'
          min_brightness: 0%
          max_brightness: 100%
          transition_length: 1s
          update_interval: 1s
      - addressable_scan:
          name: 'Scan Effect With Custom Values'
          scan_width: 1
          move_interval: 200ms
      - random:
          name: 'Random Effect With Custom Values'
          update_interval: 7s
          transition_length: 5s
      - flicker:
          name: 'Flicker Effect With Custom Values'
          intensity: 40%
          alpha: 95%
      - addressable_random_twinkle:
          name: 'Random Twinkle Effect With Custom Values'
          progress_interval: 32ms
          twinkle_probability: 5%
      - addressable_fireworks:
          name: 'Fireworks Effect With Custom Values'
          use_random_color: true
          spark_probability: 50%
          fade_out_rate: 120
          update_interval: 32ms

# Third-party "file" component pulled from GitHub.
external_components:
  - source: github://jesserockz/esphome-components
    refresh: 0s
    components:
      - file

# Makes the chime file available to lambdas as timer_finished_wave_file.
file:
  - id: timer_finished_wave_file
    file: '${timer_alarm_sound}'
    
# Runtime flag written by the wake_up_bell switch (true on turn-on, false on
# turn-off); it is not restored across reboots.
globals:
  - id: wake_up_bell_active
    type: bool
    restore_value: false

I struggle to find a good example. All the examples I found lack support or features.

Because I trained my own wake word I would like to use it, but I don’t know how to get it onto the device. Until then I use the wake word in Home Assistant set to “Hey Oscar”, and it works great.

What I need:

  • my own wake word “Hey Oscar”
  • a chime
  • timers
  • announcements

Would anyone mind helping me, or does anyone know a good example?

Sorry,

that post was not very well crafted.
I am willing to contribute of course.

I got 6 Assistants running right now:

They are compiled with ESP-IDF, using a speaker instead of a media player, because with the media player I get crackling noise and, out of nowhere, it stops working.
Downside: announcements are not possible this way.
I am looking into the Atom Echo, which has both a speaker and a media player in its code, but it uses some kind of audio bus.
I am working on it.

This is “my” code (well I got “inspired” by a couple of sources):

# Core node settings plus the LED state shown right after boot.
esphome:
  name: oscar-nr-2
  friendly_name: 'Oscar Nr. 2'
  # Limit the build to two parallel compile processes.
  compile_process_limit: 2
  platformio_options:
    board_build.flash_mode: dio
  on_boot:
    # Red scan animation on the strip as soon as the node has booted.
    - light.turn_on:
        id: led_strip
        effect: 'Scan Effect With Custom Values'
        brightness: 60%
        red: 100%
        green: 0%
        blue: 0%

# Target board and ESP-IDF framework configuration.
esp32:
  board: esp32-s3-devkitc-1
  framework:
    type: esp-idf
    # Extra sdkconfig entries handed to the ESP-IDF build.
    sdkconfig_options:
      CONFIG_AUDIO_BOARD_CUSTOM: 'y'
      CONFIG_ESP32S3_DATA_CACHE_64KB: 'y'
      CONFIG_ESP32S3_DATA_CACHE_LINE_64B: 'y'
      CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240: 'y'
   
# External PSRAM. Use "octal" for N16R8 modules and "quad" for N8R2 modules.
psram:
  speed: 80MHz
  mode: octal

# Enable logging (component defaults, no options set).
logger:

# Enable the Home Assistant API. The encryption key is read from secrets.yaml.
api:
  encryption:
    key:  !secret api_key

# Over-the-air updates; the password is read from secrets.yaml.
ota:
  - platform: esphome
    password:  !secret ota_password

# Wi-Fi station; credentials come from secrets.yaml.
wifi:
  password: !secret wifi_password
  ssid: !secret wifi_ssid

  # Fallback hotspot (captive portal) opened when the Wi-Fi connection fails.
  # NOTE(review): the hotspot reuses the Wi-Fi password secret; consider a
  # dedicated secret for the fallback access point.
  ap:
    password: !secret wifi_password
    ssid: 'Oscar-Voice-Esp32-S3-Off-Device'

# Captive portal for the fallback hotspot.
captive_portal:


# Two separate I2S buses: one for the microphone, one for the speaker.
i2s_audio:
  # Microphone bus
  - id: i2s_in
    i2s_bclk_pin: GPIO2   # SCK
    i2s_lrclk_pin: GPIO3  # WS

  # Speaker bus
  - id: i2s_speaker
    i2s_bclk_pin: GPIO7   # BCLK
    i2s_lrclk_pin: GPIO6  # LRC

# External (non-PDM) I2S microphone on the i2s_in bus, left channel.
microphone:
  - platform: i2s_audio
    id: va_mic
    i2s_audio_id: i2s_in
    i2s_din_pin: GPIO4  # SD
    adc_type: external
    pdm: false
    channel: left
    bits_per_sample: 32bit


# GPIO output on GPIO8, the same pin the speaker uses as its I2S data pin
# (which is why allow_other_uses is set on both).
# NOTE(review): set_low_speaker is not referenced anywhere in this config;
# confirm whether this output is still needed.
output:
  - platform: gpio
    pin:
      number: GPIO8
      allow_other_uses: true
    id: set_low_speaker

# Mono I2S speaker on the i2s_speaker bus, driving an external DAC/amplifier.
speaker:
  - platform: i2s_audio
    id: va_speaker
    i2s_audio_id: i2s_speaker
    dac_type: external
    channel: mono
    i2s_dout_pin:
      # DIN pin of the MAX98357A audio amplifier; the pin is also claimed by
      # the set_low_speaker output, hence allow_other_uses.
      number: GPIO8
      allow_other_uses: true


# Voice assistant pipeline: uses va_mic as input and va_speaker as output and
# mirrors every pipeline stage on the LED strip.
voice_assistant:
  id: va
  microphone: va_mic
  speaker: va_speaker
  noise_suppression_level: 2
  auto_gain: 31dBFS
  volume_multiplier: 4.0
  # Starts disabled; the use_wake_word switch enables it at runtime.
  use_wake_word: false

  # On an error, toggle the wake-word switch off and on again (1 s apart).
  on_error:
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          - switch.turn_off: use_wake_word
          - delay: 1sec
          - switch.turn_on: use_wake_word

  # Wake word detected: fast white pulse.
  on_wake_word_detected:
    - light.turn_on:
        id: led_strip
        effect: 'Fast Pulse'
        brightness: 80%
        red: 100%
        green: 100%
        blue: 100%

  # Listening to what is said after the wake word: slow blue pulse.
  on_listening:
    - delay: 50ms
    - light.turn_on:
        id: led_strip
        effect: 'Slow Pulse'
        brightness: 80%
        red: 0%
        green: 0%
        blue: 100%

  # Voice activity detection starts speech-to-text processing: green scan.
  on_stt_vad_start:
    - light.turn_on:
        id: led_strip
        effect: 'Scan Effect With Custom Values'
        brightness: 80%
        red: 0%
        green: 100%
        blue: 0%

  # Start of the speech output: blue fireworks.
  on_tts_stream_start:
    - light.turn_on:
        id: led_strip
        effect: 'Fireworks Effect With Custom Values'
        brightness: 80%
        red: 0%
        green: 0%
        blue: 100%

  # End of the speech output: LEDs off.
  on_tts_stream_end:
    - light.turn_off: led_strip

  # Speech-to-text finished: LEDs off.
  on_stt_end:
    - light.turn_off: led_strip

#  on_end:
#    - light.turn_off: led_strip

  # API client connected: clear the boot animation and, if the wake word is
  # enabled, start continuous listening.
  on_client_connected:
    - delay: 50ms
    - light.turn_off: led_strip
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          - voice_assistant.start_continuous:

  # API client gone: stop the assistant if the wake word is enabled.
  on_client_disconnected:
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          - voice_assistant.stop:
  

# API connection status sensor; it also starts/stops wake-word listening
# together with the connection.
binary_sensor:
  - platform: status
    id: api_connection
    name: 'API Connection'
    filters:
      # Report "on" only after the state has been on for one second.
      - delayed_on: 1s
    # Connected: start continuous listening when the wake word is enabled.
    on_press:
      - if:
          condition:
            switch.is_on: use_wake_word
          then:
            - voice_assistant.start_continuous:
    # Disconnected: stop the assistant when the wake word is enabled.
    on_release:
      - if:
          condition:
            switch.is_on: use_wake_word
          then:
            - voice_assistant.stop:

# Configuration switch exposed to Home Assistant.
switch:
  # Enables/disables wake-word operation of the voice assistant.
  - platform: template
    id: use_wake_word
    name: 'Use wake word'
    entity_category: config
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON
    on_turn_on:
      - lambda: |-
          id(va).set_use_wake_word(true);
      # Start continuous listening unless the assistant is already running.
      - if:
          condition:
            not:
              - voice_assistant.is_running
          then:
            - voice_assistant.start_continuous
    on_turn_off:
      - voice_assistant.stop
      - lambda: |-
          id(va).set_use_wake_word(false);

# Four-LED WS2812 strip driven through the ESP32 RMT peripheral.
light:
  - platform: esp32_rmt_led_strip
    id: led_strip
    name: 'Led Strip'
    chipset: ws2812
    rgb_order: GRB
    num_leds: 4
    pin: GPIO09
    # The named effects are selected by the automations in this config.
    effects:
      - pulse:
      - pulse:
          name: 'Fast Pulse'
          min_brightness: 0%
          max_brightness: 100%
          transition_length: 0.5s
          update_interval: 0.5s
      - pulse:
          name: 'Slow Pulse'
          min_brightness: 0%
          max_brightness: 100%
          transition_length: 1s
          update_interval: 1s
      - addressable_scan:
          name: 'Scan Effect With Custom Values'
          scan_width: 1
          move_interval: 200ms
      - random:
          name: 'Random Effect With Custom Values'
          update_interval: 7s
          transition_length: 5s
      - flicker:
          name: 'Flicker Effect With Custom Values'
          intensity: 40%
          alpha: 95%
      - addressable_random_twinkle:
          name: 'Random Twinkle Effect With Custom Values'
          progress_interval: 32ms
          twinkle_probability: 5%
      - addressable_fireworks:
          name: 'Fireworks Effect With Custom Values'
          use_random_color: true
          spark_probability: 50%
          fade_out_rate: 120
          update_interval: 32ms
1 Like