Creating a voice assistant with ESP32-S3, MAX98357, and INMP441: mic works, but no sound

Hello,

I have been working over the past few days on creating a voice assistant using the following hardware:

  • ESP32‑S3
  • MAX98357 (amp I2S)
  • INMP441 (mic I2S)

The voice commands work without any problems, the LED on the ESP32 controller lights up, but there is no sound. I’ve already replaced the amplifier and the speakers, and the dashboard shows that something is playing, but there is no audio.

In other words, I’ve tried everything — even several different configurations — but nothing seems to work. Could someone help me with this problem? I would greatly appreciate it.
P.S.: All assistant pipelines are installed, and there are no errors in the log files.

Below is the config I used:

# Core node identity
esphome:
  friendly_name: esp-voiceassistant-02
  name: esp-voiceassistant-2

'''
# Target hardware: ESP32-S3 DevKitC-1 board profile, 16MB flash, ESP-IDF framework
esp32:
  board: esp32-s3-devkitc-1
  flash_size: 16MB
  framework:
    type: esp-idf
    

# Logging with default options
logger:

# Home Assistant API, with the encryption key supplied through !secret
api:
  encryption:
    key: !secret api_voice_2

# Over-the-air updates (esphome platform), password supplied through !secret
ota:
  - password: !secret ota_voice_2
    platform: esphome

# Wi-Fi credentials and addressing; everything except the netmask comes from !secret
wifi:
  password: !secret wifi_password
  ssid: !secret wifi_ssid

  # Static IP settings
  manual_ip:
    static_ip: !secret ip_voice_2
    subnet: 255.255.255.0
    gateway: !secret gtw_wifi
    dns1: !secret dns1_wifi

# External PSRAM: octal mode at 80MHz
psram:
  speed: 80MHz
  mode: octal
# -----------------------

# -----------------------
# I2S BUS
# -----------------------
# Single bus definition (bit clock on GPIO5, word-select clock on GPIO4).
# The microphone and the speaker in this config both attach to it
# through `i2s_audio_id: bus`.
i2s_audio:
  - id: bus
    i2s_lrclk_pin: GPIO4
    i2s_bclk_pin: GPIO5

# -----------------------
# MICROPHONE
# -----------------------
# I2S microphone: data in on GPIO6, left channel, 32-bit samples at 16000 Hz
microphone:
  - platform: i2s_audio
    id: echo_microphone
    i2s_audio_id: bus
    adc_type: external
    i2s_din_pin: GPIO6
    channel: left
    bits_per_sample: 32bit
    sample_rate: 16000
    # use_apll: True

# -----------------------
# SPEAKER
# -----------------------
# I2S speaker output: data out on GPIO7, left channel, 16-bit samples,
# attached to the same `bus` as the microphone
speaker:
  - platform: i2s_audio
    id: echo_speaker
    i2s_audio_id: bus
    dac_type: external
    i2s_dout_pin: GPIO7
    bits_per_sample: 16bit
    channel: left
    buffer_duration: 80ms

# Media player layered on top of echo_speaker. It plays announcements
# (including the voice assistant's responses) and the bundled sound files.
media_player:
  - platform: speaker
    id: echo_media_player
    name: None
    announcement_pipeline:
      format: FLAC
      speaker: echo_speaker
    buffer_size: 6000
    codec_support_enabled: false
    volume_increment: 0.25
    volume_min: 0.5
    # Sound files, addressed by id from automations
    files:
      - id: mute_switch_on_sound
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/mute_switch_on.wav
      - id: mute_switch_off_sound
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/mute_switch_off.wav
      - id: timer_finished_wave_file
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/timer_finished.wav
      - id: boot
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/wake_word_triggered.wav
    # Flash the LED for one second whenever the volume changes.
    # NOTE(review): "Volume Level Display" is not among the effects defined
    # on `led` in this config (only "Slow Pulse" and "Fast Pulse") - confirm.
    on_volume:
      then:
        - light.turn_on:
            id: led
            effect: "Volume Level Display"
        - delay: 1s
        - light.turn_off: led
    # An announcement is starting: stop wake word handling if the microphone
    # is capturing, then show solid blue
    on_announcement:
      - if:
          condition:
            microphone.is_capturing:
          then:
            - script.execute: stop_wake_word
      - light.turn_on:
          id: led
          red: 0%
          green: 0%
          blue: 100%
          brightness: 100%
          effect: none
    # Playback finished: resume wake word handling and restore the idle LED
    on_idle:
      - script.execute: start_wake_word
      - script.execute: reset_led

# Voice assistant pipeline: captures from echo_microphone, answers through
# echo_media_player, and drives the LED to show each pipeline stage.
voice_assistant:
  id: va
  micro_wake_word:
  media_player: echo_media_player
  microphone:
    microphone: echo_microphone
    channels: 0
    gain_factor: 4
  auto_gain: 31dBFS
  noise_suppression_level: 2
  # Listening: blue, slow pulse
  on_listening:
    - light.turn_on:
        id: led
        red: 0%
        green: 0%
        blue: 100%
        effect: "Slow Pulse"
  # Speech ended: blue, fast pulse
  on_stt_vad_end:
    - light.turn_on:
        id: led
        red: 0%
        green: 0%
        blue: 100%
        effect: "Fast Pulse"
  # Response is about to be spoken: solid blue at full brightness
  on_tts_start:
    - light.turn_on:
        id: led
        red: 0%
        green: 0%
        blue: 100%
        brightness: 100%
        effect: none
  on_end:
    # Handle the "nevermind" case where there is no announcement:
    # wait at most 0.5s for an announcement to begin
    - wait_until:
        timeout: 0.5s
        condition:
          media_player.is_announcing:
    # Restart only mWW if enabled; streaming wake words automatically restart
    - if:
        condition:
          - lambda: 'return id(wake_word_engine_location).current_option() == "On device";'
        then:
          # Hold off until both the assistant and the speaker have gone quiet
          - wait_until:
              condition:
                and:
                  - not:
                      voice_assistant.is_running:
                  - not:
                      speaker.is_playing:
          - lambda: |-
              id(va).set_use_wake_word(false);
          - micro_wake_word.start:
    - script.execute: reset_led
  # Error: solid red for two seconds, then back to the idle LED state
  on_error:
    - light.turn_on:
        id: led
        red: 100%
        green: 0%
        blue: 0%
        brightness: 100%
        effect: none
    - delay: 2s
    - script.execute: reset_led
  on_client_connected:
    - delay: 2s  # Give the api server time to settle
    - script.execute: start_wake_word
  on_client_disconnected:
    - script.execute: stop_wake_word
  # Timer finished: stop wake word handling, wait for the microphone to stop
  # capturing, start the alarm, and pulse green until timer_ringing goes off
  on_timer_finished:
    - script.execute: stop_wake_word
    - wait_until:
        condition:
          not:
            microphone.is_capturing:
    - switch.turn_on: timer_ringing
    - light.turn_on:
        id: led
        red: 0%
        green: 100%
        blue: 0%
        brightness: 100%
        effect: "Fast Pulse"
    - wait_until:
        condition:
          switch.is_off: timer_ringing
    - light.turn_off: led
    - switch.turn_off: timer_ringing

# Single WS2812 status LED on GPIO48, used by the automations in this
# config under the id `led`
light:
  - platform: esp32_rmt_led_strip
    id: led
    name: None
    entity_category: config
    disabled_by_default: true
    chipset: WS2812
    pin: GPIO48
    num_leds: 1
    rgb_order: grb
    default_transition_length: 0s
    # Two pulse effects that differ only in timing (250ms vs 100ms)
    effects:
      - pulse:
          name: "Slow Pulse"
          min_brightness: 50%
          max_brightness: 100%
          transition_length: 250ms
          update_interval: 250ms
      - pulse:
          name: "Fast Pulse"
          min_brightness: 50%
          max_brightness: 100%
          transition_length: 100ms
          update_interval: 100ms

script:
  # Put the LED back into its idle state:
  #   - wake word "On device" and listen light on          -> RGB 100/89/71 at 60%
  #   - wake word "In Home Assistant" and listen light on  -> RGB 0/100/100 at 60%
  #   - anything else                                      -> LED off
  - id: reset_led
    then:
      - if:
          condition:
            - lambda: |-
                return id(wake_word_engine_location).current_option() == "On device";
            - switch.is_on: use_listen_light
          then:
            - light.turn_on:
                id: led
                red: 100%
                green: 89%
                blue: 71%
                brightness: 60%
                effect: none
          else:
            - if:
                condition:
                  # Must test the other engine location: repeating the
                  # "On device" check from the outer `if` can never be true
                  # here, which made the branch below unreachable.
                  - lambda: |-
                      return id(wake_word_engine_location).current_option() == "In Home Assistant";
                  - switch.is_on: use_listen_light
                then:
                  - light.turn_on:
                      id: led
                      red: 0%
                      green: 100%
                      blue: 100%
                      brightness: 60%
                      effect: none
                else:
                  - light.turn_off: led
  # Start wake word handling for whichever engine location is selected,
  # but only while the voice assistant is not already running
  - id: start_wake_word
    then:
      # On-device detection through micro_wake_word
      - if:
          condition:
            and:
              - not:
                  voice_assistant.is_running:
              - lambda: 'return id(wake_word_engine_location).current_option() == "On device";'
          then:
            - lambda: |-
                id(va).set_use_wake_word(false);
            - micro_wake_word.start:
      # Detection in Home Assistant through a continuous voice assistant run
      - if:
          condition:
            and:
              - not:
                  voice_assistant.is_running:
              - lambda: 'return id(wake_word_engine_location).current_option() == "In Home Assistant";'
          then:
            - lambda: |-
                id(va).set_use_wake_word(true);
            - voice_assistant.start_continuous:
  # Stop wake word handling for whichever engine location is selected
  - id: stop_wake_word
    then:
      - if:
          condition:
            lambda: 'return id(wake_word_engine_location).current_option() == "In Home Assistant";'
          then:
            - lambda: |-
                id(va).set_use_wake_word(false);
            - voice_assistant.stop:
      - if:
          condition:
            lambda: 'return id(wake_word_engine_location).current_option() == "On device";'
          then:
            - micro_wake_word.stop:

switch:
  # Toggle for the idle LED; every change re-runs reset_led
  - platform: template
    id: use_listen_light
    name: Use listen light
    entity_category: config
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON
    on_turn_off:
      - script.execute: reset_led
    on_turn_on:
      - script.execute: reset_led
  # Timer alarm state (no `name` key is set); switching it drives the
  # looping playback of the timer-finished sound
  - platform: template
    id: timer_ringing
    optimistic: true
    restore_mode: ALWAYS_OFF
    on_turn_on:
      # Turn on the repeat mode and pause for 1000 ms between playlist items/repeats
      - lambda: |-
          id(echo_media_player)
            ->make_call()
            .set_command(media_player::MediaPlayerCommand::MEDIA_PLAYER_COMMAND_REPEAT_ONE)
            .set_announcement(true)
            .perform();
          id(echo_media_player)->set_playlist_delay_ms(speaker::AudioPipelineType::ANNOUNCEMENT, 1000);
      - media_player.speaker.play_on_device_media_file:
          announcement: true
          media_file: timer_finished_wave_file
      # Switch the alarm off again after 15 minutes
      - delay: 15min
      - switch.turn_off: timer_ringing
    on_turn_off:
      # Turn off the repeat mode and disable the pause between playlist items
      - lambda: |-
          id(echo_media_player)
            ->make_call()
            .set_command(media_player::MediaPlayerCommand::MEDIA_PLAYER_COMMAND_REPEAT_OFF)
            .set_announcement(true)
            .perform();
          id(echo_media_player)->set_playlist_delay_ms(speaker::AudioPipelineType::ANNOUNCEMENT, 0);
      # Stop playing the alarm
      - media_player.stop:
          announcement: true

# Chooses where wake word detection runs; changing the value stops one
# engine, waits 500ms, and starts the other
select:
  - platform: template
    id: wake_word_engine_location
    name: Wake word engine location
    entity_category: config
    optimistic: true
    restore_value: true
    initial_option: On device
    options:
      - In Home Assistant
      - On device
    on_value:
      # Switched to Home Assistant: stop micro_wake_word, start continuous streaming
      - if:
          condition:
            lambda: |-
              return x == "In Home Assistant";
          then:
            - micro_wake_word.stop:
            - delay: 500ms
            - lambda: |-
                id(va).set_use_wake_word(true);
            - voice_assistant.start_continuous:
      # Switched to on-device: stop the voice assistant, start micro_wake_word
      - if:
          condition:
            lambda: |-
              return x == "On device";
          then:
            - lambda: |-
                id(va).set_use_wake_word(false);
            - voice_assistant.stop:
            - delay: 500ms
            - micro_wake_word.start:

# On-device wake word detection with three models; a detection starts the
# voice assistant and passes along the detected wake word
micro_wake_word:
  models:
    - model: okay_nabu
    - model: hey_mycroft
    - model: hey_jarvis
  vad:
  on_wake_word_detected:
    - voice_assistant.start:
        wake_word: !lambda return wake_word;

Which log file: ESPHome or HA?

[16:11:48.148][D][micro_wake_word:325]: Detected 'Okay Nabu' with sliding average probability is 0.99 and max probability is 1.00
[16:11:48.148][D][voice_assistant:484]: State changed from IDLE to START_MICROPHONE
[16:11:48.148][D][voice_assistant:491]: Desired state set to START_PIPELINE
[16:11:48.153][D][micro_wake_word:370]: Stopping wake word detection
[16:11:48.154][D][voice_assistant:208]: Starting Microphone
[16:11:48.155][D][ring_buffer:034]: Created ring buffer with size 16384
[16:11:48.160][D][voice_assistant:484]: State changed from START_MICROPHONE to STARTING_MICROPHONE
[16:11:48.167][D][micro_wake_word:378]: State changed from DETECTING_WAKE_WORD to STOPPING
[16:11:48.169][D][voice_assistant:484]: State changed from STARTING_MICROPHONE to START_PIPELINE
[16:11:48.183][D][micro_wake_word:274]: Inference task is stopping, deallocating buffers
[16:11:48.187][D][micro_wake_word:279]: Inference task is finished, freeing task resources
[16:11:48.187][D][micro_wake_word:378]: State changed from STOPPING to STOPPED
[16:11:48.194][D][voice_assistant:229]: Requesting start
[16:11:48.195][D][voice_assistant:484]: State changed from START_PIPELINE to STARTING_PIPELINE
[16:11:48.202][D][voice_assistant:506]: Client started, streaming microphone
[16:11:48.207][D][voice_assistant:484]: State changed from STARTING_PIPELINE to STREAMING_MICROPHONE
[16:11:48.210][D][voice_assistant:491]: Desired state set to STREAMING_MICROPHONE
[16:11:48.210][D][voice_assistant:630]: Event Type: 1
[16:11:48.214][D][voice_assistant:633]: Assist Pipeline running
[16:11:48.218][D][voice_assistant:630]: Event Type: 3
[16:11:48.218][D][voice_assistant:652]: STT started
[16:11:48.224][D][light:079]: 'esp-voiceassistant-02' Setting:
[16:11:48.231][D][light:103]:   Red: 0%, Green: 0%, Blue: 100%
[16:11:48.232][D][light:153]:   Effect: 'Slow Pulse'
[16:11:49.955][D][voice_assistant:630]: Event Type: 11
[16:11:49.961][D][voice_assistant:833]: Starting STT by VAD
[16:11:52.187][D][voice_assistant:630]: Event Type: 12
[16:11:52.188][D][voice_assistant:837]: STT by VAD end
[16:11:52.188][D][voice_assistant:484]: State changed from STREAMING_MICROPHONE to STOP_MICROPHONE
[16:11:52.188][D][voice_assistant:491]: Desired state set to AWAITING_RESPONSE
[16:11:52.199][D][light:079]: 'esp-voiceassistant-02' Setting:
[16:11:52.204][D][light:103]:   Red: 0%, Green: 0%, Blue: 100%
[16:11:52.205][D][light:153]:   Effect: 'Fast Pulse'
[16:11:52.212][D][voice_assistant:484]: State changed from STOP_MICROPHONE to STOPPING_MICROPHONE
[16:11:52.216][D][voice_assistant:484]: State changed from STOPPING_MICROPHONE to AWAITING_RESPONSE
[16:11:52.250][D][voice_assistant:630]: Event Type: 4
[16:11:52.256][D][voice_assistant:669]: Speech recognised as: "Ligar, sala de jantar."
[16:11:52.260][D][voice_assistant:630]: Event Type: 5
[16:11:52.263][D][voice_assistant:674]: Intent started
[16:11:52.268][D][voice_assistant:630]: Event Type: 6
[16:11:52.272][D][voice_assistant:630]: Event Type: 7
[16:11:52.275][D][voice_assistant:727]: Response: "interruptor ligado"
[16:11:52.280][D][voice_assistant:630]: Event Type: 8
[16:11:52.288][D][voice_assistant:749]: Response URL: "http://192.168.30.6:8123/api/tts_proxy/gMV5sBhPXJvHADT1Rdr5Xg.wav"
[16:11:52.288][D][voice_assistant:484]: State changed from AWAITING_RESPONSE to STREAMING_RESPONSE
[16:11:52.292][D][voice_assistant:491]: Desired state set to STREAMING_RESPONSE
[16:11:52.293][D][light:079]: 'esp-voiceassistant-02' Setting:
[16:11:52.297][D][light:067]:   Brightness: 100%
[16:11:52.301][D][light:103]:   Red: 0%, Green: 0%, Blue: 100%
[16:11:52.305][D][light:153]:   Effect: 'None'
[16:11:52.307][D][media_player:085]: 'esp-voiceassistant-02' - Setting
[16:11:52.310][D][media_player:092]:   Media URL: http://192.168.30.6:8123/api/tts_proxy/gMV5sBhPXJvHADT1Rdr5Xg.wav
[16:11:52.316][D][media_player:098]:  Announcement: yes
[16:11:52.316][D][voice_assistant:630]: Event Type: 2
[16:11:52.324][D][voice_assistant:772]: Assist Pipeline ended
[16:11:52.328][D][light:079]: 'esp-voiceassistant-02' Setting:
[16:11:52.331][D][light:067]:   Brightness: 100%
[16:11:52.371][D][light:103]:   Red: 0%, Green: 0%, Blue: 100%
[16:11:52.374][D][speaker_media_player:410]: State changed to ANNOUNCING
[16:11:52.459][D][speaker_media_player.pipeline:114]: Reading WAV file type
[16:11:52.464][D][ring_buffer:034][ann_read]: Created ring buffer with size 6000
[16:11:52.503][D][speaker_media_player.pipeline:124]: Decoded audio has 1 channels, 16000 Hz sample rate, and 16 bits per sample
[16:11:52.505][D][ring_buffer:034][speaker_task]: Created ring buffer with size 2560
[16:11:52.509][D][i2s_audio.speaker:102]: Starting
[16:11:52.514][D][i2s_audio.speaker:106]: Started
[16:11:54.147][D][i2s_audio.speaker:111]: Stopping
[16:11:54.152][D][i2s_audio.speaker:116]: Stopped
[16:11:54.158][D][light:079]: 'esp-voiceassistant-02' Setting:
[16:11:54.161][D][light:067]:   Brightness: 60%
[16:11:54.165][D][light:103]:   Red: 100%, Green: 89%, Blue: 71%
[16:11:54.169][D][speaker_media_player:410]: State changed to IDLE
[16:11:54.172][D][voice_assistant:352]: Announcement finished playing
[16:11:54.176][D][voice_assistant:484]: State changed from STREAMING_RESPONSE to RESPONSE_FINISHED
[16:11:54.179][D][voice_assistant:491]: Desired state set to RESPONSE_FINISHED
[16:11:54.190][D][voice_assistant:484]: State changed from RESPONSE_FINISHED to IDLE
[16:11:54.193][D][voice_assistant:491]: Desired state set to IDLE
[16:11:54.197][D][micro_wake_word:360]: Starting wake word detection
[16:11:54.199][D][light:079]: 'esp-voiceassistant-02' Setting:
[16:11:54.203][D][light:067]:   Brightness: 60%
[16:11:54.207][D][light:103]:   Red: 100%, Green: 89%, Blue: 71%
[16:11:54.211][D][micro_wake_word:378]: State changed from STOPPED to STARTING
[16:11:54.235][D][ring_buffer:034][mww]: Created ring buffer with size 3840
[16:11:54.238][D][micro_wake_word:262]: Inference task has started, attempting to allocate memory for buffers
[16:11:54.242][D][micro_wake_word:267]: Inference task is running
[16:11:54.246][D][micro_wake_word:378]: State changed from STARTING to DETECTING_WAKE_WORD

Is this not a premade device?

If not, did you wire the amp and mic on independent I2S buses? They should have separate I2S buses, but I only see the single “bus” in your YAML. Maybe try using the HAVPE YAML as a base. You will not need voicekit, but if you add it I am sure it won't make a difference.

Anyway, as a start you may want to verify that your speaker is working. A quick way to do that is a simple config for the media player that does not include the mic. The config below may work (no promises). You can send an announcement from HA.

# Core node identity
esphome:
  friendly_name: esp-voiceassistant-02
  name: esp-voiceassistant-2

'''
# Target hardware: ESP32-S3 DevKitC-1 board profile, 16MB flash, ESP-IDF framework
esp32:
  board: esp32-s3-devkitc-1
  flash_size: 16MB
  framework:
    type: esp-idf
    

# Logging with default options
logger:

# Home Assistant API, with the encryption key supplied through !secret
api:
  encryption:
    key: !secret api_voice_2

# Over-the-air updates (esphome platform), password supplied through !secret
ota:
  - password: !secret ota_voice_2
    platform: esphome

# Wi-Fi credentials and addressing; everything except the netmask comes from !secret
wifi:
  password: !secret wifi_password
  ssid: !secret wifi_ssid

  # Static IP settings
  manual_ip:
    static_ip: !secret ip_voice_2
    subnet: 255.255.255.0
    gateway: !secret gtw_wifi
    dns1: !secret dns1_wifi

# External PSRAM: octal mode at 80MHz
psram:
  speed: 80MHz
  mode: octal
# -----------------------

# -----------------------
# I2S BUS
# -----------------------
# Single bus definition (bit clock on GPIO5, word-select clock on GPIO4).
# The microphone and the speaker in this config both attach to it
# through `i2s_audio_id: bus`.
i2s_audio:
  - id: bus
    i2s_lrclk_pin: GPIO4
    i2s_bclk_pin: GPIO5

# -----------------------
# MICROPHONE
# -----------------------
# I2S microphone: data in on GPIO6, left channel, 32-bit samples at 16000 Hz.
# NOTE(review): the accompanying text suggests a speaker-only test without
# the mic, yet this block is still present - confirm whether to drop it.
microphone:
  - platform: i2s_audio
    id: echo_microphone
    i2s_audio_id: bus
    adc_type: external
    i2s_din_pin: GPIO6
    channel: left
    bits_per_sample: 32bit
    sample_rate: 16000
    # use_apll: True

# -----------------------
# SPEAKER
# -----------------------
# I2S speaker output: data out on GPIO7, left channel, 16-bit samples,
# attached to the same `bus` as the microphone
speaker:
  - platform: i2s_audio
    id: echo_speaker
    i2s_audio_id: bus
    dac_type: external
    i2s_dout_pin: GPIO7
    bits_per_sample: 16bit
    channel: left
    buffer_duration: 80ms

media_player:
  - platform: speaker
    name: None
    id: echo_media_player
    announcement_pipeline:
      speaker: echo_speaker
      format: FLAC
    codec_support_enabled: False
    buffer_size: 6000
    volume_min: 0.5
    volume_increment: 0.25
    files:
      - id: mute_switch_on_sound
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/mute_switch_on.wav
      - id: mute_switch_off_sound
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/mute_switch_off.wav     
      - id: timer_finished_wave_file
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/timer_finished.wav
      - id: boot
        file: https://github.com/WarLion/wakeword-files/raw/refs/heads/main/wake_word_triggered.wav