I’m working with an ESP32-A1S board that uses an ES8388 DAC and trying to configure a wake word function. The board is currently available on Amazon at an affordable price.
Now I’m trying to do the reverse—configure voice-to-text—but I’m facing a roadblock with the micro_wake_word component.
Here’s a summary of the issues I’ve encountered:
The micro_wake_word function requires the esp-idf platform.
The i2s_audio media_player platform, which I used for the text-to-speech function, is not compatible with esp-idf.
I attempted to use the audio_dac component instead, as it is esp-idf-compatible and seems to support the ES8388 DAC, but the module is not found during compilation (though available in auto-completion).
It appears I’m stuck in a loop of platform incompatibility, probably missing a recent update (my esphome is Version: 2025.7.5). Has anyone successfully configured a wake word on this board? I’d appreciate any advice on how to get the micro_wake_word function working.
To reproduce the problem pick the .yaml file in the aforementioned repo and add a micro_wake_word stanza:
So, I have an ESP32-LyraT 4.3, which is probably almost the same, except for the buttons and touch pads and probably different GPIO pins…
Mine detects the wake word, but then I2S is busy; I can't even play audio when it is not waiting for the wake word.
Some info: 1. Don't use touch; it messes with the audio.
2. The ES8388 shares one I2S line for the speaker AND the microphone. It seems ESPHome and a lot of people do not use this setup; I got "I2S busy" every time (you need to stop recording and then play).
3. I think my main problem is the ES8388; it seems to be badly initialized.
Here is the script, if anyone is interested (based on another guy's idea, since I can barely code):
substitutions:
  # Numeric identifiers for the voice-assistant phases. They are expanded
  # into the lambdas that read or write the voice_assistant_phase global.

  # IDLE: waiting for the wake word
  voice_assist_idle_phase_id: "1"
  # LISTENING: wake word heard, now listening for a voice command
  voice_assist_listening_phase_id: "2"
  # THINKING: the command is being processed
  voice_assist_thinking_phase_id: "3"
  # REPLYING: the reply to the command is being delivered
  voice_assist_replying_phase_id: "4"
  # NOT_READY: the voice assistant is not ready yet
  voice_assist_not_ready_phase_id: "10"
  # ERROR: the voice assistant hit an error
  voice_assist_error_phase_id: "11"
  # MUTED: the voice assistant is muted and ignores the wake word
  voice_assist_muted_phase_id: "12"
# Node name and the friendly name shown for the device.
esphome:
  name: "jarvis-speaker"
  friendly_name: "jarvis-speaker"
# Target chip and framework. micro_wake_word needs the esp-idf framework.
esp32:
  board: esp-wrover-kit
  cpu_frequency: 240MHz
  framework:
    type: esp-idf
    version: recommended
    sdkconfig_options:
      # The three CONFIG_ESP32S3_* cache options were removed: the board
      # above is a classic ESP32, so ESP32-S3 Kconfig symbols do not apply.
      # NOTE(review): the two SPIRAM options below may also be S2/S3-only;
      # confirm against the ESP-IDF Kconfig reference for the ESP32 target.
      CONFIG_SPIRAM_RODATA: "y"
      CONFIG_SPIRAM_FETCH_INSTRUCTIONS: "y"
      # NOTE(review): the BT options presumably only matter when Bluetooth
      # is enabled, which this configuration does not do.
      CONFIG_BT_ALLOCATION_FROM_SPIRAM_FIRST: "y"
      CONFIG_BT_BLE_DYNAMIC_ENV_MEMORY: "y"
      CONFIG_MBEDTLS_EXTERNAL_MEM_ALLOC: "y"
      CONFIG_MBEDTLS_SSL_PROTO_TLS1_3: "y"
      # Original author's note: "important for 3 models, for one not sure".
      CONFIG_ESP32_DEFAULT_CPU_FREQ_240: "y"
# Logging, with no options overridden.
logger:
# Home Assistant native API.
api:
  encryption:
    # The key is read from secrets.yaml (entry: api_encryption_key), the same
    # way the Wi-Fi credentials are. The key that used to be inlined here was
    # posted publicly and should be replaced with a freshly generated one.
    key: !secret api_encryption_key
  # Actions that let Home Assistant start or stop the voice assistant.
  actions:
    - action: start_va
      then:
        - voice_assistant.start
    - action: stop_va
      then:
        - voice_assistant.stop
# Over-the-air updates.
ota:
  - platform: esphome
    # Read from secrets.yaml (entry: ota_password), the same way the Wi-Fi
    # credentials are. The password that used to be inlined here was posted
    # publicly and should be changed.
    password: !secret ota_password
# Wi-Fi credentials come from secrets.yaml.
wifi:
  password: !secret wifi_password
  ssid: !secret wifi_ssid
#esp32_touch:
# setup_mode: False
# iir_filter: 10ms
# I2C bus; the only one defined in this file.
i2c:
  id: bus_a
  sda: GPIO18
  scl: GPIO23
  frequency: 100kHz
# ES8388 codec. The speaker and the es8388 select entities refer to it by id.
audio_dac:
  - id: es8388_dac
    platform: es8388
# External PSRAM.
psram:
  # Was 120MHz, which is an ESP32-S3 setting; the esp-wrover-kit board is a
  # classic ESP32.
  # NOTE(review): 80MHz may be usable if the flash also runs at 80 MHz -
  # confirm before raising it.
  speed: 40MHz
# I2S bus. Both the microphone and the speaker below attach to this bus id.
i2s_audio:
  id: i2s_out
  i2s_mclk_pin: GPIO0
  i2s_bclk_pin: GPIO5
  i2s_lrclk_pin: GPIO25
# Microphone input; the voice assistant uses it as its audio source.
microphone:
  - platform: i2s_audio
    id: jarvis_mic
    i2s_audio_id: i2s_out
    i2s_din_pin: GPIO35
    adc_type: external
    channel: left
    sample_rate: 16000
# Speaker chain: two resamplers feed two mixer inputs, and the mixer writes
# to the hardware I2S speaker.
speaker:
  # Hardware output, with the ES8388 attached as its audio DAC.
  - platform: i2s_audio
    id: jarvis_speaker
    i2s_audio_id: i2s_out
    i2s_dout_pin: GPIO26
    dac_type: external
    audio_dac: es8388_dac
    sample_rate: 48000
    buffer_duration: 90ms

  # Mixer combining the announcement and media streams.
  - platform: mixer
    id: mixer_speaker_id
    output_speaker: jarvis_speaker
    source_speakers:
      - id: announcement_spk_mixer_input
      - id: media_spk_mixer_input

  # Resampler in front of the media mixer input.
  - platform: resampler
    id: media_spk_resampling_input
    output_speaker: media_spk_mixer_input

  # Resampler in front of the announcement mixer input.
  - platform: resampler
    id: announcement_spk_resampling_input
    output_speaker: announcement_spk_mixer_input
# Media player built on the speaker chain, with separate media and
# announcement pipelines.
media_player:
  - platform: speaker
    id: jarvis_media_player_id
    name: "Jarvis Media Player"
    media_pipeline:
      speaker: media_spk_resampling_input
      num_channels: 1
    announcement_pipeline:
      speaker: announcement_spk_resampling_input
      num_channels: 1
    # An announcement begins: duck the media stream by 25 dB.
    on_announcement:
      - mixer_speaker.apply_ducking:
          id: media_spk_mixer_input
          decibel_reduction: 25
          duration: 0.2s
    # On any state change, wait 0.7 s. If the voice assistant is not running
    # and no announcement is playing, set the media ducking to the value of
    # the "Decibel Reduction" number.
    on_state:
      - delay: 0.7s
      - if:
          condition:
            and:
              - not:
                  voice_assistant.is_running:
              - not:
                  media_player.is_announcing:
          then:
            - mixer_speaker.apply_ducking:
                id: media_spk_mixer_input
                decibel_reduction: !lambda |-
                  return id(ducking_decibel).state;
                duration: 1.0s
    # Sound files played from the device itself.
    files:
      # Should be encoded with a 48000 Hz sample rate, mono or stereo audio,
      # and 16 bits per sample.
      - id: alarm_sound
        file: https://github.com/mitrokun/esp32s3-voice_assistant/raw/main/alarm.flac
      - id: beep
        file: https://github.com/mitrokun/esp32s3-voice_assistant/raw/main/r2d2d.flac
# Voice assistant pipeline and its event handlers.
# NOTE(review): the pasted source had lost its indentation. Nesting was
# rebuilt from the then/else structure; the ambiguous spots are marked.
voice_assistant:
  id: va
  microphone: jarvis_mic
  media_player: jarvis_media_player_id
  noise_suppression_level: 2.0
  volume_multiplier: 2.0
  auto_gain: 31dBFS

  # Home Assistant connected: initialization is over, so clear
  # init_in_progress. With the voice_enabled switch on, start wake-word
  # detection and go idle; otherwise mark the assistant muted.
  on_client_connected:
    - lambda: id(init_in_progress) = false;
    - if:
        condition:
          switch.is_on: voice_enabled
        then:
          - micro_wake_word.start
          - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
        else:
          - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
    # NOTE(review): this blink sequence is placed at handler level; it may
    # originally have sat inside the else branch above - confirm.
    - repeat:
        count: 3  # Number of blinks
        then:
          - switch.toggle: wake_led
          - delay: 150ms  # Delay between on/off
          - switch.toggle: wake_led
          - delay: 150ms  # Delay before the next cycle

  # Home Assistant disconnected: flag not-ready and stop wake-word detection.
  on_client_disconnected:
    - lambda: id(voice_assistant_phase) = ${voice_assist_not_ready_phase_id};
    - micro_wake_word.stop

  on_listening:
    # Enter the listening phase and clear both tracking flags.
    - lambda: |-
        id(voice_assistant_phase) = ${voice_assist_listening_phase_id};
        id(is_tts_active) = false;
        id(question_flag) = false;
    # Turn on the LED that signals the microphone is in use.
    - switch.turn_on: wake_led
    # Start the listening_timeout script (4 s delay, then stop if still
    # listening).
    - script.execute: listening_timeout

  on_stt_vad_start:
    # Speech was detected, so cancel the timeout script.
    - script.stop: listening_timeout

  on_stt_vad_end:
    - switch.turn_off: wake_led
    - lambda: id(voice_assistant_phase) = ${voice_assist_thinking_phase_id};

  on_stt_end:
    # Send the recognized text to Home Assistant as an event.
    - homeassistant.event:
        event: esphome.stt_text
        data:
          text: !lambda return x;

  on_tts_start:
    - lambda: id(voice_assistant_phase) = ${voice_assist_replying_phase_id};
    - script.execute: activate_stop_word_once
    # Record whether the reply text ends with a question mark.
    - lambda: |-
        bool is_question = false;
        if (!x.empty() && x.back() == '?') {
          is_question = true;
        }
        id(question_flag) = is_question;
    # - logger.log:
    #     format: "question_flag: %d (0=false, 1=true)"
    #     args:
    #       - id(question_flag)

  on_tts_end:
    # Remember that the TTS stage was reached.
    - lambda: |-
        id(is_tts_active) = true;

  on_timer_finished:
    then:
      - switch.turn_on: timer_ringing

  on_end:
    # Make sure the microphone LED is off.
    - if:
        condition:
          - switch.is_on: wake_led
        then:
          - switch.turn_off: wake_led
    # Give an announcement up to 0.5 s to begin.
    - wait_until:
        condition:
          - media_player.is_announcing:
        timeout: 0.5s
    - wait_until:
        not:
          voice_assistant.is_running:
    - delay: 0.5s
    # Start the pipeline again when the reply was not a question and the TTS
    # stage was reached; otherwise restore ducking and go idle.
    - if:
        condition:
          and:
            - lambda: 'return !id(question_flag);'
            - lambda: 'return id(is_tts_active);'
        then:
          - voice_assistant.start:
        else:
          # Set media ducking to the "Decibel Reduction" value.
          - mixer_speaker.apply_ducking:
              id: media_spk_mixer_input
              decibel_reduction: !lambda |-
                return id(ducking_decibel).state;
              duration: 1.0s
          # NOTE(review): this idle-phase assignment is placed inside the
          # else branch; it may originally have been at handler level.
          - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};

  # On error (only after initialization): show the error phase for one
  # second, then go idle or muted depending on the voice_enabled switch.
  on_error:
    - if:
        condition:
          lambda: return !id(init_in_progress);
        then:
          - lambda: id(voice_assistant_phase) = ${voice_assist_error_phase_id};
          - delay: 1s
          - if:
              condition:
                switch.is_on: voice_enabled
              then:
                - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
              else:
                - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
# On-device wake-word detection.
micro_wake_word:
  id: mww
  stop_after_detection: false
  models:
    - id: okay_nabu
      model: https://github.com/kahrendt/microWakeWord/releases/download/okay_nabu_20241226.3/okay_nabu.json
    # The "stop" model is enabled and disabled by the activate_stop_word_once
    # script.
    - id: stop
      model: https://github.com/kahrendt/microWakeWord/releases/download/stop/stop.json
      internal: true
  vad:
    model: https://github.com/kahrendt/microWakeWord/releases/download/v2.1_models/vad.json
  # Decision order when a wake word is detected:
  #   1. timer ringing          -> silence the timer
  #   2. voice_enabled is on:
  #      a. assistant running   -> stop it
  #      b. announcement active -> stop the announcement
  #      c. otherwise           -> play the beep, wait 300 ms, start the
  #                                assistant with the detected wake word
  on_wake_word_detected:
    - if:
        condition:
          switch.is_on: timer_ringing
        then:
          - switch.turn_off: timer_ringing
        else:
          - if:
              condition:
                switch.is_on: voice_enabled
              then:
                - if:
                    condition:
                      voice_assistant.is_running:
                    then:
                      - voice_assistant.stop:
                    else:
                      - if:
                          condition:
                            media_player.is_announcing:
                          then:
                            - media_player.stop:
                                announcement: true
                          else:
                            - media_player.speaker.play_on_device_media_file:
                                media_file: beep
                                announcement: true
                            - delay: 300ms
                            - voice_assistant.start:
                                wake_word: !lambda return wake_word;
# State shared between the automations in this file.
globals:
  # True from boot until on_client_connected clears it; handlers check it to
  # skip work while the device is still starting up.
  - id: init_in_progress
    type: bool
    restore_value: false
    initial_value: "true"
  # Current voice-assistant phase (ids from the substitutions block).
  # Starts as not_ready.
  - id: voice_assistant_phase
    type: int
    restore_value: false
    initial_value: ${voice_assist_not_ready_phase_id}
  # Set in on_tts_end, cleared in on_listening.
  - id: is_tts_active
    type: bool
    restore_value: false
    initial_value: "false"
  # True when the last reply text ended with "?" (set in on_tts_start).
  - id: question_flag
    type: bool
    restore_value: false
    initial_value: "false"
script:
  # Started from on_listening and cancelled from on_stt_vad_start. If the
  # assistant is still in the listening phase after 4 s, turn off the LED,
  # stop the assistant and go idle.
  - id: listening_timeout
    mode: restart
    then:
      - delay: 4s
      - if:
          condition:
            # Uses the substitution rather than a literal 2, so the check
            # follows the phase id if it is ever renumbered.
            lambda: |-
              return id(voice_assistant_phase) == ${voice_assist_listening_phase_id};
          then:
            - switch.turn_off: wake_led
            - voice_assistant.stop:
            - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};

  # Started from on_tts_start. After 1 s, and unless the timer is ringing,
  # enable the "stop" wake-word model for as long as an announcement plays,
  # then disable it again (again unless the timer is ringing).
  - id: activate_stop_word_once
    then:
      - delay: 1s
      - if:
          condition:
            switch.is_off: timer_ringing
          then:
            - micro_wake_word.enable_model: stop
            - wait_until:
                not:
                  media_player.is_announcing:
            - if:
                condition:
                  switch.is_off: timer_ringing
                then:
                  - micro_wake_word.disable_model: stop
select:
  # Select entities provided by the ES8388 platform.
  - platform: es8388
    es8388_id: es8388_dac
    dac_output:
      name: "DAC Output"
    adc_input_mic:
      name: "ADC Input MIC"

  # Wake-word sensitivity preset for the okay_nabu model.
  - platform: template
    name: "Wake word sensitivity"
    entity_category: config
    optimistic: true
    restore_value: true
    initial_option: Slightly sensitive
    options:
      - Slightly sensitive
      - Slightly+ sensitive
      - Moderately sensitive
      - Very sensitive
    on_value:
      # Each preset sets a probability cutoff on the okay_nabu model. The
      # cutoff is a quantized uint8; the trailing comment on each line gives
      # the floating-point equivalent. The FAPH (False Accepts per Hour)
      # figures are the original author's, measured on the Dinner Party
      # Corpus (DipCo). These cutoffs are specific to the okay_nabu model
      # included in this firmware.
      lambda: |-
        if (x == "Slightly sensitive") {
          id(okay_nabu).set_probability_cutoff(217);  // 0.85 -> 0.000 FAPH on DipCo (Manifest's default)
        } else if (x == "Slightly+ sensitive") {
          id(okay_nabu).set_probability_cutoff(191);  // 0.75
        } else if (x == "Moderately sensitive") {
          id(okay_nabu).set_probability_cutoff(176);  // 0.69 -> 0.376 FAPH on DipCo
        } else if (x == "Very sensitive") {
          id(okay_nabu).set_probability_cutoff(143);  // 0.56 -> 0.751 FAPH on DipCo
        }

  # Logger level selector; disabled by default.
  - platform: logger
    id: logger_select
    name: Logger Level
    disabled_by_default: true
# Button entity that restarts the device.
button:
  - name: reboot
    platform: restart
# Media ducking level. Other automations read it through
# id(ducking_decibel).state.
number:
  - platform: template
    name: "Decibel Reduction"
    id: ducking_decibel
    # Added: without optimistic mode a template number that only has a
    # set_action does not publish the newly set value, so the state would
    # stay at initial_value and later lambdas would re-apply the old level.
    optimistic: true
    min_value: 0
    max_value: 12
    step: 1
    initial_value: 6
    unit_of_measurement: "dB"
    # Apply the new ducking level to the media mixer input right away.
    set_action:
      - mixer_speaker.apply_ducking:
          id: media_spk_mixer_input
          decibel_reduction: !lambda 'return x;'
          duration: 0.2s
# GPIO inputs.
binary_sensor:
  # Clicking this input toggles the voice_enabled switch.
  - platform: gpio
    id: jarvis_rec
    pin:
      number: GPIO36
      inverted: true
      mode:
        input: true
    filters:
      - delayed_on: 20ms
    on_click:
      then:
        - switch.toggle: voice_enabled

  # Moved here from the switch section: it was declared as a GPIO switch on
  # GPIO39 with mode input. GPIO39 is an input-only pin on the ESP32, so it
  # cannot drive an output; a binary sensor is the matching entity type.
  # The pin settings are unchanged.
  - platform: gpio
    id: jarvis_mode
    pin:
      number: GPIO39
      inverted: true
      mode:
        input: true

  - platform: gpio
    id: jarvis_headphones
    pin:
      number: GPIO19
      inverted: true
      mode:
        input: true

  - platform: gpio
    id: jarvis_aux
    pin:
      number: GPIO12
      inverted: false
      mode:
        input: true

switch:
  # Always switched on at boot.
  - platform: gpio
    pin: GPIO21
    name: "AMP Switch"
    id: amp_switch
    restore_mode: ALWAYS_ON

  # LED driven by the voice-assistant automations (microphone active,
  # connect blink).
  - platform: gpio
    pin: GPIO22
    id: wake_led
    name: "Rec"

  # Master enable for the voice assistant.
  - platform: template
    name: Enable Voice Assistant
    id: voice_enabled
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON
    icon: mdi:assistant
    # Turned on (after initialization): go idle and start wake-word
    # detection unless the assistant is already running.
    on_turn_on:
      - if:
          condition:
            lambda: return !id(init_in_progress);
          then:
            - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
            - if:
                condition:
                  not:
                    - voice_assistant.is_running
                then:
                  - micro_wake_word.start
    # Turned off (after initialization): stop the assistant and wake-word
    # detection, then mark the phase as muted.
    on_turn_off:
      - if:
          condition:
            lambda: return !id(init_in_progress);
          then:
            - voice_assistant.stop
            - micro_wake_word.stop
            - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};

  # Timer alarm. While on, the alarm sound is replayed in a loop.
  - platform: template
    name: "Ring Timer"
    id: timer_ringing
    optimistic: true
    restore_mode: ALWAYS_OFF
    on_turn_off:
      # Stop playing the alarm.
      - media_player.stop:
          announcement: true
    on_turn_on:
      - while:
          condition:
            switch.is_on: timer_ringing
          then:
            # Play the alarm sound as an announcement.
            - media_player.speaker.play_on_device_media_file:
                media_file: alarm_sound
                announcement: true
            # Wait until the alarm sound starts playing.
            - wait_until:
                media_player.is_announcing:
            # Wait until the alarm sound stops playing.
            - wait_until:
                not:
                  media_player.is_announcing:
            - delay: 1000ms