The ESP32-S3-BOX-3 is hard to get, so I used a generic ESP32-S3 board with some external components. I started from the configuration in firmware/voice-assistant/esp32-s3-box-3.yaml (commit 448c9eb48b64e4b08d21f3f022384fa0c5264c7a of the esphome/firmware repository on GitHub) and modified it a bit to fit the new hardware. I also added a presence sensor so that the assistant is muted automatically when no one is detected. Thank God it worked.
https://youtu.be/YYBjjoTfzwc
Here is the modified YAML configuration:
# Compile-time text substitutions referenced elsewhere as ${name}.
substitutions:
  # Illustration shown for each display page.
  loading_illustration_file: https://github.com/esphome/firmware/raw/main/voice-assistant/casita/loading_320_240.png
  idle_illustration_file: images/tro_ly/idle_320_240.png
  listening_illustration_file: images/tro_ly/listening_320_240.png
  thinking_illustration_file: images/tro_ly/thinking_320_240.png
  replying_illustration_file: images/tro_ly/replying_320_240.png
  error_illustration_file: images/tro_ly/error_320_240.png  # /local/images/tro_ly/thinking_320_240.png

  # Background fill colour (hex RGB) painted behind each illustration.
  loading_illustration_background_color: "000000"
  idle_illustration_background_color: "000000"
  listening_illustration_background_color: "FFFFFF"
  thinking_illustration_background_color: "FFFFFF"
  replying_illustration_background_color: "FFFFFF"
  error_illustration_background_color: "000000"

  # Integer codes stored in the voice_assistant_phase global.
  voice_assist_idle_phase_id: "1"
  voice_assist_listening_phase_id: "2"
  voice_assist_thinking_phase_id: "3"
  voice_assist_replying_phase_id: "4"
  voice_assist_not_ready_phase_id: "10"
  voice_assist_error_phase_id: "11"
  voice_assist_muted_phase_id: "12"

  # Glyph set handed to every font definition.
  allowed_characters: " !#%'()+,-./0123456789:;<>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz{|}°²³µ¿ÁÂÄÅÉÖÚßàáâãäåæçèéêëìíîðñòóôõöøùúûüýþāăąćčďĐđēėęěğĮįıļľŁłńňőřśšťũūůűųźŻżŽžơưșțΆΈΌΐΑΒΓΔΕΖΗΘΚΜΝΠΡΣΤΥΦάέήίαβγδεζηθικλμνξοπρςστυφχψωϊόύώАБВГДЕЖЗИКЛМНОПРСТУХЦЧШЪЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёђєіїјљњћംხạảấầẩậắặẹẽếềểệỉịọỏốồổỗộớờởợụủứừửữựỳ—、,?"
# Core node identity and boot-time behaviour.
esphome:
  name: noc-nha
  friendly_name: Nóc nhà
  platformio_options:
    board_build.flash_mode: dio
  on_boot:
    priority: 600
    then:
      # Draw once immediately, then wait 30 s.
      - script.execute: draw_display
      - delay: 30s
      # If initialisation is still flagged as in progress after the wait,
      # clear the flag and redraw.
      - if:
          condition:
            lambda: return id(init_in_progress);
          then:
            - lambda: id(init_in_progress) = false;
            - script.execute: draw_display
# Target board and ESP-IDF framework settings.
esp32:
  board: esp32-s3-devkitc-1
  flash_size: 16MB
  framework:
    type: esp-idf
    version: recommended
    # Extra sdkconfig entries passed through to the ESP-IDF build.
    sdkconfig_options:
      CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240: "y"
      CONFIG_ESP32S3_DATA_CACHE_64KB: "y"
      CONFIG_ESP32S3_DATA_CACHE_LINE_64B: "y"
# External PSRAM configuration.
psram:
  # Use quad for N8R2 modules and octal for N16R8 modules.
  mode: octal
  speed: 80MHz
# Enable logging; log output goes to the USB Serial/JTAG peripheral.
logger:
  hardware_uart: USB_SERIAL_JTAG
# Enable the Home Assistant native API.
api:
  encryption:
    # An empty key is rejected by ESPHome (the key must be a base64-encoded
    # 32-byte value). Keep it out of this file: define `api_encryption_key`
    # in secrets.yaml, the same way the Wi-Fi credentials are handled.
    key: !secret api_encryption_key
  # Redraw the screen whenever an API client connects or disconnects.
  on_client_connected:
    - script.execute: draw_display
  on_client_disconnected:
    - script.execute: draw_display
# Over-the-air firmware updates.
ota:
  - platform: esphome
    # An empty password leaves OTA uploads unauthenticated. Define
    # `ota_password` in secrets.yaml instead of embedding it here.
    password: !secret ota_password
# Wi-Fi station credentials plus a fallback access point.
wifi:
  networks:
    - ssid: !secret wifi_ssid
      password: !secret wifi_password
  # Enable fallback hotspot (captive portal) in case wifi connection fails.
  # NOTE(review): the empty password makes this hotspot an open network;
  # consider setting a password of at least 8 characters.
  ap:
    ssid: "Noc-Nha Fallback Hotspot"
    password: ""
  # Redraw the screen on every connectivity change. The former trailing
  # `delay: 5s` (a wait for improv results) had no action after it and was
  # removed as dead configuration.
  on_connect:
    - script.execute: draw_display
  on_disconnect:
    - script.execute: draw_display

captive_portal:
# Virtual buttons exposed as entities: restart, safe-mode boot, factory reset.
button:
  - platform: restart
    name: "Restart"
    id: but_rest

  - platform: safe_mode
    id: button_safe_mode
    name: Safe Mode Boot

  - platform: factory_reset
    id: factory_reset_btn
    name: Factory reset
# Template switches: conversation overlay toggle and the mute control.
switch:
  # When on, the thinking/replying pages also print the request/response text.
  - platform: template
    name: Display conversation
    id: display_conversation
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON
    entity_category: config

  # Mute switch: ON stops the assistant and turns the backlight off,
  # OFF re-enables wake word listening and dims the backlight to 40%.
  - platform: template
    id: mute
    name: mute
    optimistic: true
    restore_mode: RESTORE_DEFAULT_OFF
    entity_category: config
    on_turn_off:
      - light.turn_on:
          id: back_light
          brightness: 40%
      # Only touch the assistant once initialisation has finished.
      - if:
          condition:
            lambda: return !id(init_in_progress);
          then:
            - lambda: id(va).set_use_wake_word(true);
            - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
            # Start continuous listening unless the assistant already runs.
            - if:
                condition:
                  not:
                    - voice_assistant.is_running
                then:
                  - voice_assistant.start_continuous
            - script.execute: draw_display
    on_turn_on:
      - light.turn_off: back_light
      # Only touch the assistant once initialisation has finished.
      - if:
          condition:
            lambda: return !id(init_in_progress);
          then:
            - voice_assistant.stop
            - lambda: id(va).set_use_wake_word(false);
            - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
            - script.execute: draw_display
# LEDC (PWM) output on GPIO3, used by the backlight light entity.
output:
  - platform: ledc
    pin: GPIO3
    id: backlight_output
# Dimmable display backlight driven by backlight_output.
light:
  - platform: monochromatic
    output: backlight_output
    name: "Đèn nền"
    id: back_light
    restore_mode: ALWAYS_ON
# Presence sensor input: unmutes while active, mutes when it goes inactive.
binary_sensor:
  - platform: gpio
    pin: 45
    device_class: motion
    name: hiện diện
    id: Presence
    # Sensor became active -> turn the mute switch off.
    on_press:
      then:
        - switch.turn_off: mute
    # Sensor became inactive -> turn the mute switch on.
    on_release:
      then:
        - switch.turn_on: mute
# Audio and Voice Assistant Config
# Two separate I2S buses: one for the microphone, one for the speaker.
i2s_audio:
  - id: i2s_in
    i2s_lrclk_pin: GPIO6  # WS
    i2s_bclk_pin: GPIO5  # SCK

  - id: i2s_out
    i2s_lrclk_pin: GPIO11  # WS / LRC
    i2s_bclk_pin: GPIO10  # SCK / BCLK
# External I2S microphone attached to the i2s_in bus.
microphone:
  - platform: i2s_audio
    id: va_mic
    i2s_audio_id: i2s_in
    adc_type: external
    i2s_din_pin: GPIO4  # SD pin on the INMP441
    channel: left
    pdm: false
    bits_per_sample: 32 bit
# External I2S DAC/amplifier attached to the i2s_out bus.
speaker:
  - platform: i2s_audio
    id: box_speaker
    i2s_audio_id: i2s_out
    dac_type: external
    i2s_dout_pin: 9  # DIN
    channel: mono
# Voice assistant pipeline. Each trigger below records the current phase in
# the voice_assistant_phase global and asks draw_display to refresh the LCD.
voice_assistant:
  id: va
  microphone: va_mic
  speaker: box_speaker
  use_wake_word: true
  noise_suppression_level: 2.0
  volume_multiplier: 2.0
  auto_gain: 31dBFS

  # Wake word heard: raise the backlight to full brightness.
  on_wake_word_detected:
    - light.turn_on:
        id: back_light
        brightness: 100%

  # Runs when the voice assistant microphone starts listening.
  # Resets both conversation texts to a placeholder.
  on_listening:
    - lambda: id(voice_assistant_phase) = ${voice_assist_listening_phase_id};
    - text_sensor.template.publish:
        id: text_request
        state: "..."
    - text_sensor.template.publish:
        id: text_response
        state: "..."
    - script.execute: draw_display

  # Runs when voice activity detection reports the end of speech during
  # speech-to-text.
  on_stt_vad_end:
    - lambda: id(voice_assistant_phase) = ${voice_assist_thinking_phase_id};
    - script.execute: draw_display

  # Runs when speech-to-text has finished. The recognised text is available
  # to the automation as the variable x.
  on_stt_end:
    - text_sensor.template.publish:
        id: text_request
        state: !lambda return x;
    - script.execute: draw_display

  # Runs when text-to-speech has started. The text that will be spoken is
  # available to the automation as the variable x.
  on_tts_start:
    - text_sensor.template.publish:
        id: text_response
        state: !lambda return x;

  # Runs when the audio stream (spoken response) starts playing.
  # Requires a speaker to be configured.
  on_tts_stream_start:
    - lambda: id(voice_assistant_phase) = ${voice_assist_replying_phase_id};
    - script.execute: draw_display

  # Runs when the audio stream (spoken response) finishes playing.
  # Requires a speaker to be configured.
  # NOTE(review): this sets the *listening* phase, not idle, after the
  # reply ends - confirm that is intended.
  on_tts_stream_end:
    - lambda: id(voice_assistant_phase) = ${voice_assist_listening_phase_id};
    - script.execute: draw_display

  # Assistant is idle: show the idle page and dim the backlight to 40%.
  on_idle:
    - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
    - script.execute: draw_display
    - light.turn_on:
        id: back_light
        brightness: 40%

  # Show the error page for one second, then fall back to idle or muted
  # depending on the mute switch. Ignored while still initialising.
  on_error:
    - if:
        condition:
          lambda: return !id(init_in_progress);
        then:
          - lambda: id(voice_assistant_phase) = ${voice_assist_error_phase_id};
          - script.execute: draw_display
          - delay: 1s
          - if:
              condition:
                switch.is_off: mute
              then:
                - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
              else:
                - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
          - script.execute: draw_display

  # A client connected: start continuous listening unless muted, then mark
  # initialisation as finished and redraw.
  on_client_connected:
    - if:
        condition:
          switch.is_off: mute
        then:
          - voice_assistant.start_continuous:
          - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
        else:
          - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
    - lambda: id(init_in_progress) = false;
    - script.execute: draw_display

  # The client went away: switch to the "not ready" phase.
  on_client_disconnected:
    - lambda: id(voice_assistant_phase) = ${voice_assist_not_ready_phase_id};
    - script.execute: draw_display
script:
  # Picks the LCD page that matches the current state and refreshes the
  # display. Order of precedence: still initialising -> no Wi-Fi -> no API
  # connection -> page for the current voice_assistant_phase.
  - id: draw_display
    then:
      - if:
          condition:
            lambda: return !id(init_in_progress);
          then:
            - if:
                condition:
                  wifi.connected:
                then:
                  - if:
                      condition:
                        api.connected:
                      then:
                        # Select the page for the current phase (idle page
                        # for any unlisted value), then refresh once.
                        - lambda: |
                            auto &lcd = id(s3_box_lcd);
                            switch (id(voice_assistant_phase)) {
                              case ${voice_assist_listening_phase_id}:
                                lcd.show_page(listening_page);
                                break;
                              case ${voice_assist_thinking_phase_id}:
                                lcd.show_page(thinking_page);
                                break;
                              case ${voice_assist_replying_phase_id}:
                                lcd.show_page(replying_page);
                                break;
                              case ${voice_assist_error_phase_id}:
                                lcd.show_page(error_page);
                                break;
                              case ${voice_assist_muted_phase_id}:
                                lcd.show_page(muted_page);
                                break;
                              case ${voice_assist_not_ready_phase_id}:
                                lcd.show_page(no_ha_page);
                                break;
                              default:
                                lcd.show_page(idle_page);
                                break;
                            }
                            lcd.update();
                      else:
                        - display.page.show: no_ha_page
                        - component.update: s3_box_lcd
                else:
                  - display.page.show: no_wifi_page
                  - component.update: s3_box_lcd
          else:
            - display.page.show: initializing_page
            - component.update: s3_box_lcd
# Runtime state shared between automations; neither value is restored
# across reboots.
globals:
  # True from boot until initialisation is considered finished.
  - id: init_in_progress
    type: bool
    restore_value: false
    initial_value: "true"

  # Current phase code; starts in the "not ready" phase.
  - id: voice_assistant_phase
    type: int
    restore_value: false
    initial_value: ${voice_assist_not_ready_phase_id}
# Full-screen illustrations, all resized to 320x240 RGB24 with transparency.
image:
  # Phase illustrations (paths come from the substitutions).
  - id: casita_error
    file: ${error_illustration_file}
    resize: 320x240
    type: RGB24
    use_transparency: true

  - id: casita_idle
    file: ${idle_illustration_file}
    resize: 320x240
    type: RGB24
    use_transparency: true

  - id: casita_listening
    file: ${listening_illustration_file}
    resize: 320x240
    type: RGB24
    use_transparency: true

  - id: casita_thinking
    file: ${thinking_illustration_file}
    resize: 320x240
    type: RGB24
    use_transparency: true

  - id: casita_replying
    file: ${replying_illustration_file}
    resize: 320x240
    type: RGB24
    use_transparency: true

  - id: casita_initializing
    file: ${loading_illustration_file}
    resize: 320x240
    type: RGB24
    use_transparency: true

  # Connectivity error illustrations.
  - id: error_no_wifi
    file: images/tro_ly/error-no-wifi.png
    resize: 320x240
    type: RGB24
    use_transparency: true

  - id: error_no_ha
    file: images/tro_ly/error-no-ha.png
    resize: 320x240
    type: RGB24
    use_transparency: true
# Roboto fonts fetched from Google Fonts; every font carries the same glyph
# set taken from the allowed_characters substitution.
font:
  # Light italic, 15 px: request text.
  - id: font_request
    file:
      type: gfonts
      family: Roboto
      weight: 300
      italic: true
    glyphs: ${allowed_characters}
    size: 15

  # Light, 15 px: response text.
  - id: font_response
    file:
      type: gfonts
      family: Roboto
      weight: 300
    glyphs: ${allowed_characters}
    size: 15

  # Bold, 18 px.
  - id: font_date
    file:
      type: gfonts
      family: Roboto
      weight: 700
    glyphs: ${allowed_characters}
    size: 18

  # Bold, 36 px.
  - id: font_time
    file:
      type: gfonts
      family: Roboto
      weight: 700
    glyphs: ${allowed_characters}
    size: 36
# Conversation texts. Each sensor shortens its own state to at most 45 bytes
# plus "..." whenever a new value arrives.
#
# Fixes over the previous lambdas:
#   * the length check (> 32) did not match the truncation length (45), so
#     33-45 byte strings received "..." without being shortened;
#   * cutting at a fixed byte offset could split a multi-byte UTF-8
#     character (e.g. Vietnamese text), leaving an invalid sequence.
text_sensor:
  - id: text_request
    platform: template
    on_value:
      lambda: |-
        const size_t max_bytes = 45;
        std::string &text = id(text_request).state;
        if (text.length() > max_bytes) {
          size_t cut = max_bytes;
          // Step back over UTF-8 continuation bytes (10xxxxxx) so the cut
          // always lands on a character boundary.
          while (cut > 0 && (static_cast<unsigned char>(text[cut]) & 0xC0) == 0x80) {
            cut--;
          }
          text = text.substr(0, cut) + "...";
        }

  - id: text_response
    platform: template
    on_value:
      lambda: |-
        const size_t max_bytes = 45;
        std::string &text = id(text_response).state;
        if (text.length() > max_bytes) {
          size_t cut = max_bytes;
          // Step back over UTF-8 continuation bytes (10xxxxxx) so the cut
          // always lands on a character boundary.
          while (cut > 0 && (static_cast<unsigned char>(text[cut]) & 0xC0) == 0x80) {
            cut--;
          }
          text = text.substr(0, cut) + "...";
        }
# Named colours. The per-page background colours take their hex value from
# the matching *_illustration_background_color substitution.
color:
  - id: idle_color
    hex: ${idle_illustration_background_color}

  - id: listening_color
    hex: ${listening_illustration_background_color}

  - id: thinking_color
    hex: ${thinking_illustration_background_color}

  - id: replying_color
    hex: ${replying_illustration_background_color}

  - id: loading_color
    hex: ${loading_illustration_background_color}

  - id: error_color
    hex: ${error_illustration_background_color}

  # NOTE(review): despite the id, these channel values (60/100/80 %) are
  # green-dominant, not red - confirm the intended colour.
  - id: my_red
    red: 60%
    green: 100%
    blue: 80%
# SPI bus (clock and MOSI pins only).
spi:
  clk_pin: GPIO17  # SCL
  mosi_pin: GPIO16  # SDA
# ST7789V panel driven through the ili9xxx platform. The display never
# refreshes on its own (update_interval: never); it is redrawn on demand.
display:
  - platform: ili9xxx
    id: s3_box_lcd
    model: st7789v
    rotation: 90
    color_order: bgr
    data_rate: 80MHz
    cs_pin: GPIO8
    dc_pin: GPIO18
    reset_pin: GPIO15
    invert_colors: false
    dimensions:
      height: 320
      width: 240
    update_interval: never
    pages:
      # Idle: background fill plus centred illustration.
      - id: idle_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(idle_color));
          it.image(cx, cy, id(casita_idle), ImageAlign::CENTER);

      # Listening: background fill plus centred illustration.
      - id: listening_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(listening_color));
          it.image(cx, cy, id(casita_listening), ImageAlign::CENTER);

      # Thinking: illustration, plus the request text in a framed box at the
      # top when the "Display conversation" switch is on.
      - id: thinking_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(thinking_color));
          it.image(cx, cy, id(casita_thinking), ImageAlign::CENTER);
          if (!id(display_conversation).state) {
            return;
          }
          it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
          it.rectangle(20, 20, 280, 30, Color::BLACK);
          it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());

      # Replying: illustration, plus request (top box) and response (bottom
      # box) texts when the "Display conversation" switch is on.
      - id: replying_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(replying_color));
          it.image(cx, cy, id(casita_replying), ImageAlign::CENTER);
          if (!id(display_conversation).state) {
            return;
          }
          it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
          it.rectangle(20, 20, 280, 30, Color::BLACK);
          it.filled_rectangle(20, 190, 280, 30, Color::WHITE);
          it.rectangle(20, 190, 280, 30, Color::BLACK);
          it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());
          it.printf(30, 195, id(font_response), Color::BLACK, "%s", id(text_response).state.c_str());

      # Error: background fill plus centred illustration.
      - id: error_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(error_color));
          it.image(cx, cy, id(casita_error), ImageAlign::CENTER);

      # No Home Assistant connection: centred illustration only, no fill.
      - id: no_ha_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.image(cx, cy, id(error_no_ha), ImageAlign::CENTER);

      # No Wi-Fi: centred illustration only, no fill.
      - id: no_wifi_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.image(cx, cy, id(error_no_wifi), ImageAlign::CENTER);

      # Initialising: background fill plus centred loading illustration.
      - id: initializing_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(loading_color));
          it.image(cx, cy, id(casita_initializing), ImageAlign::CENTER);

      # Muted: plain black screen.
      - id: muted_page
        lambda: |-
          it.fill(Color::BLACK);