Alternative to ESP32-S3-BOX-3

The ESP32-S3-BOX-3 is hard to get, so I used an ESP32-S3 board with some external components instead. I started from the configuration in firmware/voice-assistant/esp32-s3-box-3.yaml at 448c9eb48b64e4b08d21f3f022384fa0c5264c7a · esphome/firmware · GitHub and modified it a bit to fit the new hardware. I also added a presence sensor so that the device mutes automatically when no one is detected. Thank God it worked.
https://youtu.be/YYBjjoTfzwc



Here is the modified yaml configuration:

# Compile-time substitutions: illustration files, page background colours,
# voice-assistant phase codes and the glyph set compiled into the fonts.
substitutions:
  # Illustration shown for each state (the loading image is fetched from the
  # upstream esphome/firmware repository, the others are local files).
  loading_illustration_file: https://github.com/esphome/firmware/raw/main/voice-assistant/casita/loading_320_240.png
  idle_illustration_file: images/tro_ly/idle_320_240.png
  listening_illustration_file: images/tro_ly/listening_320_240.png
  thinking_illustration_file: images/tro_ly/thinking_320_240.png
  replying_illustration_file: images/tro_ly/replying_320_240.png
  error_illustration_file: images/tro_ly/error_320_240.png

  # Background colour (RRGGBB hex) painted behind each illustration.
  # Quoted so that all-digit values stay strings.
  loading_illustration_background_color: "000000"
  idle_illustration_background_color: "000000"
  listening_illustration_background_color: "FFFFFF"
  thinking_illustration_background_color: "FFFFFF"
  replying_illustration_background_color: "FFFFFF"
  error_illustration_background_color: "000000"

  # Integer codes stored in the `voice_assistant_phase` global and compared
  # in the `draw_display` script.
  voice_assist_idle_phase_id: "1"
  voice_assist_listening_phase_id: "2"
  voice_assist_thinking_phase_id: "3"
  voice_assist_replying_phase_id: "4"
  voice_assist_not_ready_phase_id: "10"
  voice_assist_error_phase_id: "11"
  voice_assist_muted_phase_id: "12"

  # Glyphs compiled into every font below. Each character is listed once:
  # the list previously ended with a second "," and "?", which duplicate
  # glyphs already present near the start of the string.
  allowed_characters: " !#%'()+,-./0123456789:;<>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz{|}°²³µ¿ÁÂÄÅÉÖÚßàáâãäåæçèéêëìíîðñòóôõöøùúûüýþāăąćčďĐđēėęěğĮįıļľŁłńňőřśšťũūůűųźŻżŽžơưșțΆΈΌΐΑΒΓΔΕΖΗΘΚΜΝΠΡΣΤΥΦάέήίαβγδεζηθικλμνξοπρςστυφχψωϊόύώАБВГДЕЖЗИКЛМНОПРСТУХЦЧШЪЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёђєіїјљњћംხạảấầẩậắặẹẽếềểệỉịọỏốồổỗộớờởợụủứừửữựỳ—、"

# Core node definition and boot sequence.
esphome:
  name: noc-nha
  friendly_name: Nóc nhà
  platformio_options:
    board_build.flash_mode: dio
  on_boot:
    priority: 600
    then:
      # Draw once immediately (init_in_progress starts out true, so this
      # shows the initializing page).
      - script.execute: draw_display
      # If nothing has cleared init_in_progress within 30 s, clear it here
      # and redraw so the display leaves the initializing page.
      - delay: 30s
      - if:
          condition:
            lambda: |-
              return id(init_in_progress);
          then:
            - lambda: |-
                id(init_in_progress) = false;
            - script.execute: draw_display

# Target board and ESP-IDF build settings.
esp32:
  board: esp32-s3-devkitc-1
  flash_size: 16MB
  framework:
    type: esp-idf
    version: recommended
    # Extra sdkconfig flags: 240 MHz CPU clock, 64 KB data cache with
    # 64-byte cache lines.
    sdkconfig_options:
      CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240: 'y'
      CONFIG_ESP32S3_DATA_CACHE_64KB: 'y'
      CONFIG_ESP32S3_DATA_CACHE_LINE_64B: 'y'

   
# External PSRAM. Use `quad` for N8R2 modules and `octal` for N16R8 modules.
psram:
  speed: 80MHz
  mode: octal

# Enable logging, sent over the chip's built-in USB Serial/JTAG interface.
logger:
  hardware_uart: USB_SERIAL_JTAG

# Enable the Home Assistant native API.
api:
  encryption:
    # The key must be a base64-encoded 32-byte value; an empty string is not
    # a valid key. It is read from secrets.yaml, like the Wi-Fi credentials,
    # so it never has to be pasted into this file.
    key: !secret api_encryption_key
  # Redraw whenever an API client connects or disconnects so the display
  # switches between the "no Home Assistant" page and the normal pages.
  on_client_connected:
    - script.execute: draw_display
  on_client_disconnected:
    - script.execute: draw_display

# Over-the-air firmware updates through the ESPHome OTA platform.
# NOTE(review): the password is an empty string here (presumably blanked for
# posting) — set a real one, e.g. via `!secret`, before deploying.
ota:
  - platform: esphome
    password: ""


# Wi-Fi client settings; credentials come from secrets.yaml.
wifi:
  networks:
    - ssid: !secret wifi_ssid
      password: !secret wifi_password

  # Enable fallback hotspot (captive portal) in case wifi connection fails
  ap:
    ssid: "Noc-Nha Fallback Hotspot"
    password: ""

  # Redraw on every connectivity change so the "no Wi-Fi" page appears and
  # disappears as needed.
  on_connect:
    - script.execute: draw_display
    # Gives time for improv results to be transmitted
    - delay: 5s
  on_disconnect:
    - script.execute: draw_display

# Captive portal with default settings; used together with the fallback
# hotspot configured under `wifi.ap`.
captive_portal:
 
# Virtual maintenance buttons exposed to Home Assistant.
button:
  # Reboot the device.
  - platform: restart
    id: but_rest
    name: Restart

  # Reboot into safe mode.
  - platform: safe_mode
    id: button_safe_mode
    name: Safe Mode Boot

  # Perform a factory reset.
  - platform: factory_reset
    id: factory_reset_btn
    name: Factory reset

switch:
  # Controls whether the request/response text boxes are drawn on the
  # thinking and replying pages.
  - platform: template
    id: display_conversation
    name: Display conversation
    entity_category: config
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON

  # Mute control, also driven by the presence sensor.
  #   ON  -> backlight off, voice assistant stopped, muted page shown.
  #   OFF -> backlight at 40 %, wake-word listening resumed, idle page shown.
  # The voice-assistant part is skipped while init_in_progress is still set.
  - platform: template
    id: mute
    name: mute
    entity_category: config
    optimistic: true
    restore_mode: RESTORE_DEFAULT_OFF
    on_turn_on:
      - light.turn_off: back_light
      - if:
          condition:
            lambda: |-
              return !id(init_in_progress);
          then:
            - voice_assistant.stop
            - lambda: |-
                id(va).set_use_wake_word(false);
                id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
            - script.execute: draw_display
    on_turn_off:
      - light.turn_on:
          id: back_light
          brightness: 40%
      - if:
          condition:
            lambda: |-
              return !id(init_in_progress);
          then:
            - lambda: |-
                id(va).set_use_wake_word(true);
                id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
            # Only start the pipeline if it is not already running.
            - if:
                condition:
                  not:
                    voice_assistant.is_running:
                then:
                  - voice_assistant.start_continuous
            - script.execute: draw_display

# PWM (LEDC) output on GPIO3 that drives the display backlight.
output:
  - id: backlight_output
    platform: ledc
    pin: GPIO3

# Dimmable backlight built on the LEDC output; always switched on at boot.
light:
  - id: back_light
    name: "Đèn nền"
    platform: monochromatic
    output: backlight_output
    restore_mode: ALWAYS_ON

# Presence sensor: un-mutes the assistant when someone is detected and mutes
# it again when the sensor reports no presence.
# NOTE(review): GPIO45 is documented as a strapping pin on the ESP32-S3 —
# confirm the sensor output cannot disturb it during reset.
binary_sensor:
  - id: Presence
    name: hiện diện
    platform: gpio
    pin: GPIO45
    device_class: motion
    on_press:
      - switch.turn_off: mute
    on_release:
      - switch.turn_on: mute
        
# Audio and Voice Assistant Config  
      
# Two separate I2S buses: one for the microphone, one for the speaker.
i2s_audio:
  # Microphone bus. LRCLK is the mic's WS pin, BCLK its SCK pin.
  - id: i2s_in
    i2s_bclk_pin: GPIO5
    i2s_lrclk_pin: GPIO6
  # Speaker bus. LRCLK is the amplifier's WS/LRC pin, BCLK its SCK/BCLK pin.
  - id: i2s_out
    i2s_bclk_pin: GPIO10
    i2s_lrclk_pin: GPIO11
    
# External I2S microphone (INMP441) on the `i2s_in` bus.
microphone:
  - id: va_mic
    platform: i2s_audio
    i2s_audio_id: i2s_in
    # SD pin on the INMP441
    i2s_din_pin: GPIO4
    adc_type: external
    pdm: false
    channel: left
    bits_per_sample: 32 bit
# External I2S DAC/amplifier on the `i2s_out` bus, mono output.
speaker:
  - id: box_speaker
    platform: i2s_audio
    i2s_audio_id: i2s_out
    # Connects to the amplifier's DIN pin.
    i2s_dout_pin: GPIO9
    dac_type: external
    channel: mono
        
# Voice assistant pipeline. Each handler records the new phase in the
# `voice_assistant_phase` global and calls `draw_display` so the screen
# follows the pipeline state.
voice_assistant:
  id: va
  microphone: va_mic
  speaker: box_speaker
  use_wake_word: true
  noise_suppression_level: 2.0
  volume_multiplier: 2.0
  auto_gain: 31dBFS
  # Raise the backlight to full brightness as soon as the wake word is heard.
  on_wake_word_detected:
    - light.turn_on:
        id: back_light
        brightness: 100%
  on_listening: # Automation run when the voice assistant microphone starts listening.
    - lambda: id(voice_assistant_phase) = ${voice_assist_listening_phase_id};
    # Reset both conversation texts to a placeholder for the new exchange.
    - text_sensor.template.publish:
        id: text_request
        state: "..."
    - text_sensor.template.publish:
        id: text_response
        state: "..."
    - script.execute: draw_display
  on_stt_vad_end: # Automation run when voice activity detection finds the end of speech during speech-to-text.
    - lambda: id(voice_assistant_phase) = ${voice_assist_thinking_phase_id};
    - script.execute: draw_display
  on_stt_end: # Automation run when speech-to-text has finished. The resulting text is available to the automation as the variable x.
    - text_sensor.template.publish:
        id: text_request
        state: !lambda return x;
    - script.execute: draw_display
  on_tts_start: # Automation run when text-to-speech has started. The text to be spoken is available to the automation as the variable x.
    - text_sensor.template.publish:
        id: text_response
        state: !lambda return x;
  on_tts_stream_start: # Automation run when the audio stream (voice response) starts playing back. Requires `speaker` to be configured.
    - lambda: id(voice_assistant_phase) = ${voice_assist_replying_phase_id};
    - script.execute: draw_display
  on_tts_stream_end: # Automation run when playback of the audio stream (voice response) ends. Requires `speaker` to be configured.
    # NOTE(review): this sets the *listening* phase after the reply finishes;
    # the idle phase is only set later by on_idle. Confirm this is intended.
    - lambda: id(voice_assistant_phase) = ${voice_assist_listening_phase_id};
    - script.execute: draw_display
  # Back to the idle page and dim the backlight to 40 %.
  on_idle:
    - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
    - script.execute: draw_display
    - light.turn_on:
        id: back_light
        brightness: 40%

  # Show the error page for one second, then return to the idle or muted
  # page depending on the mute switch. Ignored while initialization is
  # still in progress.
  on_error:
    - if:
        condition:
          lambda: return !id(init_in_progress);
        then:
          - lambda: id(voice_assistant_phase) = ${voice_assist_error_phase_id};
          - script.execute: draw_display
          - delay: 1s
          - if:
              condition:
                switch.is_off: mute
              then:
                - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
              else:
                - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
          - script.execute: draw_display
  # A client connected: start continuous listening unless muted, mark
  # initialization as finished and redraw.
  on_client_connected:
    - if:
        condition:
          switch.is_off: mute
        then:
          - voice_assistant.start_continuous:
          - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
        else:
          - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
    - lambda: id(init_in_progress) = false;
    - script.execute: draw_display
  # The client went away: switch to the not-ready phase.
  on_client_disconnected:
    - lambda: id(voice_assistant_phase) = ${voice_assist_not_ready_phase_id};
    - script.execute: draw_display

script:
  # Selects the display page for the current state and forces a redraw.
  # Order of checks: still initializing -> no Wi-Fi -> no API connection ->
  # page for the current voice-assistant phase.
  - id: draw_display
    then:
      - if:
          condition:
            lambda: |-
              return !id(init_in_progress);
          then:
            - if:
                condition:
                  wifi.connected:
                then:
                  - if:
                      condition:
                        api.connected:
                      then:
                        - lambda: |-
                            // Map the current phase to its page; any phase
                            // without a dedicated page falls back to idle.
                            const int phase = id(voice_assistant_phase);
                            if (phase == ${voice_assist_listening_phase_id}) {
                              id(s3_box_lcd).show_page(listening_page);
                            } else if (phase == ${voice_assist_thinking_phase_id}) {
                              id(s3_box_lcd).show_page(thinking_page);
                            } else if (phase == ${voice_assist_replying_phase_id}) {
                              id(s3_box_lcd).show_page(replying_page);
                            } else if (phase == ${voice_assist_error_phase_id}) {
                              id(s3_box_lcd).show_page(error_page);
                            } else if (phase == ${voice_assist_muted_phase_id}) {
                              id(s3_box_lcd).show_page(muted_page);
                            } else if (phase == ${voice_assist_not_ready_phase_id}) {
                              id(s3_box_lcd).show_page(no_ha_page);
                            } else {
                              id(s3_box_lcd).show_page(idle_page);
                            }
                            // The display never refreshes on its own, so
                            // redraw explicitly after switching pages.
                            id(s3_box_lcd).update();
                      else:
                        - display.page.show: no_ha_page
                        - component.update: s3_box_lcd
                else:
                  - display.page.show: no_wifi_page
                  - component.update: s3_box_lcd
          else:
            - display.page.show: initializing_page
            - component.update: s3_box_lcd

# Runtime state shared between the automations; neither value is restored
# across reboots.
globals:
  # True from boot until a voice-assistant client connects or the 30 s boot
  # timeout in `esphome.on_boot` expires.
  - id: init_in_progress
    type: bool
    initial_value: "true"
    restore_value: false
  # Current voice-assistant phase code; starts in the not-ready phase.
  - id: voice_assistant_phase
    type: int
    initial_value: ${voice_assist_not_ready_phase_id}
    restore_value: false
# Full-screen illustrations, all resized to 320x240 and stored as RGB24 with
# transparency enabled.
image:
  # One illustration per voice-assistant phase (files set in substitutions).
  - id: casita_error
    file: ${error_illustration_file}
    type: RGB24
    resize: 320x240
    use_transparency: true
  - id: casita_idle
    file: ${idle_illustration_file}
    type: RGB24
    resize: 320x240
    use_transparency: true
  - id: casita_listening
    file: ${listening_illustration_file}
    type: RGB24
    resize: 320x240
    use_transparency: true
  - id: casita_thinking
    file: ${thinking_illustration_file}
    type: RGB24
    resize: 320x240
    use_transparency: true
  - id: casita_replying
    file: ${replying_illustration_file}
    type: RGB24
    resize: 320x240
    use_transparency: true
  - id: casita_initializing
    file: ${loading_illustration_file}
    type: RGB24
    resize: 320x240
    use_transparency: true
  # Connectivity error screens.
  - id: error_no_wifi
    file: images/tro_ly/error-no-wifi.png
    type: RGB24
    resize: 320x240
    use_transparency: true
  - id: error_no_ha
    file: images/tro_ly/error-no-ha.png
    type: RGB24
    resize: 320x240
    use_transparency: true
# Roboto (Google Fonts) in the sizes and weights used by the display pages;
# every font is limited to the glyphs in `allowed_characters`.
font:
  # Light italic, used for the request text.
  - id: font_request
    size: 15
    glyphs: ${allowed_characters}
    file:
      type: gfonts
      family: Roboto
      weight: 300
      italic: true
  # Light upright, used for the response text.
  - id: font_response
    size: 15
    glyphs: ${allowed_characters}
    file:
      type: gfonts
      family: Roboto
      weight: 300
  # Bold 18 px.
  - id: font_date
    size: 18
    glyphs: ${allowed_characters}
    file:
      type: gfonts
      family: Roboto
      weight: 700
  # Bold 36 px.
  - id: font_time
    size: 36
    glyphs: ${allowed_characters}
    file:
      type: gfonts
      family: Roboto
      weight: 700
# Conversation texts shown on the thinking/replying pages. Each value is
# capped at 45 bytes; longer text is cut and marked with a trailing "...".
# The length check and the cut position use the same limit, so text that
# already fits is left untouched, and the cut is moved back to a UTF-8
# character boundary so multi-byte characters are never split.
text_sensor:
  # Text recognised from the user's speech.
  - id: text_request
    platform: template
    on_value:
      lambda: |-
        const size_t max_len = 45;
        const std::string text = id(text_request).state;
        if (text.length() > max_len) {
          size_t cut = max_len;
          // Bytes of the form 10xxxxxx continue a multi-byte character;
          // step back until the cut lands on the start of a character.
          while (cut > 0 && (static_cast<unsigned char>(text[cut]) & 0xC0) == 0x80) {
            cut--;
          }
          id(text_request).state = text.substr(0, cut) + "...";
        }

  # Text of the assistant's reply.
  - id: text_response
    platform: template
    on_value:
      lambda: |-
        const size_t max_len = 45;
        const std::string text = id(text_response).state;
        if (text.length() > max_len) {
          size_t cut = max_len;
          // Bytes of the form 10xxxxxx continue a multi-byte character;
          // step back until the cut lands on the start of a character.
          while (cut > 0 && (static_cast<unsigned char>(text[cut]) & 0xC0) == 0x80) {
            cut--;
          }
          id(text_response).state = text.substr(0, cut) + "...";
        }
# Page background colours, taken from the substitutions.
color:
  - id: loading_color
    hex: ${loading_illustration_background_color}
  - id: idle_color
    hex: ${idle_illustration_background_color}
  - id: listening_color
    hex: ${listening_illustration_background_color}
  - id: thinking_color
    hex: ${thinking_illustration_background_color}
  - id: replying_color
    hex: ${replying_illustration_background_color}
  - id: error_color
    hex: ${error_illustration_background_color}
  # NOTE(review): despite its name this colour is mostly green (60 % red,
  # 100 % green, 80 % blue), and nothing visible in this file references it.
  - id: my_red
    red: 60%
    green: 100%
    blue: 80%


# SPI bus for the LCD. The panel labels its clock pin SCL and its data pin SDA.
spi:
  mosi_pin: GPIO16
  clk_pin: GPIO17

# ST7789V LCD on the SPI bus, rotated 90 degrees. The display never refreshes
# by itself (`update_interval: never`); the `draw_display` script selects a
# page and triggers each redraw.
display:
  - id: s3_box_lcd
    platform: ili9xxx
    model: st7789v
    # Wiring
    cs_pin: GPIO8
    dc_pin: GPIO18
    reset_pin: GPIO15
    data_rate: 80MHz
    # Panel geometry and colour handling
    dimensions:
      width: 240
      height: 320
    rotation: 90
    color_order: bgr
    invert_colors: false
    update_interval: never
    pages:
      # Phase pages: background colour plus the centred illustration.
      - id: idle_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(idle_color));
          it.image(cx, cy, id(casita_idle), ImageAlign::CENTER);
      - id: listening_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(listening_color));
          it.image(cx, cy, id(casita_listening), ImageAlign::CENTER);
      # Thinking: optionally overlays the recognised request in a box at the top.
      - id: thinking_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(thinking_color));
          it.image(cx, cy, id(casita_thinking), ImageAlign::CENTER);
          if (id(display_conversation).state) {
            // Request box: white fill with a black outline.
            it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
            it.rectangle(20, 20, 280, 30, Color::BLACK);
            it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());
          }
      # Replying: request box at the top, response box at the bottom.
      - id: replying_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(replying_color));
          it.image(cx, cy, id(casita_replying), ImageAlign::CENTER);
          if (id(display_conversation).state) {
            // Draw both boxes first, then the two lines of text.
            it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
            it.rectangle(20, 20, 280, 30, Color::BLACK);
            it.filled_rectangle(20, 190, 280, 30, Color::WHITE);
            it.rectangle(20, 190, 280, 30, Color::BLACK);
            it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());
            it.printf(30, 195, id(font_response), Color::BLACK, "%s", id(text_response).state.c_str());
          }
      - id: error_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(error_color));
          it.image(cx, cy, id(casita_error), ImageAlign::CENTER);
      # Connectivity pages: only the centred image is drawn, no background fill.
      - id: no_ha_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.image(cx, cy, id(error_no_ha), ImageAlign::CENTER);
      - id: no_wifi_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.image(cx, cy, id(error_no_wifi), ImageAlign::CENTER);
      - id: initializing_page
        lambda: |-
          const int cx = it.get_width() / 2;
          const int cy = it.get_height() / 2;
          it.fill(id(loading_color));
          it.image(cx, cy, id(casita_initializing), ImageAlign::CENTER);
      # Muted: plain black screen.
      - id: muted_page
        lambda: |-
          it.fill(Color::BLACK);
2 Likes