# ESPHome voice assistant configuration
# Substitution variables, referenced throughout this config as ${...}.
substitutions:
  # HA scripts and entities
  pipeline: "select.voice_assist_pipeline"
  finished_speaking: "select.voice_finished_speaking_detection"
  script_pipeline: "script.voice_assist_loop_pipeline"
  script_finished_speaking: "script.voice_assist_loop_finished"
  script_emoji: "script.voice_assist_emoji"
  script_eesti: "script.voice_assist_eesti"
  script_process_emoji: "script.voice_assist_emoji_process"
  emojigpt: "input_text.emojigpt"
  # display
  # Fallback image shown on the online-image page when no source is set.
  default_url: "http://homeassistant.local:8123/local/yeet.png"
  local_image_size: '128x32'
  local_image_format: 'TRANSPARENT_BINARY'
  local_image_use_transparency: 'true'
  BOARD_TFT_HEIGHT: '32'
  BOARD_TFT_WIDTH: '128'
globals:
  # Phrase currently shown on the idle display page.
  - id: choice
    type: std::string
    restore_value: no
    initial_value: '"Katsu"'
  # Last voice-assistant error message; rendered by the `error` display page.
  - id: voice_error_code
    type: std::string
    restore_value: no
    initial_value: '"No error"'
  # Pool of idle-page phrases (Estonian) — presumably sampled by the
  # `randomize` script referenced in the display's on_page_change; confirm.
  - id: choices
    type: std::vector<std::string>
    restore_value: no
    initial_value: '{"Küsida võib kõike","Katsu mind ja ma värisen","Ma olen ChatGPT"}'
switch:
  # HA-facing switch that enables/disables wake-word detection.
  # Turning it on also starts the continuous voice-assistant loop;
  # turning it off stops the assistant.
  - platform: template
    name: Wake Word
    id: use_wake_word
    optimistic: true
    restore_mode: RESTORE_DEFAULT_ON
    icon: mdi:microphone-settings
    on_turn_on:
      - lambda: |-
          // Reflect the active state in the switch icon.
          const char* icon = "mdi:microphone";
          id(use_wake_word).set_icon(icon);
          id(${device_name}_voice_assistant).set_use_wake_word(true);
      - if:
          condition:
            not:
              - voice_assistant.is_running:
          then:
            - voice_assistant.start_continuous:
      - lambda: |-
          id(${device_name}_top).turn_on().set_rgb(0.00, 0.74, 0.83).set_effect("Alexircle").perform(); // cyan
    on_turn_off:
      - voice_assistant.stop:
      - lambda: |-
          const char* icon = "mdi:microphone-off";
          id(use_wake_word).set_icon(icon);
          id(${device_name}_voice_assistant).set_use_wake_word(false);
      # ${phi} is not defined in the visible substitutions — presumably
      # declared in another file of this config; verify.
      - delay: ${phi}
# Voice assistant lifecycle glue: LED feedback, display pages and HA service
# calls around the Assist pipeline callbacks.
voice_assistant:
  id: ${device_name}_voice_assistant
  microphone: ${device_name}_microphone
  media_player: ${device_name}_media_player
  use_wake_word: true
  noise_suppression_level: 2
  auto_gain: 31dBFS
  volume_multiplier: 2.0
  on_client_connected:
    - lambda: |-
        ESP_LOGD("voice_assistant", "on_client_connected" );
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          lambda: |-
            id(start_voice_assistant)->execute(true);
  on_client_disconnected:
    - lambda: |-
        ESP_LOGD("voice_assistant", "on_client_disconnected" );
    - if:
        condition:
          switch.is_on: use_wake_word
        then:
          - voice_assistant.stop:
  on_wake_word_detected:
    then:
      - lambda: |-
          ESP_LOGD("voice_assistant", "on_wake_word_detected" );
          id(start_voice_assistant)->execute(false);
  on_listening:
    then: # shut up and listen: stop playback so the mic doesn't hear our own audio
      - if:
          condition:
            - media_player.is_playing: ${device_name}_media_player
          then:
            - media_player.stop: ${device_name}_media_player
      - lambda: |-
          ESP_LOGI("voice_assistant", "on_listening" );
      - lambda: |-
          id(${device_name}_top).turn_on().set_rgb(0.00, 0.74, 0.83).set_effect("Alexircle").perform(); // cyan
  on_start:
    then:
      - lambda: |-
          ESP_LOGD("voice_assistant", "on_start" );
          id(backlight).turn_on().set_brightness(1.0).perform();
      - if:
          condition:
            - media_player.is_playing: ${device_name}_media_player
          then:
            - media_player.stop: ${device_name}_media_player
  on_stt_end:
    then:
      - lambda: |-
          ESP_LOGD("voice_assistant", "on_stt_end");
      # `x` is the recognized text; truncated to HA's 255-char state limit.
      - lambda: |-
          std::string str = x.c_str();
          str = str.substr(0, 255);
          id(${device_name}_stt).publish_state(str);
      - display.page.show: stt
      - component.update: ssd1306_display
  on_tts_start:
    then:
      - lambda: |-
          ESP_LOGD("voice_assistant", "on_tts_start" );
      # Route the response text to the emoji sensor when the EmojiGPT
      # pipeline is active, otherwise to the regular TTS sensor.
      - lambda: |-
          std::string str = x.c_str();
          str = str.substr(0, 255);
          if ( "EmojiGPT" == id(assistant).state ){
            id(chatgpt_emoji).publish_state(str);
          }
          else{
            id(${device_name}_tts).publish_state(str);
          }
      - homeassistant.service:
          service: ${script_process_emoji}
      - if:
          condition:
            - lambda: |-
                return id(assistant).state == "EmojiGPT";
          then:
            - display.page.show: chatgpt_picture
          else:
            - display.page.show: tts
      - component.update: ssd1306_display
      - lambda: |-
          id(${device_name}_top).turn_on().set_rgb(0.30, 0.69, 0.31).perform(); // green
  on_end:
    then:
      - lambda: |-
          ESP_LOGD("voice_assistant", "on_end");
          id(${device_name}_top).turn_off().set_brightness(0.0).perform();
      # Let the response audio finish before playing the confirmation sound.
      - wait_until:
          - not:
              - media_player.is_playing: ${device_name}_media_player
      - lambda: |-
          id(play_sound)->execute("voice/alexa_listened");
  on_error:
    then:
      # Flash the bottom LED in an error-specific colour and remember the
      # message for the display's `error` page.
      - lambda: |-
          auto error = code.c_str();
          id(voice_error_code) = message.c_str();
          ESP_LOGE("voice_assistant", "on_error %s %s", error, message.c_str() );
          if (strcasecmp(error, "stt-no-text-recognized") == 0) { // orange
            id(${device_name}_bot).turn_on().set_rgb(1.0, 0.59, 0.0).set_effect("Flash").perform();
          }
          else if (strcasecmp(error, "not-connected") == 0) { // red
            id(${device_name}_bot).turn_on().set_rgb(0.95, 0.21, 0.21).set_effect("Flash").perform();
          }
          else if (strcasecmp(error, "stt-stream-failed") == 0) { // amber
            id(${device_name}_bot).turn_on().set_rgb(1.00, 0.76, 0.03).set_effect("Flash").perform();
          }
          else if (strcasecmp(error, "pipeline-timeout") == 0) { // yellow
            id(${device_name}_bot).turn_on().set_rgb(1.00, 0.92, 0.23).set_effect("Flash").perform();
          }
          else { // purple
            id(${device_name}_bot).turn_on().set_rgb(0.61, 0.14, 0.69).set_effect("Flash").perform();
          }
      - display.page.show: error
      - component.update: ssd1306_display
      - lambda: |-
          id(play_sound)->execute("voice/alexa_error");
      # No wake-word backend available: switch wake word off so we don't loop.
      - lambda: |-
          if (code == "wake-provider-missing" || code == "wake-engine-missing") {
            ESP_LOGW("voice_assistant", "%s", code.c_str() );
            id(use_wake_word).turn_off();
          }
      # Was `$phi_x2`; braced `${phi_x2}` for consistency with the `${...}`
      # substitution style used everywhere else in this file.
      - delay: ${phi_x2}
text_sensor:
  # NOTE(review): this template sensor and the homeassistant sensor below
  # both expose the name "Emoji Response" — HA will see two entities with
  # the same name. Likely one should be renamed or made internal; confirm.
  - platform: template
    lambda: |-
      auto moji = id(emoji_response).state;
      return moji;
    name: Emoji Response
    entity_category: diagnostic
    icon: mdi:emoticon-cool-outline
  # Mirrors the HA input_text holding the EmojiGPT response; switching the
  # display to the emoji page whenever it changes.
  - platform: homeassistant
    entity_id: ${emojigpt}
    id: emoji_response
    name: Emoji Response
    on_value:
      then:
        - display.page.show: chatgpt_picture
        - component.update: ssd1306_display
  # Currently selected Assist pipeline (e.g. "EmojiGPT"), read from HA.
  - platform: homeassistant
    entity_id: ${pipeline}
    id: assistant
    name: (Voice Assist pipeline)
    internal: true
  - platform: homeassistant
    entity_id: ${finished_speaking}
    name: (Voice Finished speaking detection)
    internal: true
  # Last TTS response text, published from voice_assistant on_tts_start.
  - platform: template
    id: ${device_name}_tts
    name: TTS
    icon: mdi:speaker-message
    update_interval: never
  # Last recognized speech, published from voice_assistant on_stt_end.
  - platform: template
    id: ${device_name}_stt
    name: STT
    icon: mdi:microphone-message
    update_interval: never
  # Raw EmojiGPT response before HA processing; device-internal only.
  - platform: template
    name: Emoji
    id: chatgpt_emoji
    internal: true
    update_interval: never
    on_value:
      - display.page.show: chatgpt_picture
      - component.update: ssd1306_display
display: # https://esphome.io/components/display/ssd1306.html
  - platform: ssd1306_i2c
    id: ssd1306_display
    model: "SSD1306 128x32"
    update_interval: never  # pages are refreshed explicitly via component.update
    invert: false
    rotation: 180°
    contrast: 100%
    flip_x: true
    flip_y: true
    address: 0x3C
    pages: # screen size: 128x32
      # Idle page: shows the current `choice` phrase, marquee-scrolling it
      # when it is wider than the screen.
      - id: idle
        lambda: |-
          it.fill(COLOR_OFF);
          int w = it.get_width();
          int h = it.get_height();
          int x = w/2;
          int y = h/2;
          int size = 8;
          static int offset = size;
          std::string text = id(choice);
          int text_width = text.length() * size; // text size / 2 ~= width;
          if (text_width > it.get_width()) {
            it.printf(x - offset, y, id(text_medium), COLOR_ON, TextAlign::CENTER_LEFT, "%s", text.c_str());
            offset += size * 2;
            if (offset > text_width) {
              offset = size;
            }
          } else {
            it.printf(x, y, id(text_large), COLOR_ON, TextAlign::CENTER, "%s", text.c_str());
          }
      # STT page: last recognized speech. NOTE(review): scrolls from x=0
      # while idle/error scroll from mid-screen (x - offset) — presumably
      # intentional, confirm.
      - id: stt
        lambda: |-
          it.fill(COLOR_OFF);
          int w = it.get_width();
          int h = it.get_height();
          int x = w/2;
          int y = h/2;
          int size = 8;
          static int offset = size;
          std::string text = id(${device_name}_stt).state;
          int text_width = text.length() * size; // text size / 2 ~= width;
          if (text_width > it.get_width()) {
            it.printf(0 - offset, y, id(text_medium), COLOR_ON, TextAlign::CENTER_LEFT, "%s", text.c_str());
            offset += size * 2;
            if (offset > text_width) {
              offset = size;
            }
          } else {
            it.printf(x, y, id(text_large), COLOR_ON, TextAlign::CENTER, "%s", text.c_str());
          }
      # TTS page: last assistant response text.
      - id: tts
        lambda: |-
          it.fill(COLOR_OFF);
          int w = it.get_width();
          int h = it.get_height();
          int x = w/2;
          int y = h/2;
          int size = 8;
          static int offset = size;
          std::string text = id(${device_name}_tts).state;
          int text_width = text.length() * size; // text size / 2 ~= width;
          if (text_width > it.get_width()) {
            it.printf(0 - offset, y, id(text_medium), COLOR_ON, TextAlign::CENTER_LEFT, "%s", text.c_str());
            offset += size * 2;
            if (offset > text_width) {
              offset = size;  // was 0; reset to `size` for consistency with the other pages
            }
          } else {
            it.printf(x, y, id(text_large), COLOR_ON, TextAlign::CENTER, "%s", text.c_str());
          }
      # Error page: inverted colours (white background, dark text).
      - id: error
        lambda: |-
          it.fill(COLOR_ON);
          int w = it.get_width();
          int h = it.get_height();
          int x = w/2;
          int y = h/2;
          int size = 8;
          static int offset = size;
          std::string text = id(voice_error_code);
          int text_width = text.length() * size; // text size / 2 ~= width;
          if (text_width > it.get_width()) {
            it.printf(x - offset, y, id(text_medium), COLOR_OFF, TextAlign::CENTER_LEFT, "%s", text.c_str());
            offset += size * 2;
            if (offset > text_width) {
              offset = size;
            }
          } else {
            it.printf(x, y, id(text_large), COLOR_OFF, TextAlign::CENTER, "%s", text.c_str());
          }
      # EmojiGPT page: renders the emoji response centered in the emoji font.
      - id: chatgpt_picture
        lambda: |-
          it.fill(COLOR_OFF);
          int w = it.get_width();
          int h = it.get_height();
          int x = w/2;
          int y = h/2;
          std::string text = id(emoji_response).state;
          it.printf(x, y, id(emojifont), COLOR_ON, TextAlign::CENTER, "%s", text.c_str());
      # Full-screen render of the downloaded online image.
      - id: page_online_image
        lambda: |-
          it.fill(COLOR_OFF);
          it.image(0, 0, id(example_image));
    on_page_change:
      - lambda: |-
          // Free the image buffer whenever we navigate away from the image page.
          if ( to != id(page_online_image)) { // from == id(page_online_image) &&
            ESP_LOGD("online_image","released for not being on page");
            id(example_image)->release();
          }
          if (to == id(page_online_image) ) {
            auto defaulturl = to_string("${default_url}");
            // NOTE(review): `state != defaulturl` is always true when
            // `state == ""` — condition reduces to `state == ""`. If the
            // intent was "reset whenever not the default", `||` was meant;
            // confirm before changing.
            if ( id(online_image_source).state == "" && id(online_image_source).state != defaulturl ){
              auto call = id(online_image_source).make_call();
              call.set_value(defaulturl.c_str());
              call.perform();
            }
          }
          if (to == id(idle) ) {
            id(randomize).execute();
          }
external_components:
  # online_image pulled from open ESPHome PR #4710 (not merged upstream at
  # the time of writing); refreshed daily.
  - source: github://pr#4710
    components: [ online_image ]
    refresh: 1d
# Downloadable image shown on `page_online_image`; the URL defaults to
# ${default_url} and download is triggered manually (update_interval: never).
online_image:
  - url: ${default_url}
    id: example_image
    type: $local_image_format
    use_transparency: $local_image_use_transparency
    # NOTE(review): `useragent` is an option of the PR#4710 component build;
    # verify it still exists if the external component is bumped.
    useragent: ${friendly_name}
    update_interval: never
    on_download_finished:
      - lambda: |-
          id(ssd1306_display).show_page(id(page_online_image));
          id(ssd1306_display).update();
    on_error:
      - lambda: |-
          ESP_LOGW("online_image","could not download");
script:
  # Advance to the next Assist pipeline via the HA helper script.
  # (`interaction` is defined elsewhere in this config — not visible here.)
  - id: pipeline_next
    mode: single
    then:
      - script.execute: interaction
      - homeassistant.service:
          service: ${script_pipeline}
  # Toggle the finished-speaking detection setting via the HA helper script.
  - id: finished_speaking_next
    mode: single
    then:
      - script.execute: interaction
      - homeassistant.service:
          service: ${script_finished_speaking}
  # Start (or toggle off) the voice assistant.
  # continuous: true -> start the continuous wake-word loop,
  #             false -> single interaction with silence detection.
  - id: start_voice_assistant
    parameters:
      continuous: bool
    then:
      - if: # shut up and listen: stop any current playback first
          condition:
            - media_player.is_playing: ${device_name}_media_player
          then:
            - media_player.stop: ${device_name}_media_player
      - if: # toggle voice assistant
          condition: voice_assistant.is_running
          then:
            - voice_assistant.stop:
          else:
            - if:
                condition:
                  lambda: 'return continuous;'
                then:
                  - voice_assistant.start_continuous:
                else:
                  - voice_assistant.start:
                      silence_detection: true
# ----------------------------------------------------------------------------
# Reference notes (not ESPHome config — kept as comments so the file parses).
#
# Home Assistant script (${script_process_emoji}):
#   alias: Voice Emoji Process
#   sequence:
#     - service: conversation.process
#       data:
#         agent_id: xyz
#         text: |-
#           {%- if input is defined -%}
#           {{input}}
#           {%- else -%}
#           {{ states('sensor.voice_tts') }}
#           {%- endif -%}
#         language: et
#       response_variable: conversation_response
#     - service: input_text.set_value
#       data:
#         value: "{{ conversation_response.response.speech.plain.speech }}"
#       target:
#         entity_id: input_text.emojigpt
#
# Voice Assistant prompt:
#   Translate this message into emojis. Only use these emojis in the
#   response: [ … ]. Don't use more than 5 emojis. Do not use words.
#
# Scratch notes:
#   STT: / Emoji: / STT: / ERR:
#   My HA install is broken — it can't upgrade and keeps crashing.
#     Installed version: 2024.2.5
#     Latest version:    2024.3.0
#   Also my OpenAI account needs funds — too much testing.