Is there a way to stream audio from one ESPHome to another?

I’m thinking about building a voice doorbell using ESPHome with an I2S microphone and an I2S speaker. What I’d like to achieve is to send my voice from an ESPHome microphone inside the house to another ESPHome device running in my porch light.

I’d appreciate it if anyone who has done this before is willing to share their experience!

I’d also love to know this, for similar applications!

There’s not a whole lot of info about the microphone component in esphome to see what can be done with it.

I have been waiting for years for this option.
Look here:

microphone:
  - platform: ...
    on_data:
      - logger.log:
          format: "Received %d bytes"
          args: ['x.size()']

It seems that the audio is in the “x” variable, but I don’t know what type of data it is. Waveform? Binary?
We could try to create a service in Home Assistant that, when x changes state, sends its state to another ESP media player, and see what happens.

I have looked into the Home Assistant source code for the voice assistant, but I’m not a coding expert. It seems that the voice assistant starts a UDP server for receiving audio. I have written a custom UDP server and it seems to be receiving UDP packets, so maybe it could be useful for receiving the audio stream.

The nightmare is the voice assistant in ESPHome; I don’t understand anything about how it works.

Anyway

Here is the UDP receiver source code:
/config/custom_components/audio_receiver/manifest.json

{
    "domain": "audio_receiver",
    "name": "Audio Receiver",
    "version": "1.0",
    "documentation": "",
    "dependencies": [],
    "codeowners": [],
    "requirements": []
}

/config/custom_components/audio_receiver/__init__.py

import asyncio
import logging
import socket
from homeassistant.const import EVENT_HOMEASSISTANT_STOP

DOMAIN = "audio_receiver"

_logger = logging.getLogger(__name__)

class UDPAudioReceiver:
    def __init__(self, hass, host, port):
        self.hass = hass
        self.host = host
        self.port = port

    async def start(self):
        loop = asyncio.get_running_loop()
        self.transport, self.protocol = await loop.create_datagram_endpoint(
            lambda: UDPProtocol(self.hass),
            local_addr=(self.host, self.port)
        )
        self.hass.bus.async_listen_once(EVENT_HOMEASSISTANT_STOP, self.stop)

    async def stop(self, event):
        self.transport.close()

class UDPProtocol(asyncio.DatagramProtocol):
    def __init__(self, hass):
        self.hass = hass

    def datagram_received(self, data, addr):
        _logger.info(f"Data received: {data}")

async def async_setup(hass, config):
    host = "0.0.0.0"  # Replace with your configuration
    port = 12345  # Replace with your configuration
    receiver = UDPAudioReceiver(hass, host, port)
    hass.loop.create_task(receiver.start())
    return True

/config/custom_components/audio_receiver/config_flow.py

import voluptuous as vol
from homeassistant import config_entries
from homeassistant.core import callback
from . import DOMAIN

class AudioReceiverFlowHandler(config_entries.ConfigFlow, domain=DOMAIN):
    VERSION = 1
    CONNECTION_CLASS = config_entries.CONN_CLASS_LOCAL_PUSH

    @staticmethod
    @callback
    def async_get_options_flow(config_entry):
        return OptionsFlowHandler(config_entry)

    async def async_step_user(self, user_input=None):
        if user_input is not None:
            return self.async_create_entry(title="Audio Receiver", data=user_input)

        return self.async_show_form(
            step_id="user",
            data_schema=vol.Schema(
                {
                    vol.Required("host", default="0.0.0.0"): str,
                    vol.Required("port", default=12345): int,
                }
            ),
        )

class OptionsFlowHandler(config_entries.OptionsFlow):
    def __init__(self, config_entry):
        self.config_entry = config_entry

    async def async_step_init(self, user_input=None):
        if user_input is not None:
            return self.async_create_entry(title="", data=user_input)

        return self.async_show_form(
            step_id="init",
            data_schema=vol.Schema(
                {
                    vol.Required("host", default=self.config_entry.options.get("host", "0.0.0.0")): str,
                    vol.Required("port", default=self.config_entry.options.get("port", 12345)): int,
                }
            ),
        )

and configuration.yaml

audio_receiver:
  host: "0.0.0.0"
  port: 12345
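
A quick way to check that the component is actually listening is to fire a single datagram at it from any machine on the network (a throwaway test script, assuming 192.168.1.10 is the Home Assistant host):

import socket

# send one UDP datagram to the audio_receiver port, then check the HA log
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.sendto(b"hello audio_receiver", ("192.168.1.10", 12345))
sock.close()

If everything is wired up, a "Data received: ..." line should show up in the Home Assistant log.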

I am trying this approach to grab and send audio from the microphone section in ESPHome:

microphone:
  - platform: i2s_audio
    i2s_audio_id: i2s_in
    id: mic
    adc_type: external
    i2s_din_pin: GPIO23
    pdm: false
    on_data:
      - lambda: |-
          for (uint8_t byte : x) {
            id(audio_buffer).push_back(byte);
          }
          if (id(audio_buffer).size() >= 512) {
            int sock = ::socket(AF_INET, SOCK_DGRAM, 0);
            struct sockaddr_in destination;
            destination.sin_family = AF_INET;
            destination.sin_port = htons(12345);  //  UDP receiver port
            destination.sin_addr.s_addr = inet_addr("192.168.1.10");  //  UDP receiver IP

            ::sendto(sock, id(audio_buffer).data(), id(audio_buffer).size(), 0, reinterpret_cast<sockaddr*>(&destination), sizeof(destination));
            ::close(sock);
            id(audio_buffer).clear();
          }
globals:
  - id: is_capturing
    type: bool
    restore_value: no
    initial_value: "false"
  - id: audio_buffer
    type: std::vector<int16_t>
    restore_value: no
    initial_value: 'std::vector<int16_t>()'
  - id: sequence_number
    type: uint32_t
    restore_value: no
    initial_value: '0'

binary_sensor:
  - platform: esp32_touch
    pin: GPIO4
    threshold: 1000
    name: Action
    on_press:
      then:
        if:
          condition:
            lambda: "return !id(is_capturing);"
          then:
            - globals.set:
                id: is_capturing
                value: "true"
            - microphone.capture: mic
            - delay: 5s
            - globals.set:
                id: is_capturing
                value: "false"
            - microphone.stop_capture: mic

button:
  - platform: template
    name: "Cattura"
    on_press:
      - microphone.capture: mic
      - delay: 5s
      - microphone.stop_capture: mic
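
Before adding the receiver side, it can also help to confirm the datagrams are actually reaching the Home Assistant host, for example with tcpdump (assuming it is installed there; adjust the port if you changed it):

sudo tcpdump -i any -X udp port 12345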

And receiver section:
/homeassistant/custom_components/audio_receiver/__init__.py

import asyncio
import logging
import wave
import os
from collections import deque
from datetime import datetime
from homeassistant.const import EVENT_HOMEASSISTANT_STOP

DOMAIN = "audio_receiver"

_logger = logging.getLogger(__name__)

class UDPAudioReceiver:
    def __init__(self, hass, host, port, save_path):
        self.hass = hass
        self.host = host
        self.port = port
        self.save_path = save_path
        self.buffer = deque()
        self.timeout_handle = None

    async def start(self):
        loop = asyncio.get_running_loop()
        self.transport, _ = await loop.create_datagram_endpoint(
            lambda: UDPProtocol(self),
            local_addr=(self.host, self.port)
        )
        _logger.info(f"UDP audio receiver started on {self.host}:{self.port}")
        self.hass.bus.async_listen_once(EVENT_HOMEASSISTANT_STOP, self.stop)

    async def stop(self, event):
        self.transport.close()
        _logger.info("UDP audio receiver stopped")
        self.save_as_wav()

    def save_as_wav(self):
        if self.buffer:
            timestamp = datetime.now().strftime("%H.%M")
            file_path = os.path.join(self.save_path, f"audio-{timestamp}.wav")
            _logger.info("Timeout reached, saving data...")
            with wave.open(file_path, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)
                wav_file.setframerate(44100)
                while self.buffer:
                    wav_file.writeframes(self.buffer.popleft())
            _logger.info(f"Audio data saved to {file_path}")
        else:
            _logger.info("Timeout reached, but no data to save.")

class UDPProtocol(asyncio.DatagramProtocol):
    def __init__(self, receiver):
        self.receiver = receiver

    def datagram_received(self, data, addr):
        _logger.info(f"Data received from {addr}")
        self.receiver.buffer.append(data)
        if self.receiver.timeout_handle:
            self.receiver.timeout_handle.cancel()
        self.receiver.timeout_handle = asyncio.get_event_loop().call_later(10, self.receiver.save_as_wav)

async def async_setup(hass, config):
    host = config[DOMAIN].get('host', '0.0.0.0')
    port = config[DOMAIN].get('port', 12345)
    save_path = config[DOMAIN].get('save_path', '/media/audio')
    receiver = UDPAudioReceiver(hass, host, port, save_path)
    hass.loop.create_task(receiver.start())
    return True

And config_flow.py

import voluptuous as vol
from homeassistant import config_entries
from homeassistant.core import callback
from . import DOMAIN

class AudioReceiverFlowHandler(config_entries.ConfigFlow, domain=DOMAIN):
    VERSION = 1
    CONNECTION_CLASS = config_entries.CONN_CLASS_LOCAL_PUSH

    @staticmethod
    @callback
    def async_get_options_flow(config_entry):
        return OptionsFlowHandler(config_entry)

    async def async_step_user(self, user_input=None):
        if user_input is not None:
            return self.async_create_entry(title="Audio Receiver", data=user_input)

        return self.async_show_form(
            step_id="user",
            data_schema=vol.Schema(
                {
                    vol.Required("host", default="0.0.0.0"): str,
                    vol.Required("port", default=12345): int,
                    vol.Required("save_path", default="/media/audio"): str,
                }
            ),
        )

class OptionsFlowHandler(config_entries.OptionsFlow):
    def __init__(self, config_entry):
        self.config_entry = config_entry

    async def async_step_init(self, user_input=None):
        if user_input is not None:
            return self.async_create_entry(title="", data=user_input)

        return self.async_show_form(
            step_id="init",
            data_schema=vol.Schema(
                {
                    vol.Required("host", default=self.config_entry.options.get("host", "0.0.0.0")): str,
                    vol.Required("port", default=self.config_entry.options.get("port", 12345)): int,
                    vol.Required("save_path", default=self.config_entry.options.get("save_path", "/media/audio")): str,
                }
            ),
        )

configuration.yaml

audio_receiver:
  host: "0.0.0.0"
  port: 12345
  save_path: "/media/audio" #folder audio has to be created.

I start the capture either via the button or via the touch pin. I can see the UDP packets being sent and received, and after some time of inactivity the receiver saves the buffer to a WAV file. The file is created, but I can’t hear anything of relevance inside it :frowning: The capture time is 5 seconds, but I only receive roughly a 1-second file with some noise.
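
A sanity check worth doing is to inspect the header of whatever lands in /media/audio, for example with SoX (hypothetical file name):

soxi /media/audio/audio-12.00.wav

Note that save_as_wav() writes a 44100 Hz header while the ESPHome I2S microphone defaults to 16 kHz, so unless the two are matched the file plays back sped up - which would also help explain a 5-second capture sounding like roughly 1 second.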

Oh neat. This is at least a good start. I’m not too familiar with ESPHome programming but let me try to read the doc. Thanks!

If the goal is to transmit audio from an ESP in a doorbell to a receiver indoors, why would you even use ESPHome? There are several transmitter/receiver projects already available online that you could use.

I made it. Partially.
There was an update, only documented in the ESPHome changelog, that x now carries 16-bit samples.
I also had just noise until I changed that. Now I get chopped but clear voice.

I test with VLC at “udp://@:12345” and the options “:network-caching=1000 :demux=rawaud :rawaud-channels=1 :rawaud-samplerate=16000”.
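
For reference, the full command line should be roughly this (untested, assuming the dialog options map one-to-one onto CLI switches):

vlc udp://@:12345 --network-caching=1000 --demux=rawaud --rawaud-channels=1 --rawaud-samplerate=16000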

I’ve got the same setup running, VLC and an INMP441 hooked up to an Olimex ESP32-C3, but I’m only getting garbled noise. This is the YAML file:

substitutions:
  display_name: record

esphome:
  name: ${display_name}
  name_add_mac_suffix: true  
  platformio_options:
    board_build.mcu: esp32c3
    board_build.variant: esp32c3  
  includes:
    # should contain single line: #include <esp_task_wdt.h>
    - wdt_include.h
    # should contain #include <sys/socket.h>
    # and            #include <netinet/in.h>
    - std_includes.h 
  on_boot:
    then:
      - lambda: !lambda |-
          // increase watchdog timeout
          esp_task_wdt_init(90, false);

esp32:
  variant: ESP32C3
  board: esp32dev
  framework:
    type: esp-idf
    sdkconfig_options:
      CONFIG_BT_BLE_50_FEATURES_SUPPORTED: y
      CONFIG_BT_BLE_42_FEATURES_SUPPORTED: y
      CONFIG_COMPILER_OPTIMIZATION_PERF: y      
      CONFIG_ESP_TASK_WDT_TIMEOUT_S: "90"


status_led:  
  pin: 
    number: GPIO8
    inverted: true

logger: 


wifi:
  ssid: !secret wifi_ssid
  password: !secret wifi_password
  fast_connect: true
  reboot_timeout: 1h  

  ap:
    ssid: !secret ap_ssid
    password: !secret ap_pass


web_server:
  auth:
    username: !secret web_username
    password: !secret web_password


i2s_audio:
  id: i2s_in
  i2s_lrclk_pin: 5 # ws
  i2s_bclk_pin: 6 # sck

microphone:
  - platform: i2s_audio
    i2s_audio_id: i2s_in
    id: mic
    adc_type: external
    i2s_din_pin: 4 # sd
    # bits_per_sample: 32bit
    # sample_rate: 32000
    pdm: false
    channel: right
    on_data:
      - lambda: |-
          for (int16_t byte : x) {
            id(audio_buffer).push_back(byte);
          }
          if (id(audio_buffer).size() >= 512) {
            int sock = ::socket(AF_INET, SOCK_DGRAM, 0);
            struct sockaddr_in destination;
            destination.sin_family = AF_INET;
            destination.sin_port = htons(12345);  //  UDP receiver port
            destination.sin_addr.s_addr = inet_addr("192.168.2.10");  //  UDP receiver IP

            ::sendto(sock, id(audio_buffer).data(), id(audio_buffer).size(), 0, reinterpret_cast<sockaddr*>(&destination), sizeof(destination));
            ::close(sock);
            id(audio_buffer).clear();
          }
globals:
  - id: is_capturing
    type: bool
    restore_value: no
    initial_value: "false"
  - id: audio_buffer
    type: std::vector<int16_t>
    restore_value: no
    initial_value: 'std::vector<int16_t>()'
  - id: sequence_number
    type: uint32_t
    restore_value: no
    initial_value: '0'


button:
  - platform: template
    name: "Record"
    on_press:
      - microphone.capture: mic
      - delay: 5s
      - microphone.stop_capture: mic

The changelog mentions the uint8_t change here: ESPHome 2023.6.0 - 21st June 2023 — ESPHome

Do you maybe have any idea why I keep getting garbled noise? I’ve tried 2 different microphones, they do seem to work with this project: GitHub - stas-sl/esphome-sound-level-meter

That component, however, uses its own I2S code and does some bit shifting:

i2s:
  bck_pin: 4
  ws_pin: 5
  din_pin: 6
  sample_rate: 48000            # default: 48000
  bits_per_sample: 32           # default: 32
  dma_buf_count: 8              # default: 8
  dma_buf_len: 256              # default: 256
  use_apll: true                # default: false

  # right shift samples.
  # for example if mic has 24 bit resolution, and
  # i2s configured as 32 bits, then audio data will be aligned left (MSB)
  # and LSB will be padded with zeros, so you might want to shift them right by 8 bits
  bits_shift: 8                 # default: 0

I solved my issue thanks to this genius: https://www.reddit.com/r/Esphome/comments/14f5mdf/i2s_sound_sampling_rate_anomalies/

id(audio_buffer).size() is not the right value to pass: it is the number of elements, not the size in bytes. Multiplying by 2 (“ *2”) solved my issue! With this I was able to send a clear audio stream to my previously posted VLC settings.
As you can see, I also made some of the variables global so they are not set up again on every loop.

esphome:
  name: "${name}"
  on_boot:
  - priority: 210.0  # before MQTT
    then:
    - lambda: |-
        id(destination).sin_family = AF_INET;
        id(destination).sin_port = htons(12345);  // UDP receiver port
        id(destination).sin_addr.s_addr = inet_addr("192.168.XX.XX");  // UDP receiver IP

globals:
  - id: mqtt_mic_active
    type: unsigned long
    initial_value: '0'
  - id: audio_buffer
    type: std::vector<int16_t>
    restore_value: no
    initial_value: 'std::vector<int16_t>()'
  - id: sock
    type: int
    restore_value: no
  - id: destination
    type: "struct sockaddr_in"
    restore_value: no

i2s_audio:
  id: i2s_in
  i2s_lrclk_pin: GPIO26  # WS
  i2s_bclk_pin: GPIO25   # SCK
microphone:
  - platform: i2s_audio
    i2s_audio_id: i2s_in
    id: inmp441_mic
    adc_type: external
    i2s_din_pin: GPIO33  # SD
    pdm: false
    use_apll: false
    bits_per_sample: 32bit  # scaled down to 16 bit
    sample_rate: 16000
    channel: right
        # if the L/R pin (4) is low the left channel is active, otherwise the right channel,
        # but it seems to be swapped in ESPHome...
        # -> right = low
    on_data:
      # The on_data trigger (and the internal callback) for the microphone now provides std::vector<int16_t>
      - lambda: |-
          for (uint16_t byte : x) {
            id(audio_buffer).push_back(byte);
          }
          if(id(audio_buffer).size() >= 256) {
            id(sock) = ::socket(AF_INET, SOCK_DGRAM, 0);
            ::sendto(id(sock), id(audio_buffer).data(), id(audio_buffer).size() *2, 0, reinterpret_cast<sockaddr*>(&id(destination)), sizeof(id(destination)));
            ::close(id(sock));
            id(audio_buffer).clear();
          }

Hopefully someone else finds this useful.

Thank you for this! My mic needs the left channel, but otherwise the choppiness issue mentioned earlier is resolved.

This did result in a pull request because the on_capturing condition for the microphone does not work:

Cheers

I tested the code on an ESP32, but the output is all noise. The ESPHome version is 2024.9.

If anyone else stumbles on this and doesn’t have VLC, I had some success getting a (somewhat distorted) stream using MPV with the above esphome code:

mpv --no-resume-playback udp://0.0.0.0:12345 -v --demuxer=rawaudio --demuxer-rawaudio-channels=1 --demuxer-rawaudio-rate=16000 --demuxer-rawaudio-format=s16be
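
If it still sounds distorted, it may be worth trying s16le instead of s16be, since the ESP32 sends little-endian samples (an untested guess):

mpv --no-resume-playback udp://0.0.0.0:12345 -v --demuxer=rawaudio --demuxer-rawaudio-channels=1 --demuxer-rawaudio-rate=16000 --demuxer-rawaudio-format=s16le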

Works great, thanks a lot for posting this!

By the way, the ugly globals can be replaced with static variables inside the lambda, which makes the code cleaner:

    ...
    on_data:
      - lambda: |-
          static std::vector<int16_t> audio_buffer;
          static struct sockaddr_in  destination = {
            .sin_family = AF_INET,
            .sin_port = htons(12345),
            .sin_addr = { .s_addr = inet_addr("192.168.X.X") }
          };

          for (uint16_t byte : x) {
            audio_buffer.push_back(byte);
          }
          if(audio_buffer.size() >= 256) {
            int sock = ::socket(AF_INET, SOCK_DGRAM, 0);
            ::sendto(sock, audio_buffer.data(), audio_buffer.size() *2, 0, reinterpret_cast<sockaddr*>(&destination), sizeof(destination));
            ::close(sock);
            audio_buffer.clear();
          }

There is quite a lot of overhead in the underlying network stack (lwIP) when creating Berkeley sockets, which is not something you want to go through for every block of samples.

It would be better to create the socket once, use connect() to fix its remote endpoint (address, port tuple) and save the file descriptor (the return value from socket()) in a static. Subsequent data events can then just call send().

If the destination is always going to be a fixed address designated by its IP address, then you can use a local static as follows.

...
    on_data:
      - lambda: |-
          static int sock = -1;

          if (sock < 0) {
            sock = ::socket(AF_INET, SOCK_DGRAM, 0);
            if (sock >= 0) {
              static const struct sockaddr_in destination = {
                .sin_family = AF_INET,
                .sin_port = htons(12345),
                .sin_addr = { .s_addr = inet_addr("192.168.X.X") }
              };
              if (::connect(sock, reinterpret_cast<const struct sockaddr *>(&destination), sizeof(destination)) != 0) {
                (void) ::close(sock);
                sock = -1;
              }
            }
          }

          if (sock >= 0) {
            static std::vector<int16_t> audio_buffer;

            for (const auto sample : x) {
              audio_buffer.push_back(sample);
            }
            if (audio_buffer.size() >= 256) {
              (void) ::send(sock, audio_buffer.data(), audio_buffer.size() * sizeof(int16_t), 0);
              audio_buffer.clear();
            }
          }

If, on the other hand, the destination can change at runtime or is going to be given as a domain name and looked up using getaddrinfo() (getaddrinfo(3) - Linux manual page), then store the socket FD in a global: and have a lambda script do the getaddrinfo(), socket(), connect() calls whenever the network state or the source domain name (e.g. from a template text: component) changes. Remember to close() any existing socket before creating a new one, otherwise you’ll have a memory leak and eventually run out of heap RAM - causing a reboot. I’ll leave that as an exercise for the reader. I’ve added another post below with an example package.

Note that for UDP sockets, connect() doesn’t actually do any network transfers itself, it merely stores the default destination address in the socket data structure for send() to use later.

Hello, can anybody provide a full YAML file for an ESP32-S3 16N8?
I need a config that streams the data via UDP.

Create a subdirectory of esphome called components and create a file in it called net_headers.h with the following.

/**
 * Header to include for ESPHome lambdas to interact with the network.
 */

#ifndef _UTM_COMPONENTS_NET_HEADERS_H_
#define _UTM_COMPONENTS_NET_HEADERS_H_

#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>

#endif // ndef _UTM_COMPONENTS_NET_HEADERS_H_

Now create a subdirectory of esphome called packages and create a file in there called basic-udp-microphone.yaml with the following.

##
#
# @file
#
# Streaming audio from a device with a mono 16 kHz 32-bit PDM I2S microphone.
# Change substitutions before including package to adapt GPIO assignments.
#

substitutions:
  i2s_lrclk_pin: '40'
  i2s_din_pin: '41'

globals:
- id: sock_fd
  type: int
  initial_value: '-1'

esphome:
  project:
    name: under-the-mountain.basic-udp-microphone
    version: 1.0.0
  includes:
  - components/net_headers.h
  on_boot:
  - priority: -200
    then:
    - script.execute: update_server

wifi:
  on_connect:
    then:
    - script.execute: update_server
  on_disconnect:
    then:
    - script.execute: update_server

i2s_audio:
- id: i2saudio
  i2s_lrclk_pin:
    number: ${i2s_lrclk_pin}

microphone:
- platform: i2s_audio
  id: mic
  i2s_din_pin: ${i2s_din_pin}
  pdm: true
  sample_rate: 16000
  bits_per_sample: 32bit
  adc_type: external
  on_data:
    then:
    - lambda: |-
        #pragma GCC diagnostic push
        #pragma GCC diagnostic error "-Wall"
        static std::vector<uint16_t> buffer;
        static const size_t BLOCK_SIZE = 256;
        if (id(sock_fd) < 0) {
          buffer.clear();
          buffer.shrink_to_fit();
        } else {
          buffer.reserve(BLOCK_SIZE);
          for (const uint16_t sample : x) {
            buffer.push_back(sample);
            if (buffer.size() >= BLOCK_SIZE) {
              (void) ::send(id(sock_fd), buffer.data(), buffer.size() * sizeof(uint16_t), 0);
              buffer.clear();
              buffer.reserve(BLOCK_SIZE);
            }
          }
        }
        #pragma GCC diagnostic pop

switch:
- platform: template
  name: Stream Audio
  restore_mode: ALWAYS_OFF
  icon: mdi:record-rec
  lambda: "return id(mic).is_running();"
  turn_on_action:
    then:
    - script.execute: update_server
    - if:
        condition:
          lambda: 'return id(sock_fd) >= 0;'
        then:
        - microphone.capture:
  turn_off_action:
    then:
    - microphone.stop_capture:

text:
- platform: template
  name: UDP Audio Target
  id: server
  icon: mdi:ip-outline
  optimistic: true
  mode: text
  restore_value: true
  entity_category: config
  on_value:
    then:
    - script.execute: update_server

text_sensor:
- platform: template
  name: UDP Audio Target
  id: server_socket
  icon: mdi:ip
  update_interval: never
  entity_category: diagnostic

script:
- id: update_server
  mode: queued
  then:
  - lambda: |-
      #pragma GCC diagnostic push
      #pragma GCC diagnostic error "-Wall"
      static std::string prev_svr_txt;
      std::string svr_txt = id(server).state;

      while (! svr_txt.empty() && isspace(svr_txt.back())) {
        svr_txt.resize(svr_txt.size() - 1);
      }
      while (! svr_txt.empty() && isspace(svr_txt.front())) {
        svr_txt = svr_txt.substr(1);
      }

      if (svr_txt.empty() || ! id(wlan).is_connected()) {
        id(mic).stop();
        id(server_socket).publish_state(id(wlan).is_connected() ? "none" : "offline");
        prev_svr_txt.clear();
        if (id(sock_fd) >= 0) {
          (void) ::close(id(sock_fd));
          id(sock_fd) = -1;
        }
      } else {
        const auto colon_idx = svr_txt.find(':');

        if (colon_idx == svr_txt.npos) {
          prev_svr_txt = svr_txt;
          id(server_socket).publish_state("colon-missing");
        } else {
          std::string a_txt = svr_txt.substr(0, colon_idx);
          std::string p_txt = svr_txt.substr(colon_idx + 1);

          while (! a_txt.empty() && isspace(a_txt.back())) {
            a_txt.resize(a_txt.size() - 1);
          }
          while (! p_txt.empty() && isspace(p_txt.front())) {
            p_txt = p_txt.substr(1);
          }
          svr_txt = a_txt + ":" + p_txt;

          if (prev_svr_txt != svr_txt || id(sock_fd) < 0) {
            prev_svr_txt = svr_txt;
            id(mic).stop();
            if (id(sock_fd) >= 0) {
              (void) ::close(id(sock_fd));
              id(sock_fd) = -1;
            }

            if (a_txt.empty()) {
              id(server_socket).publish_state("address-missing");
            } else if (p_txt.empty()) {
              id(server_socket).publish_state("port-missing");
            } else {
              #pragma GCC diagnostic push
              #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
              static const struct addrinfo HINTS = {
                .ai_family = AF_INET,
                .ai_socktype = SOCK_DGRAM,
              };
              #pragma GCC diagnostic pop

              struct addrinfo * addresses = nullptr;

              if (getaddrinfo(a_txt.c_str(), p_txt.c_str(), &HINTS, &addresses) != 0) {
                id(server_socket).publish_state("lookup-fail");
              } else if (addresses == nullptr) {
                id(server_socket).publish_state("no-addresses");
              } else {
                bool ipv4_found = false;

                for (const struct addrinfo * address = addresses; address != nullptr; address = address->ai_next) {
                  if (address->ai_family == AF_INET) {
                    ipv4_found = true;

                    const int fd = ::socket(address->ai_family, SOCK_DGRAM, 0);

                    if (fd >= 0) {
                      if (::connect(fd, address->ai_addr, address->ai_addrlen) != 0) {
                        (void) ::close(fd);
                      } else {
                        const auto in4_sock_addr = reinterpret_cast<const struct sockaddr_in *>(address->ai_addr);
                        const auto ip_addr = ntohl(in4_sock_addr->sin_addr.s_addr);
                        char txt[] = "255.255.255.255:65535";
                        std::snprintf(txt, sizeof(txt), "%u.%u.%u.%u:%u",
                          (unsigned) ((ip_addr >> 24) & 0xFFU),
                          (unsigned) ((ip_addr >> 16) & 0xFFU),
                          (unsigned) ((ip_addr >>  8) & 0xFFU),
                          (unsigned) ((ip_addr >>  0) & 0xFFU),
                          (unsigned) ntohs(in4_sock_addr->sin_port)
                        );
                        id(server_socket).publish_state(txt);
                        id(sock_fd) = fd;
                        break;
                      }
                    }
                  }
                }

                freeaddrinfo(addresses);

                if (id(sock_fd) < 0) {
                  id(server_socket).publish_state(ipv4_found ? "socket/connect-fail" : "no-ipv4-address");
                }
              }
            }
          }
        }
      }
      #pragma GCC diagnostic pop

Now modify your device configuration file in the esphome directory to include the following, adapting as necessary.

##
#
# @file
#
# Very basic example configuration for S3 device.

substitutions:
  i2s_lrclk_pin: '40'
  i2s_din_pin: '41'

packages:
  udp_microphone: !include packages/basic-udp-microphone.yaml

esphome:
  name: my-esp32
  friendly_name: My ESP32
  name_add_mac_suffix: true
  
esp32:
  board: esp32-s3-devkitc-1
  framework:
    type: arduino

api:

ota:
- platform: esphome

wifi:
  id: wlan
  domain: ""
  networks:
  - ssid: !secret wifi_ssid
    password: !secret wifi_password

mdns:
  disabled: false

Make sure you change the substitutions in your file to match the pins used by the I2S microphone on your board. You might need to change the pdm:, sample_rate:, bits_per_sample: and/or adc_type: under microphone: in the package to match your microphone hardware as well. See the ESPHome and your board’s documentation for that.

You could just merge the package contents into your main device configuration file if you don’t want to use packages and replace the substitutions with the pin numbers directly in the microphone: component.

This gives you a device page in Home Assistant with a switch to start/stop streaming, a text field to enter an IP address or domain name and port to which to stream (e.g. some-device.lan:1234 or 192.168.10.10:2345) and a text sensor that gives you the actual IP address & port to which that resolves or an error message if it doesn’t.

Hey Spamfast, thanks a lot, I got it compiled with these substitutions for my I2S config. The voice assistant was working on this board before.

  i2s_lrclk_pin: GPIO18 #WS
  i2s_din_pin: GPIO4
  i2s_bclk_pin: GPIO2    #SCK

I am wondering why there is no bclk in your settings.

In the settings of the ESPHome device I put in the IP of my desktop where VLC is running and added the port 12345.

I hope that’s correct?

I also got VLC opening “some” stream:

udp://@:12345
network-caching=1000 :demux=rawaud :rawaud-channels=1 :rawaud-samplerate=16000

It shows me seconds passing, but I can’t hear anything.

Any idea?

Hi. Glad you got it to build. I’m not an expert on I2S but my understanding is that many I2S microphones only need one clock signal. If you have other I2S devices (a speaker for example) sharing the bus, they might need other clocks. Regarding your silence problems, maybe capture the UDP datagrams to a file instead of with VLC (programs like socat & netcat can do that) and take a look with a hex editor or dumper - if it’s all zeros then there’s something wrong with the I2S config.
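
For example (assuming socat or a BSD-style netcat on the receiving machine; flags differ slightly between netcat flavours):

socat -u udp-recv:12345 - > capture.raw
# or
nc -u -l 12345 > capture.raw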

You could also add ESP_LOGI("your-tag-name", "format-string", args...); to your on_data event handler e.g. to print out the first few samples of each block. Again, if it’s showing all zeros you’ve got a problem with the I2S setup. (Take a look at std::printf, std::fprintf, std::sprintf, std::snprintf - cppreference.com for what to put in the format string.)
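
Something along these lines inside the on_data lambda would do it (just a sketch; "mic-debug" is an arbitrary tag):

    on_data:
      - lambda: |-
          if (x.size() >= 4) {
            // log the first four samples - if they are always zero, suspect the I2S wiring/config
            ESP_LOGI("mic-debug", "%u samples, first: %d %d %d %d",
                     (unsigned) x.size(), x[0], x[1], x[2], x[3]);
          }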

The code shown works with an M5Stack Atom Echo by the way, which has an SPM1423 PDM mic. You need to look at the details of the mic on your device and make sure the YAML settings are correct for it.

I’m not familiar with VLC’s options but make sure it’s got the sample size set to 16-bit (2-byte) and check the endianness (big or little) matches the ESP32-S3’s endianness. You might have to tweak whether the samples are signed or unsigned - anything over 8-bit is usually signed but you never know.

I have written a Python function as well which can receive to a raw binary file or WAV or, if installed, play via pyaudio.

from pathlib import Path
from socketserver import BaseRequestHandler, UDPServer
from threading import Thread
from time import sleep, monotonic_ns
from wave import open as wave_open

try:
    from pyaudio import PyAudio
except ModuleNotFoundError:
    PyAudio = None

STREAM_CHANNELS = 1
STREAM_WIDTH = 2
STREAM_RATE = 16000

def server(port, file=None, *, max_length=None, timeout=None,
                          feedback=True, daemon=False):
    """Receive data on a UDP port and record to file or play as audio.

    Arguments:
        port       - port number on which to listen
        file       - file to which to write; if ending in '.wav' will
                     record as audio samples; if None will play audio
        max_length - if not None, stop after this number of seconds
                     from receipt of the first datagram
        timeout    - if not None, once a datagram has been received,
                     close the file and return if datagrams don't
                     arrive faster than this period in seconds
        feedback   - if true, print a period on standard output for
                     each 4kibytes received & diagnostics at shutdown
        daemon     - if true, re-raise keyboard exception on exit
    """
    wv = False
    if file is not None:
        file = Path(file)
        wv = file.suffix.lower() == '.wav'

    activity_timestamp_ns = None
    start_timestamp_ns = None
    count = 0
    exception = None
    max_length_ns = None if max_length is None \
                                        else max_length * 1000000000
    timeout_ns = None if timeout is None else timeout * 1000000000
    needs_starting = False

    class Handler(BaseRequestHandler):
        def handle(self):
            nonlocal activity_timestamp_ns, start_timestamp_ns
            nonlocal count, needs_starting
            if wv:
                fh.writeframesraw(self.request[0])
            else:
                if needs_starting:
                    needs_starting = False
                    fh.start_stream()
                fh.write(self.request[0])
            previous_count = count
            count += len(self.request[0])
            if feedback and previous_count // 4096 != count // 4096:
                print('.', end='', flush=True)
            activity_timestamp_ns = monotonic_ns()
            if start_timestamp_ns is None:
                start_timestamp_ns = activity_timestamp_ns
    def read_stream():
        nonlocal exception
        with UDPServer(('0.0.0.0', int(port)), Handler) as server:
            thread = Thread(target=server.serve_forever)
            thread.start()
            try:
                while True:
                    sleep(1)
                    now_ns = monotonic_ns()
                    if timeout_ns is not None and \
                       activity_timestamp_ns is not None and \
                       now_ns - activity_timestamp_ns > timeout_ns:
                        break
                    if max_length_ns is not None and \
                       start_timestamp_ns is not None and \
                       now_ns - start_timestamp_ns > max_length_ns:
                        break
            except KeyboardInterrupt as e:
                exception = e
            if feedback:
                diagnostic = ' & removing empty file' if \
                                activity_timestamp_ns is None else ''
                print(f'\nshutting down{diagnostic}', flush=True)
            server.shutdown()
            thread.join()

    if file is not None:
        with (
            wave_open(str(file), 'wb') if wv else open(file, 'wb')
        ) as fh:
            if wv:
                fh.setnchannels(STREAM_CHANNELS)
                fh.setsampwidth(STREAM_WIDTH)
                fh.setframerate(STREAM_RATE)
            read_stream()
        if activity_timestamp_ns is None:
            file.unlink(missing_ok=True)
    else:
        if PyAudio:
            pya = PyAudio()
        else:
            raise ModuleNotFoundError(
                            'Install pyaudio for realtime streaming')
        needs_starting = True
        fh = pya.open(STREAM_RATE, STREAM_CHANNELS,
            pya.get_format_from_width(STREAM_WIDTH), output=True,
            start=not needs_starting
        )
        read_stream()
        fh.stop_stream()
        fh.close()
        pya.terminate()

    if exception and daemon:
        raise exception
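
It can be called, for example, like this (hypothetical file name; the port must match the ESPHome side):

# record to a WAV file, stopping after 10 s without datagrams or 60 s total
server(12345, 'doorbell.wav', max_length=60, timeout=10)

# or, with pyaudio installed, play the incoming stream live
server(12345)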