Year of the Voice - Chapter 4: Wake words

stuartiannaylor · October 25, 2023, 6:57pm

Tried it on a Pi Zero2 and works well with minimal load.

I couldn’t get the example to run and swapped to a callback with sounddevice code hack here.

# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Imports
import sounddevice as sd
import numpy as np
from openwakeword.model import Model
import argparse
import threading

# Parse input arguments
parser=argparse.ArgumentParser()
parser.add_argument(
    "--chunk_size",
    help="How much audio (in number of samples) to predict on at once",
    type=int,
    default=1280,
    required=False
)
parser.add_argument(
    "--model_path",
    help="The path of a specific model to load",
    type=str,
    default="",
    required=False
)
parser.add_argument(
    "--inference_framework",
    help="The inference framework to use (either 'onnx' or 'tflite'",
    type=str,
    default='tflite',
    required=False
)

args=parser.parse_args()

CHANNELS = 1
RATE = 16000
CHUNK = args.chunk_size
sd.default.samplerate = RATE
sd.default.channels = CHANNELS
sd.default.dtype= ('int16', 'int16')

def sd_callback(rec, frames, time, status):
    audio = np.frombuffer(rec, dtype=np.int16)
    prediction = owwModel.predict(audio)

    # Column titles
    n_spaces = 16
    output_string_header = """
        Model Name         | Score | Wakeword Status
        --------------------------------------
        """

    for mdl in owwModel.prediction_buffer.keys():
        # Add scores in formatted table
        scores = list(owwModel.prediction_buffer[mdl])
        curr_score = format(scores[-1], '.20f').replace("-", "")

        output_string_header += f"""{mdl}{" "*(n_spaces - len(mdl))}   | {curr_score[0:5]} | {"--"+" "*20 if scores[-1] <= 0.5 else "Wakeword Detected!"}
        """

    # Print results table
    print("\033[F"*(4*n_models+1))
    print(output_string_header, "                             ", end='\r')

# Load pre-trained openwakeword models
if args.model_path != "":
    owwModel = Model(wakeword_models=[args.model_path], inference_framework=args.inference_framework)
else:
    owwModel = Model(inference_framework=args.inference_framework)

n_models = len(owwModel.models.keys())

# Start streaming from microphone
with sd.InputStream(channels=CHANNELS,
                    samplerate=RATE,
                    blocksize=int(CHUNK),
                    callback=sd_callback):
    threading.Event().wait()

Just create a /etc/asound.conf


pcm.!default {
    type asym
    playback.pcm "plughw:0"
    capture.pcm  "plughw:0"
}

As that will create a plug device which basically autoconverts to the format you want.
hw:0 is the card and really always create a plughw:device than direct hw:device as it makes things so much easier.

The onnx model also runs but with a bit more load so the default is likely the way to go.