Arrival time prediction

janeau · November 15, 2024, 10:07am

I have built a Python script that uses machine learning on historic device tracker data from InfluxDB (in my case, from my Pixel 8 phone) to predict the time someone will be home. See the code below. I have this working on my PC, but I am unable to get it running on my Raspberry Pi running Home Assistant OS. I thought the easiest way would be to use Pyscript (hence the commented-out @service decorators), but I can’t get it to properly import the sklearn package.

What I tried was adding a requirements.txt file containing `scikit-learn`, as described in the docs, but then I get a permission error (also shown below). Does anyone know how I can get this code running?

Error installing package:

Logger: homeassistant.util.package
Source: util/package.py:152
First occurred: 11:02:52 (1 occurrences)
Last logged: 11:02:52

Unable to install package scikit-learn: error: Failed to prepare distributions Caused by: Failed to download and build `scipy==1.14.1` Caused by: Build backend failed to build wheel through `build_wheel` (exit status: 1) [stderr] Traceback (most recent call last): File "<string>", line 11, in <module> File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 1050, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 1103, in build_wheel with _project(config_settings) as project: File "/usr/local/lib/python3.12/contextlib.py", line 137, in __enter__ return next(self.gen) ^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 969, in _project yield Project(source_dir, build_dir, meson_args, editable_verbose) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 662, in __init__ self._meson = _get_meson_command(pyproject_config.get('meson')) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 1006, in _get_meson_command r = subprocess.run(cmd + ['--version'], text=True, capture_output=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/subprocess.py", line 548, in run with Popen(*popenargs, **kwargs) as process: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/subprocess.py", line 1026, in __init__ self._execute_child(args, executable, preexec_fn, close_fds, File "/usr/local/lib/python3.12/subprocess.py", line 1955, in _execute_child raise child_exception_type(errno_num, err_msg, err_filename) PermissionError: [Errno 13] Permission denied: 'meson'

Python arrival time prediction script:

import requests
import datetime
from typing import Any
import pandas as pd
from influxdb import InfluxDBClient  # type: ignore
from sklearn.ensemble import RandomForestRegressor
import pickle

# %% Entity history retrieval


def get_entity_history(
    ip_address: str = "192.168.1.4",
    authorization_token: str = authorization_token,  # Replace with your Home Assistant authorization token
    entity_id: str = "device_tracker.pixel_8",
    start_time: str | None = None,
    end_time: str | None = None,
    minimal_response: bool = False,
    no_attributes: bool = False,
    significant_changes_only: bool = False,
    timeout: float = 10.0,
) -> pd.DataFrame:
    """
    Retrieve the history of a specific entity from a Home Assistant instance.

    Args:
        ip_address (str): The IP address of the Home Assistant instance.
        authorization_token (str): The authorization token for accessing the Home Assistant API.
        entity_id (str): The ID of the entity to retrieve history for.
        start_time (str | None): The start time for the history period in ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
            When None (the default), 1 day before the time of the call is used.
        end_time (str | None): The end time for the history period in ISO 8601 format.
            When None (the default), the time of the call is used.
        minimal_response (bool): If True, the response will contain minimal data. Defaults to False.
        no_attributes (bool): If True, the response will not include entity attributes. Defaults to False.
        significant_changes_only (bool): If True, only significant changes will be included in the response. Defaults to False.
        timeout (float): Seconds to wait for the Home Assistant API before giving up. Defaults to 10.

    Returns:
        pd.DataFrame: The history data of the specified entity (nested keys flattened with "_"),
        or an empty dataframe if the request fails or no history is returned.

    Raises:
        requests.RequestException: If the request cannot be completed (connection error, timeout).
    """

    # Resolve the default window at call time. Defaults written in the signature
    # are evaluated only once, when the module is imported, which would freeze
    # the window in a long-running process.
    now = datetime.datetime.now(datetime.timezone.utc)
    if end_time is None:
        end_time = now.isoformat()
    if start_time is None:
        start_time = (now - datetime.timedelta(days=1)).isoformat()

    url = f"http://{ip_address}:8123/api/history/period/{start_time}"
    headers = {
        "Authorization": f"Bearer {authorization_token}",
        "Content-Type": "application/json",
    }

    params = {
        "filter_entity_id": entity_id,
        "end_time": end_time,
    }

    # Optional flags are only sent when enabled
    if minimal_response:
        params["minimal_response"] = "true"
    if no_attributes:
        params["no_attributes"] = "true"
    if significant_changes_only:
        params["significant_changes_only"] = "true"

    # Make the request to the Home Assistant API; without a timeout a stalled
    # connection would block the caller indefinitely.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve history: {response.status_code}")
        return pd.DataFrame()

    # The first element of the payload holds the states of the requested entity.
    # An empty payload would otherwise raise IndexError on [0].
    payload = response.json()
    if not payload:
        return pd.DataFrame()
    return pd.json_normalize(payload[0], sep="_")


def get_entity_history_influxdb(
    db_host: str = "192.168.1.4",
    db_port: int = 8086,
    db_username: str = influx_username,  # Replace with your InfluxDB username
    db_password: str = influx_password,  # Replace with your InfluxDB password
    db_name: str = influx_name,  # Database name
    measurement: str = "state",  # Measurement name
    entity_id: str = "pixel_8",  # Entity ID to filter
    start_time: str | None = None,
    end_time: str | None = None,
) -> pd.DataFrame:
    """
    Retrieve the history of a specific entity from InfluxDB.

    Args:
        db_host (str): The host of the InfluxDB instance.
        db_port (int): The port of the InfluxDB instance.
        db_username (str): The username for accessing InfluxDB.
        db_password (str): The password for accessing InfluxDB.
        db_name (str): The name of the database to query.
        measurement (str): The measurement to query.
        entity_id (str): The ID of the entity to retrieve history for.
        start_time (str | None): The start time for the history period in ISO 8601 format.
            When None (the default), 1 day before the time of the call is used.
        end_time (str | None): The end time for the history period in ISO 8601 format.
            When None (the default), the time of the call is used.

    Returns:
        pd.DataFrame: One row per point returned by the query, or an empty
        dataframe if the query fails.
    """

    # Resolve the default window at call time. Defaults written in the signature
    # are evaluated only once, when the module is imported, which would freeze
    # the window in a long-running process.
    now = datetime.datetime.now(datetime.timezone.utc)
    if end_time is None:
        end_time = now.isoformat()
    if start_time is None:
        start_time = (now - datetime.timedelta(days=1)).isoformat()

    # Create an InfluxDB client instance
    client = InfluxDBClient(
        host=db_host,
        port=db_port,
        username=db_username,
        password=db_password,
        database=db_name,
    )

    # Create the query using the provided parameters.
    # NOTE(review): the values are interpolated directly into the query text, so
    # only pass trusted values for measurement, entity_id and the time bounds.
    query = f"""
    SELECT * FROM "{measurement}"
    WHERE time >= '{start_time}' AND time <= '{end_time}' 
    AND "entity_id"='{entity_id}'
    """

    # Execute the query; any failure is reported and mapped to an empty frame
    try:
        results = client.query(query)  # type: ignore
        data: list[dict[str, Any]] = list(results.get_points())  # type: ignore
        return pd.DataFrame(data)

    except Exception as e:
        print(f"Failed to retrieve history: {e}")
        return pd.DataFrame()
    finally:
        # Always release the connection, on success and on failure
        client.close()


# %% Model training and prediction


def create_home_time_prediction_dataset(location_data: pd.DataFrame) -> pd.DataFrame:
    """
    Build the training table for the arrival-time model from device tracker data.

    Args:
        location_data (pd.DataFrame): Tracker samples with the columns "time",
            "value" (1 for 'home', 0 for 'not_home'), "latitude", "longitude",
            "course" and "speed".

    Returns:
        pd.DataFrame: The samples sorted by time, extended with "hour",
        "day_of_week", "arrival_time" (the next away-to-home transition) and
        "time_until_arrival" (minutes until that transition, 0 while home,
        NaN when no later arrival exists in the data).
    """

    # Start from the parsed timestamps, then copy the raw columns across
    frame = pd.DataFrame({"time": pd.to_datetime(location_data["time"])})
    for column in ("value", "latitude", "longitude", "course", "speed"):
        frame[column] = location_data[column]

    # Calendar features derived from the timestamp
    timestamps = frame["time"].dt
    frame["hour"] = timestamps.hour
    frame["day_of_week"] = timestamps.dayofweek

    # Everything below relies on chronological order
    frame = frame.sort_values("time")

    # An arrival is a home sample directly preceded by a not_home sample
    is_home = frame["value"].eq(1)
    was_away = frame["value"].shift(1).eq(0)

    # Propagate each arrival timestamp backwards over the samples leading up to it
    frame["arrival_time"] = frame["time"].where(is_home & was_away).bfill()

    # Minutes remaining until that arrival
    minutes_left = (frame["arrival_time"] - frame["time"]).dt.total_seconds() / 60.0

    # Any sample that is not 'not_home' counts as already arrived
    frame["time_until_arrival"] = minutes_left.mask(frame["value"].ne(0), 0)

    return frame


def train_and_evaluate_model(
    data: pd.DataFrame, threshold_minutes: float = 10
) -> "tuple[RandomForestRegressor, float]":
    """
    Train a RandomForestRegressor model and evaluate its performance.

    Rows whose target ("time_until_arrival") is missing are dropped before
    training: they have no known arrival to learn from and the regressor
    cannot be fitted on NaN targets.

    Args:
        data (pd.DataFrame): The dataset containing the feature columns
            ("hour", "day_of_week", "latitude", "longitude", "course", "speed")
            and the target column "time_until_arrival" (minutes).
        threshold_minutes (float): Maximum absolute error, in minutes, for a
            prediction to count as correct. Defaults to 10.

    Returns:
        RandomForestRegressor: The trained model.
        float: The fraction of test predictions within ``threshold_minutes``
        of the true time until arrival.

    Raises:
        ValueError: If fewer than two rows with a known target are available,
            so that no train/test split is possible.
    """
    feature_columns = ["hour", "day_of_week", "latitude", "longitude", "course", "speed"]
    target_column = "time_until_arrival"

    # Keep only samples with a known target
    usable = data.dropna(subset=[target_column])
    if len(usable) < 2:
        raise ValueError(
            "Need at least 2 rows with a known time_until_arrival to train "
            f"and evaluate the model, got {len(usable)}"
        )

    # Prepare features and target variable
    x = usable[feature_columns]
    y = usable[target_column]

    # Split the data into training and testing sets without shuffling, so the
    # model is evaluated on the most recent 20% of the samples
    split_index = int(len(x) * 0.8)
    x_train, x_test = x.iloc[:split_index], x.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    # Create and train the model (fixed seed for reproducible results)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(x_train, y_train)

    # Make predictions
    y_pred = model.predict(x_test)

    # Share of predictions that land within the allowed error
    within_threshold = (y_test - y_pred).abs() <= threshold_minutes
    accuracy = float(within_threshold.mean())
    return model, accuracy


# @service
def train_model(
    entity_id: str = "pixel_8",
    start_time: str = "2024-10-17T00:00:00Z",
    model_path: str | None = None,
):
    """
    Train a machine learning model to predict the time someone arrives home.

    This function fetches historical data for a specific entity, creates a dataset
    for predicting the time someone arrives home, trains and evaluates a machine
    learning model, and saves the trained model to a file.

    Args:
        entity_id (str): The InfluxDB entity ID whose history is used for training.
        start_time (str): Start of the training window in ISO 8601 format.
        model_path (str | None): File the pickled model is written to. When None
            (the default), "<entity_id>.pkl" is used.

    Returns:
        model: The trained machine learning model.
        accuracy: The accuracy of the trained model.

    Raises:
        ValueError: If no history could be retrieved for the entity.
    """
    if model_path is None:
        model_path = f"{entity_id}.pkl"

    # Fetch the historical data up to the moment of the call. The end time is
    # passed explicitly so the window does not depend on a default that was
    # evaluated when the module was imported.
    history = get_entity_history_influxdb(
        entity_id=entity_id,
        start_time=start_time,
        end_time=datetime.datetime.now(datetime.timezone.utc).isoformat(),
    )
    if history.empty:
        # The retrieval helper returns an empty frame on failure; stop here with
        # a clear message instead of failing later on a missing column.
        raise ValueError(f"No history available for entity '{entity_id}'")

    # Create the dataset for predicting the time someone arrives home
    dataset = create_home_time_prediction_dataset(history)

    # Train and evaluate the model
    model, accuracy = train_and_evaluate_model(dataset)

    # Save the model to a file
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    return model, accuracy


# %% Arrival time prediction


def predict_time_until_arrival(
    hour, day_of_week, latitude, longitude, course, speed, model_name="pixel_8.pkl"
):
    """
    Predict the time until arrival for a single observation.

    Args:
        hour: Hour-of-day feature.
        day_of_week: Day-of-week feature.
        latitude: Latitude feature.
        longitude: Longitude feature.
        course: Course feature.
        speed: Speed feature.
        model_name: Path of the pickled model file to load.

    Returns:
        The result of the model's ``predict`` call for the one-row input.
    """
    feature_names = ["hour", "day_of_week", "latitude", "longitude", "course", "speed"]
    feature_row = [hour, day_of_week, latitude, longitude, course, speed]

    # The model is read from disk on every call.
    # NOTE(review): unpickling executes code from the file, so only load model
    # files you created yourself.
    with open(model_name, "rb") as handle:
        estimator: RandomForestRegressor = pickle.load(handle)

    # One-row frame whose column labels match the feature names
    sample = pd.DataFrame([feature_row], columns=feature_names)
    return estimator.predict(sample)


# @service
def get_arrival_time_prediction():
    """
    Predict when the tracked device will arrive home, based on its latest state.

    Fetches the last day of history for the default Home Assistant entity,
    takes the most recent state and feeds its position attributes, together
    with the current hour and weekday, into the saved model.

    Returns:
        datetime.datetime: The predicted arrival time (naive local time).
        The predicted time until arrival in minutes, as returned by the model.

    Raises:
        ValueError: If no recent history could be retrieved for the entity.
    """
    # Take a single timestamp so that all time-derived values agree with each other
    now_utc = datetime.datetime.now(datetime.timezone.utc)

    # Get the most recent history of the Home Assistant entity. The window is
    # passed explicitly so it is relative to this call, not to module import.
    history = get_entity_history(
        start_time=(now_utc - datetime.timedelta(days=1)).isoformat(),
        end_time=now_utc.isoformat(),
    )
    if history.empty:
        # The retrieval helper returns an empty frame on failure; report that
        # clearly instead of failing on the positional lookup below.
        raise ValueError("No recent history available to base a prediction on")
    latest_state = history.iloc[-1]

    # Extract the relevant data from the entity.
    # NOTE(review): the training features are derived from the InfluxDB "time"
    # column, which is presumably UTC, so hour and weekday are taken in UTC here
    # to match — confirm against the stored data.
    current_hour = now_utc.hour
    current_day_of_week = now_utc.weekday()
    current_latitude = latest_state["attributes_latitude"]
    current_longitude = latest_state["attributes_longitude"]
    current_course = latest_state["attributes_course"]
    current_speed = latest_state["attributes_speed"]

    # Predict the time until arrival; the model returns one value per input row
    minutes_until_arrival = predict_time_until_arrival(
        current_hour,
        current_day_of_week,
        current_latitude,
        current_longitude,
        current_course,
        current_speed,
    )[0]

    # Calculate the predicted arrival time in local time for display
    predicted_arrival_time = datetime.datetime.now() + datetime.timedelta(
        minutes=minutes_until_arrival
    )
    return predicted_arrival_time, minutes_until_arrival


if __name__ == "__main__":
    # Retrain first: this writes the model file that the prediction step loads
    train_model()

    # Predict the arrival from the latest tracker state and report it
    predicted_arrival_time, predicted_time_until_arrival = (
        get_arrival_time_prediction()
    )
    summary = f"Predicted arrival time: {predicted_arrival_time}, in {predicted_time_until_arrival} minutes"
    print(summary)