I have built a Python script that uses machine learning on historic device tracker data from InfluxDB (in my case, from my Pixel 8 phone) to predict the time someone will be home. See the code below. I have this working on my PC, but I am unable to get it running on my Raspberry Pi running Home Assistant OS. I thought the easiest way would be to use Pyscript (hence the commented-out @service decorators), but I can't get it to properly import the sklearn package.
What I tried was adding a requirements.txt file containing the line scikit-learn, as described in the docs, but then I get a permission error (also shown below). Does anyone know how I can get this code running?
Error installing package:
Logger: homeassistant.util.package
Source: util/package.py:152
First occurred: 11:02:52 (1 occurrences)
Last logged: 11:02:52
Unable to install package scikit-learn: error: Failed to prepare distributions Caused by: Failed to download and build `scipy==1.14.1` Caused by: Build backend failed to build wheel through `build_wheel` (exit status: 1) [stderr] Traceback (most recent call last): File "<string>", line 11, in <module> File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 1050, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 1103, in build_wheel with _project(config_settings) as project: File "/usr/local/lib/python3.12/contextlib.py", line 137, in __enter__ return next(self.gen) ^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 969, in _project yield Project(source_dir, build_dir, meson_args, editable_verbose) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 662, in __init__ self._meson = _get_meson_command(pyproject_config.get('meson')) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/.tmpfQXRj0/builds-v0/.tmpcCBMg3/lib/python3.12/site-packages/mesonpy/__init__.py", line 1006, in _get_meson_command r = subprocess.run(cmd + ['--version'], text=True, capture_output=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/subprocess.py", line 548, in run with Popen(*popenargs, **kwargs) as process: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/subprocess.py", line 1026, in __init__ self._execute_child(args, executable, preexec_fn, close_fds, File "/usr/local/lib/python3.12/subprocess.py", line 1955, in _execute_child raise child_exception_type(errno_num, err_msg, err_filename) PermissionError: [Errno 13] Permission denied: 'meson'
Python arrival time prediction script:
import requests
import datetime
from typing import Any
import pandas as pd
from influxdb import InfluxDBClient # type: ignore
from sklearn.ensemble import RandomForestRegressor
import pickle
# %% Entity history retrieval
def get_entity_history(
    ip_address: str = "192.168.1.4",
    authorization_token: str = authorization_token,  # Replace with your Home Assistant authorization token
    entity_id: str = "device_tracker.pixel_8",
    start_time: "str | None" = None,
    end_time: "str | None" = None,
    minimal_response: bool = False,
    no_attributes: bool = False,
    significant_changes_only: bool = False,
    timeout: float = 10.0,
) -> pd.DataFrame:
    """
    Retrieve the history of a specific entity from a Home Assistant instance.

    Args:
        ip_address (str): The IP address of the Home Assistant instance.
        authorization_token (str): The authorization token for accessing the Home Assistant API.
        entity_id (str): The ID of the entity to retrieve history for.
        start_time (str | None): The start time for the history period in ISO 8601 format
            ("YYYY-MM-DDTHH:MM:SSZ"). When None, 1 day before the time of the call is used.
        end_time (str | None): The end time for the history period in ISO 8601 format.
            When None, the time of the call is used.
        minimal_response (bool): If True, the response will contain minimal data. Defaults to False.
        no_attributes (bool): If True, the response will not include entity attributes. Defaults to False.
        significant_changes_only (bool): If True, only significant changes will be included in the response. Defaults to False.
        timeout (float): Seconds to wait for the Home Assistant API before giving up. Defaults to 10.

    Returns:
        pd.DataFrame: The history data of the specified entity if the request is successful
        and returned at least one entity, otherwise an empty dataframe.

    Raises:
        requests.exceptions.RequestException: If the request cannot be completed
            (connection failure, or no answer within `timeout` seconds).
    """
    # Resolve the time window at call time. Computing these in the signature
    # would freeze them at import, which gives a stale window in a
    # long-running process.
    now = datetime.datetime.now(datetime.timezone.utc)
    if end_time is None:
        end_time = now.isoformat()
    if start_time is None:
        start_time = (now - datetime.timedelta(days=1)).isoformat()

    url = f"http://{ip_address}:8123/api/history/period/{start_time}"
    headers = {
        "Authorization": f"Bearer {authorization_token}",
        "Content-Type": "application/json",
    }
    params = {
        "filter_entity_id": entity_id,
        "end_time": end_time,
    }
    if minimal_response:
        params["minimal_response"] = "true"
    if no_attributes:
        params["no_attributes"] = "true"
    if significant_changes_only:
        params["significant_changes_only"] = "true"

    # Make the request to the Home Assistant API; without a timeout a stalled
    # server would block the caller indefinitely.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve history: {response.status_code}")
        return pd.DataFrame()

    # The payload is indexed per entity; an empty payload means there is
    # nothing to normalize for the requested window.
    payload = response.json()
    if not payload:
        return pd.DataFrame()
    return pd.json_normalize(payload[0], sep="_")
def get_entity_history_influxdb(
    db_host: str = "192.168.1.4",
    db_port: int = 8086,
    db_username: str = influx_username,  # Replace with your InfluxDB username
    db_password: str = influx_password,  # Replace with your InfluxDB password
    db_name: str = influx_name,  # Database name
    measurement: str = "state",  # Measurement name
    entity_id: str = "pixel_8",  # Entity ID to filter
    start_time: "str | None" = None,
    end_time: "str | None" = None,
) -> pd.DataFrame:
    """
    Retrieve the history of a specific entity from InfluxDB.

    Args:
        db_host (str): The host of the InfluxDB instance.
        db_port (int): The port of the InfluxDB instance.
        db_username (str): The username for accessing InfluxDB.
        db_password (str): The password for accessing InfluxDB.
        db_name (str): The name of the database to query.
        measurement (str): The measurement to query.
        entity_id (str): The ID of the entity to retrieve history for.
        start_time (str | None): The start time for the history period in ISO 8601 format.
            When None, 1 day before the time of the call is used.
        end_time (str | None): The end time for the history period in ISO 8601 format.
            When None, the time of the call is used.

    Returns:
        pd.DataFrame: One row per returned point if the query succeeds,
        otherwise an empty dataframe (the failure is printed).
    """
    # Resolve the time window at call time. Computing these in the signature
    # would freeze them at import, which gives a stale window in a
    # long-running process.
    now = datetime.datetime.now(datetime.timezone.utc)
    if end_time is None:
        end_time = now.isoformat()
    if start_time is None:
        start_time = (now - datetime.timedelta(days=1)).isoformat()

    # Create an InfluxDB client instance
    client = InfluxDBClient(
        host=db_host,
        port=db_port,
        username=db_username,
        password=db_password,
        database=db_name,
    )

    # Create the query using the provided parameters.
    # NOTE(review): the values are interpolated directly into the query text,
    # so they must come from trusted configuration, never from user input.
    query = f"""
    SELECT * FROM "{measurement}"
    WHERE time >= '{start_time}' AND time <= '{end_time}'
    AND "entity_id"='{entity_id}'
    """

    # Execute the query; any failure is reported and degrades to an empty
    # dataframe so callers get a best-effort result.
    try:
        results = client.query(query)  # type: ignore
        data: list[dict[str, Any]] = list(results.get_points())  # type: ignore
        return pd.DataFrame(data)
    except Exception as e:
        print(f"Failed to retrieve history: {e}")
        return pd.DataFrame()
    finally:
        client.close()
# %% Model training and prediction
def create_home_time_prediction_dataset(location_data: pd.DataFrame) -> pd.DataFrame:
    """
    Build the feature/target table used to predict when someone arrives home.

    Args:
        location_data (pd.DataFrame): Device tracker rows with the columns
            "time", "value", "latitude", "longitude", "course" and "speed".

    Returns:
        pd.DataFrame: The rows sorted by time, extended with "hour",
        "day_of_week", "arrival_time" and "time_until_arrival" (minutes).
    """
    frame = pd.DataFrame()

    # Parsed timestamp first, then the columns that are copied over unchanged.
    frame["time"] = pd.to_datetime(location_data["time"])
    for column in ("value", "latitude", "longitude", "course", "speed"):
        # "value" is 1 for 'home', 0 for 'not_home'
        frame[column] = location_data[column]

    # Calendar features derived from the timestamp
    timestamps = frame["time"].dt
    frame["hour"] = timestamps.hour
    frame["day_of_week"] = timestamps.dayofweek

    # Everything below relies on chronological order
    frame = frame.sort_values("time")

    # A row marks an arrival when it is 'home' and the previous row was not;
    # back-filling spreads each arrival time over the rows leading up to it.
    previous_value = frame["value"].shift(1)
    is_arrival = (frame["value"] == 1) & (previous_value == 0)
    frame["arrival_time"] = frame["time"].where(is_arrival).bfill()

    # Minutes left until the next arrival, forced to 0 whenever already home
    minutes_left = (frame["arrival_time"] - frame["time"]).dt.total_seconds() / 60.0
    frame["time_until_arrival"] = minutes_left.where(frame["value"] == 0, 0)
    return frame
def train_and_evaluate_model(
    data: pd.DataFrame, threshold_minutes: float = 10.0
) -> "tuple[RandomForestRegressor, float]":
    """
    Train a RandomForestRegressor model and evaluate its performance.

    Args:
        data (pd.DataFrame): The dataset containing the feature columns
            "hour", "day_of_week", "latitude", "longitude", "course", "speed"
            and the target column "time_until_arrival".
        threshold_minutes (float): A prediction counts as correct when it is
            within this many minutes of the true value. Defaults to 10.

    Returns:
        RandomForestRegressor: The trained model.
        float: The fraction of test predictions within `threshold_minutes`
        of the true time until arrival.

    Raises:
        ValueError: If there are too few rows with a known target to form a
            training set.
    """
    feature_columns = ["hour", "day_of_week", "latitude", "longitude", "course", "speed"]

    # Rows recorded after the last observed arrival have no known target
    # (NaN). The regressor cannot be fitted on NaN targets, so drop them.
    labelled = data.dropna(subset=["time_until_arrival"])

    # Prepare features and target variable
    x = labelled[feature_columns]
    y = labelled["time_until_arrival"]

    # Split the data into training and testing sets without shuffling, so the
    # model is always evaluated on the most recent rows.
    split_index = int(len(x) * 0.8)
    if split_index == 0:
        raise ValueError(
            "Not enough rows with a known time_until_arrival to train the model"
        )
    x_train, x_test = x.iloc[:split_index], x.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    # Create and train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(x_train, y_train)

    # Evaluate the model: share of predictions close enough to the truth
    y_pred = model.predict(x_test)
    within_threshold = (y_test - y_pred).abs() <= threshold_minutes
    accuracy = float(within_threshold.mean())
    return model, accuracy
# @service
def train_model():
    """
    Fetch history, fit the arrival-time model and persist it to disk.

    The device tracker history for "pixel_8" is read from InfluxDB starting
    at 2024-10-17, turned into a training dataset, used to train and evaluate
    a model, and the fitted model is pickled to "pixel_8.pkl".

    Returns:
        model: The trained machine learning model.
        accuracy: The accuracy of the trained model.
    """
    # Historical tracker data -> training dataset
    raw_history = get_entity_history_influxdb(
        entity_id="pixel_8", start_time="2024-10-17T00:00:00Z"
    )
    training_data = create_home_time_prediction_dataset(raw_history)

    # Fit and score the model
    trained = train_and_evaluate_model(training_data)
    model, accuracy = trained

    # Persist the fitted model so predictions can load it later
    with open("pixel_8.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    return model, accuracy
# %% Arrival time prediction
def predict_time_until_arrival(
    hour, day_of_week, latitude, longitude, course, speed, model_name="pixel_8.pkl"
):
    """
    Predict the time until arrival for a single observation.

    Args:
        hour: Hour of the day of the observation.
        day_of_week: Day of the week of the observation.
        latitude: Current latitude.
        longitude: Current longitude.
        course: Current course.
        speed: Current speed.
        model_name: Path of the pickled model to load. Defaults to "pixel_8.pkl".

    Returns:
        Whatever the loaded model's ``predict`` returns for the one-row input.
    """
    feature_names = ["hour", "day_of_week", "latitude", "longitude", "course", "speed"]
    feature_values = [hour, day_of_week, latitude, longitude, course, speed]

    # Load the trained model from disk.
    # NOTE: unpickling runs code from the file, so only load trusted model files.
    with open(model_name, "rb") as handle:
        estimator = pickle.load(handle)

    # The model expects a one-row table with named feature columns
    single_row = pd.DataFrame([feature_values], columns=feature_names)
    return estimator.predict(single_row)
# @service
def get_arrival_time_prediction() -> "tuple[datetime.datetime, float]":
    """
    Predict when the tracked device will arrive home.

    The most recent Home Assistant history row supplies the position, course
    and speed; the current time supplies the hour and day of week.

    Returns:
        datetime.datetime: The predicted arrival time as a naive local time.
        float: The predicted number of minutes until arrival.

    Raises:
        IndexError: If no history rows are available for the last day.
    """
    # Sample the clock once so every derived value refers to the same instant.
    now_utc = datetime.datetime.now(datetime.timezone.utc)

    # Get the most recent history of the Home Assistant entity. The window is
    # passed explicitly so it always reflects the time of this call.
    history = get_entity_history(
        start_time=(now_utc - datetime.timedelta(days=1)).isoformat(),
        end_time=now_utc.isoformat(),
    )
    if history.empty:
        raise IndexError(
            "No recent history available for the tracked entity; "
            "cannot predict the arrival time"
        )
    latest = history.iloc[-1]

    # Extract the relevant data from the entity. Hour and weekday are taken in
    # UTC to match the training features, which are derived from the stored
    # (UTC) timestamps.
    # NOTE(review): assumes the training timestamps are UTC - confirm against
    # the InfluxDB data if a time zone clause is ever added to the query.
    current_hour = now_utc.hour
    current_day_of_week = now_utc.weekday()
    current_latitude = latest["attributes_latitude"]
    current_longitude = latest["attributes_longitude"]
    current_course = latest["attributes_course"]
    current_speed = latest["attributes_speed"]

    # Predict the time until arrival
    predicted_time_until_arrival = predict_time_until_arrival(
        current_hour,
        current_day_of_week,
        current_latitude,
        current_longitude,
        current_course,
        current_speed,
    )
    minutes_until_arrival = predicted_time_until_arrival[0]

    # Calculate the predicted arrival time, reported as naive local time
    now_local = now_utc.astimezone().replace(tzinfo=None)
    predicted_arrival_time = now_local + datetime.timedelta(
        minutes=minutes_until_arrival
    )
    return predicted_arrival_time, minutes_until_arrival
if __name__ == "__main__":
    # Script entry point: retrain first so the prediction below loads a
    # freshly written model file, then report the prediction.
    train_model()
    prediction = get_arrival_time_prediction()
    arrival, minutes_left = prediction
    print(f"Predicted arrival time: {arrival}, in {minutes_left} minutes")