"Regression analysis is a statistical method used to model the relationship between a dependent variable (e.g., energy consumption) and one or more independent variables (e.g., temperature, time of day, occupancy). It's a powerful tool for predicting future energy consumption based on historical data and understanding the factors that influence it."- Gemini 2024
Can we predict future household energy consumption from historical data?
This project uses the UCI Machine Learning Repository's Individual Household Electric Power Consumption dataset, which contains 2,075,259 measurements of energy use from a single house, recorded at one-minute intervals between December 2006 and November 2010.
Simple linear regression: one feature → one target
y = mx + b
where y is the target, x is the feature, m is the slope, and b is the intercept.
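As a minimal sketch of this idea (on synthetic data, not the household dataset), fitting m and b in Python takes only a couple of lines:

import numpy as np

# Synthetic data: a noisy line with true slope 2.5 and intercept 1.0
rng = np.random.default_rng(42)
x = rng.uniform(0, 10, size=100)
y = 2.5 * x + 1.0 + rng.normal(0, 1, size=100)

# polyfit with degree 1 performs a least-squares fit of y = mx + b
m, b = np.polyfit(x, y, deg=1)
print(f"slope m = {m:.2f}, intercept b = {b:.2f}")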
Multiple linear regression: multiple features → one target
y = b0 + b1x1 + b2x2 + ... + bnxn
Polynomial regression: nonlinear relationships
y = b0 + b1x + b2x² + b3x³
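Both forms reduce to the same fitting machinery. A minimal sketch on synthetic data (not part of the project code) showing how scikit-learn handles the polynomial case by expanding the feature matrix first, then fitting an ordinary linear model:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Synthetic data with two features and a mild quadratic term
rng = np.random.default_rng(0)
X = rng.uniform(0, 5, size=(200, 2))
y = 1.0 + 2.0 * X[:, 0] - 0.5 * X[:, 1] + 0.3 * X[:, 0] ** 2

# Multiple linear regression: y = b0 + b1*x1 + b2*x2
linear = LinearRegression().fit(X, y)
print("linear coefficients:", linear.coef_, "intercept:", linear.intercept_)

# Polynomial regression: expand to squared and interaction terms first
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
poly = LinearRegression().fit(X_poly, y)
print("polynomial R²:", poly.score(X_poly, y))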
P = V × I
where P is active power (in watts), V is voltage (in volts), and I is current intensity (in amperes).
Therefore, if we know the active power and voltage, we can calculate the intensity directly. (Note: if we didn't already know this relationship, we could uncover it by analyzing the data, e.g. by building a correlation matrix.)
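As a quick sanity check (a sketch assuming the column names and units of the UCI file: active power in kilowatts, voltage in volts, intensity in amperes), the current computed from P and V should correlate almost perfectly with the measured Global_intensity:

import pandas as pd

df = pd.read_csv('household_power_consumption.txt', sep=';', na_values=['?']).dropna()
for col in ['Global_active_power', 'Voltage', 'Global_intensity']:
    df[col] = df[col].astype(float)

# I = P / V, converting kilowatts to watts first
# (for an AC circuit this ignores the power factor, so the match is approximate)
i_calculated = df['Global_active_power'] * 1000 / df['Voltage']
print(i_calculated.corr(df['Global_intensity']))  # expected to be very close to 1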
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

basedir = '/content/drive/MyDrive/tonia'

# Magic command
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

def get_data():
    # Load data
    df = pd.read_csv(
        'household_power_consumption.txt',
        sep=';',
        na_values=['?']
    )

    # Preprocess
    df = df.dropna()
    df['datetime'] = pd.to_datetime(
        df['Date'] + ' ' + df['Time'],
        format='%d/%m/%Y %H:%M:%S'
    )
    df = df.drop(['Date', 'Time'], axis=1)
    df['hour'] = df['datetime'].dt.hour
    df['day'] = df['datetime'].dt.dayofweek
    df['month'] = df['datetime'].dt.month
    return df

def analyze_relationships(df):
    # Calculate correlations
    correlations = df[
        ['Global_intensity', 'Global_active_power',
         'Global_reactive_power', 'Voltage']
    ].corr()

    # Plot correlation matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation Matrix of Power Measurements')
    plt.show()

    # Scatter plot of Active Power vs Intensity
    plt.figure(figsize=(10, 6))
    plt.scatter(df['Global_active_power'], df['Global_intensity'], alpha=0.1)
    plt.xlabel('Global Active Power (kW)')
    plt.ylabel('Global Intensity (A)')
    plt.title('Active Power vs Intensity')
    plt.show()

def train_model(X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }

    results = {}
    for name, model in models.items():
        # Train model
        model.fit(X_train_scaled, y_train)

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'model': model,
            'mse': mse,
            'r2': r2,
            'predictions': y_pred,
            'actual': y_test
        }

    return results, scaler, X_test, y_test

def plot_results(results):
    plt.figure(figsize=(15, 10))
    for i, (name, result) in enumerate(results.items(), 1):
        plt.subplot(2, 1, i)
        plt.scatter(result['actual'], result['predictions'], alpha=0.5)
        plt.plot(
            [result['actual'].min(), result['actual'].max()],
            [result['actual'].min(), result['actual'].max()],
            'r--', lw=2
        )
        plt.xlabel('Actual Energy Consumption (Global_intensity)')
        plt.ylabel('Predicted Energy Consumption')
        plt.title(f'{name}\nR² Score: {result["r2"]:.3f}, MSE: {result["mse"]:.3f}')
    plt.tight_layout()
    plt.show()

def feature_importance(results, X):
    # For Random Forest model
    model = results['Random Forest']['model']
    feature_imp = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_imp)
    plt.title('Feature Importance in Energy Consumption Prediction')
    plt.show()
    return feature_imp

def main():
    df = get_data()

    # Optional for feature selection
    # analyze_relationships(df)

    # Set features and target
    features = [
        'hour', 'day', 'month',
        'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'
    ]
    X = df[features]
    y = df['Global_intensity']

    results, scaler, X_test, y_test = train_model(X, y)
    plot_results(results)

    # Analyze feature importance
    importance_df = feature_importance(results, X)
    print("\nFeature Importance:")
    print(importance_df)

    # Example prediction (Global_intensity is measured in amperes, not kW)
    print("\nExample Prediction:")
    sample_data = X_test[:1]
    sample_scaled = scaler.transform(sample_data)
    for name, result in results.items():
        prediction = result['model'].predict(sample_scaled)[0]
        print(f"{name} prediction: {prediction:.2f} A")
        print(f"Actual value: {y_test.iloc[0]:.2f} A")

if __name__ == "__main__":
    main()
Feature Importance:
          feature  importance
5  Sub_metering_3    0.483127
3  Sub_metering_1    0.209181
4  Sub_metering_2    0.162645
0            hour    0.083614
2           month    0.040526
1             day    0.020905

Example Prediction:
Linear Regression prediction: 6.75 A
Actual value: 6.40 A
Random Forest prediction: 6.21 A
Actual value: 6.40 A
Scaling features is crucial in regression analysis, especially when the features span very different numeric ranges.
The StandardScaler performs standardization by:
1. Computing the mean (μ) and standard deviation (σ) of each feature
2. Transforming each value using z = (x - μ) / σ
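A minimal sketch confirming that sklearn's StandardScaler matches the hand-computed formula (note it uses the population standard deviation, ddof = 0):

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 100.0],
              [2.0, 200.0],
              [3.0, 300.0]])

# sklearn's transformation ...
X_scaled = StandardScaler().fit_transform(X)

# ... equals z = (x - μ) / σ applied per column
z = (X - X.mean(axis=0)) / X.std(axis=0)
print(np.allclose(X_scaled, z))  # True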
Without scaling, features with larger numeric ranges would inappropriately dominate the model's learning process. Standardization therefore provides:
Equal Feature Influence: each feature contributes on a comparable scale rather than in proportion to its units.
Algorithm Performance: gradient-based optimizers converge faster on standardized inputs.
Mathematical Stability: it avoids numerical issues caused by very large or very small values.
Model Interpretability: the coefficients of a linear model become directly comparable across features.
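A small illustration of the interpretability point, again on synthetic data: two features with equal true influence but very different ranges get wildly different coefficients until the inputs are standardized:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Two features with equal influence on y but ranges 0-1 and 0-1000
rng = np.random.default_rng(1)
x_small = rng.uniform(0, 1, 500)
x_large = rng.uniform(0, 1000, 500)
y = 3 * x_small + 3 * (x_large / 1000) + rng.normal(0, 0.1, 500)
X = np.column_stack([x_small, x_large])

# Unscaled: coefficients differ by three orders of magnitude (~3 vs ~0.003)
print(LinearRegression().fit(X, y).coef_)

# Scaled: the coefficients become directly comparable (roughly equal)
X_std = StandardScaler().fit_transform(X)
print(LinearRegression().fit(X_std, y).coef_)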