ARIMA model fitting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
# Generate synthetic hourly energy load data
np.random.seed(0)
hours = pd.date_range(start='2024-01-01', periods=365*24, freq='H')
energy_load = pd.Series(1000 + 50 * np.sin(2 * np.pi * hours.dayofyear / 365)
                        + np.random.normal(0, 100, size=len(hours)), index=hours)  # annual cycle plus noise, stored as a Series so .values below is well-defined
# Plot the synthetic energy load data
plt.figure(figsize=(12, 6))
plt.plot(hours, energy_load, label='Energy Load Data')
plt.xlabel('Date')
plt.ylabel('Energy Load')
plt.title('Hourly Energy Load Profile')
plt.legend()
plt.grid(True)
plt.show()
# ARIMA model fitting
model = ARIMA(energy_load.values, order=(3, 1, 1)) # Use .values to get the NumPy array instead of the Pandas Series
model_fit = model.fit()
# Forecasting next 24 hours
forecast = model_fit.forecast(steps=24)
# Plot ARIMA forecast
forecast_index = pd.date_range(start=hours[-1] + pd.Timedelta(hours=1), periods=24, freq='H')
plt.figure(figsize=(12, 6))
plt.plot(hours[-24:], energy_load[-24:], label='Observed')
plt.plot(forecast_index, forecast, color='red', linestyle='--', label='Forecast')
plt.xlabel('Date')
plt.ylabel('Energy Load')
plt.title('ARIMA Forecast of Energy Load')
plt.legend()
plt.grid(True)
plt.show()
The same pipeline can be rerun with a different non-seasonal order to compare fits, for example:
model = ARIMA(energy_load.values, order=(2, 1, 2))
model_fit = model.fit()
forecast = model_fit.forecast(steps=24)
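Neither order is privileged a priori; one hedged way to choose among candidates is a small AIC grid search (a sketch reusing the series defined above, not part of the original listing):
import itertools
best_order, best_aic = None, np.inf
for p, q in itertools.product(range(4), range(4)):
    try:
        aic = ARIMA(energy_load.values, order=(p, 1, q)).fit().aic
    except Exception:
        continue  # some candidate orders may fail to converge; skip them
    if aic < best_aic:
        best_order, best_aic = (p, 1, q), aic
print('Best order by AIC:', best_order, 'AIC:', round(best_aic, 1))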
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
# Generate synthetic hourly energy load data with more complex patterns
np.random.seed(0)
hours = pd.date_range(start='2024-01-01', periods=365*24, freq='H')
trend = np.linspace(1000, 1500, len(hours))
seasonal = 200 * np.sin(2 * np.pi * hours.dayofyear / 365) + 100 * np.sin(2 * np.pi * hours.hour / 24)
noise = np.random.normal(0, 50, size=len(hours))
energy_load = pd.Series(trend + seasonal + noise, index=hours)
# Plot the synthetic energy load data
plt.figure(figsize=(12, 6))
plt.plot(energy_load.index, energy_load.values, label='Energy Load Data')
plt.xlabel('Date')
plt.ylabel('Energy Load')
plt.title('Hourly Energy Load Profile')
plt.legend()
plt.grid(True)
plt.show()
# Check for stationarity
result = adfuller(energy_load.values)
print('ADF Statistic:', result[0])
print('p-value:', result[1])
# Since p-value > 0.05, the series is non-stationary. Let's difference it.
energy_load_diff = energy_load.diff().dropna()
# Check stationarity of differenced series
result = adfuller(energy_load_diff.values)
print('\nAfter differencing:')
print('ADF Statistic:', result[0])
print('p-value:', result[1])
# Manually set ARIMA order
# (p, d, q) = (1, 1, 1) for non-seasonal component
# (P, D, Q, s) = (1, 1, 1, 24) for seasonal component (assuming daily seasonality)
order = (1, 1, 1)
seasonal_order = (1, 1, 1, 24)
# Fit the ARIMA model
model = ARIMA(energy_load.values, order=order, seasonal_order=seasonal_order)
model_fit = model.fit()
print(model_fit.summary())
# Forecast next 7 days (168 hours)
forecast = model_fit.forecast(steps=168)
# Plot ARIMA forecast
forecast_index = pd.date_range(start=hours[-1] + pd.Timedelta(hours=1), periods=168, freq='H')
plt.figure(figsize=(12, 6))
plt.plot(energy_load.index[-168:], energy_load.values[-168:], label='Observed')
plt.plot(forecast_index, forecast, color='red', linestyle='--', label='Forecast')
plt.xlabel('Date')
plt.ylabel('Energy Load')
plt.title('ARIMA Forecast of Energy Load')
plt.legend()
plt.grid(True)
plt.show()
ADF Statistic: -1.40456275122879
p-value: 0.5800867339396595

After differencing:
ADF Statistic: -23.219514538809655
p-value: 0.0

                                     SARIMAX Results
========================================================================================
Dep. Variable:                                y   No. Observations:                 8760
Model:             ARIMA(1, 1, 1)x(1, 1, 1, 24)   Log Likelihood              -46589.377
Date:                          Mon, 15 Jul 2024   AIC                          93188.753
Time:                                  05:50:59   BIC                          93224.129
Sample:                                       0   HQIC                         93200.809
                                         - 8760
Covariance Type:                            opg
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0076      0.011      0.691      0.490      -0.014       0.029
ma.L1         -0.9793      0.002   -403.064      0.000      -0.984      -0.975
ar.S.L24      -0.0082      0.011     -0.751      0.453      -0.030       0.013
ma.S.L24      -0.9854      0.003   -393.175      0.000      -0.990      -0.980
sigma2      2487.6497     37.619     66.127      0.000    2413.917    2561.382
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                 0.96
Prob(Q):                              1.00   Prob(JB):                         0.62
Heteroskedasticity (H):               1.03   Skew:                             0.03
Prob(H) (two-sided):                  0.48   Kurtosis:                         3.01
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
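Before trusting a hand-picked order, it helps to glance at the correlograms of the differenced series; a small diagnostic sketch (not part of the original listing):
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
fig, axes = plt.subplots(2, 1, figsize=(12, 6))
plot_acf(energy_load_diff, lags=48, ax=axes[0])   # MA hints; a spike near lag 24 flags daily seasonality
plot_pacf(energy_load_diff, lags=48, ax=axes[1])  # AR hints
plt.tight_layout()
plt.show()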
Wavelet analysis
import numpy as np
import pandas as pd
import pywt
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
# Generate sample data
def generate_load(hour, day):
    base_load = 500 + 300 * np.sin(np.pi * hour / 12)  # Daily cycle
    weekly_variation = 100 * np.sin(np.pi * day / 7)   # Weekly cycle
    random_variation = np.random.normal(0, 50)         # Random noise
    return base_load + weekly_variation + random_variation
# Create a datetime index for one month of hourly data
start_date = datetime(2024, 1, 1)
dates = [start_date + timedelta(hours=i) for i in range(24 * 30)] # 30 days
# Generate load data
loads = [generate_load(d.hour, d.weekday()) for d in dates]
# Create DataFrame
df = pd.DataFrame({'load': loads}, index=dates)
# Plot the generated data
plt.figure(figsize=(12, 6))
plt.plot(df.index, df['load'])
plt.title('Generated Hourly Energy Load Profile')
plt.xlabel('Date')
plt.ylabel('Load (kW)')
plt.show()
# Perform Wavelet Analysis
data = df['load'].values
time = np.arange(len(data))
# Choose wavelet function and scales
wavelet = 'morl' # Morlet wavelet
scales = np.arange(1, 128)
# Perform CWT
coefficients, frequencies = pywt.cwt(data, scales, wavelet)
# Visualize the results
plt.figure(figsize=(12, 8))
plt.imshow(np.abs(coefficients), extent=[0, len(data), 1, 128], cmap='jet', aspect='auto',
           origin='lower', vmin=0, vmax=np.abs(coefficients).max())  # origin='lower' keeps scale 1 at the bottom; magnitudes are non-negative, so vmin=0
plt.colorbar(label='Magnitude')
plt.ylabel('Scale')
plt.xlabel('Time (hours)')
plt.title('Wavelet Transform of Hourly Energy Load Profile')
plt.show()
# Calculate and plot the global wavelet spectrum
global_ws = np.sum(np.abs(coefficients)**2, axis=1) / len(data)
plt.figure(figsize=(10, 6))
plt.plot(scales, global_ws)
plt.xlabel('Scale')
plt.ylabel('Global Wavelet Spectrum')
plt.title('Global Wavelet Spectrum')
plt.show()
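To read cycle lengths in hours rather than abstract scale units, the scales can be converted to periods; a small follow-up sketch (not in the original code):
freqs = pywt.scale2frequency(wavelet, scales)  # normalized frequency, cycles per sample (= per hour here)
periods = 1.0 / freqs                          # period in hours for each scale
dominant = periods[np.argmax(global_ws)]       # scale carrying the most energy in the global spectrum
print(f'Dominant period: {dominant:.1f} hours')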
Fourier series analysis and Fourier transform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.fft import fft, fftfreq
from datetime import datetime, timedelta
# Generate sample data
def generate_load(hour, day):
    base_load = 500 + 300 * np.sin(np.pi * hour / 12)  # Daily cycle
    weekly_variation = 100 * np.sin(np.pi * day / 7)   # Weekly cycle
    random_variation = np.random.normal(0, 50)         # Random noise
    return base_load + weekly_variation + random_variation
# Create a datetime index for one month of hourly data
start_date = datetime(2024, 1, 1)
dates = [start_date + timedelta(hours=i) for i in range(24 * 30)] # 30 days
# Generate load data
loads = [generate_load(d.hour, d.weekday()) for d in dates]
# Create DataFrame
df = pd.DataFrame({'load': loads}, index=dates)
# Plot the generated data
plt.figure(figsize=(12, 6))
plt.plot(df.index, df['load'])
plt.title('Generated Hourly Energy Load Profile')
plt.xlabel('Date')
plt.ylabel('Load (kW)')
plt.show()
# Fourier Series Analysis
def fourier_series(x, n_terms):
    a0 = np.mean(x)
    coeffs = [a0]
    for n in range(1, n_terms + 1):
        an = 2 * np.mean(x * np.cos(2 * np.pi * n * np.arange(len(x)) / len(x)))
        bn = 2 * np.mean(x * np.sin(2 * np.pi * n * np.arange(len(x)) / len(x)))
        coeffs.extend([an, bn])
    return np.array(coeffs)
# Calculate Fourier series coefficients
n_terms = 10
coeffs = fourier_series(df['load'].values, n_terms)
# Reconstruct the signal using Fourier series
t = np.arange(len(df))
reconstructed = coeffs[0] + sum(coeffs[2*i-1] * np.cos(2*np.pi*i*t/len(df)) +
coeffs[2*i] * np.sin(2*np.pi*i*t/len(df))
for i in range(1, n_terms+1))
# Plot original and reconstructed signals
plt.figure(figsize=(12, 6))
plt.plot(df.index, df['load'], label='Original')
plt.plot(df.index, reconstructed, label=f'Reconstructed (n={n_terms})')
plt.title('Fourier Series Reconstruction')
plt.xlabel('Date')
plt.ylabel('Load (kW)')
plt.legend()
plt.show()
# Fourier Transform
N = len(df)
T = 1.0 / 24 # Sample spacing in days (one sample per hour), so FFT frequencies come out in cycles per day
yf = fft(df['load'].values)
xf = fftfreq(N, T)[:N//2]
# Plot the frequency spectrum
plt.figure(figsize=(12, 6))
plt.plot(xf, 2.0/N * np.abs(yf[0:N//2]))
plt.title('Frequency Spectrum (Fourier Transform)')
plt.xlabel('Frequency (cycles per day)')
plt.ylabel('Magnitude')
plt.xlim(0, 7) # Limit x-axis to 7 cycles per day for better visibility
plt.show()
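The dominant peaks can also be extracted programmatically instead of read off the plot; a short follow-up sketch reusing N, xf, and yf from above:
magnitudes = 2.0 / N * np.abs(yf[:N // 2])
top = np.argsort(magnitudes[1:])[::-1][:3] + 1  # three largest peaks, skipping the DC component
for idx in top:
    print(f'{xf[idx]:.2f} cycles/day -> period {24 / xf[idx]:.1f} hours, magnitude {magnitudes[idx]:.1f}')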
Zhihu academic consulting:
https://www.zhihu.com/consult/people/792359672131756032?isMe=1
Reviewer for journals including 《Mechanical System and Signal Processing》 and 《中国电机工程学报》 (Proceedings of the CSEE). Areas of expertise: signal filtering/denoising, machine learning/deep learning, time series analysis/forecasting, and equipment fault diagnosis/defect detection/anomaly detection.
Bitcoin price prediction with time series analysis methods (dense neural networks, convolutional networks, LSTM networks, etc.; Python, ipynb file)
Full code:
https://mbd.pub/o/bread/mbd-ZpmVmZls
Health-assessment prediction method based on multi-scale ensemble extreme learning machines in MATLAB (MATLAB R2018A)
Full code:
https://mbd.pub/o/bread/mbd-Zp2YlJty
Multi-step water quality prediction model based on Savitzky-Golay filtering and a Transformer-optimized network (Python)
Full code:
https://mbd.pub/o/bread/mbd-ZJyZl5xr
Forex and stock price prediction based on Transformer and time embeddings (Python, ipynb file)
Full code:
https://mbd.pub/o/bread/mbd-ZpyXmptq
Time series analysis and forecasting based on machine learning and deep learning (Python)
Main code:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from ARX_Model import arx
import statsmodels.api as sm
from AR_Model import ar_model
import matplotlib.pyplot as plt
from ARIMA_Model import arima_model
from Plot_Models import plot_models
from Least_Squares import lest_squares
from Normalize_Regression import normalize_regression
from Sequences_Data import sequences_data
from Test_Stationary import test_stationary
from Auto_Correlation import auto_correlation
from Linear_Regression import linear_regression
from Xgboost_Regression import xgboost_regression
from keras import models, layers
from Random_Forest_Regression import random_forest_regression
from Tree_Decision_Regression import tree_decision_regression
# ======================================== Step 1: Load Data ==================================================
os.system('cls')  # clear the console (Windows; use 'clear' on Linux/macOS)
data = sm.datasets.sunspots.load_pandas() # df = pd.read_csv('monthly_milk_production.csv'), df.info(), X = df["Value"].values
data = data.data["SUNACTIVITY"]
# print('Shape of data \t', data.shape)
# print('Original Dataset:\n', data.head())
# print('Values:\n', data)
# ================================ Step 2.1: Normalize Data (0-1) ================================================
#data, normalize_modele = normalize_regression(data, type_normalize='MinMaxScaler', display_figure='on') # Type_Normalize: 'MinMaxScaler', 'normalize'
# ================================ Step 2.2: Check Stationary Time Series ========================================
#data = test_stationary(data, window=20)
# ==================================== Step 3: Find the lags of AR and etc models ==============================
#auto_correlation(data, nLags=10)
# =========================== Step 4: Split Dataset intro Train and Test =======================================
nLags = 3
num_sample = 300
mu = 0.000001
Data_Lags = pd.DataFrame(np.zeros((len(data), nLags)))
for i in range(0, nLags):
    Data_Lags[i] = data.shift(i + 1)
Data_Lags = Data_Lags[nLags:]
data = data[nLags:]
Data_Lags.index = np.arange(0, len(Data_Lags), 1, dtype=int)
data.index = np.arange(0, len(data), 1, dtype=int)
train_size = int(len(data) * 0.8)
# ================================= Step 5: Autoregressive and Automated Methods ===============================
sns.set(style='white')
fig, axs = plt.subplots(nrows=4, ncols=1, sharey='row', figsize=(16, 10))
plot_models(data, [], [], axs, nLags, train_size, num_sample=num_sample, type_model='Actual_Data')
# ------------------------------------------- Least Squares ---------------------------------------------------
lest_squares(data, Data_Lags, train_size, axs, num_sample=num_sample)
# -------------------------------------------- Auto-Regressive (AR) model --------------------------------------
ar_model(data, train_size, axs, n_lags=nLags, num_sample=num_sample)
# ------------------------------------------------ ARX --------------------------------------------------------
arx(data, Data_Lags, train_size, axs, mu=mu, num_sample=num_sample)
# ----------------------------- Auto-Regressive Integrated Moving Averages (ARIMA) -----------------------------
arima_model(data, train_size, axs, order=(5, 1, (1, 1, 1, 1)), seasonal_order=(0, 0, 2, 12), num_sample=num_sample)
# ======================================= Step 5: Machine Learning Models ======================================
# ------------------------------------------- Linear Regression Model -----------------------------------------
linear_regression(data, Data_Lags, train_size, axs, num_sample=num_sample)
# ------------------------------------------ RandomForestRegressor Model ---------------------------------------
random_forest_regression(data, Data_Lags, train_size, axs, n_estimators=100, max_features=nLags, num_sample=num_sample)
# -------------------------------------------- Decision Tree Model ---------------------------------------------
tree_decision_regression(data, Data_Lags, train_size, axs, max_depth=2, num_sample=num_sample)
# ---------------------------------------------- xgboost -------------------------------------------------------
xgboost_regression(data, Data_Lags, train_size, axs, n_estimators=1000, num_sample=num_sample)
# ----------------------------------------------- LSTM model --------------------------------------------------
train_x, train_y = sequences_data(np.array(data[:train_size]), nLags) # Convert to a time series dimension:[samples, nLags, n_features]
test_x, test_y = sequences_data(np.array(data[train_size:]), nLags)
mod = models.Sequential() # Build the model
# mod.add(layers.ConvLSTM2D(filters=64, kernel_size=(1, 1), activation='relu', input_shape=(None, nLags))) # ConvLSTM2D
# mod.add(layers.Flatten())
mod.add(layers.LSTM(units=100, activation='tanh', input_shape=(None, nLags)))
mod.add(layers.Dropout(rate=0.2))
# mod.add(layers.LSTM(units=100, activation='tanh')) # Stacked LSTM
# mod.add(layers.Bidirectional(layers.LSTM(units=100, activation='tanh'), input_shape=(None, 1))) # Bidirectional LSTM: forward and backward
mod.add(layers.Dense(32))
mod.add(layers.Dense(1)) # A Dense layer of 1 node is added in order to predict the label(Prediction of the next value)
mod.compile(optimizer='adam', loss='mse')
mod.fit(train_x, train_y, validation_data=(test_x, test_y), verbose=2, epochs=100)
y_train_pred = pd.Series(mod.predict(train_x).ravel())
y_test_pred = pd.Series(mod.predict(test_x).ravel())
y_train_pred.index = np.arange(nLags, len(y_train_pred)+nLags, 1, dtype=int)
y_test_pred.index = np.arange(train_size + nLags, len(data), 1, dtype=int)
plot_models(data, y_train_pred, y_test_pred, axs, nLags, train_size, num_sample=num_sample, type_model='LSTM')
# data_train = normalize.inverse_transform((np.array(data_train)).reshape(-1, 1))
mod.summary()
plt.tight_layout()
plt.subplots_adjust(wspace=0, hspace=0.2)
plt.show()
Full code:
https://mbd.pub/o/bread/mbd-ZpmWl5hx
Electricity consumption forecasting based on machine learning (Python)
Tools and Libraries
The following libraries and tools were used in this project:
- Pandas: For data manipulation and analysis.
- Matplotlib: For data visualization.
- Seaborn: For enhanced data visualization.
- Numpy: For numerical operations.
- XGBoost: For implementing the gradient boosting model.
- Scikit-learn: For model evaluation and time series cross-validation.
Project Workflow
1. Data Loading and Visualization
- Load the dataset containing PJME hourly energy usage.
- Plot the time series data to visualize trends and patterns.
2. Outlier Analysis and Removal
- Identify and remove outliers in the dataset to improve model accuracy.
- Visualize the distribution of energy usage and filter out anomalies.
3. Train-Test Split
- Split the dataset into training and testing sets based on a specific date (01-01-2015).
- Visualize the train-test split to ensure correct partitioning.
4. Time Series Cross-Validation
- Implement time series cross-validation with 5 splits to evaluate model performance.
- Visualize each fold's train-test split to ensure proper cross-validation.
5. Feature Engineering
- Create new time-based features (hour, day of the week, month, year, etc.) to enhance model performance.
- Add lag features to incorporate historical energy usage information.
6. Model Training and Evaluation
- Train the XGBoost regressor using the created features and target variable (see the sketch after this list).
- Evaluate model performance using root mean squared error (RMSE) across different folds.
- Visualize model predictions against actual values to assess accuracy.
7. Future Predictions
- Generate future time frames for prediction.
- Use the trained model to predict future energy usage.
- Visualize the future predictions to understand the model's forecast.
8. Model Saving and Loading
- Save the trained model to a file for future use.
- Load the saved model and test its prediction capability.
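For concreteness, here is a minimal sketch of steps 4-6 above; it assumes a DataFrame df indexed by timestamp with a 'PJME_MW' target column, and the lag feature and hyperparameters are illustrative rather than the project's exact settings:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

def add_time_features(frame):
    frame = frame.copy()
    frame['hour'] = frame.index.hour
    frame['dayofweek'] = frame.index.dayofweek
    frame['month'] = frame.index.month
    frame['year'] = frame.index.year
    frame['lag_24h'] = frame['PJME_MW'].shift(24)  # same hour, previous day
    return frame.dropna()

df = add_time_features(df)
features = ['hour', 'dayofweek', 'month', 'year', 'lag_24h']
X, y = df[features], df['PJME_MW']

scores = []
for train_idx, test_idx in TimeSeriesSplit(n_splits=5).split(X):
    # constructor-level early stopping requires xgboost >= 1.6
    model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.01, early_stopping_rounds=50)
    model.fit(X.iloc[train_idx], y.iloc[train_idx],
              eval_set=[(X.iloc[test_idx], y.iloc[test_idx])], verbose=False)
    rmse = np.sqrt(mean_squared_error(y.iloc[test_idx], model.predict(X.iloc[test_idx])))
    scores.append(rmse)
print('Mean RMSE across folds:', np.mean(scores))
Forward-chained folds ensure each validation window is scored by a model trained only on its past, which is the point of time series cross-validation.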
Results
- The model achieved an average RMSE score of 3742.5833 across the validation folds.
- The trained model effectively captures the trend and seasonality in the PJME energy usage data.
- Future predictions indicate a reasonable forecast of energy usage, with visualizations supporting the model's accuracy.
Conclusion
This project demonstrates the application of machine learning techniques to time series forecasting in the energy sector. The use of XGBoost and careful feature engineering resulted in a model capable of predicting future energy consumption with reasonable accuracy. Further improvements could be made by exploring additional features or different model architectures.
Full code:
https://mbd.pub/o/bread/mbd-ZpmVmp9y
General Electric stock price prediction with an attention-based long short-term memory (LSTM) network (Python, Keras)
Runs as an ipynb file; the modules used are as follows:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout, Lambda, Dot, Activation, Concatenate, Layer
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from datetime import datetime
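Only the imports are listed here; as a hedged sketch (not necessarily the author's exact architecture), the Dot/Activation/Concatenate/Lambda layers above are enough to wire Luong-style attention over the LSTM's hidden states. Input and Model are extra imports added for the sketch, and the window size is illustrative:
from keras.models import Model
from keras.layers import Input

n_steps, n_features = 30, 1                               # illustrative window: 30 time steps, 1 feature
inputs = Input(shape=(n_steps, n_features))
lstm_out = LSTM(64, return_sequences=True)(inputs)        # all hidden states, (batch, n_steps, 64)
last_h = Lambda(lambda x: x[:, -1, :])(lstm_out)          # final hidden state, (batch, 64)
scores = Dot(axes=(2, 1))([lstm_out, last_h])             # alignment scores, (batch, n_steps)
weights = Activation('softmax')(scores)                   # attention weights over time steps
context = Dot(axes=(1, 1))([weights, lstm_out])           # weighted sum of states, (batch, 64)
merged = Concatenate()([context, last_h])
outputs = Dense(1)(Dense(32, activation='tanh')(merged))  # next-step price
model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='mse')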
Full code:
https://mbd.pub/o/bread/mbd-ZpmUmJ9x
Energy consumption forecasting with ARIMA and Transformer models (Python)
The algorithms run in a Jupyter Notebook environment; the modules used are as follows:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, mutual_info_regression, SelectKBest
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
import keras.backend as K
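The import list above covers the LSTM side; for the Transformer model named in the title, a single encoder block for a univariate hourly series might look like the following sketch (MultiHeadAttention and LayerNormalization are additional imports not listed above, and all shapes and hyperparameters are illustrative):
import tensorflow as tf
from tensorflow.keras import layers

def transformer_encoder(x, head_size=64, num_heads=4, ff_dim=128, dropout=0.1):
    # Self-attention sublayer with residual connection and layer norm
    attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=head_size, dropout=dropout)(x, x)
    x = layers.LayerNormalization(epsilon=1e-6)(x + attn)
    # Position-wise feed-forward sublayer, also with residual + norm
    ff = layers.Dense(ff_dim, activation='relu')(x)
    ff = layers.Dense(x.shape[-1])(ff)
    return layers.LayerNormalization(epsilon=1e-6)(x + ff)

inputs = layers.Input(shape=(24, 1))  # e.g. one day of hourly readings per window
x = transformer_encoder(inputs)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1)(x)          # next-step energy consumption
model = tf.keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='mse')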
Some of the plots produced by the program are shown below.
Full code:
https://mbd.pub/o/bread/mbd-ZZibm5Zs