Commit ef0f098b authored by Roland Peterer

merge conflict resolved

parents 59ba635a b154bf4d
%% Cell type:code id: tags:
```
# default_exp change_points
```
%% Cell type:markdown id: tags:
# Change point detection
> Determine the change points of time series signals
%% Cell type:code id: tags:
```
#export
from pathlib import Path
import functools
from iet_challenge.preprocessing import *
from iet_challenge.features import FEATURES, TARGETS
from iet_challenge.metrics import simple_tests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15, 5)})
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, RobustScaler
import ruptures as rpt
```
%% Cell type:code id: tags:
```
#export
DATA_FOLDER = Path("data")
TURBINE_TRAIN = DATA_FOLDER / "turbine-signals-train.csv"
METMAST_TRAIN = DATA_FOLDER / "metmast-train.csv"
TURBINE_TEST = DATA_FOLDER / "turbine-signals-testing.csv"
METMAST_TEST = DATA_FOLDER / "metmast-testing.csv"
```
%% Cell type:code id: tags:
```
#export
def main():
    # Load data.
    turbine_train = pd.read_csv(TURBINE_TRAIN, index_col=0, parse_dates=True)
    metmast_train = pd.read_csv(METMAST_TRAIN, index_col="Timestamp", parse_dates=True).drop(columns="Unnamed: 0")
    # Join dataframes and run the preprocessing pipeline.
    df_train = join_frames([turbine_train, metmast_train])
    # Detrend the target (TARGETS comes from iet_challenge.features): regress
    # it on ambient temperature and generator RPM and keep the residual.
    # features = ['Amb_WindSpeed_Avg', 'Grd_Prod_Pwr_Avg', 'Amb_Temp_Avg']
    features = ['Amb_Temp_Avg', 'Gen_RPM_Avg']
    lin_reg = LinearRegression()
    detrend_signal = functools.partial(detrend, regr=lin_reg, features=features, targets=TARGETS)
    pipeline = compose(filter_power, filter_regexp, filter_constants, dropna, resample, detrend_signal)
    X = pipeline(df_train[FEATURES + TARGETS])
    # Scale on the 5th-95th percentile range so outliers do not dominate.
    scaler = RobustScaler(quantile_range=(5.0, 95.0)).fit(X)
    X_train = scaler.transform(X)
    df_X_train = pd.DataFrame(data=X_train, columns=X.columns, index=X.index)
    return df_X_train

if __name__ == "__main__":
    X_train = main()
```
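%% Cell type:markdown id: tags:
`RobustScaler` centres each signal on its median and scales by the 5th-95th percentile range, so occasional spikes influence the scaling far less than they would with `StandardScaler`. A minimal illustration on synthetic values (hypothetical numbers, unrelated to the turbine data):
%% Cell type:code id: tags:
```
# 20 regular values plus one outlier; the scale comes from the 5th-95th
# percentile range of the bulk, so the outlier barely distorts it.
demo = pd.DataFrame({"x": np.r_[np.arange(1.0, 21.0), 1000.0]})
print(RobustScaler(quantile_range=(5.0, 95.0)).fit_transform(demo)[:5])
```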
%% Cell type:code id: tags:
```
X_train.plot(y="Gen_Bear2_Temp_Avg_detrend")
```
%% Output
<AxesSubplot:xlabel='Timestamp'>
%% Cell type:code id: tags:
```
df_cpd = X_train.copy()
# Load the recorded generator failures and resample them to daily flags.
df_wtg_fails = pd.read_csv(DATA_FOLDER / "turbine-failures-train.csv", index_col=0, parse_dates=True)
fails = df_wtg_fails.loc[df_wtg_fails.Component == "GENERATOR"].copy()
fails["Failure"] = 1
fails = fails.resample('D').sum()
fail_index = fails.loc[fails.Failure == 1].index
# Map failure dates to integer positions; ruptures expects breakpoints as
# sample indices, with the series length appended as the final breakpoint.
df_cpd["index"] = np.arange(0, df_cpd.shape[0])
bkps = df_cpd.loc[fail_index, "index"].values
bkps = np.append(bkps, df_cpd.shape[0])
print(bkps)
```
%% Output
[192 205 247 275 300 609]
%% Cell type:code id: tags:
```
fails.loc[fails.Failure == 1]
```
%% Output
Failure
Timestamp
2016-07-11 00:00:00+00:00 1
2016-07-24 00:00:00+00:00 1
2016-09-04 00:00:00+00:00 1
2016-10-02 00:00:00+00:00 1
2016-10-27 00:00:00+00:00 1
%% Cell type:code id: tags:
```
# Detection: dynamic programming search for exactly five change points.
target = ["Gen_Bear2_Temp_Avg_detrend"]
algo = rpt.Dynp(model="l2").fit(df_cpd[target])
result = algo.predict(n_bkps=5)
print(result)
# Display detected (red) vs. failure-derived (black) change points.
# ax = rpt.display(df_cpd[target], bkps, result, figsize=(15, 5))
ax = df_cpd[target].plot()
ax.vlines(df_cpd.index[result[:-1]], -1, 2, colors='r', linestyles='dashed', linewidth=3)
ax.vlines(df_cpd.index[bkps[:-1]], -1, 2, colors='k', linestyles='dashed', linewidth=3)
```
%% Output
[190, 305, 535, 545, 565, 609]
<matplotlib.collections.LineCollection at 0x7fa4a022f220>
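%% Cell type:markdown id: tags:
`ruptures` also ships metrics to quantify how well the detected change points match the failure-derived ones. A minimal sketch, assuming `bkps` and `result` from the cells above (both already end with the series length, as the metrics require); the `margin` of 10 samples is an illustrative choice, not a tuned value.
%% Cell type:code id: tags:
```
from ruptures.metrics import hausdorff, precision_recall
# A detected breakpoint counts as a hit if it lies within `margin` samples
# of a failure-derived one.
prec, rec = precision_recall(list(bkps), result, margin=10)
print(f"precision={prec:.2f}, recall={rec:.2f}")
# Worst-case distance (in samples) between the two breakpoint sets.
print("hausdorff:", hausdorff(list(bkps), result))
```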
__version__ = "0.0.1"
import logging
logging.basicConfig(filename='preprocessing.log', level=logging.INFO, format='%(asctime)s %(message)s')
logger = logging.getLogger("Preprocess")
# -*- coding: utf-8 -*-
"""
iet_challenge.features
~~~~~~~~~~~~~~~~~~~~~~
The features and targets used for training are specified here.
"""
FEATURES = [
"Gen_RPM_Avg",
"Gen_RPM_Std",
"Gen_Bear_Temp_Avg",
"Gen_Phase1_Temp_Avg",
"Gen_Phase2_Temp_Avg",
"Gen_Phase3_Temp_Avg",
"Hyd_Oil_Temp_Avg",
"Gear_Oil_Temp_Avg",
"Gear_Bear_Temp_Avg",
"Nac_Temp_Avg",
"Rtr_RPM_Avg",
"Amb_WindSpeed_Avg",
"Amb_WindSpeed_Std",
"Amb_WindDir_Relative_Avg",
"Amb_WindDir_Abs_Avg",
"Amb_Temp_Avg",
"Prod_LatestAvg_ActPwrGen0",
"Prod_LatestAvg_ActPwrGen1",
"Prod_LatestAvg_ActPwrGen2",
"Prod_LatestAvg_TotActPwr",
"Prod_LatestAvg_ReactPwrGen0",
"Prod_LatestAvg_ReactPwrGen1",
"Prod_LatestAvg_ReactPwrGen2",
"Prod_LatestAvg_TotReactPwr",
"HVTrafo_Phase1_Temp_Avg",
"HVTrafo_Phase2_Temp_Avg",
"HVTrafo_Phase3_Temp_Avg",
"Grd_InverterPhase1_Temp_Avg",
"Cont_Top_Temp_Avg",
"Cont_Hub_Temp_Avg",
"Cont_VCP_Temp_Avg",
"Gen_SlipRing_Temp_Avg",
"Spin_Temp_Avg",
"Blds_PitchAngle_Avg",
"Blds_PitchAngle_Std",
"Cont_VCP_ChokcoilTemp_Avg",
"Grd_RtrInvPhase1_Temp_Avg",
"Grd_RtrInvPhase2_Temp_Avg",
"Grd_RtrInvPhase3_Temp_Avg",
"Cont_VCP_WtrTemp_Avg",
"Grd_Prod_Pwr_Avg",
"Rtr_RPM_Std",
"Grd_Prod_Pwr_Std",
"Nac_Direction_Avg",
]
TARGETS = [
"Gen_Bear2_Temp_Avg",
]
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
import numpy as np
def simple_tests(model, X_train, y_train):
    # Errors on the training set itself.
    y_pred = model.predict(X_train)
    model_mse = mean_squared_error(y_train, y_pred)
    model_rmse = np.sqrt(model_mse)
    print("RMSE on training set:", model_rmse)
    print("MAE on training set:", mean_absolute_error(y_train, y_pred))
    print()
    # 5-fold cross-validated errors.
    model_scores = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_squared_error", cv=5
    )
    model_rmse_scores = np.sqrt(-model_scores)
    print("RMSE")
    display_scores(model_rmse_scores)
    print()
    model_scores = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_absolute_error", cv=5
    )
    print("MAE")
    display_scores(-model_scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def score_stats(scores):
    return scores.mean(), scores.std()
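
# A minimal usage sketch of simple_tests (synthetic data, for illustration only):
#
#     from sklearn.linear_model import LinearRegression
#     rng = np.random.default_rng(0)
#     X = rng.random((100, 3))
#     y = X @ np.array([1.0, 2.0, 3.0]) + rng.normal(scale=0.1, size=100)
#     simple_tests(LinearRegression().fit(X, y), X, y)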
# AUTOGENERATED! DO NOT EDIT! File to edit: 03_model.ipynb (unless otherwise specified).
__all__ = ['DATA_FOLDER', 'TURBINE_TRAIN', 'METMAST_TRAIN', 'TURBINE_TEST', 'METMAST_TEST', 'main']
# Cell
from typing import List, Callable, Union
from pathlib import Path
import functools
from .preprocessing import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15, 5)})
import xgboost as xgb
from sklearn.linear_model import LinearRegression
# Cell
DATA_FOLDER = Path("data")
TURBINE_TRAIN = DATA_FOLDER / "turbine-signals-train.csv"
METMAST_TRAIN = DATA_FOLDER / "metmast-train.csv"
TURBINE_TEST = DATA_FOLDER / "turbine-signals-testing.csv"
METMAST_TEST = DATA_FOLDER / "metmast-testing.csv"
# Cell
def main():
    # Load data.
    turbine_train = pd.read_csv(TURBINE_TRAIN, index_col=0, parse_dates=True)
    metmast_train = pd.read_csv(METMAST_TRAIN, index_col="Timestamp", parse_dates=True).drop(columns="Unnamed: 0")
    turbine_test = pd.read_csv(TURBINE_TEST, index_col=0, parse_dates=True)
    metmast_test = pd.read_csv(METMAST_TEST, index_col="Timestamp", parse_dates=True).drop(columns="Unnamed: 0")
    # Join dataframes and run the preprocessing pipeline.
    df_train = join_frames([turbine_train, metmast_train])
    df_test = join_frames([turbine_test, metmast_test])
    # Detrend signals for the NBM (normal behaviour model).
    target = ["Gen_Bear_Temp_Avg"]
    # features = ['Amb_WindSpeed_Avg', 'Grd_Prod_Pwr_Avg', 'Amb_Temp_Avg']
    features = ['Amb_Temp_Avg']
    lin_reg = LinearRegression()
    detrend_bear_temp = functools.partial(detrend, regr=lin_reg, features=features, targets=target)
    # Exclude +/- 30 days around the recorded generator failures.
    timestamps = ["2016-07-11 00:00:00+00:00", "2016-10-27 00:00:00+00:00"]
    remove_intervals = functools.partial(remove_interval, timestamps=timestamps)
    pipeline = compose(filter_power, filter_regexp, filter_constants, dropna, resample, add_time_cols, detrend_bear_temp, remove_intervals)
    df_train = pipeline(df_train)
    df_test = pipeline(df_test)
    return df_train, df_test

if __name__ == "__main__":
    df_train, df_test = main()
# -*- coding: utf-8 -*-
"""
iet_challenge.preprocessing
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Functions for preprocessing the turbine and met-mast signals.
"""
__all__ = [
"log_process",
"join_frames",
"add_time_cols",
"square_root",
"filter_constants",
"filter_power",
"resample",
"filter_regexp",
"dropna",
"filter_corr",
"compose",
"detrend",
"ComposableFunction",
"remove_interval",
]
# Cell
from typing import List, Callable, Union
import functools
from iet_challenge import logger
import pandas as pd
import numpy as np
# Cell
def log_process(func):
    """Log the columns and rows removed by the wrapped preprocessing step."""
    def wrapper(*args, **kwargs):
        df = args[0]
        columns = set(df.columns)
        n_samples = df.shape[0]
        result = func(*args, **kwargs)
        removed_columns = columns - set(result.columns)
        n_removed_rows = n_samples - result.shape[0]
        logger.info(
            f"{func} Columns removed: {len(removed_columns)}, rows removed: {n_removed_rows} \n {removed_columns}"
        )
        return result
    return wrapper
def join_frames(frames: List[pd.DataFrame]) -> pd.DataFrame:
    frame = frames[0].join(frames[1:], how="outer")
    return frame[~frame.index.duplicated()]
@log_process
def add_time_cols(df) -> pd.DataFrame:
    df["Year"] = df.index.year
    df["Month"] = df.index.month_name()
    df["Day"] = df.index.day_name()
    df["Hour"] = df.index.hour
    df["Minute"] = df.index.minute
    return df
def square_root(x):
    return np.sqrt(np.square(x).sum())

@log_process
def filter_constants(df) -> pd.DataFrame:
    # Drop columns whose normalised standard deviation is numerically zero.
    df_tmp = df.std() / df.agg(square_root)
    tmp = df_tmp.loc[df_tmp < 1e-6].index.tolist()
    return df[df.columns[~df.columns.isin(tmp)]]
@log_process
def filter_power(
    df, col: str = "Grd_Prod_Pwr_Avg", rated: float = 2000, thresh: float = 0.1
) -> pd.DataFrame:
    # Keep rows where the turbine produces more than `thresh` of rated power.
    return df.loc[df[col] > rated * thresh]

def resample(df, period: str = "D") -> pd.DataFrame:
    return df.resample(period).mean()
@log_process
def filter_regexp(df, regex: str = "^((?!Min).)*[^xn]$") -> pd.DataFrame:
    # Keep only columns that do not contain "Min" and do not end in "x" or
    # "n" (i.e. drop the *_Min and *_Max aggregates).
    return df.filter(regex=regex)
@log_process
def dropna(df, axis=1, thresh=1.0) -> pd.DataFrame:
    # Keep rows/columns with at least a `thresh` fraction of non-missing
    # values; `~axis & 1` picks the size of the opposite axis.
    return df.dropna(axis=axis, thresh=int(thresh * df.shape[~axis & 1]))
@log_process
def filter_corr(df, thresh: float = 0.95, pre_choice=None) -> pd.DataFrame:
    # Of every pair of columns correlated above `thresh`, keep one and
    # discard the other; columns in `pre_choice` are always kept.
    corr = df.corr()
    corr = corr - np.eye(corr.shape[0])
    mask = corr.abs() > thresh
    to_filter = corr[mask].unstack().dropna().to_frame()
    keep = set(pre_choice) if pre_choice is not None else set()
    discard = set()
    for i, j in to_filter.index:
        if i not in discard:
            keep.add(i)
            if j not in keep:
                discard.add(j)
    return df[df.columns[~df.columns.isin(discard)]]
ComposableFunction = Callable[[pd.DataFrame], pd.DataFrame]

def compose(*functions: ComposableFunction) -> ComposableFunction:
    # Chain the functions left to right: compose(f, g)(x) == g(f(x)).
    return functools.reduce(lambda f, g: lambda x: g(f(x)), functions)
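
# A minimal sketch of the composition order (steps chosen for illustration):
#
#     pipeline = compose(filter_power, resample)
#     # pipeline(df) == resample(filter_power(df)): filter_power runs first.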
def detrend(df, regr, features: List[str], targets: List[str]) -> pd.DataFrame:
    # Fit `regr` on the explanatory features and keep the residual
    # (observed minus predicted) as the detrended signal.
    df_trend = df[features + targets].dropna()
    for target in targets:
        model = regr.fit(df_trend[features], df_trend[target])
        df[f"{target}_detrend"] = df_trend[target] - model.predict(df_trend[features])
    return df
def remove_interval(
    df: pd.DataFrame,
    timestamps: List[Union[str, pd.Timestamp]],
    delta: Union[str, pd.Timedelta] = "30D",
) -> pd.DataFrame:
    # Drop all rows within +/- `delta` of each timestamp.
    mask = np.ones(len(df.index), dtype=bool)
    for timestamp in timestamps:
        start = pd.Timestamp(timestamp) - pd.Timedelta(delta)
        end = pd.Timestamp(timestamp) + pd.Timedelta(delta)
        mask = np.logical_and(mask, (df.index < start) | (df.index > end))
    return df.loc[mask]
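
# A minimal usage sketch (assuming `df` has a DatetimeIndex):
#
#     # Drop +/- 30 days around the first recorded generator failure.
#     df_clean = remove_interval(df, ["2016-07-11 00:00:00+00:00"], delta="30D")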
@@ -16,6 +16,8 @@
 missingno = "^0.5.0"
 seaborn = "^0.11.2"
 plotly = "^5.3.1"
 kaleido = "=0.2.1"
+ruptures = "^1.1.4"
+xgboost = "^1.5.0"

 [tool.poetry.dev-dependencies]