-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpreprocess.py
59 lines (48 loc) · 1.53 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
def preprocess(
    df,
    diff_n=1,
    smooth_n=3,
    lags_n=5,
    shuffle=False,
    dropna=True,
    freq=None,
    freq_agg="mean",
) -> pd.DataFrame:
    """
    Prepare data for model training and scoring.

    The input is sorted by ``metric_timestamp``, reduced to the
    ``metric_value`` column, optionally resampled, differenced and smoothed,
    and then extended with lagged copies of the transformed value.

    Parameters:
        df (pd.DataFrame): Input with ``metric_timestamp`` and
            ``metric_value`` columns. It is not modified.
        diff_n (int): Number of periods to difference over (each value minus
            the value ``diff_n`` rows earlier). ``0`` disables differencing.
        smooth_n (int): The window size for smoothing (moving average).
            ``0`` disables smoothing.
        lags_n (int): Number of lag columns to add (``lag_1`` .. ``lag_<n>``).
            ``0`` adds none.
        shuffle (bool): Whether to shuffle the rows.
        dropna (bool): Whether to drop rows containing missing values.
        freq (str): The frequency string to resample the data, or ``None``
            to skip resampling.
        freq_agg (str): The aggregation method for resampling
            (``"mean"`` or ``"sum"``).

    Returns:
        pd.DataFrame: Indexed by ``metric_timestamp``, with ``metric_value``
        followed by the ``lag_*`` columns.

    Raises:
        ValueError: If ``freq`` is given and ``freq_agg`` is not supported.
    """
    # A stable sort keeps rows that share a timestamp in their input order,
    # so the result is deterministic even with duplicate timestamps.
    X = df.sort_values(by=["metric_timestamp"], kind="mergesort").set_index(
        "metric_timestamp"
    )

    # Explicit copy: the columns below are assigned on an independent frame,
    # never on a slice of the caller's data (avoids SettingWithCopyWarning).
    X = X[["metric_value"]].copy()

    if freq is not None:
        if freq_agg == "mean":
            X = X.resample(freq).mean()
        elif freq_agg == "sum":
            X = X.resample(freq).sum()
        # Add other aggregation methods as needed
        else:
            raise ValueError(f"Unsupported aggregation method: {freq_agg}")

    # The transformed series are assigned back at full length. Dropping NaN
    # before assignment would be undone by index alignment anyway, and it
    # makes the assignment fail when the index holds duplicate timestamps.
    if diff_n > 0:
        X["metric_value"] = X["metric_value"].diff(periods=diff_n)

    if smooth_n > 0:
        X["metric_value"] = X["metric_value"].rolling(window=smooth_n).mean()

    if lags_n > 0:
        for lag in range(1, lags_n + 1):
            X[f"lag_{lag}"] = X["metric_value"].shift(lag)

    if shuffle:
        X = X.sample(frac=1)

    # Leading rows made incomplete by diff/rolling/shift are removed here.
    if dropna:
        X = X.dropna()

    return X