Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utils Methods and TIE Speedups #250

Merged
merged 17 commits into from
Aug 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ All notable changes to this project will be documented in this file. If you make
- Modern dependency stacks now supported!
- Upgrading past major versions of Scikit-Learn (1.0) and Pandas (2.0), in conjunction with their own dependencies, caused small divergences in the MonteCarloAEP analysis method with Daily GBM, and the Wake Losses analysis method with UQ. The magnitude of the differences are small compared with the magnitude of the output.
- In general, OpenOA is now moving away from pinning the maximum dependency version, and will stick to defining minimum dependencies to ensure modern API usage is supported across the software.
- `utils.filters.bin_filter` was converted from a for loop to a vectorized method
- `utils.filters.bin_filter` and `utils.timeseries.percent_nan` were converted to be nearly pure NumPy methods operating on NumPy arrays for significant speedups of the TIE analysis method.
- `analysis.TurbineLongTermGrossEnergy.filter_turbine_data` was cleaned up for a minor gain in efficiency and readability.

## 3.0rc2
- Everything from release candidate 1
Expand Down
54 changes: 27 additions & 27 deletions examples/03_turbine_ideal_energy.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th>WTUR_TurNam</th>\n",
" <th>asset_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
Expand Down Expand Up @@ -206,29 +206,29 @@
"</div>"
],
"text/plain": [
" WROT_BlPthAngVal WTUR_W WMET_HorWdSpd \\\n",
"time WTUR_TurNam \n",
"2014-01-01 00:00:00 R80736 -1.00 642.78003 7.12 \n",
" R80721 -1.01 441.06000 6.39 \n",
" R80790 -0.96 658.53003 7.11 \n",
" R80711 -0.93 514.23999 6.87 \n",
"2014-01-01 00:10:00 R80790 -0.96 640.23999 7.01 \n",
" WROT_BlPthAngVal WTUR_W WMET_HorWdSpd \\\n",
"time asset_id \n",
"2014-01-01 00:00:00 R80736 -1.00 642.78003 7.12 \n",
" R80721 -1.01 441.06000 6.39 \n",
" R80790 -0.96 658.53003 7.11 \n",
" R80711 -0.93 514.23999 6.87 \n",
"2014-01-01 00:10:00 R80790 -0.96 640.23999 7.01 \n",
"\n",
" Va_avg WMET_EnvTmp Ya_avg \\\n",
"time WTUR_TurNam \n",
"2014-01-01 00:00:00 R80736 0.66 4.69 181.34000 \n",
" R80721 -2.48 4.94 179.82001 \n",
" R80790 1.07 4.55 172.39000 \n",
" R80711 6.95 4.30 172.77000 \n",
"2014-01-01 00:10:00 R80790 -1.90 4.68 172.39000 \n",
" Va_avg WMET_EnvTmp Ya_avg WMET_HorWdDir \\\n",
"time asset_id \n",
"2014-01-01 00:00:00 R80736 0.66 4.69 181.34000 182.00999 \n",
" R80721 -2.48 4.94 179.82001 177.36000 \n",
" R80790 1.07 4.55 172.39000 173.50999 \n",
" R80711 6.95 4.30 172.77000 179.72000 \n",
"2014-01-01 00:10:00 R80790 -1.90 4.68 172.39000 170.46001 \n",
"\n",
" WMET_HorWdDir energy_kwh WTUR_SupWh \n",
"time WTUR_TurNam \n",
"2014-01-01 00:00:00 R80736 182.00999 107.130005 107.130005 \n",
" R80721 177.36000 73.510000 73.510000 \n",
" R80790 173.50999 109.755005 109.755005 \n",
" R80711 179.72000 85.706665 85.706665 \n",
"2014-01-01 00:10:00 R80790 170.46001 106.706665 106.706665 "
" energy_kwh WTUR_SupWh \n",
"time asset_id \n",
"2014-01-01 00:00:00 R80736 107.130005 107.130005 \n",
" R80721 73.510000 73.510000 \n",
" R80790 109.755005 109.755005 \n",
" R80711 85.706665 85.706665 \n",
"2014-01-01 00:10:00 R80790 106.706665 106.706665 "
]
},
"execution_count": 4,
Expand Down Expand Up @@ -305,7 +305,7 @@
"output_type": "stream",
"text": [
"INFO:openoa.analysis.turbine_long_term_gross_energy:Running the long term gross energy analysis\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:12<00:00, 6.32s/it]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00, 6.86s/it]\n",
"INFO:openoa.analysis.turbine_long_term_gross_energy:Run completed\n"
]
}
Expand Down Expand Up @@ -540,7 +540,7 @@
"output_type": "stream",
"text": [
"INFO:openoa.analysis.turbine_long_term_gross_energy:Running the long term gross energy analysis\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [16:11<00:00, 9.72s/it]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:48<00:00, 6.49s/it]\n",
"INFO:openoa.analysis.turbine_long_term_gross_energy:Run completed\n"
]
}
Expand All @@ -566,7 +566,7 @@
{
"data": {
"text/plain": [
"13571894.791458625"
"13629417.559871903"
]
},
"execution_count": 13,
Expand All @@ -586,7 +586,7 @@
{
"data": {
"text/plain": [
"589131.3024185459"
"643649.6728541044"
]
},
"execution_count": 14,
Expand All @@ -608,7 +608,7 @@
"output_type": "stream",
"text": [
"Mean long-term turbine ideal energy is 13.6 GWh/year\n",
"Uncertainty in long-term turbine ideal energy is 0.6 GWh/year, or 4.3% percent\n"
"Uncertainty in long-term turbine ideal energy is 0.6 GWh/year, or 4.7% percent\n"
]
}
],
Expand Down
78 changes: 32 additions & 46 deletions openoa/analysis/turbine_long_term_gross_energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,63 +300,49 @@ def filter_turbine_data(self) -> None:

# Loop through turbines
for t in self.turbine_ids:
turb_capac = dic[t]["WTUR_W"].max()
scada_df = self.scada_dict[t]
turbine_capacity_real = scada_df.WTUR_W.max()

max_bin = (
self._run.max_power_filter * turb_capac
self._run.max_power_filter * turbine_capacity_real
) # Set maximum range for using bin-filter

dic[t].dropna(
scada_df.dropna(
subset=["WMET_HorWdSpd", "WTUR_SupWh"], inplace=True
) # Drop any data where scada wind speed or energy is NaN

# Flag turbine energy data less than zero
dic[t].loc[:, "flag_neg"] = filters.range_flag(
dic[t].loc[:, "WTUR_W"], lower=0, upper=turb_capac
scada_df = scada_df.assign(
flag_neg=filters.range_flag(scada_df.WTUR_W, lower=0, upper=turbine_capacity_real),
flag_range=filters.range_flag(scada_df.WMET_HorWdSpd, lower=0, upper=40),
flag_frozen=filters.unresponsive_flag(scada_df.WMET_HorWdSpd, threshold=3),
flag_window=filters.window_range_flag(
window_col=dic[t].loc[:, "WMET_HorWdSpd"],
window_start=5.0,
window_end=40,
value_col=dic[t].loc[:, "WTUR_W"],
value_min=0.02 * turbine_capacity_real,
value_max=1.2 * turbine_capacity_real,
),
flag_bin=filters.bin_filter(
bin_col=dic[t].loc[:, "WTUR_W"],
value_col=dic[t].loc[:, "WMET_HorWdSpd"],
bin_width=0.06 * turbine_capacity_real,
threshold=self._run.wind_bin_thresh,
center_type="median",
bin_min=np.round(0.01 * turbine_capacity_real),
bin_max=np.round(max_bin),
threshold_type="std",
direction="all",
),
)
# Apply range filter
dic[t].loc[:, "flag_range"] = filters.range_flag(
dic[t].loc[:, "WMET_HorWdSpd"], lower=0, upper=40
)
# Apply frozen/unresponsive sensor filter
dic[t].loc[:, "flag_frozen"] = filters.unresponsive_flag(
dic[t].loc[:, "WMET_HorWdSpd"], threshold=3
)
# Apply window range filter
dic[t].loc[:, "flag_window"] = filters.window_range_flag(
window_col=dic[t].loc[:, "WMET_HorWdSpd"],
window_start=5.0,
window_end=40,
value_col=dic[t].loc[:, "WTUR_W"],
value_min=0.02 * turb_capac,
value_max=1.2 * turb_capac,
)

threshold_wind_bin = self._run.wind_bin_thresh
# Apply bin-based filter
dic[t].loc[:, "flag_bin"] = filters.bin_filter(
bin_col=dic[t].loc[:, "WTUR_W"],
value_col=dic[t].loc[:, "WMET_HorWdSpd"],
bin_width=0.06 * turb_capac,
threshold=threshold_wind_bin, # wind bin thresh
center_type="median",
bin_min=np.round(0.01 * turb_capac),
bin_max=np.round(max_bin),
threshold_type="std",
direction="all",
)

# Create a 'final' flag which is true if any of the previous flags are true
dic[t].loc[:, "flag_final"] = (
(dic[t].loc[:, "flag_range"])
| (dic[t].loc[:, "flag_window"])
| (dic[t].loc[:, "flag_bin"])
| (dic[t].loc[:, "flag_frozen"])
self.scada_dict[t].loc[:, "flag_final"] = (
scada_df.flag_range
| scada_df.flag_window
| scada_df.flag_bin
| scada_df.flag_frozen
)

# Set negative turbine data to zero
dic[t].loc[dic[t]["flag_neg"], "WTUR_W"] = 0

def setup_daily_reanalysis_data(self) -> None:
"""
Process reanalysis data to daily means for later use in the GAM model.
Expand Down
81 changes: 46 additions & 35 deletions openoa/utils/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@ def std_range_flag(
raise ValueError("The inputs to `col` and `threshold` must be the same length.")

subset = data.loc[:, col].copy()
data_mean = subset.mean(axis=0)
data_std = subset.std(axis=0) * np.array(threshold)
data_mean = np.nanmean(subset.values, axis=0)
data_std = np.nanstd(subset.values, ddof=1, axis=0) * np.array(threshold)
flag = subset.le(data_mean - data_std) | subset.ge(data_mean + data_std)

# Return back a pd.Series if one was provided, else a pd.DataFrame
Expand Down Expand Up @@ -232,47 +232,58 @@ def bin_filter(

# Set bin min and max values if not passed to function
if bin_min is None:
bin_min = bin_col.min()
bin_min = np.min(bin_col.values)
if bin_max is None:
bin_max = bin_col.max()
bin_max = np.max(bin_col.values)

# Define bin edges
bin_edges = np.arange(bin_min, bin_max, bin_width)

# Ensure the last bin edge value is bin_max
bin_edges = np.unique(np.clip(np.append(bin_edges, bin_max), bin_min, bin_max))

# Define empty flag of 'False' values with indices matching value_col
flag = pd.Series(index=value_col.index, data=False)

# Loop through bins and applying flagging
nbins = len(bin_edges)
for i in range(nbins - 1):
# Get data that falls within bin
y_bin = value_col.loc[(bin_col <= bin_edges[i + 1]) & (bin_col > bin_edges[i])]

# Get center of binned data
center = y_bin.mean() if center_type == "mean" else y_bin.median()

# Define threshold of data flag
if threshold_type == "std":
deviation = y_bin.std() * threshold
elif threshold_type == "scalar":
deviation = threshold
else: # median absolute deviation (mad)
deviation = (y_bin - center).abs().median() * threshold

# Perform flagging depending on specified direction
if direction == "above":
flag_bin = y_bin > (center + deviation)
elif direction == "below":
flag_bin = y_bin < (center - deviation)
else:
flag_bin = (y_bin > (center + deviation)) | (y_bin < (center - deviation))

# Record flags in final flag column
flag.loc[flag_bin.index] = flag_bin

# Bin the data and recreate the comparison data as a multi-column data frame
which_bin_col = np.digitize(bin_col, bin_edges, right=True)

# Create the flag values as a matrix with each column being the timestamp's binned value,
# e.g., all columns values are NaN if the data point is not in that bin
flag_vals = (
value_col.to_frame().set_index(pd.Series(which_bin_col, name="bin"), append=True).unstack()
)
drop = [i for i, el in enumerate(flag_vals.columns.names) if el != "bin"]
flag_vals.columns = flag_vals.columns.droplevel(drop).rename(None)

# Create a False array as default, so flags are set to True
flag_df = pd.DataFrame(np.zeros_like(flag_vals, dtype=bool), index=flag_vals.index)

# Get center of binned data
if center_type == "median":
center = np.nanmedian(flag_vals.values, axis=0)
else:
center = np.nanmean(flag_vals.values, axis=0)
center = pd.DataFrame(
np.full(flag_vals.shape, center),
index=flag_vals.index,
columns=flag_vals.columns,
)

# Define threshold of data flag
if threshold_type == "std":
deviation = np.nanstd(flag_vals.values, ddof=1, axis=0) * threshold
elif threshold_type == "scalar":
deviation = threshold
else: # median absolute deviation (mad)
deviation = np.nanmedian(np.abs(flag_vals.values - center.values), axis=0) * threshold

# Perform flagging depending on specified direction
if direction in ("above", "all"):
flag_df |= flag_vals > center + deviation
if direction in ("below", "all"):
flag_df |= flag_vals < center - deviation

# Get all instances where the value is True, and reset any values outside the bin limits
flag = pd.Series(np.nanmax(flag_df, axis=1), index=flag_df.index)
flag.loc[(bin_col <= bin_min) | (bin_col > bin_max)] = False
return flag


Expand Down
2 changes: 1 addition & 1 deletion openoa/utils/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def percent_nan(col: pd.Series | str, data: pd.DataFrame = None):
Returns:
:obj:`float`: Percentage of NaN data in the data series
"""
return 1 if (denominator := float(col.size)) == 0 else col.isnull().sum() / denominator
return 1 if (denominator := float(col.size)) == 0 else np.isnan(col.values).sum() / denominator


@series_method(data_cols=["dt_col"])
Expand Down
8 changes: 5 additions & 3 deletions test/unit/test_timeseries_toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,11 @@ def test_num_hours(self):

def test_percent_nan(self):
test_dict = {}
test_dict["a"] = pd.Series(["", 1, 2, 1e5, np.Inf])
test_dict["b"] = pd.Series(["", np.nan, 2, 1e5, np.Inf])
test_dict["c"] = pd.Series([np.nan, 1, 2, 1e5, np.nan])

# All should be float Series given PlantData requirements
test_dict["a"] = pd.Series([True, 1, 2, 1e5, np.Inf]).astype(float)
test_dict["b"] = pd.Series([False, np.nan, 2, 1e5, np.Inf]).astype(float)
test_dict["c"] = pd.Series([np.nan, 1, 2, 1e5, np.nan]).astype(float)

nan_values = {"a": 0.0, "b": 0.2, "c": 0.4}

Expand Down
Loading