Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utils Methods and TIE Speedups #250

Merged
merged 17 commits into from
Aug 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ All notable changes to this project will be documented in this file. If you make
- Modern dependency stacks now supported!
- Upgrading past major versions of Scikit-Learn (1.0) and Pandas (2.0), in conjunction with their own dependencies, caused small divergences in the MonteCarloAEP analysis method with Daily GBM, and the Wake Losses analysis method with UQ. The magnitude of the differences are small compared with the magnitude of the output.
- In general, OpenOA is now moving away from pinning the maximum dependency version, and will stick to defining minimum dependencies to ensure modern API usage is supported across the software.
- `utils.filters.bin_filter` was converted from a for loop to a vectorized method
- `utils.filters.bin_filter` and `utils.timeseries.percent_nan` were converted to be nearly pure NumPy methods operating on NumPy arrays for significant speedups of the TIE analysis method.
- `analysis.TurbineLongTermGrossEnergy.filter_turbine_data` was cleaned up for a minor gain in efficiency and readability.

## 3.0rc2
- Everything from release candidate 1
Expand Down
54 changes: 27 additions & 27 deletions examples/03_turbine_ideal_energy.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th>WTUR_TurNam</th>\n",
" <th>asset_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
Expand Down Expand Up @@ -206,29 +206,29 @@
"</div>"
],
"text/plain": [
" WROT_BlPthAngVal WTUR_W WMET_HorWdSpd \\\n",
"time WTUR_TurNam \n",
"2014-01-01 00:00:00 R80736 -1.00 642.78003 7.12 \n",
" R80721 -1.01 441.06000 6.39 \n",
" R80790 -0.96 658.53003 7.11 \n",
" R80711 -0.93 514.23999 6.87 \n",
"2014-01-01 00:10:00 R80790 -0.96 640.23999 7.01 \n",
" WROT_BlPthAngVal WTUR_W WMET_HorWdSpd \\\n",
"time asset_id \n",
"2014-01-01 00:00:00 R80736 -1.00 642.78003 7.12 \n",
" R80721 -1.01 441.06000 6.39 \n",
" R80790 -0.96 658.53003 7.11 \n",
" R80711 -0.93 514.23999 6.87 \n",
"2014-01-01 00:10:00 R80790 -0.96 640.23999 7.01 \n",
"\n",
" Va_avg WMET_EnvTmp Ya_avg \\\n",
"time WTUR_TurNam \n",
"2014-01-01 00:00:00 R80736 0.66 4.69 181.34000 \n",
" R80721 -2.48 4.94 179.82001 \n",
" R80790 1.07 4.55 172.39000 \n",
" R80711 6.95 4.30 172.77000 \n",
"2014-01-01 00:10:00 R80790 -1.90 4.68 172.39000 \n",
" Va_avg WMET_EnvTmp Ya_avg WMET_HorWdDir \\\n",
"time asset_id \n",
"2014-01-01 00:00:00 R80736 0.66 4.69 181.34000 182.00999 \n",
" R80721 -2.48 4.94 179.82001 177.36000 \n",
" R80790 1.07 4.55 172.39000 173.50999 \n",
" R80711 6.95 4.30 172.77000 179.72000 \n",
"2014-01-01 00:10:00 R80790 -1.90 4.68 172.39000 170.46001 \n",
"\n",
" WMET_HorWdDir energy_kwh WTUR_SupWh \n",
"time WTUR_TurNam \n",
"2014-01-01 00:00:00 R80736 182.00999 107.130005 107.130005 \n",
" R80721 177.36000 73.510000 73.510000 \n",
" R80790 173.50999 109.755005 109.755005 \n",
" R80711 179.72000 85.706665 85.706665 \n",
"2014-01-01 00:10:00 R80790 170.46001 106.706665 106.706665 "
" energy_kwh WTUR_SupWh \n",
"time asset_id \n",
"2014-01-01 00:00:00 R80736 107.130005 107.130005 \n",
" R80721 73.510000 73.510000 \n",
" R80790 109.755005 109.755005 \n",
" R80711 85.706665 85.706665 \n",
"2014-01-01 00:10:00 R80790 106.706665 106.706665 "
]
},
"execution_count": 4,
Expand Down Expand Up @@ -305,7 +305,7 @@
"output_type": "stream",
"text": [
"INFO:openoa.analysis.turbine_long_term_gross_energy:Running the long term gross energy analysis\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:12<00:00, 6.32s/it]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00, 6.86s/it]\n",
"INFO:openoa.analysis.turbine_long_term_gross_energy:Run completed\n"
]
}
Expand Down Expand Up @@ -540,7 +540,7 @@
"output_type": "stream",
"text": [
"INFO:openoa.analysis.turbine_long_term_gross_energy:Running the long term gross energy analysis\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [16:11<00:00, 9.72s/it]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:48<00:00, 6.49s/it]\n",
"INFO:openoa.analysis.turbine_long_term_gross_energy:Run completed\n"
]
}
Expand All @@ -566,7 +566,7 @@
{
"data": {
"text/plain": [
"13571894.791458625"
"13629417.559871903"
]
},
"execution_count": 13,
Expand All @@ -586,7 +586,7 @@
{
"data": {
"text/plain": [
"589131.3024185459"
"643649.6728541044"
]
},
"execution_count": 14,
Expand All @@ -608,7 +608,7 @@
"output_type": "stream",
"text": [
"Mean long-term turbine ideal energy is 13.6 GWh/year\n",
"Uncertainty in long-term turbine ideal energy is 0.6 GWh/year, or 4.3% percent\n"
"Uncertainty in long-term turbine ideal energy is 0.6 GWh/year, or 4.7% percent\n"
]
}
],
Expand Down
78 changes: 32 additions & 46 deletions openoa/analysis/turbine_long_term_gross_energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,63 +300,49 @@ def filter_turbine_data(self) -> None:

# Loop through turbines
for t in self.turbine_ids:
turb_capac = dic[t]["WTUR_W"].max()
scada_df = self.scada_dict[t]
turbine_capacity_real = scada_df.WTUR_W.max()

max_bin = (
self._run.max_power_filter * turb_capac
self._run.max_power_filter * turbine_capacity_real
) # Set maximum range for using bin-filter

dic[t].dropna(
scada_df.dropna(
subset=["WMET_HorWdSpd", "WTUR_SupWh"], inplace=True
) # Drop any data where scada wind speed or energy is NaN

# Flag turbine energy data less than zero
dic[t].loc[:, "flag_neg"] = filters.range_flag(
dic[t].loc[:, "WTUR_W"], lower=0, upper=turb_capac
scada_df = scada_df.assign(
flag_neg=filters.range_flag(scada_df.WTUR_W, lower=0, upper=turbine_capacity_real),
flag_range=filters.range_flag(scada_df.WMET_HorWdSpd, lower=0, upper=40),
flag_frozen=filters.unresponsive_flag(scada_df.WMET_HorWdSpd, threshold=3),
flag_window=filters.window_range_flag(
window_col=dic[t].loc[:, "WMET_HorWdSpd"],
window_start=5.0,
window_end=40,
value_col=dic[t].loc[:, "WTUR_W"],
value_min=0.02 * turbine_capacity_real,
value_max=1.2 * turbine_capacity_real,
),
flag_bin=filters.bin_filter(
bin_col=dic[t].loc[:, "WTUR_W"],
value_col=dic[t].loc[:, "WMET_HorWdSpd"],
bin_width=0.06 * turbine_capacity_real,
threshold=self._run.wind_bin_thresh,
center_type="median",
bin_min=np.round(0.01 * turbine_capacity_real),
bin_max=np.round(max_bin),
threshold_type="std",
direction="all",
),
)
# Apply range filter
dic[t].loc[:, "flag_range"] = filters.range_flag(
dic[t].loc[:, "WMET_HorWdSpd"], lower=0, upper=40
)
# Apply frozen/unresponsive sensor filter
dic[t].loc[:, "flag_frozen"] = filters.unresponsive_flag(
dic[t].loc[:, "WMET_HorWdSpd"], threshold=3
)
# Apply window range filter
dic[t].loc[:, "flag_window"] = filters.window_range_flag(
window_col=dic[t].loc[:, "WMET_HorWdSpd"],
window_start=5.0,
window_end=40,
value_col=dic[t].loc[:, "WTUR_W"],
value_min=0.02 * turb_capac,
value_max=1.2 * turb_capac,
)

threshold_wind_bin = self._run.wind_bin_thresh
# Apply bin-based filter
dic[t].loc[:, "flag_bin"] = filters.bin_filter(
bin_col=dic[t].loc[:, "WTUR_W"],
value_col=dic[t].loc[:, "WMET_HorWdSpd"],
bin_width=0.06 * turb_capac,
threshold=threshold_wind_bin, # wind bin thresh
center_type="median",
bin_min=np.round(0.01 * turb_capac),
bin_max=np.round(max_bin),
threshold_type="std",
direction="all",
)

# Create a 'final' flag which is true if any of the previous flags are true
dic[t].loc[:, "flag_final"] = (
(dic[t].loc[:, "flag_range"])
| (dic[t].loc[:, "flag_window"])
| (dic[t].loc[:, "flag_bin"])
| (dic[t].loc[:, "flag_frozen"])
self.scada_dict[t].loc[:, "flag_final"] = (
scada_df.flag_range
| scada_df.flag_window
| scada_df.flag_bin
| scada_df.flag_frozen
)

# Set negative turbine data to zero
dic[t].loc[dic[t]["flag_neg"], "WTUR_W"] = 0

def setup_daily_reanalysis_data(self) -> None:
"""
Process reanalysis data to daily means for later use in the GAM model.
Expand Down
81 changes: 46 additions & 35 deletions openoa/utils/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@ def std_range_flag(
raise ValueError("The inputs to `col` and `threshold` must be the same length.")

subset = data.loc[:, col].copy()
data_mean = subset.mean(axis=0)
data_std = subset.std(axis=0) * np.array(threshold)
data_mean = np.nanmean(subset.values, axis=0)
data_std = np.nanstd(subset.values, ddof=1, axis=0) * np.array(threshold)
flag = subset.le(data_mean - data_std) | subset.ge(data_mean + data_std)

# Return back a pd.Series if one was provided, else a pd.DataFrame
Expand Down Expand Up @@ -232,47 +232,58 @@ def bin_filter(

# Set bin min and max values if not passed to function
if bin_min is None:
bin_min = bin_col.min()
bin_min = np.min(bin_col.values)
if bin_max is None:
bin_max = bin_col.max()
bin_max = np.max(bin_col.values)

# Define bin edges
bin_edges = np.arange(bin_min, bin_max, bin_width)

# Ensure the last bin edge value is bin_max
bin_edges = np.unique(np.clip(np.append(bin_edges, bin_max), bin_min, bin_max))

# Define empty flag of 'False' values with indices matching value_col
flag = pd.Series(index=value_col.index, data=False)

# Loop through bins and applying flagging
nbins = len(bin_edges)
for i in range(nbins - 1):
# Get data that falls within bin
y_bin = value_col.loc[(bin_col <= bin_edges[i + 1]) & (bin_col > bin_edges[i])]

# Get center of binned data
center = y_bin.mean() if center_type == "mean" else y_bin.median()

# Define threshold of data flag
if threshold_type == "std":
deviation = y_bin.std() * threshold
elif threshold_type == "scalar":
deviation = threshold
else: # median absolute deviation (mad)
deviation = (y_bin - center).abs().median() * threshold

# Perform flagging depending on specified direction
if direction == "above":
flag_bin = y_bin > (center + deviation)
elif direction == "below":
flag_bin = y_bin < (center - deviation)
else:
flag_bin = (y_bin > (center + deviation)) | (y_bin < (center - deviation))

# Record flags in final flag column
flag.loc[flag_bin.index] = flag_bin

# Bin the data and recreate the comparison data as a multi-column data frame
which_bin_col = np.digitize(bin_col, bin_edges, right=True)

# Create the flag values as a matrix with each column being the timestamp's binned value,
# e.g., all columns values are NaN if the data point is not in that bin
flag_vals = (
value_col.to_frame().set_index(pd.Series(which_bin_col, name="bin"), append=True).unstack()
)
drop = [i for i, el in enumerate(flag_vals.columns.names) if el != "bin"]
flag_vals.columns = flag_vals.columns.droplevel(drop).rename(None)

# Create a False array as default, so flags are set to True
flag_df = pd.DataFrame(np.zeros_like(flag_vals, dtype=bool), index=flag_vals.index)

# Get center of binned data
if center_type == "median":
center = np.nanmedian(flag_vals.values, axis=0)
else:
center = np.nanmean(flag_vals.values, axis=0)
center = pd.DataFrame(
np.full(flag_vals.shape, center),
index=flag_vals.index,
columns=flag_vals.columns,
)

# Define threshold of data flag
if threshold_type == "std":
deviation = np.nanstd(flag_vals.values, ddof=1, axis=0) * threshold
elif threshold_type == "scalar":
deviation = threshold
else: # median absolute deviation (mad)
deviation = np.nanmedian(np.abs(flag_vals.values - center.values), axis=0) * threshold

# Perform flagging depending on specified direction
if direction in ("above", "all"):
flag_df |= flag_vals > center + deviation
if direction in ("below", "all"):
flag_df |= flag_vals < center - deviation

# Get all instances where the value is True, and reset any values outside the bin limits
flag = pd.Series(np.nanmax(flag_df, axis=1), index=flag_df.index)
flag.loc[(bin_col <= bin_min) | (bin_col > bin_max)] = False
return flag


Expand Down
2 changes: 1 addition & 1 deletion openoa/utils/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def percent_nan(col: pd.Series | str, data: pd.DataFrame = None):
Returns:
:obj:`float`: Percentage of NaN data in the data series
"""
return 1 if (denominator := float(col.size)) == 0 else col.isnull().sum() / denominator
return 1 if (denominator := float(col.size)) == 0 else np.isnan(col.values).sum() / denominator


@series_method(data_cols=["dt_col"])
Expand Down
8 changes: 5 additions & 3 deletions test/unit/test_timeseries_toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,11 @@ def test_num_hours(self):

def test_percent_nan(self):
test_dict = {}
test_dict["a"] = pd.Series(["", 1, 2, 1e5, np.Inf])
test_dict["b"] = pd.Series(["", np.nan, 2, 1e5, np.Inf])
test_dict["c"] = pd.Series([np.nan, 1, 2, 1e5, np.nan])

# All should be float Series given PlantData requirements
test_dict["a"] = pd.Series([True, 1, 2, 1e5, np.Inf]).astype(float)
test_dict["b"] = pd.Series([False, np.nan, 2, 1e5, np.Inf]).astype(float)
test_dict["c"] = pd.Series([np.nan, 1, 2, 1e5, np.nan]).astype(float)

nan_values = {"a": 0.0, "b": 0.2, "c": 0.4}

Expand Down
Loading