Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Facilitate region-aggregation with inconsistent model scenario region time #792

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Release v2.0.0

- [#792](https://github.com/IAMconsortium/pyam/pull/792) Support region-aggregation when the weights-index is a superset of the data-index

## Highlights

- Use **ixmp4** as dependency for better integration with the IIASA Scenario Explorer database infrastructure
Expand Down
29 changes: 25 additions & 4 deletions pyam/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from itertools import compress

from pyam.index import replace_index_values
from pyam.logging import adjust_log_level
from pyam.logging import adjust_log_level, format_log_message
from pyam.str import find_depth, is_str, reduce_hierarchy
from pyam.utils import KNOWN_FUNCS, is_list_like, to_list
from pyam._compare import _compare
Expand Down Expand Up @@ -116,7 +116,10 @@ def _aggregate_region(
raise ValueError("Using weights and components in one operation not supported.")

# default subregions to all regions other than `region`
subregions = subregions or df._all_other_regions(region, variable)
if weight is None:
subregions = subregions or df._all_other_regions(region, variable)
else:
subregions = subregions or df._all_other_regions(region, [variable, weight])

if not len(subregions):
logger.info(
Expand Down Expand Up @@ -214,10 +217,28 @@ def _agg_weight(data, weight, method, drop_negative_weights):
raise ValueError("Only method 'np.sum' allowed for weighted average.")

weight = weight.droplevel(["variable", "unit"])
data_index = data.droplevel(["variable", "unit"]).index

# check that weights exist for all data rows
missing_weights = data_index.difference(weight.index)
if not missing_weights.empty:
raise ValueError(
format_log_message(
"Missing weights for the following data rows", missing_weights
)
)

if not data.droplevel(["variable", "unit"]).index.equals(weight.index):
raise ValueError("Inconsistent index between variable and weight!")
# warn if no data exists for available weights
missing_data = weight.index.difference(data_index)
if not missing_data.empty:
logger.warning(
format_log_message(
"Ignoring weights for the following missing data rows", missing_data
)
)
weight[missing_data] = np.nan

# remove (and warn) negative values from weights due to strange behavior
if drop_negative_weights is True:
if any(weight < 0):
logger.warning(
Expand Down
65 changes: 56 additions & 9 deletions tests/test_feature_aggregate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
import logging
import re

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -238,7 +238,7 @@ def test_check_aggregate_region_log(simple_df, caplog):
@pytest.mark.parametrize(
"variable",
(
("Primary Energy"),
"Primary Energy",
(["Primary Energy", "Primary Energy|Coal", "Primary Energy|Wind"]),
),
)
Expand All @@ -252,7 +252,7 @@ def test_aggregate_region_append(simple_df, variable):
@pytest.mark.parametrize(
"variable",
(
("Primary Energy"),
"Primary Energy",
(["Primary Energy", "Primary Energy|Coal", "Primary Energy|Wind"]),
),
)
Expand Down Expand Up @@ -315,7 +315,13 @@ def test_aggregate_region_with_weights(simple_df, caplog):
exp = simple_df.filter(variable=v, region="World")
assert_iamframe_equal(simple_df.aggregate_region(v, weight=w), exp)

# test that dropping negative weights works as expected

def test_aggregate_region_with_negative_weights(simple_df, caplog):
# carbon price shouldn't be summed but be weighted by emissions
v = "Price|Carbon"
w = "Emissions|CO2"

# dropping negative weights works as expected
neg_weights_df = simple_df.copy()
neg_weights_df._data[18] = -6
exp = simple_df.filter(variable=v, region="World", year=2010)
Expand All @@ -329,24 +335,65 @@ def test_aggregate_region_with_weights(simple_df, caplog):
idx = caplog.messages.index(msg)
assert caplog.records[idx].levelname == "WARNING"

# test that not dropping negative weights works as expected
# *not* dropping negative weights works as expected
exp = simple_df.filter(variable=v, region="World")
exp._data[0] = -8
assert_iamframe_equal(
neg_weights_df.aggregate_region(v, weight=w, drop_negative_weights=False), exp
)


def test_aggregate_region_with_weights_raises(simple_df):
@pytest.mark.parametrize(
"filter_arg,log_message",
(
(dict(year=2010), ""),
(dict(), "model_a scen_a reg_b 2005\n1 "),
),
)
def test_aggregate_region_with_weights_inconsistent_index(
simple_df, caplog, filter_arg, log_message
):
# carbon price shouldn't be summed but be weighted by emissions
v = "Price|Carbon"
w = "Emissions|CO2"

# inconsistent index of variable and weight raises an error
_df = simple_df.filter(variable=w, region="reg_b", keep=False)
with pytest.raises(ValueError, match="Inconsistent index between variable and wei"):
log_message = "\n0 " + log_message + "model_a scen_a reg_b 2010"
if simple_df.time_domain == "datetime":
time_col = " time"
log_message = log_message.replace(" 2005", "2005-06-17").replace(
" 2010", "2010-07-21"
)
else:
time_col = "year"

# missing weight row raises an error
_df = simple_df.filter(variable=w, region="reg_b", keep=False, **filter_arg)
match = r"Missing weights for the following data.*\n.*" + re.escape(log_message)
with pytest.raises(ValueError, match=match):
_df.aggregate_region(v, weight=w)

# missing data row prints a warning (data-index is a subset of weight-index)
exp = simple_df.filter(variable=v, region="World")
if not filter_arg:
exp._data[0] = 1.0
exp._data[1] = 30.0
_df = simple_df.filter(variable=v, region="reg_b", keep=False, **filter_arg)
assert_iamframe_equal(_df.aggregate_region(v, weight=w), exp)

msg = (
"Ignoring weights for the following missing data rows:\n"
f" model scenario region {time_col}" + log_message
)

idx = caplog.messages.index(msg)
assert caplog.records[idx].levelname == "WARNING"


def test_aggregate_region_with_weights_raises(simple_df):
# carbon price shouldn't be summed but be weighted by emissions
v = "Price|Carbon"
w = "Emissions|CO2"

# using weight and method other than 'sum' raises an error
pytest.raises(ValueError, simple_df.aggregate_region, v, method="max", weight="bar")

Expand Down
2 changes: 1 addition & 1 deletion tests/test_feature_growth_rate.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,5 @@ def test_growth_rate_timeseries(x2010, rates):
def test_growth_rate_timeseries_fails(value):
"""Check that a timeseries reaching/crossing 0 raises"""

with pytest.raises(ValueError, match="Cannot compute growth rate when*."):
with pytest.raises(ValueError, match="Cannot compute growth rate when"):
growth_rate(pd.Series([1.0, value]))
2 changes: 1 addition & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_not_a_file():

def test_io_list():
# initializing with a list raises an error
match = r"Initializing from list is not supported,*."
match = "Initializing from list is not supported,"
with pytest.raises(ValueError, match=match):
IamDataFrame([1, 2])

Expand Down
Loading