From af4b68f20d7d3f61c99b246cbf81b4648dbedf88 Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Wed, 10 May 2023 17:05:34 +0800
Subject: [PATCH 01/11] Add util function to help automatically get horizon

---
 qlib/utils/data.py       | 20 +++++++++++++++++++-
 tests/misc/test_utils.py | 18 ++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/qlib/utils/data.py b/qlib/utils/data.py
index 6c62f75583..ad2b0b7e69 100644
--- a/qlib/utils/data.py
+++ b/qlib/utils/data.py
@@ -3,10 +3,12 @@
 """
 This module covers some utility functions that operate on data or basic object
 """
+import re
 from copy import deepcopy
 from typing import List, Union
-import pandas as pd
+
 import numpy as np
+import pandas as pd
 
 
 def robust_zscore(x: pd.Series, zscore=False):
@@ -103,3 +105,19 @@ def update_config(base_config: dict, ext_config: Union[dict, List[dict]]):
                     # one of then are not dict. Then replace
                     base_config[key] = ec[key]
     return base_config
+
+
+def guess_horizon(label):
+    """
+        Try to guess the horizon by parsing label
+    """
+    regex = r"Ref\(\s*\$[a-zA-Z]+,\s*-(\d+)\)"
+    horizon_list = [int(x) for x in re.findall(regex, label)]
+
+    if len(horizon_list) == 0:
+        return None
+    max_horizon = max(horizon_list)
+    # Unlikely the label doesn't use future information
+    if max_horizon < 2:
+        return None
+    return max_horizon
\ No newline at end of file
diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index 2be792faf7..0e397a22da 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -9,6 +9,7 @@
 from qlib.log import TimeInspector
 from qlib.constant import REG_CN, REG_US, REG_TW
 from qlib.utils.time import cal_sam_minute as cal_sam_minute_new, get_min_cal, CN_TIME, US_TIME, TW_TIME
+from qlib.utils.data import guess_horizon
 
 REG_MAP = {REG_CN: CN_TIME, REG_US: US_TIME, REG_TW: TW_TIME}
 
@@ -111,6 +112,23 @@ def gen_args(cal: List):
                 for args in args_l:
                     cal_sam_minute_new(*args, region=region)
 
+class DataUtils(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        init()
+
+    def test_guess_horizon(self):
+        label1 = "Ref($close, -2) / Ref($close, -1) - 1"
+        result1 = guess_horizon(label1)
+        assert(result1 == 2)
+
+        label1 = "Ref($close, -5) / Ref($close, -1) - 1"
+        result1 = guess_horizon(label1)
+        assert(result1 == 5)
+
+        label1 = "Ref($close, -1) / Ref($close, -1) - 1"
+        result1 = guess_horizon(label1)
+        assert(result1 is None)
 
 if __name__ == "__main__":
     unittest.main()

From 1146f1730fa143850ac5f58072c687d6fe39f4ff Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Wed, 10 May 2023 17:14:13 +0800
Subject: [PATCH 02/11] Reformat for CI

---
 tests/misc/test_utils.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index 0e397a22da..095841fcd9 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -118,17 +118,17 @@ def setUpClass(cls):
         init()
 
     def test_guess_horizon(self):
-        label1 = "Ref($close, -2) / Ref($close, -1) - 1"
-        result1 = guess_horizon(label1)
-        assert(result1 == 2)
+        label = "Ref($close, -2) / Ref($close, -1) - 1"
+        result = guess_horizon(label)
+        assert result == 2
 
-        label1 = "Ref($close, -5) / Ref($close, -1) - 1"
-        result1 = guess_horizon(label1)
-        assert(result1 == 5)
+        label = "Ref($close, -5) / Ref($close, -1) - 1"
+        result = guess_horizon(label)
+        assert result == 5
 
-        label1 = "Ref($close, -1) / Ref($close, -1) - 1"
-        result1 = guess_horizon(label1)
-        assert(result1 is None)
+        label = "Ref($close, -1) / Ref($close, -1) - 1"
+        result = guess_horizon(label)
+        assert result is None
 
 if __name__ == "__main__":
     unittest.main()

From d779bddf2e7b38f04cb68060cecccbcdbbc72fd6 Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Wed, 10 May 2023 17:38:33 +0800
Subject: [PATCH 03/11] Leverage horizon change

---
 qlib/contrib/data/dataset.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index 9ce522cc06..3554a94d10 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -6,6 +6,7 @@
 import warnings
 import numpy as np
 import pandas as pd
+from qlib.utils.data import guess_horizon
 
 from qlib.data.dataset import DatasetH
 
@@ -130,6 +131,16 @@ def __init__(
         input_size=None,
         **kwargs,
     ):
+        if horizon == 0:
+            # Try to guess horizon
+            if type(handler) is dict:
+                label = handler.get("kwargs", {}).get("label",[""])[0]
+            else:
+                label = handler.data_loader.fields["label"][0][0]
+            horizon = guess_horizon(label)
+            # Failed to guess horizon, set back to 0
+            if horizon is None:
+                horizon = 0
 
         assert num_states == 0 or horizon > 0, "please specify `horizon` to avoid data leakage"
         assert memory_mode in ["sample", "daily"], "unsupported memory mode"

From dfe504567e630bf26567f4fbda85791338c05e0c Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Wed, 10 May 2023 17:42:11 +0800
Subject: [PATCH 04/11] Update config yaml

---
 examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml     | 3 +--
 .../benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml     | 1 -
 examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml     | 3 +--
 qlib/utils/data.py                                            | 2 +-
 tests/misc/test_utils.py                                      | 4 ++--
 5 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
index c86f87fc65..001d5885e5 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
@@ -31,7 +31,7 @@ data_handler_config: &data_handler_config
         fields_group: label
   label: ["Ref($close, -2) / Ref($close, -1) - 1"]
 
-num_states: &num_states 3
+num_states: &num_states 5
 
 memory_mode: &memory_mode sample
 
@@ -112,7 +112,6 @@ task:
         valid: [2015-01-01, 2016-12-31]
         test: [2017-01-01, 2020-08-01]
       seq_len: 60
-      horizon: 2
       input_size:
       num_states: *num_states
       batch_size: 1024
diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml
index 75f18f3ee6..66815de711 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml
@@ -106,7 +106,6 @@ task:
         valid: [2015-01-01, 2016-12-31]
         test: [2017-01-01, 2020-08-01]
       seq_len: 60
-      horizon: 2
       input_size:
       num_states: *num_states
       batch_size: 1024
diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
index 9ab5b904ba..139d6cf431 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
@@ -25,7 +25,7 @@ data_handler_config: &data_handler_config
         fields_group: label
   label: ["Ref($close, -2) / Ref($close, -1) - 1"]
 
-num_states: &num_states 3
+num_states: &num_states 5
 
 memory_mode: &memory_mode sample
 
@@ -106,7 +106,6 @@ task:
         valid: [2015-01-01, 2016-12-31]
         test: [2017-01-01, 2020-08-01]
       seq_len: 60
-      horizon: 2
       input_size: 6
       num_states: *num_states
       batch_size: 1024
diff --git a/qlib/utils/data.py b/qlib/utils/data.py
index ad2b0b7e69..870739975b 100644
--- a/qlib/utils/data.py
+++ b/qlib/utils/data.py
@@ -120,4 +120,4 @@ def guess_horizon(label):
     # Unlikely the label doesn't use future information
     if max_horizon < 2:
         return None
-    return max_horizon
\ No newline at end of file
+    return max_horizon + 1
\ No newline at end of file
diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index 095841fcd9..f63a0d7697 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -120,11 +120,11 @@ def setUpClass(cls):
     def test_guess_horizon(self):
         label = "Ref($close, -2) / Ref($close, -1) - 1"
         result = guess_horizon(label)
-        assert result == 2
+        assert result == 3
 
         label = "Ref($close, -5) / Ref($close, -1) - 1"
         result = guess_horizon(label)
-        assert result == 5
+        assert result == 6
 
         label = "Ref($close, -1) / Ref($close, -1) - 1"
         result = guess_horizon(label)

From d62206ce9a4baf53d621ca0e6d0ede44c5c76ee6 Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Wed, 10 May 2023 17:47:20 +0800
Subject: [PATCH 05/11] Update for formatting

---
 qlib/contrib/data/dataset.py | 2 +-
 qlib/utils/data.py           | 2 +-
 tests/misc/test_utils.py     | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index 3554a94d10..08d86fe732 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -134,7 +134,7 @@ def __init__(
         if horizon == 0:
             # Try to guess horizon
             if type(handler) is dict:
-                label = handler.get("kwargs", {}).get("label",[""])[0]
+                label = handler.get("kwargs", {}).get("label", [""])[0]
             else:
                 label = handler.data_loader.fields["label"][0][0]
             horizon = guess_horizon(label)
diff --git a/qlib/utils/data.py b/qlib/utils/data.py
index 870739975b..978b920c0c 100644
--- a/qlib/utils/data.py
+++ b/qlib/utils/data.py
@@ -109,7 +109,7 @@ def update_config(base_config: dict, ext_config: Union[dict, List[dict]]):
 
 def guess_horizon(label):
     """
-        Try to guess the horizon by parsing label
+    Try to guess the horizon by parsing label
     """
     regex = r"Ref\(\s*\$[a-zA-Z]+,\s*-(\d+)\)"
     horizon_list = [int(x) for x in re.findall(regex, label)]
diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index f63a0d7697..69c5de9edb 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -111,7 +111,6 @@ def gen_args(cal: List):
             with TimeInspector.logt():
                 for args in args_l:
                     cal_sam_minute_new(*args, region=region)
-
 class DataUtils(TestCase):
     @classmethod
     def setUpClass(cls):
@@ -129,6 +128,5 @@ def test_guess_horizon(self):
         label = "Ref($close, -1) / Ref($close, -1) - 1"
         result = guess_horizon(label)
         assert result is None
-
 if __name__ == "__main__":
     unittest.main()

From 67674628eaabe97ec0478b0da3e25b7c12b5f948 Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Wed, 10 May 2023 23:05:59 +0800
Subject: [PATCH 06/11] Adapt to pickled handler

---
 qlib/contrib/data/dataset.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index 08d86fe732..c8aadbaec1 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as pd
 from qlib.utils.data import guess_horizon
+from qlib.utils import init_instance_by_config
 
 from qlib.data.dataset import DatasetH
 
@@ -134,11 +135,11 @@ def __init__(
         if horizon == 0:
             # Try to guess horizon
             if type(handler) is dict:
-                label = handler.get("kwargs", {}).get("label", [""])[0]
-            else:
-                label = handler.data_loader.fields["label"][0][0]
+                handler = init_instance_by_config(handler)
+            elif type(handler) is str: # pickled handler
+                handler = init_instance_by_config(handler)
+            label = handler.data_loader.fields["label"][0][0]
             horizon = guess_horizon(label)
-            # Failed to guess horizon, set back to 0
             if horizon is None:
                 horizon = 0
 

From 54d12f14a3a5ee83d1f23ae3dc542a9664fc841d Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Thu, 11 May 2023 08:39:02 +0800
Subject: [PATCH 07/11] Fix CI error

---
 examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml | 2 +-
 examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml | 2 +-
 qlib/contrib/data/dataset.py                              | 2 +-
 qlib/utils/data.py                                        | 2 +-
 tests/misc/test_utils.py                                  | 4 ++++
 5 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
index 001d5885e5..77172ff11f 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
@@ -31,7 +31,7 @@ data_handler_config: &data_handler_config
         fields_group: label
   label: ["Ref($close, -2) / Ref($close, -1) - 1"]
 
-num_states: &num_states 5
+num_states: &num_states 3
 
 memory_mode: &memory_mode sample
 
diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
index 139d6cf431..a4a2486a58 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
@@ -25,7 +25,7 @@ data_handler_config: &data_handler_config
         fields_group: label
   label: ["Ref($close, -2) / Ref($close, -1) - 1"]
 
-num_states: &num_states 5
+num_states: &num_states 3
 
 memory_mode: &memory_mode sample
 
diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index c8aadbaec1..f881da70f4 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -136,7 +136,7 @@ def __init__(
             # Try to guess horizon
             if type(handler) is dict:
                 handler = init_instance_by_config(handler)
-            elif type(handler) is str: # pickled handler
+            elif type(handler) is str:  # pickled handler
                 handler = init_instance_by_config(handler)
             label = handler.data_loader.fields["label"][0][0]
             horizon = guess_horizon(label)
diff --git a/qlib/utils/data.py b/qlib/utils/data.py
index 978b920c0c..1ebd7ef68b 100644
--- a/qlib/utils/data.py
+++ b/qlib/utils/data.py
@@ -120,4 +120,4 @@ def guess_horizon(label):
     # Unlikely the label doesn't use future information
     if max_horizon < 2:
         return None
-    return max_horizon + 1
\ No newline at end of file
+    return max_horizon + 1
diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index 69c5de9edb..2e809df8de 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -111,6 +111,8 @@ def gen_args(cal: List):
             with TimeInspector.logt():
                 for args in args_l:
                     cal_sam_minute_new(*args, region=region)
+
+                    
 class DataUtils(TestCase):
     @classmethod
     def setUpClass(cls):
@@ -128,5 +130,7 @@ def test_guess_horizon(self):
         label = "Ref($close, -1) / Ref($close, -1) - 1"
         result = guess_horizon(label)
         assert result is None
+
+
 if __name__ == "__main__":
     unittest.main()

From e3a6b2d278147b920d1920ac28f33454d66762a9 Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Thu, 11 May 2023 17:23:55 +0800
Subject: [PATCH 08/11] remove blank

---
 tests/misc/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index 2e809df8de..03e782359c 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -112,7 +112,7 @@ def gen_args(cal: List):
                 for args in args_l:
                     cal_sam_minute_new(*args, region=region)
 
-                    
+
 class DataUtils(TestCase):
     @classmethod
     def setUpClass(cls):

From b3af98350f9dc16e267a2ff75e1ff76a35839c53 Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Fri, 12 May 2023 17:49:13 +0800
Subject: [PATCH 09/11] Fix lint

---
 qlib/contrib/data/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index f881da70f4..798d1dff39 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -134,9 +134,9 @@ def __init__(
     ):
         if horizon == 0:
             # Try to guess horizon
-            if type(handler) is dict:
+            if isinstance(handler, dict):
                 handler = init_instance_by_config(handler)
-            elif type(handler) is str:  # pickled handler
+            elif isinstance(handler, str):  # pickled handler
                 handler = init_instance_by_config(handler)
             label = handler.data_loader.fields["label"][0][0]
             horizon = guess_horizon(label)

From 091dae6c60ef6dd3153ac896ebfde19fd3552f1b Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Fri, 12 May 2023 18:24:23 +0800
Subject: [PATCH 10/11] Update tests

---
 qlib/contrib/data/dataset.py | 2 --
 qlib/utils/data.py           | 4 ++--
 tests/misc/test_utils.py     | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index 798d1dff39..5535b6efe2 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -140,8 +140,6 @@ def __init__(
                 handler = init_instance_by_config(handler)
             label = handler.data_loader.fields["label"][0][0]
             horizon = guess_horizon(label)
-            if horizon is None:
-                horizon = 0
 
         assert num_states == 0 or horizon > 0, "please specify `horizon` to avoid data leakage"
         assert memory_mode in ["sample", "daily"], "unsupported memory mode"
diff --git a/qlib/utils/data.py b/qlib/utils/data.py
index 1ebd7ef68b..7b196c50ec 100644
--- a/qlib/utils/data.py
+++ b/qlib/utils/data.py
@@ -115,9 +115,9 @@ def guess_horizon(label):
     horizon_list = [int(x) for x in re.findall(regex, label)]
 
     if len(horizon_list) == 0:
-        return None
+        return 0
     max_horizon = max(horizon_list)
     # Unlikely the label doesn't use future information
     if max_horizon < 2:
-        return None
+        return 0
     return max_horizon + 1
diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index 03e782359c..5ea0fe0989 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -129,7 +129,7 @@ def test_guess_horizon(self):
 
         label = "Ref($close, -1) / Ref($close, -1) - 1"
         result = guess_horizon(label)
-        assert result is None
+        assert result is 0
 
 
 if __name__ == "__main__":

From adb3d045438e668d7a0fcdcecd978586067c0228 Mon Sep 17 00:00:00 2001
From: Di Chen <chenditc@umich.edu>
Date: Thu, 18 May 2023 22:08:57 +0800
Subject: [PATCH 11/11] Remove redundant check

---
 qlib/contrib/data/dataset.py | 4 +---
 tests/misc/test_utils.py     | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index 5535b6efe2..ccc667ae91 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -134,9 +134,7 @@ def __init__(
     ):
         if horizon == 0:
             # Try to guess horizon
-            if isinstance(handler, dict):
-                handler = init_instance_by_config(handler)
-            elif isinstance(handler, str):  # pickled handler
+            if isinstance(handler, (dict, str)):
                 handler = init_instance_by_config(handler)
             label = handler.data_loader.fields["label"][0][0]
             horizon = guess_horizon(label)
diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py
index 5ea0fe0989..53108a75e3 100644
--- a/tests/misc/test_utils.py
+++ b/tests/misc/test_utils.py
@@ -129,7 +129,7 @@ def test_guess_horizon(self):
 
         label = "Ref($close, -1) / Ref($close, -1) - 1"
         result = guess_horizon(label)
-        assert result is 0
+        assert result == 0
 
 
 if __name__ == "__main__":