From af4b68f20d7d3f61c99b246cbf81b4648dbedf88 Mon Sep 17 00:00:00 2001 From: Di Chen Date: Wed, 10 May 2023 17:05:34 +0800 Subject: [PATCH 01/11] Add util function to help automatically get horizon --- qlib/utils/data.py | 20 +++++++++++++++++++- tests/misc/test_utils.py | 18 ++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/qlib/utils/data.py b/qlib/utils/data.py index 6c62f75583..ad2b0b7e69 100644 --- a/qlib/utils/data.py +++ b/qlib/utils/data.py @@ -3,10 +3,12 @@ """ This module covers some utility functions that operate on data or basic object """ +import re from copy import deepcopy from typing import List, Union -import pandas as pd + import numpy as np +import pandas as pd def robust_zscore(x: pd.Series, zscore=False): @@ -103,3 +105,19 @@ def update_config(base_config: dict, ext_config: Union[dict, List[dict]]): # one of then are not dict. Then replace base_config[key] = ec[key] return base_config + + +def guess_horizon(label): + """ + Try to guess the horizon by parsing label + """ + regex = r"Ref\(\s*\$[a-zA-Z]+,\s*-(\d+)\)" + horizon_list = [int(x) for x in re.findall(regex, label)] + + if len(horizon_list) == 0: + return None + max_horizon = max(horizon_list) + # Unlikely the label doesn't use future information + if max_horizon < 2: + return None + return max_horizon \ No newline at end of file diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index 2be792faf7..0e397a22da 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -9,6 +9,7 @@ from qlib.log import TimeInspector from qlib.constant import REG_CN, REG_US, REG_TW from qlib.utils.time import cal_sam_minute as cal_sam_minute_new, get_min_cal, CN_TIME, US_TIME, TW_TIME +from qlib.utils.data import guess_horizon REG_MAP = {REG_CN: CN_TIME, REG_US: US_TIME, REG_TW: TW_TIME} @@ -111,6 +112,23 @@ def gen_args(cal: List): for args in args_l: cal_sam_minute_new(*args, region=region) +class DataUtils(TestCase): + @classmethod + def 
setUpClass(cls): + init() + + def test_guess_horizon(self): + label1 = "Ref($close, -2) / Ref($close, -1) - 1" + result1 = guess_horizon(label1) + assert(result1 == 2) + + label1 = "Ref($close, -5) / Ref($close, -1) - 1" + result1 = guess_horizon(label1) + assert(result1 == 5) + + label1 = "Ref($close, -1) / Ref($close, -1) - 1" + result1 = guess_horizon(label1) + assert(result1 is None) if __name__ == "__main__": unittest.main() From 1146f1730fa143850ac5f58072c687d6fe39f4ff Mon Sep 17 00:00:00 2001 From: Di Chen Date: Wed, 10 May 2023 17:14:13 +0800 Subject: [PATCH 02/11] Reformat for CI --- tests/misc/test_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index 0e397a22da..095841fcd9 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -118,17 +118,17 @@ def setUpClass(cls): init() def test_guess_horizon(self): - label1 = "Ref($close, -2) / Ref($close, -1) - 1" - result1 = guess_horizon(label1) - assert(result1 == 2) + label = "Ref($close, -2) / Ref($close, -1) - 1" + result = guess_horizon(label) + assert result == 2 - label1 = "Ref($close, -5) / Ref($close, -1) - 1" - result1 = guess_horizon(label1) - assert(result1 == 5) + label = "Ref($close, -5) / Ref($close, -1) - 1" + result = guess_horizon(label) + assert result == 5 - label1 = "Ref($close, -1) / Ref($close, -1) - 1" - result1 = guess_horizon(label1) - assert(result1 is None) + label = "Ref($close, -1) / Ref($close, -1) - 1" + result = guess_horizon(label) + assert result is None if __name__ == "__main__": unittest.main() From d779bddf2e7b38f04cb68060cecccbcdbbc72fd6 Mon Sep 17 00:00:00 2001 From: Di Chen Date: Wed, 10 May 2023 17:38:33 +0800 Subject: [PATCH 03/11] Leverage horizon change --- qlib/contrib/data/dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index 9ce522cc06..3554a94d10 100644 --- 
a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -6,6 +6,7 @@ import warnings import numpy as np import pandas as pd +from qlib.utils.data import guess_horizon from qlib.data.dataset import DatasetH @@ -130,6 +131,16 @@ def __init__( input_size=None, **kwargs, ): + if horizon == 0: + # Try to guess horizon + if type(handler) is dict: + label = handler.get("kwargs", {}).get("label",[""])[0] + else: + label = handler.data_loader.fields["label"][0][0] + horizon = guess_horizon(label) + # Failed to guess horizon, set back to 0 + if horizon is None: + horizon = 0 assert num_states == 0 or horizon > 0, "please specify `horizon` to avoid data leakage" assert memory_mode in ["sample", "daily"], "unsupported memory mode" From dfe504567e630bf26567f4fbda85791338c05e0c Mon Sep 17 00:00:00 2001 From: Di Chen Date: Wed, 10 May 2023 17:42:11 +0800 Subject: [PATCH 04/11] Update config yaml --- examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml | 3 +-- .../benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml | 1 - examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml | 3 +-- qlib/utils/data.py | 2 +- tests/misc/test_utils.py | 4 ++-- 5 files changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml index c86f87fc65..001d5885e5 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml @@ -31,7 +31,7 @@ data_handler_config: &data_handler_config fields_group: label label: ["Ref($close, -2) / Ref($close, -1) - 1"] -num_states: &num_states 3 +num_states: &num_states 5 memory_mode: &memory_mode sample @@ -112,7 +112,6 @@ task: valid: [2015-01-01, 2016-12-31] test: [2017-01-01, 2020-08-01] seq_len: 60 - horizon: 2 input_size: num_states: *num_states batch_size: 1024 diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml
b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml index 75f18f3ee6..66815de711 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml @@ -106,7 +106,6 @@ task: valid: [2015-01-01, 2016-12-31] test: [2017-01-01, 2020-08-01] seq_len: 60 - horizon: 2 input_size: num_states: *num_states batch_size: 1024 diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml index 9ab5b904ba..139d6cf431 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml @@ -25,7 +25,7 @@ data_handler_config: &data_handler_config fields_group: label label: ["Ref($close, -2) / Ref($close, -1) - 1"] -num_states: &num_states 3 +num_states: &num_states 5 memory_mode: &memory_mode sample @@ -106,7 +106,6 @@ task: valid: [2015-01-01, 2016-12-31] test: [2017-01-01, 2020-08-01] seq_len: 60 - horizon: 2 input_size: 6 num_states: *num_states batch_size: 1024 diff --git a/qlib/utils/data.py b/qlib/utils/data.py index ad2b0b7e69..870739975b 100644 --- a/qlib/utils/data.py +++ b/qlib/utils/data.py @@ -120,4 +120,4 @@ def guess_horizon(label): # Unlikely the label doesn't use future information if max_horizon < 2: return None - return max_horizon \ No newline at end of file + return max_horizon + 1 \ No newline at end of file diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index 095841fcd9..f63a0d7697 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -120,11 +120,11 @@ def setUpClass(cls): def test_guess_horizon(self): label = "Ref($close, -2) / Ref($close, -1) - 1" result = guess_horizon(label) - assert result == 2 + assert result == 3 label = "Ref($close, -5) / Ref($close, -1) - 1" result = guess_horizon(label) - assert result == 5 + assert result == 6 label = "Ref($close, -1) / Ref($close, -1) - 1" result = 
guess_horizon(label) From d62206ce9a4baf53d621ca0e6d0ede44c5c76ee6 Mon Sep 17 00:00:00 2001 From: Di Chen Date: Wed, 10 May 2023 17:47:20 +0800 Subject: [PATCH 05/11] Update for formatting --- qlib/contrib/data/dataset.py | 2 +- qlib/utils/data.py | 2 +- tests/misc/test_utils.py | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index 3554a94d10..08d86fe732 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -134,7 +134,7 @@ def __init__( if horizon == 0: # Try to guess horizon if type(handler) is dict: - label = handler.get("kwargs", {}).get("label",[""])[0] + label = handler.get("kwargs", {}).get("label", [""])[0] else: label = handler.data_loader.fields["label"][0][0] horizon = guess_horizon(label) diff --git a/qlib/utils/data.py b/qlib/utils/data.py index 870739975b..978b920c0c 100644 --- a/qlib/utils/data.py +++ b/qlib/utils/data.py @@ -109,7 +109,7 @@ def update_config(base_config: dict, ext_config: Union[dict, List[dict]]): def guess_horizon(label): """ - Try to guess the horizon by parsing label + Try to guess the horizon by parsing label """ regex = r"Ref\(\s*\$[a-zA-Z]+,\s*-(\d+)\)" horizon_list = [int(x) for x in re.findall(regex, label)] diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index f63a0d7697..69c5de9edb 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -111,7 +111,6 @@ def gen_args(cal: List): with TimeInspector.logt(): for args in args_l: cal_sam_minute_new(*args, region=region) - class DataUtils(TestCase): @classmethod def setUpClass(cls): @@ -129,6 +128,5 @@ def test_guess_horizon(self): label = "Ref($close, -1) / Ref($close, -1) - 1" result = guess_horizon(label) assert result is None - if __name__ == "__main__": unittest.main() From 67674628eaabe97ec0478b0da3e25b7c12b5f948 Mon Sep 17 00:00:00 2001 From: Di Chen Date: Wed, 10 May 2023 23:05:59 +0800 Subject: [PATCH 06/11] Adapt to pickled handler 
--- qlib/contrib/data/dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index 08d86fe732..c8aadbaec1 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd from qlib.utils.data import guess_horizon +from qlib.utils import init_instance_by_config from qlib.data.dataset import DatasetH @@ -134,11 +135,11 @@ def __init__( if horizon == 0: # Try to guess horizon if type(handler) is dict: - label = handler.get("kwargs", {}).get("label", [""])[0] - else: - label = handler.data_loader.fields["label"][0][0] + handler = init_instance_by_config(handler) + elif type(handler) is str: # pickled handler + handler = init_instance_by_config(handler) + label = handler.data_loader.fields["label"][0][0] horizon = guess_horizon(label) - # Failed to guess horizon, set back to 0 if horizon is None: horizon = 0 From 54d12f14a3a5ee83d1f23ae3dc542a9664fc841d Mon Sep 17 00:00:00 2001 From: Di Chen Date: Thu, 11 May 2023 08:39:02 +0800 Subject: [PATCH 07/11] Fix CI error --- examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml | 2 +- examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml | 2 +- qlib/contrib/data/dataset.py | 2 +- qlib/utils/data.py | 2 +- tests/misc/test_utils.py | 4 ++++ 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml index 001d5885e5..77172ff11f 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml @@ -31,7 +31,7 @@ data_handler_config: &data_handler_config fields_group: label label: ["Ref($close, -2) / Ref($close, -1) - 1"] -num_states: &num_states 5 +num_states: &num_states 3 memory_mode: &memory_mode sample diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml 
b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml index 139d6cf431..a4a2486a58 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml @@ -25,7 +25,7 @@ data_handler_config: &data_handler_config fields_group: label label: ["Ref($close, -2) / Ref($close, -1) - 1"] -num_states: &num_states 5 +num_states: &num_states 3 memory_mode: &memory_mode sample diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index c8aadbaec1..f881da70f4 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -136,7 +136,7 @@ def __init__( # Try to guess horizon if type(handler) is dict: handler = init_instance_by_config(handler) - elif type(handler) is str: # pickled handler + elif type(handler) is str: # pickled handler handler = init_instance_by_config(handler) label = handler.data_loader.fields["label"][0][0] horizon = guess_horizon(label) diff --git a/qlib/utils/data.py b/qlib/utils/data.py index 978b920c0c..1ebd7ef68b 100644 --- a/qlib/utils/data.py +++ b/qlib/utils/data.py @@ -120,4 +120,4 @@ def guess_horizon(label): # Unlikely the label doesn't use future information if max_horizon < 2: return None - return max_horizon + 1 \ No newline at end of file + return max_horizon + 1 diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index 69c5de9edb..2e809df8de 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -111,6 +111,8 @@ def gen_args(cal: List): with TimeInspector.logt(): for args in args_l: cal_sam_minute_new(*args, region=region) + + class DataUtils(TestCase): @classmethod def setUpClass(cls): @@ -128,5 +130,7 @@ def test_guess_horizon(self): label = "Ref($close, -1) / Ref($close, -1) - 1" result = guess_horizon(label) assert result is None + + if __name__ == "__main__": unittest.main() From e3a6b2d278147b920d1920ac28f33454d66762a9 Mon Sep 17 00:00:00 2001 From: Di Chen Date: Thu, 11 May 2023 17:23:55 +0800 Subject: 
[PATCH 08/11] remove blank --- tests/misc/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index 2e809df8de..03e782359c 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -112,7 +112,7 @@ def gen_args(cal: List): for args in args_l: cal_sam_minute_new(*args, region=region) - + class DataUtils(TestCase): @classmethod def setUpClass(cls): From b3af98350f9dc16e267a2ff75e1ff76a35839c53 Mon Sep 17 00:00:00 2001 From: Di Chen Date: Fri, 12 May 2023 17:49:13 +0800 Subject: [PATCH 09/11] Fix lint --- qlib/contrib/data/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index f881da70f4..798d1dff39 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -134,9 +134,9 @@ def __init__( ): if horizon == 0: # Try to guess horizon - if type(handler) is dict: + if isinstance(handler, dict): handler = init_instance_by_config(handler) - elif type(handler) is str: # pickled handler + elif isinstance(handler, str): # pickled handler handler = init_instance_by_config(handler) label = handler.data_loader.fields["label"][0][0] horizon = guess_horizon(label) From 091dae6c60ef6dd3153ac896ebfde19fd3552f1b Mon Sep 17 00:00:00 2001 From: Di Chen Date: Fri, 12 May 2023 18:24:23 +0800 Subject: [PATCH 10/11] Update tests --- qlib/contrib/data/dataset.py | 2 -- qlib/utils/data.py | 4 ++-- tests/misc/test_utils.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index 798d1dff39..5535b6efe2 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -140,8 +140,6 @@ def __init__( handler = init_instance_by_config(handler) label = handler.data_loader.fields["label"][0][0] horizon = guess_horizon(label) - if horizon is None: - horizon = 0 assert num_states == 0 or horizon > 0, "please 
specify `horizon` to avoid data leakage" assert memory_mode in ["sample", "daily"], "unsupported memory mode" diff --git a/qlib/utils/data.py b/qlib/utils/data.py index 1ebd7ef68b..7b196c50ec 100644 --- a/qlib/utils/data.py +++ b/qlib/utils/data.py @@ -115,9 +115,9 @@ def guess_horizon(label): horizon_list = [int(x) for x in re.findall(regex, label)] if len(horizon_list) == 0: - return None + return 0 max_horizon = max(horizon_list) # Unlikely the label doesn't use future information if max_horizon < 2: - return None + return 0 return max_horizon + 1 diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index 03e782359c..5ea0fe0989 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -129,7 +129,7 @@ def test_guess_horizon(self): label = "Ref($close, -1) / Ref($close, -1) - 1" result = guess_horizon(label) - assert result is None + assert result is 0 if __name__ == "__main__": From adb3d045438e668d7a0fcdcecd978586067c0228 Mon Sep 17 00:00:00 2001 From: Di Chen Date: Thu, 18 May 2023 22:08:57 +0800 Subject: [PATCH 11/11] Remove redundant check --- qlib/contrib/data/dataset.py | 4 +--- tests/misc/test_utils.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index 5535b6efe2..ccc667ae91 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -134,9 +134,7 @@ def __init__( ): if horizon == 0: # Try to guess horizon - if isinstance(handler, dict): - handler = init_instance_by_config(handler) - elif isinstance(handler, str): # pickled handler + if isinstance(handler, (dict, str)): handler = init_instance_by_config(handler) label = handler.data_loader.fields["label"][0][0] horizon = guess_horizon(label) diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py index 5ea0fe0989..53108a75e3 100644 --- a/tests/misc/test_utils.py +++ b/tests/misc/test_utils.py @@ -129,7 +129,7 @@ def test_guess_horizon(self): label = "Ref($close, -1) / 
Ref($close, -1) - 1" result = guess_horizon(label) - assert result is 0 + assert result == 0 if __name__ == "__main__":