diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index ce052f5506..7c63e5a639 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from qlib.contrib.data.loader import Alpha158DL, Alpha360DL from ...data.dataset.handler import DataHandlerLP from ...data.dataset.processor import Processor from ...utils import get_callable_kwargs @@ -66,7 +67,7 @@ def __init__( "class": "QlibDataLoader", "kwargs": { "config": { - "feature": self.get_feature_config(), + "feature": Alpha360DL.get_feature_config(), "label": kwargs.pop("label", self.get_label_config()), }, "filter_pipe": filter_pipe, @@ -88,51 +89,6 @@ def __init__( def get_label_config(self): return ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"] - @staticmethod - def get_feature_config(): - # NOTE: - # Alpha360 tries to provide a dataset with original price data - # the original price data includes the prices and volume in the last 60 days. - # To make it easier to learn models from this dataset, all the prices and volume - # are normalized by the latest price and volume data ( dividing by $close, $volume) - # So the latest normalized $close will be 1 (with name CLOSE0), the latest normalized $volume will be 1 (with name VOLUME0) - # If further normalization are executed (e.g. centralization), CLOSE0 and VOLUME0 will be 0. 
- fields = [] - names = [] - - for i in range(59, 0, -1): - fields += ["Ref($close, %d)/$close" % i] - names += ["CLOSE%d" % i] - fields += ["$close/$close"] - names += ["CLOSE0"] - for i in range(59, 0, -1): - fields += ["Ref($open, %d)/$close" % i] - names += ["OPEN%d" % i] - fields += ["$open/$close"] - names += ["OPEN0"] - for i in range(59, 0, -1): - fields += ["Ref($high, %d)/$close" % i] - names += ["HIGH%d" % i] - fields += ["$high/$close"] - names += ["HIGH0"] - for i in range(59, 0, -1): - fields += ["Ref($low, %d)/$close" % i] - names += ["LOW%d" % i] - fields += ["$low/$close"] - names += ["LOW0"] - for i in range(59, 0, -1): - fields += ["Ref($vwap, %d)/$close" % i] - names += ["VWAP%d" % i] - fields += ["$vwap/$close"] - names += ["VWAP0"] - for i in range(59, 0, -1): - fields += ["Ref($volume, %d)/($volume+1e-12)" % i] - names += ["VOLUME%d" % i] - fields += ["$volume/($volume+1e-12)"] - names += ["VOLUME0"] - - return fields, names - class Alpha360vwap(Alpha360): def get_label_config(self): @@ -190,242 +146,11 @@ def get_feature_config(self): }, "rolling": {}, } - return self.parse_config_to_fields(conf) + return Alpha158DL.get_feature_config(conf) def get_label_config(self): return ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"] - @staticmethod - def parse_config_to_fields(config): - """create factors from config - - config = { - 'kbar': {}, # whether to use some hard-code kbar features - 'price': { # whether to use raw price features - 'windows': [0, 1, 2, 3, 4], # use price at n days ago - 'feature': ['OPEN', 'HIGH', 'LOW'] # which price field to use - }, - 'volume': { # whether to use raw volume features - 'windows': [0, 1, 2, 3, 4], # use volume at n days ago - }, - 'rolling': { # whether to use rolling operator based features - 'windows': [5, 10, 20, 30, 60], # rolling windows size - 'include': ['ROC', 'MA', 'STD'], # rolling operator to use - #if include is None we will use default operators - 'exclude': ['RANK'], # rolling operator not 
to use - } - } - """ - fields = [] - names = [] - if "kbar" in config: - fields += [ - "($close-$open)/$open", - "($high-$low)/$open", - "($close-$open)/($high-$low+1e-12)", - "($high-Greater($open, $close))/$open", - "($high-Greater($open, $close))/($high-$low+1e-12)", - "(Less($open, $close)-$low)/$open", - "(Less($open, $close)-$low)/($high-$low+1e-12)", - "(2*$close-$high-$low)/$open", - "(2*$close-$high-$low)/($high-$low+1e-12)", - ] - names += [ - "KMID", - "KLEN", - "KMID2", - "KUP", - "KUP2", - "KLOW", - "KLOW2", - "KSFT", - "KSFT2", - ] - if "price" in config: - windows = config["price"].get("windows", range(5)) - feature = config["price"].get("feature", ["OPEN", "HIGH", "LOW", "CLOSE", "VWAP"]) - for field in feature: - field = field.lower() - fields += ["Ref($%s, %d)/$close" % (field, d) if d != 0 else "$%s/$close" % field for d in windows] - names += [field.upper() + str(d) for d in windows] - if "volume" in config: - windows = config["volume"].get("windows", range(5)) - fields += ["Ref($volume, %d)/($volume+1e-12)" % d if d != 0 else "$volume/($volume+1e-12)" for d in windows] - names += ["VOLUME" + str(d) for d in windows] - if "rolling" in config: - windows = config["rolling"].get("windows", [5, 10, 20, 30, 60]) - include = config["rolling"].get("include", None) - exclude = config["rolling"].get("exclude", []) - # `exclude` in dataset config unnecessary filed - # `include` in dataset config necessary field - - def use(x): - return x not in exclude and (include is None or x in include) - - # Some factor ref: https://guorn.com/static/upload/file/3/134065454575605.pdf - if use("ROC"): - # https://www.investopedia.com/terms/r/rateofchange.asp - # Rate of change, the price change in the past d days, divided by latest close price to remove unit - fields += ["Ref($close, %d)/$close" % d for d in windows] - names += ["ROC%d" % d for d in windows] - if use("MA"): - # 
https://www.investopedia.com/ask/answers/071414/whats-difference-between-moving-average-and-weighted-moving-average.asp - # Simple Moving Average, the simple moving average in the past d days, divided by latest close price to remove unit - fields += ["Mean($close, %d)/$close" % d for d in windows] - names += ["MA%d" % d for d in windows] - if use("STD"): - # The standard diviation of close price for the past d days, divided by latest close price to remove unit - fields += ["Std($close, %d)/$close" % d for d in windows] - names += ["STD%d" % d for d in windows] - if use("BETA"): - # The rate of close price change in the past d days, divided by latest close price to remove unit - # For example, price increase 10 dollar per day in the past d days, then Slope will be 10. - fields += ["Slope($close, %d)/$close" % d for d in windows] - names += ["BETA%d" % d for d in windows] - if use("RSQR"): - # The R-sqaure value of linear regression for the past d days, represent the trend linear - fields += ["Rsquare($close, %d)" % d for d in windows] - names += ["RSQR%d" % d for d in windows] - if use("RESI"): - # The redisdual for linear regression for the past d days, represent the trend linearity for past d days. 
- fields += ["Resi($close, %d)/$close" % d for d in windows] - names += ["RESI%d" % d for d in windows] - if use("MAX"): - # The max price for past d days, divided by latest close price to remove unit - fields += ["Max($high, %d)/$close" % d for d in windows] - names += ["MAX%d" % d for d in windows] - if use("LOW"): - # The low price for past d days, divided by latest close price to remove unit - fields += ["Min($low, %d)/$close" % d for d in windows] - names += ["MIN%d" % d for d in windows] - if use("QTLU"): - # The 80% quantile of past d day's close price, divided by latest close price to remove unit - # Used with MIN and MAX - fields += ["Quantile($close, %d, 0.8)/$close" % d for d in windows] - names += ["QTLU%d" % d for d in windows] - if use("QTLD"): - # The 20% quantile of past d day's close price, divided by latest close price to remove unit - fields += ["Quantile($close, %d, 0.2)/$close" % d for d in windows] - names += ["QTLD%d" % d for d in windows] - if use("RANK"): - # Get the percentile of current close price in past d day's close price. - # Represent the current price level comparing to past N days, add additional information to moving average. - fields += ["Rank($close, %d)" % d for d in windows] - names += ["RANK%d" % d for d in windows] - if use("RSV"): - # Represent the price position between upper and lower resistent price for past d days. - fields += ["($close-Min($low, %d))/(Max($high, %d)-Min($low, %d)+1e-12)" % (d, d, d) for d in windows] - names += ["RSV%d" % d for d in windows] - if use("IMAX"): - # The number of days between current date and previous highest price date. - # Part of Aroon Indicator https://www.investopedia.com/terms/a/aroon.asp - # The indicator measures the time between highs and the time between lows over a time period. - # The idea is that strong uptrends will regularly see new highs, and strong downtrends will regularly see new lows. 
- fields += ["IdxMax($high, %d)/%d" % (d, d) for d in windows] - names += ["IMAX%d" % d for d in windows] - if use("IMIN"): - # The number of days between current date and previous lowest price date. - # Part of Aroon Indicator https://www.investopedia.com/terms/a/aroon.asp - # The indicator measures the time between highs and the time between lows over a time period. - # The idea is that strong uptrends will regularly see new highs, and strong downtrends will regularly see new lows. - fields += ["IdxMin($low, %d)/%d" % (d, d) for d in windows] - names += ["IMIN%d" % d for d in windows] - if use("IMXD"): - # The time period between previous lowest-price date occur after highest price date. - # Large value suggest downward momemtum. - fields += ["(IdxMax($high, %d)-IdxMin($low, %d))/%d" % (d, d, d) for d in windows] - names += ["IMXD%d" % d for d in windows] - if use("CORR"): - # The correlation between absolute close price and log scaled trading volume - fields += ["Corr($close, Log($volume+1), %d)" % d for d in windows] - names += ["CORR%d" % d for d in windows] - if use("CORD"): - # The correlation between price change ratio and volume change ratio - fields += ["Corr($close/Ref($close,1), Log($volume/Ref($volume, 1)+1), %d)" % d for d in windows] - names += ["CORD%d" % d for d in windows] - if use("CNTP"): - # The percentage of days in past d days that price go up. - fields += ["Mean($close>Ref($close, 1), %d)" % d for d in windows] - names += ["CNTP%d" % d for d in windows] - if use("CNTN"): - # The percentage of days in past d days that price go down. - fields += ["Mean($closeRef($close, 1), %d)-Mean($closeRef($close, 1), %d)" % d for d in windows] + names += ["CNTP%d" % d for d in windows] + if use("CNTN"): + # The percentage of days in past d days that price go down. 
+ fields += ["Mean($closeRef($close, 1), %d)-Mean($close dict: def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame: self._maybe_load_raw_data() + + # 1) Filter by instruments if instruments is None: df = self._data else: df = self._data.loc(axis=0)[:, instruments] + + # 2) Filter by Datetime if start_time is None and end_time is None: return df # NOTE: avoid copy by loc # pd.Timestamp(None) == NaT, use NaT as index can not fetch correct thing, so do not change None. @@ -275,6 +279,55 @@ def _maybe_load_raw_data(self): self._data = self._config +class NestedDataLoader(DataLoader): + """ + When there are multiple DataLoaders, this class can be used to combine them. + """ + + def __init__(self, dataloader_l: List[Dict], join="left") -> None: + """ + + Parameters + ---------- + dataloader_l : list[dict] + A list of dataloader, for example + + .. code-block:: python + + nd = NestedDataLoader( + dataloader_l=[ + { + "class": "qlib.contrib.data.loader.Alpha158DL", + }, { + "class": "qlib.contrib.data.loader.Alpha360DL", + "kwargs": { + "config": { + "label": ( ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + } + } + } + ] + ) + join : + it will be passed to pd.merge (as `how`) when merging the loaded data. 
+ """ + super().__init__() + self.data_loader_l = [ + (dl if isinstance(dl, DataLoader) else init_instance_by_config(dl)) for dl in dataloader_l + ] + self.join = join + + def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame: + df_full = None + for dl in self.data_loader_l: + df_current = dl.load(instruments, start_time, end_time) + if df_full is None: + df_full = df_current + else: + df_full = pd.merge(df_full, df_current, left_index=True, right_index=True, how=self.join) + return df_full.sort_index(axis=1) + + class DataLoaderDH(DataLoader): """DataLoaderDH DataLoader based on (D)ata (H)andler diff --git a/tests/data_mid_layer_tests/test_dataloader.py b/tests/data_mid_layer_tests/test_dataloader.py new file mode 100644 index 0000000000..e3cb741bb7 --- /dev/null +++ b/tests/data_mid_layer_tests/test_dataloader.py @@ -0,0 +1,50 @@ +# TODO: +# dump alpha 360 to dataframe and merge it with Alpha158 + +import sys +import unittest +import qlib +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent)) +from qlib.data.dataset.loader import NestedDataLoader +from qlib.contrib.data.loader import Alpha158DL, Alpha360DL + + +class TestDataLoader(unittest.TestCase): + + def test_nested_data_loader(self): + qlib.init() + nd = NestedDataLoader( + dataloader_l=[ + { + "class": "qlib.contrib.data.loader.Alpha158DL", + }, + { + "class": "qlib.contrib.data.loader.Alpha360DL", + "kwargs": {"config": {"label": (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"])}}, + }, + ] + ) + # Of course you can use StaticDataLoader + + dataset = nd.load() + + assert dataset is not None + + columns = dataset.columns.tolist() + columns_list = [tup[1] for tup in columns] + + for col in Alpha158DL.get_feature_config()[1]: + assert col in columns_list + + for col in Alpha360DL.get_feature_config()[1]: + assert col in columns_list + + assert "LABEL0" in columns_list + + # Then you can use it with DataHandler; + + +if __name__ == "__main__": + 
unittest.main()