forked from microsoft/qlib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
807 lines (677 loc) · 26.9 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import re
import copy
import importlib
import time
import bisect
import pickle
import random
import requests
import functools
from pathlib import Path
from typing import Iterable, Tuple, List
import numpy as np
import pandas as pd
from loguru import logger
from yahooquery import Ticker
from tqdm import tqdm
from functools import partial
from concurrent.futures import ProcessPoolExecutor
from bs4 import BeautifulSoup
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"
CALENDAR_BENCH_URL_MAP = {
"CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
"CSI500": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
"CSI100": CALENDAR_URL_BASE.format(market=1, bench_code="000903"),
# NOTE: Use the time series of SH600000 as the sequence of all stocks
"ALL": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
# NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks
"US_ALL": "^GSPC",
"IN_ALL": "^NSEI",
"BR_ALL": "^BVSP",
}
_BENCH_CALENDAR_LIST = None
_ALL_CALENDAR_LIST = None
_HS_SYMBOLS = None
_US_SYMBOLS = None
_IN_SYMBOLS = None
_BR_SYMBOLS = None
_EN_FUND_SYMBOLS = None
_CALENDAR_MAP = {}
# NOTE: Until 2020-10-20 20:00:00
MINIMUM_SYMBOLS_NUM = 3900
def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
"""get SH/SZ history calendar list
Parameters
----------
bench_code: str
value from ["CSI300", "CSI500", "ALL", "US_ALL"]
Returns
-------
history calendar list
"""
logger.info(f"get calendar list: {bench_code}......")
def _get_calendar(url):
_value_list = requests.get(url, timeout=None).json()["data"]["klines"]
return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list))
calendar = _CALENDAR_MAP.get(bench_code, None)
if calendar is None:
if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"):
print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]))
print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max"))
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
else:
if bench_code.upper() == "ALL":
@deco_retry
def _get_calendar(month):
_cal = []
try:
resp = requests.get(
SZSE_CALENDAR_URL.format(month=month, random=random.random), timeout=None
).json()
for _r in resp["data"]:
if int(_r["jybz"]):
_cal.append(pd.Timestamp(_r["jyrq"]))
except Exception as e:
raise ValueError(f"{month}-->{e}") from e
return _cal
month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
calendar = []
for _m in month_range:
cal = _get_calendar(_m.strftime("%Y-%m"))
if cal:
calendar += cal
calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
else:
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
_CALENDAR_MAP[bench_code] = calendar
logger.info(f"end of get calendar list: {bench_code}.")
return calendar
def return_date_list(date_field_name: str, file_path: Path):
date_list = pd.read_csv(file_path, sep=",", index_col=0)[date_field_name].to_list()
return sorted([pd.Timestamp(x) for x in date_list])
def get_calendar_list_by_ratio(
source_dir: [str, Path],
date_field_name: str = "date",
threshold: float = 0.5,
minimum_count: int = 10,
max_workers: int = 16,
) -> list:
"""get calendar list by selecting the date when few funds trade in this day
Parameters
----------
source_dir: str or Path
The directory where the raw data collected from the Internet is saved
date_field_name: str
date field name, default is date
threshold: float
threshold to exclude some days when few funds trade in this day, default 0.5
minimum_count: int
minimum count of funds should trade in one day
max_workers: int
Concurrent number, default is 16
Returns
-------
history calendar list
"""
logger.info(f"get calendar list from {source_dir} by threshold = {threshold}......")
source_dir = Path(source_dir).expanduser()
file_list = list(source_dir.glob("*.csv"))
_number_all_funds = len(file_list)
logger.info(f"count how many funds trade in this day......")
_dict_count_trade = dict() # dict{date:count}
_fun = partial(return_date_list, date_field_name)
all_oldest_list = []
with tqdm(total=_number_all_funds) as p_bar:
with ProcessPoolExecutor(max_workers=max_workers) as executor:
for date_list in executor.map(_fun, file_list):
if date_list:
all_oldest_list.append(date_list[0])
for date in date_list:
if date not in _dict_count_trade:
_dict_count_trade[date] = 0
_dict_count_trade[date] += 1
p_bar.update()
logger.info(f"count how many funds have founded in this day......")
_dict_count_founding = {date: _number_all_funds for date in _dict_count_trade} # dict{date:count}
with tqdm(total=_number_all_funds) as p_bar:
for oldest_date in all_oldest_list:
for date in _dict_count_founding.keys():
if date < oldest_date:
_dict_count_founding[date] -= 1
calendar = [
date for date, count in _dict_count_trade.items() if count >= max(int(count * threshold), minimum_count)
]
return calendar
def get_hs_stock_symbols() -> list:
"""get SH/SZ stock symbols
Returns
-------
stock symbols
"""
global _HS_SYMBOLS # pylint: disable=W0603
def _get_symbol():
"""
Get the stock pool from a web page and process it into the format required by yahooquery.
Format of data retrieved from the web page: 600519, 000001
The data format required by yahooquery: 600519.ss, 000001.sz
Returns
-------
set: Returns the set of symbol codes.
Examples:
-------
{600000.ss, 600001.ss, 600002.ss, 600003.ss, ...}
"""
url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12"
try:
resp = requests.get(url, timeout=None)
resp.raise_for_status()
except requests.exceptions.HTTPError as e:
raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e
try:
_symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]]
except Exception as e:
logger.warning("An error occurred while extracting data from the response.")
raise
if len(_symbols) < 3900:
raise ValueError("The complete list of stocks is not available.")
# Add suffix after the stock code to conform to yahooquery standard, otherwise the data will not be fetched.
_symbols = [
_symbol + ".ss" if _symbol.startswith("6") else _symbol + ".sz" if _symbol.startswith(("0", "3")) else None
for _symbol in _symbols
]
_symbols = [_symbol for _symbol in _symbols if _symbol is not None]
return set(_symbols)
if _HS_SYMBOLS is None:
symbols = set()
_retry = 60
# It may take multiple times to get the complete
while len(symbols) < MINIMUM_SYMBOLS_NUM:
symbols |= _get_symbol()
time.sleep(3)
symbol_cache_path = Path("~/.cache/hs_symbols_cache.pkl").expanduser().resolve()
symbol_cache_path.parent.mkdir(parents=True, exist_ok=True)
if symbol_cache_path.exists():
with symbol_cache_path.open("rb") as fp:
cache_symbols = pickle.load(fp)
symbols |= cache_symbols
with symbol_cache_path.open("wb") as fp:
pickle.dump(symbols, fp)
_HS_SYMBOLS = sorted(list(symbols))
return _HS_SYMBOLS
def get_us_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
"""get US stock symbols
Returns
-------
stock symbols
"""
global _US_SYMBOLS # pylint: disable=W0603
@deco_retry
def _get_eastmoney():
url = "http://4.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&fs=m:105,m:106,m:107&fields=f12"
resp = requests.get(url, timeout=None)
if resp.status_code != 200:
raise ValueError("request error")
try:
_symbols = [_v["f12"].replace("_", "-P") for _v in resp.json()["data"]["diff"].values()]
except Exception as e:
logger.warning(f"request error: {e}")
raise
if len(_symbols) < 8000:
raise ValueError("request error")
return _symbols
@deco_retry
def _get_nasdaq():
_res_symbols = []
for _name in ["otherlisted", "nasdaqtraded"]:
url = f"ftp://ftp.nasdaqtrader.com/SymbolDirectory/{_name}.txt"
df = pd.read_csv(url, sep="|")
df = df.rename(columns={"ACT Symbol": "Symbol"})
_symbols = df["Symbol"].dropna()
_symbols = _symbols.str.replace("$", "-P", regex=False)
_symbols = _symbols.str.replace(".W", "-WT", regex=False)
_symbols = _symbols.str.replace(".U", "-UN", regex=False)
_symbols = _symbols.str.replace(".R", "-RI", regex=False)
_symbols = _symbols.str.replace(".", "-", regex=False)
_res_symbols += _symbols.unique().tolist()
return _res_symbols
@deco_retry
def _get_nyse():
url = "https://www.nyse.com/api/quotes/filter"
_parms = {
"instrumentType": "EQUITY",
"pageNumber": 1,
"sortColumn": "NORMALIZED_TICKER",
"sortOrder": "ASC",
"maxResultsPerPage": 10000,
"filterToken": "",
}
resp = requests.post(url, json=_parms, timeout=None)
if resp.status_code != 200:
raise ValueError("request error")
try:
_symbols = [_v["symbolTicker"].replace("-", "-P") for _v in resp.json()]
except Exception as e:
logger.warning(f"request error: {e}")
_symbols = []
return _symbols
if _US_SYMBOLS is None:
_all_symbols = _get_eastmoney() + _get_nasdaq() + _get_nyse()
if qlib_data_path is not None:
for _index in ["nasdaq100", "sp500"]:
ins_df = pd.read_csv(
Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
sep="\t",
names=["symbol", "start_date", "end_date"],
)
_all_symbols += ins_df["symbol"].unique().tolist()
def _format(s_):
s_ = s_.replace(".", "-")
s_ = s_.strip("$")
s_ = s_.strip("*")
return s_
_US_SYMBOLS = sorted(set(map(_format, filter(lambda x: len(x) < 8 and not x.endswith("WS"), _all_symbols))))
return _US_SYMBOLS
def get_in_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
"""get IN stock symbols
Returns
-------
stock symbols
"""
global _IN_SYMBOLS # pylint: disable=W0603
@deco_retry
def _get_nifty():
url = f"https://www1.nseindia.com/content/equities/EQUITY_L.csv"
df = pd.read_csv(url)
df = df.rename(columns={"SYMBOL": "Symbol"})
df["Symbol"] = df["Symbol"] + ".NS"
_symbols = df["Symbol"].dropna()
_symbols = _symbols.unique().tolist()
return _symbols
if _IN_SYMBOLS is None:
_all_symbols = _get_nifty()
if qlib_data_path is not None:
for _index in ["nifty"]:
ins_df = pd.read_csv(
Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
sep="\t",
names=["symbol", "start_date", "end_date"],
)
_all_symbols += ins_df["symbol"].unique().tolist()
def _format(s_):
s_ = s_.replace(".", "-")
s_ = s_.strip("$")
s_ = s_.strip("*")
return s_
_IN_SYMBOLS = sorted(set(_all_symbols))
return _IN_SYMBOLS
def get_br_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
"""get Brazil(B3) stock symbols
Returns
-------
B3 stock symbols
"""
global _BR_SYMBOLS # pylint: disable=W0603
@deco_retry
def _get_ibovespa():
_symbols = []
url = "https://www.fundamentus.com.br/detalhes.php?papel="
# Request
agent = {"User-Agent": "Mozilla/5.0"}
page = requests.get(url, headers=agent, timeout=None)
# BeautifulSoup
soup = BeautifulSoup(page.content, "html.parser")
tbody = soup.find("tbody")
children = tbody.findChildren("a", recursive=True)
for child in children:
_symbols.append(str(child).rsplit('"', maxsplit=1)[-1].split(">")[1].split("<")[0])
return _symbols
if _BR_SYMBOLS is None:
_all_symbols = _get_ibovespa()
if qlib_data_path is not None:
for _index in ["ibov"]:
ins_df = pd.read_csv(
Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
sep="\t",
names=["symbol", "start_date", "end_date"],
)
_all_symbols += ins_df["symbol"].unique().tolist()
def _format(s_):
s_ = s_.strip()
s_ = s_.strip("$")
s_ = s_.strip("*")
s_ = s_ + ".SA"
return s_
_BR_SYMBOLS = sorted(set(map(_format, _all_symbols)))
return _BR_SYMBOLS
def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list:
"""get en fund symbols
Returns
-------
fund symbols in China
"""
global _EN_FUND_SYMBOLS # pylint: disable=W0603
@deco_retry
def _get_eastmoney():
url = "http://fund.eastmoney.com/js/fundcode_search.js"
resp = requests.get(url, timeout=None)
if resp.status_code != 200:
raise ValueError("request error")
try:
_symbols = []
for sub_data in re.findall(r"[\[](.*?)[\]]", resp.content.decode().split("= [")[-1].replace("];", "")):
data = sub_data.replace('"', "").replace("'", "")
# TODO: do we need other information, like fund_name from ['000001', 'HXCZHH', '华夏成长混合', '混合型', 'HUAXIACHENGZHANGHUNHE']
_symbols.append(data.split(",")[0])
except Exception as e:
logger.warning(f"request error: {e}")
raise
if len(_symbols) < 8000:
raise ValueError("request error")
return _symbols
if _EN_FUND_SYMBOLS is None:
_all_symbols = _get_eastmoney()
_EN_FUND_SYMBOLS = sorted(set(_all_symbols))
return _EN_FUND_SYMBOLS
def symbol_suffix_to_prefix(symbol: str, capital: bool = True) -> str:
"""symbol suffix to prefix
Parameters
----------
symbol: str
symbol
capital : bool
by default True
Returns
-------
"""
code, exchange = symbol.split(".")
if exchange.lower() in ["sh", "ss"]:
res = f"sh{code}"
else:
res = f"{exchange}{code}"
return res.upper() if capital else res.lower()
def symbol_prefix_to_sufix(symbol: str, capital: bool = True) -> str:
"""symbol prefix to sufix
Parameters
----------
symbol: str
symbol
capital : bool
by default True
Returns
-------
"""
res = f"{symbol[:-2]}.{symbol[-2:]}"
return res.upper() if capital else res.lower()
def deco_retry(retry: int = 5, retry_sleep: int = 3):
def deco_func(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
_retry = 5 if callable(retry) else retry
_result = None
for _i in range(1, _retry + 1):
try:
_result = func(*args, **kwargs)
break
except Exception as e:
logger.warning(f"{func.__name__}: {_i} :{e}")
if _i == _retry:
raise
time.sleep(retry_sleep)
return _result
return wrapper
return deco_func(retry) if callable(retry) else deco_func
def get_trading_date_by_shift(trading_list: list, trading_date: pd.Timestamp, shift: int = 1):
"""get trading date by shift
Parameters
----------
trading_list: list
trading calendar list
shift : int
shift, default is 1
trading_date : pd.Timestamp
trading date
Returns
-------
"""
trading_date = pd.Timestamp(trading_date)
left_index = bisect.bisect_left(trading_list, trading_date)
try:
res = trading_list[left_index + shift]
except IndexError:
res = trading_date
return res
def generate_minutes_calendar_from_daily(
calendars: Iterable,
freq: str = "1min",
am_range: Tuple[str, str] = ("09:30:00", "11:29:00"),
pm_range: Tuple[str, str] = ("13:00:00", "14:59:00"),
) -> pd.Index:
"""generate minutes calendar
Parameters
----------
calendars: Iterable
daily calendar
freq: str
by default 1min
am_range: Tuple[str, str]
AM Time Range, by default China-Stock: ("09:30:00", "11:29:00")
pm_range: Tuple[str, str]
PM Time Range, by default China-Stock: ("13:00:00", "14:59:00")
"""
daily_format: str = "%Y-%m-%d"
res = []
for _day in calendars:
for _range in [am_range, pm_range]:
res.append(
pd.date_range(
f"{pd.Timestamp(_day).strftime(daily_format)} {_range[0]}",
f"{pd.Timestamp(_day).strftime(daily_format)} {_range[1]}",
freq=freq,
)
)
return pd.Index(sorted(set(np.hstack(res))))
def get_instruments(
qlib_dir: str,
index_name: str,
method: str = "parse_instruments",
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
market_index: str = "cn_index",
):
"""
Parameters
----------
qlib_dir: str
qlib data dir, default "Path(__file__).parent/qlib_data"
index_name: str
index name, value from ["csi100", "csi300"]
method: str
method, value from ["parse_instruments", "save_new_companies"]
freq: str
freq, value from ["day", "1min"]
request_retry: int
request retry, by default 5
retry_sleep: int
request sleep, by default 3
market_index: str
Where the files to obtain the index are located,
for example data_collector.cn_index.collector
Examples
-------
# parse instruments
$ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
# parse new companies
$ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
"""
_cur_module = importlib.import_module("data_collector.{}.collector".format(market_index))
obj = getattr(_cur_module, f"{index_name.upper()}Index")(
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)
getattr(obj, method)()
def _get_all_1d_data(_date_field_name: str, _symbol_field_name: str, _1d_data_all: pd.DataFrame):
df = copy.deepcopy(_1d_data_all)
df.reset_index(inplace=True)
df.rename(columns={"datetime": _date_field_name, "instrument": _symbol_field_name}, inplace=True)
df.columns = list(map(lambda x: x[1:] if x.startswith("$") else x, df.columns))
return df
def get_1d_data(
_date_field_name: str,
_symbol_field_name: str,
symbol: str,
start: str,
end: str,
_1d_data_all: pd.DataFrame,
) -> pd.DataFrame:
"""get 1d data
Returns
------
data_1d: pd.DataFrame
data_1d.columns = [_date_field_name, _symbol_field_name, "paused", "volume", "factor", "close"]
"""
_all_1d_data = _get_all_1d_data(_date_field_name, _symbol_field_name, _1d_data_all)
return _all_1d_data[
(_all_1d_data[_symbol_field_name] == symbol.upper())
& (_all_1d_data[_date_field_name] >= pd.Timestamp(start))
& (_all_1d_data[_date_field_name] < pd.Timestamp(end))
]
def calc_adjusted_price(
df: pd.DataFrame,
_1d_data_all: pd.DataFrame,
_date_field_name: str,
_symbol_field_name: str,
frequence: str,
consistent_1d: bool = True,
calc_paused: bool = True,
) -> pd.DataFrame:
"""calc adjusted price
This method does 4 things.
1. Adds the `paused` field.
- The added paused field comes from the paused field of the 1d data.
2. Aligns the time of the 1d data.
3. The data is reweighted.
- The reweighting method:
- volume / factor
- open * factor
- high * factor
- low * factor
- close * factor
4. Called `calc_paused_num` method to add the `paused_num` field.
- The `paused_num` is the number of consecutive days of trading suspension.
"""
# TODO: using daily data factor
if df.empty:
return df
df = df.copy()
df.drop_duplicates(subset=_date_field_name, inplace=True)
df.sort_values(_date_field_name, inplace=True)
symbol = df.iloc[0][_symbol_field_name]
df[_date_field_name] = pd.to_datetime(df[_date_field_name])
# get 1d data from qlib
_start = pd.Timestamp(df[_date_field_name].min()).strftime("%Y-%m-%d")
_end = (pd.Timestamp(df[_date_field_name].max()) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
data_1d: pd.DataFrame = get_1d_data(_date_field_name, _symbol_field_name, symbol, _start, _end, _1d_data_all)
data_1d = data_1d.copy()
if data_1d is None or data_1d.empty:
df["factor"] = 1 / df.loc[df["close"].first_valid_index()]["close"]
# TODO: np.nan or 1 or 0
df["paused"] = np.nan
else:
# NOTE: volume is np.nan or volume <= 0, paused = 1
# FIXME: find a more accurate data source
data_1d["paused"] = 0
data_1d.loc[(data_1d["volume"].isna()) | (data_1d["volume"] <= 0), "paused"] = 1
data_1d = data_1d.set_index(_date_field_name)
# add factor from 1d data
# NOTE: 1d data info:
# - Close price adjusted for splits. Adjusted close price adjusted for both dividends and splits.
# - data_1d.adjclose: Adjusted close price adjusted for both dividends and splits.
# - data_1d.close: `data_1d.adjclose / (close for the first trading day that is not np.nan)`
def _calc_factor(df_1d: pd.DataFrame):
try:
_date = pd.Timestamp(pd.Timestamp(df_1d[_date_field_name].iloc[0]).date())
df_1d["factor"] = data_1d.loc[_date]["close"] / df_1d.loc[df_1d["close"].last_valid_index()]["close"]
df_1d["paused"] = data_1d.loc[_date]["paused"]
except Exception:
df_1d["factor"] = np.nan
df_1d["paused"] = np.nan
return df_1d
df = df.groupby([df[_date_field_name].dt.date], group_keys=False).apply(_calc_factor)
if consistent_1d:
# the date sequence is consistent with 1d
df.set_index(_date_field_name, inplace=True)
df = df.reindex(
generate_minutes_calendar_from_daily(
calendars=pd.to_datetime(data_1d.reset_index()[_date_field_name].drop_duplicates()),
freq=frequence,
am_range=("09:30:00", "11:29:00"),
pm_range=("13:00:00", "14:59:00"),
)
)
df[_symbol_field_name] = df.loc[df[_symbol_field_name].first_valid_index()][_symbol_field_name]
df.index.names = [_date_field_name]
df.reset_index(inplace=True)
for _col in ["open", "close", "high", "low", "volume"]:
if _col not in df.columns:
continue
if _col == "volume":
df[_col] = df[_col] / df["factor"]
else:
df[_col] = df[_col] * df["factor"]
if calc_paused:
df = calc_paused_num(df, _date_field_name, _symbol_field_name)
return df
def calc_paused_num(df: pd.DataFrame, _date_field_name, _symbol_field_name):
"""calc paused num
This method adds the paused_num field
- The `paused_num` is the number of consecutive days of trading suspension.
"""
_symbol = df.iloc[0][_symbol_field_name]
df = df.copy()
df["_tmp_date"] = df[_date_field_name].apply(lambda x: pd.Timestamp(x).date())
# remove data that starts and ends with `np.nan` all day
all_data = []
# Record the number of consecutive trading days where the whole day is nan, to remove the last trading day where the whole day is nan
all_nan_nums = 0
# Record the number of consecutive occurrences of trading days that are not nan throughout the day
not_nan_nums = 0
for _date, _df in df.groupby("_tmp_date"):
_df["paused"] = 0
if not _df.loc[_df["volume"] < 0].empty:
logger.warning(f"volume < 0, will fill np.nan: {_date} {_symbol}")
_df.loc[_df["volume"] < 0, "volume"] = np.nan
check_fields = set(_df.columns) - {
"_tmp_date",
"paused",
"factor",
_date_field_name,
_symbol_field_name,
}
if _df.loc[:, list(check_fields)].isna().values.all() or (_df["volume"] == 0).all():
all_nan_nums += 1
not_nan_nums = 0
_df["paused"] = 1
if all_data:
_df["paused_num"] = not_nan_nums
all_data.append(_df)
else:
all_nan_nums = 0
not_nan_nums += 1
_df["paused_num"] = not_nan_nums
all_data.append(_df)
all_data = all_data[: len(all_data) - all_nan_nums]
if all_data:
df = pd.concat(all_data, sort=False)
else:
logger.warning(f"data is empty: {_symbol}")
df = pd.DataFrame()
return df
del df["_tmp_date"]
return df
if __name__ == "__main__":
assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM