From 0b5a17948b95e585a2c87b43b2232af22c38b86d Mon Sep 17 00:00:00 2001 From: Marcus Read Date: Tue, 22 Nov 2022 23:53:58 +0000 Subject: [PATCH 1/3] Fix `Ticker.history` bugs Fixes: - #127 BUG: inaccurate evaluation of 'day' corresponding with close price. - #128 BUG: inaccurate intraday price indexing when exchanges observe DST. Also, improves performance by reducing DataFrame manipulations. --- yahooquery/ticker.py | 4 +- yahooquery/utils/__init__.py | 126 ++++++++++++++++++++++------------- 2 files changed, 80 insertions(+), 50 deletions(-) diff --git a/yahooquery/ticker.py b/yahooquery/ticker.py index 44c959c..8427b0b 100644 --- a/yahooquery/ticker.py +++ b/yahooquery/ticker.py @@ -1279,7 +1279,6 @@ def history( df = self._historical_data_to_dataframe(data, params, adj_timezone) if adj_ohlc and "adjclose" in df: df = self._adjust_ohlc(df) - df = df[~df.index.duplicated(keep='first')] return df def _history_1m(self, adj_timezone=True, adj_ohlc=False): @@ -1305,7 +1304,8 @@ def _historical_data_to_dataframe(self, data, params, adj_timezone): d = {} for symbol in self._symbols: if "timestamp" in data[symbol]: - d[symbol] = _history_dataframe(data, symbol, params, adj_timezone) + daily = params["interval"][-1] == "d" + d[symbol] = _history_dataframe(data[symbol], daily, adj_timezone) else: d[symbol] = data[symbol] d = {k: v for k, v in d.items() if isinstance(v, pd.DataFrame)} diff --git a/yahooquery/utils/__init__.py b/yahooquery/utils/__init__.py index 33354cb..31eff44 100644 --- a/yahooquery/utils/__init__.py +++ b/yahooquery/utils/__init__.py @@ -1,7 +1,7 @@ +import datetime import random import re import time -from datetime import datetime import pandas as pd from requests import Session @@ -112,57 +112,87 @@ def _convert_to_list(symbols, comma_split=False): def _convert_to_timestamp(date=None, start=True): if date is None: date = int((-858880800 * start) + (time.time() * (not start))) - elif isinstance(date, datetime): + elif isinstance(date, 
datetime.datetime): date = int(time.mktime(date.timetuple())) else: date = int(time.mktime(time.strptime(str(date), "%Y-%m-%d"))) return date -def _history_dataframe(data, symbol, params, adj_timezone=True): - df = pd.DataFrame(data[symbol]["indicators"]["quote"][0]) - if data[symbol]["indicators"].get("adjclose"): - df["adjclose"] = data[symbol]["indicators"]["adjclose"][0]["adjclose"] - df.index = pd.to_datetime(data[symbol]["timestamp"], unit="s") + pd.Timedelta( - (data[symbol]["meta"]["gmtoffset"] * adj_timezone), "s" - ) - if params["interval"][-1] not in ["m", "h"]: - df.index = df.index.date - df.dropna(inplace=True) - if data[symbol].get("events"): - df = pd.merge( - df, - _events_to_dataframe(data, symbol, params, adj_timezone), - how="outer", - left_index=True, - right_index=True, - ) - return df - +def _get_daily_index(data, index_utc, adj_timezone): + # evalute if last indice represents a live interval + last_trade_secs = data["meta"]["regularMarketTime"] * 10**9 + last_trade = pd.Timestamp(last_trade_secs, tz="UTC") + has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S") + if has_live_indice: + # remove it + live_indice = index_utc[-1] + index_utc = index_utc[:-1] + # evaluate if it should be put back later. If the close price for + # the day is already included in the data, i.e. if the live indice + # is simply duplicating data represented in the prior row, then the + # following will evaluate to False. + keep_live_indice = live_indice > index_utc[-1] + datetime.timedelta(1) + + tz = data["meta"]["exchangeTimezoneName"] + index_local = index_utc.tz_convert(tz) + times = index_local.time + + bv = times <= datetime.time(14) + if (bv).all(): + index = index_local.floor("d") + elif (~bv).all(): + index = index_local.ceil("d") + else: + # mix of open times pre and post 14:00. 
+ index1 = index_local[bv].floor("d") + index2 = index_local[~bv].ceil("d") + index = index1.union(index2) + + index = pd.Index(index.date) + if has_live_indice and keep_live_indice: + live_indice = live_indice.astimezone(tz) if adj_timezone else live_indice + # do not keep tz info + live_indice = live_indice.tz_localize(None).to_pydatetime() + index = index.insert(len(index), live_indice) + return index + + +def _event_as_srs(event_data, event): + index = pd.Index([int(v) for v in event_data.keys()], dtype="int64") + if event == "dividends": + values = [d["amount"] for d in event_data.values()] + else: + values = [d["numerator"] / d["denominator"] for d in event_data.values()] + return pd.Series(values, index=index) + + +def _history_dataframe(data, daily, adj_timezone=True): + data_dict = data["indicators"]["quote"][0].copy() + if "adjclose" in data["indicators"]: + data_dict["adjclose"] = data["indicators"]["adjclose"][0]["adjclose"] + + if 'events' in data: + for event, event_data in data["events"].items(): + if event not in ("dividends", "splits"): + continue + data_dict[event] = _event_as_srs(event_data, event) + + df = pd.DataFrame(data_dict, index=data["timestamp"]) # align all on timestamps + df.dropna(how="all", inplace=True) + + index_utc = pd.to_datetime(df.index, unit="s", utc=True) + if daily: + index = _get_daily_index(data, index_utc, adj_timezone) + if len(index) == len(df) - 1: + # a live_indice was removed + df = df.iloc[:-1] + elif adj_timezone: + tz = data["meta"]["exchangeTimezoneName"] + # localize and remove tz info + index = index_utc.tz_convert(tz).tz_localize(None) + else: + index = index_utc.tz_localize(None) # remove UTC tz info -def _events_to_dataframe(data, symbol, params, adj_timezone): - dataframes = [] - for event in ["dividends", "splits"]: - try: - df = pd.DataFrame(data[symbol]["events"][event].values()) - df.set_index("date", inplace=True) - df.index = pd.to_datetime(df.index, unit="s") + pd.Timedelta( - 
(data[symbol]["meta"]["gmtoffset"] * adj_timezone), "s" - ) - if params["interval"][-1] not in ["m", "h"]: - df.index = df.index.date - if event == "dividends": - df.rename(columns={"amount": "dividends"}, inplace=True) - else: - df["splits"] = df["numerator"] / df["denominator"] - df = df[["splits"]] - dataframes.append(df) - except KeyError: - pass - return ( - pd.merge( - dataframes[0], dataframes[1], how="outer", left_index=True, right_index=True - ) - if len(dataframes) > 1 - else dataframes[0] - ) + df.index = index + return df From 74530c0302fb1a019fd1a49e20d12f9dde0b8c9b Mon Sep 17 00:00:00 2001 From: Marcus Read Date: Thu, 24 Nov 2022 22:28:33 +0000 Subject: [PATCH 2/3] Add timezone info to `.history` return Adds timezone info to: - index of intraday data. - any live indice of daily data. Also, sets column order of return as ohlcv then any 'adjclose', 'dividends', 'splits'. --- yahooquery/utils/__init__.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/yahooquery/utils/__init__.py b/yahooquery/utils/__init__.py index 31eff44..836ab7d 100644 --- a/yahooquery/utils/__init__.py +++ b/yahooquery/utils/__init__.py @@ -121,8 +121,8 @@ def _convert_to_timestamp(date=None, start=True): def _get_daily_index(data, index_utc, adj_timezone): # evalute if last indice represents a live interval - last_trade_secs = data["meta"]["regularMarketTime"] * 10**9 - last_trade = pd.Timestamp(last_trade_secs, tz="UTC") + timestamp = data["meta"]["regularMarketTime"] + last_trade = pd.Timestamp.fromtimestamp(timestamp, tz="UTC") has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S") if has_live_indice: # remove it @@ -130,8 +130,9 @@ def _get_daily_index(data, index_utc, adj_timezone): index_utc = index_utc[:-1] # evaluate if it should be put back later. If the close price for # the day is already included in the data, i.e. 
if the live indice - # is simply duplicating data represented in the prior row, then the - # following will evaluate to False. + # is simply duplicating data represented in the prior row, then the + # following will evaluate to False (as live_indice will now be + # within one day of the prior indice) keep_live_indice = live_indice > index_utc[-1] + datetime.timedelta(1) tz = data["meta"]["exchangeTimezoneName"] @@ -152,9 +153,7 @@ def _get_daily_index(data, index_utc, adj_timezone): index = pd.Index(index.date) if has_live_indice and keep_live_indice: live_indice = live_indice.astimezone(tz) if adj_timezone else live_indice - # do not keep tz info - live_indice = live_indice.tz_localize(None).to_pydatetime() - index = index.insert(len(index), live_indice) + index = index.insert(len(index), live_indice.to_pydatetime()) return index @@ -169,30 +168,33 @@ def _event_as_srs(event_data, event): def _history_dataframe(data, daily, adj_timezone=True): data_dict = data["indicators"]["quote"][0].copy() + cols = [ + col for col in ("open", "high", "low", "close", "volume") if col in data_dict + ] if "adjclose" in data["indicators"]: data_dict["adjclose"] = data["indicators"]["adjclose"][0]["adjclose"] + cols.append("adjclose") if 'events' in data: for event, event_data in data["events"].items(): if event not in ("dividends", "splits"): continue data_dict[event] = _event_as_srs(event_data, event) + cols.append(event) df = pd.DataFrame(data_dict, index=data["timestamp"]) # align all on timestamps df.dropna(how="all", inplace=True) + df = df[cols] # determine column order - index_utc = pd.to_datetime(df.index, unit="s", utc=True) + index = pd.to_datetime(df.index, unit="s", utc=True) if daily: - index = _get_daily_index(data, index_utc, adj_timezone) + index = _get_daily_index(data, index, adj_timezone) if len(index) == len(df) - 1: # a live_indice was removed df = df.iloc[:-1] elif adj_timezone: tz = data["meta"]["exchangeTimezoneName"] - # localize and remove tz info - index 
= index_utc.tz_convert(tz).tz_localize(None) - else: - index = index_utc.tz_localize(None) # remove UTC tz info + index = index.tz_convert(tz) df.index = index return df From 9d8fd98c6e9cc0da2c4f800f3d3260440744b86f Mon Sep 17 00:00:00 2001 From: Marcus Read Date: Thu, 24 Nov 2022 22:30:32 +0000 Subject: [PATCH 3/3] Add tests for `utils.__init__._history_dataframe` Adds `TestHistoryDataframe` test class. --- tests/test_ticker.py | 665 ++++++++++++++++++++++++++++++++++- yahooquery/utils/__init__.py | 8 +- 2 files changed, 664 insertions(+), 9 deletions(-) diff --git a/tests/test_ticker.py b/tests/test_ticker.py index cbaf9aa..ac2fe35 100644 --- a/tests/test_ticker.py +++ b/tests/test_ticker.py @@ -1,10 +1,13 @@ -import pytest -import itertools +import datetime import os + +import itertools +import pytest import pandas as pd -from datetime import datetime +from pandas.testing import assert_index_equal, assert_frame_equal, assert_series_equal from yahooquery import Ticker +from yahooquery.utils.__init__ import _history_dataframe TICKERS = [ @@ -148,8 +151,8 @@ def test_p_get_financial_data(ticker): ) def test_history(ticker, period, interval): assert isinstance(ticker.history(period, interval), pd.DataFrame) - - + + def test_dividend_history(ticker): df = ticker.dividend_history(start='1970-01-01') assert isinstance(df, pd.DataFrame) @@ -160,7 +163,8 @@ def test_dividend_history(ticker): [ (start, end) for start, end in zip( - [datetime(2019, 1, 1), "2019-01-01"], ["2019-12-30", datetime(2019, 12, 30)] + [datetime.datetime(2019, 1, 1), "2019-01-01"], + ["2019-12-30", datetime.datetime(2019, 12, 30)], ) ], ) @@ -178,3 +182,652 @@ def test_history_bad_args(ticker, period, interval): def test_adj_ohlc(ticker): assert ticker.history(period="max", adj_ohlc=True) is not None + + +class TestHistoryDataframe(): + """Tests for `utils.__init__._history_dataframe` and dependencies.""" + + @pytest.fixture + def tz_us(self): + yield 'America/New_York' + + @pytest.fixture + def 
tz_oz(self): + yield 'Australia/Sydney' + + @pytest.fixture + def tz_hk(self): + yield 'Asia/Hong_Kong' + + @pytest.fixture + def utc(self): + yield 'UTC' + + @pytest.fixture + def timestamps_daily(self, utc, tz_oz, tz_us, tz_hk): + """Timestamps representing fictional open datetimes and expected mapped days. + + Expected conversions to specific timezones explicitly declared and asserted. + + Yields + ------- + tuple[list[int] + [0] [list[int]] + Unix timestamps, i.e. format as used by Yahoo API. Timestamps represent + datetimes of session opens in terms of UTC. + + [1] pd.Index dtype 'object', values as type datetime.date + Expected days that timestamps would map to if local timezone were + 'America/New_York'. In this case all timestamps are expected to map to + the day of the date of the timestamp. + + [2] pd.Index dtype 'object', values as type datetime.date + Expected days that timestamps would map to if local timezone were + 'Australia/Sydney'. In this case all timestamps are expected to map to + the day after the date of the timestamp. + + [3] pd.Index dtype 'object', values as type datetime.date + Expected days that timestamps would map to if local timezone were 'UTC'. + The first timestamp is expected to map to day of the date of the + timestamp. All other timestamps are expected to map to the day after. 
+ """ + tss = [ + 1667568600, + 1667831400, + 1667917800, + 1668004200, + 1668090600, + 1668177000, + 1668436200, + 1668522600, + 1668609000, + 1668695400, + 1668781800, + 1669041000, + 1669127400, + 1669213800, + ] + + expected_utc = pd.DatetimeIndex( + [ + '2022-11-04 13:30:00', '2022-11-07 14:30:00', + '2022-11-08 14:30:00', '2022-11-09 14:30:00', + '2022-11-10 14:30:00', '2022-11-11 14:30:00', + '2022-11-14 14:30:00', '2022-11-15 14:30:00', + '2022-11-16 14:30:00', '2022-11-17 14:30:00', + '2022-11-18 14:30:00', '2022-11-21 14:30:00', + '2022-11-22 14:30:00', '2022-11-23 14:30:00', + ], + tz=utc, + ) + dti = pd.to_datetime(tss, unit="s") + dti_utc = dti.tz_localize(utc) + assert_index_equal(dti_utc, expected_utc) + expected_utc_days = pd.Index( + [ + datetime.date(2022, 11, 4), + datetime.date(2022, 11, 8), + datetime.date(2022, 11, 9), + datetime.date(2022, 11, 10), + datetime.date(2022, 11, 11), + datetime.date(2022, 11, 12), + datetime.date(2022, 11, 15), + datetime.date(2022, 11, 16), + datetime.date(2022, 11, 17), + datetime.date(2022, 11, 18), + datetime.date(2022, 11, 19), + datetime.date(2022, 11, 22), + datetime.date(2022, 11, 23), + datetime.date(2022, 11, 24), + ] + ) + + expected_oz = pd.DatetimeIndex( + [ + '2022-11-05 00:30:00', '2022-11-08 01:30:00', + '2022-11-09 01:30:00', '2022-11-10 01:30:00', + '2022-11-11 01:30:00', '2022-11-12 01:30:00', + '2022-11-15 01:30:00', '2022-11-16 01:30:00', + '2022-11-17 01:30:00', '2022-11-18 01:30:00', + '2022-11-19 01:30:00', '2022-11-22 01:30:00', + '2022-11-23 01:30:00', '2022-11-24 01:30:00', + ], + tz=tz_oz, + ) + dti_oz = dti_utc.tz_convert(tz_oz) + assert_index_equal(dti_oz, expected_oz) + expected_oz_days = pd.Index( + [ + datetime.date(2022, 11, 5), + datetime.date(2022, 11, 8), + datetime.date(2022, 11, 9), + datetime.date(2022, 11, 10), + datetime.date(2022, 11, 11), + datetime.date(2022, 11, 12), + datetime.date(2022, 11, 15), + datetime.date(2022, 11, 16), + datetime.date(2022, 11, 17), + 
datetime.date(2022, 11, 18), + datetime.date(2022, 11, 19), + datetime.date(2022, 11, 22), + datetime.date(2022, 11, 23), + datetime.date(2022, 11, 24), + ] + ) + assert_index_equal(pd.Index(dti_oz.date), expected_oz_days) + + expected_us = pd.DatetimeIndex( + [ + '2022-11-04 09:30:00', '2022-11-07 09:30:00', + '2022-11-08 09:30:00', '2022-11-09 09:30:00', + '2022-11-10 09:30:00', '2022-11-11 09:30:00', + '2022-11-14 09:30:00', '2022-11-15 09:30:00', + '2022-11-16 09:30:00', '2022-11-17 09:30:00', + '2022-11-18 09:30:00', '2022-11-21 09:30:00', + '2022-11-22 09:30:00', '2022-11-23 09:30:00', + ], + tz=tz_us, + ) + dti_us = dti_utc.tz_convert(tz_us) + assert_index_equal(dti_us, expected_us) + expected_us_days = pd.Index( + [ + datetime.date(2022, 11, 4), + datetime.date(2022, 11, 7), + datetime.date(2022, 11, 8), + datetime.date(2022, 11, 9), + datetime.date(2022, 11, 10), + datetime.date(2022, 11, 11), + datetime.date(2022, 11, 14), + datetime.date(2022, 11, 15), + datetime.date(2022, 11, 16), + datetime.date(2022, 11, 17), + datetime.date(2022, 11, 18), + datetime.date(2022, 11, 21), + datetime.date(2022, 11, 22), + datetime.date(2022, 11, 23), + ] + ) + assert_index_equal(pd.Index(dti_us.date), expected_us_days) + + expected_hk = pd.DatetimeIndex( + [ + '2022-11-04 21:30', '2022-11-07 22:30', + '2022-11-08 22:30', '2022-11-09 22:30', + '2022-11-10 22:30', '2022-11-11 22:30', + '2022-11-14 22:30', '2022-11-15 22:30', + '2022-11-16 22:30', '2022-11-17 22:30', + '2022-11-18 22:30', '2022-11-21 22:30', + '2022-11-22 22:30', '2022-11-23 22:30', + ], + tz=tz_hk, + ) + dti_hk = dti_utc.tz_convert(tz_hk) + assert_index_equal(dti_hk, expected_hk) + expected_hk_days = expected_oz_days # same, both should map to next day + assert_index_equal( + pd.Index(dti_hk.date + datetime.timedelta(1)), expected_hk_days + ) + + yield ( + tss, expected_us_days, expected_oz_days, expected_hk_days, expected_utc_days + ) + + @pytest.fixture + def quote(self): + """Fictional mock OHLCV data 
for 14 datapoints. + + Yields both unordered data and dictionary representing expected + order of return. + """ + opens = list(range(2, 16)) + lows = list(range(1, 15)) + highs = list(range(4, 18)) + closes = list(range(3, 17)) + volumes = list(range(50, 64)) + data = { + "volume": volumes, + "close": closes, + "open": opens, + "high": highs, + "low": lows, + } + expected = { + "open": opens, + "high": highs, + "low": lows, + "close": closes, + "volume": volumes, + } + yield data, expected + + @pytest.fixture + def adjclose(self): + """Fictional mock adjclose data for 14 datapoints.""" + yield [i + 0.25 for i in range(3, 17)] + + @staticmethod + def get_dividends(tss): + """Get fictional mock dividends data for 2 timestamps of `tss`. + + Returns + ------- + tuple[dict[str, dict[str, float | int]], list[float]] + [0] dict[str, dict[str, float | int]] + Mock data for symbol_data["events"]["dividends"]. Data + includes dividends for two timestamps of `tss`. + [1] list[float] + Expected contents of dividends column of DataFrame created + for `tss` and with data that includes [0]. + """ + indices = (2, 8) + amount = 0.12 + d = {str(tss[i]): {'amount': amount, 'date': tss[i]} for i in indices} + expected = [amount if i in indices else float('nan') for i in range(14)] + return d, expected + + @pytest.fixture + def dividends_daily(self, timestamps_daily): + """Mock data and expected col values for daily dividends. + + See `get_dividends.__doc__` + """ + yield self.get_dividends(timestamps_daily[0]) + + @staticmethod + def get_splits(tss): + """Get fictional mock splits data for 1 timestamps of `tss`. + + Returns + ------- + tuple[dict[str, dict[str, int | str]], list[float]] + [0] dict[str, dict[str, float | int]] + Mock data for symbol_data["events"]["splits"]. Data + includes splits for one timestamp of `tss`. + [1] list[float] + Expected contents of splits column of DataFrame created + for `tss` and with data that includes [0]. 
+ """ + indice = 11 + ts = tss[indice] + d = { + str(ts): { + 'data': ts, + 'numerator': 3, + 'denominator': 1, + 'splitRatio': '3:1' + } + } + expected = [3 if i == indice else float('nan') for i in range(14)] + return d, expected + + @pytest.fixture + def splits_daily(self, timestamps_daily): + """Mock data and expected col values for daily splits. + + See `get_splits.__doc__` + """ + yield self.get_splits(timestamps_daily[0]) + + @staticmethod + def build_mock_data( + tss, tz, quote, adjclose=None, splits=None, dividends=None, last_trade=None + ): + """Get mock data for a symbol from which to create dataframe. + + Return can be passed as `data` parameter of `_history_dataframe`. + """ + if last_trade is None: + last_trade = 1669237204 + expected_ts = pd.Timestamp('2022-11-23 21:00:04') + assert pd.Timestamp.fromtimestamp(last_trade) == expected_ts + meta = { + 'regularMarketTime': last_trade, + 'exchangeTimezoneName': tz, + } + + indicators = {"quote": [quote.copy()]} + if adjclose is not None: + indicators["adjclose"] = [{"adjclose": adjclose}] + + events = {"fake_event": {'1667568600': {"fake_event_key": 66.6}}} + for key, event_data in zip(("dividends", "splits"), (dividends, splits)): + if event_data is None: + continue + events[key] = event_data + + return dict(meta=meta, indicators=indicators, timestamp=tss, events=events) + + @staticmethod + def create_expected(expected_index, quote, dividends, splits, adjclose=None): + """Create expected return from column parts.""" + df = pd.DataFrame(quote, index=expected_index) + if adjclose is not None: + df["adjclose"] = adjclose + df["dividends"] = dividends + df["splits"] = splits + return df + + @staticmethod + def verify_expected_daily_row_11(df, indice): + """Hard coded sanity check on specific row of expected dataframe.""" + i = 11 + expected = pd.Series( + dict(open=13, high=15, low=12, close=14, volume=61, adjclose=14.25), + name=indice, + ) + assert_series_equal(df.iloc[i][:-2], expected) + assert 
pd.isna(df.iloc[i][-2]) # no dividends + assert df.iloc[i][-1] == 3 # splits + return df + + @pytest.fixture + def expected_daily_utc( + self, timestamps_daily, quote, dividends_daily, splits_daily, adjclose + ): + """Expected return if timestamps interpreted with local tz as utc.""" + df = self.create_expected( + timestamps_daily[4], quote[1], dividends_daily[1], splits_daily[1], adjclose + ) + self.verify_expected_daily_row_11(df, datetime.date(2022, 11, 22)) + yield df + + @pytest.fixture + def expected_daily_us( + self, timestamps_daily, quote, dividends_daily, splits_daily, adjclose + ): + """Expected return if timestamps interpreted with local tz as us.""" + df = self.create_expected( + timestamps_daily[1], quote[1], dividends_daily[1], splits_daily[1], adjclose + ) + self.verify_expected_daily_row_11(df, datetime.date(2022, 11, 21)) + yield df + + @pytest.fixture + def expected_daily_us_bare(self, timestamps_daily, quote): + """As `expected_daily_us` with only ohlcv columns.""" + df = pd.DataFrame(quote[1], index=timestamps_daily[1]) + # Hard coded sanity check for specific row + i = 11 + expected = pd.Series( + dict(open=13, high=15, low=12, close=14, volume=61), + name=datetime.date(2022, 11, 21), + ) + assert_series_equal(df.iloc[i], expected) + yield df + + @pytest.fixture + def expected_daily_oz( + self, timestamps_daily, quote, dividends_daily, splits_daily, adjclose + ): + """Expected return if timestamps interpreted with local tz as oz.""" + df = self.create_expected( + timestamps_daily[2], quote[1], dividends_daily[1], splits_daily[1], adjclose + ) + self.verify_expected_daily_row_11(df, datetime.date(2022, 11, 22)) + yield df + + @pytest.fixture + def expected_daily_hk( + self, timestamps_daily, quote, dividends_daily, splits_daily, adjclose + ): + """Expected return if timestamps interpreted with local tz as hk.""" + df = self.create_expected( + timestamps_daily[3], quote[1], dividends_daily[1], splits_daily[1], adjclose + ) + 
self.verify_expected_daily_row_11(df, datetime.date(2022, 11, 22)) + yield df + + def test_daily( + self, + timestamps_daily, + quote, + adjclose, + dividends_daily, + splits_daily, + expected_daily_utc, + expected_daily_us, + expected_daily_oz, + expected_daily_hk, + utc, + tz_us, + tz_oz, + tz_hk, + ): + """Test for expected returns for mock data reflecting a daily period.""" + + def f(data, adj_timezone): + return _history_dataframe(data, daily=True, adj_timezone=adj_timezone) + + tss = timestamps_daily[0] + quote_, _ = quote + adjclose_ = adjclose + splits, _ = splits_daily + dividends, _ = dividends_daily + + expecteds = ( + expected_daily_utc, expected_daily_us, expected_daily_oz, expected_daily_hk + ) + tzs = (utc, tz_us, tz_oz, tz_hk) + for expected, tz in zip(expecteds, tzs): + data = self.build_mock_data(tss, tz, quote_, adjclose_, splits, dividends) + for adj_timezone in (True, False): + # tz makes no difference as daily and there is no live indice + rtrn = f(data, adj_timezone=adj_timezone) + assert_frame_equal(rtrn, expected) + + # check effect if there are no dividends and/or splits + expected = expected_daily_us + tz = tz_us + adj_timezone = False + # no dividends + dividends_srs = expected.pop("dividends") + data = self.build_mock_data(tss, tz, quote_, adjclose_, splits=splits) + rtrn = f(data, adj_timezone) + assert_frame_equal(rtrn, expected) + # no splits + expected.pop("splits") + expected["dividends"] = dividends_srs + data = self.build_mock_data(tss, tz, quote_, adjclose_, dividends=dividends) + rtrn = f(data, adj_timezone) + assert_frame_equal(rtrn, expected) + # neither dividends nor splits + expected.pop("dividends") + data = self.build_mock_data(tss, tz, quote_, adjclose_) + rtrn = f(data, adj_timezone) + assert_frame_equal(rtrn, expected) + + def test_live_indice( + self, timestamps_daily, expected_daily_us_bare, tz_us, utc, quote + ): + """Test daily data with live indice.""" + live_indice = 1669231860 + expected_li_ts = 
pd.Timestamp("2022-11-23 19:31") + assert pd.Timestamp.fromtimestamp(live_indice) == expected_li_ts + + tss, expected_days, *_ = timestamps_daily + tss = tss[:-1] + tss.append(live_indice) + + expected_df = expected_daily_us_bare + data = self.build_mock_data(tss, tz_us, quote[0], last_trade=live_indice) + + # verify live indice has utc timezone when adj_timezone False + rtrn = _history_dataframe(data, daily=True, adj_timezone=False) + expected_li = pd.Timestamp("2022-11-23 19:31", tz=utc).to_pydatetime() + expected_index = expected_days[:-1] + expected_index = expected_index.insert(len(expected_index), expected_li) + expected_df.index = expected_index + assert_frame_equal(rtrn, expected_df) + + # verify live indice has local timezone when adj_timezone True + rtrn = _history_dataframe(data, daily=True, adj_timezone=True) + expected_li = pd.Timestamp("2022-11-23 14:31", tz=tz_us).to_pydatetime() + expected_index = expected_index[:-1].insert(len(expected_index)-1, expected_li) + expected_df.index = expected_index + assert_frame_equal(rtrn, expected_df) + + def test_duplicate_live_indice( + self, timestamps_daily, expected_daily_us_bare, tz_us, quote + ): + """Test live indice removed if day already represented.""" + live_indice = 1669237204 + expected_li_ts = pd.Timestamp("2022-11-23 21:00:04") + assert pd.Timestamp.fromtimestamp(live_indice) == expected_li_ts + + tss = timestamps_daily[0] + # to get it all to fit to 14 indices, lose the first ts + tss = tss[1:] + tss.append(live_indice) + + data = self.build_mock_data(tss, tz_us, quote[0], last_trade=live_indice) + rtrn = _history_dataframe(data, daily=True, adj_timezone=False) + + # create expected + expected_template = expected_daily_us_bare + expected_index = expected_template.index[1:] + assert expected_index[-1] == datetime.date(2022, 11, 23) + # last row, live indice, expected to be removed as day already represented + expected_df = expected_template[:-1] + expected_df.index = expected_index + 
assert_frame_equal(rtrn, expected_df) + + @pytest.fixture + def timestamps_intraday(self, utc): + """Timestamps representing fictional datetimes and expected mapped indices. + + Timestamps cover two days with change in DST observance. + + Yields + ------- + tuple[list[int] + [0] [list[int]] + Unix timestamps, i.e. format as used by Yahoo API. Timestamps represent + datetimes of hourly indices in terms of UTC. + + [1] pd.DatetimeIndex dtype 'datetime64[ns, UTC]' + Expected indices that timestamps would map to if local timezone were + 'UTC'. + """ + tss = [ + 1667568600, + 1667572200, + 1667575800, + 1667579400, + 1667583000, + 1667586600, + 1667590200, + 1667831400, + 1667835000, + 1667838600, + 1667842200, + 1667845800, + 1667849400, + 1667853000, + ] + + expected_index_utc = pd.DatetimeIndex( + [ + '2022-11-04 13:30:00', '2022-11-04 14:30:00', + '2022-11-04 15:30:00', '2022-11-04 16:30:00', + '2022-11-04 17:30:00', '2022-11-04 18:30:00', + '2022-11-04 19:30:00', '2022-11-07 14:30:00', + '2022-11-07 15:30:00', '2022-11-07 16:30:00', + '2022-11-07 17:30:00', '2022-11-07 18:30:00', + '2022-11-07 19:30:00', '2022-11-07 20:30:00' + ], + tz=utc, + ) + dti = pd.to_datetime(tss, unit="s") + dti_utc = dti.tz_localize(utc) + assert_index_equal(dti_utc, expected_index_utc) + + yield tss, expected_index_utc + + @pytest.fixture + def dividends_intraday(self, timestamps_intraday): + """Get mock data and expected col values for intraday dividends. + + The Yahoo API attaches any dividends to the first intraday indice + of each session. This mock does not respect this alignment, which + is inconsequential for the test purposes. + + See `get_dividends.__doc__`. + """ + yield self.get_dividends(timestamps_intraday[0]) + + @pytest.fixture + def splits_intraday(self, timestamps_intraday): + """Mock data and expected col values for intraday splits. + + The Yahoo API attaches any dividends to the first intraday indice + of each session. 
This mock does not respect this alignment, which + is inconsequential for the test purposes. + + See `get_splits.__doc__`. + """ + yield self.get_splits(timestamps_intraday[0]) + + @pytest.fixture + def expected_intraday( + self, timestamps_intraday, quote, dividends_intraday, splits_intraday + ): + """Expected return for intraday timestamps.""" + _, expected_utc = timestamps_intraday + df = self.create_expected( + expected_utc, quote[1], dividends_intraday[1], splits_intraday[1] + ) + # hard coded sanity check on specific row + i = 8 + expected = pd.Series( + dict(open=10, high=12, low=9, close=11, volume=58, dividends=0.12), + name=pd.Timestamp("2022-11-7 15:30", tz="UTC"), + ) + assert_series_equal(df.iloc[i][:-1], expected) + assert pd.isna(df.iloc[i][-1]) + yield df + + def test_intraday( + self, + timestamps_intraday, + tz_us, + quote, + splits_intraday, + dividends_intraday, + expected_intraday, + ): + """Test for expected returns for mock data reflecting an intraday period.""" + + def f(data, adj_timezone): + return _history_dataframe(data, daily=False, adj_timezone=adj_timezone) + + tz = tz_us + tss, _ = timestamps_intraday + quote_, _ = quote + splits, _ = splits_intraday + dividends, _ = dividends_intraday + + data = self.build_mock_data(tss, tz, quote_, splits=splits, dividends=dividends) + rtrn = f(data, adj_timezone=False) + expected = expected_intraday + assert_frame_equal(rtrn, expected) + rtrn = f(data, adj_timezone=True) + expected.index = expected.index.tz_convert(tz) + assert_frame_equal(rtrn, expected) + + # no dividends + dividends_srs = expected.pop("dividends") + data = self.build_mock_data(tss, tz, quote_, splits=splits) + rtrn = f(data, adj_timezone=True) + assert_frame_equal(rtrn, expected) + # no splits + expected.pop("splits") + expected["dividends"] = dividends_srs + data = self.build_mock_data(tss, tz, quote_, dividends=dividends) + rtrn = f(data, adj_timezone=True) + assert_frame_equal(rtrn, expected) + # neither dividends nor splits + 
expected.pop("dividends") + data = self.build_mock_data(tss, tz, quote_) + rtrn = f(data, adj_timezone=True) + assert_frame_equal(rtrn, expected) diff --git a/yahooquery/utils/__init__.py b/yahooquery/utils/__init__.py index 836ab7d..27af52c 100644 --- a/yahooquery/utils/__init__.py +++ b/yahooquery/utils/__init__.py @@ -122,18 +122,20 @@ def _convert_to_timestamp(date=None, start=True): def _get_daily_index(data, index_utc, adj_timezone): # evalute if last indice represents a live interval timestamp = data["meta"]["regularMarketTime"] - last_trade = pd.Timestamp.fromtimestamp(timestamp, tz="UTC") + last_trade = pd.Timestamp.fromtimestamp(timestamp) + last_trade = last_trade.tz_localize("UTC") has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S") if has_live_indice: # remove it live_indice = index_utc[-1] index_utc = index_utc[:-1] + ONE_DAY = datetime.timedelta(1) # evaluate if it should be put back later. If the close price for # the day is already included in the data, i.e. if the live indice # is simply duplicating data represented in the prior row, then the # following will evaluate to False (as live_indice will now be # within one day of the prior indice) - keep_live_indice = live_indice > index_utc[-1] + datetime.timedelta(1) + keep_live_indice = index_utc.empty or live_indice > index_utc[-1] + ONE_DAY tz = data["meta"]["exchangeTimezoneName"] index_local = index_utc.tz_convert(tz) @@ -187,7 +189,7 @@ def _history_dataframe(data, daily, adj_timezone=True): df = df[cols] # determine column order index = pd.to_datetime(df.index, unit="s", utc=True) - if daily: + if daily and not df.empty: index = _get_daily_index(data, index, adj_timezone) if len(index) == len(df) - 1: # a live_indice was removed