diff --git a/tests/ticker.py b/tests/ticker.py index 681f0387d..5db57316b 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -18,6 +18,8 @@ import unittest import requests_cache from typing import Union, Any +import re +from urllib.parse import urlparse, parse_qs, urlencode, urlunparse ticker_attributes = ( ("major_holders", pd.DataFrame), @@ -76,7 +78,7 @@ def test_getTz(self): tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"] for tkr in tkrs: # First step: remove ticker from tz-cache - yf.utils.get_tz_cache().store(tkr, None) + yf.cache.get_tz_cache().store(tkr, None) # Test: dat = yf.Ticker(tkr, session=self.session) @@ -295,14 +297,25 @@ def test_no_expensive_calls_introduced(self): will quickly trigger spam-block when doing bulk download of history data. """ symbol = "GOOGL" - range = "1y" + period = "1y" with requests_cache.CachedSession(backend="memory") as session: ticker = yf.Ticker(symbol, session=session) - ticker.history(range) - actual_urls_called = tuple([r.url for r in session.cache.filter()]) + ticker.history(period=period) + actual_urls_called = [r.url for r in session.cache.filter()] + + # Remove 'crumb' argument + for i in range(len(actual_urls_called)): + u = actual_urls_called[i] + parsed_url = urlparse(u) + query_params = parse_qs(parsed_url.query) + query_params.pop('crumb', None) + query_params.pop('cookie', None) + u = urlunparse(parsed_url._replace(query=urlencode(query_params, doseq=True))) + actual_urls_called[i] = u + actual_urls_called = tuple(actual_urls_called) expected_urls = ( - f"https://query2.finance.yahoo.com/v8/finance/chart/{symbol}?events=div%2Csplits%2CcapitalGains&includePrePost=False&interval=1d&range={range}", + f"https://query2.finance.yahoo.com/v8/finance/chart/{symbol}?events=div%2Csplits%2CcapitalGains&includePrePost=False&interval=1d&range={period}", ) self.assertEqual( expected_urls, diff --git a/yfinance/__init__.py b/yfinance/__init__.py index bdbc93d62..141be3621 100644 --- a/yfinance/__init__.py +++ b/yfinance/__init__.py @@ -23,7 +23,8 @@ from .ticker import Ticker from .tickers import Tickers from .multi import download -from .utils import set_tz_cache_location, enable_debug_mode +from .utils import enable_debug_mode +from .cache import set_tz_cache_location __version__ = version.version __author__ = "Ran Aroussi" diff --git a/yfinance/base.py b/yfinance/base.py index 21af2bb51..4c981f225 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -34,9 +34,8 @@ import pandas as pd import requests -from . import shared -from . import utils -from .data import TickerData +from . import shared, utils, cache +from .data import YfData from .scrapers.analysis import Analysis from .scrapers.fundamentals import Fundamentals from .scrapers.holders import Holders @@ -68,12 +67,12 @@ def __init__(self, ticker, session=None): if utils.is_isin(self.ticker): self.ticker = utils.get_ticker_by_isin(self.ticker, None, session) - self._data: TickerData = TickerData(self.ticker, session=session) + self._data: YfData = YfData(session=session) - self._analysis = Analysis(self._data) - self._holders = Holders(self._data) - self._quote = Quote(self._data) - self._fundamentals = Fundamentals(self._data) + self._analysis = Analysis(self._data, ticker) + self._holders = Holders(self._data, ticker) + self._quote = Quote(self._data, ticker) + self._fundamentals = Fundamentals(self._data, ticker) self._fast_info = None @@ -1641,12 +1640,12 @@ def map_signals_to_ranges(f, f_up, f_down): def _get_ticker_tz(self, proxy, timeout): if self._tz is not None: return self._tz - cache = utils.get_tz_cache() - tz = cache.lookup(self.ticker) + c = cache.get_tz_cache() + tz = c.lookup(self.ticker) if tz and not utils.is_valid_timezone(tz): # Clear from cache and force re-fetch - cache.store(self.ticker, None) + c.store(self.ticker, None) tz = None if tz is None: @@ -1654,7 +1653,7 @@ def _get_ticker_tz(self, proxy, timeout): if utils.is_valid_timezone(tz): # info fetch is relatively slow so cache timezone - cache.store(self.ticker, tz) + c.store(self.ticker, tz) else: tz = None diff --git a/yfinance/cache.py b/yfinance/cache.py new file mode 100644 index 000000000..d1acf2810 --- /dev/null +++ b/yfinance/cache.py @@ -0,0 +1,400 @@ +import peewee as _peewee +from threading import Lock +import os as _os +import appdirs as _ad +import atexit as _atexit +import datetime as _datetime +import pickle as _pkl + +from .utils import get_yf_logger + +_cache_init_lock = Lock() + +# -------------- +# TimeZone cache +# -------------- + +class _TzCacheException(Exception): + pass + + +class _TzCacheDummy: + """Dummy cache to use if tz cache is disabled""" + + def lookup(self, tkr): + return None + + def store(self, tkr, tz): + pass + + @property + def tz_db(self): + return None + + +class _TzCacheManager: + _tz_cache = None + + @classmethod + def get_tz_cache(cls): + if cls._tz_cache is None: + with _cache_init_lock: + cls._initialise() + return cls._tz_cache + + @classmethod + def _initialise(cls, cache_dir=None): + cls._tz_cache = _TzCache() + + +class _TzDBManager: + _db = None + _cache_dir = _os.path.join(_ad.user_cache_dir(), "py-yfinance") + + @classmethod + def get_database(cls): + if cls._db is None: + cls._initialise() + return cls._db + + @classmethod + def close_db(cls): + if cls._db is not None: + try: + cls._db.close() + except Exception: + # Must discard exceptions because Python trying to quit. + pass + + + @classmethod + def _initialise(cls, cache_dir=None): + if cache_dir is not None: + cls._cache_dir = cache_dir + + if not _os.path.isdir(cls._cache_dir): + try: + _os.makedirs(cls._cache_dir) + except OSError as err: + raise _TzCacheException(f"Error creating TzCache folder: '{cls._cache_dir}' reason: {err}") + elif not (_os.access(cls._cache_dir, _os.R_OK) and _os.access(cls._cache_dir, _os.W_OK)): + raise _TzCacheException(f"Cannot read and write in TzCache folder: '{cls._cache_dir}'") + + cls._db = _peewee.SqliteDatabase( + _os.path.join(cls._cache_dir, 'tkr-tz.db'), + pragmas={'journal_mode': 'wal', 'cache_size': -64} + ) + + old_cache_file_path = _os.path.join(cls._cache_dir, "tkr-tz.csv") + if _os.path.isfile(old_cache_file_path): + _os.remove(old_cache_file_path) + + @classmethod + def set_location(cls, new_cache_dir): + if cls._db is not None: + cls._db.close() + cls._db = None + cls._cache_dir = new_cache_dir + + @classmethod + def get_location(cls): + return cls._cache_dir + +# close DB when Python exists +_atexit.register(_TzDBManager.close_db) + + +tz_db_proxy = _peewee.Proxy() +class _KV(_peewee.Model): + key = _peewee.CharField(primary_key=True) + value = _peewee.CharField(null=True) + + class Meta: + database = tz_db_proxy + without_rowid = True + + +class _TzCache: + def __init__(self): + self.initialised = -1 + self.db = None + self.dummy = False + + def get_db(self): + if self.db is not None: + return self.db + + try: + self.db = _TzDBManager.get_database() + except _TzCacheException as err: + get_yf_logger().info(f"Failed to create TzCache, reason: {err}. " + "TzCache will not be used. " + "Tip: You can direct cache to use a different location with 'set_tz_cache_location(mylocation)'") + self.dummy = True + return None + return self.db + + def initialise(self): + if self.initialised != -1: + return + + db = self.get_db() + if db is None: + self.initialised = 0 # failure + return + + db.connect() + tz_db_proxy.initialize(db) + db.create_tables([_KV]) + self.initialised = 1 # success + + def lookup(self, key): + if self.dummy: + return None + + if self.initialised == -1: + self.initialise() + + if self.initialised == 0: # failure + return None + + try: + return _KV.get(_KV.key == key).value + except _KV.DoesNotExist: + return None + + def store(self, key, value): + if self.dummy: + return + + if self.initialised == -1: + self.initialise() + + if self.initialised == 0: # failure + return + + db = self.get_db() + if db is None: + return + try: + if value is None: + q = _KV.delete().where(_KV.key == key) + q.execute() + return + with db.atomic(): + _KV.insert(key=key, value=value).execute() + except _peewee.IntegrityError: + # Integrity error means the key already exists. Try updating the key. + old_value = self.lookup(key) + if old_value != value: + get_yf_logger().debug(f"Value for key {key} changed from {old_value} to {value}.") + with db.atomic(): + q = _KV.update(value=value).where(_KV.key == key) + q.execute() + + +def get_tz_cache(): + return _TzCacheManager.get_tz_cache() + + +def set_tz_cache_location(cache_dir: str): + """ + Sets the path to create the "py-yfinance" cache folder in. + Useful if the default folder returned by "appdir.user_cache_dir()" is not writable. + Must be called before cache is used (that is, before fetching tickers). + :param cache_dir: Path to use for caches + :return: None + """ + _TzDBManager.set_location(cache_dir) + + + +# -------------- +# Cookie cache +# -------------- + +class _CookieCacheException(Exception): + pass + + +class _CookieCacheDummy: + """Dummy cache to use if Cookie cache is disabled""" + + def lookup(self, tkr): + return None + + def store(self, tkr, Cookie): + pass + + @property + def Cookie_db(self): + return None + + +class _CookieCacheManager: + _Cookie_cache = None + + @classmethod + def get_cookie_cache(cls): + if cls._Cookie_cache is None: + with _cache_init_lock: + cls._initialise() + return cls._Cookie_cache + + @classmethod + def _initialise(cls, cache_dir=None): + cls._Cookie_cache = _CookieCache() + + +class _CookieDBManager: + _db = None + _cache_dir = _os.path.join(_ad.user_cache_dir(), "py-yfinance") + + @classmethod + def get_database(cls): + if cls._db is None: + cls._initialise() + return cls._db + + @classmethod + def close_db(cls): + if cls._db is not None: + try: + cls._db.close() + except Exception: + # Must discard exceptions because Python trying to quit. + pass + + + @classmethod + def _initialise(cls, cache_dir=None): + if cache_dir is not None: + cls._cache_dir = cache_dir + + if not _os.path.isdir(cls._cache_dir): + try: + _os.makedirs(cls._cache_dir) + except OSError as err: + raise _CookieCacheException(f"Error creating CookieCache folder: '{cls._cache_dir}' reason: {err}") + elif not (_os.access(cls._cache_dir, _os.R_OK) and _os.access(cls._cache_dir, _os.W_OK)): + raise _CookieCacheException(f"Cannot read and write in CookieCache folder: '{cls._cache_dir}'") + + cls._db = _peewee.SqliteDatabase( + _os.path.join(cls._cache_dir, 'cookies.db'), + pragmas={'journal_mode': 'wal', 'cache_size': -64} + ) + + @classmethod + def set_location(cls, new_cache_dir): + if cls._db is not None: + cls._db.close() + cls._db = None + cls._cache_dir = new_cache_dir + + @classmethod + def get_location(cls): + return cls._cache_dir + +# close DB when Python exists +_atexit.register(_CookieDBManager.close_db) + + +Cookie_db_proxy = _peewee.Proxy() +class _CookieSchema(_peewee.Model): + strategy = _peewee.CharField(primary_key=True) + fetch_date = _peewee.DateTimeField(default=_datetime.datetime.now) + + # Which cookie type depends on strategy + cookie_bytes = _peewee.BlobField() + + class Meta: + database = Cookie_db_proxy + without_rowid = True + + +class _CookieCache: + def __init__(self): + self.initialised = -1 + self.db = None + self.dummy = False + + def get_db(self): + if self.db is not None: + return self.db + + try: + self.db = _CookieDBManager.get_database() + except _CookieCacheException as err: + get_yf_logger().info(f"Failed to create CookieCache, reason: {err}. " + "CookieCache will not be used. " + "Tip: You can direct cache to use a different location with 'set_tz_cache_location(mylocation)'") + self.dummy = True + return None + return self.db + + def initialise(self): + if self.initialised != -1: + return + + db = self.get_db() + if db is None: + self.initialised = 0 # failure + return + + db.connect() + Cookie_db_proxy.initialize(db) + db.create_tables([_CookieSchema]) + self.initialised = 1 # success + + def lookup(self, strategy): + if self.dummy: + return None + + if self.initialised == -1: + self.initialise() + + if self.initialised == 0: # failure + return None + + try: + data = _CookieSchema.get(_CookieSchema.strategy == strategy) + cookie = _pkl.loads(data.cookie_bytes) + return {'cookie':cookie, 'age':_datetime.datetime.now()-data.fetch_date} + except _CookieSchema.DoesNotExist: + return None + + def store(self, strategy, cookie): + if self.dummy: + return + + if self.initialised == -1: + self.initialise() + + if self.initialised == 0: # failure + return + + db = self.get_db() + if db is None: + return + try: + q = _CookieSchema.delete().where(_CookieSchema.strategy == strategy) + q.execute() + if cookie is None: + return + with db.atomic(): + cookie_pkl = _pkl.dumps(cookie, _pkl.HIGHEST_PROTOCOL) + _CookieSchema.insert(strategy=strategy, cookie_bytes=cookie_pkl).execute() + except _peewee.IntegrityError: + raise + # # Integrity error means the strategy already exists. Try updating the strategy. + # old_value = self.lookup(strategy) + # if old_value != cookie: + # get_yf_logger().debug(f"cookie for strategy {strategy} changed from {old_value} to {cookie}.") + # with db.atomic(): + # q = _CookieSchema.update(cookie=cookie).where(_CookieSchema.strategy == strategy) + # q.execute() + + +def get_cookie_cache(): + return _CookieCacheManager.get_cookie_cache() + diff --git a/yfinance/data.py b/yfinance/data.py index 442c7477a..b3791154b 100644 --- a/yfinance/data.py +++ b/yfinance/data.py @@ -1,16 +1,13 @@ import functools from functools import lru_cache -import logging - import requests as requests -import re -import random -import time +from bs4 import BeautifulSoup +import datetime from frozendict import frozendict -from . import utils +from . import utils, cache cache_maxsize = 64 @@ -36,25 +33,348 @@ def wrapped(*args, **kwargs): return wrapped -class TickerData: +import threading +class SingletonMeta(type): + """ + Metaclass that creates a Singleton instance. + """ + _instances = {} + _lock = threading.Lock() + + def __call__(cls, *args, **kwargs): + with cls._lock: + if cls not in cls._instances: + instance = super().__call__(*args, **kwargs) + cls._instances[cls] = instance + else: + cls._instances[cls]._set_session(*args, **kwargs) + return cls._instances[cls] + + +class YfData(metaclass=SingletonMeta): """ - Have one place to retrieve data from Yahoo API in order to ease caching and speed up operations + Have one place to retrieve data from Yahoo API in order to ease caching and speed up operations. + Singleton means one session one cookie shared by all threads. """ user_agent_headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} - def __init__(self, ticker: str, session=None): - self.ticker = ticker - self._session = session or requests + def __init__(self, session=None): + self._session = session or requests.Session() - def get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30): - proxy = self._get_proxy(proxy) + try: + self._session.cache + except AttributeError: + # Not caching + self._session_is_caching = False + else: + # Is caching. This is annoying. + # Can't simply use a non-caching session to fetch cookie & crumb, + # because then the caching-session won't have cookie. + self._session_is_caching = True + from requests_cache import DO_NOT_CACHE + self._expire_after = DO_NOT_CACHE + self._crumb = None + self._cookie = None + if self._session_is_caching and self._cookie is None: + utils.print_once("WARNING: cookie & crumb does not work well with requests_cache. Am experimenting with 'expire_after=DO_NOT_CACHE', but you need to help stress-test.") + + # Default to using 'basic' strategy + self._cookie_strategy = 'basic' + # If it fails, then fallback method is 'csrf' + # self._cookie_strategy = 'csrf' + + self._cookie_lock = threading.Lock() + + def _set_session(self, session): + if session is None: + return + with self._cookie_lock: + self._session = session + + def _set_cookie_strategy(self, strategy, have_lock=False): + if strategy == self._cookie_strategy: + return + if not have_lock: + self._cookie_lock.acquire() + + try: + self._cookie_strategy = strategy + if self._cookie_strategy == 'csrf': + utils.get_yf_logger().debug(f'toggling cookie strategy {self._cookie_strategy} -> basic') + self._session.cookies.clear() + self._cookie_strategy = 'basic' + else: + utils.get_yf_logger().debug(f'toggling cookie strategy {self._cookie_strategy} -> csrf') + self._cookie_strategy = 'csrf' + self._cookie = None + self._crumb = None + except Exception: + self._cookie_lock.release() + raise + + if not have_lock: + self._cookie_lock.release() + + def _save_session_cookies(self): + try: + cache.get_cookie_cache().store('csrf', self._session.cookies) + except Exception: + return False + return True + + def _load_session_cookies(self): + cookie_dict = cache.get_cookie_cache().lookup('csrf') + if cookie_dict is None: + return False + # Periodically refresh, 24 hours seems fair. + if cookie_dict['age'] > datetime.timedelta(days=1): + return False + self._session.cookies.update(cookie_dict['cookie']) + utils.get_yf_logger().debug('loaded persistent cookie') + + def _save_cookie_basic(self, cookie): + try: + cache.get_cookie_cache().store('basic', cookie) + except Exception: + return False + return True + def _load_cookie_basic(self): + cookie_dict = cache.get_cookie_cache().lookup('basic') + if cookie_dict is None: + return None + # Periodically refresh, 24 hours seems fair. + if cookie_dict['age'] > datetime.timedelta(days=1): + return None + utils.get_yf_logger().debug('loaded persistent cookie') + return cookie_dict['cookie'] + + def _get_cookie_basic(self, proxy=None, timeout=30): + if self._cookie is not None: + utils.get_yf_logger().debug('reusing cookie') + return self._cookie + + self._cookie = self._load_cookie_basic() + if self._cookie is not None: + return self._cookie + + # To avoid infinite recursion, do NOT use self.get() + # - 'allow_redirects' copied from @psychoz971 solution - does it help USA? response = self._session.get( - url=url, - params=params, + url='https://fc.yahoo.com', + headers=self.user_agent_headers, proxies=proxy, timeout=timeout, - headers=user_agent_headers or self.user_agent_headers) + allow_redirects=True) + + if not response.cookies: + utils.get_yf_logger().debug("response.cookies = None") + return None + self._cookie = list(response.cookies)[0] + if self._cookie == '': + utils.get_yf_logger().debug("list(response.cookies)[0] = ''") + return None + self._save_cookie_basic(self._cookie) + utils.get_yf_logger().debug(f"fetched basic cookie = {self._cookie}") + return self._cookie + + def _get_crumb_basic(self, proxy=None, timeout=30): + if self._crumb is not None: + utils.get_yf_logger().debug('reusing crumb') + return self._crumb + + cookie = self._get_cookie_basic() + if cookie is None: + return None + + # - 'allow_redirects' copied from @psychoz971 solution - does it help USA? + get_args = { + 'url': "https://query1.finance.yahoo.com/v1/test/getcrumb", + 'headers': self.user_agent_headers, + 'cookies': {cookie.name: cookie.value}, + 'proxies': proxy, + 'timeout': timeout, + 'allow_redirects': True + } + if self._session_is_caching: + get_args['expire_after'] = self._expire_after + crumb_response = self._session.get(**get_args) + else: + crumb_response = self._session.get(**get_args) + self._crumb = crumb_response.text + if self._crumb is None or '' in self._crumb: + return None + + utils.get_yf_logger().debug(f"crumb = '{self._crumb}'") + return self._crumb + + @utils.log_indent_decorator + def _get_cookie_and_crumb_basic(self, proxy, timeout): + cookie = self._get_cookie_basic(proxy, timeout) + crumb = self._get_crumb_basic(proxy, timeout) + return cookie, crumb + + def _get_cookie_csrf(self, proxy, timeout): + if utils.reuse_cookie and self._cookie is not None: + utils.get_yf_logger().debug('reusing cookie') + return True + + elif self._load_session_cookies(): + utils.get_yf_logger().debug('reusing persistent cookie') + self._cookie = True + return True + + base_args = { + 'headers': self.user_agent_headers, + 'proxies': proxy, + 'timeout': timeout} + + get_args = {**base_args, 'url': 'https://guce.yahoo.com/consent'} + if self._session_is_caching: + get_args['expire_after'] = self._expire_after + response = self._session.get(**get_args) + else: + response = self._session.get(**get_args) + + soup = BeautifulSoup(response.content, 'html.parser') + csrfTokenInput = soup.find('input', attrs={'name': 'csrfToken'}) + if csrfTokenInput is None: + return False + csrfToken = csrfTokenInput['value'] + utils.get_yf_logger().debug(f'csrfToken = {csrfToken}') + sessionIdInput = soup.find('input', attrs={'name': 'sessionId'}) + sessionId = sessionIdInput['value'] + utils.get_yf_logger().debug(f"sessionId='{sessionId}") + + originalDoneUrl = 'https://finance.yahoo.com/' + namespace = 'yahoo' + data = { + 'agree': ['agree', 'agree'], + 'consentUUID': 'default', + 'sessionId': sessionId, + 'csrfToken': csrfToken, + 'originalDoneUrl': originalDoneUrl, + 'namespace': namespace, + } + post_args = {**base_args, + 'url': f'https://consent.yahoo.com/v2/collectConsent?sessionId={sessionId}', + 'data': data} + get_args = {**base_args, + 'url': f'https://guce.yahoo.com/copyConsent?sessionId={sessionId}', + 'data': data} + if self._session_is_caching: + post_args['expire_after'] = self._expire_after + get_args['expire_after'] = self._expire_after + self._session.post(**post_args) + self._session.get(**get_args) + else: + self._session.post(**post_args) + self._session.get(**get_args) + self._cookie = True + self._save_session_cookies() + return True + + @utils.log_indent_decorator + def _get_crumb_csrf(self, proxy=None, timeout=30): + # Credit goes to @bot-unit #1729 + + if self._crumb is not None: + utils.get_yf_logger().debug('reusing crumb') + return self._crumb + + if not self._get_cookie_csrf(proxy, timeout): + # This cookie stored in session + return None + + get_args = { + 'url': 'https://query2.finance.yahoo.com/v1/test/getcrumb', + 'headers': self.user_agent_headers, + 'proxies': proxy, + 'timeout': timeout} + if self._session_is_caching: + get_args['expire_after'] = self._expire_after + r = self._session.get(**get_args) + else: + r = self._session.get(**get_args) + self._crumb = r.text + + if self._crumb is None or '' in self._crumb or self._crumb == '': + return None + + utils.get_yf_logger().debug(f"crumb = '{self._crumb}'") + return self._crumb + + @utils.log_indent_decorator + def _get_cookie_and_crumb(self, proxy=None, timeout=30): + cookie, crumb, strategy = None, None, None + + utils.get_yf_logger().debug(f"cookie_mode = '{self._cookie_strategy}'") + + with self._cookie_lock: + if self._cookie_strategy == 'csrf': + crumb = self._get_crumb_csrf() + if crumb is None: + # Fail + self._set_cookie_strategy('basic', have_lock=True) + cookie, crumb = self._get_cookie_and_crumb_basic(proxy, timeout) + else: + # Fallback strategy + cookie, crumb = self._get_cookie_and_crumb_basic(proxy, timeout) + if cookie is None or crumb is None: + # Fail + self._set_cookie_strategy('csrf', have_lock=True) + crumb = self._get_crumb_csrf() + strategy = self._cookie_strategy + return cookie, crumb, strategy + + @utils.log_indent_decorator + def get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30): + # Important: treat input arguments as immutable. + + if len(url) > 200: + utils.get_yf_logger().debug(f'url={url[:200]}...') + else: + utils.get_yf_logger().debug(f'url={url}') + utils.get_yf_logger().debug(f'params={params}') + proxy = self._get_proxy(proxy) + + if params is None: + params = {} + if 'crumb' in params: + raise Exception("Don't manually add 'crumb' to params dict, let data.py handle it") + + cookie, crumb, strategy = self._get_cookie_and_crumb() + if crumb is not None: + crumbs = {'crumb': crumb} + else: + crumbs = {} + if strategy == 'basic' and cookie is not None: + # Basic cookie strategy adds cookie to GET parameters + cookies = {cookie.name: cookie.value} + else: + cookies = None + + request_args = { + 'url': url, + 'params': {**params, **crumbs}, + 'cookies': cookies, + 'proxies': proxy, + 'timeout': timeout, + 'headers': user_agent_headers or self.user_agent_headers + } + response = self._session.get(**request_args) + if response.status_code >= 400: + # Retry with other cookie strategy + if strategy == 'basic': + self._set_cookie_strategy('csrf') + else: + self._set_cookie_strategy('basic') + cookie, crumb, strategy = self._get_cookie_and_crumb(proxy, timeout) + request_args['params']['crumb'] = crumb + if strategy == 'basic': + request_args['cookies'] = {cookie.name: cookie.value} + response = self._session.get(**request_args) + return response @lru_cache_freezeargs @@ -71,6 +391,7 @@ def _get_proxy(self, proxy): return proxy def get_raw_json(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30): + utils.get_yf_logger().debug(f'get_raw_json(): {url}') response = self.get(url, user_agent_headers=user_agent_headers, params=params, proxy=proxy, timeout=timeout) response.raise_for_status() return response.json() diff --git a/yfinance/multi.py b/yfinance/multi.py index d51791f81..ece3cc3ea 100644 --- a/yfinance/multi.py +++ b/yfinance/multi.py @@ -29,6 +29,7 @@ import pandas as _pd from . import Ticker, utils +from .data import YfData from . import shared @@ -143,6 +144,9 @@ def download(tickers, start=None, end=None, actions=False, threads=True, ignore_ shared._ERRORS = {} shared._TRACEBACKS = {} + # Ensure data initialised with session. + YfData(session=session) + # download using threads if threads: if threads is True: @@ -154,7 +158,7 @@ def download(tickers, start=None, end=None, actions=False, threads=True, ignore_ actions=actions, auto_adjust=auto_adjust, back_adjust=back_adjust, repair=repair, keepna=keepna, progress=(progress and i > 0), proxy=proxy, - rounding=rounding, timeout=timeout, session=session) + rounding=rounding, timeout=timeout) while len(shared._DFS) < len(tickers): _time.sleep(0.01) # download synchronously @@ -165,7 +169,7 @@ def download(tickers, start=None, end=None, actions=False, threads=True, ignore_ actions=actions, auto_adjust=auto_adjust, back_adjust=back_adjust, repair=repair, keepna=keepna, proxy=proxy, - rounding=rounding, timeout=timeout, session=session) + rounding=rounding, timeout=timeout) if progress: shared._PROGRESS_BAR.animate() @@ -257,10 +261,10 @@ def _download_one_threaded(ticker, start=None, end=None, auto_adjust=False, back_adjust=False, repair=False, actions=False, progress=True, period="max", interval="1d", prepost=False, proxy=None, - keepna=False, rounding=False, timeout=10, session=None): + keepna=False, rounding=False, timeout=10): data = _download_one(ticker, start, end, auto_adjust, back_adjust, repair, actions, period, interval, prepost, proxy, rounding, - keepna, timeout, session) + keepna, timeout) if progress: shared._PROGRESS_BAR.animate() @@ -269,10 +273,10 @@ def _download_one(ticker, start=None, end=None, auto_adjust=False, back_adjust=False, repair=False, actions=False, period="max", interval="1d", prepost=False, proxy=None, rounding=False, - keepna=False, timeout=10, session=None): + keepna=False, timeout=10): data = None try: - data = Ticker(ticker, session=session).history( + data = Ticker(ticker).history( period=period, interval=interval, start=start, end=end, prepost=prepost, actions=actions, auto_adjust=auto_adjust, diff --git a/yfinance/scrapers/analysis.py b/yfinance/scrapers/analysis.py index 0a86d3430..27c27c9a4 100644 --- a/yfinance/scrapers/analysis.py +++ b/yfinance/scrapers/analysis.py @@ -1,14 +1,15 @@ import pandas as pd from yfinance import utils -from yfinance.data import TickerData +from yfinance.data import YfData from yfinance.exceptions import YFNotImplementedError class Analysis: - def __init__(self, data: TickerData, proxy=None): + def __init__(self, data: YfData, symbol: str, proxy=None): self._data = data + self._symbol = symbol self.proxy = proxy self._earnings_trend = None diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index 7745f1a2d..27623e1ec 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -4,14 +4,15 @@ import pandas as pd from yfinance import utils, const -from yfinance.data import TickerData +from yfinance.data import YfData from yfinance.exceptions import YFinanceException, YFNotImplementedError class Fundamentals: - def __init__(self, data: TickerData, proxy=None): + def __init__(self, data: YfData, symbol: str, proxy=None): self._data = data + self._symbol = symbol self.proxy = proxy self._earnings = None @@ -21,7 +22,7 @@ def __init__(self, data: TickerData, proxy=None): self._financials_data = None self._fin_data_quote = None self._basics_already_scraped = False - self._financials = Financials(data) + self._financials = Financials(data, symbol) @property def financials(self) -> "Financials": @@ -41,8 +42,9 @@ def shares(self) -> pd.DataFrame: class Financials: - def __init__(self, data: TickerData): + def __init__(self, data: YfData, symbol: str): self._data = data + self._symbol = symbol self._income_time_series = {} self._balance_sheet_time_series = {} self._cash_flow_time_series = {} @@ -85,7 +87,7 @@ def _fetch_time_series(self, name, timescale, proxy=None): if statement is not None: return statement except YFinanceException as e: - utils.get_yf_logger().error(f"{self._data.ticker}: Failed to create {name} financials table for reason: {e}") + utils.get_yf_logger().error(f"{self._symbol}: Failed to create {name} financials table for reason: {e}") return pd.DataFrame() def _create_financials_table(self, name, timescale, proxy): @@ -105,7 +107,7 @@ def get_financials_time_series(self, timescale, keys: list, proxy=None) -> pd.Da timescale = timescale_translation[timescale] # Step 2: construct url: - ts_url_base = f"https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{self._data.ticker}?symbol={self._data.ticker}" + ts_url_base = f"https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{self._symbol}?symbol={self._symbol}" url = ts_url_base + "&type=" + ",".join([timescale + k for k in keys]) # Yahoo returns maximum 4 years or 5 quarters, regardless of start_dt: start_dt = datetime.datetime(2016, 12, 31) diff --git a/yfinance/scrapers/holders.py b/yfinance/scrapers/holders.py index 684278774..f442ea1c9 100644 --- a/yfinance/scrapers/holders.py +++ b/yfinance/scrapers/holders.py @@ -1,13 +1,14 @@ import pandas as pd -from yfinance.data import TickerData +from yfinance.data import YfData class Holders: _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' - def __init__(self, data: TickerData, proxy=None): + def __init__(self, data: YfData, symbol: str, proxy=None): self._data = data + self._symbol = symbol self.proxy = proxy self._major = None @@ -33,7 +34,7 @@ def mutualfund(self) -> pd.DataFrame: return self._mutualfund def _scrape(self, proxy): - ticker_url = f"{self._SCRAPE_URL_}/{self._data.ticker}" + ticker_url = f"{self._SCRAPE_URL_}/{self._symbol}" try: resp = self._data.cache_get(ticker_url + '/holders', proxy=proxy) holders = pd.read_html(resp.text) diff --git a/yfinance/scrapers/quote.py b/yfinance/scrapers/quote.py index 53d3cbc30..98e6bc50a 100644 --- a/yfinance/scrapers/quote.py +++ b/yfinance/scrapers/quote.py @@ -8,7 +8,7 @@ import pandas as pd from yfinance import utils -from yfinance.data import TickerData +from yfinance.data import YfData from yfinance.exceptions import YFNotImplementedError info_retired_keys_price = {"currentPrice", "dayHigh", "dayLow", "open", "previousClose", "volume", "volume24Hr"} @@ -21,7 +21,7 @@ info_retired_keys = info_retired_keys_price | info_retired_keys_exchange | info_retired_keys_marketCap | info_retired_keys_symbol -_BASIC_URL_ = "https://query2.finance.yahoo.com/v6/finance/quoteSummary" +_BASIC_URL_ = "https://query2.finance.yahoo.com/v10/finance/quoteSummary" class InfoDictWrapper(MutableMapping): @@ -551,8 +551,9 @@ def market_cap(self): class Quote: - def __init__(self, data: TickerData, proxy=None): + def __init__(self, data: YfData, symbol: str, proxy=None): self._data = data + self._symbol = symbol self.proxy = proxy self._info = None @@ -596,13 +597,14 @@ def _fetch(self, proxy): return self._already_fetched = True modules = ['financialData', 'quoteType', 'defaultKeyStatistics', 'assetProfile', 'summaryDetail'] + modules = ','.join(modules) params_dict = {"modules": modules, "ssl": "true"} result = self._data.get_raw_json( - _BASIC_URL_ + f"/{self._data.ticker}", params=params_dict, proxy=proxy + _BASIC_URL_ + f"/{self._symbol}", params=params_dict, proxy=proxy ) - result["quoteSummary"]["result"][0]["symbol"] = self._data.ticker + result["quoteSummary"]["result"][0]["symbol"] = self._symbol query1_info = next( - (info for info in result.get("quoteSummary", {}).get("result", []) if info["symbol"] == self._data.ticker), + (info for info in result.get("quoteSummary", {}).get("result", []) if info["symbol"] == self._symbol), None, ) # Most keys that appear in multiple dicts have same value. Except 'maxAge' because @@ -670,7 +672,7 @@ def _fetch_complementary(self, proxy): # pass # # For just one/few variable is faster to query directly: - url = f"https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{self._data.ticker}?symbol={self._data.ticker}" + url = f"https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{self._symbol}?symbol={self._symbol}" for k in keys: url += "&type=" + k # Request 6 months of data diff --git a/yfinance/utils.py b/yfinance/utils.py index 7e1e7bf1d..2f28fdb88 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -21,22 +21,16 @@ from __future__ import print_function -import atexit as _atexit - import datetime as _datetime import logging -import os as _os import re as _re -import peewee as _peewee import sys as _sys import threading from functools import lru_cache from inspect import getmembers -from threading import Lock from types import FunctionType -from typing import Dict, Union, List, Optional +from typing import Dict, List, Optional -import appdirs as _ad import numpy as _np import pandas as _pd import pytz as _tz @@ -701,7 +695,7 @@ def safe_merge_dfs(df_main, df_sub, interval): df_main['Dividends'] = 0.0 return df_main else: - empty_row_data = {c:[_np.nan] for c in const.price_colnames}|{'Volume':[0]} + empty_row_data = {**{c:[_np.nan] for c in const.price_colnames}, 'Volume':[0]} if interval == '1d': # For 1d, add all out-of-range event dates for i in _np.where(f_outOfRange)[0]: @@ -903,204 +897,3 @@ def __update_amount(self, new_amount): def __str__(self): return str(self.prog_bar) - -# --------------------------------- -# TimeZone cache related code -# --------------------------------- - - -_cache_init_lock = Lock() - - -class _TzCacheException(Exception): - pass - - -class _TzCacheDummy: - """Dummy cache to use if tz cache is disabled""" - - def lookup(self, tkr): - return None - - def store(self, tkr, tz): - pass - - @property - def tz_db(self): - return None - - -class _TzCacheManager: - _tz_cache = None - - @classmethod - def get_tz_cache(cls): - if cls._tz_cache is None: - with _cache_init_lock: - cls._initialise() - return cls._tz_cache - - @classmethod - def _initialise(cls, cache_dir=None): - cls._tz_cache = _TzCache() - - -class _DBManager: - _db = None - _cache_dir = _os.path.join(_ad.user_cache_dir(), "py-yfinance") - - @classmethod - def get_database(cls): - if cls._db is None: - cls._initialise() - return cls._db - - @classmethod - def close_db(cls): - if cls._db is not None: - try: - cls._db.close() - except Exception as e: - # Must discard exceptions because Python trying to quit. - pass - - - @classmethod - def _initialise(cls, cache_dir=None): - if cache_dir is not None: - cls._cache_dir = cache_dir - - if not _os.path.isdir(cls._cache_dir): - try: - _os.makedirs(cls._cache_dir) - except OSError as err: - raise _TzCacheException(f"Error creating TzCache folder: '{cls._cache_dir}' reason: {err}") - elif not (_os.access(cls._cache_dir, _os.R_OK) and _os.access(cls._cache_dir, _os.W_OK)): - raise _TzCacheException(f"Cannot read and write in TzCache folder: '{cls._cache_dir}'") - - cls._db = _peewee.SqliteDatabase( - _os.path.join(cls._cache_dir, 'tkr-tz.db'), - pragmas={'journal_mode': 'wal', 'cache_size': -64} - ) - - old_cache_file_path = _os.path.join(cls._cache_dir, "tkr-tz.csv") - if _os.path.isfile(old_cache_file_path): - _os.remove(old_cache_file_path) - - @classmethod - def set_location(cls, new_cache_dir): - if cls._db is not None: - cls._db.close() - cls._db = None - cls._cache_dir = new_cache_dir - - @classmethod - def get_location(cls): - return cls._cache_dir - -# close DB when Python exists -_atexit.register(_DBManager.close_db) - - -db_proxy = _peewee.Proxy() -class _KV(_peewee.Model): - key = _peewee.CharField(primary_key=True) - value = _peewee.CharField(null=True) - - class Meta: - database = db_proxy - without_rowid = True - - -class _TzCache: - def __init__(self): - self.initialised = -1 - self.db = None - self.dummy = False - - def get_db(self): - if self.db is not None: - return self.db - - try: - self.db = _DBManager.get_database() - except _TzCacheException as err: - get_yf_logger().info(f"Failed to create TzCache, reason: {err}. " - "TzCache will not be used. " - "Tip: You can direct cache to use a different location with 'set_tz_cache_location(mylocation)'") - self.dummy = True - return None - return self.db - - def initialise(self): - if self.initialised != -1: - return - - db = self.get_db() - if db is None: - self.initialised = 0 # failure - return - - db.connect() - db_proxy.initialize(db) - db.create_tables([_KV]) - self.initialised = 1 # success - - def lookup(self, key): - if self.dummy: - return None - - if self.initialised == -1: - self.initialise() - - if self.initialised == 0: # failure - return None - - try: - return _KV.get(_KV.key == key).value - except _KV.DoesNotExist: - return None - - def store(self, key, value): - if self.dummy: - return - - if self.initialised == -1: - self.initialise() - - if self.initialised == 0: # failure - return - - db = self.get_db() - if db is None: - return - try: - if value is None: - q = _KV.delete().where(_KV.key == key) - q.execute() - return - with db.atomic(): - _KV.insert(key=key, value=value).execute() - except _peewee.IntegrityError: - # Integrity error means the key already exists. Try updating the key. - old_value = self.lookup(key) - if old_value != value: - get_yf_logger().debug(f"Value for key {key} changed from {old_value} to {value}.") - with db.atomic(): - q = _KV.update(value=value).where(_KV.key == key) - q.execute() - - -def get_tz_cache(): - return _TzCacheManager.get_tz_cache() - - -def set_tz_cache_location(cache_dir: str): - """ - Sets the path to create the "py-yfinance" cache folder in. - Useful if the default folder returned by "appdir.user_cache_dir()" is not writable. - Must be called before cache is used (that is, before fetching tickers). - :param cache_dir: Path to use for caches - :return: None - """ - _DBManager.set_location(cache_dir)