remove sqlalchemy dependency #18

Open · wants to merge 4 commits into base: v1.0

1 change: 1 addition & 0 deletions .gitignore
@@ -25,3 +25,4 @@ paper
 # Others
 DEPLOYMENT.md
 playground.*
+*.pdf
3 changes: 2 additions & 1 deletion MANIFEST.in
@@ -1 +1,2 @@
-recursive-include scidownl/config *.ini
+recursive-include scidownl/config *.ini
+include scidownl/db/*.txt
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-1.0.1
+1.1.2
1 change: 0 additions & 1 deletion requirements.txt
@@ -4,7 +4,6 @@ loguru>=0.6.0
 requests>=2.27.1
 pysocks
 setuptools>=58.0.4
-SQLAlchemy>=1.4.31
 tablib>=3.2.0
 tablib[cli]
 wget>=3.2
2 changes: 1 addition & 1 deletion scidownl/config/global.ini
@@ -16,7 +16,7 @@ num_workers = 500
 check_timeout = 10

 [global_db]
-db_name = scidownl.db
+db_name = db/scidownl.txt

 [log]
 console_log_format = <level>[{level}]</level> | <green>{time:YYYY/MM/DD HH:mm:ss}</green> | <level>{message}</level>
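The relative db_name is resolved against the scidownl package directory by get_engine in scidownl/db/entities.py (changed later in this diff). A minimal sketch of that resolution, using a path literal as an illustrative stand-in for __file__:

    import os

    # db_name as set in [global_db] above.
    db_name = "db/scidownl.txt"

    # get_engine() climbs two directory levels from scidownl/db/entities.py
    # to the package root, then joins the configured name onto it.
    entities_file = "/site-packages/scidownl/db/entities.py"  # stand-in for __file__
    par_dirpath = os.path.abspath(os.path.dirname(os.path.dirname(entities_file)))
    db_path = os.path.join(par_dirpath, db_name)
    print(db_path)  # -> /site-packages/scidownl/db/scidownl.txt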
4 changes: 1 addition & 3 deletions scidownl/core/base.py
@@ -4,8 +4,6 @@
 from os import PathLike
 from typing import Union, Iterable, List, Optional

-from ..db.entities import ScihubUrl
-

 class BaseTask(ABC):
     """Abstract task with a `run` method."""
@@ -119,7 +117,7 @@ class ScihubUrlChooser(ABC):
     __chooser_type__ = "base"

     @abstractmethod
-    def next(self) -> Optional[ScihubUrl]:
+    def next(self) -> Optional[str]:
         """Returns the next scihub url or None if reach the end."""
         raise NotImplementedError("Implement next method before calling it.")
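With the ScihubUrl entity gone, next() yields plain strings and callers no longer unwrap a .url attribute. A hedged sketch of driving a chooser by hand (the concrete class is an arbitrary choice for illustration):

    from scidownl.core.chooser import SimpleScihubUrlChooser

    chooser = SimpleScihubUrlChooser()
    try:
        while True:
            url = chooser.next()  # now a plain str such as "https://sci-hub.se"
            print(url)
    except StopIteration:
        pass  # all known Sci-Hub mirrors have been offered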
40 changes: 3 additions & 37 deletions scidownl/core/chooser.py
@@ -5,7 +5,6 @@
 from typing import Optional

 from .base import ScihubUrlChooser
-from ..db.entities import ScihubUrl
 from ..db.service import ScihubUrlService


@@ -21,7 +20,7 @@ def __init__(self):
         self.cursor = 0
         self._lock = RLock()

-    def next(self) -> Optional[ScihubUrl]:
+    def next(self) -> Optional[str]:
         with self._lock:
             if self.cursor < 0 or self.cursor >= len(self.scihub_urls):
                 raise StopIteration
@@ -46,7 +45,7 @@ def __init__(self):
         self.cursor = 0
         self._lock = RLock()

-    def next(self) -> Optional[ScihubUrl]:
+    def next(self) -> Optional[str]:
         with self._lock:
             if len(self.temp_zone) == 0:
                 raise StopIteration
@@ -61,41 +60,8 @@ def __len__(self):
         return len(self.temp_zone)


-class AvailabilityFirstScihubUrlChooser(ScihubUrlChooser):
-    """Availability-first chooser of scihub urls.
-    A scihub url is considered as more available if it has a less failed rate
-        failed_rate = (failed_times) / (success_times + failed_times + 0.01)
-    The tail 0.01 is used to avoid divide by zero error if (success_times + failed_times) == 0.
-    """
-    __chooser_type__ = "availability_first"
-
-    def __init__(self):
-        self.service = ScihubUrlService()
-        self.scihub_urls = self.service.get_all_urls()
-
-        # Sort by availability.
-        self.temp_zone = sorted(
-            self.scihub_urls,
-            key=lambda url: url.failed_times / (url.success_times + url.failed_times + 0.01)
-        )
-
-        self.cursor = 0
-        self._lock = RLock()
-
-    def next(self) -> Optional[ScihubUrl]:
-        with self._lock:
-            if self.cursor < 0 or self.cursor >= len(self.temp_zone):
-                raise StopIteration
-            selected_url = self.temp_zone[self.cursor]
-            self.cursor += 1
-            return selected_url
-
-    def __len__(self):
-        return len(self.temp_zone)
-
-
 scihub_url_choosers = {
     "simple": SimpleScihubUrlChooser,
     "random": RandomScihubUrlChooser,
-    "availability_first": AvailabilityFirstScihubUrlChooser
+    # "availability_first": AvailabilityFirstScihubUrlChooser
 }
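The availability-first strategy is commented out rather than rewritten, presumably because the success/failed counters it sorted on lived only in the dropped ScihubUrl table. A short sketch of how the trimmed registry resolves with a fallback (the lookup key is illustrative):

    from scidownl.core.chooser import scihub_url_choosers, SimpleScihubUrlChooser

    # Unknown types, including the now-disabled "availability_first",
    # fall back to the simple sequential chooser.
    chooser_cls = scihub_url_choosers.get("availability_first", SimpleScihubUrlChooser)
    chooser = chooser_cls()  # -> SimpleScihubUrlChooser instance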
6 changes: 3 additions & 3 deletions scidownl/core/extractor.py
@@ -7,7 +7,7 @@
 from .base import BaseExtractor, BaseInformation, BaseTask, BaseTaskStep
 from .content import HtmlContent
 from .information import PdfUrlTitleInformation, UrlInformation
-from .chooser import scihub_url_choosers, AvailabilityFirstScihubUrlChooser
+from .chooser import scihub_url_choosers, SimpleScihubUrlChooser
 from ..exception import PdfTagNotFoundException, PdfUrlNotFoundException, ExtractException
 from ..db.service import ScihubUrlService
 from ..log import get_logger
@@ -19,9 +19,9 @@

 def get_default_referer():
     scihub_url_chooser_type = configs['scihub.task']['scihub_url_chooser_type']
-    chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, AvailabilityFirstScihubUrlChooser)
+    chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, SimpleScihubUrlChooser)
     chooser = chooser_cls()
-    scihub_url = "https://sci-hub.se" if len(chooser) == 0 else chooser.next().url
+    scihub_url = "https://sci-hub.se" if len(chooser) == 0 else chooser.next()
     return scihub_url
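A hedged usage sketch of the updated helper; the request and header construction are assumptions for illustration, not part of this diff:

    import requests

    from scidownl.core.extractor import get_default_referer

    referer = get_default_referer()  # a plain str now, no .url unwrapping
    resp = requests.get(referer, headers={"Referer": referer}, timeout=10)
    print(resp.status_code)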
8 changes: 4 additions & 4 deletions scidownl/core/task.py
@@ -7,7 +7,7 @@
 from .crawler import ScihubCrawler
 from .extractor import HtmlPdfExtractor
 from .downloader import UrlDownloader
-from .chooser import AvailabilityFirstScihubUrlChooser, scihub_url_choosers
+from .chooser import SimpleScihubUrlChooser, scihub_url_choosers
 from .updater import CrawlingScihubDomainUpdater
 from ..log import get_logger
 from ..config import get_config
@@ -17,7 +17,7 @@
 configs = get_config()

 scihub_url_chooser_type = configs['scihub.task']['scihub_url_chooser_type']
-default_chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, AvailabilityFirstScihubUrlChooser)
+default_chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, SimpleScihubUrlChooser)


 class ScihubTask(BaseTask):
@@ -54,8 +54,8 @@ def run(self):

         for i, scihub_url in enumerate(self.scihub_url_chooser):
             try:
-                logger.info(f"Choose scihub url [{i}]: {scihub_url.url}")
-                return self._run(scihub_url.url)
+                logger.info(f"Choose scihub url [{i}]: {scihub_url}")
+                return self._run(scihub_url)
             except Exception as e:
                 logger.warning(f"Error occurs, task status: {self.context['status']}, error: {self.context['error']}")
                 continue
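The loop iterates the chooser directly with enumerate, which assumes choosers support the iterator protocol on top of next(); a minimal sketch of that bridge (the __iter__/__next__ methods are an assumption about the base class, which this diff does not show):

    class ChooserIterationSketch:
        """Illustrates how `for i, url in enumerate(chooser)` can work."""

        def __init__(self, urls):
            self._urls = list(urls)
            self._cursor = 0

        def next(self):
            if self._cursor >= len(self._urls):
                raise StopIteration
            url = self._urls[self._cursor]
            self._cursor += 1
            return url

        # Assumed bridge: delegate the iterator protocol to next().
        def __iter__(self):
            return self

        def __next__(self):
            return self.next()


    for i, scihub_url in enumerate(ChooserIterationSketch(["https://sci-hub.se"])):
        print(f"Choose scihub url [{i}]: {scihub_url}")  # a str, as in the new code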
8 changes: 4 additions & 4 deletions scidownl/core/updater.py
@@ -11,7 +11,7 @@
 from .base import DomainUpdater
 from ..log import get_logger
 from ..config import get_config
-from ..db.entities import ScihubUrl
+# from ..db.entities import ScihubUrl
 from ..db.service import ScihubUrlService

 logger = get_logger()
@@ -29,7 +29,7 @@ def __init__(self, domain_source_url: str = None):
         self._domain_url_pattern = configs['scihub.domain.updater.crawl']['scihub_url_pattern']
         self._exclude_url_pattern = configs['scihub.domain.updater.crawl']['exclude_url_pattern']

-    def update_domains(self) -> Union[List, Iterable[ScihubUrl]]:
+    def update_domains(self) -> Union[List, Iterable[str]]:
         html = requests.get(self.domain_source_url).text
         domain_urls = re.findall(self._domain_url_pattern, html)

@@ -40,7 +40,7 @@
         logger.info(f"Found {len(available_domain_urls)} valid SciHub domains in total: {available_domain_urls}")

         # Save to db.
-        urls_to_save = [ScihubUrl(url=url) for url in available_domain_urls]
+        urls_to_save = [url for url in available_domain_urls]
         self.service.add_urls(urls_to_save)
         logger.info(f"Saved {len(urls_to_save)} SciHub domains to local db.")
         return available_domain_urls
@@ -92,7 +92,7 @@ def update_domains(self) -> Union[List, Iterable[str]]:

         logger.info(f"Found {len(valid_urls)} valid SciHub domains in total: {valid_urls}")
         # Save to db.
-        urls_to_save = [ScihubUrl(url=url) for url in valid_urls]
+        urls_to_save = [url for url in valid_urls]
         self.service.add_urls(urls_to_save)
         logger.info(f"Saved {len(urls_to_save)} SciHub domains to local db.")
         return valid_urls
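With plain strings the comprehension is an identity pass, so list(available_domain_urls) would do the same job. Note also that the old ScihubUrl table deduplicated on its url primary key, a guarantee a flat text file does not give; a de-duplicating save could look like the sketch below (the helper is a suggestion, not part of the diff):

    def dedupe_keep_order(urls):
        """Drop repeated URLs while keeping first-seen order."""
        seen = set()
        unique = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique.append(url)
        return unique


    urls_to_save = dedupe_keep_order(["https://sci-hub.se", "https://sci-hub.se"])
    print(urls_to_save)  # ['https://sci-hub.se']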
27 changes: 1 addition & 26 deletions scidownl/db/entities.py
@@ -2,12 +2,9 @@
 """Entities of tables"""
 import os

-from sqlalchemy import create_engine, Column, Integer, String
-from sqlalchemy.ext.declarative import declarative_base

 from ..config import get_config

-Base = declarative_base()
 configs = get_config()


@@ -21,27 +18,5 @@ def get_engine(echo: bool = False, test: bool = False):
     par_dirpath = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
     dbname = 'test-scidownl.db' if test else configs['global_db']['db_name']
     db_path = os.path.join(par_dirpath, dbname)
-    engine = create_engine(f'sqlite:///{db_path}?check_same_thread=False', echo=echo)
+    engine = open(db_path, 'r')
     return engine
-
-
-def create_tables(test: bool = False):
-    """Create all tables that are not exist.
-
-    :param test: if True, using test db instead.
-    """
-    engine = get_engine(test=test)
-    Base.metadata.create_all(engine, checkfirst=True)
-
-
-class ScihubUrl(Base):
-    __tablename__ = "scihub_url"
-
-    url = Column(String(50), primary_key=True)
-    success_times = Column(Integer, default=0)
-    failed_times = Column(Integer, default=0)
-
-    def __repr__(self):
-        return f"<ScihubUrl(url={self.url}, " \
-               f"success_times={self.success_times}, " \
-               f"failed_times={self.failed_times})>"
8 changes: 8 additions & 0 deletions scidownl/db/scidownl.txt
@@ -0,0 +1,8 @@
+https://sci-hub.ru
+http://sci-hub.kr
+https://sci-hub.st
+https://sci-hub.tw
+http://sci-hub.st
+http://sci-hub.tw
+https://sci-hub.se
+http://sci-hub.se
55 changes: 10 additions & 45 deletions scidownl/db/service.py
@@ -2,63 +2,28 @@
 """Services to manipulate entities"""
 from typing import List, Union

-from sqlalchemy.orm import sessionmaker
+# from sqlalchemy.orm import sessionmaker

 from ..log import get_logger
-from .entities import get_engine, ScihubUrl
-from ..db.entities import create_tables
+from .entities import get_engine
+# from ..db.entities import create_tables

 logger = get_logger()


 class ScihubUrlService:
     def __init__(self, test: bool = False):
-        create_tables()
         self.engine = get_engine(test=test)
-        self.session_class = sessionmaker(bind=self.engine)

-    def add_urls(self, urls: Union[List[ScihubUrl]]) -> None:
-        if urls is None or len(urls) == 0:
-            return
-        session = self.session_class()
-        for url in urls:
-            try:
-                session.add(url)
-                session.commit()
-            except Exception as e:
-                session.rollback()
-        session.close()
-
     def increment_success_times(self, url: str) -> None:
-        if url is None or not isinstance(url, str):
-            return
-        session = self.session_class()
-        try:
-            session.query(ScihubUrl).filter_by(url=url).update({
-                ScihubUrl.success_times: ScihubUrl.success_times + 1
-            })
-            session.commit()
-        except Exception as e:
-            logger.warning(f"Cannot increment success times: {url}, reason: {e}")
-            session.rollback()
-        session.close()
+        print(f"Success to fetch url: {url}")
+

     def increment_failed_times(self, url: str) -> None:
-        if url is None or not isinstance(url, str):
-            return
-        session = self.session_class()
-        try:
-            session.query(ScihubUrl).filter_by(url=url).update({
-                ScihubUrl.failed_times: ScihubUrl.failed_times + 1
-            })
-            session.commit()
-        except Exception as e:
-            logger.warning(f"Cannot increment failed times: {url}, reason: {e}")
-            session.rollback()
-        session.close()
+        print(f"Failed to fetch url: {url}")
+

-    def get_all_urls(self) -> List[ScihubUrl]:
-        session = self.session_class()
-        all_urls = session.query(ScihubUrl).all()
-        session.close()
+    def get_all_urls(self) -> List[str]:
+        with self.engine as conn:
+            all_urls = conn.read().splitlines()
         return all_urls
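Two things stand out in the slimmed-down service, at least as far as this diff shows: updater.py above still calls service.add_urls, which no longer exists here, and a file object used as a context manager is closed on exit, so a second get_all_urls call on the same instance would raise ValueError on the closed handle. A minimal file-backed sketch that sidesteps the latter by reopening per call (the default path is illustrative):

    from typing import List


    class FileBackedUrlService:
        """Sketch of a file-backed stand-in for the old SQLAlchemy service."""

        def __init__(self, db_path: str = "scidownl/db/scidownl.txt"):
            self.db_path = db_path

        def add_urls(self, urls: List[str]) -> None:
            # Append-only save, mirroring what updater.py expects to call.
            with open(self.db_path, "a") as f:
                for url in urls:
                    f.write(url + "\n")

        def get_all_urls(self) -> List[str]:
            # Reopen on each call; safe for repeated reads.
            with open(self.db_path, "r") as f:
                return [line.strip() for line in f if line.strip()]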