remove sqlalchemy dependency #18

Open · wants to merge 4 commits into base: v1.0

1 change: 1 addition & 0 deletions .gitignore
@@ -25,3 +25,4 @@ paper
 # Others
 DEPLOYMENT.md
 playground.*
+*.pdf
3 changes: 2 additions & 1 deletion MANIFEST.in
@@ -1 +1,2 @@
-recursive-include scidownl/config *.ini
+recursive-include scidownl/config *.ini
+include scidownl/db/*.txt
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-1.0.1
+1.1.2
1 change: 0 additions & 1 deletion requirements.txt
@@ -4,7 +4,6 @@ loguru>=0.6.0
 requests>=2.27.1
 pysocks
 setuptools>=58.0.4
-SQLAlchemy>=1.4.31
 tablib>=3.2.0
 tablib[cli]
 wget>=3.2
2 changes: 1 addition & 1 deletion scidownl/config/global.ini
@@ -16,7 +16,7 @@ num_workers = 500
 check_timeout = 10

 [global_db]
-db_name = scidownl.db
+db_name = db/scidownl.txt

 [log]
 console_log_format = <level>[{level}]</level> | <green>{time:YYYY/MM/DD HH:mm:ss}</green> | <level>{message}</level>
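The relative db_name is resolved against the scidownl package directory by get_engine in scidownl/db/entities.py (changed later in this diff). A minimal sketch of that resolution, using a path literal as an illustrative stand-in for __file__:

    import os

    # db_name as set in [global_db] above.
    db_name = "db/scidownl.txt"

    # get_engine() climbs two directory levels from scidownl/db/entities.py
    # to the package root, then joins the configured name onto it.
    entities_file = "/site-packages/scidownl/db/entities.py"  # stand-in for __file__
    par_dirpath = os.path.abspath(os.path.dirname(os.path.dirname(entities_file)))
    db_path = os.path.join(par_dirpath, db_name)
    print(db_path)  # -> /site-packages/scidownl/db/scidownl.txt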
4 changes: 1 addition & 3 deletions scidownl/core/base.py
@@ -4,8 +4,6 @@
 from os import PathLike
 from typing import Union, Iterable, List, Optional

-from ..db.entities import ScihubUrl
-

 class BaseTask(ABC):
     """Abstract task with a `run` method."""
@@ -119,7 +117,7 @@ class ScihubUrlChooser(ABC):
     __chooser_type__ = "base"

     @abstractmethod
-    def next(self) -> Optional[ScihubUrl]:
+    def next(self) -> Optional[str]:
         """Returns the next scihub url or None if reach the end."""
         raise NotImplementedError("Implement next method before calling it.")
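With the ScihubUrl entity gone, next() yields plain strings and callers no longer unwrap a .url attribute. A hedged sketch of driving a chooser by hand (the concrete class is an arbitrary choice for illustration):

    from scidownl.core.chooser import SimpleScihubUrlChooser

    chooser = SimpleScihubUrlChooser()
    try:
        while True:
            url = chooser.next()  # now a plain str such as "https://sci-hub.se"
            print(url)
    except StopIteration:
        pass  # all known Sci-Hub mirrors have been offered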
40 changes: 3 additions & 37 deletions scidownl/core/chooser.py
@@ -5,7 +5,6 @@
 from typing import Optional

 from .base import ScihubUrlChooser
-from ..db.entities import ScihubUrl
 from ..db.service import ScihubUrlService


@@ -21,7 +20,7 @@ def __init__(self):
         self.cursor = 0
         self._lock = RLock()

-    def next(self) -> Optional[ScihubUrl]:
+    def next(self) -> Optional[str]:
         with self._lock:
             if self.cursor < 0 or self.cursor >= len(self.scihub_urls):
                 raise StopIteration
@@ -46,7 +45,7 @@ def __init__(self):
         self.cursor = 0
         self._lock = RLock()

-    def next(self) -> Optional[ScihubUrl]:
+    def next(self) -> Optional[str]:
         with self._lock:
             if len(self.temp_zone) == 0:
                 raise StopIteration
@@ -61,41 +60,8 @@ def __len__(self):
         return len(self.temp_zone)


-class AvailabilityFirstScihubUrlChooser(ScihubUrlChooser):
-    """Availability-first chooser of scihub urls.
-    A scihub url is considered as more available if it has a less failed rate
-        failed_rate = (failed_times) / (success_times + failed_times + 0.01)
-    The tail 0.01 is used to avoid divide by zero error if (success_times + failed_times) == 0.
-    """
-    __chooser_type__ = "availability_first"
-
-    def __init__(self):
-        self.service = ScihubUrlService()
-        self.scihub_urls = self.service.get_all_urls()
-
-        # Sort by availability.
-        self.temp_zone = sorted(
-            self.scihub_urls,
-            key=lambda url: url.failed_times / (url.success_times + url.failed_times + 0.01)
-        )
-
-        self.cursor = 0
-        self._lock = RLock()
-
-    def next(self) -> Optional[ScihubUrl]:
-        with self._lock:
-            if self.cursor < 0 or self.cursor >= len(self.temp_zone):
-                raise StopIteration
-            selected_url = self.temp_zone[self.cursor]
-            self.cursor += 1
-            return selected_url
-
-    def __len__(self):
-        return len(self.temp_zone)
-
-
 scihub_url_choosers = {
     "simple": SimpleScihubUrlChooser,
     "random": RandomScihubUrlChooser,
-    "availability_first": AvailabilityFirstScihubUrlChooser
+    # "availability_first": AvailabilityFirstScihubUrlChooser
 }
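The availability-first strategy is commented out rather than rewritten, presumably because the success/failed counters it sorted on lived only in the dropped ScihubUrl table. A short sketch of how the trimmed registry resolves with a fallback (the lookup key is illustrative):

    from scidownl.core.chooser import scihub_url_choosers, SimpleScihubUrlChooser

    # Unknown types, including the now-disabled "availability_first",
    # fall back to the simple sequential chooser.
    chooser_cls = scihub_url_choosers.get("availability_first", SimpleScihubUrlChooser)
    chooser = chooser_cls()  # -> SimpleScihubUrlChooser instance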
6 changes: 3 additions & 3 deletions scidownl/core/extractor.py
@@ -7,7 +7,7 @@
 from .base import BaseExtractor, BaseInformation, BaseTask, BaseTaskStep
 from .content import HtmlContent
 from .information import PdfUrlTitleInformation, UrlInformation
-from .chooser import scihub_url_choosers, AvailabilityFirstScihubUrlChooser
+from .chooser import scihub_url_choosers, SimpleScihubUrlChooser
 from ..exception import PdfTagNotFoundException, PdfUrlNotFoundException, ExtractException
 from ..db.service import ScihubUrlService
 from ..log import get_logger
@@ -19,9 +19,9 @@

 def get_default_referer():
     scihub_url_chooser_type = configs['scihub.task']['scihub_url_chooser_type']
-    chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, AvailabilityFirstScihubUrlChooser)
+    chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, SimpleScihubUrlChooser)
     chooser = chooser_cls()
-    scihub_url = "https://sci-hub.se" if len(chooser) == 0 else chooser.next().url
+    scihub_url = "https://sci-hub.se" if len(chooser) == 0 else chooser.next()
     return scihub_url
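A hedged usage sketch of the updated helper; the request and header construction are assumptions for illustration, not part of this diff:

    import requests

    from scidownl.core.extractor import get_default_referer

    referer = get_default_referer()  # a plain str now, no .url unwrapping
    resp = requests.get(referer, headers={"Referer": referer}, timeout=10)
    print(resp.status_code)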
8 changes: 4 additions & 4 deletions scidownl/core/task.py
@@ -7,7 +7,7 @@
 from .crawler import ScihubCrawler
 from .extractor import HtmlPdfExtractor
 from .downloader import UrlDownloader
-from .chooser import AvailabilityFirstScihubUrlChooser, scihub_url_choosers
+from .chooser import SimpleScihubUrlChooser, scihub_url_choosers
 from .updater import CrawlingScihubDomainUpdater
 from ..log import get_logger
 from ..config import get_config
@@ -17,7 +17,7 @@
 configs = get_config()

 scihub_url_chooser_type = configs['scihub.task']['scihub_url_chooser_type']
-default_chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, AvailabilityFirstScihubUrlChooser)
+default_chooser_cls = scihub_url_choosers.get(scihub_url_chooser_type, SimpleScihubUrlChooser)


 class ScihubTask(BaseTask):
@@ -54,8 +54,8 @@ def run(self):

         for i, scihub_url in enumerate(self.scihub_url_chooser):
             try:
-                logger.info(f"Choose scihub url [{i}]: {scihub_url.url}")
-                return self._run(scihub_url.url)
+                logger.info(f"Choose scihub url [{i}]: {scihub_url}")
+                return self._run(scihub_url)
             except Exception as e:
                 logger.warning(f"Error occurs, task status: {self.context['status']}, error: {self.context['error']}")
                 continue
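The loop iterates the chooser directly with enumerate, which assumes choosers support the iterator protocol on top of next(); a minimal sketch of that bridge (the __iter__/__next__ methods are an assumption about the base class, which this diff does not show):

    class ChooserIterationSketch:
        """Illustrates how `for i, url in enumerate(chooser)` can work."""

        def __init__(self, urls):
            self._urls = list(urls)
            self._cursor = 0

        def next(self):
            if self._cursor >= len(self._urls):
                raise StopIteration
            url = self._urls[self._cursor]
            self._cursor += 1
            return url

        # Assumed bridge: delegate the iterator protocol to next().
        def __iter__(self):
            return self

        def __next__(self):
            return self.next()


    for i, scihub_url in enumerate(ChooserIterationSketch(["https://sci-hub.se"])):
        print(f"Choose scihub url [{i}]: {scihub_url}")  # a str, as in the new code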
8 changes: 4 additions & 4 deletions scidownl/core/updater.py
@@ -11,7 +11,7 @@
 from .base import DomainUpdater
 from ..log import get_logger
 from ..config import get_config
-from ..db.entities import ScihubUrl
+# from ..db.entities import ScihubUrl
 from ..db.service import ScihubUrlService

 logger = get_logger()
@@ -29,7 +29,7 @@ def __init__(self, domain_source_url: str = None):
         self._domain_url_pattern = configs['scihub.domain.updater.crawl']['scihub_url_pattern']
         self._exclude_url_pattern = configs['scihub.domain.updater.crawl']['exclude_url_pattern']

-    def update_domains(self) -> Union[List, Iterable[ScihubUrl]]:
+    def update_domains(self) -> Union[List, Iterable[str]]:
         html = requests.get(self.domain_source_url).text
         domain_urls = re.findall(self._domain_url_pattern, html)

@@ -40,7 +40,7 @@
         logger.info(f"Found {len(available_domain_urls)} valid SciHub domains in total: {available_domain_urls}")

         # Save to db.
-        urls_to_save = [ScihubUrl(url=url) for url in available_domain_urls]
+        urls_to_save = [url for url in available_domain_urls]
         self.service.add_urls(urls_to_save)
         logger.info(f"Saved {len(urls_to_save)} SciHub domains to local db.")
         return available_domain_urls
@@ -92,7 +92,7 @@ def update_domains(self) -> Union[List, Iterable[str]]:

         logger.info(f"Found {len(valid_urls)} valid SciHub domains in total: {valid_urls}")
         # Save to db.
-        urls_to_save = [ScihubUrl(url=url) for url in valid_urls]
+        urls_to_save = [url for url in valid_urls]
         self.service.add_urls(urls_to_save)
         logger.info(f"Saved {len(urls_to_save)} SciHub domains to local db.")
         return valid_urls
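With plain strings the comprehension is an identity pass, so list(available_domain_urls) would do the same job. Note also that the old ScihubUrl table deduplicated on its url primary key, a guarantee a flat text file does not give; a de-duplicating save could look like the sketch below (the helper is a suggestion, not part of the diff):

    def dedupe_keep_order(urls):
        """Drop repeated URLs while keeping first-seen order."""
        seen = set()
        unique = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique.append(url)
        return unique


    urls_to_save = dedupe_keep_order(["https://sci-hub.se", "https://sci-hub.se"])
    print(urls_to_save)  # ['https://sci-hub.se']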
27 changes: 1 addition & 26 deletions scidownl/db/entities.py
@@ -2,12 +2,9 @@
 """Entities of tables"""
 import os

-from sqlalchemy import create_engine, Column, Integer, String
-from sqlalchemy.ext.declarative import declarative_base

 from ..config import get_config

-Base = declarative_base()
 configs = get_config()


@@ -21,27 +18,5 @@ def get_engine(echo: bool = False, test: bool = False):
     par_dirpath = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
     dbname = 'test-scidownl.db' if test else configs['global_db']['db_name']
     db_path = os.path.join(par_dirpath, dbname)
-    engine = create_engine(f'sqlite:///{db_path}?check_same_thread=False', echo=echo)
+    engine = open(db_path, 'r')
     return engine
-
-
-def create_tables(test: bool = False):
-    """Create all tables that are not exist.
-
-    :param test: if True, using test db instead.
-    """
-    engine = get_engine(test=test)
-    Base.metadata.create_all(engine, checkfirst=True)
-
-
-class ScihubUrl(Base):
-    __tablename__ = "scihub_url"
-
-    url = Column(String(50), primary_key=True)
-    success_times = Column(Integer, default=0)
-    failed_times = Column(Integer, default=0)
-
-    def __repr__(self):
-        return f"<ScihubUrl(url={self.url}, " \
-               f"success_times={self.success_times}, " \
-               f"failed_times={self.failed_times})>"
8 changes: 8 additions & 0 deletions scidownl/db/scidownl.txt
@@ -0,0 +1,8 @@
+https://sci-hub.ru
+http://sci-hub.kr
+https://sci-hub.st
+https://sci-hub.tw
+http://sci-hub.st
+http://sci-hub.tw
+https://sci-hub.se
+http://sci-hub.se
55 changes: 10 additions & 45 deletions scidownl/db/service.py
@@ -2,63 +2,28 @@
 """Services to manipulate entities"""
 from typing import List, Union

-from sqlalchemy.orm import sessionmaker
+# from sqlalchemy.orm import sessionmaker

 from ..log import get_logger
-from .entities import get_engine, ScihubUrl
-from ..db.entities import create_tables
+from .entities import get_engine
+# from ..db.entities import create_tables

 logger = get_logger()


 class ScihubUrlService:
     def __init__(self, test: bool = False):
-        create_tables()
         self.engine = get_engine(test=test)
-        self.session_class = sessionmaker(bind=self.engine)

-    def add_urls(self, urls: Union[List[ScihubUrl]]) -> None:
-        if urls is None or len(urls) == 0:
-            return
-        session = self.session_class()
-        for url in urls:
-            try:
-                session.add(url)
-                session.commit()
-            except Exception as e:
-                session.rollback()
-        session.close()
-
     def increment_success_times(self, url: str) -> None:
-        if url is None or not isinstance(url, str):
-            return
-        session = self.session_class()
-        try:
-            session.query(ScihubUrl).filter_by(url=url).update({
-                ScihubUrl.success_times: ScihubUrl.success_times + 1
-            })
-            session.commit()
-        except Exception as e:
-            logger.warning(f"Cannot increment success times: {url}, reason: {e}")
-            session.rollback()
-        session.close()
+        print(f"Success to fetch url: {url}")
+

     def increment_failed_times(self, url: str) -> None:
-        if url is None or not isinstance(url, str):
-            return
-        session = self.session_class()
-        try:
-            session.query(ScihubUrl).filter_by(url=url).update({
-                ScihubUrl.failed_times: ScihubUrl.failed_times + 1
-            })
-            session.commit()
-        except Exception as e:
-            logger.warning(f"Cannot increment failed times: {url}, reason: {e}")
-            session.rollback()
-        session.close()
+        print(f"Failed to fetch url: {url}")
+

-    def get_all_urls(self) -> List[ScihubUrl]:
-        session = self.session_class()
-        all_urls = session.query(ScihubUrl).all()
-        session.close()
+    def get_all_urls(self) -> List[str]:
+        with self.engine as conn:
+            all_urls = conn.read().splitlines()
         return all_urls
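Two things stand out in the slimmed-down service, at least as far as this diff shows: updater.py above still calls service.add_urls, which no longer exists here, and a file object used as a context manager is closed on exit, so a second get_all_urls call on the same instance would raise ValueError on the closed handle. A minimal file-backed sketch that sidesteps the latter by reopening per call (the default path is illustrative):

    from typing import List


    class FileBackedUrlService:
        """Sketch of a file-backed stand-in for the old SQLAlchemy service."""

        def __init__(self, db_path: str = "scidownl/db/scidownl.txt"):
            self.db_path = db_path

        def add_urls(self, urls: List[str]) -> None:
            # Append-only save, mirroring what updater.py expects to call.
            with open(self.db_path, "a") as f:
                for url in urls:
                    f.write(url + "\n")

        def get_all_urls(self) -> List[str]:
            # Reopen on each call; safe for repeated reads.
            with open(self.db_path, "r") as f:
                return [line.strip() for line in f if line.strip()]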