Merge pull request #88 from eseiver/update
Change update functions & readme
mpacer authored Mar 6, 2018
2 parents 7c4abf4 + a68b06c commit a6d6376
Showing 6 changed files with 91 additions and 85 deletions.
12 changes: 6 additions & 6 deletions README.rst
@@ -48,7 +48,7 @@ How to run the program

 Execute the following command.

-``(allofplos)$ python -m allofplos.plos_corpus``
+``(allofplos)$ python -m allofplos.update``

 The first time it runs it will download a >4.4 Gb zip file
 (**allofplos_xml.zip**) with all the XML files inside.
@@ -59,7 +59,7 @@ allofplos\_xml directory inside your installation of `allofplos`.

 If you want to see the directory on your file system where this is installed run

-``python -c "from allofplos.plos_regex import corpusdir; print(corpusdir)"``
+``python -c "from allofplos import get_corpus_dir; print(get_corpus_dir())"``

 If you ever downloaded the corpus before, it will make an incremental
 update to the existing corpus, the script checks for and then downloads
@@ -80,8 +80,8 @@ Here’s what the print statements might look like on a typical run:

     147 new articles to download.
     147 new articles downloaded.
-    3 corrected articles found.
-    0 corrected articles downloaded with new xml.
+    3 amended articles found.
+    0 amended articles downloaded with new xml.
     Creating new text list of uncorrected proofs from scratch.
     No new VOR articles indexed in Solr.
     17 VOR articles directly downloaded.
@@ -106,9 +106,9 @@ Should return something like this:

 ::

-    ......
+    ........
     ----------------------------------------------------------------------
-    Ran 6 tests in 3.327s
+    Ran 8 tests in 0.257s

     OK

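A quick way to confirm the corpus landed where expected is to count the files via the package's ``get_corpus_dir`` accessor; a minimal sketch (the printed count is simply whatever your local corpus contains):

    import os
    from allofplos import get_corpus_dir

    # Count the article XML files currently in the local corpus directory
    corpus_dir = get_corpus_dir()
    print(len(os.listdir(corpus_dir)), "files in", corpus_dir)
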
2 changes: 1 addition & 1 deletion allofplos/article_class.py
@@ -107,7 +107,7 @@ def doi(self, d):
         instantiating the article object.
         """
         if validate_doi(d) is False:
-            raise Exception("Invalid format for PLOS DOI")
+            raise Exception("Invalid format for PLOS DOI: {}".format(d))
         self.reset_memoized_attrs()
         self._doi = d
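The richer error message now names the offending DOI, which makes failures in batch runs easier to trace. A minimal sketch of the setter's behavior (the DOIs are illustrative):

    from allofplos.article_class import Article

    art = Article('10.1371/journal.pone.0052690')
    art.doi = 'not-a-doi'
    # raises: Exception: Invalid format for PLOS DOI: not-a-doi
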
82 changes: 15 additions & 67 deletions allofplos/corpus/plos_corpus.py
@@ -278,9 +278,16 @@ def compare_article_pubdate(doi, days=22, directory=None):
         print("Pubdate error in {}".format(doi))


+def download_xml(doi, tempdir=newarticledir):
+    """For a given DOI, download its remote XML file to tempdir."""
+    art = Article(doi, directory=tempdir)
+    with open(art.filename, 'w', encoding='utf8') as f:
+        f.write(art.get_remote_xml())
+    return art
+
+
 def download_updated_xml(article_file,
-                         tempdir=newarticledir,
-                         vor_check=False):
+                         tempdir=newarticledir):
     """
     For an article file, compare local XML to remote XML
     If they're different, download new version of article
@@ -305,21 +312,9 @@ def download_updated_xml(article_file,

     if articleXML_remote == articleXML_local:
         updated = False
-        get_new = False
     else:
-        get_new = True
-        if vor_check:
-            # make sure that update is to a VOR for uncorrected proof
-            get_new = False
-            if article.remote_proof == 'vor_update':
-                get_new = True
-            # else:
-            #     updated = False
-    if get_new:
-        article_new = Article(article.doi, directory=tempdir)
-        with open(article_new.filename, 'w', encoding='utf8') as f:
-            f.write(articleXML_remote)
-        updated = True
+        article_new = download_xml(article.doi, tempdir=tempdir)
+        updated = True
     return updated


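The download logic is factored into ``download_xml`` so that fresh downloads and update checks share one code path. A minimal usage sketch (the DOI is illustrative):

    from allofplos.corpus.plos_corpus import download_xml

    # Fetch the current remote XML for one article into the default temp directory
    art = download_xml('10.1371/journal.pone.0052690')
    print(art.filename)
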
@@ -496,10 +491,10 @@ def download_vor_updates(directory=None, tempdir=newarticledir,
     if vor_updates_available is None:
         vor_updates_available = check_for_vor_updates()
     vor_updated_article_list = []
-    for article in tqdm(vor_updates_available, disable=None):
-        updated = download_updated_xml(article, vor_check=True)
+    for doi in tqdm(vor_updates_available, disable=None):
+        updated = download_updated_xml(doi_to_path(doi), tempdir=tempdir)
         if updated:
-            vor_updated_article_list.append(article)
+            vor_updated_article_list.append(doi)

     old_uncorrected_proofs = get_uncorrected_proofs()
     new_uncorrected_proofs_list = list(old_uncorrected_proofs - set(vor_updated_article_list))
@@ -547,7 +542,7 @@ def remote_proofs_direct_check(tempdir=newarticledir, article_list=None):
     print("Checking directly for additional VOR updates...")
     for doi in tqdm(article_list, disable=None):
         f = doi_to_path(doi)
-        updated = download_updated_xml(f, vor_check=True)
+        updated = download_updated_xml(f)
         if updated:
             proofs_download_list.append(doi)
     if proofs_download_list:
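
With ``vor_check`` removed, ``download_updated_xml`` has a single calling convention: pass a local file path, get back whether a newer remote version was fetched. A minimal sketch (the DOI is illustrative):

    from allofplos.corpus.plos_corpus import download_updated_xml
    from allofplos.transformations import doi_to_path

    # True if the remote XML differed from the local copy and was re-downloaded
    updated = download_updated_xml(doi_to_path('10.1371/journal.pone.0052690'))
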
@@ -646,50 +641,3 @@ def download_corpus_metadata_files(csv_abstracts=True, csv_no_abstracts=True, sq
     inF.close()
     outF.close()
     print("Extraction complete.")
-
-
-def main():
-    """
-    Entry point for the program. This is used when the program is used as a
-    standalone script
-    :return: None
-    """
-    directory = get_corpus_dir()
-
-    # Step 0: Initialize first copy of repository
-    try:
-        corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
-            os.path.join(directory, name))]
-    except FileNotFoundError:
-        corpus_files = []
-    if len(corpus_files) < min_files_for_valid_corpus:
-        print('Not enough articles in {}, re-downloading zip file'.format(directory))
-        # TODO: check if zip file is in top-level directory before downloading
-        create_local_plos_corpus()
-
-    # Step 1: Query solr via URL and construct DOI list
-    # Filtered by article type & scheduled for the last 14 days.
-    # Returns specific URL query & the number of search results.
-    # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
-    # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
-    print("Checking for new articles...")
-    dois_needed_list = get_dois_needed_list()
-
-    # Step 2: Download new articles
-    # For every doi in dois_needed_list, grab the accompanying XML from journal pages
-    # If no new articles, don't run any other cells
-    # Check if articles are uncorrected proofs
-    # Check if amended articles linked to new amendment articles are updated
-    # Merge new XML into folder
-    # If need to bulk download, please start here:
-    # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
-    download_check_and_move(dois_needed_list,
-                            uncorrected_proofs_text_list,
-                            tempdir=newarticledir,
-                            destination=get_corpus_dir()
-                            )
-    return None
-
-
-if __name__ == "__main__":
-    main()
8 changes: 7 additions & 1 deletion allofplos/plos_corpus.py
@@ -1,4 +1,10 @@
-from .corpus.plos_corpus import main
+import warnings
+
+from .update import main

 if __name__ == "__main__":
+    warnings.simplefilter('always', DeprecationWarning)
+    warnings.warn("This update method is deprecated. use 'python -m allofplos.update'",
+                  DeprecationWarning,
+                  stacklevel=2)
     main()
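
Running the old entry point still works, but it now emits a warning before delegating to the new one. Roughly (the interpreter prefixes the warning with the file and line where it was raised):

    $ python -m allofplos.plos_corpus
    DeprecationWarning: This update method is deprecated. use 'python -m allofplos.update'
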
20 changes: 10 additions & 10 deletions allofplos/transformations.py
@@ -95,14 +95,13 @@ def filename_to_doi(filename):
     :return: full unique identifier for a PLOS article
     """
     filename = os.path.basename(filename)
-    if correction in filename and validate_filename(filename):
+    if not validate_filename(filename):
+        raise Exception("Invalid format for PLOS filename: {}".format(filename))
+    elif correction in filename:
         article = 'annotation/' + filename.split('.', 4)[2]
         doi = PREFIX + article
-    elif validate_filename(filename):
+    else:
         doi = PREFIX + os.path.splitext(filename)[0]
-    # NOTE: A filename should never validate as a DOI, so the next elif is wrong.
-    elif validate_doi(filename):
-        doi = filename
     return doi


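``filename_to_doi`` now fails fast on malformed names instead of silently falling through. A minimal sketch (the filenames are illustrative):

    from allofplos.transformations import filename_to_doi

    filename_to_doi('journal.pone.0052690.xml')
    # -> '10.1371/journal.pone.0052690'
    filename_to_doi('notes.txt')
    # raises: Exception: Invalid format for PLOS filename: notes.txt
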
@@ -155,6 +154,8 @@ def doi_to_url(doi):
     :param doi: full unique identifier for a PLOS article
     :return: online location of a PLOS article's XML
     """
+    if validate_doi(doi) is False:
+        raise Exception("Invalid format for PLOS DOI: {}".format(doi))
     journal = Journal.doi_to_journal(doi)
     base_page = _get_base_page(journal)
     return ''.join([base_page, 'article/file?id=', doi, URL_SUFFIX])
@@ -174,13 +175,12 @@ def doi_to_path(doi, directory=None):
     """
     if directory is None:
         directory = get_corpus_dir()
-    if doi.startswith(ANNOTATION_DOI) and validate_doi(doi):
+    if not validate_doi(doi):
+        raise Exception("Invalid format for PLOS DOI: {}".format(doi))
+    elif doi.startswith(ANNOTATION_DOI):
         article_file = os.path.join(directory, "plos.correction." + doi.split('/')[-1] + SUFFIX_LOWER)
-    elif validate_doi(doi):
+    else:
         article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER)
-    # NOTE: The following check is weird, a DOI should never validate as a file name.
-    elif validate_filename(doi):
-        article_file = doi
     return article_file


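The path and URL helpers now reject malformed DOIs up front as well. A minimal sketch (the DOI is illustrative; the exact path and URL depend on your corpus directory and the journal's base page):

    from allofplos.transformations import doi_to_path, doi_to_url

    doi_to_url('10.1371/journal.pbio.2002399')   # XML download URL on journals.plos.org
    doi_to_path('10.1371/journal.pbio.2002399')  # e.g. <corpus_dir>/journal.pbio.2002399.xml
    doi_to_path('not-a-doi')
    # raises: Exception: Invalid format for PLOS DOI: not-a-doi
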
52 changes: 52 additions & 0 deletions allofplos/update.py
@@ -0,0 +1,52 @@
+import os
+
+from . import get_corpus_dir, newarticledir, uncorrected_proofs_text_list
+from .corpus.plos_corpus import (create_local_plos_corpus, get_dois_needed_list, download_check_and_move,
+                                 min_files_for_valid_corpus)
+
+
+def main():
+    """
+    Entry point for the program. This is used when the program is used as a
+    standalone script
+    :return: None
+    """
+    directory = get_corpus_dir()
+
+    # Step 0: Initialize first copy of repository
+    try:
+        corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
+            os.path.join(directory, name))]
+    except FileNotFoundError:
+        corpus_files = []
+    if len(corpus_files) < min_files_for_valid_corpus:
+        print('Not enough articles in {}, re-downloading zip file'.format(directory))
+        # TODO: check if zip file is in top-level directory before downloading
+        create_local_plos_corpus()
+
+    # Step 1: Query solr via URL and construct DOI list
+    # Filtered by article type & scheduled for the last 14 days.
+    # Returns specific URL query & the number of search results.
+    # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
+    # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
+    print("Checking for new articles...")
+    dois_needed_list = get_dois_needed_list()
+
+    # Step 2: Download new articles
+    # For every doi in dois_needed_list, grab the accompanying XML from journal pages
+    # If no new articles, don't run any other cells
+    # Check if articles are uncorrected proofs
+    # Check if amended articles linked to new amendment articles are updated
+    # Merge new XML into folder
+    # If need to bulk download, please start here:
+    # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
+    download_check_and_move(dois_needed_list,
+                            uncorrected_proofs_text_list,
+                            tempdir=newarticledir,
+                            destination=get_corpus_dir()
+                            )
+    return None
+
+
+if __name__ == "__main__":
+    main()
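
The new module is intended to be run as ``python -m allofplos.update``, but the same entry point can also be driven from Python; a minimal sketch:

    from allofplos.update import main

    # Equivalent to running `python -m allofplos.update`
    main()
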
