Merge pull request #88 from eseiver/update
Change update functions & readme
mpacer authored Mar 6, 2018
2 parents 7c4abf4 + a68b06c commit a6d6376
Showing 6 changed files with 91 additions and 85 deletions.
12 changes: 6 additions & 6 deletions README.rst
@@ -48,7 +48,7 @@ How to run the program

 Execute the following command.

-``(allofplos)$ python -m allofplos.plos_corpus``
+``(allofplos)$ python -m allofplos.update``

 The first time it runs it will download a >4.4 Gb zip file
 (**allofplos_xml.zip**) with all the XML files inside.
@@ -59,7 +59,7 @@ allofplos\_xml directory inside your installation of `allofplos`.

 If you want to see the directory on your file system where this is installed run

-``python -c "from allofplos.plos_regex import corpusdir; print(corpusdir)"``
+``python -c "from allofplos import get_corpus_dir; print(get_corpus_dir())"``

 If you ever downloaded the corpus before, it will make an incremental
 update to the existing corpus, the script checks for and then downloads
@@ -80,8 +80,8 @@ Here’s what the print statements might look like on a typical run:

     147 new articles to download.
     147 new articles downloaded.
-    3 corrected articles found.
-    0 corrected articles downloaded with new xml.
+    3 amended articles found.
+    0 amended articles downloaded with new xml.
     Creating new text list of uncorrected proofs from scratch.
     No new VOR articles indexed in Solr.
     17 VOR articles directly downloaded.
@@ -106,9 +106,9 @@ Should return something like this:

 ::

-    ......
+    ........
     ----------------------------------------------------------------------
-    Ran 6 tests in 3.327s
+    Ran 8 tests in 0.257s

     OK

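A quick way to confirm the corpus landed where expected is to count the files via the package's ``get_corpus_dir`` accessor; a minimal sketch (the printed count is simply whatever your local corpus contains):

    import os
    from allofplos import get_corpus_dir

    # Count the article XML files currently in the local corpus directory
    corpus_dir = get_corpus_dir()
    print(len(os.listdir(corpus_dir)), "files in", corpus_dir)
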
2 changes: 1 addition & 1 deletion allofplos/article_class.py
@@ -107,7 +107,7 @@ def doi(self, d):
         instantiating the article object.
         """
         if validate_doi(d) is False:
-            raise Exception("Invalid format for PLOS DOI")
+            raise Exception("Invalid format for PLOS DOI: {}".format(d))
         self.reset_memoized_attrs()
         self._doi = d
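The richer error message now names the offending DOI, which makes failures in batch runs easier to trace. A minimal sketch of the setter's behavior (the DOIs are illustrative):

    from allofplos.article_class import Article

    art = Article('10.1371/journal.pone.0052690')
    art.doi = 'not-a-doi'
    # raises: Exception: Invalid format for PLOS DOI: not-a-doi
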
82 changes: 15 additions & 67 deletions allofplos/corpus/plos_corpus.py
@@ -278,9 +278,16 @@ def compare_article_pubdate(doi, days=22, directory=None):
         print("Pubdate error in {}".format(doi))


+def download_xml(doi, tempdir=newarticledir):
+    """For a given DOI, download its remote XML file to tempdir."""
+    art = Article(doi, directory=tempdir)
+    with open(art.filename, 'w', encoding='utf8') as f:
+        f.write(art.get_remote_xml())
+    return art
+
+
 def download_updated_xml(article_file,
-                         tempdir=newarticledir,
-                         vor_check=False):
+                         tempdir=newarticledir):
     """
     For an article file, compare local XML to remote XML
     If they're different, download new version of article
@@ -305,21 +312,9 @@ def download_updated_xml(article_file,

     if articleXML_remote == articleXML_local:
         updated = False
-        get_new = False
     else:
-        get_new = True
-        if vor_check:
-            # make sure that update is to a VOR for uncorrected proof
-            get_new = False
-            if article.remote_proof == 'vor_update':
-                get_new = True
-            # else:
-            #     updated = False
-    if get_new:
-        article_new = Article(article.doi, directory=tempdir)
-        with open(article_new.filename, 'w', encoding='utf8') as f:
-            f.write(articleXML_remote)
-        updated = True
+        article_new = download_xml(article.doi, tempdir=tempdir)
+        updated = True
     return updated


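The download logic is factored into ``download_xml`` so that fresh downloads and update checks share one code path. A minimal usage sketch (the DOI is illustrative):

    from allofplos.corpus.plos_corpus import download_xml

    # Fetch the current remote XML for one article into the default temp directory
    art = download_xml('10.1371/journal.pone.0052690')
    print(art.filename)
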
@@ -496,10 +491,10 @@ def download_vor_updates(directory=None, tempdir=newarticledir,
     if vor_updates_available is None:
         vor_updates_available = check_for_vor_updates()
     vor_updated_article_list = []
-    for article in tqdm(vor_updates_available, disable=None):
-        updated = download_updated_xml(article, vor_check=True)
+    for doi in tqdm(vor_updates_available, disable=None):
+        updated = download_updated_xml(doi_to_path(doi), tempdir=tempdir)
         if updated:
-            vor_updated_article_list.append(article)
+            vor_updated_article_list.append(doi)

     old_uncorrected_proofs = get_uncorrected_proofs()
     new_uncorrected_proofs_list = list(old_uncorrected_proofs - set(vor_updated_article_list))
@@ -547,7 +542,7 @@ def remote_proofs_direct_check(tempdir=newarticledir, article_list=None):
     print("Checking directly for additional VOR updates...")
     for doi in tqdm(article_list, disable=None):
         f = doi_to_path(doi)
-        updated = download_updated_xml(f, vor_check=True)
+        updated = download_updated_xml(f)
         if updated:
             proofs_download_list.append(doi)
     if proofs_download_list:
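
With ``vor_check`` removed, ``download_updated_xml`` has a single calling convention: pass a local file path, get back whether a newer remote version was fetched. A minimal sketch (the DOI is illustrative):

    from allofplos.corpus.plos_corpus import download_updated_xml
    from allofplos.transformations import doi_to_path

    # True if the remote XML differed from the local copy and was re-downloaded
    updated = download_updated_xml(doi_to_path('10.1371/journal.pone.0052690'))
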
@@ -646,50 +641,3 @@ def download_corpus_metadata_files(csv_abstracts=True, csv_no_abstracts=True, sq
     inF.close()
     outF.close()
     print("Extraction complete.")
-
-
-def main():
-    """
-    Entry point for the program. This is used when the program is used as a
-    standalone script
-    :return: None
-    """
-    directory = get_corpus_dir()
-
-    # Step 0: Initialize first copy of repository
-    try:
-        corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
-            os.path.join(directory, name))]
-    except FileNotFoundError:
-        corpus_files = []
-    if len(corpus_files) < min_files_for_valid_corpus:
-        print('Not enough articles in {}, re-downloading zip file'.format(directory))
-        # TODO: check if zip file is in top-level directory before downloading
-        create_local_plos_corpus()
-
-    # Step 1: Query solr via URL and construct DOI list
-    # Filtered by article type & scheduled for the last 14 days.
-    # Returns specific URL query & the number of search results.
-    # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
-    # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
-    print("Checking for new articles...")
-    dois_needed_list = get_dois_needed_list()
-
-    # Step 2: Download new articles
-    # For every doi in dois_needed_list, grab the accompanying XML from journal pages
-    # If no new articles, don't run any other cells
-    # Check if articles are uncorrected proofs
-    # Check if amended articles linked to new amendment articles are updated
-    # Merge new XML into folder
-    # If need to bulk download, please start here:
-    # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
-    download_check_and_move(dois_needed_list,
-                            uncorrected_proofs_text_list,
-                            tempdir=newarticledir,
-                            destination=get_corpus_dir()
-                            )
-    return None
-
-
-if __name__ == "__main__":
-    main()
8 changes: 7 additions & 1 deletion allofplos/plos_corpus.py
@@ -1,4 +1,10 @@
-from .corpus.plos_corpus import main
+import warnings
+
+from .update import main

 if __name__ == "__main__":
+    warnings.simplefilter('always', DeprecationWarning)
+    warnings.warn("This update method is deprecated. use 'python -m allofplos.update'",
+                  DeprecationWarning,
+                  stacklevel=2)
     main()
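
Running the old entry point still works, but it now emits a warning before delegating to the new one. Roughly (the interpreter prefixes the warning with the file and line where it was raised):

    $ python -m allofplos.plos_corpus
    DeprecationWarning: This update method is deprecated. use 'python -m allofplos.update'
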
20 changes: 10 additions & 10 deletions allofplos/transformations.py
@@ -95,14 +95,13 @@ def filename_to_doi(filename):
     :return: full unique identifier for a PLOS article
     """
     filename = os.path.basename(filename)
-    if correction in filename and validate_filename(filename):
+    if not validate_filename(filename):
+        raise Exception("Invalid format for PLOS filename: {}".format(filename))
+    elif correction in filename:
         article = 'annotation/' + filename.split('.', 4)[2]
         doi = PREFIX + article
-    elif validate_filename(filename):
+    else:
         doi = PREFIX + os.path.splitext(filename)[0]
-    # NOTE: A filename should never validate as a DOI, so the next elif is wrong.
-    elif validate_doi(filename):
-        doi = filename
     return doi


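``filename_to_doi`` now fails fast on malformed names instead of silently falling through. A minimal sketch (the filenames are illustrative):

    from allofplos.transformations import filename_to_doi

    filename_to_doi('journal.pone.0052690.xml')
    # -> '10.1371/journal.pone.0052690'
    filename_to_doi('notes.txt')
    # raises: Exception: Invalid format for PLOS filename: notes.txt
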
@@ -155,6 +154,8 @@ def doi_to_url(doi):
     :param doi: full unique identifier for a PLOS article
     :return: online location of a PLOS article's XML
     """
+    if validate_doi(doi) is False:
+        raise Exception("Invalid format for PLOS DOI: {}".format(doi))
     journal = Journal.doi_to_journal(doi)
     base_page = _get_base_page(journal)
     return ''.join([base_page, 'article/file?id=', doi, URL_SUFFIX])
@@ -174,13 +175,12 @@ def doi_to_path(doi, directory=None):
     """
     if directory is None:
         directory = get_corpus_dir()
-    if doi.startswith(ANNOTATION_DOI) and validate_doi(doi):
+    if not validate_doi(doi):
+        raise Exception("Invalid format for PLOS DOI: {}".format(doi))
+    elif doi.startswith(ANNOTATION_DOI):
         article_file = os.path.join(directory, "plos.correction." + doi.split('/')[-1] + SUFFIX_LOWER)
-    elif validate_doi(doi):
+    else:
         article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER)
-    # NOTE: The following check is weird, a DOI should never validate as a file name.
-    elif validate_filename(doi):
-        article_file = doi
     return article_file


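The path and URL helpers now reject malformed DOIs up front as well. A minimal sketch (the DOI is illustrative; the exact path and URL depend on your corpus directory and the journal's base page):

    from allofplos.transformations import doi_to_path, doi_to_url

    doi_to_url('10.1371/journal.pbio.2002399')   # XML download URL on journals.plos.org
    doi_to_path('10.1371/journal.pbio.2002399')  # e.g. <corpus_dir>/journal.pbio.2002399.xml
    doi_to_path('not-a-doi')
    # raises: Exception: Invalid format for PLOS DOI: not-a-doi
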
52 changes: 52 additions & 0 deletions allofplos/update.py
@@ -0,0 +1,52 @@
+import os
+
+from . import get_corpus_dir, newarticledir, uncorrected_proofs_text_list
+from .corpus.plos_corpus import (create_local_plos_corpus, get_dois_needed_list, download_check_and_move,
+                                 min_files_for_valid_corpus)
+
+
+def main():
+    """
+    Entry point for the program. This is used when the program is used as a
+    standalone script
+    :return: None
+    """
+    directory = get_corpus_dir()
+
+    # Step 0: Initialize first copy of repository
+    try:
+        corpus_files = [name for name in os.listdir(directory) if os.path.isfile(
+            os.path.join(directory, name))]
+    except FileNotFoundError:
+        corpus_files = []
+    if len(corpus_files) < min_files_for_valid_corpus:
+        print('Not enough articles in {}, re-downloading zip file'.format(directory))
+        # TODO: check if zip file is in top-level directory before downloading
+        create_local_plos_corpus()
+
+    # Step 1: Query solr via URL and construct DOI list
+    # Filtered by article type & scheduled for the last 14 days.
+    # Returns specific URL query & the number of search results.
+    # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list.
+    # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download.
+    print("Checking for new articles...")
+    dois_needed_list = get_dois_needed_list()
+
+    # Step 2: Download new articles
+    # For every doi in dois_needed_list, grab the accompanying XML from journal pages
+    # If no new articles, don't run any other cells
+    # Check if articles are uncorrected proofs
+    # Check if amended articles linked to new amendment articles are updated
+    # Merge new XML into folder
+    # If need to bulk download, please start here:
+    # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk
+    download_check_and_move(dois_needed_list,
+                            uncorrected_proofs_text_list,
+                            tempdir=newarticledir,
+                            destination=get_corpus_dir()
+                            )
+    return None
+
+
+if __name__ == "__main__":
+    main()
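
The new module is intended to be run as ``python -m allofplos.update``, but the same entry point can also be driven from Python; a minimal sketch:

    from allofplos.update import main

    # Equivalent to running `python -m allofplos.update`
    main()
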
