From a0502c957243c69dbf8178a6f32d27dc669e9f4f Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Tue, 27 Feb 2018 17:21:42 -0800 Subject: [PATCH 1/4] change update function to allofplos.update deprecates allofplos.plos_corpus, updates README with new command --- README.rst | 12 ++++++------ allofplos/plos_corpus.py | 6 ++++++ allofplos/update.py | 4 ++++ 3 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 allofplos/update.py diff --git a/README.rst b/README.rst index 23ef59f3..e093e765 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,7 @@ How to run the program Execute the following command. -``(allofplos)$ python -m allofplos.plos_corpus`` +``(allofplos)$ python -m allofplos.update`` The first time it runs it will download a >4.4 Gb zip file (**allofplos_xml.zip**) with all the XML files inside. @@ -59,7 +59,7 @@ allofplos\_xml directory inside your installation of `allofplos`. If you want to see the directory on your file system where this is installed run -``python -c "from allofplos.plos_regex import corpusdir; print(corpusdir)"`` +``python -c "from allofplos import get_corpus_dir; print(get_corpus_dir())"`` If you ever downloaded the corpus before, it will make an incremental update to the existing corpus, the script checks for and then downloads @@ -80,8 +80,8 @@ Here’s what the print statements might look like on a typical run: 147 new articles to download. 147 new articles downloaded. - 3 corrected articles found. - 0 corrected articles downloaded with new xml. + 3 amended articles found. + 0 amended articles downloaded with new xml. Creating new text list of uncorrected proofs from scratch. No new VOR articles indexed in Solr. 17 VOR articles directly downloaded. @@ -106,9 +106,9 @@ Should return something like this: :: - ...... + ........ ---------------------------------------------------------------------- - Ran 6 tests in 3.327s + Ran 8 tests in 0.257s OK diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 6df4db00..93ecb717 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -1,4 +1,10 @@ +import warnings + from .corpus.plos_corpus import main if __name__ == "__main__": + warnings.simplefilter('always', DeprecationWarning) + warnings.warn("This update method is deprecated. use 'python -m allofplos.update'", + DeprecationWarning, + stacklevel=2) main() diff --git a/allofplos/update.py b/allofplos/update.py new file mode 100644 index 00000000..6df4db00 --- /dev/null +++ b/allofplos/update.py @@ -0,0 +1,4 @@ +from .corpus.plos_corpus import main + +if __name__ == "__main__": + main() From 0b4d94211164a9a53fdef08e11568bf6283bea36 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Tue, 27 Feb 2018 17:47:30 -0800 Subject: [PATCH 2/4] fix filename_to_doi and download functions `filename_to_doi` no longer allows passing through a DOI. new `download_xml` function is based on DOI and not file, and is passed into `download_updated_xml`. --- allofplos/corpus/plos_corpus.py | 35 ++++++++++++++------------------- allofplos/transformations.py | 5 ++--- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index 3550052a..982d8fc7 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -278,9 +278,16 @@ def compare_article_pubdate(doi, days=22, directory=None): print("Pubdate error in {}".format(doi)) +def download_xml(doi, tempdir=newarticledir): + """For a given DOI, download its remote XML file to tempdir.""" + art = Article(doi, directory=tempdir) + with open(art.filename, 'w', encoding='utf8') as f: + f.write(art.get_remote_xml()) + return art + + def download_updated_xml(article_file, - tempdir=newarticledir, - vor_check=False): + tempdir=newarticledir): """ For an article file, compare local XML to remote XML If they're different, download new version of article @@ -305,21 +312,9 @@ def download_updated_xml(article_file, if articleXML_remote == articleXML_local: updated = False - get_new = False else: - get_new = True - if vor_check: - # make sure that update is to a VOR for uncorrected proof - get_new = False - if article.remote_proof == 'vor_update': - get_new = True - # else: - # updated = False - if get_new: - article_new = Article(article.doi, directory=tempdir) - with open(article_new.filename, 'w', encoding='utf8') as f: - f.write(articleXML_remote) - updated = True + article_new = download_xml(article.doi, tempdir=tempdir) + updated = True return updated @@ -496,10 +491,10 @@ def download_vor_updates(directory=None, tempdir=newarticledir, if vor_updates_available is None: vor_updates_available = check_for_vor_updates() vor_updated_article_list = [] - for article in tqdm(vor_updates_available, disable=None): - updated = download_updated_xml(article, vor_check=True) + for doi in tqdm(vor_updates_available, disable=None): + updated = download_updated_xml(doi_to_path(doi), tempdir=tempdir) if updated: - vor_updated_article_list.append(article) + vor_updated_article_list.append(doi) old_uncorrected_proofs = get_uncorrected_proofs() new_uncorrected_proofs_list = list(old_uncorrected_proofs - set(vor_updated_article_list)) @@ -547,7 +542,7 @@ def remote_proofs_direct_check(tempdir=newarticledir, article_list=None): print("Checking directly for additional VOR updates...") for doi in tqdm(article_list, disable=None): f = doi_to_path(doi) - updated = download_updated_xml(f, vor_check=True) + updated = download_updated_xml(f) if updated: proofs_download_list.append(doi) if proofs_download_list: diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 820e6a7f..abe78a55 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -100,9 +100,8 @@ def filename_to_doi(filename): doi = PREFIX + article elif validate_filename(filename): doi = PREFIX + os.path.splitext(filename)[0] - # NOTE: A filename should never validate as a DOI, so the next elif is wrong. - elif validate_doi(filename): - doi = filename + else: + doi = '' return doi From 2dc637cb1f020c391fcdbb0ae2119e54a6801883 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Mon, 5 Mar 2018 15:51:47 -0800 Subject: [PATCH 3/4] move main() to update.py point .plos_corpus.py at update.py delete `main()` from corpus.plos_corpus.py --- allofplos/corpus/plos_corpus.py | 47 ------------------------------- allofplos/plos_corpus.py | 2 +- allofplos/update.py | 50 ++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 49 deletions(-) diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index 982d8fc7..f2c45231 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -641,50 +641,3 @@ def download_corpus_metadata_files(csv_abstracts=True, csv_no_abstracts=True, sq inF.close() outF.close() print("Extraction complete.") - - -def main(): - """ - Entry point for the program. This is used when the program is used as a - standalone script - :return: None - """ - directory = get_corpus_dir() - - # Step 0: Initialize first copy of repository - try: - corpus_files = [name for name in os.listdir(directory) if os.path.isfile( - os.path.join(directory, name))] - except FileNotFoundError: - corpus_files = [] - if len(corpus_files) < min_files_for_valid_corpus: - print('Not enough articles in {}, re-downloading zip file'.format(directory)) - # TODO: check if zip file is in top-level directory before downloading - create_local_plos_corpus() - - # Step 1: Query solr via URL and construct DOI list - # Filtered by article type & scheduled for the last 14 days. - # Returns specific URL query & the number of search results. - # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list. - # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download. - print("Checking for new articles...") - dois_needed_list = get_dois_needed_list() - - # Step 2: Download new articles - # For every doi in dois_needed_list, grab the accompanying XML from journal pages - # If no new articles, don't run any other cells - # Check if articles are uncorrected proofs - # Check if amended articles linked to new amendment articles are updated - # Merge new XML into folder - # If need to bulk download, please start here: - # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk - download_check_and_move(dois_needed_list, - uncorrected_proofs_text_list, - tempdir=newarticledir, - destination=get_corpus_dir() - ) - return None - - -if __name__ == "__main__": - main() diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 93ecb717..32c8c62c 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -1,6 +1,6 @@ import warnings -from .corpus.plos_corpus import main +from .update import main if __name__ == "__main__": warnings.simplefilter('always', DeprecationWarning) diff --git a/allofplos/update.py b/allofplos/update.py index 6df4db00..c47d292e 100644 --- a/allofplos/update.py +++ b/allofplos/update.py @@ -1,4 +1,52 @@ -from .corpus.plos_corpus import main +import os + +from . import get_corpus_dir, newarticledir, uncorrected_proofs_text_list +from .corpus.plos_corpus import (create_local_plos_corpus, get_dois_needed_list, download_check_and_move, + min_files_for_valid_corpus) + + +def main(): + """ + Entry point for the program. This is used when the program is used as a + standalone script + :return: None + """ + directory = get_corpus_dir() + + # Step 0: Initialize first copy of repository + try: + corpus_files = [name for name in os.listdir(directory) if os.path.isfile( + os.path.join(directory, name))] + except FileNotFoundError: + corpus_files = [] + if len(corpus_files) < min_files_for_valid_corpus: + print('Not enough articles in {}, re-downloading zip file'.format(directory)) + # TODO: check if zip file is in top-level directory before downloading + create_local_plos_corpus() + + # Step 1: Query solr via URL and construct DOI list + # Filtered by article type & scheduled for the last 14 days. + # Returns specific URL query & the number of search results. + # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list. + # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download. + print("Checking for new articles...") + dois_needed_list = get_dois_needed_list() + + # Step 2: Download new articles + # For every doi in dois_needed_list, grab the accompanying XML from journal pages + # If no new articles, don't run any other cells + # Check if articles are uncorrected proofs + # Check if amended articles linked to new amendment articles are updated + # Merge new XML into folder + # If need to bulk download, please start here: + # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk + download_check_and_move(dois_needed_list, + uncorrected_proofs_text_list, + tempdir=newarticledir, + destination=get_corpus_dir() + ) + return None + if __name__ == "__main__": main() From a68b06cf6564f92f941e347f775068dcb366cc75 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Mon, 5 Mar 2018 17:01:56 -0800 Subject: [PATCH 4/4] more exceptions for invalid formats also include string that isn't validating --- allofplos/article_class.py | 2 +- allofplos/transformations.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/allofplos/article_class.py b/allofplos/article_class.py index 94464357..07676390 100644 --- a/allofplos/article_class.py +++ b/allofplos/article_class.py @@ -107,7 +107,7 @@ def doi(self, d): instantiating the article object. """ if validate_doi(d) is False: - raise Exception("Invalid format for PLOS DOI") + raise Exception("Invalid format for PLOS DOI: {}".format(d)) self.reset_memoized_attrs() self._doi = d diff --git a/allofplos/transformations.py b/allofplos/transformations.py index abe78a55..fa146ad4 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -95,13 +95,13 @@ def filename_to_doi(filename): :return: full unique identifier for a PLOS article """ filename = os.path.basename(filename) - if correction in filename and validate_filename(filename): + if not validate_filename(filename): + raise Exception("Invalid format for PLOS filename: {}".format(filename)) + elif correction in filename: article = 'annotation/' + filename.split('.', 4)[2] doi = PREFIX + article - elif validate_filename(filename): - doi = PREFIX + os.path.splitext(filename)[0] else: - doi = '' + doi = PREFIX + os.path.splitext(filename)[0] return doi @@ -154,6 +154,8 @@ def doi_to_url(doi): :param doi: full unique identifier for a PLOS article :return: online location of a PLOS article's XML """ + if validate_doi(doi) is False: + raise Exception("Invalid format for PLOS DOI: {}".format(doi)) journal = Journal.doi_to_journal(doi) base_page = _get_base_page(journal) return ''.join([base_page, 'article/file?id=', doi, URL_SUFFIX]) @@ -173,13 +175,12 @@ def doi_to_path(doi, directory=None): """ if directory is None: directory = get_corpus_dir() - if doi.startswith(ANNOTATION_DOI) and validate_doi(doi): + if not validate_doi(doi): + raise Exception("Invalid format for PLOS DOI: {}".format(doi)) + elif doi.startswith(ANNOTATION_DOI): article_file = os.path.join(directory, "plos.correction." + doi.split('/')[-1] + SUFFIX_LOWER) - elif validate_doi(doi): + else: article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER) - # NOTE: The following check is weird, a DOI should never validate as a file name. - elif validate_filename(doi): - article_file = doi return article_file