Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support peer reviews & other utility changes #142

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 67 additions & 4 deletions allofplos/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -1214,11 +1214,8 @@ def body(self):
:rtype: {str}
"""

xml_tree = et.parse(self.filename)
root = xml_tree.getroot()

# limit the text to the body section
body = root.find('./body')
body = self.root.find('./body')

# remove supplementary material section
for sec in body.findall('.//sec'):
Expand Down Expand Up @@ -1378,3 +1375,69 @@ def from_filename(cls, filename):
else:
directory = None
return cls(filename_to_doi(filename), directory=directory)

# region: review_crawling2022
@classmethod
def from_xml(cls, source, directory=None):
    """Create an article object from an XML-encoded string.

    Parses the XML to obtain the article's DOI, builds the article
    around it, and keeps the already-parsed tree so the source file
    does not need to be read again.

    :param source: string (or bytes) containing the JATS XML of an article
    :param directory: path to the directory containing the XML for this
        article; defaults to `get_corpus_dir()` via `Article.__init__`
    :raises ValueError: if the XML contains no DOI element
    :return: a new instance of this class wrapping the parsed XML
    """
    root = et.fromstring(source)
    doi_element = root.find("front//article-id[@pub-id-type='doi']")
    if doi_element is None or doi_element.text is None:
        raise ValueError("source XML does not contain a DOI")
    # Use `cls`, not a hard-coded class name, so subclasses of Article
    # constructed via this classmethod get instances of themselves.
    article = cls(doi_element.text.strip(), directory)
    article.tree = root.getroottree()
    return article

@tree.setter
def tree(self, value):
    """Set the parsed XML tree backing this article.

    :param value: parsed document (lxml ElementTree) to use for this article
    :raises TypeError: if `value` is not an lxml ElementTree
    """
    # Explicit check instead of `assert`: asserts are stripped when
    # Python runs with optimizations (-O), which would silently accept
    # invalid values.  TODO(review): consider a public-API check rather
    # than the private et._ElementTree type.
    if not isinstance(value, et._ElementTree):
        raise TypeError(
            "tree must be an lxml ElementTree, got {}".format(type(value).__name__)
        )
    self._tree = value

def get_subarticles(self):
    """Return the root elements of sub-articles embedded in this article's XML.

    Sub-articles (e.g. peer-review material) appear as <sub-article>
    elements directly under the document root.

    :rtype: list
    :return: list of lxml elements, one per embedded sub-article
    """
    # NOTE(review): could return Article instances instead of raw elements.
    return self.root.findall('sub-article')

def get_author_names(self):
    """Flatten the list of dicts in `self.authors` into a list of names.

    Individual authors are rendered as "Given Surname"; group/consortium
    authors (which have neither a given name nor a surname) are rendered
    by their group name.

    :rtype: list
    :return: one display name per author, in the original order
    """
    names = []
    for author in self.authors:
        # Join whichever name parts are present.  The original code
        # concatenated given_names + surname directly, which raised
        # TypeError when exactly one of them was None.
        parts = [p for p in (author['given_names'], author['surname'])
                 if p is not None]
        if parts:
            names.append(' '.join(parts))
        else:
            names.append(author['group_name'])
    return names

@property
def categories(self):
    """The categories (or keywords) defined for this article.

    Reads the <article-categories> element from the article front
    matter.  Keywords are collected into a set first because they tend
    to be duplicated across subject groups.

    :rtype: list
    :return: unique keywords, or None if the article declares no categories
    """
    front = self.root.find('.//front')
    if front is None:
        return None
    categories = front.find('.//article-categories')
    if categories is None:
        return None

    keywords = set()
    # Skip the first child because it is a "heading" subject group.
    for group in categories[1:]:
        # iter() yields the element itself first; skipping it makes this
        # equivalent to lxml's iterdescendants() while also working with
        # the stdlib ElementTree API.
        for subj in group.iter():
            if subj is group:
                continue
            # Wrappers with exactly one child hold a keyword; guard
            # against empty/missing text instead of crashing on .strip().
            if len(subj) == 1 and subj[0].text:
                keywords.add(subj[0].text.strip())
    return list(keywords)


# endregion
17 changes: 11 additions & 6 deletions allofplos/corpus/plos_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,14 @@ def download_corpus_zip():
return file_path


def unzip_articles(file_path):
def unzip_articles(file_path, extract_directory = get_corpus_dir(), delete_file=True):
"""
Unzips zip file of all of PLOS article XML to specified directory
:param file_path: path to file to be extracted
:param extract_directory: directory where articles are copied to
:param delete_file: whether to delete the compressed archive after extracting articles
:return: None
"""
extract_directory = get_corpus_dir()

os.makedirs(extract_directory, exist_ok=True)

with zipfile.ZipFile(file_path, "r") as zip_ref:
Expand All @@ -110,7 +110,8 @@ def unzip_articles(file_path):
zip_ref.extract(article, path=extract_directory)
tqdm.write("Extraction complete.")

os.remove(file_path)
if delete_file:
os.remove(file_path)


def listdir_nohidden(path, extension='.xml', include_dir=True):
Expand Down Expand Up @@ -633,7 +634,7 @@ def download_check_and_move(article_list, proof_filepath, tempdir, destination):
move_articles(tempdir, destination)


def create_local_plos_corpus(directory=None, rm_metadata=True):
def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delete_file=True):
"""
Downloads a fresh copy of the PLOS corpus by:
1) creating directory if it doesn't exist
Expand All @@ -642,6 +643,8 @@ def create_local_plos_corpus(directory=None, rm_metadata=True):
3) extracting the individual XML files into the corpus directory
:param directory: directory where the corpus is to be downloaded and extracted
:param rm_metadata: whether to remove metadata after the download (NOTE(review): the original docstring was left as a "COMPLETE HERE" placeholder — confirm the exact behavior against the implementation)
:param unzip: whether to extract article files to corpus dir, or just keep the zip file instead. Defaults to `True`
:param delete_file: whether to delete the compressed archive after extracting articles. Defaults to `True`
:return: None
"""
if directory is None:
Expand All @@ -650,4 +653,6 @@ def create_local_plos_corpus(directory=None, rm_metadata=True):
print('Creating folder for article xml')
os.makedirs(directory, exist_ok=True)
zip_path = download_corpus_zip()
unzip_articles(file_path=zip_path)
if unzip:
unzip_articles(file_path=zip_path, extract_directory=get_corpus_dir(), delete_file=delete_file)

41 changes: 29 additions & 12 deletions allofplos/plos_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
from . import get_corpus_dir

regex_match_prefix = r"^10\.1371/"
regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7}$)"
regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))")
regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches reviews and supplementary materials
regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)"
Expand All @@ -18,17 +19,20 @@
r"|([a-zA-Z0-9]{32}$))")
regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match)
full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match+regex_suffix_match)
full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}"
"|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}")
currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents)
file_regex_match = re.compile(regex_file_search+r"\.xml")
BASE_URL = 'https://journals.plos.org/plosone/article/file?id='
URL_SUFFIX = '&type=manuscript'
external_url_regex_match = re.compile(re.escape(BASE_URL) +
re.escape("10.1371/") +
regex_body_search +
re.escape(URL_SUFFIX))
regex_type_match = r"(article)|(peerReview)"
regex_file_suffix = r"&type=((manuscript)|(supplementary))"

BASE_URL = 'https://journals.plos.org/plosone/'
external_url_regex_match = re.compile(re.escape(BASE_URL) + re.escape("article/file?id=10.1371/") +
regex_body_search + regex_suffix_match + regex_file_suffix)
plos_url_regex_match = re.compile(re.escape("https://journals.plos.org/") + r"[a-z]+/" +
regex_type_match + re.escape("?id=10.1371/") +
regex_body_search + regex_suffix_match)


def validate_doi(doi):
Expand Down Expand Up @@ -56,14 +60,27 @@ def validate_filename(filename):
return False


def validate_url(url):
def validate_file_url(url):
    """Check whether the full string is a valid PLOS article file URL.

    Example:
        'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147&type=manuscript'
        is valid, while the same URL without the '&type=...' suffix is not.

    URLs leading to files containing supplementary material are also valid,
    e.g. 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0222522.s002&type=supplementary'

    :return: True if string is a valid PLOS file URL; False if not
    """
    return external_url_regex_match.match(url) is not None


def validate_plos_url(url):
    """Check whether the given `url` string is a valid PLOS website URL.

    :param url: URL string to test against the PLOS site URL pattern
    :return: True if the string contains a valid PLOS URL; False otherwise
    """
    # A stray duplicate return (matching against the file-URL regex)
    # preceded this line and shadowed it; only the PLOS-site pattern
    # is relevant here.
    return bool(plos_url_regex_match.search(url))


def find_valid_dois(doi):
Expand Down
4 changes: 2 additions & 2 deletions allofplos/samples/corpus_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from .. import get_corpus_dir, newarticledir

from ..plos_regex import (validate_doi, full_doi_regex_match, validate_url, validate_filename)
from ..plos_regex import (validate_doi, full_doi_regex_match, validate_file_url, validate_filename)
from ..transformations import (filename_to_doi, doi_to_url)
from ..corpus.plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
download_updated_xml, get_all_solr_dois,
Expand Down Expand Up @@ -49,7 +49,7 @@ def validate_corpus(directory=None):

# check urls
plos_urls = [doi_to_url(doi) for doi in plos_valid_dois]
plos_valid_urls = [url for url in plos_urls if validate_url(url)]
plos_valid_urls = [url for url in plos_urls if validate_file_url(url)]
if set(plos_urls) == set(plos_valid_urls) and len(plos_valid_urls) == len(plos_valid_dois):
pass
else:
Expand Down
7 changes: 4 additions & 3 deletions allofplos/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
'assetFile': 'article/file',
'assetXMLFile': 'article/file',
'articleMetrics': 'article/metrics',
'articleRelated': 'article/related'}
'articleRelated': 'article/related',
'peerReview': 'article/peerReview'}


def _get_base_page(journal):
Expand Down Expand Up @@ -144,8 +145,8 @@ def url_to_doi(url):
Example:
url_to_path('https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001') = \
'10.1371/journal.pone.1000001'
:param url: online location of a PLOS article's XML
:return: full unique identifier for a PLOS article
:param url: online location of a PLOS article's XML (not necessarily the XML file — the base article link works fine too)
:return: full unique identifier for a PLOS article (or for a peer review, or supplementary material etc.)
"""
return url[url.index(PREFIX):].rstrip(URL_SUFFIX).rstrip(INT_URL_SUFFIX)

Expand Down
6 changes: 3 additions & 3 deletions tests/test_unittests.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_class_doi1(self):
self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.editor, [{'contrib_initials': 'EGL', 'given_names': 'Eric Gordon', 'surname': 'Lamb', 'group_name': None, 'ids': [], 'rid_dict': {'aff': ['edit1']}, 'contrib_type': 'editor', 'author_type': None, 'editor_type': None, 'email': None, 'affiliations': ['University of Saskatchewan, CANADA'], 'author_roles': {None: ['Editor']}, 'footnotes': []}], 'editor does not transform correctly for {}'.format(article.doi))
article_relpath = os.path.relpath(article.filepath, TESTDIR)
self.assertEqual(article_relpath, "testdata/journal.pone.0185809.xml", 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article_relpath, os.path.join("testdata","journal.pone.0185809.xml"), 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185809", 'page does not transform correctly for {}'.format(article.doi))
Expand Down Expand Up @@ -133,7 +133,7 @@ def test_example_doi(self):
self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi))
article_relpath = os.path.relpath(article.filepath, TESTDIR)
self.assertEqual(article_relpath, "testdata/journal.pbio.2001413.xml", 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article_relpath, os.path.join("testdata","journal.pbio.2001413.xml"), 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.journal, "PLOS Biology", 'journal does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.page, "https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2001413", 'page does not transform correctly for {}'.format(article.doi))
Expand Down Expand Up @@ -172,7 +172,7 @@ def test_example_doi2(self):
self.assertEqual(article.dtd, "NLM 3.0", 'dtd does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi))
article_relpath = os.path.relpath(article.filepath, TESTDIR)
self.assertEqual(article_relpath, "testdata/plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml", 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article_relpath, os.path.join("testdata","plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml"), 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6", 'page does not transform correctly for {}'.format(article.doi))
Expand Down