Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support peer reviews & other utility changes #142

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 67 additions & 4 deletions allofplos/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -1214,11 +1214,8 @@ def body(self):
:rtype: {str}
"""

xml_tree = et.parse(self.filename)
root = xml_tree.getroot()

# limit the text to the body section
body = root.find('./body')
body = self.root.find('./body')

# remove supplementary material section
for sec in body.findall('.//sec'):
Expand Down Expand Up @@ -1378,3 +1375,69 @@ def from_filename(cls, filename):
else:
directory = None
return cls(filename_to_doi(filename), directory=directory)

# region: review_crawling2022
@classmethod
def from_xml(cls, source, directory=None):
    """Create an article object from an XML-encoded string.

    Parses the XML to obtain the article's DOI, builds the article
    around it, and keeps the already-parsed tree so the source file
    does not need to be read again.

    :param source: string (or bytes) containing the JATS XML of an article
    :param directory: path to the directory containing the XML for this
        article; defaults to `get_corpus_dir()` via `Article.__init__`
    :raises ValueError: if the XML contains no DOI element
    :return: a new instance of this class wrapping the parsed XML
    """
    root = et.fromstring(source)
    doi_element = root.find("front//article-id[@pub-id-type='doi']")
    if doi_element is None or doi_element.text is None:
        raise ValueError("source XML does not contain a DOI")
    # Use `cls`, not a hard-coded class name, so subclasses of Article
    # constructed via this classmethod get instances of themselves.
    article = cls(doi_element.text.strip(), directory)
    article.tree = root.getroottree()
    return article

@tree.setter
def tree(self, value):
    """Set the parsed XML tree backing this article.

    :param value: parsed document (lxml ElementTree) to use for this article
    :raises TypeError: if `value` is not an lxml ElementTree
    """
    # Explicit check instead of `assert`: asserts are stripped when
    # Python runs with optimizations (-O), which would silently accept
    # invalid values.  TODO(review): consider a public-API check rather
    # than the private et._ElementTree type.
    if not isinstance(value, et._ElementTree):
        raise TypeError(
            "tree must be an lxml ElementTree, got {}".format(type(value).__name__)
        )
    self._tree = value

def get_subarticles(self):
    """Return the root elements of sub-articles embedded in this article's XML.

    Sub-articles (e.g. peer-review material) appear as <sub-article>
    elements directly under the document root.

    :rtype: list
    :return: list of lxml elements, one per embedded sub-article
    """
    # NOTE(review): could return Article instances instead of raw elements.
    return self.root.findall('sub-article')

def get_author_names(self):
    """Flatten the list of dicts in `self.authors` into a list of names.

    Individual authors are rendered as "Given Surname"; group/consortium
    authors (which have neither a given name nor a surname) are rendered
    by their group name.

    :rtype: list
    :return: one display name per author, in the original order
    """
    names = []
    for author in self.authors:
        # Join whichever name parts are present.  The original code
        # concatenated given_names + surname directly, which raised
        # TypeError when exactly one of them was None.
        parts = [p for p in (author['given_names'], author['surname'])
                 if p is not None]
        if parts:
            names.append(' '.join(parts))
        else:
            names.append(author['group_name'])
    return names

@property
def categories(self):
    """The categories (or keywords) defined for this article.

    Reads the <article-categories> element from the article front
    matter.  Keywords are collected into a set first because they tend
    to be duplicated across subject groups.

    :rtype: list
    :return: unique keywords, or None if the article declares no categories
    """
    front = self.root.find('.//front')
    if front is None:
        return None
    categories = front.find('.//article-categories')
    if categories is None:
        return None

    keywords = set()
    # Skip the first child because it is a "heading" subject group.
    for group in categories[1:]:
        # iter() yields the element itself first; skipping it makes this
        # equivalent to lxml's iterdescendants() while also working with
        # the stdlib ElementTree API.
        for subj in group.iter():
            if subj is group:
                continue
            # Wrappers with exactly one child hold a keyword; guard
            # against empty/missing text instead of crashing on .strip().
            if len(subj) == 1 and subj[0].text:
                keywords.add(subj[0].text.strip())
    return list(keywords)


# endregion
17 changes: 11 additions & 6 deletions allofplos/corpus/plos_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,14 @@ def download_corpus_zip():
return file_path


def unzip_articles(file_path):
def unzip_articles(file_path, extract_directory = get_corpus_dir(), delete_file=True):
"""
Unzips zip file of all of PLOS article XML to specified directory
:param file_path: path to file to be extracted
:param extract_directory: directory where articles are copied to
:param delete_file: whether to delete the compressed archive after extracting articles
:return: None
"""
extract_directory = get_corpus_dir()

os.makedirs(extract_directory, exist_ok=True)

with zipfile.ZipFile(file_path, "r") as zip_ref:
Expand All @@ -110,7 +110,8 @@ def unzip_articles(file_path):
zip_ref.extract(article, path=extract_directory)
tqdm.write("Extraction complete.")

os.remove(file_path)
if delete_file:
os.remove(file_path)


def listdir_nohidden(path, extension='.xml', include_dir=True):
Expand Down Expand Up @@ -633,7 +634,7 @@ def download_check_and_move(article_list, proof_filepath, tempdir, destination):
move_articles(tempdir, destination)


def create_local_plos_corpus(directory=None, rm_metadata=True):
def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delete_file=True):
"""
Downloads a fresh copy of the PLOS corpus by:
1) creating directory if it doesn't exist
Expand All @@ -642,6 +643,8 @@ def create_local_plos_corpus(directory=None, rm_metadata=True):
3) extracting the individual XML files into the corpus directory
:param directory: directory where the corpus is to be downloaded and extracted
:param rm_metadata: whether to remove metadata after the download (NOTE(review): the original docstring was left as a "COMPLETE HERE" placeholder — confirm the exact behavior against the implementation)
:param unzip: whether to extract article files to corpus dir, or just keep the zip file instead. Defaults to `True`
:param delete_file: whether to delete the compressed archive after extracting articles. Defaults to `True`
:return: None
"""
if directory is None:
Expand All @@ -650,4 +653,6 @@ def create_local_plos_corpus(directory=None, rm_metadata=True):
print('Creating folder for article xml')
os.makedirs(directory, exist_ok=True)
zip_path = download_corpus_zip()
unzip_articles(file_path=zip_path)
if unzip:
unzip_articles(file_path=zip_path, extract_directory=get_corpus_dir(), delete_file=delete_file)

41 changes: 29 additions & 12 deletions allofplos/plos_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
from . import get_corpus_dir

regex_match_prefix = r"^10\.1371/"
regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7}$)"
regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))")
regex_suffix_match = r"(\.[rs][0-9]{3})?" # matches reviews and supplementary materials
regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)"
Expand All @@ -18,17 +19,20 @@
r"|([a-zA-Z0-9]{32}$))")
regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match)
full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match+regex_suffix_match)
full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}"
"|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}")
currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents)
file_regex_match = re.compile(regex_file_search+r"\.xml")
BASE_URL = 'https://journals.plos.org/plosone/article/file?id='
URL_SUFFIX = '&type=manuscript'
external_url_regex_match = re.compile(re.escape(BASE_URL) +
re.escape("10.1371/") +
regex_body_search +
re.escape(URL_SUFFIX))
regex_type_match = r"(article)|(peerReview)"
regex_file_suffix = r"&type=((manuscript)|(supplementary))"

BASE_URL = 'https://journals.plos.org/plosone/'
external_url_regex_match = re.compile(re.escape(BASE_URL) + re.escape("article/file?id=10.1371/") +
regex_body_search + regex_suffix_match + regex_file_suffix)
plos_url_regex_match = re.compile(re.escape("https://journals.plos.org/") + r"[a-z]+/" +
regex_type_match + re.escape("?id=10.1371/") +
regex_body_search + regex_suffix_match)


def validate_doi(doi):
Expand Down Expand Up @@ -56,14 +60,27 @@ def validate_filename(filename):
return False


def validate_url(url):
def validate_file_url(url):
    """Check whether the full string is a valid PLOS article file URL.

    Example:
        'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147&type=manuscript'
        is valid, while the same URL without the '&type=...' suffix is not.

    URLs leading to files containing supplementary material are also valid,
    e.g. 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0222522.s002&type=supplementary'

    :return: True if string is a valid PLOS file URL; False if not
    """
    return external_url_regex_match.match(url) is not None


def validate_plos_url(url):
    """Check whether the given `url` string is a valid PLOS website URL.

    :param url: URL string to test against the PLOS site URL pattern
    :return: True if the string contains a valid PLOS URL; False otherwise
    """
    # A stray duplicate return (matching against the file-URL regex)
    # preceded this line and shadowed it; only the PLOS-site pattern
    # is relevant here.
    return bool(plos_url_regex_match.search(url))


def find_valid_dois(doi):
Expand Down
4 changes: 2 additions & 2 deletions allofplos/samples/corpus_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from .. import get_corpus_dir, newarticledir

from ..plos_regex import (validate_doi, full_doi_regex_match, validate_url, validate_filename)
from ..plos_regex import (validate_doi, full_doi_regex_match, validate_file_url, validate_filename)
from ..transformations import (filename_to_doi, doi_to_url)
from ..corpus.plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
download_updated_xml, get_all_solr_dois,
Expand Down Expand Up @@ -49,7 +49,7 @@ def validate_corpus(directory=None):

# check urls
plos_urls = [doi_to_url(doi) for doi in plos_valid_dois]
plos_valid_urls = [url for url in plos_urls if validate_url(url)]
plos_valid_urls = [url for url in plos_urls if validate_file_url(url)]
if set(plos_urls) == set(plos_valid_urls) and len(plos_valid_urls) == len(plos_valid_dois):
pass
else:
Expand Down
7 changes: 4 additions & 3 deletions allofplos/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
'assetFile': 'article/file',
'assetXMLFile': 'article/file',
'articleMetrics': 'article/metrics',
'articleRelated': 'article/related'}
'articleRelated': 'article/related',
'peerReview': 'article/peerReview'}


def _get_base_page(journal):
Expand Down Expand Up @@ -144,8 +145,8 @@ def url_to_doi(url):
Example:
url_to_path('https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001') = \
'10.1371/journal.pone.1000001'
:param url: online location of a PLOS article's XML
:return: full unique identifier for a PLOS article
:param url: online location of a PLOS article's XML (not necessarily the XML file — the base article link works fine too)
:return: full unique identifier for a PLOS article (or for a peer review, or supplementary material etc.)
"""
return url[url.index(PREFIX):].rstrip(URL_SUFFIX).rstrip(INT_URL_SUFFIX)

Expand Down
6 changes: 3 additions & 3 deletions tests/test_unittests.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_class_doi1(self):
self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.editor, [{'contrib_initials': 'EGL', 'given_names': 'Eric Gordon', 'surname': 'Lamb', 'group_name': None, 'ids': [], 'rid_dict': {'aff': ['edit1']}, 'contrib_type': 'editor', 'author_type': None, 'editor_type': None, 'email': None, 'affiliations': ['University of Saskatchewan, CANADA'], 'author_roles': {None: ['Editor']}, 'footnotes': []}], 'editor does not transform correctly for {}'.format(article.doi))
article_relpath = os.path.relpath(article.filepath, TESTDIR)
self.assertEqual(article_relpath, "testdata/journal.pone.0185809.xml", 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article_relpath, os.path.join("testdata","journal.pone.0185809.xml"), 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185809", 'page does not transform correctly for {}'.format(article.doi))
Expand Down Expand Up @@ -133,7 +133,7 @@ def test_example_doi(self):
self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi))
article_relpath = os.path.relpath(article.filepath, TESTDIR)
self.assertEqual(article_relpath, "testdata/journal.pbio.2001413.xml", 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article_relpath, os.path.join("testdata","journal.pbio.2001413.xml"), 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.journal, "PLOS Biology", 'journal does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.page, "https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2001413", 'page does not transform correctly for {}'.format(article.doi))
Expand Down Expand Up @@ -172,7 +172,7 @@ def test_example_doi2(self):
self.assertEqual(article.dtd, "NLM 3.0", 'dtd does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi))
article_relpath = os.path.relpath(article.filepath, TESTDIR)
self.assertEqual(article_relpath, "testdata/plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml", 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article_relpath, os.path.join("testdata","plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml"), 'filename does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6", 'page does not transform correctly for {}'.format(article.doi))
Expand Down