Skip to content

Commit

Permalink
Merge branch 'master' into bug-112
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael-E-Rose authored Jul 8, 2024
2 parents 97e9cff + 0d7e680 commit 5eb02d1
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 16 deletions.
20 changes: 14 additions & 6 deletions .github/workflows/test-and-build-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,13 @@ jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.ref }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: 3.8
- name: Install dependencies
Expand All @@ -25,14 +29,18 @@ jobs:
- name: Run tests
run: pytest --cov=pubmed_parser tests/ --verbose
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
uses: codecov/codecov-action@v4

docs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.ref || github.ref }}
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: 3.8
- name: Install dependencies
Expand All @@ -48,7 +56,7 @@ jobs:
make html
touch _build/html/.nojekyll
- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@v3
uses: peaceiris/actions-gh-pages@v4
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: docs/_build/html/
Expand Down
24 changes: 14 additions & 10 deletions pubmed_parser/pubmed_oa_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):

journal_node = tree.findall(".//journal-title")
if journal_node is not None:
journal = " ".join([j.text for j in journal_node])
journal = " ".join(["".join(node.itertext()) for node in journal_node])
else:
journal = ""

Expand Down Expand Up @@ -263,6 +263,15 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):
return dict_out


def get_reference(reference):
    """Return the citation element nested directly under a reference node.

    The citation can appear under one of three tags. They are tried in
    this order and the first match wins: ``mixed-citation``,
    ``element-citation``, ``citation``.

    Parameters
    ----------
    reference: element
        A reference node exposing ``find(tag)``, which returns the first
        matching direct child or ``None`` when there is none.

    Returns
    -------
    ref: element or None
        The first citation child found, or ``None`` if the reference
        contains none of the three tags.
    """
    for tag in ("mixed-citation", "element-citation", "citation"):
        ref = reference.find(tag)
        # Compare against None explicitly rather than relying on truthiness:
        # XML element objects without children can evaluate as falsy.
        if ref is not None:
            return ref
    return None


def parse_pubmed_references(path):
"""
Given path to xml file, parse references articles
Expand All @@ -288,15 +297,10 @@ def parse_pubmed_references(path):
for reference in references:
ref_id = reference.attrib["id"]

if reference.find("mixed-citation") is not None:
ref = reference.find("mixed-citation")
elif reference.find("element-citation") is not None:
ref = reference.find("element-citation")
else:
ref = None

ref = get_reference(reference)
if ref is not None:
if "publication-type" in ref.attrib.keys() and ref is not None:
ref_types = ["citation-type", "publication-type"]
if any(ref_type in ref_types for ref_type in ref.attrib.keys()):
if ref.attrib.values() is not None:
journal_type = ref.attrib.values()[0]
else:
Expand Down Expand Up @@ -557,7 +561,7 @@ def parse_pubmed_table(path, return_xml=True):
pmc = dict_article_meta["pmc"]

# parse table
tables = tree.xpath(".//body.//sec.//table-wrap")
tables = tree.xpath(".//body//table-wrap")
table_dicts = list()
for table in tables:
if table.find("label") is not None:
Expand Down
Empty file added tests/__init__.py
Empty file.
13 changes: 13 additions & 0 deletions tests/test_pubmed_oa_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def test_parse_pubmed_xml():
assert parsed_xml.get("doi") == "10.1371/journal.pone.0046493"
assert parsed_xml.get("subjects") == "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins" # noqa
assert "Competing Interests: " in parsed_xml.get("coi_statement")
assert parsed_xml.get("journal") == "PLoS ONE"
assert parsed_xml.get('publication_year') == 2012
assert parsed_xml.get('publication_date') == '01-01-2012'
assert parsed_xml.get('epublication_date') == '28-9-2012'
Expand Down Expand Up @@ -66,6 +67,18 @@ def test_parse_pubmed_references():
assert isinstance(references[0], dict)
assert len(references) == 58, "Expected references to have length of 29"

references_9539395 = pp.parse_pubmed_references(pubmed_xml_9539395)
assert references_9539395[0].get('pmid') == '36094679'


def test_parse_pubmed_table():
    """
    Test parsing table from PubMed XML file
    """
    table_9539395 = pp.parse_pubmed_table(pubmed_xml_9539395)
    # Fail with a readable message instead of an indexing error when the
    # parser yields nothing for this fixture.
    assert table_9539395, "Expected at least one parsed table"
    expected_cols = [
        'Gene',
        'Uninfected and untreated',
        'Day 7 postinoculation',
        'PBS',
        'sACE22.v2.4-IgG1',
    ]
    assert table_9539395[0].get('table_columns') == expected_cols


def test_parse_pubmed_caption():
"""
Expand Down

0 comments on commit 5eb02d1

Please sign in to comment.