Merge branch 'master' into bug-112

titipata · Jul 8, 2024 · 5eb02d1 · 5eb02d1
2 parents 97e9cff + 0d7e680
commit 5eb02d1
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 16 deletions.
diff --git a/.github/workflows/test-and-build-docs.yml b/.github/workflows/test-and-build-docs.yml
@@ -10,9 +10,13 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+        ref: ${{ github.event.pull_request.head.ref }}
+        repository: ${{ github.event.pull_request.head.repo.full_name }}
     - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: 3.8
     - name: Install dependencies
@@ -25,14 +29,18 @@ jobs:
     - name: Run tests
       run: pytest --cov=pubmed_parser tests/ --verbose
     - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v1
+      uses: codecov/codecov-action@v4
 
   docs:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0  # 0 = fetch full history instead of a shallow clone
+        ref: ${{ github.event.pull_request.head.ref || github.ref }}
+        repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
     - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: 3.8
     - name: Install dependencies
@@ -48,7 +56,7 @@ jobs:
         make html
         touch _build/html/.nojekyll
     - name: Deploy to GitHub Pages
-      uses: peaceiris/actions-gh-pages@v3
+      uses: peaceiris/actions-gh-pages@v4
       with:
         github_token: ${{ secrets.GITHUB_TOKEN }}
         publish_dir: docs/_build/html/

diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py
@@ -177,7 +177,7 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):
 
     journal_node = tree.findall(".//journal-title")
     if journal_node is not None:
-        journal = " ".join([j.text for j in journal_node])
+        journal = " ".join(["".join(node.itertext()) for node in journal_node])
     else:
         journal = ""
 
@@ -263,6 +263,15 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):
     return dict_out
 
 
+def get_reference(reference):
+    """Return the first citation child found under `reference`, else None."""
+    # Tags are probed in priority order; the generator is lazy, so find()
+    # stops being called once one of them matches.
+    tags = ("mixed-citation", "element-citation", "citation")
+    found = (reference.find(tag) for tag in tags)
+    return next((node for node in found if node is not None), None)
+
+
 def parse_pubmed_references(path):
     """
     Given path to xml file, parse references articles
@@ -288,15 +297,10 @@ def parse_pubmed_references(path):
     for reference in references:
         ref_id = reference.attrib["id"]
 
-        if reference.find("mixed-citation") is not None:
-            ref = reference.find("mixed-citation")
-        elif reference.find("element-citation") is not None:
-            ref = reference.find("element-citation")
-        else:
-            ref = None
-
+        ref = get_reference(reference)
         if ref is not None:
-            if "publication-type" in ref.attrib.keys() and ref is not None:
+            ref_types = ["citation-type", "publication-type"]
+            if any(ref_type in ref_types for ref_type in ref.attrib.keys()):
                 if ref.attrib.values() is not None:
                     journal_type = ref.attrib.values()[0]
                 else:
@@ -557,7 +561,7 @@ def parse_pubmed_table(path, return_xml=True):
     pmc = dict_article_meta["pmc"]
 
     # parse table
-    tables = tree.xpath(".//body.//sec.//table-wrap")
+    tables = tree.xpath(".//body//table-wrap")
     table_dicts = list()
     for table in tables:
         if table.find("label") is not None:

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py
@@ -39,6 +39,7 @@ def test_parse_pubmed_xml():
     assert parsed_xml.get("doi") == "10.1371/journal.pone.0046493"
     assert parsed_xml.get("subjects") == "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins"  # noqa
     assert "Competing Interests: " in parsed_xml.get("coi_statement")
+    assert parsed_xml.get("journal") == "PLoS ONE"
     assert parsed_xml.get('publication_year') == 2012
     assert parsed_xml.get('publication_date') == '01-01-2012'
     assert parsed_xml.get('epublication_date') == '28-9-2012'
@@ -66,6 +67,18 @@ def test_parse_pubmed_references():
     assert isinstance(references[0], dict)
     assert len(references) == 58, "Expected references to have length of 29"
 
+    references_9539395 = pp.parse_pubmed_references(pubmed_xml_9539395)
+    assert references_9539395[0].get('pmid') == '36094679'
+
+
+def test_parse_pubmed_table():
+    """
+    Check the column names parsed for the first table of the 9539395 fixture
+    """
+    expected_cols = ['Gene', 'Uninfected and untreated', 'Day 7 postinoculation', 'PBS', 'sACE22.v2.4-IgG1']
+    parsed_tables = pp.parse_pubmed_table(pubmed_xml_9539395)
+    assert parsed_tables[0].get('table_columns') == expected_cols
+
 
 def test_parse_pubmed_caption():
     """