From e0b18eeb622b1a3f7cea05d2bb2c2cb1280139f6 Mon Sep 17 00:00:00 2001 From: Michael-E-Rose Date: Fri, 12 Jul 2024 14:31:53 +0000 Subject: [PATCH] deploy: 327403ffd043989076374de9bcd6e02e301f4347 --- _modules/pubmed_parser/pubmed_oa_parser.html | 50 ++++++++++++++++---- api.html | 2 +- searchindex.js | 2 +- 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/_modules/pubmed_parser/pubmed_oa_parser.html b/_modules/pubmed_parser/pubmed_oa_parser.html index 022cfed..65f9279 100644 --- a/_modules/pubmed_parser/pubmed_oa_parser.html +++ b/_modules/pubmed_parser/pubmed_oa_parser.html @@ -165,6 +165,33 @@

Source code for pubmed_parser.pubmed_oa_parser

return dict_article_meta +def parse_date(tree, date_type): + """Parse publication dates based on the provided date type.""" + def get_text(node): + return node.text if node is not None else None + + pub_date_path = f".//pub-date[@pub-type=\"{date_type}\"]" + date_dict = {} + for part in ["year", "month", "day"]: + text = get_text(tree.find(f"{pub_date_path}/{part}")) + if text is not None: + date_dict[part] = text + + return date_dict + + +def format_date(date_dict): + """Format date dictionary to a string in the format day-month-year.""" + day = date_dict.get("day", "01") + month = date_dict.get("month", "01") + year = date_dict.get("year", "") + + if year: + return f"{day}-{month}-{year}" + else: + return f"{day}-{month}" + + def parse_coi_statements(tree): """ Parse conflict of interest statements from given article tree @@ -206,7 +233,7 @@

Source code for pubmed_parser.pubmed_oa_parser

A dictionary contains a following keys from a parsed XML path 'full_title', 'abstract', 'journal', 'pmid', 'pmc', 'doi', 'publisher_id', 'author_list', 'affiliation_list', 'publication_year', - 'publication_date', 'subjects' + 'publication_date', 'epublication_date' ,'subjects' } """ tree = read_xml(path, nxml) @@ -239,12 +266,18 @@

Source code for pubmed_parser.pubmed_oa_parser

journal = "" dict_article_meta = parse_article_meta(tree) - pub_year_node = tree.find(".//pub-date/year") - pub_year = pub_year_node.text if pub_year_node is not None else "" - pub_month_node = tree.find(".//pub-date/month") - pub_month = pub_month_node.text if pub_month_node is not None else "01" - pub_day_node = tree.find(".//pub-date/day") - pub_day = pub_day_node.text if pub_day_node is not None else "01" + + pub_date_dict = parse_date(tree, "ppub") + if "year" not in pub_date_dict: + pub_date_dict = parse_date(tree, "collection") + pub_date = format_date(pub_date_dict) + + try: + pub_year = int(pub_date_dict["year"]) + except TypeError: + pub_year = None + + epub_date = format_date(parse_date(tree, "epub")) subjects_node = tree.findall(".//article-categories//subj-group/subject") subjects = list() @@ -304,7 +337,8 @@

Source code for pubmed_parser.pubmed_oa_parser

"author_list": author_list, "affiliation_list": affiliation_list, "publication_year": pub_year, - "publication_date": "{}-{}-{}".format(pub_day, pub_month, pub_year), + "publication_date": pub_date, + "epublication_date": epub_date, "subjects": subjects, "coi_statement": coi_statement, } diff --git a/api.html b/api.html index a0f0b0a..007c37c 100644 --- a/api.html +++ b/api.html @@ -216,7 +216,7 @@

Return
dict_out: dict

A dictionary contains a following keys from a parsed XML path ‘full_title’, ‘abstract’, ‘journal’, ‘pmid’, ‘pmc’, ‘doi’, ‘publisher_id’, ‘author_list’, ‘affiliation_list’, ‘publication_year’, -‘publication_date’, ‘subjects’

+‘publication_date’, ‘epublication_date’ ,’subjects’

}

diff --git a/searchindex.js b/searchindex.js index 6b687ef..006f947 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["api", "index", "install", "resources", "spark"], "filenames": ["api.rst", "index.rst", "install.rst", "resources.rst", "spark.rst"], "titles": ["API Documentation", "Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset", "Installation", "Resources", "Setting up Pubmed Parser with PySpark"], "terms": {"The": [0, 3], "core": [0, 4], "function": [0, 1, 3], "can": [0, 1, 2, 3, 4], "divid": 0, "3": [0, 4], "main": 0, "part": 0, "base": 0, "sourc": 0, "data": [0, 1, 3, 4], "we": [0, 1, 3, 4], "us": [0, 1, 2, 3, 4], "an": [0, 1, 3], "input": 0, "either": 0, "open": [0, 2, 3], "access": [0, 3, 4], "subset": 0, "eutil": 0, "below": [0, 3, 4], "list": 0, "implement": [0, 4], "parser": 0, "pubmed_pars": [0, 2, 3], "parse_medline_xml": 0, "path": [0, 3, 4], "year_info_onli": 0, "true": [0, 4], "nlm_categori": 0, "fals": 0, "author_list": 0, "reference_list": 0, "parse_downto_mesh_subterm": 0, "file": [0, 1, 3], "format": [0, 1, 3], "avail": [0, 3], "http": [0, 2, 3], "ftp": [0, 3], "ncbi": [0, 3], "nlm": [0, 3], "nih": [0, 3], "gov": [0, 3], "str": 0, "bool": 0, "thi": [0, 1, 2, 3, 4], "tool": [0, 3], "onli": 0, "attempt": 0, "extract": 0, "year": 0, "inform": [0, 1, 3], "pubdat": 0, "made": 0, "harvest": 0, "all": [0, 1, 3], "If": [0, 3], "month": 0, "i": [0, 1, 3], "yield": 0, "date": [0, 1], "form": 0, "yyyi": 0, "mm": 0, "dai": 0, "dd": 0, "note": [0, 4], "resolut": 0, "r": 0, "databas": [0, 3], "vari": 0, "between": 0, "articl": [0, 1], "default": 0, "structur": [0, 1], "abstract": [0, 1], "where": 0, "each": 0, "section": 0, "origin": 0, "label": 0, "assign": 0, "categori": 0, "author": 0, "output": 0, "string": 0, "concaten": 0, "refer": 0, "pmid": [0, 3], "mesh": 0, "term": 0, "subterm": 0, "append": 0, "major": 0, "mesh_term": 0, "iter": 0, "dictionari": [0, 1], "contain": [0, 1], "about": 0, "see": [0, 1, 3, 4], "parse_article_info": 0, "have": [0, 1, 3, 4], "been": 0, "delet": 0, "ad": 0, "other": [0, 1], "than": [0, 4], "field": 0, "being": 0, "article_iter": 0, "pubmed20n0014": 0, "gz": [0, 3], "print": 0, "titl": 0, "parse_grant_id": 0, "pubmed_articl": 0, "grant": 0, "id": [0, 3], "relat": 0, "given": [0, 3], "tree": 0, "element": 0, "lxml": [0, 1], "node": 0, "point": 0, "grant_list": 0, "acknowledg": 0, "public": 0, "entri": 0, "acronym": 0, "countri": 0, "agenc": 0, "parse_pubmed_xml": 0, "include_path": 0, "nxml": 0, "metadata": 0, "you": [0, 1, 2, 3, 4], "check": [0, 3], "pub": [0, 3], "pmc": 0, "download": [0, 1, 4], "A": 0, "pum": 0, "includ": 0, "kei": 0, "path_to_fil": 0, "strip": 0, "namespac": 0, "after": 0, "read": 0, "stackoverflow": 0, "com": [0, 2], "question": 0, "18159221": 0, "remov": 0, "prefix": 0, "python": [0, 4], "dict_out": 0, "dict": 0, "follow": [0, 2, 3], "full_titl": 0, "journal": 0, "doi": 0, "publisher_id": 0, "affiliation_list": 0, "publication_year": 0, "publication_d": 0, "subject": 0, "parse_pubmed_refer": 0, "dict_ref": 0, "parse_pubmed_paragraph": 0, "all_paragraph": 0, "give": [0, 1], "paragraph": 0, "belong": 0, "By": 0, "least": 0, "one": 0, "aviod": 0, "noisi": 0, "text": [0, 1], "boolean": 0, "indic": 0, "want": [0, 3], "dict_par": 0, "its": 0, "reference_id": 0, "which": [0, 1, 2, 3, 4], "rid": 0, "name": 0, "parse_pubmed_capt": [0, 3], "singl": 0, "figur": 0, "caption": [0, 3], "back": 0, "dict_capt": 0, "fig_id": 0, "fig_capt": 0, "": 0, "graphic_ref": 0, "correspond": [0, 3], "bulk": [0, 3], "pone": 0, "0000217": 0, "17299597": 0, "1790863": 0, "fisher": 0, "geometr": 0, "model": 0, "two": 0, "dimension": 0, "phenotyp": 0, "space": 0, "g001": 0, "fig_label": 0, "1": [0, 4], "parse_pubmed_t": 0, "return_xml": 0, "tabl": 0, "table_xml": 0, "table_dict": 0, "full": [0, 1, 3, 4], "parse_xml_web": 0, "sleep": 0, "none": 0, "save_xml": 0, "load": 0, "int": 0, "integ": 0, "how": [0, 1, 3, 4], "long": 0, "wait": 0, "save": 0, "It": [0, 1], "good": 0, "won": 0, "t": 0, "11360989": 0, "molecular": 0, "biologi": 0, "evolut": 0, "gene": 0, "explain": [0, 3], "biolog": 0, "complex": 0, "scienc": 0, "new": 0, "york": 0, "n": 0, "y": 0, "affili": 0, "collegium": 0, "budapest": 0, "institut": 0, "advanc": 0, "studi": 0, "2": [0, 4], "szenth\u00e1roms\u00e1g": 0, "u": 0, "h": 0, "1014": 0, "hungari": 0, "szathmari": 0, "colbud": 0, "hu": 0, "e": [0, 1], "szathm\u00e1ri": 0, "f": 0, "jord\u00e1n": 0, "c": 0, "p\u00e1l": 0, "keyword": 0, "d000818": 0, "anim": 0, "d005075": 0, "10": [0, 4], "1126": 0, "1060852": 0, "2001": 0, "parse_citation_web": 0, "doc_id": 0, "id_typ": 0, "citat": [0, 1], "type": [0, 3], "choic": 0, "central": 0, "n_citat": 0, "number": [0, 1], "pmc_cite": 0, "cite": 0, "6933944": 0, "0": [0, 4], "31624211": 0, "aax1562": 0, "parse_outgoing_citation_web": 0, "url": 0, "entrez": [0, 1], "elink": 0, "fcgi": 0, "dbfrom": 0, "linknam": 0, "pmc_refs_pubm": 0, "221212": 0, "provid": [0, 1, 3], "pmcid": 0, "pmid_cit": 0, "paper": [0, 1], "11": 0, "30705152": 0, "librari": 1, "pars": [1, 3, 4], "oa": 1, "repositori": [1, 2, 4], "easili": 1, "research": 1, "mine": 1, "natur": 1, "languag": 1, "process": [1, 4], "pipelin": 1, "our": 1, "wiki": 1, "page": [1, 2], "document": [1, 2, 3, 4], "submit": 1, "ha": [1, 4], "might": [1, 3, 4], "get": [1, 3], "from": [1, 2, 3, 4], "regular": [1, 2], "pdf": 1, "more": [1, 4], "30m": 1, "biomed": 1, "publish": 1, "until": 1, "now": 1, "compress": 1, "queri": 1, "through": 1, "program": 1, "util": 1, "obtain": 1, "To": [1, 2, 3], "work": 1, "normal": 1, "write": 1, "take": 1, "time": [1, 4], "effort": 1, "aim": 1, "reduc": [1, 4], "those": 1, "develop": 1, "high": 1, "level": 1, "so": 1, "analyz": 1, "fast": 1, "also": 1, "who": 1, "alwai": 1, "keep": 1, "up": 1, "instal": 1, "api": 1, "resourc": 1, "set": 1, "pyspark": 1, "abov": 1, "guidelin": 1, "directli": 2, "pip": 2, "git": 2, "github": 2, "titipata": 2, "clone": 2, "test": [2, 4], "your": [2, 3], "run": 2, "pytest": 2, "cov": 2, "verbos": 2, "build": 2, "chang": 2, "directori": 2, "doc": 2, "folder": [2, 3, 4], "make": 2, "html": 2, "_build": 2, "via": 2, "browser": 2, "need": 2, "sphinx": 2, "galleri": 2, "here": 3, "ar": 3, "some": 3, "xml": [3, 4], "www": 3, "In": [3, 4], "go": 3, "oa_bulk": 3, "tar": 3, "nlmdata": 3, "medleasebaselin": 3, "weekli": 3, "updat": [3, 4], "medleas": 3, "definit": 3, "dtd": 3, "tag": 3, "specif": 3, "figure_id": 3, "manuscript": 3, "imag": 3, "csv": 3, "oa_file_list": 3, "first": 3, "column": 3, "oa_packag": 3, "08": 3, "e0": 3, "pmc13900": 3, "out": 3, "when": [3, 4], "websit": 3, "do": 3, "them": 3, "ip": 3, "ban": 3, "pleas": [3, 4], "scrape": 3, "There": 3, "few": 3, "interest": 3, "kung": 3, "fu": 3, "medic": 3, "medlinexmltojson": 3, "javascript": 3, "put": 4, "small": 4, "snippet": 4, "setup": 4, "spark": 4, "jupyt": 4, "notebook": 4, "workflow": 4, "medlin": 4, "datafram": 4, "25": 4, "million": 4, "less": 4, "minut": 4, "multipl": 4, "processor": 4, "spark_hom": 4, "differ": 4, "import": 4, "o": 4, "findspark": 4, "init": 4, "opt": 4, "bin": 4, "cdh5": 4, "9": 4, "case": 4, "sparkcontext": 4, "parallel": 4, "createdatafram": 4, "sql": 4, "sparksess": 4, "conf": 4, "sparkconf": 4, "setappnam": 4, "map": 4, "setmast": 4, "local": 4, "5": 4, "yarn": 4, "appmasterenv": 4, "pyspark_python": 4, "anaconda3": 4, "pyspark_driver_python": 4, "executor": 4, "memori": 4, "8g": 4, "memoryoverhead": 4, "16g": 4, "codegen": 4, "schedul": 4, "minimum": 4, "alloc": 4, "mb": 4, "500m": 4, "dynamicalloc": 4, "maxexecutor": 4, "driver": 4, "maxresults": 4, "builder": 4, "appnam": 4, "config": 4, "getorcr": 4, "detail": 4, "script": 4, "incorpor": 4, "dask": 4, "soon": 4}, "objects": {"pubmed_parser": [[0, 0, 1, "", "parse_citation_web"], [0, 0, 1, "", "parse_grant_id"], [0, 0, 1, "", "parse_medline_xml"], [0, 0, 1, "", "parse_outgoing_citation_web"], [0, 0, 1, "", "parse_pubmed_caption"], [0, 0, 1, "", "parse_pubmed_paragraph"], [0, 0, 1, "", "parse_pubmed_references"], [0, 0, 1, "", "parse_pubmed_table"], [0, 0, 1, "", "parse_pubmed_xml"], [0, 0, 1, "", "parse_xml_web"]]}, "objtypes": {"0": "py:function"}, "objnames": {"0": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "document": 0, "pars": 0, "medlin": [0, 1, 3], "xml": [0, 1], "paramet": 0, "return": 0, "exampl": 0, "pubm": [0, 1, 3, 4], "oa": [0, 3], "from": 0, "websit": 0, "parser": [1, 3, 4], "A": 1, "python": 1, "open": 1, "access": 1, "subset": 1, "dataset": [1, 3], "about": 1, "content": 1, "question": 1, "contribut": 1, "bug": 1, "instal": 2, "resourc": 3, "link": 3, "download": 3, "figur": 3, "pmc": 3, "copyright": 3, "notic": 3, "altern": 3, "implement": 3, "set": 4, "up": 4, "pyspark": 4}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"API Documentation": [[0, "api-documentation"]], "Parse MEDLINE XML": [[0, "parse-medline-xml"]], "Parameters": [[0, "parameters"], [0, "id1"], [0, "id2"], [0, "id4"], [0, "id6"], [0, "id8"], [0, "id11"], [0, "id13"], [0, "id16"], [0, "id19"]], "Return": [[0, "return"], [0, "id3"], [0, "id5"], [0, "id7"], [0, "id9"], [0, "id12"], [0, "id14"], [0, "id17"], [0, "id20"]], "Examples": [[0, "examples"], [0, "id10"], [0, "id15"], [0, "id18"]], "Returns": [[0, "returns"]], "Parse PubMed OA XML": [[0, "parse-pubmed-oa-xml"]], "Parse from Website": [[0, "parse-from-website"]], "Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset": [[1, "pubmed-parser-a-python-parser-for-pubmed-open-access-xml-subset-and-medline-xml-dataset"]], "About the dataset": [[1, "about-the-dataset"]], "Contents": [[1, "contents"]], "Questions / Contributions / Bugs": [[1, "questions-contributions-bugs"]], "Installation": [[2, "installation"]], "Resources": [[3, "resources"]], "Links to download PubMed OA and MEDLINE dataset": [[3, "links-to-download-pubmed-oa-and-medline-dataset"]], "Download PubMed OA figures": [[3, "download-pubmed-oa-figures"]], "PMC Copyright Notice": [[3, "pmc-copyright-notice"]], "Alternative implementation of MEDLINE parsers": [[3, "alternative-implementation-of-medline-parsers"]], "Setting up Pubmed Parser with PySpark": [[4, "setting-up-pubmed-parser-with-pyspark"]]}, "indexentries": {"parse_citation_web() (in module pubmed_parser)": [[0, "pubmed_parser.parse_citation_web"]], "parse_grant_id() (in module pubmed_parser)": [[0, "pubmed_parser.parse_grant_id"]], "parse_medline_xml() (in module pubmed_parser)": [[0, "pubmed_parser.parse_medline_xml"]], "parse_outgoing_citation_web() (in module pubmed_parser)": [[0, "pubmed_parser.parse_outgoing_citation_web"]], "parse_pubmed_caption() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_caption"]], "parse_pubmed_paragraph() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_paragraph"]], "parse_pubmed_references() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_references"]], "parse_pubmed_table() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_table"]], "parse_pubmed_xml() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_xml"]], "parse_xml_web() (in module pubmed_parser)": [[0, "pubmed_parser.parse_xml_web"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["api", "index", "install", "resources", "spark"], "filenames": ["api.rst", "index.rst", "install.rst", "resources.rst", "spark.rst"], "titles": ["API Documentation", "Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset", "Installation", "Resources", "Setting up Pubmed Parser with PySpark"], "terms": {"The": [0, 3], "core": [0, 4], "function": [0, 1, 3], "can": [0, 1, 2, 3, 4], "divid": 0, "3": [0, 4], "main": 0, "part": 0, "base": 0, "sourc": 0, "data": [0, 1, 3, 4], "we": [0, 1, 3, 4], "us": [0, 1, 2, 3, 4], "an": [0, 1, 3], "input": 0, "either": 0, "open": [0, 2, 3], "access": [0, 3, 4], "subset": 0, "eutil": 0, "below": [0, 3, 4], "list": 0, "implement": [0, 4], "parser": 0, "pubmed_pars": [0, 2, 3], "parse_medline_xml": 0, "path": [0, 3, 4], "year_info_onli": 0, "true": [0, 4], "nlm_categori": 0, "fals": 0, "author_list": 0, "reference_list": 0, "parse_downto_mesh_subterm": 0, "file": [0, 1, 3], "format": [0, 1, 3], "avail": [0, 3], "http": [0, 2, 3], "ftp": [0, 3], "ncbi": [0, 3], "nlm": [0, 3], "nih": [0, 3], "gov": [0, 3], "str": 0, "bool": 0, "thi": [0, 1, 2, 3, 4], "tool": [0, 3], "onli": 0, "attempt": 0, "extract": 0, "year": 0, "inform": [0, 1, 3], "pubdat": 0, "made": 0, "harvest": 0, "all": [0, 1, 3], "If": [0, 3], "month": 0, "i": [0, 1, 3], "yield": 0, "date": [0, 1], "form": 0, "yyyi": 0, "mm": 0, "dai": 0, "dd": 0, "note": [0, 4], "resolut": 0, "r": 0, "databas": [0, 3], "vari": 0, "between": 0, "articl": [0, 1], "default": 0, "structur": [0, 1], "abstract": [0, 1], "where": 0, "each": 0, "section": 0, "origin": 0, "label": 0, "assign": 0, "categori": 0, "author": 0, "output": 0, "string": 0, "concaten": 0, "refer": 0, "pmid": [0, 3], "mesh": 0, "term": 0, "subterm": 0, "append": 0, "major": 0, "mesh_term": 0, "iter": 0, "dictionari": [0, 1], "contain": [0, 1], "about": 0, "see": [0, 1, 3, 4], "parse_article_info": 0, "have": [0, 1, 3, 4], "been": 0, "delet": 0, "ad": 0, "other": [0, 1], "than": [0, 4], "field": 0, "being": 0, "article_iter": 0, "pubmed20n0014": 0, "gz": [0, 3], "print": 0, "titl": 0, "parse_grant_id": 0, "pubmed_articl": 0, "grant": 0, "id": [0, 3], "relat": 0, "given": [0, 3], "tree": 0, "element": 0, "lxml": [0, 1], "node": 0, "point": 0, "grant_list": 0, "acknowledg": 0, "public": 0, "entri": 0, "acronym": 0, "countri": 0, "agenc": 0, "parse_pubmed_xml": 0, "include_path": 0, "nxml": 0, "metadata": 0, "you": [0, 1, 2, 3, 4], "check": [0, 3], "pub": [0, 3], "pmc": 0, "download": [0, 1, 4], "A": 0, "pum": 0, "includ": 0, "kei": 0, "path_to_fil": 0, "strip": 0, "namespac": 0, "after": 0, "read": 0, "stackoverflow": 0, "com": [0, 2], "question": 0, "18159221": 0, "remov": 0, "prefix": 0, "python": [0, 4], "dict_out": 0, "dict": 0, "follow": [0, 2, 3], "full_titl": 0, "journal": 0, "doi": 0, "publisher_id": 0, "affiliation_list": 0, "publication_year": 0, "publication_d": 0, "epublication_d": 0, "subject": 0, "parse_pubmed_refer": 0, "dict_ref": 0, "parse_pubmed_paragraph": 0, "all_paragraph": 0, "give": [0, 1], "paragraph": 0, "belong": 0, "By": 0, "least": 0, "one": 0, "aviod": 0, "noisi": 0, "text": [0, 1], "boolean": 0, "indic": 0, "want": [0, 3], "dict_par": 0, "its": 0, "reference_id": 0, "which": [0, 1, 2, 3, 4], "rid": 0, "name": 0, "parse_pubmed_capt": [0, 3], "singl": 0, "figur": 0, "caption": [0, 3], "back": 0, "dict_capt": 0, "fig_id": 0, "fig_capt": 0, "": 0, "graphic_ref": 0, "correspond": [0, 3], "bulk": [0, 3], "pone": 0, "0000217": 0, "17299597": 0, "1790863": 0, "fisher": 0, "geometr": 0, "model": 0, "two": 0, "dimension": 0, "phenotyp": 0, "space": 0, "g001": 0, "fig_label": 0, "1": [0, 4], "parse_pubmed_t": 0, "return_xml": 0, "tabl": 0, "table_xml": 0, "table_dict": 0, "full": [0, 1, 3, 4], "parse_xml_web": 0, "sleep": 0, "none": 0, "save_xml": 0, "load": 0, "int": 0, "integ": 0, "how": [0, 1, 3, 4], "long": 0, "wait": 0, "save": 0, "It": [0, 1], "good": 0, "won": 0, "t": 0, "11360989": 0, "molecular": 0, "biologi": 0, "evolut": 0, "gene": 0, "explain": [0, 3], "biolog": 0, "complex": 0, "scienc": 0, "new": 0, "york": 0, "n": 0, "y": 0, "affili": 0, "collegium": 0, "budapest": 0, "institut": 0, "advanc": 0, "studi": 0, "2": [0, 4], "szenth\u00e1roms\u00e1g": 0, "u": 0, "h": 0, "1014": 0, "hungari": 0, "szathmari": 0, "colbud": 0, "hu": 0, "e": [0, 1], "szathm\u00e1ri": 0, "f": 0, "jord\u00e1n": 0, "c": 0, "p\u00e1l": 0, "keyword": 0, "d000818": 0, "anim": 0, "d005075": 0, "10": [0, 4], "1126": 0, "1060852": 0, "2001": 0, "parse_citation_web": 0, "doc_id": 0, "id_typ": 0, "citat": [0, 1], "type": [0, 3], "choic": 0, "central": 0, "n_citat": 0, "number": [0, 1], "pmc_cite": 0, "cite": 0, "6933944": 0, "0": [0, 4], "31624211": 0, "aax1562": 0, "parse_outgoing_citation_web": 0, "url": 0, "entrez": [0, 1], "elink": 0, "fcgi": 0, "dbfrom": 0, "linknam": 0, "pmc_refs_pubm": 0, "221212": 0, "provid": [0, 1, 3], "pmcid": 0, "pmid_cit": 0, "paper": [0, 1], "11": 0, "30705152": 0, "librari": 1, "pars": [1, 3, 4], "oa": 1, "repositori": [1, 2, 4], "easili": 1, "research": 1, "mine": 1, "natur": 1, "languag": 1, "process": [1, 4], "pipelin": 1, "our": 1, "wiki": 1, "page": [1, 2], "document": [1, 2, 3, 4], "submit": 1, "ha": [1, 4], "might": [1, 3, 4], "get": [1, 3], "from": [1, 2, 3, 4], "regular": [1, 2], "pdf": 1, "more": [1, 4], "30m": 1, "biomed": 1, "publish": 1, "until": 1, "now": 1, "compress": 1, "queri": 1, "through": 1, "program": 1, "util": 1, "obtain": 1, "To": [1, 2, 3], "work": 1, "normal": 1, "write": 1, "take": 1, "time": [1, 4], "effort": 1, "aim": 1, "reduc": [1, 4], "those": 1, "develop": 1, "high": 1, "level": 1, "so": 1, "analyz": 1, "fast": 1, "also": 1, "who": 1, "alwai": 1, "keep": 1, "up": 1, "instal": 1, "api": 1, "resourc": 1, "set": 1, "pyspark": 1, "abov": 1, "guidelin": 1, "directli": 2, "pip": 2, "git": 2, "github": 2, "titipata": 2, "clone": 2, "test": [2, 4], "your": [2, 3], "run": 2, "pytest": 2, "cov": 2, "verbos": 2, "build": 2, "chang": 2, "directori": 2, "doc": 2, "folder": [2, 3, 4], "make": 2, "html": 2, "_build": 2, "via": 2, "browser": 2, "need": 2, "sphinx": 2, "galleri": 2, "here": 3, "ar": 3, "some": 3, "xml": [3, 4], "www": 3, "In": [3, 4], "go": 3, "oa_bulk": 3, "tar": 3, "nlmdata": 3, "medleasebaselin": 3, "weekli": 3, "updat": [3, 4], "medleas": 3, "definit": 3, "dtd": 3, "tag": 3, "specif": 3, "figure_id": 3, "manuscript": 3, "imag": 3, "csv": 3, "oa_file_list": 3, "first": 3, "column": 3, "oa_packag": 3, "08": 3, "e0": 3, "pmc13900": 3, "out": 3, "when": [3, 4], "websit": 3, "do": 3, "them": 3, "ip": 3, "ban": 3, "pleas": [3, 4], "scrape": 3, "There": 3, "few": 3, "interest": 3, "kung": 3, "fu": 3, "medic": 3, "medlinexmltojson": 3, "javascript": 3, "put": 4, "small": 4, "snippet": 4, "setup": 4, "spark": 4, "jupyt": 4, "notebook": 4, "workflow": 4, "medlin": 4, "datafram": 4, "25": 4, "million": 4, "less": 4, "minut": 4, "multipl": 4, "processor": 4, "spark_hom": 4, "differ": 4, "import": 4, "o": 4, "findspark": 4, "init": 4, "opt": 4, "bin": 4, "cdh5": 4, "9": 4, "case": 4, "sparkcontext": 4, "parallel": 4, "createdatafram": 4, "sql": 4, "sparksess": 4, "conf": 4, "sparkconf": 4, "setappnam": 4, "map": 4, "setmast": 4, "local": 4, "5": 4, "yarn": 4, "appmasterenv": 4, "pyspark_python": 4, "anaconda3": 4, "pyspark_driver_python": 4, "executor": 4, "memori": 4, "8g": 4, "memoryoverhead": 4, "16g": 4, "codegen": 4, "schedul": 4, "minimum": 4, "alloc": 4, "mb": 4, "500m": 4, "dynamicalloc": 4, "maxexecutor": 4, "driver": 4, "maxresults": 4, "builder": 4, "appnam": 4, "config": 4, "getorcr": 4, "detail": 4, "script": 4, "incorpor": 4, "dask": 4, "soon": 4}, "objects": {"pubmed_parser": [[0, 0, 1, "", "parse_citation_web"], [0, 0, 1, "", "parse_grant_id"], [0, 0, 1, "", "parse_medline_xml"], [0, 0, 1, "", "parse_outgoing_citation_web"], [0, 0, 1, "", "parse_pubmed_caption"], [0, 0, 1, "", "parse_pubmed_paragraph"], [0, 0, 1, "", "parse_pubmed_references"], [0, 0, 1, "", "parse_pubmed_table"], [0, 0, 1, "", "parse_pubmed_xml"], [0, 0, 1, "", "parse_xml_web"]]}, "objtypes": {"0": "py:function"}, "objnames": {"0": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "document": 0, "pars": 0, "medlin": [0, 1, 3], "xml": [0, 1], "paramet": 0, "return": 0, "exampl": 0, "pubm": [0, 1, 3, 4], "oa": [0, 3], "from": 0, "websit": 0, "parser": [1, 3, 4], "A": 1, "python": 1, "open": 1, "access": 1, "subset": 1, "dataset": [1, 3], "about": 1, "content": 1, "question": 1, "contribut": 1, "bug": 1, "instal": 2, "resourc": 3, "link": 3, "download": 3, "figur": 3, "pmc": 3, "copyright": 3, "notic": 3, "altern": 3, "implement": 3, "set": 4, "up": 4, "pyspark": 4}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"API Documentation": [[0, "api-documentation"]], "Parse MEDLINE XML": [[0, "parse-medline-xml"]], "Parameters": [[0, "parameters"], [0, "id1"], [0, "id2"], [0, "id4"], [0, "id6"], [0, "id8"], [0, "id11"], [0, "id13"], [0, "id16"], [0, "id19"]], "Return": [[0, "return"], [0, "id3"], [0, "id5"], [0, "id7"], [0, "id9"], [0, "id12"], [0, "id14"], [0, "id17"], [0, "id20"]], "Examples": [[0, "examples"], [0, "id10"], [0, "id15"], [0, "id18"]], "Returns": [[0, "returns"]], "Parse PubMed OA XML": [[0, "parse-pubmed-oa-xml"]], "Parse from Website": [[0, "parse-from-website"]], "Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset": [[1, "pubmed-parser-a-python-parser-for-pubmed-open-access-xml-subset-and-medline-xml-dataset"]], "About the dataset": [[1, "about-the-dataset"]], "Contents": [[1, "contents"]], "Questions / Contributions / Bugs": [[1, "questions-contributions-bugs"]], "Installation": [[2, "installation"]], "Resources": [[3, "resources"]], "Links to download PubMed OA and MEDLINE dataset": [[3, "links-to-download-pubmed-oa-and-medline-dataset"]], "Download PubMed OA figures": [[3, "download-pubmed-oa-figures"]], "PMC Copyright Notice": [[3, "pmc-copyright-notice"]], "Alternative implementation of MEDLINE parsers": [[3, "alternative-implementation-of-medline-parsers"]], "Setting up Pubmed Parser with PySpark": [[4, "setting-up-pubmed-parser-with-pyspark"]]}, "indexentries": {"parse_citation_web() (in module pubmed_parser)": [[0, "pubmed_parser.parse_citation_web"]], "parse_grant_id() (in module pubmed_parser)": [[0, "pubmed_parser.parse_grant_id"]], "parse_medline_xml() (in module pubmed_parser)": [[0, "pubmed_parser.parse_medline_xml"]], "parse_outgoing_citation_web() (in module pubmed_parser)": [[0, "pubmed_parser.parse_outgoing_citation_web"]], "parse_pubmed_caption() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_caption"]], "parse_pubmed_paragraph() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_paragraph"]], "parse_pubmed_references() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_references"]], "parse_pubmed_table() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_table"]], "parse_pubmed_xml() (in module pubmed_parser)": [[0, "pubmed_parser.parse_pubmed_xml"]], "parse_xml_web() (in module pubmed_parser)": [[0, "pubmed_parser.parse_xml_web"]]}}) \ No newline at end of file