-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathxml-reader.py
98 lines (81 loc) · 3.53 KB
/
xml-reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import xml.etree.ElementTree as ET
import os.path
def read_xml_file(file_path):
    """Parse the XML file at *file_path* and return the resulting tree.

    Prints the tag of the root element on success. When *file_path* does
    not point to an existing regular file, an error message is printed and
    ``None`` is returned instead of a tree.
    """
    # Guard clause: bail out early when there is nothing to parse.
    if not os.path.isfile(file_path):
        print("xml-reader: Read file error (file not exist)")
        return None

    parsed_tree = ET.parse(file_path)
    root_tag = parsed_tree.getroot().tag
    print("xml-reader: Detect root element tag: {}".format(root_tag))
    return parsed_tree
def full_text_search_in_xml(xml_tree, *args):
    """Collect the entries of the requested document types from a parsed tree.

    Args:
        xml_tree: Parsed tree object exposing ``getroot()``. ``None`` is
            tolerated and handled like a tree without a usable root.
        *args: Document types to retrieve. Accepted values are
            ``'PubmedBookArticle'`` and ``'PubmedArticle'``. Leave empty to
            retrieve every type. Any other value is reported with an error
            message and skipped.

    Returns:
        A list with the entries produced by ``retrieval_pubmed_book_article``
        and/or ``retrieval_pubmed_article``, in the order the types were
        requested (book articles first when no type is given). An empty list
        when the tree has no usable root element.
    """
    # A missing tree is treated the same as a tree whose root is unusable,
    # so callers get the "setting fail" path instead of an AttributeError.
    documents = xml_tree.getroot() if xml_tree is not None else None
    if not ET.iselement(documents):
        print("setting fail")
        return []

    print("xml-reader: total documents: {}".format(len(documents)))
    print("xml-reader: correctly set")

    collection_list = []
    if not args:
        # No explicit type requested: retrieve everything.
        collection_list.extend(retrieval_pubmed_book_article(documents))
        collection_list.extend(retrieval_pubmed_article(documents))
        return collection_list

    for arg in args:
        if arg == 'PubmedBookArticle':
            collection_list.extend(retrieval_pubmed_book_article(documents))
        elif arg == 'PubmedArticle':
            # ``elif`` (not ``if``) so that a valid 'PubmedBookArticle'
            # argument no longer falls through to the error message below.
            collection_list.extend(retrieval_pubmed_article(documents))
        else:
            print('xml-reader: arg. error')
    return collection_list
def retrieval_pubmed_book_article(documents):
    """Extract ``[title, abstract]`` pairs for PubmedBookArticle type data.

    Args:
        documents: Root element whose ``PubmedBookArticle/BookDocument``
            descendants are scanned.

    Returns:
        A list with one ``[ArticleTitle text, AbstractText text]`` list per
        ``BookDocument``. A slot is ``None`` when the corresponding element
        is missing (or present but without text) instead of raising.
    """
    collection_list = []
    for book_document in documents.iterfind('PubmedBookArticle/BookDocument'):
        title = book_document.find('ArticleTitle')
        # Only the first Abstract/AbstractText element is used.
        abstract = book_document.find('Abstract/AbstractText')
        # find() returns None for an absent element; guard before reading
        # .text so an incomplete record does not abort the whole retrieval.
        # NOTE(review): Element.text holds only the text before the first
        # child element, so inline markup inside a title/abstract would cut
        # the value short - confirm whether itertext() is wanted here.
        collection_list.append([
            title.text if title is not None else None,
            abstract.text if abstract is not None else None,
        ])
    return collection_list
def retrieval_pubmed_article(documents):
    """Extract ``[title, abstract]`` pairs for PubmedArticle type data.

    Args:
        documents: Root element whose
            ``PubmedArticle/MedlineCitation/Article`` descendants are scanned.

    Returns:
        A list with one ``[ArticleTitle text, abstract]`` list per
        ``Article``. The title is ``None`` when the element is missing (or
        has no text). The abstract is the text of every
        ``Abstract/AbstractText`` element, each followed by a single space
        (so a non-empty abstract ends with a space); it is ``''`` when the
        article has no abstract. Sections without text are skipped.
    """
    collection_list = []
    for article in documents.iterfind('PubmedArticle/MedlineCitation/Article'):
        title = article.find('ArticleTitle')
        # find() returns None for an absent element; guard before reading
        # .text so an incomplete record does not abort the whole retrieval.
        title_text = title.text if title is not None else None
        # An abstract may be split into several AbstractText sections.
        # Element.text is None for a section without leading text, which
        # cannot be concatenated with a string, so such sections are skipped.
        # NOTE(review): Element.text holds only the text before the first
        # child element, so inline markup would cut a section short -
        # confirm whether itertext() is wanted here.
        abstract = ''.join(
            section.text + ' '
            for section in article.findall('Abstract/AbstractText')
            if section.text is not None
        )
        collection_list.append([title_text, abstract])
    return collection_list
def pubmed_xml_parser(file_path, *args):
    """Read the XML file at *file_path* and return its collected entries.

    Args:
        file_path: Path of the XML file, handed to ``read_xml_file``.
        *args: Document types to retrieve, forwarded unchanged to
            ``full_text_search_in_xml``.

    Returns:
        The list returned by ``full_text_search_in_xml``, or an empty list
        when ``read_xml_file`` could not provide a tree.
    """
    get_tree = read_xml_file(file_path)
    if get_tree is None:
        # read_xml_file returns None (after printing its own message) when
        # the file does not exist; stop here rather than passing None on to
        # the search, which would fail on ``.getroot()``.
        return []
    return full_text_search_in_xml(get_tree, *args)
if __name__ == "__main__":
    # Debug entry point: parse the sample file in the working directory and
    # print how many entries were collected.
    # A document type such as 'PubmedBookArticle' can be passed as an extra
    # argument to pubmed_xml_parser to restrict the retrieval to that type.
    result = pubmed_xml_parser('./pubmed_result.xml')
    print("!!")
    print(len(result))