studium_csv_processor.py
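"""Convert the Studium source CSV into the semicolon-separated import format checked by
output_csv_validator.

A sketch of the pipeline, as implemented below: match each DOI to a file on disk, repair
the year/volume/issue columns, split the authors into per-author name columns, scrape the
publication date and abstract from each article's web page, and write the validated result.
"""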
import argparse
import os
import re
import pandas
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
from output_csv_validator import validate_csv


def find_file(doi: str, file_path: str) -> str:
    """Return the single file under file_path whose name starts with the DOI's last segment."""
    found_files = []
    file_prefix = doi.split(".")[-1]
    for root, _dirs, files in os.walk(file_path):
        for file in files:
            if file.startswith(file_prefix):
                # Join against root rather than file_path so matches in subdirectories resolve correctly.
                found_files.append(os.path.join(root, file))
    if len(found_files) != 1:
        raise RuntimeError(f"Found {len(found_files)} files, expected exactly 1")
    return found_files[0]
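
# Illustration (hypothetical DOI): "10.1234/studium.5678".split(".")[-1] == "5678",
# so a file named "5678.pdf" anywhere under files_path would match.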


def fix_year_volume_and_issue(csv: DataFrame):
    """Repair rows whose "issue" field embeds the year as "<issue>-<year>"."""
    different_issue_year = re.compile(r"^\d-\d{4}$")
    year_volume = {}  # volume seen for each year ("jaar" is Dutch for "year")
    for index, row in csv.iterrows():
        year = row["jaar"]
        if year not in year_volume:
            year_volume[year] = row["volume"]
        issue = row["issue"]
        if different_issue_year.match(issue):
            # Move each part of the combined value to its own column and restore the
            # volume that was recorded for the embedded year.
            year_issue = issue.split("-")
            csv.at[index, "jaar"] = int(year_issue[-1])
            csv.at[index, "volume"] = year_volume[int(year_issue[-1])]
            csv.at[index, "issue"] = year_issue[0]
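
# Illustration: a row with issue "1-2004" becomes issue "1", year 2004, and takes the
# volume recorded for the first row whose "jaar" was 2004.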


def process_authors(data: DataFrame):
    """Expand the " | "-separated "authors" column into author_given_name_<i> and author_family_name_<i> columns."""
    authors_columns = {}
    authors_lists = data["authors"].map(lambda authors: authors.split(" | ")).tolist()
    # Every row gets columns for as many authors as the longest list; shorter rows are padded with "".
    max_authors = max(len(authors) for authors in authors_lists)
    given_name = "author_given_name_"
    family_name = "author_family_name_"
    for i in range(max_authors):
        authors_columns[given_name + str(i)] = []
        authors_columns[family_name + str(i)] = []
    for row in authors_lists:
        for index in range(max_authors):
            first_name = ""
            surname = ""
            if index < len(row):
                first_name, surname = split_author_names(row[index])
            authors_columns[given_name + str(index)].append(first_name)
            authors_columns[family_name + str(index)].append(surname)
    for i in range(max_authors):
        column_given_name = given_name + str(i)
        data[column_given_name] = authors_columns[column_given_name]
        column_family_name = family_name + str(i)
        data[column_family_name] = authors_columns[column_family_name]


def split_author_names(name_string: str):
    """Split a full name into (given name, surname), honouring Dutch surname prefixes."""
    first_name = ""
    surname = ""
    author_names = name_string.strip(" ").split(" ")
    if len(author_names) == 1:
        first_name = author_names[0]
    elif len(author_names) == 2:
        first_name = author_names[0]
        surname = author_names[1]
    else:
        # The surname starts at the first Dutch prefix, if one is present.
        prefixes = ["van", "de", "den", "der", "ten", "ter"]
        normalized_names = [name.casefold() for name in author_names]
        prefix_positions = [normalized_names.index(prefix) for prefix in prefixes if prefix in normalized_names]
        lowest_prefix_index = min(prefix_positions, default=1000)
        start_of_surname = -1  # without a prefix, fall back to the last word
        if lowest_prefix_index < len(author_names):
            start_of_surname = lowest_prefix_index
        # A hyphen (or hyphenated word) just before the surname belongs to it as well.
        if author_names[start_of_surname - 1] == "-":
            start_of_surname = start_of_surname - 2
        elif "-" in author_names[start_of_surname - 1]:
            start_of_surname = start_of_surname - 1
        if name_string.casefold() == "van de redactie":
            # "Van de redactie" ("From the editors") is not a personal name; keep it whole.
            first_name = name_string
        else:
            first_name = " ".join(author_names[0:start_of_surname])
            surname = " ".join(author_names[start_of_surname:])
    return first_name, surname
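
# Illustrative splits (hypothetical names): "Erasmus" -> ("Erasmus", ""),
# "Jan Smit" -> ("Jan", "Smit"), "Jan van der Berg" -> ("Jan", "van der Berg").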


def add_publication(csv: DataFrame):
    """Add a "publication" column of the form "<year> / <issue>"."""
    publications = []
    for index, row in csv.iterrows():
        # Single quotes inside the f-string: nesting the same quote type is a SyntaxError before Python 3.12.
        publications.append(f"{row['jaar']} / {row['issue']}")
    csv["publication"] = publications


def process_web_page_data(link: str):
    """Fetch an article's web page and return (publication date, abstract text)."""
    page = requests.get(link)
    page.raise_for_status()  # fail early on a dead link instead of parsing an error page
    soup = BeautifulSoup(page.text, "html.parser")
    abstract_text = ""
    # On these pages the abstract sits in one of the divs with class "authors".
    possible_abstracts = soup.find_all("div", {"class": "authors"})
    for abstract in possible_abstracts:
        if "Abstract" in abstract.text:
            abstract_text = abstract.text.replace("Abstract", "").strip()
    publication_date = soup.find("meta", {"name": "citation_publication_date"})
    return publication_date.attrs["content"], abstract_text


def language_to_locale(language):
    if language in ("dut", "nld", "nl; en"):
        return "nl"
    if language == "fr":
        return "fr_FR"
    return language
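
# E.g. "dut" -> "nl" and "fr" -> "fr_FR"; any other value passes through unchanged.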


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_csv", type=str, required=True)
    parser.add_argument("--output_csv", type=str, required=True)
    parser.add_argument("--files_path", help="folder that contains the files mentioned in the input_csv",
                        type=str, required=True)
    args = parser.parse_args()
    files_path = args.files_path
    csv = pandas.read_csv(args.input_csv)
    # Resolve each DOI to its file on disk, then clean up year/volume/issue and the author columns.
    csv["file"] = csv["doi"].map(lambda doi: find_file(doi, files_path))
    fix_year_volume_and_issue(csv)
    process_authors(csv)
    csv["id"] = csv.index
    # Every article lands in the same section.
    csv = csv.assign(section_title="Artikelen", section_policy="Standaard", section_reference="ART")
    add_publication(csv)
    # Scrape the publication date and abstract from each article's web page.
    csv["publication_date"], csv["abstract"] = zip(*csv["link"].map(process_web_page_data))
    csv["locale"] = csv["language"].map(language_to_locale)
    # Move "id" to the front and translate the Dutch headers ("jaar" = year, "titel" = title).
    csv = csv[["id"] + [col for col in csv.columns if col != "id"]]
    csv = csv.rename(columns={"jaar": "year", "pages": "page_number", "titel": "title"})
    csv = csv.sort_values(["year", "issue", "doi"])
    csv = csv.drop(["doi", "link", "pdf", "xml", "authors", "keywords"], axis=1)
    validate_csv(csv)
    csv.to_csv(args.output_csv, sep=";", index=False)
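
# Example invocation (illustrative paths):
#   python studium_csv_processor.py --input_csv studium.csv --output_csv studium_import.csv --files_path ./files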