## Daily VU harvester
#
# Harvests today's publications for the VU via ScopusSearch,
# enriches and cleans the results,
# then pushes them to the database with incremented primary keys.
#
# warning: the code uses 'today', so it has to run before midnight
# (a hedged sketch of a guard for this is included right below)
#
# warning: the harvester code should be refactored and integrated with oadash
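# ------------------------------------------------------------------
# Illustrative only: a minimal sketch of a run-time guard for the
# midnight warning above. The cutoff hour and the helper name are
# assumptions, not part of the original script; the function is
# defined here but never called.
# ------------------------------------------------------------------
def _warn_if_close_to_midnight(cutoff_hour=23):
    """Print a warning when the script starts close to midnight.

    Running after midnight would make 'today' refer to the wrong day.
    """
    from datetime import datetime
    now = datetime.now()
    if now.hour >= cutoff_hour:
        print('warning: running at ' + now.strftime('%H:%M')
              + ', results for "today" may be incomplete or wrong')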
## imports
from static import PATH_START, PATH_START_PERSONAL
from static import PATH_START_SERVER , PATH_START_PERSONAL_SERVER
import pandas as pd
from pybliometrics.scopus import ScopusSearch
# imports from our own helper modules
from nlp_functions import faculty_finder
from nlp_functions import corresponding_author_functions
#
from core_functions import add_year_and_month
from core_functions import get_scopus_abstract_info
from core_functions import get_first_chosen_affiliation_author  # ! note: ideally we want all VU authors, not only the first
from core_functions import add_unpaywall_columns
from core_functions import add_deal_info
from core_functions import fn_cats, get_today, get_today_for_pubdatetxt
from database_functions import get_last_primary_key, push_df_to_db, get_connection, \
process_df_to_list_to_push, pre_process_for_push
## settings
running_on_server = False
#
# paths
if running_on_server:
#df_pass = pd.read_csv(r'C:\Users\yasing\Desktop\git\password_mydb.csv')
#path_deals = 'C:/Users/yasing/Desktop/oa oktober/apcdeals.csv' #check
#path_isn = 'C:/Users/yasing/Desktop/oa oktober/ISN_ISSN.csv' #check
#path_org = 'C:/Users/yasing/Desktop/oa oktober/vu_organogram_2.xlsx' #check
#path_out = 'C:/Users/yasing/Desktop/oa oktober/' #check
#path_vsnu_afids = 'C:/Users/yasing/Desktop/oa oktober/afids_vsnu_nonfin.csv' #check
df_pass = pd.read_csv(PATH_START_PERSONAL_SERVER + r'\password_mydb.csv')
path_deals = PATH_START_SERVER + r'\raw data algemeen\apcdeals.csv'
path_isn = PATH_START_SERVER + r'\raw data algemeen\ISN_ISSN.csv'
path_org = PATH_START_SERVER + r'\raw data algemeen\vu_organogram_2.xlsx'
path_out = PATH_START_PERSONAL_SERVER + '/out_daily/' # no r
path_vsnu_afids = PATH_START_SERVER + r'\raw data algemeen\afids_vsnu_nonfin.csv'
else:
df_pass = pd.read_csv(PATH_START_PERSONAL + r'\password_mydb.csv')
path_deals = PATH_START + r'\raw data algemeen\apcdeals.csv'
path_isn = PATH_START + r'\raw data algemeen\ISN_ISSN.csv'
path_org = PATH_START + r'\raw data algemeen\vu_organogram_2.xlsx'
path_out = PATH_START_PERSONAL + '/out_daily/' # no r
path_vsnu_afids = PATH_START + r'\raw data algemeen\afids_vsnu_nonfin.csv'
#
# read in credentials (distribute individual MySQL accounts at some point)
user = df_pass.user[0]
pw = df_pass.pw[0]
host = df_pass.host[0]
database = df_pass.database[0]
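# ------------------------------------------------------------------
# Note (assumption, inferred from the column accesses above): the
# credentials CSV is expected to have one row with at least these
# columns; the values below are placeholders only:
#
#   user,pw,host,database
#   my_user,my_password,localhost,my_database
# ------------------------------------------------------------------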
#
chosen_affid = ["60008734", "60029124", "60012443", "60109852", "60026698", "60013779", "60032886", "60000614",
                "60030550", "60013243", "60026220", "60001997"]  # 60001997 was added last, which brings in VUMC
# corresponding author
vu_afids = chosen_affid
# this is the VSNU list without phtu and the like (borrowed from VSNU-SDG-data); it should approximate the UKB list. good enough for now, update later.
all_vsnu_sdg_afids = pd.read_csv(path_vsnu_afids).iloc[:,1].astype('str').to_list()
#
# harvester
# warning: the code uses 'today', so it has to run before midnight
#
# harvest everything published today by VU+VUMC via ScopusSearch
# because the API handles a direct 'today' query poorly, we query the entire month and isolate today afterwards
#
# prepare query
VU_with_VUMC_affid = "( AF-ID(60001997) OR AF-ID(60008734) OR AF-ID(60029124) OR AF-ID(60012443) OR AF-ID(60109852) OR AF-ID(60026698) OR AF-ID(60013779) OR AF-ID(60032886) OR AF-ID(60000614) OR AF-ID(60030550) OR AF-ID(60013243) OR AF-ID(60026220))"
my_query = VU_with_VUMC_affid + ' AND ' + "PUBDATETXT( " + get_today_for_pubdatetxt() + " )" # RECENT(1) is somehow very slow
print(my_query)
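# ------------------------------------------------------------------
# Illustrative only: get_today_for_pubdatetxt() comes from
# core_functions; the hypothetical helper below merely shows the
# assumed output format (Scopus PUBDATETXT matches textual dates such
# as "January 2020"). It is not called anywhere in this script.
# ------------------------------------------------------------------
def _example_pubdatetxt_for_today():
    """Return the current month in the 'Month YYYY' form assumed above."""
    from datetime import date
    return date.today().strftime('%B %Y')  # e.g. 'October 2019'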
#
# call the ScopusSearch API
s = ScopusSearch(my_query, refresh=True, download=True)
df = pd.DataFrame(s.results)
#
# filter to records of today
today = get_today()
df = df[df.coverDate == today]
#
# here is the result (the dataframe may be empty on some days)
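# ------------------------------------------------------------------
# Note (assumption): for the filter above to match anything, get_today()
# from core_functions must return the date in the same 'YYYY-MM-DD'
# string format that Scopus uses for coverDate, e.g.:
#
#   from datetime import date
#   today = date.today().strftime('%Y-%m-%d')   # e.g. '2019-10-24'
# ------------------------------------------------------------------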
# pre-processing
# we need to add extra sources, clean the data, rename columns and make it ready for the push
# this is a static copy of that processing; it should be migrated into the PyCharm code base and called from here
# Add info on year and month
df = add_year_and_month(df, 'coverDate') # add info columns
# prepare the faculty_finder NLP tool
org_info = pd.read_excel(path_org, skiprows=0)
ff = faculty_finder(organizational_chart=org_info)
# Per EID: get the Scopus abstract info, find the first author with a chosen
# affiliation, and use the NLP tool to match a faculty
# initialize: collect one dict per EID and build the dataframes afterwards
# (DataFrame.append was removed in pandas 2.0, so we use lists of dicts instead)
ab_rows = []
au_rows = []
ff_rows = []
eids = df.eid.tolist()
for counter, cur_eid in enumerate(eids):
    print('getting abstract info for ' + str(counter + 1) + ' out of ' + str(len(eids)))
    # get abstract
    dict_ab_info = get_scopus_abstract_info(cur_eid)  # !
    dict_ab_info['eid'] = cur_eid
    # get first chosen affiliation author
    dict_auth_info = get_first_chosen_affiliation_author(dict_ab_info['abstract_object'], chosen_affid)
    dict_auth_info['eid'] = cur_eid
    # get faculty
    if dict_auth_info['first_affil_author_has_error']:
        print('no chosen affid author found at EID: ' + str(cur_eid))
        dict_ff = ff.match_nan()
    else:
        dict_ff = ff.match(dict_auth_info['first_affil_author_org'])
    dict_ff['eid'] = cur_eid
    ab_rows.append(dict_ab_info)
    au_rows.append(dict_auth_info)
    ff_rows.append(dict_ff)
# build the per-EID dataframes in one go
df_ab = pd.DataFrame(ab_rows)
df_au = pd.DataFrame(au_rows)
df_ff = pd.DataFrame(ff_rows)
df = df.merge(df_ab, on='eid', how='left')
df = df.merge(df_au, on='eid', how='left')
df = df.merge(df_ff, on='eid', how='left')
# add unpaywall info
df = add_unpaywall_columns(df, silent=False) # !
# add deal info
df = add_deal_info(path_deals=path_deals, path_isn=path_isn, df_b=df)
# add corresponding author info
df = (corresponding_author_functions()
      .add_corresponding_author_info(df=df,
                                     vu_afids=vu_afids,
                                     ukb_afids=all_vsnu_sdg_afids))
# further processing
df['upw_oa_color_category'] = df.upw_oa_color.apply(fn_cats)
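# ------------------------------------------------------------------
# Illustrative only: fn_cats is imported from core_functions; the
# hypothetical sketch below only shows the kind of mapping assumed
# here (an unpaywall OA colour to a coarse category), not the real
# implementation. It is not called anywhere in this script.
# ------------------------------------------------------------------
def _example_oa_colour_to_category(oa_colour):
    """Map an unpaywall OA colour to a coarse category (assumed behaviour)."""
    known = {'gold': 'gold', 'hybrid': 'hybrid', 'bronze': 'bronze', 'green': 'green'}
    return known.get(oa_colour, 'closed')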
## get connection to db
connection = get_connection(host=host,
                            database=database,
                            user=user,
                            pw=pw)
## now pre-process specifically for push to database
# we make primary keys here, so let's find the last one
last_primary_key, success = get_last_primary_key(connection)
print('last primary key: ' + str(last_primary_key) + ', success: ' + str(success))
df = pre_process_for_push(df, last_primary_key)
list_to_push = process_df_to_list_to_push(df)
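# ------------------------------------------------------------------
# Illustrative only: pre_process_for_push lives in database_functions;
# the assumption sketched in the commented line below is that every new
# record gets a primary key continuing from the last key in the table.
# The column name is hypothetical.
# ------------------------------------------------------------------
# df['primary_key'] = range(last_primary_key + 1, last_primary_key + 1 + len(df))  # assumed behaviour, not the real code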
# TODO: the helper functions close the connection after every command; refactor that out and manage closing externally (a hedged sketch of such a helper follows at the end of this file)
# actually push it
connection = get_connection(host=host,
                            database=database,
                            user=user,
                            pw=pw)
push_df_to_db(connection, df)
print('done')
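# ------------------------------------------------------------------
# Illustrative only (see the TODO above about closing connections):
# a minimal sketch of an "external closer", assuming the connection
# object returned by get_connection follows the usual DB-API close()
# convention. The helper name is hypothetical and nothing in this
# script calls it.
# ------------------------------------------------------------------
from contextlib import contextmanager

@contextmanager
def _managed_connection(host, database, user, pw):
    """Yield a database connection and always close it afterwards."""
    conn = get_connection(host=host, database=database, user=user, pw=pw)
    try:
        yield conn
    finally:
        conn.close()

# usage sketch:
# with _managed_connection(host, database, user, pw) as conn:
#     push_df_to_db(conn, df)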