-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathclassify
executable file
·347 lines (282 loc) · 8.14 KB
/
classify
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#!/usr/bin/env bash
#shellcheck disable=SC2034,SC1090
#
# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)
#
# Script prelude: enable extended globbing, install signal/exit traps, record
# version information and load the shared helper library which is expected to
# live in the same directory as this script.

shopt -s extglob

# trap_error and trap_clean are not defined in this file (presumably they are
# provided by books_functions - verify); trap commands are only looked up when
# the trap fires, so installing them before sourcing the library is fine
trap "trap_error" TERM
trap "trap_clean" EXIT

# PID of the top level shell, exported so child processes can refer to it
export TOP_PID=$$

version="0.5.1"
release="20210601"

functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
	source "$functions"
else
	# diagnostics go to stderr so they can not end up in captured output
	echo "$functions not found" >&2
	exit 1
fi
# main [OPTIONS] identifier[,identifier...]
#
# Parses the command line, queries the OCLC classify service for each
# identifier until one yields a usable response and then either prints the
# requested fields (show_data) or emits a CSV line (build_csv, option -C).
#
# The plain variables assigned here are intentionally NOT local: the helper
# functions in this file read them (oclc, curl, torsocks, xq, verbose, xml)
# and cleanup() needs xml and TMPDIR to remove the temporary file.
# The associative arrays are created with 'declare' inside the function and
# are therefore only visible to main and the functions it calls.
main () {
	# PREFERENCES
	config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

	# OCLC classify API
	oclc="http://classify.oclc.org/classify2/Classify"

	# query expressions handed to xidel (see get) to extract fields from the response
	declare -A API=(
		[response]='/classify/response/@code'
		[owi]='/classify/works/work[1]/@owi'
		[wi]='/classify/works/work[1]/@wi'
		[fast]='join(/classify/recommendations/fast/headings/heading,",")'
		[ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
		[lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
		[nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
		[author]='/classify/work/@author'
		[authors]='join(/classify/authors/author," | ")'
		[title]='/classify/work/@title'
	)

	# NOTE(review): filters and tables are not referenced anywhere else in this
	# file; presumably they are used by sourced helpers - verify before removing
	declare -A filters=(
		[filename]="sed -e 's/[^-[:alnum:]:;?!.,+@#%]/_/g;s/^\([-_]\)*//'"
	)

	declare -A tables=(
		[libgen]="updated"
		[libgen_fiction]="fiction"
	)

	xidel=$(find_tool "xidel")
	curl=$(find_tool "curl")
	xq="$xidel -s"
	request=""
	TMPDIR="/tmp"

	# source config file if it exists
	[[ -f ${config} ]] && source "${config}"

	# create the temporary file only after the config has been read: cleanup
	# removes "$TMPDIR/<name>", so the file has to live in the final TMPDIR
	xml=$(mktemp -p "$TMPDIR" classify.XXXXX) || exit_with_error "can not create temporary file in $TMPDIR"

	while getopts "owdlnfatVAD:C:X:G@:h" OPTION; do
		case $OPTION in
			o)
				request="$request owi"
				;;
			w)
				request="$request wi"
				;;
			d)
				request="$request ddc"
				;;
			l)
				request="$request lcc"
				;;
			n)
				request="$request nlm"
				;;
			f)
				request="$request fast"
				;;
			a)
				request="$request author"
				;;
			t)
				request="$request title"
				;;
			V)
				verbose=1
				;;
			D)
				db="$OPTARG"
				;;
			C)
				# -D has to precede -C on the command line, the lookup happens right here
				[ -z "$db" ] && exit_with_error "use -D to define which database to use"
				build_csv=1
				md5="$OPTARG"
				idents=$(get_identifiers "$db" "$md5")
				[ -z "$idents" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
				;;
			X)
				save_xml="$OPTARG"
				[[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
				;;
			A)
				request="author title fast owi wi ddc lcc nlm"
				verbose=1
				;;
			G)
				((debug++))
				;;
			@)
				torsocks=$(find_tool "torsocks")
				export TORSOCKS_TOR_PORT=${OPTARG}
				;;
			h)
				help
				exit
				;;
			*)
				exit_with_error "unknown option: $OPTION"
				;;
		esac
	done

	shift $((OPTIND-1))

	# identifiers found through -C take precedence over the positional argument
	[ -z "$idents" ] && idents="$1"
	[[ -z "$idents" ]] && exit_with_error "no identifier given, use -h for help"

	IFS=',' read -ra idarr <<< "$idents"

	for ident in "${idarr[@]}"; do
		[[ -n "$debug" ]] && echo "trying $ident..."

		# spaces are stripped from the identifier before it is used in the query
		get_xml "$xml" "stdnbr=${ident// }"
		response=$(get "response" "$xml")

		case "$response" in
			0|2)
				success=1
				break
				;;
			4)
				# the response lists works instead of classification data:
				# query again using the identifier of the first listed work
				wi=$(get "wi" "$xml")
				get_xml "$xml" "wi=$wi"
				# the pattern must be anchored: a bare 0|2 also matches any code
				# which merely contains one of these digits (100, 101, 102, 200...)
				if [[ $(get "response" "$xml") =~ ^(0|2)$ ]]; then
					success=1
					break
				fi
				;;
			*)
				# any other response code: try the next identifier
				;;
		esac
	done

	[[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"

	if [[ -n "$save_xml" ]]; then
		[[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
		cp "$xml" "$save_xml/$md5.xml"
	fi

	if [[ -n "$debug" ]]; then
		cat "$xml"
	fi

	if [[ -n "$build_csv" ]]; then
		build_csv "$db" "$md5" "$xml"
	else
		show_data "$request"
	fi
}
# get_xml FILE QUERY...
#
# Downloads a classify response into FILE. All arguments after FILE are joined
# with spaces and appended to the service URL as extra query parameters.
#
# Globals read:    oclc (service URL), curl (command), torsocks (optional wrapper)
# Globals written: xml (FILE), query (joined QUERY arguments)
# Returns:         exit status of the download command
get_xml () {
	xml="$1"; shift
	query="$*"

	local url="${oclc}?summary=false&${query}"

	# $torsocks is left unquoted on purpose: when empty it disappears from the command line
	$torsocks "$curl" -s "$url" --output "$xml"
}
# get PARAMETER FILE [FILTER...]
#
# Extracts one field from the XML in FILE: the expression stored in
# API[PARAMETER] is evaluated by the command in $xq and the result is piped
# through FILTER (a shell command line, run with eval). Without FILTER the
# result is passed through unchanged.
#
# Globals read:    xq, API
# Globals written: parameter, xml, filter
# Returns:         exit status of the filter command
get () {
	parameter="$1"
	xml="$2"
	shift 2

	# whatever is left forms the filter command; fall back to a plain pass-through
	filter="$*"
	: "${filter:=cat -}"

	# $xq is unquoted on purpose, it holds a command plus its options
	$xq "$xml" -e "${API[$parameter]}" | eval "$filter"
}
# get_identifiers DB MD5
#
# Looks up the identifier(s) stored for the publication with the given MD5 by
# running a select statement through dbx (not defined in this file).
#
# Arguments: DB  - libgen or libgen_fiction
#            MD5 - 32 digit hexadecimal hash of the publication
# Outputs:   whatever dbx prints for the query
# Returns:   1 (with a message on stderr, nothing on stdout) when MD5 is not a
#            valid hash or DB is unknown, otherwise the exit status of dbx
get_identifiers () {
	local db="$1"
	local md5="$2"
	local sql

	# md5 comes from the command line and ends up inside an SQL string literal:
	# refuse anything which is not a plain hexadecimal hash
	if [[ ! "$md5" =~ ^[[:xdigit:]]{32}$ ]]; then
		echo "get_identifiers: invalid MD5 '$md5'" >&2
		return 1
	fi

	case "$db" in
		libgen)
			sql="select IdentifierWODash from updated where md5='${md5}';"
			;;
		libgen_fiction)
			sql="select Identifier from fiction where md5='${md5}';"
			;;
		*)
			echo "get_identifiers: unknown database '$db'" >&2
			return 1
			;;
	esac

	dbx "$db" "$sql"
}
# show_data REQUEST...
#
# Prints the requested fields, one per line, in the order given. Fields
# without data are skipped. When the global 'verbose' is non-empty each line
# is prefixed with the upper-cased field name.
#
# Arguments: REQUEST - space separated list of field names (keys understood by get)
# Globals:   xml (read), verbose (read)
# Returns:   always 0; an empty field is not an error and must not leak out as
#            a non-zero status (this function can be the last command main runs,
#            so its status can become the exit status of the script)
show_data () {
	local request="$*"
	local parameter data legend=""

	# unquoted on purpose: the request is a space separated list of field names
	for parameter in $request; do
		data=$(get "$parameter" "$xml")
		[[ -n "$data" ]] || continue
		[[ -n "$verbose" ]] && legend="${parameter^^}: "
		printf '%s\n' "${legend}${data}"
	done

	return 0
}
# build_csv DB MD5 FILE
#
# Prints a single CSV line for the publication identified by MD5 using the
# classification data in FILE:
#
#   MD5,"DDC","LCC","NLM",FAST,AUTHOR,TITLE
#
# DDC, LCC and NLM are wrapped in double quotes; FAST, AUTHOR and TITLE are
# passed through 'base64 -w0' and added without quotes.
#
# Globals written: db, md5, xml, updates, parameter, data
build_csv () {
	db="$1"
	md5="$2"
	xml="$3"

	updates="${md5}"

	for parameter in ddc lcc nlm fast author title; do
		case "$parameter" in
			ddc|lcc|nlm)
				# classification numbers: plain text between double quotes
				data=$(get "$parameter" "$xml")
				updates="${updates},\"${data}\""
				;;
			*)
				# free text fields: base64 encoded, no quotes
				data=$(get "$parameter" "$xml" "base64 -w0")
				updates="${updates},${data}"
				;;
		esac
	done

	echo "$updates"
}
# cleanup
#
# Removes the temporary response file. Only the base name of $xml is used and
# the file is looked up inside $TMPDIR, so nothing outside of that directory
# is ever removed.
#
# Globals read: xml, TMPDIR
# Returns:      0 when there is nothing to remove, otherwise the status of rm
cleanup () {
	# the temporary file may never have been created (early exit); without
	# this guard rm would be pointed at the TMPDIR directory itself
	[[ -n "$xml" ]] || return 0

	local base
	base=$(basename -- "$xml")
	rm -f -- "$TMPDIR/$base"
}
# help
#
# Prints the usage message on stdout. The here-document is unquoted, hence:
#  - $(...) and $version are expanded, literal dollar signs in front of a
#    name are escaped (\$dir, \$md5)
#  - a backslash at the end of a line joins it with the next one, which makes
#    the long base64 example come out as a single line
#
# Globals read: version
help () {
	cat <<-EOHELP
$(basename "$(readlink -f "$0")") "version $version"
Use: classify [OPTIONS] identifier[,identifier...]
Queries OCLC classification service for available data
Supports: DDC, LCC, NLM, FAST, Author and Title
Valid identifiers are ISBN, ISSN, UPC and OCLC/OWI
OPTIONS:
-d show DDC
-l show LCC
-n show NLM
-f show FAST
-a show Author
-t show Title
-o show OWI (OCLC works identifier)
-w show WI (OCLC works number)
-C md5 create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
use -D libgen/-D libgen_fiction to indicate database
-X dir save OCLC XML response to \$dir/\$md5.xml
only works with a defined MD5 (-C MD5)
-D db define which database to use (libgen/libgen_fiction)
-A show all available data for identifier
-V show labels
-@ PORT use torsocks to connect to the OCLC classify service.
use this to avoid getting your IP blocked by OCLC
-h show this help message
Examples
$ classify -A 0199535760
AUTHOR: Plato | Jowett, Benjamin, 1817-1893 [Translator; Editor; Other] ...
TITLE: The republic
DDC: 321.07
LCC: JC71
$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538
25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==
Classifying libgen/libgen_fiction
This tool can be used to add classification data to libgen and
libgen_fiction databases. It does not directly modify the database,
instead producing CSV which can be used to apply the modifications.
The best way to do this is to produce a list of md5 hashes for
publications which do have Identifier values but lack values for DDC
and/or LCC. Such lists can be produced by the following SQL:
libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";
Run these as batch jobs (mysql -B .... -e 'sql_code_here;' > md5_list), split
the resulting file in ~1000 line sections and feed these to this tool,
preferably with a random pause between requests to keep OCLC's intrusion
detection systems from triggering too early. It is advisable to use
this tool through Tor (using -@ TORPORT to enable torsocks, make sure it
is configured correctly for your Tor instance) to avoid having too
many requests from your IP to be registered, this again to avoid
your IP being blocked. The OCLC classification service is not
run as a production service (I asked them).
Return values are stored in the following order:
MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE
DDC, LCC and NLM are enclosed within double quotes and can contain
multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
since these fields can contain a whole host of unwholesome characters
which can mess up CSV. The AUTHOR field currently decodes to a pipe ('|')
separated list of authors in the format:
LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]
This format could change depending on what OCLC does with the
(experimental) service.
EOHELP
}
# entry point: hand all command line arguments to main, whose return
# status becomes the exit status of the script
main "$@"