-
Notifications
You must be signed in to change notification settings - Fork 1
/
seriescheck.py
110 lines (96 loc) · 4.5 KB
/
seriescheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#
# Scan a MARC file or database table and use the author and title to retrieve series
# information from various external sources to check for a missing 490 (series) field.
#
# Usage: python seriescheck.py --inputfile <MARC input file> [--libcode <library-code>]
#    or: python seriescheck.py --inputtable <name of database table> [--libcode <library-code>]
#
# The database table should have columns for bibnumber, tag, indicators, and tagData.
# tagData is all the subfields glommed together. You can get more information from
# the mydb.py file in the lib folder.
#
# The --libcode argument is necessary in order to retrieve series data from Novelist.
# DO NOT include this unless you are a Bibliocommons library using your own
# library code.
#
# Version: 0.1.0 1/1/21
#
# License: CC BY-NC-SA 4.0, https://creativecommons.org/licenses/by-nc-sa/4.0/
#
# Graeme Williams
#
from collections import Counter
import argparse
import requests
from lib import mydb, mymarc, opac, goodreads
# Command-line interface: exactly one input source (--inputfile or
# --inputtable) is required; --libcode is optional.
cli_description = """Specify either an input file or a MySQL input table.
(The database name is hard-coded. Sorry.)
--libcode is optional but is needed to check Novelist.
If it's not included, Novelist checking will be skipped.
"""
parser = argparse.ArgumentParser(description=cli_description)
parser.add_argument("--libcode", "-lc", required=False)

# The two input sources are mutually exclusive, and one of them must be given.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--inputfile", "-if")
group.add_argument("--inputtable", "-it")

args = parser.parse_args()
# Source of (bibnum, record) pairs, built from whichever of --inputfile /
# --inputtable was supplied on the command line.
therecordgenerator = mymarc.recordgenerator(args.inputfile, args.inputtable)

# Local lookup table queried by author and title for the first-pass check.
check_cnx = mydb.Connection(schema="isfdb")
check_table = mydb.Table(check_cnx, "author_title_series_name")

# If you don't specify a library code, Novelist will not be checked.
libcode = args.libcode

# Tally of outcomes, printed once when the run ends.
counts = Counter()

# Upper bound on the number of input records examined in a single run.
RECORD_LIMIT = 10000

# The session is managed by a `with` block so it is closed even if one of the
# external lookups raises; the `finally` ensures the tally collected so far is
# still printed when the run is interrupted or fails part-way through.
with requests.Session() as session:
    try:
        # For each record in the file or database table:
        # - take the author from the 100 field and the title from 245$a;
        # - look the pair up in the local ISFDB table;
        # - if ISFDB knows a series but the record has no 490 field, ask
        #   Novelist and Goodreads as well and print one comma-separated line.
        for limit_counter, (bibnum, theRecord) in enumerate(therecordgenerator, start=1):
            if limit_counter > RECORD_LIMIT:
                print("Hit limit from input table!")
                break

            # Records without a 100 field are skipped without being counted.
            field100 = theRecord['100']
            if not field100:
                continue

            try:
                # The UTF-8 bytes are re-read as Latin-1 before querying.
                # NOTE(review): presumably this matches how the local ISFDB
                # table stores text -- confirm.
                author_name = mymarc.flipnames(field100['a']).encode("utf-8", "strict").decode('latin-1', "strict")
                title = theRecord['245']['a'].rstrip("/: ").encode("utf-8", "strict").decode('latin-1', "strict")
            except Exception:
                # Deliberately broad: any failure while extracting the author
                # or title (not only an encoding error) is tallied here and
                # the record is skipped.
                counts['Character conversion failed'] += 1
                continue

            query = (
                f"WHERE author={mydb.dbescape(author_name)}"
                f" AND title={mydb.dbescape(title)}"
            )
            try:
                row = check_table.readfirstrow(query=query, debug=False)
            except Exception:
                # Best effort: a failed lookup is tallied and the record skipped.
                counts['Query failed'] += 1
                continue

            # At this point we have the author and title from the MARC record,
            # and row is the result from the local ISFDB instance (falsy when
            # there was no match). We check whether the record has a 490 field
            # and whether row was found, BUT we do not check that the series
            # names match.
            opac_check = "OPAC:Y" if theRecord["490"] else "OPAC:N"
            db_check = "ISFDB:Y" if row else "ISFDB:N"
            counts["/".join([opac_check, db_check])] += 1

            # We only do further checks for records that, according to ISFDB,
            # have missing series information. This is to avoid load on
            # Novelist or Goodreads.
            if opac_check != "OPAC:N" or db_check != "ISFDB:Y":
                continue

            # NOTE(review): index 2 is presumably the series-name column of
            # author_title_series_name -- confirm against the table definition.
            isfdb_seriesname = row[2]

            # Novelist needs both a library code and a bib number.
            if libcode and bibnum:
                novelist_seriesname = opac.checkNoveListseries(session, libcode, bibnum, requestdelay=5)
            else:
                novelist_seriesname = None

            if goodreads.enabled():
                goodreads_seriesname = goodreads.get_seriesname(
                    session, goodreads.get_worknumber(session, author_name, title)
                )
            else:
                goodreads_seriesname = None

            print(bibnum, author_name, title, isfdb_seriesname, novelist_seriesname, goodreads_seriesname, sep = ',')
    finally:
        print(counts)