This repository has been archived by the owner on Aug 10, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ent_cli.py
67 lines (49 loc) · 1.74 KB
/
ent_cli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import glob
import os

import click
import pandas as pd
import spacy
from spacy_lookup import Entity
def extract_taxon_names(fn):
    """Return the scientific names listed in a tab-separated taxon file.

    Args:
        fn: Path (or file-like object) of a TSV file that has a
            ``scientificName`` column.

    Returns:
        A list with the non-missing values of the ``scientificName`` column,
        in file order.

    Raises:
        KeyError: If the file has no ``scientificName`` column.
    """
    taxon = pd.read_csv(fn, sep="\t")
    # Blank cells are parsed as float NaN; drop them so the returned list
    # holds only actual names instead of a mix of strings and NaN.
    return taxon["scientificName"].dropna().to_list()
def create_nlp_pipe():
    """Load the spaCy ``en_core_web_md`` model and return the loaded pipeline."""
    model_name = "en_core_web_md"
    pipeline = spacy.load(model_name)
    return pipeline
def modify_nlp_pipe(nlp, taxon_names, label):
    """Add a keyword-lookup entity component to ``nlp`` and remove its "ner" pipe.

    The pipeline is modified in place; nothing is returned.

    Args:
        nlp: Pipeline object exposing ``add_pipe`` and ``remove_pipe``.
        taxon_names: Keywords handed to ``Entity`` as its ``keywords_list``.
        label: Label handed to ``Entity`` for the entities it produces.
    """
    # NOTE(review): passing a component object straight to add_pipe looks like
    # the spaCy v2 calling convention - confirm against the pinned version.
    lookup_component = Entity(label=label, keywords_list=taxon_names)
    nlp.add_pipe(lookup_component)
    nlp.remove_pipe("ner")
def extract_filenames_and_texts(text_dir):
    """Read every ``*.txt`` file directly inside ``text_dir``.

    Args:
        text_dir: Directory to scan (not searched recursively).

    Returns:
        A ``(fns, texts)`` tuple of two parallel lists: the base filename of
        each matched file and that file's full contents. Entries are ordered
        by path so repeated runs produce the same row order.
    """
    fns = []
    texts = []
    # glob returns matches in arbitrary order; sort for reproducible output.
    for path in sorted(glob.glob(os.path.join(text_dir, "*.txt"))):
        # basename yields the filename however many components text_dir has
        # (nested, absolute, "./"-prefixed) and whatever the path separator;
        # taking the second "/"-separated piece only worked for a bare
        # single-level relative directory.
        fns.append(os.path.basename(path))
        # NOTE: files are decoded with the platform default encoding.
        with open(path, "r") as f:
            texts.append(f.read())
    return fns, texts
def extract_ents_from_texts(nlp, texts):
    """Collect the distinct lower-cased entity strings found in each text.

    Args:
        nlp: Object whose ``pipe(texts)`` yields one document per text, each
            document exposing an ``ents`` iterable of items with ``.text``.
        texts: Iterable of input strings.

    Returns:
        A list with one set per input text, holding the lower-cased text of
        every entity in that document.
    """
    ent_sets = []
    for doc in nlp.pipe(texts):
        lowered_names = {ent.text.lower() for ent in doc.ents}
        ent_sets.append(lowered_names)
    return ent_sets
def build_ent_df(fns, ent_sets):
    """Build a two-column DataFrame pairing filenames with their entity sets.

    Args:
        fns: Sequence of filenames; becomes the ``filename`` column.
        ent_sets: Sequence of entity sets, parallel to ``fns``; becomes the
            ``ents`` column.

    Returns:
        A ``pandas.DataFrame`` with columns ``filename`` and ``ents``.
    """
    columns = {}
    columns["filename"] = fns
    columns["ents"] = ent_sets
    return pd.DataFrame(columns)
def save_ent_df(fn, ent_df):
    """Write ``ent_df`` to ``fn`` as CSV using ``DataFrame.to_csv`` defaults.

    Args:
        fn: Destination path for the CSV file.
        ent_df: DataFrame to serialise.
    """
    ent_df.to_csv(path_or_buf=fn)
# All three options are required: without any one of them the pipeline below
# would receive None and fail deep inside pandas/glob with an unrelated-looking
# traceback, so let click reject the invocation with a usage error instead.
@click.command()
@click.option(
    "--taxon_file",
    required=True,
    help="tsv file containing taxon data",
)
@click.option(
    "--text_dir",
    required=True,
    help="name of directory containing all plain text files to be processed",
)
@click.option(
    "--output",
    required=True,
    help="filename for saving the filename and entity data",
)
def main(taxon_file, text_dir, output):
    """Small program to extract sets of named entities from texts based on a defined dictionary."""
    # The docstring above doubles as the --help text, so it is kept verbatim.
    # 1. Build the keyword dictionary from the taxon file.
    sci_names = extract_taxon_names(taxon_file)
    # 2. Load the pipeline and add a lookup component labelling matches "Marine".
    nlp = create_nlp_pipe()
    modify_nlp_pipe(nlp, sci_names, "Marine")
    # 3. Read the texts, extract entity sets per file, and save them as CSV.
    fns, texts = extract_filenames_and_texts(text_dir)
    ent_sets = extract_ents_from_texts(nlp, texts)
    ent_df = build_ent_df(fns, ent_sets)
    save_ent_df(output, ent_df)
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()