From 3d0abe5c65d27b502cb1e35bfa5987ad43a3e526 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Thu, 27 May 2021 18:12:40 +0200 Subject: [PATCH 01/23] Add sashimi module for domain topic modeling --- src/sashimi/__main__.py | 75 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 src/sashimi/__main__.py diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py new file mode 100644 index 0000000..fec14f0 --- /dev/null +++ b/src/sashimi/__main__.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python + +import abstractology +import pandas as pd + +# not used, but avoids an import order bug +import graph_tool + +graph_tool + + +def reload(): + """Live reload while developing""" + import importlib + + importlib.reload(abstractology) + + +def get_data(): + df15 = pd.read_csv("abstracts_contents_2015.tsv", sep="\t") + df15 = df15.dropna(subset=["abstract_text"]) + df17 = pd.read_csv("abstracts_contents_1719.tsv", sep="\t") + df17 = df17.dropna(subset=["abstract_text"]) + df = df15.append(df17) + df = df.reset_index() + + return df + + +def _load_data(a, get_data=get_data): + a.data = get_data() + a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix + corpus_name = "isgc_2015-2017.df" + if corpus_name not in a.loaded["data"]: + a.loaded["data"].append(corpus_name) + a.col_title = "abstract_title" + a.col_time = "year" + a.text_source = "abstract_text" + + +def bootstrap(): + a = abstractology.Graphology() + _load_data(a) + a.load_domain_topic_model() + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + a.register_config() + + return a + + +def load(): + a = abstractology.Graphology("auto_abstractology/reports/config.json") + _load_data(a) + + return a + + +def plot(a): + a.plot_sashimi("ISGC 2015-2017") + a.plot_sashimi("ISGC 2015-2017", chained=True) + + +def main(): + try: + a = load() + except FileNotFoundError: + a = bootstrap() + plot(a) + a.data.columns + reload() + + +if __name__ == "__main__": + 
main() From b54ac428bde35c272d4be8b85462951312a55105 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 14 Jun 2021 00:30:38 +0200 Subject: [PATCH 02/23] sashimi: domain-chained model with topic_1 --- src/sashimi/__main__.py | 55 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index fec14f0..6c60bfc 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -1,13 +1,47 @@ #! /usr/bin/env python +import graph_tool # not used, but avoids an import order bug import abstractology import pandas as pd -# not used, but avoids an import order bug -import graph_tool - graph_tool +""" NOTES + +- Data columns: + ['abstract_text', + 'abstract_title', + 'bibliography', + 'cancelled', + 'code', + 'figure_legend_1', + 'figure_legend_2', + 'figure_title_1', + 'figure_title_2', + 'final_status', + 'id', + 'index', + 'is_complete', + 'keyword_1', + 'keyword_2', + 'keyword_3', + 'keyword_4', + 'keywords', + 'legend_1', + 'legend_2', + 'not_to_remind', + 'program_day', + 'program_session', + 'publish_onsite', + 'relance_register', + 'topic_1', + 'topic_2', + 'topic_3', + 'user_id', + 'validate', + 'year'] +""" + def reload(): """Live reload while developing""" @@ -44,20 +78,33 @@ def bootstrap(): a.load_domain_topic_model() a.set_graph(extend={"prop": "year"}) a.load_domain_chained_model() + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() a.register_config() return a def load(): - a = abstractology.Graphology("auto_abstractology/reports/config.json") + a = abstractology.Graphology( + config="auto_abstractology/reports/config.json", + load_data=False, + ) _load_data(a) return a def plot(a): + a.load_domain_topic_model() a.plot_sashimi("ISGC 2015-2017") + + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + a.plot_sashimi("ISGC 2015-2017", chained=True) + + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() 
a.plot_sashimi("ISGC 2015-2017", chained=True) From ffb69b504a948fd0cbc13d5fe424b82ef8e8b78d Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Fri, 27 Aug 2021 02:54:50 +0200 Subject: [PATCH 03/23] Add .clean.clean_text(): remove undesirable content from abstracts --- src/sashimi/__main__.py | 22 +++------ src/sashimi/clean.py | 105 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 src/sashimi/clean.py diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 6c60bfc..6486311 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -3,8 +3,9 @@ import graph_tool # not used, but avoids an import order bug import abstractology import pandas as pd +from .clean import clean_text -graph_tool +graph_tool # just so the linter won't complain """ NOTES @@ -43,14 +44,7 @@ """ -def reload(): - """Live reload while developing""" - import importlib - - importlib.reload(abstractology) - - -def get_data(): +def get_data(clean=True): df15 = pd.read_csv("abstracts_contents_2015.tsv", sep="\t") df15 = df15.dropna(subset=["abstract_text"]) df17 = pd.read_csv("abstracts_contents_1719.tsv", sep="\t") @@ -58,18 +52,21 @@ def get_data(): df = df15.append(df17) df = df.reset_index() + if clean: + df["abstract_text"] = clean_text(df) + return df def _load_data(a, get_data=get_data): a.data = get_data() - a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix corpus_name = "isgc_2015-2017.df" if corpus_name not in a.loaded["data"]: a.loaded["data"].append(corpus_name) a.col_title = "abstract_title" a.col_time = "year" a.text_source = "abstract_text" + a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix def bootstrap(): @@ -114,9 +111,6 @@ def main(): except FileNotFoundError: a = bootstrap() plot(a) - a.data.columns - reload() -if __name__ == "__main__": - main() +main() diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py new file mode 100644 index 0000000..8f17df7 --- 
/dev/null +++ b/src/sashimi/clean.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +import re +from difflib import unified_diff +from itertools import permutations + +CLEAN_REFLAGS = re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE + + +def clean_text(df): + """ + Known untreated entries: + 964: "The research was co-financed" + """ + + def and_sections(re0, re1): + return re0 + r"\s* (?: and | & ) \s*" + re1 + + # Remove entries with no valid content + unwanted_res = "^Lorem ipsum dolor sit amet" + b_unwanted = df["abstract_text"].str.contains(unwanted_res) + clean_df = df[~b_unwanted] + + # Section names to be removed + section_names_res = [ + r"backgrounds?", + r"conclusions?", + r"discussions?", + r"experiments?", + r"experimental", + r"introductions?", + r"materials?", + r"methods?", + r"perspectives?", + r"prospects?", + r"objectives?", + r"outlooks?", + r"results?", + r"significance", + r"summary", + ] + section_names_re = r"|".join( + [and_sections(x, y) for x, y in permutations(section_names_res, 2)] + + section_names_res + ) + + # Remove invalid content from entries + unclean_from_start_of_text_res = [ + r".* \n abstract (?: [^\n\w,]* \n)", + ] + unclean_res = [ + r"^ keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", + r"(^ (?:" + section_names_re + r") (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ ))", + ] + unclean_until_end_of_text_res = [ + r"^ acknowledge?ments? :? .*", + r"^ r[eé]f[eé]rences? \s* :? \s* \n .*", + r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? Date: Sat, 28 Aug 2021 02:07:30 +0200 Subject: [PATCH 04/23] Add parameters to package execution for data dir and action --- src/sashimi/__main__.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 6486311..5692843 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -1,9 +1,11 @@ #! 
/usr/bin/env python import graph_tool # not used, but avoids an import order bug +import sys import abstractology import pandas as pd -from .clean import clean_text +from .clean import clean_text, check_clean +from pathlib import Path graph_tool # just so the linter won't complain @@ -44,16 +46,22 @@ """ +ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") +ISGC_2015_FILE = ISGC_FILES_DIR / "abstracts_contents_2015.tsv" +ISGC_2017_FILE = ISGC_FILES_DIR / "abstracts_contents_1719.tsv" + + def get_data(clean=True): - df15 = pd.read_csv("abstracts_contents_2015.tsv", sep="\t") + df15 = pd.read_csv(ISGC_2015_FILE, sep="\t") df15 = df15.dropna(subset=["abstract_text"]) - df17 = pd.read_csv("abstracts_contents_1719.tsv", sep="\t") + df17 = pd.read_csv(ISGC_2017_FILE, sep="\t") df17 = df17.dropna(subset=["abstract_text"]) df = df15.append(df17) df = df.reset_index() if clean: df["abstract_text"] = clean_text(df) + df.dropna(subset=["abstract_text"], inplace=True) return df @@ -106,11 +114,17 @@ def plot(a): def main(): - try: - a = load() - except FileNotFoundError: - a = bootstrap() - plot(a) + action = sys.argv[2] if len(sys.argv) > 2 else "check_clean" + if action == "check_clean": + unclean = get_data(False) + clean = get_data(True) + check_clean(unclean.loc[clean.index], clean["abstract_text"]) + elif action == "sashimi": + try: + a = load() + except FileNotFoundError: + a = bootstrap() + plot(a) main() From 1d5e7ffc7251b8587c9f142c3fb716dc308b5516 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 03:48:23 +0200 Subject: [PATCH 05/23] Rename main as init. 
--- src/sashimi/{__main__.py => __init__.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/sashimi/{__main__.py => __init__.py} (100%) diff --git a/src/sashimi/__main__.py b/src/sashimi/__init__.py similarity index 100% rename from src/sashimi/__main__.py rename to src/sashimi/__init__.py From 0c5e9d765881ce6f41e433fd4cdc1ab1148670ec Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 03:48:44 +0200 Subject: [PATCH 06/23] Improve clean_text and check_clean. Document check_clean. --- src/sashimi/clean.py | 48 +++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index 8f17df7..d0bca5d 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import re +import pandas as pd from difflib import unified_diff from itertools import permutations @@ -31,6 +32,7 @@ def and_sections(re0, re1): r"introductions?", r"materials?", r"methods?", + r"motivation?", r"perspectives?", r"prospects?", r"objectives?", @@ -43,18 +45,22 @@ def and_sections(re0, re1): [and_sections(x, y) for x, y in permutations(section_names_res, 2)] + section_names_res ) - + section_numbering_re = r"[^\n\w]* (?: \d? [^\n\w]* )" # Remove invalid content from entries unclean_from_start_of_text_res = [ - r".* \n abstract (?: [^\n\w,]* \n)", + r"(?: ^ | .* \n)" + section_numbering_re + r"abstract (?: [^\n\w,]* \n)", ] unclean_res = [ - r"^ keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", - r"(^ (?:" + section_names_re + r") (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ ))", + r"^" + section_numbering_re + r"keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", + r"^" + + section_numbering_re + + r"(?:" + + section_names_re + + r") (?: \ * section)? (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ )", ] unclean_until_end_of_text_res = [ - r"^ acknowledge?ments? :? .*", - r"^ r[eé]f[eé]rences? \s* :? \s* \n .*", + r"^" + section_numbering_re + r"acknowledge?ments? :? 
.*", + r"^" + section_numbering_re + r"r[eé]f[eé]rences? \s* :? .*", r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? Date: Mon, 30 Aug 2021 03:51:10 +0200 Subject: [PATCH 07/23] Refactor for modularity. --- src/sashimi/__init__.py | 99 ++++++----------------------------------- src/sashimi/__main__.py | 30 +++++++++++++ src/sashimi/sashimi.py | 54 ++++++++++++++++++++++ 3 files changed, 98 insertions(+), 85 deletions(-) create mode 100644 src/sashimi/__main__.py create mode 100644 src/sashimi/sashimi.py diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 5692843..1fb0675 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -1,13 +1,7 @@ #! /usr/bin/env python -import graph_tool # not used, but avoids an import order bug -import sys -import abstractology import pandas as pd -from .clean import clean_text, check_clean -from pathlib import Path - -graph_tool # just so the linter won't complain +from .clean import clean_text """ NOTES @@ -23,7 +17,6 @@ 'figure_title_2', 'final_status', 'id', - 'index', 'is_complete', 'keyword_1', 'keyword_2', @@ -46,85 +39,21 @@ """ -ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") -ISGC_2015_FILE = ISGC_FILES_DIR / "abstracts_contents_2015.tsv" -ISGC_2017_FILE = ISGC_FILES_DIR / "abstracts_contents_1719.tsv" - - -def get_data(clean=True): - df15 = pd.read_csv(ISGC_2015_FILE, sep="\t") - df15 = df15.dropna(subset=["abstract_text"]) - df17 = pd.read_csv(ISGC_2017_FILE, sep="\t") - df17 = df17.dropna(subset=["abstract_text"]) - df = df15.append(df17) - df = df.reset_index() - +def get_data(file_paths, clean=True): + df = pd.concat( + [ + pd.read_csv(file, sep="\t") + for file in file_paths + ], + ignore_index=True, + ) + print(f"Found {len(df)} entries.") + df = df.dropna(subset=["abstract_text"]) + print(f" Kept {len(df)} entries containing an abstract.") + if clean: df["abstract_text"] = clean_text(df) df.dropna(subset=["abstract_text"], inplace=True) + print(f" Kept {len(df)} entries after 
cleaning.") return df - - -def _load_data(a, get_data=get_data): - a.data = get_data() - corpus_name = "isgc_2015-2017.df" - if corpus_name not in a.loaded["data"]: - a.loaded["data"].append(corpus_name) - a.col_title = "abstract_title" - a.col_time = "year" - a.text_source = "abstract_text" - a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix - - -def bootstrap(): - a = abstractology.Graphology() - _load_data(a) - a.load_domain_topic_model() - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.register_config() - - return a - - -def load(): - a = abstractology.Graphology( - config="auto_abstractology/reports/config.json", - load_data=False, - ) - _load_data(a) - - return a - - -def plot(a): - a.load_domain_topic_model() - a.plot_sashimi("ISGC 2015-2017") - - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 2015-2017", chained=True) - - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 2015-2017", chained=True) - - -def main(): - action = sys.argv[2] if len(sys.argv) > 2 else "check_clean" - if action == "check_clean": - unclean = get_data(False) - clean = get_data(True) - check_clean(unclean.loc[clean.index], clean["abstract_text"]) - elif action == "sashimi": - try: - a = load() - except FileNotFoundError: - a = bootstrap() - plot(a) - - -main() diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py new file mode 100644 index 0000000..655b8f1 --- /dev/null +++ b/src/sashimi/__main__.py @@ -0,0 +1,30 @@ +import sys +from pathlib import Path +from . 
import get_data +from .clean import check_clean + +# User input +ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") +ISGC_FILES = [ + ISGC_FILES_DIR / file + for file in ["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"] +] +ACTION = sys.argv[2] if len(sys.argv) > 2 else "check_clean" +START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 + + +def main(): + if ACTION == "check_clean": + unclean = get_data(ISGC_FILES, False) + clean = get_data(ISGC_FILES, True) + check_clean(unclean, clean["abstract_text"], START) + elif ACTION == "sashimi": + import sashimi + try: + a = sashimi.load() + except FileNotFoundError: + a = sashimi.bootstrap() + sashimi.plot(a) + + +main() diff --git a/src/sashimi/sashimi.py b/src/sashimi/sashimi.py new file mode 100644 index 0000000..c622552 --- /dev/null +++ b/src/sashimi/sashimi.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import graph_tool # not used, but avoids an import order bug +import abstractology +from . import get_data + +graph_tool # just so the linter won't complain + + +def _load_data(a, get_data=get_data): + a.data = get_data() + corpus_name = "isgc_2015-2017.df" + if corpus_name not in a.loaded["data"]: + a.loaded["data"].append(corpus_name) + a.col_title = "abstract_title" + a.col_time = "year" + a.text_source = "abstract_text" + a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix + + +def bootstrap(): + a = abstractology.Graphology() + _load_data(a) + a.load_domain_topic_model() + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() + a.register_config() + + return a + + +def load(): + a = abstractology.Graphology( + config="auto_abstractology/reports/config.json", + load_data=False, + ) + _load_data(a) + + return a + + +def plot(a): + a.load_domain_topic_model() + a.plot_sashimi("ISGC 2015-2017") + + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + 
a.plot_sashimi("ISGC 2015-2017", chained=True) + + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() + a.plot_sashimi("ISGC 2015-2017", chained=True) From b2f44b25bfc7e211ad2801b3359a7ca2fd73fc77 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 05:07:48 +0200 Subject: [PATCH 08/23] Improve read/write data. --- src/sashimi/__init__.py | 3 ++- src/sashimi/__main__.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 1fb0675..ba2ea8c 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -42,7 +42,7 @@ def get_data(file_paths, clean=True): df = pd.concat( [ - pd.read_csv(file, sep="\t") + pd.read_csv(file, sep="\t", dtype=str) for file in file_paths ], ignore_index=True, @@ -56,4 +56,5 @@ def get_data(file_paths, clean=True): df.dropna(subset=["abstract_text"], inplace=True) print(f" Kept {len(df)} entries after cleaning.") + df.index.name = "index" return df diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 655b8f1..385b230 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -9,6 +9,7 @@ ISGC_FILES_DIR / file for file in ["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"] ] +CLEAN_FILE = "abstracts_contents_clean.csv.xz" ACTION = sys.argv[2] if len(sys.argv) > 2 else "check_clean" START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 @@ -17,6 +18,7 @@ def main(): if ACTION == "check_clean": unclean = get_data(ISGC_FILES, False) clean = get_data(ISGC_FILES, True) + clean.to_csv(CLEAN_FILE) check_clean(unclean, clean["abstract_text"], START) elif ACTION == "sashimi": import sashimi From fc2e6d07773d7d0f29c23770276347649ed6d2a8 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 07:56:45 +0200 Subject: [PATCH 09/23] Clean more funding info, with manually identified exceptions. 
--- src/sashimi/__init__.py | 2 +- src/sashimi/clean.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index ba2ea8c..1cad09b 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -53,7 +53,7 @@ def get_data(file_paths, clean=True): if clean: df["abstract_text"] = clean_text(df) - df.dropna(subset=["abstract_text"], inplace=True) + df = df.dropna(subset=["abstract_text"]) print(f" Kept {len(df)} entries after cleaning.") df.index.name = "index" diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index d0bca5d..e9d573d 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -11,7 +11,7 @@ def clean_text(df): """ Known untreated entries: - 964: "The research was co-financed" + - some title plus authors headers with no clear separation """ def and_sections(re0, re1): @@ -59,7 +59,7 @@ def and_sections(re0, re1): + r") (?: \ * section)? (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ )", ] unclean_until_end_of_text_res = [ - r"^" + section_numbering_re + r"acknowledge?ments? :? .*", + r"^" + section_numbering_re + r"ac?knowled?ge?ments? :? .*", r"^" + section_numbering_re + r"r[eé]f[eé]rences? \s* :? .*", r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? 
Date: Sat, 19 Feb 2022 02:18:29 -0300 Subject: [PATCH 10/23] Add '_abstract_text_is_cleaned' column to singal cleaned --- src/sashimi/__init__.py | 5 ++++- src/sashimi/__main__.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 1cad09b..339a61d 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -52,7 +52,10 @@ def get_data(file_paths, clean=True): print(f" Kept {len(df)} entries containing an abstract.") if clean: - df["abstract_text"] = clean_text(df) + clean_abstract_text = clean_text(df) + df["_abstract_text_is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) + df["abstract_text"] = clean_abstract_text + df = df.dropna(subset=["abstract_text"]) print(f" Kept {len(df)} entries after cleaning.") diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 385b230..6b1edd3 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -22,6 +22,7 @@ def main(): check_clean(unclean, clean["abstract_text"], START) elif ACTION == "sashimi": import sashimi + try: a = sashimi.load() except FileNotFoundError: From 087c3ff0d124312646cae33281a416cf2c8ec43b Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 02:19:07 -0300 Subject: [PATCH 11/23] Improvements to cleaning and notes --- src/sashimi/clean.py | 49 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index e9d573d..a752674 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -1,5 +1,29 @@ #!/usr/bin/env python +""" +TODO: +- [ ] trouver les abstracts avec une ligne auteurs +- [x] ajouter une variable si l'abstract a été modifié ou pas + +List of abstracts with some issue: + +- 187, 661 (longuer de la ligne pas des majuscules) +- 1610 - trop de chiffres +- 1340, 1634, 1642 (ligne trop courte) +- 315, 873 biblio mais il y a l'année (ou DOI) +- 871 :P +- 1028 'authors acknowledge...' 
+- 1314 biblio sans année +- 'Abstract: ...' peut-être assouplir le critère (sans \n) +- 871 (table ronde), 1401 +-- Majuscules (peut-être pas tous): +- 978, 1161 formule chimique pleine de majuscules +- 1205 ligne d'abstract avec majuscules +- 1701 plein d'acronymes +- première ligne toute en majuscule + +""" + import re import pandas as pd from difflib import unified_diff @@ -21,6 +45,8 @@ def and_sections(re0, re1): unwanted_res = "^Lorem ipsum dolor sit amet" b_unwanted = df["abstract_text"].str.contains(unwanted_res) clean_df = df[~b_unwanted] + b_unwanted = df["abstract_text"].map(len).lt(100) + clean_df = df[~b_unwanted] # Section names to be removed section_names_res = [ @@ -29,6 +55,7 @@ def and_sections(re0, re1): r"discussions?", r"experiments?", r"experimental", + r"intro", r"introductions?", r"materials?", r"methods?", @@ -37,7 +64,9 @@ def and_sections(re0, re1): r"prospects?", r"objectives?", r"outlooks?", + r"overview?", r"results?", + r"key\ results?", r"significance", r"summary", ] @@ -48,7 +77,7 @@ def and_sections(re0, re1): section_numbering_re = r"[^\n\w]* (?: \d? [^\n\w]* )" # Remove invalid content from entries unclean_from_start_of_text_res = [ - r"(?: ^ | .* \n)" + section_numbering_re + r"abstract (?: [^\n\w,]* \n)", + r"(?: ^ | .* \n)" + section_numbering_re + r"abstract [^\n\w,]* [\n:]", ] unclean_res = [ r"^" + section_numbering_re + r"keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", @@ -59,7 +88,7 @@ def and_sections(re0, re1): + r") (?: \ * section)? (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ )", ] unclean_until_end_of_text_res = [ - r"^" + section_numbering_re + r"ac?knowled?ge?ments? :? .*", + r"^" + section_numbering_re + r"ac?knowled?ge?m?ents? :? .*", r"^" + section_numbering_re + r"r[eé]f[eé]rences? \s* :? .*", r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? 
5 and len(upper) / len(words) > 1 / 2 + + def remove_upper_and_email_lines(txt): + newtxt = [] + for line in txt.split("\n"): + if not count_upper(line) and not re.search(r"[\w-]+@[\w-]+\.[\w-]+", line): + newtxt.append(line) + return "\n".join(newtxt) + + + def check_clean(df_or_series, clean_abstract_text, start=0, interactive=True): """Compares two textual series showing diffs for each entry. From cc69b86c91cab988fb9aa3851736bdd8c25dd8a7 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 02:33:34 -0300 Subject: [PATCH 12/23] Rename 'authors' functions --- src/sashimi/clean.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index a752674..17e1c95 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -117,22 +117,22 @@ def and_sections(re0, re1): return clean_abstract_text # TODO: find use for these - def count_upper(line): + def long_and_mostly_titlecased(line): line = re.sub(r"[^\w\s]", "", line) words = line.split() upper = [x for x in words if x[0].isupper()] return len(words) > 5 and len(upper) / len(words) > 1 / 2 - def remove_upper_and_email_lines(txt): + def remove_lines_like_authors(txt): newtxt = [] for line in txt.split("\n"): - if not count_upper(line) and not re.search(r"[\w-]+@[\w-]+\.[\w-]+", line): + if not long_and_mostly_titlecased(line) and not re.search( + r"[\w-]+@[\w-]+\.[\w-]+", line + ): newtxt.append(line) return "\n".join(newtxt) - - def check_clean(df_or_series, clean_abstract_text, start=0, interactive=True): """Compares two textual series showing diffs for each entry. 
From 7bd3b547925e46aded94213d92d50fa7c4e6a39a Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 20:35:03 -0300 Subject: [PATCH 13/23] Parametrize drop in get_data --- src/sashimi/__init__.py | 19 +++++++++---------- src/sashimi/__main__.py | 4 +++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 339a61d..a6e7f11 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -39,25 +39,24 @@ """ -def get_data(file_paths, clean=True): +def get_data(file_paths, clean=True, drop=True): df = pd.concat( - [ - pd.read_csv(file, sep="\t", dtype=str) - for file in file_paths - ], + [pd.read_csv(file, sep="\t", dtype=str) for file in file_paths], ignore_index=True, ) print(f"Found {len(df)} entries.") - df = df.dropna(subset=["abstract_text"]) - print(f" Kept {len(df)} entries containing an abstract.") - + if drop: + df = df.dropna(subset=["abstract_text"]) + print(f" Kept {len(df)} entries containing an abstract.") + if clean: clean_abstract_text = clean_text(df) df["_abstract_text_is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) df["abstract_text"] = clean_abstract_text - df = df.dropna(subset=["abstract_text"]) - print(f" Kept {len(df)} entries after cleaning.") + if drop: + df = df.dropna(subset=["abstract_text"]) + print(f" Kept {len(df)} entries after cleaning.") df.index.name = "index" return df diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 6b1edd3..974d1a8 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -20,6 +20,7 @@ def main(): clean = get_data(ISGC_FILES, True) clean.to_csv(CLEAN_FILE) check_clean(unclean, clean["abstract_text"], START) + elif ACTION == "sashimi": import sashimi @@ -30,4 +31,5 @@ def main(): sashimi.plot(a) -main() +if __name__ == "__main__": + main() From 2b64216db26a6fdc0180ad063283c681eede2420 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 20:36:37 -0300 Subject: [PATCH 14/23] 
Complete author_affiliation functions --- src/sashimi/clean.py | 58 ++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index 17e1c95..a0727ef 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -116,22 +116,50 @@ def and_sections(re0, re1): return clean_abstract_text - # TODO: find use for these - def long_and_mostly_titlecased(line): - line = re.sub(r"[^\w\s]", "", line) - words = line.split() - upper = [x for x in words if x[0].isupper()] - return len(words) > 5 and len(upper) / len(words) > 1 / 2 - - def remove_lines_like_authors(txt): - newtxt = [] - for line in txt.split("\n"): - if not long_and_mostly_titlecased(line) and not re.search( - r"[\w-]+@[\w-]+\.[\w-]+", line - ): - newtxt.append(line) - return "\n".join(newtxt) +# TODO: find a use for these in clean_text()? +def is_author_affiliation(line, verbose=False): + author_words = r"and of at in de et und".split() + words_re = fr'\b(?:{"|".join(author_words)})\b' + line = re.sub(r"[-\.]", " ", line) + line = re.sub(r",", " , ", line) + line = re.sub(r"\d+", "", line) + line = re.sub(r"[^\w\s,]*|\b[a-z]\b", "", line) + words = line.split() + point_words = [ + x for x in words if x[0].isupper() or x == "," or re.match(words_re, x) + ] + if verbose: + print(point_words) + print(words) + return len(words) > 4 and len(point_words) / len(words) > 0.8 + + +def is_email_address(line): + return re.search(r"[\w-]+@[\w-]+\.[\w-]+", line) + + +def has_author_affiliation(txt): + split = int(len(txt) / 2) + txts = txt[:split].split("\n")[:-1] + for line in txts: + if is_author_affiliation(line) or is_email_address(line): + return True + return False + + +def remove_lines_like_authors(txt): + newtxt = [] + split = int(len(txt) / 2) + txts = txt[:split].split("\n") + tail = [txts.pop()] + for line in txts: + if not is_author_affiliation(line) and not is_email_address(line): + newtxt.append(line) + return 
"\n".join(newtxt + tail) + txt[split:] + + +## Interactive def check_clean(df_or_series, clean_abstract_text, start=0, interactive=True): """Compares two textual series showing diffs for each entry. From 8057b798ee65596706886a16ced5496aa68647c5 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sun, 20 Feb 2022 22:00:52 -0300 Subject: [PATCH 15/23] Refactor to new abstractology API --- src/sashimi/__main__.py | 8 ++--- src/sashimi/sashimi.py | 65 ++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 974d1a8..a1b33c8 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -22,13 +22,13 @@ def main(): check_clean(unclean, clean["abstract_text"], START) elif ACTION == "sashimi": - import sashimi + from . import sashimi try: - a = sashimi.load() + corpus = sashimi.load() except FileNotFoundError: - a = sashimi.bootstrap() - sashimi.plot(a) + corpus = sashimi.bootstrap() + sashimi.plot(corpus) if __name__ == "__main__": diff --git a/src/sashimi/sashimi.py b/src/sashimi/sashimi.py index c622552..0d96434 100644 --- a/src/sashimi/sashimi.py +++ b/src/sashimi/sashimi.py @@ -1,54 +1,51 @@ #!/usr/bin/env python -import graph_tool # not used, but avoids an import order bug +import graph_tool # not used, but avoids an import order bug # noqa import abstractology from . 
import get_data -graph_tool # just so the linter won't complain - - -def _load_data(a, get_data=get_data): - a.data = get_data() - corpus_name = "isgc_2015-2017.df" - if corpus_name not in a.loaded["data"]: - a.loaded["data"].append(corpus_name) - a.col_title = "abstract_title" - a.col_time = "year" - a.text_source = "abstract_text" - a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix +CORPUS_NAME = "isgc_2015-2017" def bootstrap(): - a = abstractology.Graphology() - _load_data(a) - a.load_domain_topic_model() - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.register_config() + corpus = abstractology.Graphology() + corpus.text_source = "abstract_text" + corpus.col_title = "abstract_title" + corpus.col_time = "year" + _load_data(corpus) - return a + corpus.load_domain_topic_model() + corpus.set_graph(extend={"prop": "year"}) + corpus.load_domain_chained_model() + corpus.set_graph(extend={"prop": "topic_1"}) + corpus.load_domain_chained_model() + + corpus.register_config() + return corpus def load(): - a = abstractology.Graphology( + corpus = abstractology.Graphology( config="auto_abstractology/reports/config.json", load_data=False, ) - _load_data(a) + _load_data(corpus) + return corpus + - return a +def plot(corpus): + corpus.load_domain_topic_model() + corpus.domain_map("ISGC 2015-2017") + corpus.set_graph(extend={"prop": "year"}) + corpus.load_domain_chained_model() + corpus.domain_map("ISGC 2015-2017", chained=True) -def plot(a): - a.load_domain_topic_model() - a.plot_sashimi("ISGC 2015-2017") + corpus.set_graph(extend={"prop": "topic_1"}) + corpus.load_domain_chained_model() + corpus.domain_map("ISGC 2015-2017", chained=True) - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 2015-2017", chained=True) - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 
2015-2017", chained=True) +def _load_data(corpus): + corpus.load_data(get_data(), "dataframe", name=CORPUS_NAME) + corpus.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix From d2b6198782cd3e482bdca9b666fd7ce72f5ae3f7 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 00:36:27 -0300 Subject: [PATCH 16/23] Add has_authors to cleaned data --- src/sashimi/__init__.py | 9 ++++++--- src/sashimi/clean.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index a6e7f11..85ede19 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -1,7 +1,9 @@ #! /usr/bin/env python import pandas as pd -from .clean import clean_text + +from . import clean as clean_m + """ NOTES @@ -50,8 +52,9 @@ def get_data(file_paths, clean=True, drop=True): print(f" Kept {len(df)} entries containing an abstract.") if clean: - clean_abstract_text = clean_text(df) - df["_abstract_text_is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) + df["abstract_text__has_authors"] = df["abstract_text"].map(clean_m.has_authors) + clean_abstract_text = clean_m.clean_text(df) + df["abstract_text__is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) df["abstract_text"] = clean_abstract_text if drop: diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index a0727ef..3c8ae4a 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -139,7 +139,7 @@ def is_email_address(line): return re.search(r"[\w-]+@[\w-]+\.[\w-]+", line) -def has_author_affiliation(txt): +def has_authors(txt): split = int(len(txt) / 2) txts = txt[:split].split("\n")[:-1] for line in txts: From 4b9a492faf74a7e4f593886773454098185b7acb Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 00:37:13 -0300 Subject: [PATCH 17/23] Improve and document script behavior --- src/sashimi/README.md | 24 ++++++++++++++++++++++++ src/sashimi/__init__.py | 1 + src/sashimi/__main__.py | 23 ++++++++++++++--------- 
src/sashimi/sashimi.py | 15 +++++++-------- 4 files changed, 46 insertions(+), 17 deletions(-) create mode 100644 src/sashimi/README.md diff --git a/src/sashimi/README.md b/src/sashimi/README.md new file mode 100644 index 0000000..0e7b09f --- /dev/null +++ b/src/sashimi/README.md @@ -0,0 +1,24 @@ +# Running as as script + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` + +cleans data and outputs `abstracts_contents_clean.csv.xz"`. + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` + +in addition, interactively displays what was cleaned. + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` + +starts displaying changes from index ${START}. + +## Requires Python modules `abstractology` and `graph-tool`: + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` + +- cleans data and outputs `abstracts_contents_clean.csv.xz"`. +- produces a domain-topic model of the data +- plots a domain-topic map +- produces chained models for time ('year') and conference topic ('topic_1') +- plots the respective domain-chained maps + diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 85ede19..05c5cf3 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -56,6 +56,7 @@ def get_data(file_paths, clean=True, drop=True): clean_abstract_text = clean_m.clean_text(df) df["abstract_text__is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) df["abstract_text"] = clean_abstract_text + print(f" Cleaned {df['abstract_text__is_cleaned'].sum()} entries.") if drop: df = df.dropna(subset=["abstract_text"]) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index a1b33c8..f6dac4c 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -5,29 +5,34 @@ # User input ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") +ACTION = sys.argv[2] if len(sys.argv) > 2 else None +START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 + +# Path definitions ISGC_FILES = [ ISGC_FILES_DIR / file 
for file in ["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"] ] -CLEAN_FILE = "abstracts_contents_clean.csv.xz" -ACTION = sys.argv[2] if len(sys.argv) > 2 else "check_clean" -START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 +CLEAN_FILE = Path("isgc_2015-2019_clean.csv.xz") def main(): + clean_data = get_data(ISGC_FILES, True) + clean_data.to_csv(CLEAN_FILE) + print(f"Saved {CLEAN_FILE}") + clean_data.name = CLEAN_FILE.name.split(".")[0] + if ACTION == "check_clean": - unclean = get_data(ISGC_FILES, False) - clean = get_data(ISGC_FILES, True) - clean.to_csv(CLEAN_FILE) - check_clean(unclean, clean["abstract_text"], START) + unclean_data = get_data(ISGC_FILES, False) + check_clean(unclean_data, clean_data["abstract_text"], START) elif ACTION == "sashimi": from . import sashimi try: - corpus = sashimi.load() + corpus = sashimi.load(clean_data) except FileNotFoundError: - corpus = sashimi.bootstrap() + corpus = sashimi.bootstrap(clean_data) sashimi.plot(corpus) diff --git a/src/sashimi/sashimi.py b/src/sashimi/sashimi.py index 0d96434..d397d2a 100644 --- a/src/sashimi/sashimi.py +++ b/src/sashimi/sashimi.py @@ -2,17 +2,16 @@ import graph_tool # not used, but avoids an import order bug # noqa import abstractology -from . 
import get_data -CORPUS_NAME = "isgc_2015-2017" +DEFAULT_CORPUS_NAME = "isgc" -def bootstrap(): +def bootstrap(data): corpus = abstractology.Graphology() corpus.text_source = "abstract_text" corpus.col_title = "abstract_title" corpus.col_time = "year" - _load_data(corpus) + _load_data(corpus, data) corpus.load_domain_topic_model() corpus.set_graph(extend={"prop": "year"}) @@ -24,12 +23,12 @@ def bootstrap(): return corpus -def load(): +def load(data): corpus = abstractology.Graphology( config="auto_abstractology/reports/config.json", load_data=False, ) - _load_data(corpus) + _load_data(corpus, data) return corpus @@ -46,6 +45,6 @@ def plot(corpus): corpus.domain_map("ISGC 2015-2017", chained=True) -def _load_data(corpus): - corpus.load_data(get_data(), "dataframe", name=CORPUS_NAME) +def _load_data(corpus, data): + corpus.load_data(data, "dataframe", name=getattr(data, 'name', DEFAULT_CORPUS_NAME)) corpus.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix From 00d68bfb1ede5495ecbf030b5e30840eb7e4437b Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 01:07:42 -0300 Subject: [PATCH 18/23] Update README --- src/sashimi/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sashimi/README.md b/src/sashimi/README.md index 0e7b09f..bcde2b8 100644 --- a/src/sashimi/README.md +++ b/src/sashimi/README.md @@ -2,7 +2,7 @@ `python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` -cleans data and outputs `abstracts_contents_clean.csv.xz"`. +cleans data and outputs `isgc_2015-2019_clean.csv.xz`. `python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` @@ -16,7 +16,7 @@ starts displaying changes from index ${START}. `python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` -- cleans data and outputs `abstracts_contents_clean.csv.xz"`. +- cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data - plots a domain-topic map - produces chained models for time ('year') and conference topic ('topic_1') From cbbce6f9bad1dffa1cbd48f812c47a3d05a94600 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 01:45:27 -0300 Subject: [PATCH 19/23] Fix README --- src/sashimi/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/sashimi/README.md b/src/sashimi/README.md index bcde2b8..e6631e8 100644 --- a/src/sashimi/README.md +++ b/src/sashimi/README.md @@ -1,20 +1,28 @@ -# Running as as script +# Running the module as as script -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` +First, add the folder containing this module to your $PYTHON_PATH: + +`PATH_TO_PARENT_DIR=/home/user/projects/isgc-congress/src` + +`export PYTHONPATH=$PYTHONPATH:${PATH_TO_PARENT_DIR}` + +Then... + +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` cleans data and outputs `isgc_2015-2019_clean.csv.xz`. -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` in addition, interactively displays what was cleaned. -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` starts displaying changes from index ${START}. ## Requires Python modules `abstractology` and `graph-tool`: -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` - cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data From 410ab76efc3d308b6bd16658809819ccc33069a0 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 01:51:35 -0300 Subject: [PATCH 20/23] Rename to isgc_sashimi and fix README --- src/{sashimi => isgc_sashimi}/README.md | 8 ++++---- src/{sashimi => isgc_sashimi}/__init__.py | 0 src/{sashimi => isgc_sashimi}/__main__.py | 0 src/{sashimi => isgc_sashimi}/clean.py | 0 src/{sashimi => isgc_sashimi}/sashimi.py | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename src/{sashimi => isgc_sashimi}/README.md (72%) rename src/{sashimi => isgc_sashimi}/__init__.py (100%) rename src/{sashimi => isgc_sashimi}/__main__.py (100%) rename src/{sashimi => isgc_sashimi}/clean.py (100%) rename src/{sashimi => isgc_sashimi}/sashimi.py (100%) diff --git a/src/sashimi/README.md b/src/isgc_sashimi/README.md similarity index 72% rename from src/sashimi/README.md rename to src/isgc_sashimi/README.md index e6631e8..fa063c6 100644 --- a/src/sashimi/README.md +++ b/src/isgc_sashimi/README.md @@ -8,21 +8,21 @@ First, add the folder containing this module to your $PYTHON_PATH: Then... -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR}` cleans data and outputs `isgc_2015-2019_clean.csv.xz`. -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean` in addition, interactively displays what was cleaned. -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean ${START}` starts displaying changes from index ${START}. ## Requires Python modules `abstractology` and `graph-tool`: -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR} sashimi` - cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data diff --git a/src/sashimi/__init__.py b/src/isgc_sashimi/__init__.py similarity index 100% rename from src/sashimi/__init__.py rename to src/isgc_sashimi/__init__.py diff --git a/src/sashimi/__main__.py b/src/isgc_sashimi/__main__.py similarity index 100% rename from src/sashimi/__main__.py rename to src/isgc_sashimi/__main__.py diff --git a/src/sashimi/clean.py b/src/isgc_sashimi/clean.py similarity index 100% rename from src/sashimi/clean.py rename to src/isgc_sashimi/clean.py diff --git a/src/sashimi/sashimi.py b/src/isgc_sashimi/sashimi.py similarity index 100% rename from src/sashimi/sashimi.py rename to src/isgc_sashimi/sashimi.py From ad5a4efbba250bcbf8af750646cad9c9fd1b4f38 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 02:35:55 -0300 Subject: [PATCH 21/23] Keep original and cleaned text, update README with new columns --- src/isgc_sashimi/README.md | 6 +++++- src/isgc_sashimi/__init__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/isgc_sashimi/README.md b/src/isgc_sashimi/README.md index fa063c6..53b41b3 100644 --- a/src/isgc_sashimi/README.md +++ b/src/isgc_sashimi/README.md @@ -10,7 +10,11 @@ Then... `python -m isgc_sashimi ${PATH_TO_DATA_DIR}` -cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
+cleans data and outputs `isgc_2015-2019_clean.csv.xz`, with new columns: + +- abstract_text__cleaned +- abstract_text__is_cleaned +- abstract_text__has_authors `python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean` diff --git a/src/isgc_sashimi/__init__.py b/src/isgc_sashimi/__init__.py index 05c5cf3..dd2b8b2 100644 --- a/src/isgc_sashimi/__init__.py +++ b/src/isgc_sashimi/__init__.py @@ -55,7 +55,7 @@ def get_data(file_paths, clean=True, drop=True): df["abstract_text__has_authors"] = df["abstract_text"].map(clean_m.has_authors) clean_abstract_text = clean_m.clean_text(df) df["abstract_text__is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) - df["abstract_text"] = clean_abstract_text + df["abstract_text__cleaned"] = clean_abstract_text print(f" Cleaned {df['abstract_text__is_cleaned'].sum()} entries.") if drop: From 7bc22cd8cfa77c432cc627f197561354df7f77d2 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 02:41:32 -0300 Subject: [PATCH 22/23] Update README --- src/isgc_sashimi/README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/isgc_sashimi/README.md b/src/isgc_sashimi/README.md index 53b41b3..a5a340b 100644 --- a/src/isgc_sashimi/README.md +++ b/src/isgc_sashimi/README.md @@ -24,13 +24,16 @@ in addition, interactively displays what was cleaned. starts displaying changes from index ${START}. -## Requires Python modules `abstractology` and `graph-tool`: +## Domain-topic modeling (SASHIMI) + +Requires Python modules `abstractology` and `graph-tool`. `python -m isgc_sashimi ${PATH_TO_DATA_DIR} sashimi` +will do the following, in sequence: + - cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data - plots a domain-topic map - produces chained models for time ('year') and conference topic ('topic_1') - plots the respective domain-chained maps - From 1313b08ac01afa13d25d100407a948ff57685953 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Tue, 20 Sep 2022 00:20:56 +0200 Subject: [PATCH 23/23] Update to abstractology: use set_chain --- src/isgc_sashimi/sashimi.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/isgc_sashimi/sashimi.py b/src/isgc_sashimi/sashimi.py index d397d2a..3ce7e2e 100644 --- a/src/isgc_sashimi/sashimi.py +++ b/src/isgc_sashimi/sashimi.py @@ -14,9 +14,9 @@ def bootstrap(data): _load_data(corpus, data) corpus.load_domain_topic_model() - corpus.set_graph(extend={"prop": "year"}) + corpus.set_chain("year") corpus.load_domain_chained_model() - corpus.set_graph(extend={"prop": "topic_1"}) + corpus.set_chain("topic_1") corpus.load_domain_chained_model() corpus.register_config() @@ -36,15 +36,15 @@ def plot(corpus): corpus.load_domain_topic_model() corpus.domain_map("ISGC 2015-2017") - corpus.set_graph(extend={"prop": "year"}) + corpus.set_chain("year") corpus.load_domain_chained_model() corpus.domain_map("ISGC 2015-2017", chained=True) - corpus.set_graph(extend={"prop": "topic_1"}) + corpus.set_chain("topic_1") corpus.load_domain_chained_model() corpus.domain_map("ISGC 2015-2017", chained=True) def _load_data(corpus, data): - corpus.load_data(data, "dataframe", name=getattr(data, 'name', DEFAULT_CORPUS_NAME)) + corpus.load_data(data, "dataframe", name=getattr(data, "name", DEFAULT_CORPUS_NAME)) corpus.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix