From 3d0abe5c65d27b502cb1e35bfa5987ad43a3e526 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Thu, 27 May 2021 18:12:40 +0200 Subject: [PATCH 01/23] Add sashimi module for domain topic modeling --- src/sashimi/__main__.py | 75 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 src/sashimi/__main__.py diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py new file mode 100644 index 0000000..fec14f0 --- /dev/null +++ b/src/sashimi/__main__.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python + +import abstractology +import pandas as pd + +# not used, but avoids an import order bug +import graph_tool + +graph_tool + + +def reload(): + """Live reload while developing""" + import importlib + + importlib.reload(abstractology) + + +def get_data(): + df15 = pd.read_csv("abstracts_contents_2015.tsv", sep="\t") + df15 = df15.dropna(subset=["abstract_text"]) + df17 = pd.read_csv("abstracts_contents_1719.tsv", sep="\t") + df17 = df17.dropna(subset=["abstract_text"]) + df = df15.append(df17) + df = df.reset_index() + + return df + + +def _load_data(a, get_data=get_data): + a.data = get_data() + a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix + corpus_name = "isgc_2015-2017.df" + if corpus_name not in a.loaded["data"]: + a.loaded["data"].append(corpus_name) + a.col_title = "abstract_title" + a.col_time = "year" + a.text_source = "abstract_text" + + +def bootstrap(): + a = abstractology.Graphology() + _load_data(a) + a.load_domain_topic_model() + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + a.register_config() + + return a + + +def load(): + a = abstractology.Graphology("auto_abstractology/reports/config.json") + _load_data(a) + + return a + + +def plot(a): + a.plot_sashimi("ISGC 2015-2017") + a.plot_sashimi("ISGC 2015-2017", chained=True) + + +def main(): + try: + a = load() + except FileNotFoundError: + a = bootstrap() + plot(a) + a.data.columns + reload() + + +if __name__ == "__main__": + 
main() From b54ac428bde35c272d4be8b85462951312a55105 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 14 Jun 2021 00:30:38 +0200 Subject: [PATCH 02/23] sashimi: domain-chained model with topic_1 --- src/sashimi/__main__.py | 55 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index fec14f0..6c60bfc 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -1,13 +1,47 @@ #! /usr/bin/env python +import graph_tool # not used, but avoids an import order bug import abstractology import pandas as pd -# not used, but avoids an import order bug -import graph_tool - graph_tool +""" NOTES + +- Data columns: + ['abstract_text', + 'abstract_title', + 'bibliography', + 'cancelled', + 'code', + 'figure_legend_1', + 'figure_legend_2', + 'figure_title_1', + 'figure_title_2', + 'final_status', + 'id', + 'index', + 'is_complete', + 'keyword_1', + 'keyword_2', + 'keyword_3', + 'keyword_4', + 'keywords', + 'legend_1', + 'legend_2', + 'not_to_remind', + 'program_day', + 'program_session', + 'publish_onsite', + 'relance_register', + 'topic_1', + 'topic_2', + 'topic_3', + 'user_id', + 'validate', + 'year'] +""" + def reload(): """Live reload while developing""" @@ -44,20 +78,33 @@ def bootstrap(): a.load_domain_topic_model() a.set_graph(extend={"prop": "year"}) a.load_domain_chained_model() + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() a.register_config() return a def load(): - a = abstractology.Graphology("auto_abstractology/reports/config.json") + a = abstractology.Graphology( + config="auto_abstractology/reports/config.json", + load_data=False, + ) _load_data(a) return a def plot(a): + a.load_domain_topic_model() a.plot_sashimi("ISGC 2015-2017") + + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + a.plot_sashimi("ISGC 2015-2017", chained=True) + + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() 
a.plot_sashimi("ISGC 2015-2017", chained=True) From ffb69b504a948fd0cbc13d5fe424b82ef8e8b78d Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Fri, 27 Aug 2021 02:54:50 +0200 Subject: [PATCH 03/23] Add .clean.clean_text(): remove undesirable content from abstracts --- src/sashimi/__main__.py | 22 +++------ src/sashimi/clean.py | 105 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 src/sashimi/clean.py diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 6c60bfc..6486311 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -3,8 +3,9 @@ import graph_tool # not used, but avoids an import order bug import abstractology import pandas as pd +from .clean import clean_text -graph_tool +graph_tool # just so the linter won't complain """ NOTES @@ -43,14 +44,7 @@ """ -def reload(): - """Live reload while developing""" - import importlib - - importlib.reload(abstractology) - - -def get_data(): +def get_data(clean=True): df15 = pd.read_csv("abstracts_contents_2015.tsv", sep="\t") df15 = df15.dropna(subset=["abstract_text"]) df17 = pd.read_csv("abstracts_contents_1719.tsv", sep="\t") @@ -58,18 +52,21 @@ def get_data(): df = df15.append(df17) df = df.reset_index() + if clean: + df["abstract_text"] = clean_text(df) + return df def _load_data(a, get_data=get_data): a.data = get_data() - a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix corpus_name = "isgc_2015-2017.df" if corpus_name not in a.loaded["data"]: a.loaded["data"].append(corpus_name) a.col_title = "abstract_title" a.col_time = "year" a.text_source = "abstract_text" + a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix def bootstrap(): @@ -114,9 +111,6 @@ def main(): except FileNotFoundError: a = bootstrap() plot(a) - a.data.columns - reload() -if __name__ == "__main__": - main() +main() diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py new file mode 100644 index 0000000..8f17df7 --- 
/dev/null +++ b/src/sashimi/clean.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +import re +from difflib import unified_diff +from itertools import permutations + +CLEAN_REFLAGS = re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE + + +def clean_text(df): + """ + Known untreated entries: + 964: "The research was co-financed" + """ + + def and_sections(re0, re1): + return re0 + r"\s* (?: and | & ) \s*" + re1 + + # Remove entries with no valid content + unwanted_res = "^Lorem ipsum dolor sit amet" + b_unwanted = df["abstract_text"].str.contains(unwanted_res) + clean_df = df[~b_unwanted] + + # Section names to be removed + section_names_res = [ + r"backgrounds?", + r"conclusions?", + r"discussions?", + r"experiments?", + r"experimental", + r"introductions?", + r"materials?", + r"methods?", + r"perspectives?", + r"prospects?", + r"objectives?", + r"outlooks?", + r"results?", + r"significance", + r"summary", + ] + section_names_re = r"|".join( + [and_sections(x, y) for x, y in permutations(section_names_res, 2)] + + section_names_res + ) + + # Remove invalid content from entries + unclean_from_start_of_text_res = [ + r".* \n abstract (?: [^\n\w,]* \n)", + ] + unclean_res = [ + r"^ keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", + r"(^ (?:" + section_names_re + r") (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ ))", + ] + unclean_until_end_of_text_res = [ + r"^ acknowledge?ments? :? .*", + r"^ r[eé]f[eé]rences? \s* :? \s* \n .*", + r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? Date: Sat, 28 Aug 2021 02:07:30 +0200 Subject: [PATCH 04/23] Add parameters to package execution for data dir and action --- src/sashimi/__main__.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 6486311..5692843 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -1,9 +1,11 @@ #! 
/usr/bin/env python import graph_tool # not used, but avoids an import order bug +import sys import abstractology import pandas as pd -from .clean import clean_text +from .clean import clean_text, check_clean +from pathlib import Path graph_tool # just so the linter won't complain @@ -44,16 +46,22 @@ """ +ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") +ISGC_2015_FILE = ISGC_FILES_DIR / "abstracts_contents_2015.tsv" +ISGC_2017_FILE = ISGC_FILES_DIR / "abstracts_contents_1719.tsv" + + def get_data(clean=True): - df15 = pd.read_csv("abstracts_contents_2015.tsv", sep="\t") + df15 = pd.read_csv(ISGC_2015_FILE, sep="\t") df15 = df15.dropna(subset=["abstract_text"]) - df17 = pd.read_csv("abstracts_contents_1719.tsv", sep="\t") + df17 = pd.read_csv(ISGC_2017_FILE, sep="\t") df17 = df17.dropna(subset=["abstract_text"]) df = df15.append(df17) df = df.reset_index() if clean: df["abstract_text"] = clean_text(df) + df.dropna(subset=["abstract_text"], inplace=True) return df @@ -106,11 +114,17 @@ def plot(a): def main(): - try: - a = load() - except FileNotFoundError: - a = bootstrap() - plot(a) + action = sys.argv[2] if len(sys.argv) > 2 else "check_clean" + if action == "check_clean": + unclean = get_data(False) + clean = get_data(True) + check_clean(unclean.loc[clean.index], clean["abstract_text"]) + elif action == "sashimi": + try: + a = load() + except FileNotFoundError: + a = bootstrap() + plot(a) main() From 1d5e7ffc7251b8587c9f142c3fb716dc308b5516 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 03:48:23 +0200 Subject: [PATCH 05/23] Rename main as init. 
--- src/sashimi/{__main__.py => __init__.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/sashimi/{__main__.py => __init__.py} (100%) diff --git a/src/sashimi/__main__.py b/src/sashimi/__init__.py similarity index 100% rename from src/sashimi/__main__.py rename to src/sashimi/__init__.py From 0c5e9d765881ce6f41e433fd4cdc1ab1148670ec Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 03:48:44 +0200 Subject: [PATCH 06/23] Improve clean_text and check_clean. Document check_clean. --- src/sashimi/clean.py | 48 +++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index 8f17df7..d0bca5d 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import re +import pandas as pd from difflib import unified_diff from itertools import permutations @@ -31,6 +32,7 @@ def and_sections(re0, re1): r"introductions?", r"materials?", r"methods?", + r"motivation?", r"perspectives?", r"prospects?", r"objectives?", @@ -43,18 +45,22 @@ def and_sections(re0, re1): [and_sections(x, y) for x, y in permutations(section_names_res, 2)] + section_names_res ) - + section_numbering_re = r"[^\n\w]* (?: \d? [^\n\w]* )" # Remove invalid content from entries unclean_from_start_of_text_res = [ - r".* \n abstract (?: [^\n\w,]* \n)", + r"(?: ^ | .* \n)" + section_numbering_re + r"abstract (?: [^\n\w,]* \n)", ] unclean_res = [ - r"^ keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", - r"(^ (?:" + section_names_re + r") (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ ))", + r"^" + section_numbering_re + r"keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", + r"^" + + section_numbering_re + + r"(?:" + + section_names_re + + r") (?: \ * section)? (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ )", ] unclean_until_end_of_text_res = [ - r"^ acknowledge?ments? :? .*", - r"^ r[eé]f[eé]rences? \s* :? \s* \n .*", + r"^" + section_numbering_re + r"acknowledge?ments? :? 
.*", + r"^" + section_numbering_re + r"r[eé]f[eé]rences? \s* :? .*", r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? Date: Mon, 30 Aug 2021 03:51:10 +0200 Subject: [PATCH 07/23] Refactor for modularity. --- src/sashimi/__init__.py | 99 ++++++----------------------------------- src/sashimi/__main__.py | 30 +++++++++++++ src/sashimi/sashimi.py | 54 ++++++++++++++++++++++ 3 files changed, 98 insertions(+), 85 deletions(-) create mode 100644 src/sashimi/__main__.py create mode 100644 src/sashimi/sashimi.py diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 5692843..1fb0675 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -1,13 +1,7 @@ #! /usr/bin/env python -import graph_tool # not used, but avoids an import order bug -import sys -import abstractology import pandas as pd -from .clean import clean_text, check_clean -from pathlib import Path - -graph_tool # just so the linter won't complain +from .clean import clean_text """ NOTES @@ -23,7 +17,6 @@ 'figure_title_2', 'final_status', 'id', - 'index', 'is_complete', 'keyword_1', 'keyword_2', @@ -46,85 +39,21 @@ """ -ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") -ISGC_2015_FILE = ISGC_FILES_DIR / "abstracts_contents_2015.tsv" -ISGC_2017_FILE = ISGC_FILES_DIR / "abstracts_contents_1719.tsv" - - -def get_data(clean=True): - df15 = pd.read_csv(ISGC_2015_FILE, sep="\t") - df15 = df15.dropna(subset=["abstract_text"]) - df17 = pd.read_csv(ISGC_2017_FILE, sep="\t") - df17 = df17.dropna(subset=["abstract_text"]) - df = df15.append(df17) - df = df.reset_index() - +def get_data(file_paths, clean=True): + df = pd.concat( + [ + pd.read_csv(file, sep="\t") + for file in file_paths + ], + ignore_index=True, + ) + print(f"Found {len(df)} entries.") + df = df.dropna(subset=["abstract_text"]) + print(f" Kept {len(df)} entries containing an abstract.") + if clean: df["abstract_text"] = clean_text(df) df.dropna(subset=["abstract_text"], inplace=True) + print(f" Kept {len(df)} entries after 
cleaning.") return df - - -def _load_data(a, get_data=get_data): - a.data = get_data() - corpus_name = "isgc_2015-2017.df" - if corpus_name not in a.loaded["data"]: - a.loaded["data"].append(corpus_name) - a.col_title = "abstract_title" - a.col_time = "year" - a.text_source = "abstract_text" - a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix - - -def bootstrap(): - a = abstractology.Graphology() - _load_data(a) - a.load_domain_topic_model() - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.register_config() - - return a - - -def load(): - a = abstractology.Graphology( - config="auto_abstractology/reports/config.json", - load_data=False, - ) - _load_data(a) - - return a - - -def plot(a): - a.load_domain_topic_model() - a.plot_sashimi("ISGC 2015-2017") - - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 2015-2017", chained=True) - - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 2015-2017", chained=True) - - -def main(): - action = sys.argv[2] if len(sys.argv) > 2 else "check_clean" - if action == "check_clean": - unclean = get_data(False) - clean = get_data(True) - check_clean(unclean.loc[clean.index], clean["abstract_text"]) - elif action == "sashimi": - try: - a = load() - except FileNotFoundError: - a = bootstrap() - plot(a) - - -main() diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py new file mode 100644 index 0000000..655b8f1 --- /dev/null +++ b/src/sashimi/__main__.py @@ -0,0 +1,30 @@ +import sys +from pathlib import Path +from . 
import get_data +from .clean import check_clean + +# User input +ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") +ISGC_FILES = [ + ISGC_FILES_DIR / file + for file in ["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"] +] +ACTION = sys.argv[2] if len(sys.argv) > 2 else "check_clean" +START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 + + +def main(): + if ACTION == "check_clean": + unclean = get_data(ISGC_FILES, False) + clean = get_data(ISGC_FILES, True) + check_clean(unclean, clean["abstract_text"], START) + elif ACTION == "sashimi": + import sashimi + try: + a = sashimi.load() + except FileNotFoundError: + a = sashimi.bootstrap() + sashimi.plot(a) + + +main() diff --git a/src/sashimi/sashimi.py b/src/sashimi/sashimi.py new file mode 100644 index 0000000..c622552 --- /dev/null +++ b/src/sashimi/sashimi.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import graph_tool # not used, but avoids an import order bug +import abstractology +from . import get_data + +graph_tool # just so the linter won't complain + + +def _load_data(a, get_data=get_data): + a.data = get_data() + corpus_name = "isgc_2015-2017.df" + if corpus_name not in a.loaded["data"]: + a.loaded["data"].append(corpus_name) + a.col_title = "abstract_title" + a.col_time = "year" + a.text_source = "abstract_text" + a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix + + +def bootstrap(): + a = abstractology.Graphology() + _load_data(a) + a.load_domain_topic_model() + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() + a.register_config() + + return a + + +def load(): + a = abstractology.Graphology( + config="auto_abstractology/reports/config.json", + load_data=False, + ) + _load_data(a) + + return a + + +def plot(a): + a.load_domain_topic_model() + a.plot_sashimi("ISGC 2015-2017") + + a.set_graph(extend={"prop": "year"}) + a.load_domain_chained_model() + 
a.plot_sashimi("ISGC 2015-2017", chained=True) + + a.set_graph(extend={"prop": "topic_1"}) + a.load_domain_chained_model() + a.plot_sashimi("ISGC 2015-2017", chained=True) From b2f44b25bfc7e211ad2801b3359a7ca2fd73fc77 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 05:07:48 +0200 Subject: [PATCH 08/23] Improve read/write data. --- src/sashimi/__init__.py | 3 ++- src/sashimi/__main__.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 1fb0675..ba2ea8c 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -42,7 +42,7 @@ def get_data(file_paths, clean=True): df = pd.concat( [ - pd.read_csv(file, sep="\t") + pd.read_csv(file, sep="\t", dtype=str) for file in file_paths ], ignore_index=True, @@ -56,4 +56,5 @@ def get_data(file_paths, clean=True): df.dropna(subset=["abstract_text"], inplace=True) print(f" Kept {len(df)} entries after cleaning.") + df.index.name = "index" return df diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 655b8f1..385b230 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -9,6 +9,7 @@ ISGC_FILES_DIR / file for file in ["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"] ] +CLEAN_FILE = "abstracts_contents_clean.csv.xz" ACTION = sys.argv[2] if len(sys.argv) > 2 else "check_clean" START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 @@ -17,6 +18,7 @@ def main(): if ACTION == "check_clean": unclean = get_data(ISGC_FILES, False) clean = get_data(ISGC_FILES, True) + clean.to_csv(CLEAN_FILE) check_clean(unclean, clean["abstract_text"], START) elif ACTION == "sashimi": import sashimi From fc2e6d07773d7d0f29c23770276347649ed6d2a8 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 30 Aug 2021 07:56:45 +0200 Subject: [PATCH 09/23] Clean more funding info, with manually identified exceptions. 
--- src/sashimi/__init__.py | 2 +- src/sashimi/clean.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index ba2ea8c..1cad09b 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -53,7 +53,7 @@ def get_data(file_paths, clean=True): if clean: df["abstract_text"] = clean_text(df) - df.dropna(subset=["abstract_text"], inplace=True) + df = df.dropna(subset=["abstract_text"]) print(f" Kept {len(df)} entries after cleaning.") df.index.name = "index" diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index d0bca5d..e9d573d 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -11,7 +11,7 @@ def clean_text(df): """ Known untreated entries: - 964: "The research was co-financed" + - some title plus authors headers with no clear separation """ def and_sections(re0, re1): @@ -59,7 +59,7 @@ def and_sections(re0, re1): + r") (?: \ * section)? (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ )", ] unclean_until_end_of_text_res = [ - r"^" + section_numbering_re + r"acknowledge?ments? :? .*", + r"^" + section_numbering_re + r"ac?knowled?ge?ments? :? .*", r"^" + section_numbering_re + r"r[eé]f[eé]rences? \s* :? .*", r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? 
Date: Sat, 19 Feb 2022 02:18:29 -0300 Subject: [PATCH 10/23] Add '_abstract_text_is_cleaned' column to singal cleaned --- src/sashimi/__init__.py | 5 ++++- src/sashimi/__main__.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 1cad09b..339a61d 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -52,7 +52,10 @@ def get_data(file_paths, clean=True): print(f" Kept {len(df)} entries containing an abstract.") if clean: - df["abstract_text"] = clean_text(df) + clean_abstract_text = clean_text(df) + df["_abstract_text_is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) + df["abstract_text"] = clean_abstract_text + df = df.dropna(subset=["abstract_text"]) print(f" Kept {len(df)} entries after cleaning.") diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 385b230..6b1edd3 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -22,6 +22,7 @@ def main(): check_clean(unclean, clean["abstract_text"], START) elif ACTION == "sashimi": import sashimi + try: a = sashimi.load() except FileNotFoundError: From 087c3ff0d124312646cae33281a416cf2c8ec43b Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 02:19:07 -0300 Subject: [PATCH 11/23] Improvements to cleaning and notes --- src/sashimi/clean.py | 49 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index e9d573d..a752674 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -1,5 +1,29 @@ #!/usr/bin/env python +""" +TODO: +- [ ] trouver les abstracts avec une ligne auteurs +- [x] ajouter une variable si l'abstract a été modifié ou pas + +List of abstracts with some issue: + +- 187, 661 (longuer de la ligne pas des majuscules) +- 1610 - trop de chiffres +- 1340, 1634, 1642 (ligne trop courte) +- 315, 873 biblio mais il y a l'année (ou DOI) +- 871 :P +- 1028 'authors acknowledge...' 
+- 1314 biblio sans année +- 'Abstract: ...' peut-être assouplir le critère (sans \n) +- 871 (table ronde), 1401 +-- Majuscules (peut-être pas tous): +- 978, 1161 formule chimique pleine de majuscules +- 1205 ligne d'abstract avec majuscules +- 1701 plein d'acronymes +- première ligne toute en majuscule + +""" + import re import pandas as pd from difflib import unified_diff @@ -21,6 +45,8 @@ def and_sections(re0, re1): unwanted_res = "^Lorem ipsum dolor sit amet" b_unwanted = df["abstract_text"].str.contains(unwanted_res) clean_df = df[~b_unwanted] + b_unwanted = df["abstract_text"].map(len).lt(100) + clean_df = df[~b_unwanted] # Section names to be removed section_names_res = [ @@ -29,6 +55,7 @@ def and_sections(re0, re1): r"discussions?", r"experiments?", r"experimental", + r"intro", r"introductions?", r"materials?", r"methods?", @@ -37,7 +64,9 @@ def and_sections(re0, re1): r"prospects?", r"objectives?", r"outlooks?", + r"overview?", r"results?", + r"key\ results?", r"significance", r"summary", ] @@ -48,7 +77,7 @@ def and_sections(re0, re1): section_numbering_re = r"[^\n\w]* (?: \d? [^\n\w]* )" # Remove invalid content from entries unclean_from_start_of_text_res = [ - r"(?: ^ | .* \n)" + section_numbering_re + r"abstract (?: [^\n\w,]* \n)", + r"(?: ^ | .* \n)" + section_numbering_re + r"abstract [^\n\w,]* [\n:]", ] unclean_res = [ r"^" + section_numbering_re + r"keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*", @@ -59,7 +88,7 @@ def and_sections(re0, re1): + r") (?: \ * section)? (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ )", ] unclean_until_end_of_text_res = [ - r"^" + section_numbering_re + r"ac?knowled?ge?ments? :? .*", + r"^" + section_numbering_re + r"ac?knowled?ge?m?ents? :? .*", r"^" + section_numbering_re + r"r[eé]f[eé]rences? \s* :? .*", r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (? 
5 and len(upper) / len(words) > 1 / 2 + + def remove_upper_and_email_lines(txt): + newtxt = [] + for line in txt.split("\n"): + if not count_upper(line) and not re.search(r"[\w-]+@[\w-]+\.[\w-]+", line): + newtxt.append(line) + return "\n".join(newtxt) + + + def check_clean(df_or_series, clean_abstract_text, start=0, interactive=True): """Compares two textual series showing diffs for each entry. From cc69b86c91cab988fb9aa3851736bdd8c25dd8a7 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 02:33:34 -0300 Subject: [PATCH 12/23] Rename 'authors' functions --- src/sashimi/clean.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index a752674..17e1c95 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -117,22 +117,22 @@ def and_sections(re0, re1): return clean_abstract_text # TODO: find use for these - def count_upper(line): + def long_and_mostly_titlecased(line): line = re.sub(r"[^\w\s]", "", line) words = line.split() upper = [x for x in words if x[0].isupper()] return len(words) > 5 and len(upper) / len(words) > 1 / 2 - def remove_upper_and_email_lines(txt): + def remove_lines_like_authors(txt): newtxt = [] for line in txt.split("\n"): - if not count_upper(line) and not re.search(r"[\w-]+@[\w-]+\.[\w-]+", line): + if not long_and_mostly_titlecased(line) and not re.search( + r"[\w-]+@[\w-]+\.[\w-]+", line + ): newtxt.append(line) return "\n".join(newtxt) - - def check_clean(df_or_series, clean_abstract_text, start=0, interactive=True): """Compares two textual series showing diffs for each entry. 
From 7bd3b547925e46aded94213d92d50fa7c4e6a39a Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 20:35:03 -0300 Subject: [PATCH 13/23] Parametrize drop in get_data --- src/sashimi/__init__.py | 19 +++++++++---------- src/sashimi/__main__.py | 4 +++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 339a61d..a6e7f11 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -39,25 +39,24 @@ """ -def get_data(file_paths, clean=True): +def get_data(file_paths, clean=True, drop=True): df = pd.concat( - [ - pd.read_csv(file, sep="\t", dtype=str) - for file in file_paths - ], + [pd.read_csv(file, sep="\t", dtype=str) for file in file_paths], ignore_index=True, ) print(f"Found {len(df)} entries.") - df = df.dropna(subset=["abstract_text"]) - print(f" Kept {len(df)} entries containing an abstract.") - + if drop: + df = df.dropna(subset=["abstract_text"]) + print(f" Kept {len(df)} entries containing an abstract.") + if clean: clean_abstract_text = clean_text(df) df["_abstract_text_is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) df["abstract_text"] = clean_abstract_text - df = df.dropna(subset=["abstract_text"]) - print(f" Kept {len(df)} entries after cleaning.") + if drop: + df = df.dropna(subset=["abstract_text"]) + print(f" Kept {len(df)} entries after cleaning.") df.index.name = "index" return df diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 6b1edd3..974d1a8 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -20,6 +20,7 @@ def main(): clean = get_data(ISGC_FILES, True) clean.to_csv(CLEAN_FILE) check_clean(unclean, clean["abstract_text"], START) + elif ACTION == "sashimi": import sashimi @@ -30,4 +31,5 @@ def main(): sashimi.plot(a) -main() +if __name__ == "__main__": + main() From 2b64216db26a6fdc0180ad063283c681eede2420 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sat, 19 Feb 2022 20:36:37 -0300 Subject: [PATCH 14/23] 
Complete author_affiliation functions --- src/sashimi/clean.py | 58 ++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index 17e1c95..a0727ef 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -116,22 +116,50 @@ def and_sections(re0, re1): return clean_abstract_text - # TODO: find use for these - def long_and_mostly_titlecased(line): - line = re.sub(r"[^\w\s]", "", line) - words = line.split() - upper = [x for x in words if x[0].isupper()] - return len(words) > 5 and len(upper) / len(words) > 1 / 2 - - def remove_lines_like_authors(txt): - newtxt = [] - for line in txt.split("\n"): - if not long_and_mostly_titlecased(line) and not re.search( - r"[\w-]+@[\w-]+\.[\w-]+", line - ): - newtxt.append(line) - return "\n".join(newtxt) +# TODO: find a use for these in clean_text()? +def is_author_affiliation(line, verbose=False): + author_words = r"and of at in de et und".split() + words_re = fr'\b(?:{"|".join(author_words)})\b' + line = re.sub(r"[-\.]", " ", line) + line = re.sub(r",", " , ", line) + line = re.sub(r"\d+", "", line) + line = re.sub(r"[^\w\s,]*|\b[a-z]\b", "", line) + words = line.split() + point_words = [ + x for x in words if x[0].isupper() or x == "," or re.match(words_re, x) + ] + if verbose: + print(point_words) + print(words) + return len(words) > 4 and len(point_words) / len(words) > 0.8 + + +def is_email_address(line): + return re.search(r"[\w-]+@[\w-]+\.[\w-]+", line) + + +def has_author_affiliation(txt): + split = int(len(txt) / 2) + txts = txt[:split].split("\n")[:-1] + for line in txts: + if is_author_affiliation(line) or is_email_address(line): + return True + return False + + +def remove_lines_like_authors(txt): + newtxt = [] + split = int(len(txt) / 2) + txts = txt[:split].split("\n") + tail = [txts.pop()] + for line in txts: + if not is_author_affiliation(line) and not is_email_address(line): + newtxt.append(line) + return 
"\n".join(newtxt + tail) + txt[split:] + + +## Interactive def check_clean(df_or_series, clean_abstract_text, start=0, interactive=True): """Compares two textual series showing diffs for each entry. From 8057b798ee65596706886a16ced5496aa68647c5 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Sun, 20 Feb 2022 22:00:52 -0300 Subject: [PATCH 15/23] Refactor to new abstractology API --- src/sashimi/__main__.py | 8 ++--- src/sashimi/sashimi.py | 65 ++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index 974d1a8..a1b33c8 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -22,13 +22,13 @@ def main(): check_clean(unclean, clean["abstract_text"], START) elif ACTION == "sashimi": - import sashimi + from . import sashimi try: - a = sashimi.load() + corpus = sashimi.load() except FileNotFoundError: - a = sashimi.bootstrap() - sashimi.plot(a) + corpus = sashimi.bootstrap() + sashimi.plot(corpus) if __name__ == "__main__": diff --git a/src/sashimi/sashimi.py b/src/sashimi/sashimi.py index c622552..0d96434 100644 --- a/src/sashimi/sashimi.py +++ b/src/sashimi/sashimi.py @@ -1,54 +1,51 @@ #!/usr/bin/env python -import graph_tool # not used, but avoids an import order bug +import graph_tool # not used, but avoids an import order bug # noqa import abstractology from . 
import get_data -graph_tool # just so the linter won't complain - - -def _load_data(a, get_data=get_data): - a.data = get_data() - corpus_name = "isgc_2015-2017.df" - if corpus_name not in a.loaded["data"]: - a.loaded["data"].append(corpus_name) - a.col_title = "abstract_title" - a.col_time = "year" - a.text_source = "abstract_text" - a.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix +CORPUS_NAME = "isgc_2015-2017" def bootstrap(): - a = abstractology.Graphology() - _load_data(a) - a.load_domain_topic_model() - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.register_config() + corpus = abstractology.Graphology() + corpus.text_source = "abstract_text" + corpus.col_title = "abstract_title" + corpus.col_time = "year" + _load_data(corpus) - return a + corpus.load_domain_topic_model() + corpus.set_graph(extend={"prop": "year"}) + corpus.load_domain_chained_model() + corpus.set_graph(extend={"prop": "topic_1"}) + corpus.load_domain_chained_model() + + corpus.register_config() + return corpus def load(): - a = abstractology.Graphology( + corpus = abstractology.Graphology( config="auto_abstractology/reports/config.json", load_data=False, ) - _load_data(a) + _load_data(corpus) + return corpus + - return a +def plot(corpus): + corpus.load_domain_topic_model() + corpus.domain_map("ISGC 2015-2017") + corpus.set_graph(extend={"prop": "year"}) + corpus.load_domain_chained_model() + corpus.domain_map("ISGC 2015-2017", chained=True) -def plot(a): - a.load_domain_topic_model() - a.plot_sashimi("ISGC 2015-2017") + corpus.set_graph(extend={"prop": "topic_1"}) + corpus.load_domain_chained_model() + corpus.domain_map("ISGC 2015-2017", chained=True) - a.set_graph(extend={"prop": "year"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 2015-2017", chained=True) - a.set_graph(extend={"prop": "topic_1"}) - a.load_domain_chained_model() - a.plot_sashimi("ISGC 
2015-2017", chained=True) +def _load_data(corpus): + corpus.load_data(get_data(), "dataframe", name=CORPUS_NAME) + corpus.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix From d2b6198782cd3e482bdca9b666fd7ce72f5ae3f7 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 00:36:27 -0300 Subject: [PATCH 16/23] Add has_authors to cleaned data --- src/sashimi/__init__.py | 9 ++++++--- src/sashimi/clean.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index a6e7f11..85ede19 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -1,7 +1,9 @@ #! /usr/bin/env python import pandas as pd -from .clean import clean_text + +from . import clean as clean_m + """ NOTES @@ -50,8 +52,9 @@ def get_data(file_paths, clean=True, drop=True): print(f" Kept {len(df)} entries containing an abstract.") if clean: - clean_abstract_text = clean_text(df) - df["_abstract_text_is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) + df["abstract_text__has_authors"] = df["abstract_text"].map(clean_m.has_authors) + clean_abstract_text = clean_m.clean_text(df) + df["abstract_text__is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) df["abstract_text"] = clean_abstract_text if drop: diff --git a/src/sashimi/clean.py b/src/sashimi/clean.py index a0727ef..3c8ae4a 100644 --- a/src/sashimi/clean.py +++ b/src/sashimi/clean.py @@ -139,7 +139,7 @@ def is_email_address(line): return re.search(r"[\w-]+@[\w-]+\.[\w-]+", line) -def has_author_affiliation(txt): +def has_authors(txt): split = int(len(txt) / 2) txts = txt[:split].split("\n")[:-1] for line in txts: From 4b9a492faf74a7e4f593886773454098185b7acb Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 00:37:13 -0300 Subject: [PATCH 17/23] Improve and document script behavior --- src/sashimi/README.md | 24 ++++++++++++++++++++++++ src/sashimi/__init__.py | 1 + src/sashimi/__main__.py | 23 ++++++++++++++--------- 
src/sashimi/sashimi.py | 15 +++++++-------- 4 files changed, 46 insertions(+), 17 deletions(-) create mode 100644 src/sashimi/README.md diff --git a/src/sashimi/README.md b/src/sashimi/README.md new file mode 100644 index 0000000..0e7b09f --- /dev/null +++ b/src/sashimi/README.md @@ -0,0 +1,24 @@ +# Running as as script + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` + +cleans data and outputs `abstracts_contents_clean.csv.xz"`. + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` + +in addition, interactively displays what was cleaned. + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` + +starts displaying changes from index ${START}. + +## Requires Python modules `abstractology` and `graph-tool`: + +`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` + +- cleans data and outputs `abstracts_contents_clean.csv.xz"`. +- produces a domain-topic model of the data +- plots a domain-topic map +- produces chained models for time ('year') and conference topic ('topic_1') +- plots the respective domain-chained maps + diff --git a/src/sashimi/__init__.py b/src/sashimi/__init__.py index 85ede19..05c5cf3 100644 --- a/src/sashimi/__init__.py +++ b/src/sashimi/__init__.py @@ -56,6 +56,7 @@ def get_data(file_paths, clean=True, drop=True): clean_abstract_text = clean_m.clean_text(df) df["abstract_text__is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) df["abstract_text"] = clean_abstract_text + print(f" Cleaned {df['abstract_text__is_cleaned'].sum()} entries.") if drop: df = df.dropna(subset=["abstract_text"]) diff --git a/src/sashimi/__main__.py b/src/sashimi/__main__.py index a1b33c8..f6dac4c 100644 --- a/src/sashimi/__main__.py +++ b/src/sashimi/__main__.py @@ -5,29 +5,34 @@ # User input ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".") +ACTION = sys.argv[2] if len(sys.argv) > 2 else None +START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 + +# Path definitions ISGC_FILES = [ ISGC_FILES_DIR / file 
for file in ["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"] ] -CLEAN_FILE = "abstracts_contents_clean.csv.xz" -ACTION = sys.argv[2] if len(sys.argv) > 2 else "check_clean" -START = int(sys.argv[3]) if len(sys.argv) > 3 else 0 +CLEAN_FILE = Path("isgc_2015-2019_clean.csv.xz") def main(): + clean_data = get_data(ISGC_FILES, True) + clean_data.to_csv(CLEAN_FILE) + print(f"Saved {CLEAN_FILE}") + clean_data.name = CLEAN_FILE.name.split(".")[0] + if ACTION == "check_clean": - unclean = get_data(ISGC_FILES, False) - clean = get_data(ISGC_FILES, True) - clean.to_csv(CLEAN_FILE) - check_clean(unclean, clean["abstract_text"], START) + unclean_data = get_data(ISGC_FILES, False) + check_clean(unclean_data, clean_data["abstract_text"], START) elif ACTION == "sashimi": from . import sashimi try: - corpus = sashimi.load() + corpus = sashimi.load(clean_data) except FileNotFoundError: - corpus = sashimi.bootstrap() + corpus = sashimi.bootstrap(clean_data) sashimi.plot(corpus) diff --git a/src/sashimi/sashimi.py b/src/sashimi/sashimi.py index 0d96434..d397d2a 100644 --- a/src/sashimi/sashimi.py +++ b/src/sashimi/sashimi.py @@ -2,17 +2,16 @@ import graph_tool # not used, but avoids an import order bug # noqa import abstractology -from . 
import get_data -CORPUS_NAME = "isgc_2015-2017" +DEFAULT_CORPUS_NAME = "isgc" -def bootstrap(): +def bootstrap(data): corpus = abstractology.Graphology() corpus.text_source = "abstract_text" corpus.col_title = "abstract_title" corpus.col_time = "year" - _load_data(corpus) + _load_data(corpus, data) corpus.load_domain_topic_model() corpus.set_graph(extend={"prop": "year"}) @@ -24,12 +23,12 @@ def bootstrap(): return corpus -def load(): +def load(data): corpus = abstractology.Graphology( config="auto_abstractology/reports/config.json", load_data=False, ) - _load_data(corpus) + _load_data(corpus, data) return corpus @@ -46,6 +45,6 @@ def plot(corpus): corpus.domain_map("ISGC 2015-2017", chained=True) -def _load_data(corpus): - corpus.load_data(get_data(), "dataframe", name=CORPUS_NAME) +def _load_data(corpus, data): + corpus.load_data(data, "dataframe", name=getattr(data, 'name', DEFAULT_CORPUS_NAME)) corpus.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix From 00d68bfb1ede5495ecbf030b5e30840eb7e4437b Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 01:07:42 -0300 Subject: [PATCH 18/23] Update README --- src/sashimi/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sashimi/README.md b/src/sashimi/README.md index 0e7b09f..bcde2b8 100644 --- a/src/sashimi/README.md +++ b/src/sashimi/README.md @@ -2,7 +2,7 @@ `python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` -cleans data and outputs `abstracts_contents_clean.csv.xz"`. +cleans data and outputs `isgc_2015-2019_clean.csv.xz`. `python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` @@ -16,7 +16,7 @@ starts displaying changes from index ${START}. `python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` -- cleans data and outputs `abstracts_contents_clean.csv.xz"`. +- cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data - plots a domain-topic map - produces chained models for time ('year') and conference topic ('topic_1') From cbbce6f9bad1dffa1cbd48f812c47a3d05a94600 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 01:45:27 -0300 Subject: [PATCH 19/23] Fix README --- src/sashimi/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/sashimi/README.md b/src/sashimi/README.md index bcde2b8..e6631e8 100644 --- a/src/sashimi/README.md +++ b/src/sashimi/README.md @@ -1,20 +1,28 @@ -# Running as as script +# Running the module as as script -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` +First, add the folder containing this module to your $PYTHON_PATH: + +`PATH_TO_PARENT_DIR=/home/user/projects/isgc-congress/src` + +`export PYTHONPATH=$PYTHONPATH:${PATH_TO_PARENT_DIR}` + +Then... + +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` cleans data and outputs `isgc_2015-2019_clean.csv.xz`. -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` in addition, interactively displays what was cleaned. -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` starts displaying changes from index ${START}. ## Requires Python modules `abstractology` and `graph-tool`: -`python ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` +`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` - cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data From 410ab76efc3d308b6bd16658809819ccc33069a0 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 01:51:35 -0300 Subject: [PATCH 20/23] Rename to isgc_sashimi and fix README --- src/{sashimi => isgc_sashimi}/README.md | 8 ++++---- src/{sashimi => isgc_sashimi}/__init__.py | 0 src/{sashimi => isgc_sashimi}/__main__.py | 0 src/{sashimi => isgc_sashimi}/clean.py | 0 src/{sashimi => isgc_sashimi}/sashimi.py | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename src/{sashimi => isgc_sashimi}/README.md (72%) rename src/{sashimi => isgc_sashimi}/__init__.py (100%) rename src/{sashimi => isgc_sashimi}/__main__.py (100%) rename src/{sashimi => isgc_sashimi}/clean.py (100%) rename src/{sashimi => isgc_sashimi}/sashimi.py (100%) diff --git a/src/sashimi/README.md b/src/isgc_sashimi/README.md similarity index 72% rename from src/sashimi/README.md rename to src/isgc_sashimi/README.md index e6631e8..fa063c6 100644 --- a/src/sashimi/README.md +++ b/src/isgc_sashimi/README.md @@ -8,21 +8,21 @@ First, add the folder containing this module to your $PYTHON_PATH: Then... -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR}` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR}` cleans data and outputs `isgc_2015-2019_clean.csv.xz`. -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean` in addition, interactively displays what was cleaned. -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} check_clean ${START}` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean ${START}` starts displaying changes from index ${START}. ## Requires Python modules `abstractology` and `graph-tool`: -`python -m ${PATH_TO_THIS_MODULE} ${PATH_TO_DATA_DIR} sashimi` +`python -m isgc_sashimi ${PATH_TO_DATA_DIR} sashimi` - cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data diff --git a/src/sashimi/__init__.py b/src/isgc_sashimi/__init__.py similarity index 100% rename from src/sashimi/__init__.py rename to src/isgc_sashimi/__init__.py diff --git a/src/sashimi/__main__.py b/src/isgc_sashimi/__main__.py similarity index 100% rename from src/sashimi/__main__.py rename to src/isgc_sashimi/__main__.py diff --git a/src/sashimi/clean.py b/src/isgc_sashimi/clean.py similarity index 100% rename from src/sashimi/clean.py rename to src/isgc_sashimi/clean.py diff --git a/src/sashimi/sashimi.py b/src/isgc_sashimi/sashimi.py similarity index 100% rename from src/sashimi/sashimi.py rename to src/isgc_sashimi/sashimi.py From ad5a4efbba250bcbf8af750646cad9c9fd1b4f38 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 02:35:55 -0300 Subject: [PATCH 21/23] Keep original and cleaned text, update README with new columns --- src/isgc_sashimi/README.md | 6 +++++- src/isgc_sashimi/__init__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/isgc_sashimi/README.md b/src/isgc_sashimi/README.md index fa063c6..53b41b3 100644 --- a/src/isgc_sashimi/README.md +++ b/src/isgc_sashimi/README.md @@ -10,7 +10,11 @@ Then... `python -m isgc_sashimi ${PATH_TO_DATA_DIR}` -cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
+cleans data and outputs `isgc_2015-2019_clean.csv.xz`, with new columns: + +- abstract_text__cleaned +- abstract_text__is_cleaned +- abstract_text__has_authors `python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean` diff --git a/src/isgc_sashimi/__init__.py b/src/isgc_sashimi/__init__.py index 05c5cf3..dd2b8b2 100644 --- a/src/isgc_sashimi/__init__.py +++ b/src/isgc_sashimi/__init__.py @@ -55,7 +55,7 @@ def get_data(file_paths, clean=True, drop=True): df["abstract_text__has_authors"] = df["abstract_text"].map(clean_m.has_authors) clean_abstract_text = clean_m.clean_text(df) df["abstract_text__is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text) - df["abstract_text"] = clean_abstract_text + df["abstract_text__cleaned"] = clean_abstract_text print(f" Cleaned {df['abstract_text__is_cleaned'].sum()} entries.") if drop: From 7bc22cd8cfa77c432cc627f197561354df7f77d2 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Mon, 21 Feb 2022 02:41:32 -0300 Subject: [PATCH 22/23] Update README --- src/isgc_sashimi/README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/isgc_sashimi/README.md b/src/isgc_sashimi/README.md index 53b41b3..a5a340b 100644 --- a/src/isgc_sashimi/README.md +++ b/src/isgc_sashimi/README.md @@ -24,13 +24,16 @@ in addition, interactively displays what was cleaned. starts displaying changes from index ${START}. -## Requires Python modules `abstractology` and `graph-tool`: +## Domain-topic modeling (SASHIMI) + +Requires Python modules `abstractology` and `graph-tool`. `python -m isgc_sashimi ${PATH_TO_DATA_DIR} sashimi` +will do the following, in sequence: + - cleans data and outputs `isgc_2015-2019_clean.csv.xz`. 
- produces a domain-topic model of the data - plots a domain-topic map - produces chained models for time ('year') and conference topic ('topic_1') - plots the respective domain-chained maps - From 1313b08ac01afa13d25d100407a948ff57685953 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Tue, 20 Sep 2022 00:20:56 +0200 Subject: [PATCH 23/23] Update to abstractology: use set_chain --- src/isgc_sashimi/sashimi.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/isgc_sashimi/sashimi.py b/src/isgc_sashimi/sashimi.py index d397d2a..3ce7e2e 100644 --- a/src/isgc_sashimi/sashimi.py +++ b/src/isgc_sashimi/sashimi.py @@ -14,9 +14,9 @@ def bootstrap(data): _load_data(corpus, data) corpus.load_domain_topic_model() - corpus.set_graph(extend={"prop": "year"}) + corpus.set_chain("year") corpus.load_domain_chained_model() - corpus.set_graph(extend={"prop": "topic_1"}) + corpus.set_chain("topic_1") corpus.load_domain_chained_model() corpus.register_config() @@ -36,15 +36,15 @@ def plot(corpus): corpus.load_domain_topic_model() corpus.domain_map("ISGC 2015-2017") - corpus.set_graph(extend={"prop": "year"}) + corpus.set_chain("year") corpus.load_domain_chained_model() corpus.domain_map("ISGC 2015-2017", chained=True) - corpus.set_graph(extend={"prop": "topic_1"}) + corpus.set_chain("topic_1") corpus.load_domain_chained_model() corpus.domain_map("ISGC 2015-2017", chained=True) def _load_data(corpus, data): - corpus.load_data(data, "dataframe", name=getattr(data, 'name', DEFAULT_CORPUS_NAME)) + corpus.load_data(data, "dataframe", name=getattr(data, "name", DEFAULT_CORPUS_NAME)) corpus.process_corpus(ngrams=1) # no ngrams while I don't fix gensim on guix