
Pushing my work #2

Open
Wants to merge 23 commits into base: master

Changes from all commits (23 commits)
3d0abe5
Add sashimi module for domain topic modeling
solstag May 27, 2021
b54ac42
sashimi: domain-chained model with topic_1
solstag Jun 13, 2021
ffb69b5
Add .clean.clean_text(): remove undesirable content from abstracts
solstag Aug 27, 2021
d669a09
Add parameters to package execution for data dir and action
solstag Aug 28, 2021
1d5e7ff
Rename main as init.
solstag Aug 30, 2021
0c5e9d7
Improve clean_text and check_clean. Document check_clean.
solstag Aug 30, 2021
be6df47
Refactor for modularity.
solstag Aug 30, 2021
b2f44b2
Improve read/write data.
solstag Aug 30, 2021
fc2e6d0
Clean more funding info, with manually identified exceptions.
solstag Aug 30, 2021
a481557
Add '_abstract_text_is_cleaned' column to signal cleaned
solstag Feb 19, 2022
087c3ff
Improvements to cleaning and notes
solstag Feb 19, 2022
cc69b86
Rename 'authors' functions
solstag Feb 19, 2022
7bd3b54
Parametrize drop in get_data
solstag Feb 19, 2022
2b64216
Complete author_affiliation functions
solstag Feb 19, 2022
8057b79
Refactor to new abstractology API
solstag Feb 21, 2022
d2b6198
Add has_authors to cleaned data
solstag Feb 21, 2022
4b9a492
Improve and document script behavior
solstag Feb 21, 2022
00d68bf
Update README
solstag Feb 21, 2022
cbbce6f
Fix README
solstag Feb 21, 2022
410ab76
Rename to isgc_sashimi and fix README
solstag Feb 21, 2022
ad5a4ef
Keep original and cleaned text, update README with new columns
solstag Feb 21, 2022
7bc22cd
Update README
solstag Feb 21, 2022
1313b08
Update to abstractology: use set_chain
solstag Sep 19, 2022
39 changes: 39 additions & 0 deletions src/isgc_sashimi/README.md
@@ -0,0 +1,39 @@
# Running the module as a script

First, add the folder containing this module to your $PYTHONPATH:

`PATH_TO_PARENT_DIR=/home/user/projects/isgc-congress/src`

`export PYTHONPATH=$PYTHONPATH:${PATH_TO_PARENT_DIR}`

Then...

`python -m isgc_sashimi ${PATH_TO_DATA_DIR}`

cleans data and outputs `isgc_2015-2019_clean.csv.xz`, with new columns:

- abstract_text__cleaned
- abstract_text__is_cleaned
- abstract_text__has_authors
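
The cleaning run prints its progress; for example (counts here are illustrative, not actual results):

```
Found 1500 entries.
  Kept 1400 entries containing an abstract.
  Cleaned 900 entries.
  Kept 1400 entries after cleaning.
Saved isgc_2015-2019_clean.csv.xz
```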

`python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean`

in addition, interactively displays what was cleaned.

`python -m isgc_sashimi ${PATH_TO_DATA_DIR} check_clean ${START}`

starts displaying changes from index ${START}.

## Domain-topic modeling (SASHIMI)

Requires Python modules `abstractology` and `graph-tool`.

`python -m isgc_sashimi ${PATH_TO_DATA_DIR} sashimi`

does the following, in sequence:

- cleans data and outputs `isgc_2015-2019_clean.csv.xz`
- produces a domain-topic model of the data
- plots a domain-topic map
- produces chained models for time ('year') and conference topic ('topic_1')
- plots the respective domain-chained maps
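
These steps mirror the calls in `__main__.py`; a minimal Python equivalent (a sketch with illustrative file names, not a documented API):

```python
from isgc_sashimi import get_data, sashimi

clean_data = get_data(["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"])
clean_data.name = "isgc_2015-2019_clean"
try:
    corpus = sashimi.load(clean_data)  # reuse previously trained models
except FileNotFoundError:
    corpus = sashimi.bootstrap(clean_data)  # train the domain-topic and chained models
sashimi.plot(corpus)  # write the domain-topic and domain-chained maps
```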
66 changes: 66 additions & 0 deletions src/isgc_sashimi/__init__.py
@@ -0,0 +1,66 @@
#! /usr/bin/env python

import pandas as pd

from . import clean as clean_m


""" NOTES

- Data columns:
['abstract_text',
'abstract_title',
'bibliography',
'cancelled',
'code',
'figure_legend_1',
'figure_legend_2',
'figure_title_1',
'figure_title_2',
'final_status',
'id',
'is_complete',
'keyword_1',
'keyword_2',
'keyword_3',
'keyword_4',
'keywords',
'legend_1',
'legend_2',
'not_to_remind',
'program_day',
'program_session',
'publish_onsite',
'relance_register',
'topic_1',
'topic_2',
'topic_3',
'user_id',
'validate',
'year']
"""


def get_data(file_paths, clean=True, drop=True):
    df = pd.concat(
        [pd.read_csv(file, sep="\t", dtype=str) for file in file_paths],
        ignore_index=True,
    )
    print(f"Found {len(df)} entries.")
    if drop:
        df = df.dropna(subset=["abstract_text"])
        print(f"  Kept {len(df)} entries containing an abstract.")

    if clean:
        df["abstract_text__has_authors"] = df["abstract_text"].map(clean_m.has_authors)
        clean_abstract_text = clean_m.clean_text(df)
        df["abstract_text__is_cleaned"] = ~df["abstract_text"].eq(clean_abstract_text)
        df["abstract_text__cleaned"] = clean_abstract_text
        print(f"  Cleaned {df['abstract_text__is_cleaned'].sum()} entries.")

        if drop:
            # Drop the entries discarded by clean_text(): their cleaned column
            # is NaN, while "abstract_text" was already filtered above
            df = df.dropna(subset=["abstract_text__cleaned"])
            print(f"  Kept {len(df)} entries after cleaning.")

    df.index.name = "index"
    return df
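

# Minimal usage sketch (hypothetical file name; see __main__.py for the
# actual inputs used by the package):
#
#   df = get_data(["abstracts_contents_2015.tsv"], clean=True, drop=True)
#   df.to_csv("isgc_2015-2019_clean.csv.xz")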
40 changes: 40 additions & 0 deletions src/isgc_sashimi/__main__.py
@@ -0,0 +1,40 @@
import sys
from pathlib import Path
from . import get_data
from .clean import check_clean

# User input
ISGC_FILES_DIR = Path(sys.argv[1] if len(sys.argv) > 1 else ".")
ACTION = sys.argv[2] if len(sys.argv) > 2 else None
START = int(sys.argv[3]) if len(sys.argv) > 3 else 0

# Path definitions
ISGC_FILES = [
    ISGC_FILES_DIR / file
    for file in ["abstracts_contents_2015.tsv", "abstracts_contents_1719.tsv"]
]
CLEAN_FILE = Path("isgc_2015-2019_clean.csv.xz")


def main():
    clean_data = get_data(ISGC_FILES, True)
    clean_data.to_csv(CLEAN_FILE)
    print(f"Saved {CLEAN_FILE}")
    clean_data.name = CLEAN_FILE.name.split(".")[0]

    if ACTION == "check_clean":
        unclean_data = get_data(ISGC_FILES, False)
        # Compare the original text against the cleaned column
        check_clean(unclean_data, clean_data["abstract_text__cleaned"], START)

    elif ACTION == "sashimi":
        from . import sashimi

        try:
            corpus = sashimi.load(clean_data)
        except FileNotFoundError:
            corpus = sashimi.bootstrap(clean_data)
        sashimi.plot(corpus)


if __name__ == "__main__":
    main()
216 changes: 216 additions & 0 deletions src/isgc_sashimi/clean.py
@@ -0,0 +1,216 @@
#!/usr/bin/env python

"""
TODO:
- [ ] trouver les abstracts avec une ligne auteurs
- [x] ajouter une variable si l'abstract a été modifié ou pas

List of abstracts with some issue:

- 187, 661 (longuer de la ligne pas des majuscules)
- 1610 - trop de chiffres
- 1340, 1634, 1642 (ligne trop courte)
- 315, 873 biblio mais il y a l'année (ou DOI)
- 871 :P
- 1028 'authors acknowledge...'
- 1314 biblio sans année
- 'Abstract: ...' peut-être assouplir le critère (sans \n)
- 871 (table ronde), 1401
-- Majuscules (peut-être pas tous):
- 978, 1161 formule chimique pleine de majuscules
- 1205 ligne d'abstract avec majuscules
- 1701 plein d'acronymes
- première ligne toute en majuscule

"""

import re
import pandas as pd
from difflib import unified_diff
from itertools import permutations

CLEAN_REFLAGS = re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE


def clean_text(df):
    """
    Known untreated entries:
    - some title plus authors headers with no clear separation
    """

    def and_sections(re0, re1):
        return re0 + r"\s* (?: and | & ) \s*" + re1

    # Remove entries with no valid content
    unwanted_res = "^Lorem ipsum dolor sit amet"
    b_unwanted = df["abstract_text"].str.contains(unwanted_res)
    clean_df = df[~b_unwanted]
    # Filter the already filtered frame, so that both conditions apply
    b_unwanted = clean_df["abstract_text"].map(len).lt(100)
    clean_df = clean_df[~b_unwanted]

    # Section names to be removed
    section_names_res = [
        r"backgrounds?",
        r"conclusions?",
        r"discussions?",
        r"experiments?",
        r"experimental",
        r"intro",
        r"introductions?",
        r"materials?",
        r"methods?",
        r"motivations?",
        r"perspectives?",
        r"prospects?",
        r"objectives?",
        r"outlooks?",
        r"overviews?",
        r"results?",
        r"key\ results?",
        r"significance",
        r"summary",
    ]
    section_names_re = r"|".join(
        [and_sections(x, y) for x, y in permutations(section_names_res, 2)]
        + section_names_res
    )
    section_numbering_re = r"[^\n\w]* (?: \d? [^\n\w]* )"
    # Remove invalid content from entries
    unclean_from_start_of_text_res = [
        r"(?: ^ | .* \n)" + section_numbering_re + r"abstract [^\n\w,]* [\n:]",
    ]
    unclean_res = [
        r"^" + section_numbering_re + r"keys?\ ?words? (?: [^\n\w]* \n )? [^\n]*",
        r"^"
        + section_numbering_re
        + r"(?:"
        + section_names_re
        + r") (?: \ * section)? (?: [^\n\w]* \n | \s* [^\n\w\s,&]+ )",
    ]
    unclean_until_end_of_text_res = [
        r"^" + section_numbering_re + r"ac?knowled?ge?m?ents? :? .*",
        r"^" + section_numbering_re + r"r[eé]f[eé]rences? \s* :? .*",
        r"^ [^\n\w]* [12] [^\n\w]+ \w [^\n]+ (?<!\d)(?:1[6789]|20)[0-9]{2}(?!\d) .*",
    ]
    unclean_rx = re.compile(
        pattern=r"|".join(
            unclean_from_start_of_text_res + unclean_res + unclean_until_end_of_text_res
        ),
        flags=CLEAN_REFLAGS,
    )
    clean_abstract_text = clean_df["abstract_text"].str.replace(
        unclean_rx, "", regex=True
    )

    # Remove even more funding info (max 61) excluding (10) manually identified wrong matches
    clean_extra_funding_rx = re.compile(
        r"(^ [^\n]*"
        r"(?: fund[eis] | financ | supported\ by | support\ of | support\ from | grant )"
        r"[^\n]* \s* ) \Z",
        flags=CLEAN_REFLAGS,
    )
    # Apply the funding regex only outside the manual exceptions, while
    # keeping the excepted entries in the returned series
    up_index = clean_abstract_text.index.difference(
        [23, 968, 999, 1243, 1373, 1416, 1469, 1560, 1700, 1710]
    )
    clean_abstract_text.loc[up_index] = clean_abstract_text.loc[up_index].str.replace(
        clean_extra_funding_rx, "", regex=True
    )

    return clean_abstract_text
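
# Illustrative example (made-up abstract) of what clean_text() strips:
#
#   text = (
#       "ABSTRACT:\n"
#       "We report a greener route to levulinic acid from biomass,\n"
#       "with yields above those of previously reported batch processes.\n"
#       "Keywords: biomass, levulinic acid\n"
#       "Acknowledgements: this work was funded by grant 42.\n"
#   )
#   clean_text(pd.DataFrame({"abstract_text": [text]}))
#   # -> the header, keyword and acknowledgement lines are removed,
#   #    keeping only the two sentence lines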


# TODO: find a use for these in clean_text()?
def is_author_affiliation(line, verbose=False):
    # Heuristic: a line looks like an author/affiliation list when most of
    # its words are capitalized, commas, or connector words
    author_words = r"and of at in de et und".split()
    words_re = fr'\b(?:{"|".join(author_words)})\b'
    line = re.sub(r"[-\.]", " ", line)
    line = re.sub(r",", " , ", line)
    line = re.sub(r"\d+", "", line)
    line = re.sub(r"[^\w\s,]*|\b[a-z]\b", "", line)
    words = line.split()
    point_words = [
        x for x in words if x[0].isupper() or x == "," or re.match(words_re, x)
    ]
    if verbose:
        print(point_words)
        print(words)
    return len(words) > 4 and len(point_words) / len(words) > 0.8
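
# Illustrative behavior on made-up lines: a mostly-capitalized name and
# affiliation list passes the 0.8 threshold, running prose does not:
#
#   is_author_affiliation("J. Smith, University of Lyon, France and M. Doe")     # True
#   is_author_affiliation("the reaction proceeds at room temperature in water")  # False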


def is_email_address(line):
    return re.search(r"[\w-]+@[\w-]+\.[\w-]+", line)


def has_authors(txt):
    # Only scan complete lines in the first half of the text, where author
    # and affiliation headers appear; drop the line cut by the midpoint split
    split = int(len(txt) / 2)
    txts = txt[:split].split("\n")[:-1]
    for line in txts:
        if is_author_affiliation(line) or is_email_address(line):
            return True
    return False


def remove_lines_like_authors(txt):
    newtxt = []
    split = int(len(txt) / 2)
    txts = txt[:split].split("\n")
    tail = [txts.pop()]  # keep the line cut by the midpoint split
    for line in txts:
        if not is_author_affiliation(line) and not is_email_address(line):
            newtxt.append(line)
    return "\n".join(newtxt + tail) + txt[split:]


## Interactive

def check_clean(df_or_series, clean_abstract_text, start=0, interactive=True):
    """Compares two textual series, showing diffs for each entry.

    If passed a dataframe as first argument, picks the "abstract_text" column.
    If `start` is provided, skips abstracts indexed less than its value.
    If not `interactive`, returns the diffs as a `pandas.Series`.
    If `interactive`, waits for input at each entry, stopping if sent a nonempty string.
    """
    if not isinstance(df_or_series, pd.Series):
        abstract_text = df_or_series["abstract_text"]
    else:
        abstract_text = df_or_series
    abstract_text = abstract_text.loc[clean_abstract_text.index]
    comp = abstract_text.compare(clean_abstract_text)
    if comp.empty:
        print("No differences found.")
        return
    diffs = comp.agg(
        lambda x: unified_diff(x["self"].split("\n"), x["other"].split("\n")), axis=1
    )
    if not interactive:
        return diffs.map("\n".join)
    print(f"Found {len(diffs)} modified documents.\n")
    for idx, diff in diffs.loc[start:].items():
        for line in diff:
            print(line)
        print("\n" + 70 * "-" + str(idx) + "\n")
        if input():
            print("\nInterrupted!\n")
            break


def search_text(df, rexp):
    sel = df.abstract_text.str.contains(
        rexp,
        flags=CLEAN_REFLAGS,
    )
    for idx, txt in df.loc[sel, "abstract_text"].items():
        print(txt)
        print("\n" + 70 * "-" + str(idx) + "\n")
        if input():
            break


def extract_text(df, rexp):
    return (
        df["abstract_text"]
        .str.extractall(
            rexp,
            flags=CLEAN_REFLAGS,
        )
        .dropna()
    )
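
# These helpers are meant for interactive exploration, e.g. (hypothetical
# patterns):
#
#   search_text(df, r"lorem ipsum")   # page through matching abstracts
#   extract_text(df, r"(doi[^\s]*)")  # collect DOI-looking fragments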