adding scripts for computing 'influence' in mailing list and folding …

…in influence data
sbenthall · May 31, 2024 · c04095d · c04095d
1 parent 817efac
commit c04095d
Show file tree

Hide file tree

Showing 2 changed files with 134 additions and 0 deletions.
diff --git a/bigbang/analysis/affiliation.py b/bigbang/analysis/affiliation.py
@@ -0,0 +1,37 @@
+from bigbang.analysis.influence import *
+from bigbang.analysis.utils import localize_to_utc
+
+# Column names that lookup_affiliation reads from each row of the
+# affiliations table: the start and end of an affiliation period, and the
+# affiliation that applies during that period.
+affil_start_date_col_name = 'Time start (mm/yyyy)'
+affil_end_date_col_name = 'Time end (mm/yyyy)'
+affil_affiliation_col_name = 'Affiliation'
+
+def affiliated_influence(arx, affiliations, top_n = 50):
+    """
+    Aggregate sender activity in an archive, substituting affiliations
+    from a lookup table where one applies.
+
+    arx.data is modified in place: augment() is called on the archive
+    first, and the 'sender_cat' column is then overwritten with the result
+    of lookup_affiliation for each message. The return value is whatever
+    aggregate_activity(arx, top_n) produces.
+    """
+    ## augment() comes from influence.py; it derives sender_cat from the
+    ## sender's email domain
+    augment(arx)
+
+    ## replace sender_cat with the affiliation recorded for that value on
+    ## the message's date; lookup_affiliation hands the value back
+    ## unchanged when no affiliation period matches
+    def resolve(message):
+        return lookup_affiliation(
+            message['sender_cat'], message['Date'], affiliations)
+
+    arx.data['sender_cat'] = arx.data.apply(resolve, axis=1)
+
+    return aggregate_activity(arx, top_n)
+
+def lookup_affiliation(name, date, affiliation_data):
+    """
+    Return the affiliation recorded for `name` at `date`.
+
+    Rows of `affiliation_data` whose 'Name' equals `name` are scanned in
+    order; the first row whose start and end columns strictly bracket the
+    date (after it has been passed through localize_to_utc) supplies the
+    affiliation. If no row matches, `name` itself is returned.
+    """
+    candidates = affiliation_data[affiliation_data['Name'] == name]
+
+    when = localize_to_utc(date)
+
+    ## NOTE(review): both bounds are exclusive, and a comparison against a
+    ## missing (NaT) end date is False, so open-ended affiliations would
+    ## never match -- confirm the table is filled in upstream.
+    for _, period in candidates.iterrows():
+        after_start = when > period[affil_start_date_col_name]
+        if after_start and when < period[affil_end_date_col_name]:
+            return period[affil_affiliation_col_name]
+
+    return name
diff --git a/bigbang/analysis/influence.py b/bigbang/analysis/influence.py
@@ -0,0 +1,97 @@
+from bigbang.archive import Archive, open_list_archives
+
+import bigbang.parse as parse
+import bigbang.analysis.utils as utils
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+import os
+import subprocess
+
+from bigbang.datasets import domains, organizations
+
+"""
+Processing of mailing list data to support an analysis
+of actor-influence, where actors are understood to be at
+the affiliation/organization level.
+"""
+
+# Reference tables provided by BigBang, loaded once when this module is
+# imported. dd is looked up by email domain and carries a 'category'
+# column; odf holds organization records with the columns 'name',
+# 'email domain names' and 'subsidiary of / alias of'.
+dd = domains.load_data()
+odf = organizations.load_data()
+
+# Domain categories whose senders are attributed to an organization;
+# "generic" domains are deliberately left out.
+good_categories = ["company", "academic", "sdo"] # not "generic"
+
+def lookup_stakeholder_by_domain(domain):
+    """
+    Map an email domain to an organization name using the organization
+    data provided in BigBang.
+
+    Returns the 'name' of the first matching organization that has no
+    'subsidiary of / alias of' entry, or `domain` itself when there is no
+    such organization.
+    """
+    ## NOTE(review): this is a substring test against the stringified
+    ## cell, so a domain also matches any longer domain that contains it
+    ## -- confirm that is intended.
+    def mentions_domain(cell):
+        return domain in str(cell)
+
+    matches = odf[odf['email domain names'].apply(mentions_domain)]
+
+    ## keep only top-level organizations, not subsidiaries or aliases
+    parents = matches[matches['subsidiary of / alias of'].isna()]
+
+    if len(parents) == 0:
+        return domain
+
+    return parents['name'].iloc[0]
+
+def normalize_senders_by_domain(row):
+    """
+    Compute the sender category for one message row.
+
+    When the row's 'domain' has a category listed in good_categories, the
+    organization found by lookup_stakeholder_by_domain is returned.
+    Otherwise, or when the lookup raises for any reason, the result is
+    the 'From' field passed through parse.clean_from.
+    """
+    try:
+        sender_domain = row['domain']
+        category = dd.loc[sender_domain]['category']
+        if category in good_categories:
+            return lookup_stakeholder_by_domain(sender_domain)
+        return parse.clean_from(row['From'])
+    except Exception:
+        ## best-effort: any failure above falls back to the cleaned
+        ## From field
+        return parse.clean_from(row['From'])
+
+def is_affiliation(domain):
+    """
+    Return the organization associated with an email domain, or
+    "Unaffiliated".
+
+    Despite the name, the result is a string rather than a boolean: the
+    value of lookup_stakeholder_by_domain when the domain's category is
+    listed in good_categories, and "Unaffiliated" when it is not or when
+    the lookup fails.
+    """
+    try:
+        if dd.loc[domain]['category'] in good_categories:
+            return lookup_stakeholder_by_domain(domain)
+        else:
+            return "Unaffiliated"
+    except Exception:
+        ## Best-effort fallback for failed lookups. Exception is caught
+        ## instead of using a bare except so that KeyboardInterrupt and
+        ## SystemExit still propagate.
+        return "Unaffiliated"
+
+def augment(arx):
+    """
+    Annotate an email archive in place with sender information.
+
+    Three columns are written to arx.data: 'email' and 'domain', both
+    extracted from the From field, and 'sender_cat', the value that
+    normalize_senders_by_domain computes for each row (an organization
+    name or a cleaned version of the From field). Nothing is returned.
+    """
+    extractors = (
+        ('email', utils.extract_email),
+        ('domain', utils.extract_domain),
+    )
+    for column, extract in extractors:
+        arx.data[column] = arx.data['From'].apply(extract)
+
+    ## computed last: normalize_senders_by_domain reads the 'domain' column
+    arx.data['sender_cat'] = arx.data.apply(
+        normalize_senders_by_domain, axis=1)
+
+def aggregate_activity(aarx, top_n):
+    """
+    Build a 'wide' dataframe of yearly message counts per sender category
+    from an 'augmented' email archive.
+
+    Rows are the yearly 'Date' bins, columns are 'sender_cat' values and
+    cells are the count of non-null 'From' entries (0 where a sender has
+    no messages in that year). Only the top_n columns with the largest
+    totals are returned, ordered by decreasing total.
+
+    TODO: generalize this, with more flexible frequency.
+    TODO: Internalize the 'augment' preprocessing.
+    """
+    yearly = pd.Grouper(key='Date', freq='Y')
+    counts = aarx.data.groupby(['sender_cat', yearly]).count()
+    counts = counts.reset_index().sort_values('Date')
+
+    wide = counts.pivot(columns="sender_cat", index="Date", values="From")
+    wide = wide.fillna(0)
+
+    ## rank the columns by their total over all years, largest first
+    totals = wide.sum().sort_values(ascending=False)
+    busiest = totals[:top_n].index
+
+    return wide[busiest]
+
+def influence_from_arx(arx, top_n = 50):
+    """
+    Return a dataframe with the annual influence of each organizational
+    actor, for the top_n most active stakeholders.
+
+    arx is augmented in place (see augment) before aggregation. top_n
+    defaults to 50, the value that was previously hard-coded in the body
+    and silently overrode whatever the caller passed.
+    """
+    augment(arx)
+
+    return aggregate_activity(arx, top_n)