Commit

adding scripts for computing 'influence' in mailing list and folding in influence data

sbenthall committed May 31, 2024
1 parent 817efac commit c04095d
Showing 2 changed files with 134 additions and 0 deletions.
37 changes: 37 additions & 0 deletions bigbang/analysis/affiliation.py
@@ -0,0 +1,37 @@
from bigbang.analysis.influence import *
from bigbang.analysis.utils import localize_to_utc

affil_start_date_col_name = 'Time start (mm/yyyy)'
affil_end_date_col_name = 'Time end (mm/yyyy)'
affil_affiliation_col_name = 'Affiliation'

def affiliated_influence(arx, affiliations, top_n=50):
    """
    Compute annual activity for the top_n most active senders in an archive,
    with each sender resolved to an affiliation where the affiliations
    table provides one.
    """
    ## augment() is defined in influence.py; it builds a sender_cat column
    ## based on email domain
    augment(arx)

    ## further look up the email author in the affiliations table
    ## and refine the sender_cat column
    arx.data['sender_cat'] = arx.data.apply(
        lambda mrow: lookup_affiliation(mrow['sender_cat'], mrow['Date'], affiliations),
        axis=1)

    top_ddd = aggregate_activity(arx, top_n)

    return top_ddd

def lookup_affiliation(name, date, affiliation_data):
    """
    Find the affiliation of a name on a particular date,
    given an affiliation dataframe.
    """
    name_affils = affiliation_data[affiliation_data['Name'] == name]

    date = localize_to_utc(date)

    for _, na_row in name_affils.iterrows():
        if na_row[affil_start_date_col_name] < date < na_row[affil_end_date_col_name]:
            return na_row[affil_affiliation_col_name]

    return name
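
A minimal usage sketch for the new affiliation helper. The archive construction, the list name, the affiliations.csv file, and the pre-parsing of its date columns to UTC datetimes are assumptions for illustration, not part of this commit:

import pandas as pd
from bigbang.archive import Archive, open_list_archives
from bigbang.analysis.affiliation import (
    affiliated_influence,
    affil_start_date_col_name,
    affil_end_date_col_name,
)

# Hypothetical inputs: a locally collected mailing list archive and a CSV
# with 'Name', 'Affiliation', and 'Time start/end (mm/yyyy)' columns.
arx = Archive(open_list_archives("6lo", archive_dir="archives"))
affiliations = pd.read_csv("affiliations.csv")

# lookup_affiliation compares message dates (localized to UTC) against the
# start/end columns, so those columns are parsed to UTC datetimes here.
for col in [affil_start_date_col_name, affil_end_date_col_name]:
    affiliations[col] = pd.to_datetime(affiliations[col], format="%m/%Y", utc=True)

top = affiliated_influence(arx, affiliations, top_n=25)
top.plot()  # annual message counts per affiliation-resolved sender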
97 changes: 97 additions & 0 deletions bigbang/analysis/influence.py
@@ -0,0 +1,97 @@
from bigbang.archive import Archive, open_list_archives

import bigbang.parse as parse
import bigbang.analysis.utils as utils

import matplotlib.pyplot as plt
import pandas as pd

import os
import subprocess

from bigbang.datasets import domains, organizations

"""
Processing of mailing list data to support an analysis
of actor-influence, where actors are understood to be at
the affilation/organization level.
"""

dd = domains.load_data()
odf = organizations.load_data()

good_categories = ["company", "academic", "sdo"] # not "generic"

def lookup_stakeholder_by_domain(domain):
    """
    For an email domain, use the organization data provided in BigBang
    to look up the organization name associated with that email domain.
    Falls back to the domain itself if no organization is found.
    """
    search = odf['email domain names'].apply(lambda dn: domain in str(dn))

    orgs = odf[search]

    top_orgs = orgs[orgs['subsidiary of / alias of'].isna()]

    if top_orgs.shape[0] > 0:
        return top_orgs['name'].iloc[0]
    else:
        return domain

def normalize_senders_by_domain(row):
    """
    Return the organization name for a message's sender if its email domain
    falls in a recognized category; otherwise fall back to a cleaned version
    of the From field.
    """
    try:
        if dd.loc[row['domain']]['category'] in good_categories:
            return lookup_stakeholder_by_domain(row['domain'])
        else:
            return parse.clean_from(row['From'])
    except Exception:
        return parse.clean_from(row['From'])

def is_affiliation(domain):
    """
    Return the organization name associated with a domain if its category is
    recognized; otherwise return "Unaffiliated".
    """
    try:
        if dd.loc[domain]['category'] in good_categories:
            return lookup_stakeholder_by_domain(domain)
        else:
            return "Unaffiliated"
    except Exception:
        return "Unaffiliated"

def augment(arx):
    """
    Add three new columns to an email archive's data: the email address,
    the email domain, and the 'category' of the sender, which may be an
    organization name, 'Unaffiliated', or a cleaned version of the email's
    From field.
    """
    arx.data['email'] = arx.data['From'].apply(utils.extract_email)
    arx.data['domain'] = arx.data['From'].apply(utils.extract_domain)
    arx.data['sender_cat'] = arx.data.apply(normalize_senders_by_domain, axis=1)

def aggregate_activity(aarx, top_n):
    """
    Transform an 'augmented' email archive into a 'wide' format dataframe
    that has the activity of each actor (organizational level, where possible)
    for each year.

    TODO: generalize this, with more flexible frequency.
    TODO: internalize the 'augment' preprocessing.
    """
    grouped = aarx.data.groupby(['sender_cat', pd.Grouper(key='Date', freq='Y')]) \
        .count().reset_index().sort_values('Date')

    ddd = grouped.pivot(columns="sender_cat", index="Date", values="From").fillna(0)

    top_ddd = ddd[ddd.sum().sort_values(ascending=False)[:top_n].index]

    return top_ddd

def influence_from_arx(arx, top_n):
    """
    Return a dataframe with the annual influence of each organizational
    actor, for the top_n most active stakeholders.
    """
    augment(arx)
    aaarx = aggregate_activity(arx, top_n)

    return aaarx
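
And a sketch of the domain-level pipeline in influence.py on its own, without an affiliations table. The archive setup and list name are again illustrative assumptions:

from bigbang.archive import Archive, open_list_archives
from bigbang.analysis.influence import (
    lookup_stakeholder_by_domain,
    influence_from_arx,
)

# Hypothetical archive, built from previously scraped list data.
arx = Archive(open_list_archives("6lo", archive_dir="archives"))

# Resolve a single email domain to an organization name, where known.
print(lookup_stakeholder_by_domain("example.com"))

# Annual activity of the 20 most active organization-level senders.
annual = influence_from_arx(arx, top_n=20)
print(annual.tail())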
