add cooccurrence between authors
duan committed Nov 2, 2023
1 parent dafc463 commit 99b8b4d
Showing 5 changed files with 62 additions and 35 deletions.
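The author-level mode introduced here keys the co-occurrence index on the last underscore-separated token of each work's Identifier (see the pmi.py diff below). A minimal sketch of that derivation, assuming identifiers of the form "<work>_<author>"; the actual format in work_title.tsv is not shown in this commit, and the identifiers below are purely hypothetical:

# Hypothetical identifiers, for illustration only; the real TSV may differ.
identifiers = ["Candide_Voltaire", "Zadig_Voltaire", "Julie_Rousseau"]
authors = [s.split("_")[-1] for s in identifiers]  # same split as get_author() in pmi.py
print(authors)  # ['Voltaire', 'Voltaire', 'Rousseau'] -- several works collapse into one author unit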
14 changes: 14 additions & 0 deletions npmi_heatmap_author_sentence.html

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions npmi_heatmap_author_text.html

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion npmi_heatmap_text.html → npmi_heatmap_work_text.html

Large diffs are not rendered by default.

65 changes: 32 additions & 33 deletions pmi.py
@@ -3,42 +3,49 @@
 import numpy as np
 import plotly.graph_objects as go

+umfang = "text" # set it to "sentence" in order to get only the cooccurrence in the same sentence
+unit = "author" # alternatively set it to "work"
+
 df = pd.read_csv("work_title.tsv", sep="\t")
 df = df[df.type == "Literary work"]
 #remove the rows where identifier is empty
 df = df[df.Identifier.notnull()]
 df = df[df.MiMoTextBase_ID.notnull()]

-all_works = df.Identifier.unique().tolist() #get a list of all mentioned literary works
+def get_author(s):
+    return s.split("_")[-1]
+
+if unit == "author":
+    df['Identifier'] = df['Identifier'].apply(get_author)
+
+all_units = df.Identifier.unique().tolist() #get a list of all mentioned literary works
 work_index = dict() #get a index of secondary works for each literary work
-for work in all_works:
+for work in all_units:
     work_index[work] = set()

 unique_text = df.text.unique() #get a list of all secondary works

-umfang = "text" # set it to "sentence" in order to get only the cooccurrence in the same sentence
-
 for title in unique_text:
     if umfang == "text":
         ner_list = df[df.text == title].Identifier.unique() #get a list of unique literary works mentioned in the secondary work
-        for work in all_works:
+        for work in all_units:
             if work in ner_list:
                 work_index[work].add(title)
     else:
         for sentence in df[df.text == title].sentence.unique():
             ner_list = df[df.text == title][df.sentence == sentence].Identifier.unique()
-            for work in all_works:
+            for work in all_units:
                 if work in ner_list:
                     work_index[work].add(title+"_"+str(sentence))

-n = len(all_works)
+n = len(all_units)
 npmi_matrix = np.zeros((n, n))

 for i in range(n):
     for j in range(i+1, n): # Only compute upper triangle, since matrix is symmetric
-        p_work1 = len(work_index[all_works[i]]) / len(unique_text)
-        p_work2 = len(work_index[all_works[j]]) / len(unique_text)
-        p_occurrence = len(work_index[all_works[i]].intersection(work_index[all_works[j]])) / len(unique_text)
+        p_work1 = len(work_index[all_units[i]]) / len(unique_text)
+        p_work2 = len(work_index[all_units[j]]) / len(unique_text)
+        p_occurrence = len(work_index[all_units[i]].intersection(work_index[all_units[j]])) / len(unique_text)

         if p_occurrence == 0: # Avoid log(0) and division by zero
             npmi = 0
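The branch that actually computes the nPMI value for co-occurring pairs is collapsed between this hunk and the next. For orientation, a minimal sketch of the standard normalised PMI, consistent with the probabilities defined above and with the commented-out block at the end of the file (the committed code may smooth the counts differently):

import math

# p_occurrence is the joint probability p(x, y); p_work1 and p_work2 are the marginals.
pmi = math.log2(p_occurrence / (p_work1 * p_work2))
npmi = pmi / -math.log2(p_occurrence)  # normalised to the range [-1, 1]
npmi_matrix[i][j] = npmi               # mirrored by the visible npmi_matrix[j][i] = npmi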
@@ -50,17 +57,11 @@
         npmi_matrix[j][i] = npmi

 # Assuming npmi_matrix is already computed and is a numpy array
-x_labels = all_works
-y_labels = all_works
+x_labels = all_units
+y_labels = all_units

 custom_colorscale = [
-    [0.0, 'rgb(255,255,255)'] # white for zero
-    # You can add more colors for other values here.
-    # For example:
-    # [0.1, 'rgb(235, 237, 240)'],
-    # [1.0, 'rgb(0, 0, 255)']
-    # Ensure that the colorscale covers the range from the minimum to the maximum
-    # values that your npmi_matrix can take, excluding zero if it has a special color.
+    [0.0, 'rgb(255,255,255)']
 ]

 fig = go.Figure(data=go.Heatmap(
@@ -75,30 +76,28 @@

 # Update layout to make it more readable
 fig.update_layout(
-    title=f'NPMI Heatmap of Works (cooccurrence in {umfang})',
+    title=f'nPMI Heatmap ({unit} cooccurrence in {umfang})',
     xaxis_nticks=100,
     yaxis_nticks=100,
-    xaxis_title="Work 1",
-    yaxis_title="Work 2",
+    xaxis_title=f'{unit} 1',
+    yaxis_title=f'{unit} 2',
     margin=dict(l=150, r=5, t=45, b=120)
 )

 # Export to HTML
-fig.write_html(f'npmi_heatmap_{umfang}.html')
-
-
+fig.write_html(f'npmi_heatmap_{unit}_{umfang}.html')

-#find all pairs of items of all_works
+#find all pairs of items of all_units
 # pmi_result = dict()
-# for i in range(len(all_works)):
-# for j in range(i+1, len(all_works)):
-# p_work1 = len(work_index[all_works[i]])/len(unique_text)
-# p_work2 = len(work_index[all_works[j]])/len(unique_text)
-# p_occurrence = len(work_index[all_works[i]].intersection(work_index[all_works[j]]))/len(unique_text)
+# for i in range(len(all_units)):
+# for j in range(i+1, len(all_units)):
+# p_work1 = len(work_index[all_units[i]])/len(unique_text)
+# p_work2 = len(work_index[all_units[j]])/len(unique_text)
+# p_occurrence = len(work_index[all_units[i]].intersection(work_index[all_units[j]]))/len(unique_text)
 # pmi = math.log2(p_occurrence+0.5/((p_work1+0.5)*(p_work2+0.5)))
 # npmi = pmi/-math.log2(p_occurrence+0.5)
-# #print(all_works[i], all_works[j],npmi)
-# pmi_result[(all_works[i], all_works[j])] = npmi
+# #print(all_units[i], all_units[j],npmi)
+# pmi_result[(all_units[i], all_units[j])] = npmi

 # #write the result to a file
 # with open("pmi_result.tsv", "w",encoding="utf-8") as f:
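Given the new output pattern f'npmi_heatmap_{unit}_{umfang}.html', the HTML files touched by this commit line up with different settings of the two switches at the top of pmi.py. A hypothetical illustration of that mapping (the switches are edited in the source rather than passed as arguments, so each heatmap presumably comes from a separate run):

# All four switch combinations and the file each one would produce.
for unit in ("author", "work"):
    for umfang in ("text", "sentence"):
        print(f"npmi_heatmap_{unit}_{umfang}.html")
# npmi_heatmap_author_text.html and npmi_heatmap_author_sentence.html are the two files added above;
# npmi_heatmap_work_text.html is the renamed work-level heatmap.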