add cooccurrence between authors
duan committed Nov 2, 2023
1 parent dafc463 commit 99b8b4d
Showing 5 changed files with 62 additions and 35 deletions.
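The author-level mode introduced here keys the co-occurrence index on the last underscore-separated token of each work's Identifier (see the pmi.py diff below). A minimal sketch of that derivation, assuming identifiers of the form "<work>_<author>"; the actual format in work_title.tsv is not shown in this commit, and the identifiers below are purely hypothetical:

# Hypothetical identifiers, for illustration only; the real TSV may differ.
identifiers = ["Candide_Voltaire", "Zadig_Voltaire", "Julie_Rousseau"]
authors = [s.split("_")[-1] for s in identifiers]  # same split as get_author() in pmi.py
print(authors)  # ['Voltaire', 'Voltaire', 'Rousseau'] -- several works collapse into one author unit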
14 changes: 14 additions & 0 deletions npmi_heatmap_author_sentence.html

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions npmi_heatmap_author_text.html

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion npmi_heatmap_text.html → npmi_heatmap_work_text.html

Large diffs are not rendered by default.

65 changes: 32 additions & 33 deletions pmi.py
@@ -3,42 +3,49 @@
 import numpy as np
 import plotly.graph_objects as go

+umfang = "text" # set it to "sentence" in order to get only the cooccurrence in the same sentence
+unit = "author" # alternatively set it to "work"
+
 df = pd.read_csv("work_title.tsv", sep="\t")
 df = df[df.type == "Literary work"]
 #remove the rows where identifier is empty
 df = df[df.Identifier.notnull()]
 df = df[df.MiMoTextBase_ID.notnull()]

-all_works = df.Identifier.unique().tolist() #get a list of all mentioned literary works
+def get_author(s):
+    return s.split("_")[-1]
+
+if unit == "author":
+    df['Identifier'] = df['Identifier'].apply(get_author)
+
+all_units = df.Identifier.unique().tolist() #get a list of all mentioned literary works
 work_index = dict() #get a index of secondary works for each literary work
-for work in all_works:
+for work in all_units:
     work_index[work] = set()

 unique_text = df.text.unique() #get a list of all secondary works

-umfang = "text" # set it to "sentence" in order to get only the cooccurrence in the same sentence
-
 for title in unique_text:
     if umfang == "text":
         ner_list = df[df.text == title].Identifier.unique() #get a list of unique literary works mentioned in the secondary work
-        for work in all_works:
+        for work in all_units:
             if work in ner_list:
                 work_index[work].add(title)
     else:
         for sentence in df[df.text == title].sentence.unique():
             ner_list = df[df.text == title][df.sentence == sentence].Identifier.unique()
-            for work in all_works:
+            for work in all_units:
                 if work in ner_list:
                     work_index[work].add(title+"_"+str(sentence))

-n = len(all_works)
+n = len(all_units)
 npmi_matrix = np.zeros((n, n))

 for i in range(n):
     for j in range(i+1, n): # Only compute upper triangle, since matrix is symmetric
-        p_work1 = len(work_index[all_works[i]]) / len(unique_text)
-        p_work2 = len(work_index[all_works[j]]) / len(unique_text)
-        p_occurrence = len(work_index[all_works[i]].intersection(work_index[all_works[j]])) / len(unique_text)
+        p_work1 = len(work_index[all_units[i]]) / len(unique_text)
+        p_work2 = len(work_index[all_units[j]]) / len(unique_text)
+        p_occurrence = len(work_index[all_units[i]].intersection(work_index[all_units[j]])) / len(unique_text)

         if p_occurrence == 0: # Avoid log(0) and division by zero
             npmi = 0
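The branch that actually computes the nPMI value for co-occurring pairs is collapsed between this hunk and the next. For orientation, a minimal sketch of the standard normalised PMI, consistent with the probabilities defined above and with the commented-out block at the end of the file (the committed code may smooth the counts differently):

import math

# p_occurrence is the joint probability p(x, y); p_work1 and p_work2 are the marginals.
pmi = math.log2(p_occurrence / (p_work1 * p_work2))
npmi = pmi / -math.log2(p_occurrence)  # normalised to the range [-1, 1]
npmi_matrix[i][j] = npmi               # mirrored by the visible npmi_matrix[j][i] = npmi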
@@ -50,17 +57,11 @@
         npmi_matrix[j][i] = npmi

 # Assuming npmi_matrix is already computed and is a numpy array
-x_labels = all_works
-y_labels = all_works
+x_labels = all_units
+y_labels = all_units

 custom_colorscale = [
-    [0.0, 'rgb(255,255,255)'] # white for zero
-    # You can add more colors for other values here.
-    # For example:
-    # [0.1, 'rgb(235, 237, 240)'],
-    # [1.0, 'rgb(0, 0, 255)']
-    # Ensure that the colorscale covers the range from the minimum to the maximum
-    # values that your npmi_matrix can take, excluding zero if it has a special color.
+    [0.0, 'rgb(255,255,255)']
 ]

 fig = go.Figure(data=go.Heatmap(
@@ -75,30 +76,28 @@

 # Update layout to make it more readable
 fig.update_layout(
-    title=f'NPMI Heatmap of Works (cooccurrence in {umfang})',
+    title=f'nPMI Heatmap ({unit} cooccurrence in {umfang})',
     xaxis_nticks=100,
     yaxis_nticks=100,
-    xaxis_title="Work 1",
-    yaxis_title="Work 2",
+    xaxis_title=f'{unit} 1',
+    yaxis_title=f'{unit} 2',
     margin=dict(l=150, r=5, t=45, b=120)
 )

 # Export to HTML
-fig.write_html(f'npmi_heatmap_{umfang}.html')
-
-
+fig.write_html(f'npmi_heatmap_{unit}_{umfang}.html')

-#find all pairs of items of all_works
+#find all pairs of items of all_units
 # pmi_result = dict()
-# for i in range(len(all_works)):
-# for j in range(i+1, len(all_works)):
-# p_work1 = len(work_index[all_works[i]])/len(unique_text)
-# p_work2 = len(work_index[all_works[j]])/len(unique_text)
-# p_occurrence = len(work_index[all_works[i]].intersection(work_index[all_works[j]]))/len(unique_text)
+# for i in range(len(all_units)):
+# for j in range(i+1, len(all_units)):
+# p_work1 = len(work_index[all_units[i]])/len(unique_text)
+# p_work2 = len(work_index[all_units[j]])/len(unique_text)
+# p_occurrence = len(work_index[all_units[i]].intersection(work_index[all_units[j]]))/len(unique_text)
 # pmi = math.log2(p_occurrence+0.5/((p_work1+0.5)*(p_work2+0.5)))
 # npmi = pmi/-math.log2(p_occurrence+0.5)
-# #print(all_works[i], all_works[j],npmi)
-# pmi_result[(all_works[i], all_works[j])] = npmi
+# #print(all_units[i], all_units[j],npmi)
+# pmi_result[(all_units[i], all_units[j])] = npmi

 # #write the result to a file
 # with open("pmi_result.tsv", "w",encoding="utf-8") as f:
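Given the new output pattern f'npmi_heatmap_{unit}_{umfang}.html', the HTML files touched by this commit line up with different settings of the two switches at the top of pmi.py. A hypothetical illustration of that mapping (the switches are edited in the source rather than passed as arguments, so each heatmap presumably comes from a separate run):

# All four switch combinations and the file each one would produce.
for unit in ("author", "work"):
    for umfang in ("text", "sentence"):
        print(f"npmi_heatmap_{unit}_{umfang}.html")
# npmi_heatmap_author_text.html and npmi_heatmap_author_sentence.html are the two files added above;
# npmi_heatmap_work_text.html is the renamed work-level heatmap.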