-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcooccurrence.py
88 lines (69 loc) · 2.98 KB
/
cooccurrence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
# Read the annotated work-title table (tab-separated).
INPUT_PATH = "work_title.tsv"
raw = pd.read_csv(INPUT_PATH, sep='\t')

# Keep only rows that describe a literary work AND carry a MiMoTextBase ID.
is_literary = raw['type'] == 'Literary work'
has_base_id = raw['MiMoTextBase_ID'].notna()
filtered_df = raw[is_literary & has_base_id]
print(f'number of mentioned works: {len(filtered_df)}')

# Distinct scholarly texts in which literary works are mentioned.
unique_entities = filtered_df['text'].unique()
print(f'number of scholarly works: {len(unique_entities)}')

# Distinct literary-work identifiers; their order fixes the row/column
# order of the square co-occurrence matrix initialised below.
entities = filtered_df['Identifier'].unique()
print(f'number of literary works: {len(entities)}')
co_occurrence_matrix = np.zeros((len(entities), len(entities)))
# Map each identifier to its matrix row/column once, for O(1) lookups.
# (The original code re-ran np.where scans over the whole frame for every
# row pair, making the fill accidentally quadratic in the table size.)
entity_index = {entity: pos for pos, entity in enumerate(entities)}

# Fill the co-occurrence matrix: two identifiers co-occur when they are
# mentioned in the same scholarly text.  Counting multiplicities keeps the
# result identical to the original row-by-row increments: identifiers
# appearing a and b times in one text contribute a*b to each directed cell.
for _, ids_in_text in filtered_df.groupby('text')['Identifier']:
    counts = ids_in_text.value_counts()
    for identifier, n_i in counts.items():
        i = entity_index[identifier]
        for other_identifier, n_j in counts.items():
            if identifier != other_identifier:
                co_occurrence_matrix[i, entity_index[other_identifier]] += n_i * n_j

# Print the co-occurrence matrix and persist it as a TSV file.
print(co_occurrence_matrix)
with open("matrix.tsv", "w") as f:
    for row in co_occurrence_matrix:
        f.write("\t".join(map(str, row)) + "\n")
# Convert the co-occurrence matrix into an undirected weighted graph
# (nonzero cell -> edge; the count becomes the edge 'weight' attribute).
G = nx.from_numpy_array(co_occurrence_matrix)

# Node index -> literary-work identifier, in matrix (= entities) order.
entity_labels = {idx: entity for idx, entity in enumerate(entities)}

# Persist the node-index-to-identifier mapping for later lookups.
with open("entity_labels.tsv", "w") as f:
    for key, value in entity_labels.items():
        # f-string stringifies value: identifiers read from the TSV are not
        # guaranteed to be str, and the old `... + value` raised TypeError
        # for non-string identifiers.
        f.write(f"{key}\t{value}\n")

# Attach the identifiers as node labels and export for Gephi.
nx.set_node_attributes(G, entity_labels, 'label')
nx.write_gexf(G, "work_title_graph.gexf")
### Backup/scratch code, deliberately disabled by wrapping it in a
### module-level string literal (never executed): a degree-centrality
### report and a matplotlib preview of the graph.
"""
# Calculate degree centrality for each node
degree_centrality = nx.degree_centrality(G)
# Sort nodes based on degree centrality
sorted_nodes = sorted(degree_centrality, key=degree_centrality.get, reverse=True)
# Output a list of node labels and their centrality measures
node_centrality_list = []
for node in sorted_nodes:
label = entity_labels[node]
centrality = degree_centrality[node]
node_centrality_list.append((label, centrality))
# Print the sorted list of node labels and centrality measures
for label, centrality in node_centrality_list:
print(f"Node: {label} | Degree Centrality: {centrality}")
# Visualize the network graph with labels
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G)
labels = nx.get_node_attributes(G, 'label')
nx.draw_networkx(G, pos, labels=labels, with_labels=True, node_size=500, node_color='lightblue', edge_color='gray')
plt.title('Co-occurrence Network Graph')
plt.axis('off')
plt.show()
"""