-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_wordclouds.py
98 lines (83 loc) · 3.1 KB
/
create_wordclouds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
This script creates wordclouds from the results of the pubmed_trends_analysis tool
"""
from collections import defaultdict
import re
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Tab-separated result file written by dig_analysis.sh
RESULTS = "results/omics_trend_2024-02-09_15-21_dig_analysis.tsv"

# Read two columns (zero-based indices 2 and 4) as strings; transposing the
# (rows, 2) table lets each column be unpacked into its own array.
with open(RESULTS, "r", encoding='utf-8') as results_file:
    words, keywords = np.genfromtxt(
        results_file,
        usecols=[2, 4],
        dtype='str',
        delimiter="\t",
    ).T

# Keywords of interest (given in omics_keywords.txt)
targets = ['proteomics', 'metabolomics', 'lipidomics', 'glycomics']
# Helper that builds one word cloud; called once per keyword
def word_cloud(dict_):
    """Generate a word cloud from a word -> frequency mapping.

    The cloud shows at most 90 words on a white background, without a
    contour, coloured with the 'Dark2' colormap.

    Args:
        dict_: mapping of word to its frequency.

    Returns:
        The object returned by ``WordCloud.generate_from_frequencies``.
    """
    settings = {
        'max_words': 90,
        'background_color': "white",
        'contour_width': 0,
        'colormap': 'Dark2',
    }
    cloud = WordCloud(**settings).generate_from_frequencies(dict_)
    # Switch the axes off on the current pyplot axes
    plt.axis('off')
    return cloud
# Display spellings for the word clouds.
# Keys are the lower-cased words as they are counted; values are the text that
# should be shown instead (acronym casing, corrected spelling, plural form).
to_replace = {
    'ms': 'MS',
    'gc-ms': 'GC-MS',
    'lc-ms/ms': 'LC-MS/MS',
    'lc-ms': 'LC-MS',
    'maldi-ms': 'MALDI-MS',
    'maldi': 'MALDI',
    'maldi-tof ms': 'MALDI-TOF MS',
    'maldi-tog-ms': 'MALDI-TOF MS',
    'nmr': 'NMR',
    'silac': 'SILAC',
    'itraq': 'iTraq',
    'tmt': 'TMT',
    'covid-19': 'COVID-19',
    'sars-cov-2': 'SARS-CoV-2',
    'igg': 'IgG',
    'o-glycosylation': 'O-glycosylation',
    'n-glycosylation': 'N-glycosylation',
    'n-glycan': 'N-glycan',
    'n-glycome': 'N-glycome',
    'n-linked glycans': 'N-linked glycans',
    'n-linked glycosylation': 'N-linked glycosylation',
    'o-glycan': 'O-glycan',
    'rna-seq': 'RNA-Seq',
    'nafld': 'NAFLD',
    'hplc': 'HPLC',
    'immunoglubulin g': 'immunoglobulin G',
    'omic': 'omics',
    'multi-omic': 'multi-omics',
}
# Count word frequencies and create one word cloud per keyword
def _fold_and_rename(counts, replacements):
    """Clean a word -> count mapping before it is drawn as a word cloud.

    Steps, in order:
      1. A word whose plural (word + 's') is also present absorbs the
         plural's count and the plural entry is dropped. A plural that has
         its own entry in ``replacements`` is left alone, otherwise its
         count would be added twice (once to the singular, once under its
         replacement).
      2. Words listed in ``replacements`` are re-keyed under their display
         spelling; counts of several words mapping to the same spelling
         are summed.
      3. The empty string is dropped if present.
      4. A leftover 'lc-ms/m' entry is added to 'LC-MS/MS'.

    Args:
        counts: mapping of word to number of occurrences.
        replacements: mapping of word to the spelling to display instead.

    Returns:
        A new dict; ``counts`` itself is not modified.
    """
    merged = dict(counts)
    # '' is what is left of a word once the keyword has been stripped from
    # a word that was exactly the keyword
    stale = {''}
    renamed = defaultdict(int)
    # Iterate over a snapshot of the keys: values are updated in the loop
    for term in list(merged):
        plural = term + 's'
        if plural in merged and plural not in replacements:
            merged[term] += merged[plural]
            stale.add(plural)
        # Change some words/acronyms
        if term in replacements:
            renamed[replacements[term]] += merged[term]
            stale.add(term)
    # Remove lower-case acronyms and plural forms; a key may be absent
    # (e.g. '' when no word was exactly the keyword), so never raise here
    for key in stale:
        merged.pop(key, None)
    # Add the replaced terms, summing with any entry of the same name
    for label, total in renamed.items():
        merged[label] = merged.get(label, 0) + total
    # NOTE(review): 'lc-ms/m' is presumably 'lc-ms/ms' with its final 's'
    # stripped upstream - confirm against the input data
    truncated = merged.pop('lc-ms/m', None)
    if truncated is not None:
        merged['LC-MS/MS'] = merged.get('LC-MS/MS', 0) + truncated
    return merged


for target in targets:
    # Get words related to the target keyword (but remove the keyword itself,
    # with or without its last character, e.g. 'proteomics' / 'proteomic')
    pat = re.compile(f'{target}|{target[:-1]}')
    list_words = [pat.sub('', word.lower())
                  for word_string in words[keywords == target]
                  for word in word_string.split(';')]
    word, count = np.unique(list_words, return_counts=True)
    res = _fold_and_rename(dict(zip(word, count)), to_replace)
    wc = word_cloud(res)
    wordcloud_svg = wc.to_svg(embed_font=True)
    # Write explicitly as UTF-8 so the output does not depend on the
    # platform's default encoding
    with open(f'{target}_wc.svg', 'w', encoding='utf-8') as svg_file:
        svg_file.write(wordcloud_svg)