-
Notifications
You must be signed in to change notification settings - Fork 1
/
explore_corpus.py
37 lines (26 loc) · 923 Bytes
/
explore_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
from collections import Counter
import re
import random
import pdb
# Corpus to explore. Swap in one of the alternatives below to inspect a
# different corpus.
filename = '../data/echr.txt'
# filename = '../data/icaad.txt'
# filename = '../data/hrc.txt'
# filename = '../data/wiki+hr.txt'

# Terms whose frequency is reported; an entry may be a multi-word phrase.
keywords = ['rape', 'raped', 'sexual', 'assault', 'sexual assault']

# Read the whole file rather than only its first line, so the statistics
# describe the entire corpus even when it is stored on several lines.
with open(filename, 'r') as f:
    text = f.read()

# Whitespace-delimited tokens, computed once and reused below.
tokens = text.split()
counter = Counter(tokens)

print('\n *** CORPUS: ****')
print('{}\n'.format(filename))
print('Number of Words: {}'.format(len(tokens)))
print('Vocabulary size: {} \n \n'.format(len(counter)))

for word in keywords:
    parts = word.split()
    if len(parts) <= 1:
        # Indexing a Counter yields 0 for unseen tokens, whereas .get()
        # would make the report print None.
        frequency = counter[word]
    else:
        # A phrase never equals a single token, so count the runs of
        # consecutive tokens that spell it out instead.
        width = len(parts)
        frequency = sum(
            1
            for start in range(len(tokens) - width + 1)
            if tokens[start:start + width] == parts
        )
    print('Frequency of {}: {} \n'.format(word, frequency))
def get_words_in_context(word, *, corpus=None, n_samples=10, window=50):
    """Print randomly chosen occurrences of ``word`` with surrounding text.

    Args:
        word: Regular-expression pattern to look for. It is handed to
            ``re.finditer`` unchanged, so it also matches inside longer
            words.
        corpus: String to search. Defaults to the module-level ``text``.
        n_samples: Number of snippets to print. Occurrences are drawn with
            replacement, so the same one may be printed more than once.
        window: Number of characters shown before the start of the match,
            and also the number shown from the start of the match onward.

    Prints a notice instead of raising when the pattern does not occur.
    """
    if corpus is None:
        corpus = text

    positions = [match.start() for match in re.finditer(word, corpus)]
    if not positions:
        # Sampling from an empty list would raise; report the miss instead.
        print("No occurrences of '{}' found \n".format(word))
        return

    for _ in range(n_samples):
        position = random.choice(positions)
        # Clamp the left edge: a negative slice start would count from the
        # end of the corpus and yield an empty or unrelated snippet.
        left = max(0, position - window)
        print('- {} \n'.format(corpus[left:position + window]))
# Print sample contexts for the pattern 'raped' from the loaded corpus.
get_words_in_context('raped')