-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathTextSummariser.py
141 lines (111 loc) · 4.14 KB
/
TextSummariser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from langdetect import detect
# --- Script setup: read the input text and load language-appropriate stop words ---
text_str = input("Enter the text : ")

# Detect the language exactly once; the original called detect() per branch,
# re-running the (non-trivial) detector up to three times.
_lang = detect(text_str)


def _load_stopword_file(path):
    """Return the stop words stored one-per-line in *path* (UTF-8),
    with trailing newlines stripped."""
    # `with` closes the file automatically; the original's explicit
    # f.close() inside the with-block was redundant.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


if _lang == 'ta':
    stopWords = _load_stopword_file('tamil Stop words.txt')
elif _lang == 'en':
    stopWords = set(stopwords.words("english"))
elif _lang == 'hi':
    stopWords = _load_stopword_file('hindi stopwords.txt')
else:
    # The original left stopWords undefined for any other detected language,
    # which raised a NameError later in _create_frequency_table. Fall back
    # to an empty set so summarization still runs (just without stop-word
    # filtering).
    stopWords = set()
def _create_frequency_table(text_string) -> dict:
    """Build a word-frequency table for *text_string*.

    The text is tokenized into words, stop words (the module-level
    ``stopWords`` collection) are discarded, and the surviving words are
    reduced to their Porter stems so inflected forms are counted together.

    :param text_string: raw input text.
    :rtype: dict  # maps word stem -> occurrence count
    """
    words = word_tokenize(text_string)
    ps = PorterStemmer()
    freqTable = dict()
    for word in words:
        # Bug fix: test stop-word membership on the raw (lower-cased) token,
        # not on the stem. Stemming first mangled words like "was" -> "wa",
        # which no longer matched the stop list, so stop words leaked into
        # the frequency table.
        if word.lower() in stopWords:
            continue
        stem = ps.stem(word)
        freqTable[stem] = freqTable.get(stem, 0) + 1
    return freqTable
def _score_sentences(sentences, freqTable) -> dict:
"""
score a sentence by its words
Basic algorithm: adding the frequency of every non-stop word in a sentence divided by total no of words in a sentence.
:rtype: dict
"""
sentenceValue = dict()
for sentence in sentences:
word_count_in_sentence = (len(word_tokenize(sentence)))
word_count_in_sentence_except_stop_words = 0
for wordValue in freqTable:
if wordValue in sentence.lower():
word_count_in_sentence_except_stop_words += 1
if sentence in sentenceValue:
sentenceValue[sentence] += freqTable[wordValue]
else:
sentenceValue[sentence] = freqTable[wordValue]
if sentence in sentenceValue:
sentenceValue[sentence] = sentenceValue[sentence] / word_count_in_sentence_except_stop_words
'''
Notice that a potential issue with our score algorithm is that long sentences will have an advantage over
short sentences.
To solve this, we're dividing every sentence score by the number of words in the sentence.
Note that here sentence[:10] is the first 10 character of any sentence, this is to save memory while saving keys of
the dictionary.
'''
print(sentenceValue)
return sentenceValue
def _find_average_score(sentenceValue) -> int:
"""
Find the average score from the sentence value dictionary
:rtype: int
"""
sumValues = 0
for entry in sentenceValue:
sumValues += sentenceValue[entry]
# Average value of a sentence from original text
average = (sumValues / len(sentenceValue))
return average
def _generate_summary(sentences, sentenceValue, threshold):
sentence_count = 0
summary = ''
for sentence in sentences:
if sentence in sentenceValue and sentenceValue[sentence] >= (threshold):
summary += " " + sentence
sentence_count += 1
return summary
def run_summarization(text):
    """Produce an extractive summary of *text*.

    Pipeline: build the word-frequency table, split the text into
    sentences, score every sentence against the table, take the average
    score as the selection threshold, and assemble the summary from the
    sentences that clear it.
    """
    # 1. Word-frequency table over the whole text.
    word_frequencies = _create_frequency_table(text)
    # 2. Sentence tokenization (sent_tokenize handles the splitting).
    sentence_list = sent_tokenize(text)
    # 3. Score each sentence against the frequency table.
    scores = _score_sentences(sentence_list, word_frequencies)
    # 4. Threshold = average sentence score.
    average = _find_average_score(scores)
    # 5. Keep sentences scoring at or above the threshold.
    return _generate_summary(sentence_list, scores, 1.0 * average)
if __name__ == '__main__':
    # Summarize the text collected at module load time and show the result.
    summary_text = run_summarization(text_str)
    print("\nSummarized text is: \n")
    print(summary_text)