-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathutil.py
123 lines (94 loc) · 4.63 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from config import *
##############################################################################################################################
def isSublist(A, B):
    """Return True if every item of ``A`` also occurs in ``B``.

    Despite the name, this is a pure membership test: neither the order of
    the items nor their contiguity in ``B`` is checked, and duplicates in
    ``A`` only need a single occurrence in ``B``. An empty ``A`` is
    trivially contained in any ``B``.
    """
    return all(item in B for item in A)
##############################################################################################################################
##############################################################################################################################
def findAllCommonContiguousSublists(A, B, turnToLowerCases=True):
    """Find the contiguous runs of items that ``A`` and ``B`` have in common.

    The result is ordered by decreasing run length. A shorter match is
    dropped when both its ``A`` indices and its ``B`` indices lie inside a
    match that has already been recorded.

    Output format (0-indexed):
        [
            [[indices in 'A' for common sublist 1], [indices in 'B' for common sublist 1]],
            ...,
            [[indices in 'A' for common sublist n], [indices in 'B' for common sublist n]]
        ]

    Parameters:
        A, B: iterables of items; the inputs themselves are never modified.
        turnToLowerCases: when true, every item is compared via its
            ``.lower()`` form, so the items must be strings.

    NOTE: this is still a brute-force search over every window size and
    every pair of start positions; suffix trees would give a much faster
    method.
    """
    # Work on private copies so the callers' sequences stay untouched.
    if turnToLowerCases:
        a = [item.lower() for item in A]
        b = [item.lower() for item in B]
    else:
        a = list(A)
        b = list(B)

    # Always scan with the shorter sequence as 'a'; remember whether the
    # roles were exchanged so the index lists can be put back at the end.
    swapped = len(a) > len(b)
    if swapped:
        a, b = b, a

    commonContiguousSublists = []
    for size in range(len(a), 0, -1):
        for i in range(len(a) - size + 1):
            window = a[i:i + size]
            for j in range(len(b) - size + 1):
                if window != b[j:j + size]:
                    continue
                # Every recorded index list is a contiguous run, so
                # "is contained in" reduces to comparing the end points.
                alreadyInserted = any(
                    aIndices[0] <= i and i + size - 1 <= aIndices[-1]
                    and bIndices[0] <= j and j + size - 1 <= bIndices[-1]
                    for aIndices, bIndices in commonContiguousSublists
                )
                if not alreadyInserted:
                    commonContiguousSublists.append(
                        [list(range(i, i + size)), list(range(j, j + size))]
                    )

    if swapped:
        # Restore the documented [indices in A, indices in B] order.
        for pair in commonContiguousSublists:
            pair[0], pair[1] = pair[1], pair[0]
    return commonContiguousSublists
##############################################################################################################################
##############################################################################################################################
def findTextualNeighborhood(sentenceDetails, wordIndex, leftSpan, rightSpan):
    """Collect the content-word lemmas surrounding the word at ``wordIndex``.

    The window is [wordIndex-leftSpan, wordIndex+rightSpan], clipped to the
    sentence; the word at ``wordIndex`` itself is excluded. ``wordIndex`` is
    treated as 1-based (the window start is clamped to 1 and converted to a
    0-based slice offset below).

    Parameters:
        sentenceDetails: sequence of per-token records; ``item[3]`` is used
            as the lemma and ``item[1]`` as the reported word index
            (presumably the token's 1-based position - confirm against the
            code that builds these records).
        wordIndex: position of the centre word.
        leftSpan, rightSpan: how many positions to look to the left/right.

    Returns:
        [wordIndices, lemmas, leftAvailable, rightAvailable] where the first
        two lists describe the neighbours that are neither stopwords nor
        punctuation, and the last two are the number of positions actually
        available on each side after clipping to the sentence.
    """
    sentenceLength = len(sentenceDetails)
    startWordIndex = max(1, wordIndex - leftSpan)
    endWordIndex = min(sentenceLength, wordIndex + rightSpan)

    # Build the exclusion list once instead of once per inspected token.
    excluded = stopwords + punctuations

    leftNeighbors = sentenceDetails[startWordIndex - 1:wordIndex - 1]
    rightNeighbors = sentenceDetails[wordIndex:endWordIndex]

    lemmas = []
    wordIndices = []
    for neighbors in (leftNeighbors, rightNeighbors):
        for item in neighbors:
            if item[3] not in excluded:
                lemmas.append(item[3])
                wordIndices.append(item[1])

    return [wordIndices, lemmas, wordIndex - startWordIndex, endWordIndex - wordIndex]
##############################################################################################################################
##############################################################################################################################
def isAcronym(word, namedEntity):
    """Return whether ``word`` is an acronym of ``namedEntity``.

    Parameters:
        word: candidate acronym; periods are ignored, so "U.S." is handled
            like "US".
        namedEntity: list of the component words of the named entity.

    Returns:
        True only if ``word`` (without periods) is upper-case, is not the
        single letter "A" or "I", has exactly one letter per component
        word, and each letter equals the first character of the
        corresponding component word (case-sensitively).
    """
    canonicalWord = word.replace('.', '')
    if (not canonicalWord.isupper()
            or len(canonicalWord) != len(namedEntity)
            or canonicalWord.lower() in ['a', 'i']):
        return False
    # component[:1] is '' for an empty component word, which simply fails
    # the comparison instead of raising IndexError.
    return all(letter == component[:1]
               for letter, component in zip(canonicalWord, namedEntity))
##############################################################################################################################