-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDomComparator.py
156 lines (140 loc) · 4.66 KB
/
DomComparator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import hashlib
import xml.dom.minidom
from Queue import *
from logger import LoggerHandler
from lxml.html.diff import htmldiff
from BeautifulSoup import BeautifulSoup
import tidy
from lxml import etree
from lxml.html.clean import clean_html
import os, sys
from StringIO import StringIO
logger = LoggerHandler(__name__)
def cleanDom(dom):
    """Remove htmldiff insertion markers from a string.

    The markers are deleted one after another in a fixed order:
    ``</ins>,`` first, then ``<ins>,``, then ``<ins>`` and finally
    ``</ins>``. All other text in *dom* is returned untouched.
    """
    return (dom.replace("</ins>,", "")
               .replace("<ins>,", "")
               .replace("<ins>", "")
               .replace("</ins>", ""))
def hash(dom):
    """Return the SHA-256 hex digest of a DOM string.

    NOTE: this function shadows the builtin ``hash`` inside this module; the
    name is kept because existing callers rely on it.

    dom -- text (encoded as UTF-8 before hashing) or an already-encoded
           byte string (hashed as-is).

    Returns the digest as a hexadecimal string.
    """
    # Byte strings are hashed directly. Calling .encode() on one makes
    # Python 2 decode it as ASCII first, which raises UnicodeDecodeError as
    # soon as the markup contains a non-ASCII byte.
    if not isinstance(dom, bytes):
        dom = dom.encode('utf-8')
    return hashlib.sha256(dom).hexdigest()
def getHtmlDiff(dom1, dom2, tagCount1, tagCount2):
    """Return the content htmldiff marks as inserted between dom1 and dom2.

    The diff of dom1 -> dom2 is parsed with BeautifulSoup; the stringified
    list of <ins> elements is passed through cleanDom and returned.

    tagCount1 and tagCount2 are accepted for call compatibility only; the
    result does not depend on them.

    Side effect: prints the raw <ins> list, the cleaned <del> list and the
    returned value to stdout.
    """
    parsedDiff = BeautifulSoup(htmldiff(dom1, dom2))

    inserted = str(parsedDiff.findAll("ins"))
    print(inserted)

    deleted = str(parsedDiff.findAll("del"))
    print(cleanDom(deleted))

    insertedOnly = cleanDom(inserted)
    print(insertedOnly)
    return insertedOnly
def checkExistState(dom1,dom2):
    """Decide whether two DOM strings should be treated as the same state.

    Returns True when:
      * both inputs have the same hash, or
      * no inserted content is reported and the stripped DOMs hash equal, or
      * the tag count of the inserted content is at most 5% of the tag
        count of dom1.

    Returns False when the smaller tag count is below 90% of the larger one,
    when nothing is reported as inserted yet the stripped DOMs differ, or
    when the inserted content exceeds the 5% limit.

    Side effects: prints tag counts to stdout and logs the decision.
    """
    # Identical input: nothing more to compare.
    if hash(dom1) == hash(dom2):
        return True

    tagCount1, strippedDom1 = traverseDom(dom1)
    tagCount2, strippedDom2 = traverseDom(dom2)
    print("ALL tag count %d %d" % (tagCount1, tagCount2))

    fewerTags = min(tagCount1, tagCount2)
    moreTags = max(tagCount1, tagCount2)
    if float(fewerTags) / float(moreTags) < 0.9:
        logger.info("Different States Huge Difference in Tag Count")
        return False

    diffDom = getHtmlDiff(strippedDom1, strippedDom2, tagCount1, tagCount2)

    # "[]" is the string form of an empty result list, i.e. no <ins> found.
    if diffDom == "[]":
        if hash(strippedDom1) == hash(strippedDom2):
            return True
        logger.info("Different States No Insert Delete Tags Found")
        return False

    diffTagCount = traverseDom(diffDom)[0]
    logger.info("tag count %d %d" % (diffTagCount, tagCount1))

    # The inserted content is measured against the first DOM's tag count.
    insertedShare = float(diffTagCount) / float(tagCount1)
    if insertedShare * 100 > 5:
        print("%s %s" % (diffTagCount, tagCount1))
        print(insertedShare)
        return False

    logger.info("STATE ALREADY EXIST")
    return True
def getDomDiff(parentDom, childDom):
    """Return the content htmldiff marks as inserted from parentDom to childDom.

    The diff is parsed with BeautifulSoup and the stringified list of <ins>
    elements is returned after cleanDom has removed the <ins> markers.
    """
    parsedDiff = BeautifulSoup(htmldiff(parentDom, childDom))
    return cleanDom(str(parsedDiff.findAll("ins")))
def traverseDom(domString):
    """Normalise markup, blank its text and attribute values, and count nodes.

    The input is run through tidy, lxml's clean_html and etree.HTML; the
    children of the resulting root are serialised as XML and re-parsed with
    minidom. Every node reachable from the parsed document is then visited:
      * '#text' nodes get an empty value and are not counted;
      * every other node is counted, and each of its attribute values is
        set to the empty string.

    domString -- the markup to process.

    Returns a tuple (tagCount, strippedXml) where strippedXml is the
    minidom document written back out after blanking.
    """
    # Function-scope import: deque is a plain FIFO without the locking that
    # Queue performs on every put/get, which this single-threaded walk does
    # not need.
    from collections import deque

    tidied = str(tidy.parseString(domString))
    cleaned = clean_html(tidied)
    root = etree.HTML(cleaned.replace('\r', ''))
    serialized = '\n'.join([etree.tostring(child, pretty_print=True, method="xml")
                            for child in root])
    dom = xml.dom.minidom.parseString(serialized)

    tagCount = 0
    pending = deque(dom.childNodes)
    while pending:
        node = pending.popleft()
        if node.nodeName == '#text':
            node.nodeValue = ""
        else:
            tagCount += 1
        # attributes is None or empty for nodes that carry none.
        if node.attributes:
            for name in node.attributes.keys():
                node.attributes[name].value = ""
        pending.extend(node.childNodes)

    out = StringIO()
    dom.writexml(out)
    return (tagCount, out.getvalue())
'''
file1 = open("page1", "r").read()
file3 = open("page1", "r").read()
benign1="<html><head><title>Title1</title></head><body><b>Hi</b><a href='a'></a><p>hello1</p></body></html>"
benign2 = None
benign2="<html><head><title>Title2</title></head><body><b>Hi</b><a href='a'></a><p>hello2</p></body></html>"
hostile1="<html><head><title>Title3</title></head><body><b>Hi</b><a href='a'></a><p>hello3</p></body></html>"
htmlCompare(file1,None,file3)
'''