Skip to content

Commit

Permalink
[lazylex/html] Skeleton for XML conversion
Browse files Browse the repository at this point in the history
I think we can do a sed-like conversion.
  • Loading branch information
Andy C committed Jan 13, 2025
1 parent f2bf58f commit e3f348b
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 33 deletions.
14 changes: 14 additions & 0 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,20 @@ def Validate(contents, flags, counters):
counters.num_tokens += len(tokens)


def ToXml(h):
    # type: (str) -> str
    """Convert an HTML string to XML.

    This is a skeleton: no conversion is implemented yet, so the input is
    returned unchanged.

    Args:
      h: HTML source text.

    Returns:
      The converted text; currently identical to h.
    """
    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this
    return h


class Counters(object):

def __init__(self):
Expand Down
92 changes: 59 additions & 33 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,20 +348,11 @@ def testInvalid(self):
self.fail('Expected LexError %r' % s)

def testValid(self):
for s in VALID_LEX:
for s, _ in VALID_LEX:
tokens = Lex(s)
print()


VALID_LEX = [
'<foo>',
'<foo x=y>',
'<foo x="&">',

# Allowed with BadAmpersand
'<p> x & y </p>',
]

INVALID_LEX = [
'<a><',
'&amp<',
Expand All @@ -379,50 +370,71 @@ def testValid(self):
'<STYLEz><</STYLEz>',
]

# (html, expected_xml) pairs.  The html strings are passed to Lex() by the
# lexer test's testValid.  An expected_xml of '' means no XML expectation has
# been written yet; XmlTest.testValid makes no comparison for those entries.
VALID_LEX = [
# TODO: convert these to XML
('<foo></foo>', ''),
('<foo x=y></foo>', ''),
('<foo x="&"></foo>', ''),

# Allowed with BadAmpersand
('<p> x & y </p>', ''),
]

# HTML strings with tag-balancing errors.
# NOTE(review): presumably fed to html.Validate() by the parser test's
# testInvalid -- that usage is not visible here; confirm.
INVALID_PARSE = [
'<a></b>',
'<a>', # missing closing tag
'<meta></meta>', # <meta> is a void tag, so it takes no closing tag
]

# Markers for the expected_xml slot of the (html, expected_xml) test tables.
# NOTE(review): SKIP is not referenced in the visible code -- entries to be
# skipped use '' instead; confirm whether SKIP is meant to replace ''.
SKIP = 0
# UNCHANGED: html.ToXml() is expected to return the input as-is.
UNCHANGED = 1

VALID_PARSE = [
'<!DOCTYPE html>\n',
'<!DOCTYPE>',
('<!DOCTYPE html>\n', ''),
('<!DOCTYPE>', ''),

# empty strings
'<p x=""></p>',
"<p x=''></p>",
('<p x=""></p>', UNCHANGED),
("<p x=''></p>", UNCHANGED),

('<self-closing a="b" />', UNCHANGED),

# We could also normalize CDATA?
# Note that CDATA has an escaping problem: you need to handle ]]> with
# concatenation. It just "pushes the problem around".
# So I think it's better to use ONE kind of escaping, which is &lt;
('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

# allowed, but 3 < 4 is not allowed
'<a> 3 > 4 </a>',
('<a> 3 > 4 </a>', ''),
# allowed, but 3 > 4 is not allowed
'<p x="3 < 4"></p>',
'<b><a href="foo">link</a></b>',
'<meta><a></a>',
('<p x="3 < 4"></p>', ''),
('<b><a href="foo">link</a></b>', ''),
('<meta><a></a>', ''),
# no attribute value
'<button disabled></button>',
'<button disabled=></button>',
'<button disabled= ></button>',
('<button disabled></button>', ''),
('<button disabled=></button>', ''),
('<button disabled= ></button>', ''),

# single quoted is pretty common
"<a href='single'></a>",
("<a href='single'></a>", ''),

# Conceding to reality - I used these myself
'<a href=ble.sh></a>',
'<a href=foo.html></a>',
'<foo x="&"></foo>',
('<a href=ble.sh></a>', ''),
('<a href=foo.html></a>', ''),
('<foo x="&"></foo>', ''),

# caps
'<foo></FOO>',
'<Foo></fOO>',
('<foo></FOO>', ''),
('<Foo></fOO>', ''),

# capital VOID tag
'<META><a></a>',
'<script><</script>',
('<META><a></a>', ''),
('<script><</script>', ''),
# matching
'<SCRipt><</SCRipt>',
'<SCRIPT><</SCRIPT>',
'<STYLE><</STYLE>',
('<SCRipt><</SCRipt>', ''),
('<SCRIPT><</SCRIPT>', ''),
('<STYLE><</STYLE>', ''),
#'<SCRipt><</script>',

# Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
Expand Down Expand Up @@ -467,7 +479,7 @@ def testInvalid(self):

def testValid(self):
counters = html.Counters()
for s in VALID_PARSE:
for s, _ in VALID_PARSE:
html.Validate(s, html.BALANCED_TAGS, counters)
print('HTML5 %r' % s)
print('HTML5 attrs %r' % counters.debug_attrs)
Expand All @@ -481,5 +493,19 @@ def testValidXml(self):
print('XML attrs %r' % counters.debug_attrs)


class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the shared (html, expected_xml) tables."""

    def testValid(self):
        """Run ToXml() on every entry of VALID_LEX and VALID_PARSE.

        The expected_xml slot of each pair selects the check:
          UNCHANGED: the output must equal the input.
          '':        no expectation written yet; ToXml() is still called, but
                     its result is not compared.
          otherwise: the output must equal expected_xml.
        """
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            actual = html.ToXml(h)
            if expected_xml == UNCHANGED:
                self.assertEqual(h, actual)
            elif expected_xml == '':
                # Skip: nothing to compare against yet.
                continue
            else:
                self.assertEqual(expected_xml, actual)


if __name__ == '__main__':
unittest.main()

0 comments on commit e3f348b

Please sign in to comment.