diff --git a/lazylex/html.py b/lazylex/html.py
index 4014d417e..09251037c 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -919,6 +919,20 @@ def Validate(contents, flags, counters):
counters.num_tokens += len(tokens)
+def ToXml(h):
+ # type: (str) -> str
+
+ # TODO:
+ # 1. Lex it
+ # 2. < & > must be escaped
+ # a. in raw data
+ # b. in quoted strings
+ # 3. ', UNCHANGED),
# allowed, but 3 < 4 is not allowed
- ' 3 > 4 ',
+ (' 3 > 4 ', ''),
# allowed, but 3 > 4 is not allowed
- '
',
- 'link',
- '',
+ ('', ''),
+ ('link', ''),
+ ('', ''),
# no attribute
- '',
- '',
- '',
+ ('', ''),
+ ('', ''),
+ ('', ''),
# single quoted is pretty common
- "",
+ ("", ''),
# Conceding to reality - I used these myself
- '',
- '',
- '',
+ ('', ''),
+ ('', ''),
+ ('', ''),
# caps
- '',
- '',
+ ('', ''),
+ ('', ''),
# capital VOID tag
- '',
- '',
+ ('', ''),
+ ('', ''),
# matching
- '',
- '',
- '',
+ ('', ''),
+ ('', ''),
+ ('', ''),
#'',
# Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
@@ -467,7 +479,7 @@ def testInvalid(self):
def testValid(self):
counters = html.Counters()
- for s in VALID_PARSE:
+ for s, _ in VALID_PARSE:
html.Validate(s, html.BALANCED_TAGS, counters)
print('HTML5 %r' % s)
print('HTML5 attrs %r' % counters.debug_attrs)
@@ -481,5 +493,19 @@ def testValidXml(self):
print('XML attrs %r' % counters.debug_attrs)
+class XmlTest(unittest.TestCase):
+    """Runs html.ToXml() over every pair in VALID_LEX and VALID_PARSE."""
+
+    def testValid(self):
+        # Each table entry is (html_input, expected_xml).  ToXml() is called
+        # on every input, including entries whose comparison is skipped.
+        for h, expected_xml in VALID_LEX + VALID_PARSE:
+            actual = html.ToXml(h)
+            if expected_xml == UNCHANGED:
+                # The output must be identical to the input.
+                self.assertEqual(h, actual)
+            elif expected_xml != '':
+                # An empty expected string marks an entry to skip; anything
+                # else is the exact output required.
+                self.assertEqual(expected_xml, actual)
+
+
if __name__ == '__main__':
unittest.main()