diff --git a/lazylex/html.py b/lazylex/html.py index 4014d417e..09251037c 100755 --- a/lazylex/html.py +++ b/lazylex/html.py @@ -919,6 +919,20 @@ def Validate(contents, flags, counters): counters.num_tokens += len(tokens) +def ToXml(h): + # type: (str) -> str + + # TODO: + # 1. Lex it + # 2. < & > must be escaped + # a. in raw data + # b. in quoted strings + # 3. ', UNCHANGED), # allowed, but 3 < 4 is not allowed - ' 3 > 4 ', + (' 3 > 4 ', ''), # allowed, but 3 > 4 is not allowed - '

', - 'link', - '', + ('

', ''), + ('link', ''), + ('', ''), # no attribute - '', - '', - '', + ('', ''), + ('', ''), + ('', ''), # single quoted is pretty common - "", + ("", ''), # Conceding to reality - I used these myself - '', - '', - '', + ('', ''), + ('', ''), + ('', ''), # caps - '', - '', + ('', ''), + ('', ''), # capital VOID tag - '', - '', + ('', ''), + ('', ''), # matching - '', - '', - '', + ('', ''), + ('', ''), + ('', ''), #'', # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I @@ -467,7 +479,7 @@ def testInvalid(self): def testValid(self): counters = html.Counters() - for s in VALID_PARSE: + for s, _ in VALID_PARSE: html.Validate(s, html.BALANCED_TAGS, counters) print('HTML5 %r' % s) print('HTML5 attrs %r' % counters.debug_attrs) @@ -481,5 +493,19 @@ def testValidXml(self): print('XML attrs %r' % counters.debug_attrs) +class XmlTest(unittest.TestCase): + + def testValid(self): + counters = html.Counters() + for h, expected_xml in VALID_LEX + VALID_PARSE: + actual = html.ToXml(h) + if expected_xml == UNCHANGED: # Unchanged + self.assertEqual(h, actual) + elif expected_xml == '': # Skip + pass + else: + self.assertEqual(expected_xml, actual) + + if __name__ == '__main__': unittest.main()