diff --git a/tests/test_url.py b/tests/test_url.py index 0df5bfdc..79e2af44 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -146,12 +146,14 @@ def test_safe_url_idna(self): # Japanese (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'), + (u'http://はじめよう.みんな:80/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?query=%E3%82%B5&maxResults=5'), # Russian (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'), (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'), # Korean + (u'http://내도메인.한국:80/', 'http://xn--220b31d95hq8o.xn--3e0b707e:80/'), (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'), (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'), @@ -159,6 +161,8 @@ def test_safe_url_idna(self): (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'), # Chinese + (u'http://您好.中国/', 'http://xn--5usr0o.xn--fiqs8s/'), + (u'http://您好.中国:80/', 'http://xn--5usr0o.xn--fiqs8s:80/'), (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'), (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'), (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'), @@ -394,10 +398,15 @@ def test_typical_usage(self): def test_port_number(self): self.assertEqual(canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"), "http://www.example.com:8888/do?a=1&b=2&c=3") + + self.assertEqual(canonicalize_url(u'http://您好.中国:80/'), 'http://xn--5usr0o.xn--fiqs8s:80/') + # trailing empty ports are removed self.assertEqual(canonicalize_url("http://www.example.com:/do?a=1&b=2&c=3"), "http://www.example.com/do?a=1&b=2&c=3") + self.assertEqual(canonicalize_url(u'http://您好.中国:/'), 'http://xn--5usr0o.xn--fiqs8s/') + def test_sorting(self): self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"), "http://www.example.com/do?a=50&b=2&b=5&c=3") @@ -522,10 +531,17 @@ def test_domains_are_case_insensitive(self): def test_canonicalize_idns(self): self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'), 'http://www.xn--bcher-kva.de/?q=b%C3%BCcher') + + self.assertEqual(canonicalize_url(u'http://www.bücher.de:80?q=bücher'), + 'http://www.xn--bcher-kva.de:80/?q=b%C3%BCcher') + # Japanese (+ reordering query parameters) self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'), 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5') + self.assertEqual(canonicalize_url(u'http://はじめよう.みんな:80/?query=サ&maxResults=5'), + 'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?maxResults=5&query=%E3%82%B5') + def test_quoted_slash_and_question_sign(self): self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"), "http://foo.com/AC%2FDC+rocks%3F/?yeah=1") diff --git a/w3lib/url.py b/w3lib/url.py index 4be74f74..6b9c2d49 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -19,6 +19,28 @@ from w3lib.util import to_bytes, to_native_str, to_unicode +def _encode_netloc(onetloc): + """ + :type onetloc: unicode + :rtype: unicode + """ + try: + idx = onetloc.rfind(u':') + if idx != -1: + hostname = onetloc[:idx] + portpart = onetloc[idx:] + else: + hostname = onetloc + portpart = u'' + # assert isinstance(hostname, unicode) + # assert isinstance(portpart, unicode) + hostname = to_unicode(hostname.encode('idna')) + netloc = hostname + portpart + except UnicodeError: + netloc = onetloc + return netloc + + # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error): return (to_unicode(quote(error.object[error.start:error.end])), error.end) @@ -61,10 +83,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'): # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) - try: - netloc = parts.netloc.encode('idna') - except UnicodeError: - netloc = parts.netloc + netloc = _encode_netloc(parts.netloc) # quote() in Python2 return type follows input type; # quote() in Python3 always returns Unicode (native str) @@ -373,10 +392,7 @@ def parse_data_uri(uri): def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'): # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) - try: - netloc = parts.netloc.encode('idna') - except UnicodeError: - netloc = parts.netloc + netloc = _encode_netloc(parts.netloc) return ( to_native_str(parts.scheme),