From 3312f87ada0020caa7aa82fcce7abf1e992fe686 Mon Sep 17 00:00:00 2001 From: pp-qq Date: Thu, 14 Sep 2017 17:12:12 +0800 Subject: [PATCH] =?UTF-8?q?fix(url):=20canonicalize=5Furl('http://?= =?UTF-8?q?=E6=82=A8=E5=A5=BD.=E4=B8=AD=E5=9B=BD:80/')=20failed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit expect: 'http://xn--5usr0o.xn--fiqs8s:80/' actual: 'http://xn--5usr0o.xn--:80-u68dy61b/' --- w3lib/url.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/w3lib/url.py b/w3lib/url.py index 4be74f74..a2025df7 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -374,7 +374,15 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'): # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) try: - netloc = parts.netloc.encode('idna') + idx = parts.netloc.rfind(u':') + if idx != -1: + hostname = parts.netloc[:idx] + portpart = parts.netloc[idx:] + else: + hostname = parts.netloc + portpart = u'' + hostname = to_unicode(hostname.encode('idna')) + netloc = hostname + portpart except UnicodeError: netloc = parts.netloc