From be4f3ad6605f5131c3f71da542280d89b8cadbec Mon Sep 17 00:00:00 2001 From: Catia Goncalves Date: Fri, 9 Jun 2017 10:17:51 -0700 Subject: [PATCH 1/3] test for curly quotes --- conformance/extract.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conformance/extract.yml b/conformance/extract.yml index b9d1ba09c..62bec090d 100644 --- a/conformance/extract.yml +++ b/conformance/extract.yml @@ -849,6 +849,10 @@ tests: text: "#_ #1_2 #122 #〃" expected: [] + - description: "Extract hashtag followed by curly quotes" + text: "#whatshappening’s trending." + expected: [whatshappening] + hashtags_from_astral: - description: "Extract hashtag with letter from astral plane (U+20021)" text: "#\U00020021" @@ -952,6 +956,10 @@ tests: text: "$CashtagMustBeLessThanSixCharacter" expected: [] + - description: "Extract cashtag followed by curly quotes" + text: "$Stock’s trending." + expected: [Stock] + cashtags_with_indices: - description: "Extract cashtags" text: "Example: $TEST $symbol test" From 336295e0131491a9ca6c90205340fd93bdca16a4 Mon Sep 17 00:00:00 2001 From: Catia Goncalves Date: Mon, 12 Jun 2017 13:26:44 -0700 Subject: [PATCH 2/3] support smart quotes at end of cashtags --- java/src/com/twitter/Regex.java | 2 +- js/twitter-text.js | 2 +- objc/lib/TwitterText.m | 2 +- objc/tests/json-conformance/extract.json | 14 ++++++++++++++ rb/lib/twitter-text/regex.rb | 2 +- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/java/src/com/twitter/Regex.java b/java/src/com/twitter/Regex.java index 1b8adff97..99f6ce0aa 100644 --- a/java/src/com/twitter/Regex.java +++ b/java/src/com/twitter/Regex.java @@ -263,7 +263,7 @@ public class Regex { VALID_URL = Pattern.compile(VALID_URL_PATTERN_STRING, Pattern.CASE_INSENSITIVE); VALID_TCO_URL = Pattern.compile("^https?:\\/\\/t\\.co\\/[a-z0-9]+", Pattern.CASE_INSENSITIVE); - VALID_CASHTAG = Pattern.compile("(^|" + UNICODE_SPACES + ")(" + DOLLAR_SIGN_CHAR + ")(" + CASHTAG + ")" + "(?=$|\\s|\\p{Punct})", Pattern.CASE_INSENSITIVE); + VALID_CASHTAG = Pattern.compile("(^|" + UNICODE_SPACES + ")(" + DOLLAR_SIGN_CHAR + ")(" + CASHTAG + ")" + "(?=$|\\s|\\p{Punct}|\\u2019)", Pattern.CASE_INSENSITIVE); VALID_DOMAIN = Pattern.compile(URL_VALID_DOMAIN, Pattern.CASE_INSENSITIVE); } } diff --git a/js/twitter-text.js b/js/twitter-text.js index af3200d87..009af94c1 100644 --- a/js/twitter-text.js +++ b/js/twitter-text.js @@ -273,7 +273,7 @@ // cashtag related regex twttr.txt.regexen.cashtag = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i; - twttr.txt.regexen.validCashtag = regexSupplant('(^|#{spaces})(\\$)(#{cashtag})(?=$|\\s|[#{punct}])', 'gi'); + twttr.txt.regexen.validCashtag = regexSupplant('(^|#{spaces})(\\$)(#{cashtag})(?=$|\\s|[#{punct}]|\u2019)', 'gi'); // These URL validation pattern strings are based on the ABNF from RFC 3986 twttr.txt.regexen.validateUrlUnreserved = /[a-z\u0400-\u04FF0-9\-._~]/i; diff --git a/objc/lib/TwitterText.m b/objc/lib/TwitterText.m index 4b51e0afe..60674c632 100644 --- a/objc/lib/TwitterText.m +++ b/objc/lib/TwitterText.m @@ -82,7 +82,7 @@ #define TWUValidSymbol \ @"(?:^|[" TWUUnicodeSpaces @"])" \ @"(\\$" TWUSymbol @")" \ - @"(?=$|\\s|[" TWUPunctuationChars @"])" + @"(?=$|\\s|[" TWUPunctuationChars @"]|\\u2019)" // // Mention and list name diff --git a/objc/tests/json-conformance/extract.json b/objc/tests/json-conformance/extract.json index ae27fbc15..8925c9824 100644 --- a/objc/tests/json-conformance/extract.json +++ b/objc/tests/json-conformance/extract.json @@ -1680,6 +1680,13 @@ "expected": [ ] + }, + { + "description": "Extract hashtag followed by curly quotes", + "text": "#whatshappening’s trending.", + "expected": [ + "whatshappening" + ] } ], "hashtags_from_astral": [ @@ -1901,6 +1908,13 @@ "expected": [ ] + }, + { + "description": "Extract cashtag followed by curly quotes", + "text": "$Stock’s trending.", + "expected": [ + "Stock" + ] } ], "cashtags_with_indices": [ diff --git a/rb/lib/twitter-text/regex.rb b/rb/lib/twitter-text/regex.rb index 7e92a7418..f0f78c62b 100644 --- a/rb/lib/twitter-text/regex.rb +++ b/rb/lib/twitter-text/regex.rb @@ -257,7 +257,7 @@ def self.regex_range(from, to = nil) # :nodoc: }iox REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i - REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i + REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}]|\u2019)/i # These URL validation pattern strings are based on the ABNF from RFC 3986 REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\-._~]/i From a6a9c83a152da35ec173884e1ac38b82f0bb8094 Mon Sep 17 00:00:00 2001 From: Catia Goncalves Date: Thu, 22 Jun 2017 19:27:43 -0700 Subject: [PATCH 3/3] escape --- js/twitter-text.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/twitter-text.js b/js/twitter-text.js index 009af94c1..581a23d43 100644 --- a/js/twitter-text.js +++ b/js/twitter-text.js @@ -273,7 +273,7 @@ // cashtag related regex twttr.txt.regexen.cashtag = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i; - twttr.txt.regexen.validCashtag = regexSupplant('(^|#{spaces})(\\$)(#{cashtag})(?=$|\\s|[#{punct}]|\u2019)', 'gi'); + twttr.txt.regexen.validCashtag = regexSupplant('(^|#{spaces})(\\$)(#{cashtag})(?=$|\\s|[#{punct}]|\\u2019)', 'gi'); // These URL validation pattern strings are based on the ABNF from RFC 3986 twttr.txt.regexen.validateUrlUnreserved = /[a-z\u0400-\u04FF0-9\-._~]/i;