From de71952f7e92052e597314a396b38ad9aa9d8a8e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 4 Jan 2016 16:16:33 -0800 Subject: [PATCH 01/16] Add Mac impl for bulk spell-checking function This function will return an array of character ranges, indicating where *all* of the misspelled words are in a given string. --- lib/spellchecker.js | 7 +++++++ spec/spellchecker-spec.coffee | 9 +++++++++ src/main.cc | 25 +++++++++++++++++++++++++ src/spellchecker.h | 7 +++++++ src/spellchecker_mac.h | 1 + src/spellchecker_mac.mm | 17 +++++++++++++++++ 6 files changed, 66 insertions(+) diff --git a/lib/spellchecker.js b/lib/spellchecker.js index ad94430..a8085e3 100644 --- a/lib/spellchecker.js +++ b/lib/spellchecker.js @@ -29,6 +29,12 @@ var isMisspelled = function() { return defaultSpellcheck.isMisspelled.apply(defaultSpellcheck, arguments); }; +var checkSpelling = function() { + ensureDefaultSpellCheck(); + + return defaultSpellcheck.checkSpelling.apply(defaultSpellcheck, arguments); +}; + var add = function() { ensureDefaultSpellCheck(); @@ -64,6 +70,7 @@ module.exports = { setDictionary: setDictionary, add: add, isMisspelled: isMisspelled, + checkSpelling: checkSpelling, getAvailableDictionaries: getAvailableDictionaries, getCorrectionsForMisspelling: getCorrectionsForMisspelling, Spellchecker: Spellchecker diff --git a/spec/spellchecker-spec.coffee b/spec/spellchecker-spec.coffee index 76664a4..4b23346 100644 --- a/spec/spellchecker-spec.coffee +++ b/spec/spellchecker-spec.coffee @@ -11,6 +11,15 @@ describe "SpellChecker", -> it "throws an exception when no word specified", -> expect(-> SpellChecker.isMisspelled()).toThrow() + describe ".checkSpelling(string)", -> + it "returns an array of character ranges of misspelled words", -> + string = "cat caat dog dooog" + + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: 4, end: 8}, + {start: 13, end: 18}, + ] + describe ".getCorrectionsForMisspelling(word)", -> it "returns an array of possible corrections", -> corrections = SpellChecker.getCorrectionsForMisspelling('worrd') diff --git a/src/main.cc b/src/main.cc index 6ff1ab5..6347f1f 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,3 +1,4 @@ +#include #include "nan.h" #include "spellchecker.h" @@ -49,6 +50,29 @@ class Spellchecker : public Nan::ObjectWrap { info.GetReturnValue().Set(Nan::New(that->impl->IsMisspelled(word))); } + static NAN_METHOD(CheckSpelling) { + Nan::HandleScope scope; + if (info.Length() < 1) { + return Nan::ThrowError("Bad argument"); + } + + Spellchecker* that = Nan::ObjectWrap::Unwrap(info.Holder()); + String::Utf8Value text(info[0]); + + std::vector misspelled_ranges = that->impl->CheckSpelling(*text, text.length()); + + Local result = Nan::New(); + std::vector::const_iterator iter = misspelled_ranges.begin(); + for (; iter != misspelled_ranges.end(); ++iter) { + Local misspelled_range = Nan::New(); + misspelled_range->Set(Nan::New("start").ToLocalChecked(), Nan::New(iter->start)); + misspelled_range->Set(Nan::New("end").ToLocalChecked(), Nan::New(iter->end)); + result->Set(iter - misspelled_ranges.begin(), misspelled_range); + } + + info.GetReturnValue().Set(result); + } + static NAN_METHOD(Add) { Nan::HandleScope scope; if (info.Length() < 1) { @@ -127,6 +151,7 @@ class Spellchecker : public Nan::ObjectWrap { Nan::SetMethod(tpl->InstanceTemplate(), "getAvailableDictionaries", Spellchecker::GetAvailableDictionaries); Nan::SetMethod(tpl->InstanceTemplate(), "getCorrectionsForMisspelling", Spellchecker::GetCorrectionsForMisspelling); Nan::SetMethod(tpl->InstanceTemplate(), "isMisspelled", Spellchecker::IsMisspelled); + Nan::SetMethod(tpl->InstanceTemplate(), "checkSpelling", Spellchecker::CheckSpelling); Nan::SetMethod(tpl->InstanceTemplate(), "add", Spellchecker::Add); exports->Set(Nan::New("Spellchecker").ToLocalChecked(), tpl->GetFunction()); diff --git a/src/spellchecker.h b/src/spellchecker.h index 5251dcb..7fb1afa 100644 --- a/src/spellchecker.h +++ b/src/spellchecker.h @@ -6,6 +6,11 @@ namespace spellchecker { +struct MisspelledRange { + size_t start; + size_t end; +}; + class SpellcheckerImplementation { public: virtual bool SetDictionary(const std::string& language, const std::string& path) = 0; @@ -17,6 +22,8 @@ class SpellcheckerImplementation { // Returns true if the word is misspelled. virtual bool IsMisspelled(const std::string& word) = 0; + virtual std::vector CheckSpelling(const char *text, size_t length) = 0; + // Adds a new word to the dictionary. // NB: When using Hunspell, this will not modify the .dic file; custom words must be added each // time the spellchecker is created. Use a custom dictionary file. diff --git a/src/spellchecker_mac.h b/src/spellchecker_mac.h index a118e81..4675ab9 100644 --- a/src/spellchecker_mac.h +++ b/src/spellchecker_mac.h @@ -17,6 +17,7 @@ class MacSpellchecker : public SpellcheckerImplementation { std::vector GetAvailableDictionaries(const std::string& path); std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); + std::vector CheckSpelling(const char *text, size_t length); void Add(const std::string& word); private: diff --git a/src/spellchecker_mac.mm b/src/spellchecker_mac.mm index 59596f8..a8d1051 100644 --- a/src/spellchecker_mac.mm +++ b/src/spellchecker_mac.mm @@ -52,6 +52,23 @@ return result; } +std::vector MacSpellchecker::CheckSpelling(const char *text, size_t length) { + std::vector result; + + @autoreleasepool { + NSString* string = [NSString stringWithUTF8String:text]; + NSArray *misspellings = [this->spellChecker checkString:string range:NSMakeRange(0, length) types:NSTextCheckingTypeSpelling options:nil inSpellDocumentWithTag:0 orthography:nil wordCount:nil]; + for (NSTextCheckingResult *misspelling in misspellings) { + MisspelledRange range; + range.start = misspelling.range.location; + range.end = misspelling.range.location + misspelling.range.length; + result.push_back(range); + } + } + + return result; +} + void MacSpellchecker::Add(const std::string& word) { @autoreleasepool { NSString* newWord = [NSString stringWithUTF8String:word.c_str()]; From 063ef56ef30c4b241727a3ef241d15669142f877 Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 5 Jan 2016 11:39:40 -0800 Subject: [PATCH 02/16] Add stub hunspell impl for bulk spell-checking function --- src/spellchecker_hunspell.cc | 5 +++++ src/spellchecker_hunspell.h | 1 + 2 files changed, 6 insertions(+) diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc index c144757..e3b670a 100644 --- a/src/spellchecker_hunspell.cc +++ b/src/spellchecker_hunspell.cc @@ -48,6 +48,11 @@ bool HunspellSpellchecker::IsMisspelled(const std::string& word) { return hunspell->spell(word.c_str()) == 0; } +std::vector HunspellSpellchecker::CheckSpelling(const char *text, size_t length) { + std::vector result; + return result; +} + void HunspellSpellchecker::Add(const std::string& word) { if (hunspell) { hunspell->add(word.c_str()); diff --git a/src/spellchecker_hunspell.h b/src/spellchecker_hunspell.h index 67aa3e7..f3d6e69 100644 --- a/src/spellchecker_hunspell.h +++ b/src/spellchecker_hunspell.h @@ -16,6 +16,7 @@ class HunspellSpellchecker : public SpellcheckerImplementation { std::vector GetAvailableDictionaries(const std::string& path); std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); + std::vector CheckSpelling(const char *text, size_t length); void Add(const std::string& word); private: From db0e3883b7a942a6de1577e5209f9ef41b9afa30 Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 5 Jan 2016 13:03:33 -0800 Subject: [PATCH 03/16] Add windows impl for bulk spell-checking function --- src/spellchecker_win.cc | 32 +++++++++++++++++++++++++++++++- src/spellchecker_win.h | 1 + 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/spellchecker_win.cc b/src/spellchecker_win.cc index 9d959ec..b76d165 100644 --- a/src/spellchecker_win.cc +++ b/src/spellchecker_win.cc @@ -78,7 +78,7 @@ WindowsSpellchecker::~WindowsSpellchecker() { this->currentSpellchecker->Release(); this->currentSpellchecker = NULL; } - + if (this->spellcheckerFactory) { this->spellcheckerFactory->Release(); this->spellcheckerFactory = NULL; @@ -187,6 +187,36 @@ bool WindowsSpellchecker::IsMisspelled(const std::string& word) { return ret; } +std::vector WindowsSpellchecker::CheckSpelling(const char *text, size_t length) { + std::vector result; + + if (this->currentSpellchecker == NULL) { + return result; + } + + IEnumSpellingError* errors = NULL; + std::wstring wtext = ToWString(text); + if (FAILED(this->currentSpellchecker->Check(wtext.c_str(), &errors))) { + return result; + } + + ISpellingError *error; + while (errors->Next(&error) == S_OK) { + ULONG start, length; + error->get_StartIndex(&start); + error->get_Length(&length); + + MisspelledRange range; + range.start = start; + range.end = start + length; + result.push_back(range); + error->Release(); + } + + errors->Release(); + return result; +} + void WindowsSpellchecker::Add(const std::string& word) { if (this->currentSpellchecker == NULL) { return; diff --git a/src/spellchecker_win.h b/src/spellchecker_win.h index 4875f1d..5ea8112 100644 --- a/src/spellchecker_win.h +++ b/src/spellchecker_win.h @@ -18,6 +18,7 @@ class WindowsSpellchecker : public SpellcheckerImplementation { std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); + std::vector CheckSpelling(const char *text, size_t length); void Add(const std::string& word); private: From dae76c81452eb24038cce404306fbad68f159aea Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 5 Jan 2016 17:13:35 -0800 Subject: [PATCH 04/16] :art: --- src/spellchecker_mac.mm | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/spellchecker_mac.mm b/src/spellchecker_mac.mm index a8d1051..20237d8 100644 --- a/src/spellchecker_mac.mm +++ b/src/spellchecker_mac.mm @@ -57,7 +57,13 @@ @autoreleasepool { NSString* string = [NSString stringWithUTF8String:text]; - NSArray *misspellings = [this->spellChecker checkString:string range:NSMakeRange(0, length) types:NSTextCheckingTypeSpelling options:nil inSpellDocumentWithTag:0 orthography:nil wordCount:nil]; + NSArray *misspellings = [this->spellChecker checkString:string + range:NSMakeRange(0, length) + types:NSTextCheckingTypeSpelling + options:nil + inSpellDocumentWithTag:0 + orthography:nil + wordCount:nil]; for (NSTextCheckingResult *misspelling in misspellings) { MisspelledRange range; range.start = misspelling.range.location; From 8609f76d8a1411e8c7e1e8402ca10b327aef0dd8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 6 Jan 2016 12:19:07 -0800 Subject: [PATCH 05/16] Pass UTF16-encoded string to CheckSpelling --- src/main.cc | 6 ++++-- src/spellchecker.h | 3 ++- src/spellchecker_hunspell.cc | 4 +++- src/spellchecker_hunspell.h | 2 +- src/spellchecker_mac.h | 2 +- src/spellchecker_mac.mm | 7 ++++--- src/spellchecker_win.cc | 4 ++-- src/spellchecker_win.h | 2 +- 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/main.cc b/src/main.cc index 6347f1f..ce64181 100644 --- a/src/main.cc +++ b/src/main.cc @@ -57,9 +57,11 @@ class Spellchecker : public Nan::ObjectWrap { } Spellchecker* that = Nan::ObjectWrap::Unwrap(info.Holder()); - String::Utf8Value text(info[0]); + Handle string = Handle::Cast(info[0]); + std::vector text(string->Length()); + string->Write(reinterpret_cast(text.data())); - std::vector misspelled_ranges = that->impl->CheckSpelling(*text, text.length()); + std::vector misspelled_ranges = that->impl->CheckSpelling(text.data(), text.size()); Local result = Nan::New(); std::vector::const_iterator iter = misspelled_ranges.begin(); diff --git a/src/spellchecker.h b/src/spellchecker.h index 7fb1afa..0e052b9 100644 --- a/src/spellchecker.h +++ b/src/spellchecker.h @@ -3,6 +3,7 @@ #include #include +#include namespace spellchecker { @@ -22,7 +23,7 @@ class SpellcheckerImplementation { // Returns true if the word is misspelled. virtual bool IsMisspelled(const std::string& word) = 0; - virtual std::vector CheckSpelling(const char *text, size_t length) = 0; + virtual std::vector CheckSpelling(const uint16_t *text, size_t length) = 0; // Adds a new word to the dictionary. // NB: When using Hunspell, this will not modify the .dic file; custom words must be added each diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc index e3b670a..c746a97 100644 --- a/src/spellchecker_hunspell.cc +++ b/src/spellchecker_hunspell.cc @@ -1,6 +1,8 @@ #include +#include #include #include "../vendor/hunspell/src/hunspell/hunspell.hxx" +#include "../vendor/hunspell/src/hunspell/csutil.hxx" #include "spellchecker_hunspell.h" namespace spellchecker { @@ -48,7 +50,7 @@ bool HunspellSpellchecker::IsMisspelled(const std::string& word) { return hunspell->spell(word.c_str()) == 0; } -std::vector HunspellSpellchecker::CheckSpelling(const char *text, size_t length) { +std::vector HunspellSpellchecker::CheckSpelling(const uint16_t *utf16_text, size_t utf16_length) { std::vector result; return result; } diff --git a/src/spellchecker_hunspell.h b/src/spellchecker_hunspell.h index f3d6e69..fa1cf74 100644 --- a/src/spellchecker_hunspell.h +++ b/src/spellchecker_hunspell.h @@ -16,7 +16,7 @@ class HunspellSpellchecker : public SpellcheckerImplementation { std::vector GetAvailableDictionaries(const std::string& path); std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); - std::vector CheckSpelling(const char *text, size_t length); + std::vector CheckSpelling(const uint16_t *text, size_t length); void Add(const std::string& word); private: diff --git a/src/spellchecker_mac.h b/src/spellchecker_mac.h index 4675ab9..fd2cdc2 100644 --- a/src/spellchecker_mac.h +++ b/src/spellchecker_mac.h @@ -17,7 +17,7 @@ class MacSpellchecker : public SpellcheckerImplementation { std::vector GetAvailableDictionaries(const std::string& path); std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); - std::vector CheckSpelling(const char *text, size_t length); + std::vector CheckSpelling(const uint16_t *text, size_t length); void Add(const std::string& word); private: diff --git a/src/spellchecker_mac.mm b/src/spellchecker_mac.mm index 20237d8..6482d5a 100644 --- a/src/spellchecker_mac.mm +++ b/src/spellchecker_mac.mm @@ -52,13 +52,14 @@ return result; } -std::vector MacSpellchecker::CheckSpelling(const char *text, size_t length) { +std::vector MacSpellchecker::CheckSpelling(const uint16_t *text, size_t length) { std::vector result; @autoreleasepool { - NSString* string = [NSString stringWithUTF8String:text]; + NSData *data = [[NSData alloc] initWithBytesNoCopy:(void *)(text) length:(length * 2) freeWhenDone:NO]; + NSString* string = [[NSString alloc] initWithData:data encoding:NSUTF16LittleEndianStringEncoding]; NSArray *misspellings = [this->spellChecker checkString:string - range:NSMakeRange(0, length) + range:NSMakeRange(0, string.length) types:NSTextCheckingTypeSpelling options:nil inSpellDocumentWithTag:0 diff --git a/src/spellchecker_win.cc b/src/spellchecker_win.cc index b76d165..0e32df4 100644 --- a/src/spellchecker_win.cc +++ b/src/spellchecker_win.cc @@ -187,7 +187,7 @@ bool WindowsSpellchecker::IsMisspelled(const std::string& word) { return ret; } -std::vector WindowsSpellchecker::CheckSpelling(const char *text, size_t length) { +std::vector WindowsSpellchecker::CheckSpelling(const uint16_t *text, size_t length) { std::vector result; if (this->currentSpellchecker == NULL) { @@ -195,7 +195,7 @@ std::vector WindowsSpellchecker::CheckSpelling(const char *text } IEnumSpellingError* errors = NULL; - std::wstring wtext = ToWString(text); + std::wstring wtext(reinterpret_cast(text), length); if (FAILED(this->currentSpellchecker->Check(wtext.c_str(), &errors))) { return result; } diff --git a/src/spellchecker_win.h b/src/spellchecker_win.h index 5ea8112..09e7cae 100644 --- a/src/spellchecker_win.h +++ b/src/spellchecker_win.h @@ -18,7 +18,7 @@ class WindowsSpellchecker : public SpellcheckerImplementation { std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); - std::vector CheckSpelling(const char *text, size_t length); + std::vector CheckSpelling(const uint16_t *text, size_t length); void Add(const std::string& word); private: From f51154c02dfb35241cc98ea9ef5d77ce825d390d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 6 Jan 2016 12:20:11 -0800 Subject: [PATCH 06/16] Add real hunspell impl for bulk spell-checking function --- src/spellchecker_hunspell.cc | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc index c746a97..4b0579e 100644 --- a/src/spellchecker_hunspell.cc +++ b/src/spellchecker_hunspell.cc @@ -52,6 +52,38 @@ bool HunspellSpellchecker::IsMisspelled(const std::string& word) { std::vector HunspellSpellchecker::CheckSpelling(const uint16_t *utf16_text, size_t utf16_length) { std::vector result; + + if (!hunspell) { + return result; + } + + std::vector utf8_buffer(256); + char *utf8_word = utf8_buffer.data(); + + bool within_word = false; + size_t word_start = 0; + for (size_t i = 0; i <= utf16_length; i++) { + bool is_alpha = i < utf16_length && std::iswalpha(utf16_text[i]); + + if (within_word) { + if (!is_alpha) { + within_word = false; + const w_char *utf16_word = reinterpret_cast(utf16_text + word_start); + u16_u8(utf8_word, utf8_buffer.size(), utf16_word, i - word_start); + + if (hunspell->spell(utf8_word) == 0) { + MisspelledRange range; + range.start = word_start; + range.end = i; + result.push_back(range); + } + } + } else if (is_alpha) { + word_start = i; + within_word = true; + } + } + return result; } From 1bc7ad24aa1d4683199902ae65f3b9522b6638dd Mon Sep 17 00:00:00 2001 From: Max Date: Wed, 6 Jan 2016 12:50:01 -0800 Subject: [PATCH 07/16] Fix MSVS warnings --- src/main.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main.cc b/src/main.cc index ce64181..34e0d1b 100644 --- a/src/main.cc +++ b/src/main.cc @@ -66,10 +66,13 @@ class Spellchecker : public Nan::ObjectWrap { Local result = Nan::New(); std::vector::const_iterator iter = misspelled_ranges.begin(); for (; iter != misspelled_ranges.end(); ++iter) { + size_t index = iter - misspelled_ranges.begin(); + uint32_t start = iter->start, end = iter->end; + Local misspelled_range = Nan::New(); - misspelled_range->Set(Nan::New("start").ToLocalChecked(), Nan::New(iter->start)); - misspelled_range->Set(Nan::New("end").ToLocalChecked(), Nan::New(iter->end)); - result->Set(iter - misspelled_ranges.begin(), misspelled_range); + misspelled_range->Set(Nan::New("start").ToLocalChecked(), Nan::New(start)); + misspelled_range->Set(Nan::New("end").ToLocalChecked(), Nan::New(end)); + result->Set(index, misspelled_range); } info.GetReturnValue().Set(result); From ab0126206c45471586a8b5f16976e512060d07e5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 6 Jan 2016 13:12:07 -0800 Subject: [PATCH 08/16] Add spec for handling paired characters --- spec/spellchecker-spec.coffee | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spec/spellchecker-spec.coffee b/spec/spellchecker-spec.coffee index 4b23346..00f4611 100644 --- a/spec/spellchecker-spec.coffee +++ b/spec/spellchecker-spec.coffee @@ -20,6 +20,14 @@ describe "SpellChecker", -> {start: 13, end: 18}, ] + it "accounts for UTF16 pairs correctly", -> + string = "😎 cat caat dog dooog" + + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: 7, end: 11}, + {start: 16, end: 21}, + ] + describe ".getCorrectionsForMisspelling(word)", -> it "returns an array of possible corrections", -> corrections = SpellChecker.getCorrectionsForMisspelling('worrd') From 0d2fe14b846ad55d019997498b0e789661c1b4f4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 6 Jan 2016 13:46:38 -0800 Subject: [PATCH 09/16] Add test for non-word characters --- spec/spellchecker-spec.coffee | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spec/spellchecker-spec.coffee b/spec/spellchecker-spec.coffee index 00f4611..d3305e2 100644 --- a/spec/spellchecker-spec.coffee +++ b/spec/spellchecker-spec.coffee @@ -20,7 +20,7 @@ describe "SpellChecker", -> {start: 13, end: 18}, ] - it "accounts for UTF16 pairs correctly", -> + it "accounts for UTF16 pairs", -> string = "😎 cat caat dog dooog" expect(SpellChecker.checkSpelling(string)).toEqual [ @@ -28,6 +28,13 @@ describe "SpellChecker", -> {start: 16, end: 21}, ] + it "accounts for other non-word characters", -> + string = "'cat' (caat. :dooog)" + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: 7, end: 11}, + {start: 20, end: 25}, + ] + describe ".getCorrectionsForMisspelling(word)", -> it "returns an array of possible corrections", -> corrections = SpellChecker.getCorrectionsForMisspelling('worrd') From 0824cd32fda0e677dd6838f71518a53a9749002c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 6 Jan 2016 14:01:24 -0800 Subject: [PATCH 10/16] Handle invalid inputs to bulk spell-checking function --- spec/spellchecker-spec.coffee | 6 ++++++ src/main.cc | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/spec/spellchecker-spec.coffee b/spec/spellchecker-spec.coffee index d3305e2..3448037 100644 --- a/spec/spellchecker-spec.coffee +++ b/spec/spellchecker-spec.coffee @@ -35,6 +35,12 @@ describe "SpellChecker", -> {start: 20, end: 25}, ] + it "handles invalid inputs", -> + expect(SpellChecker.checkSpelling("")).toEqual [] + expect(-> SpellChecker.checkSpelling()).toThrow("Bad argument") + expect(-> SpellChecker.checkSpelling(null)).toThrow("Bad argument") + expect(-> SpellChecker.checkSpelling({})).toThrow("Bad argument") + describe ".getCorrectionsForMisspelling(word)", -> it "returns an array of possible corrections", -> corrections = SpellChecker.getCorrectionsForMisspelling('worrd') diff --git a/src/main.cc b/src/main.cc index 34e0d1b..5cbe853 100644 --- a/src/main.cc +++ b/src/main.cc @@ -56,14 +56,24 @@ class Spellchecker : public Nan::ObjectWrap { return Nan::ThrowError("Bad argument"); } - Spellchecker* that = Nan::ObjectWrap::Unwrap(info.Holder()); Handle string = Handle::Cast(info[0]); + if (!string->IsString()) { + return Nan::ThrowError("Bad argument"); + } + + Local result = Nan::New(); + info.GetReturnValue().Set(result); + + if (string->Length() == 0) { + return; + } + std::vector text(string->Length()); string->Write(reinterpret_cast(text.data())); + Spellchecker* that = Nan::ObjectWrap::Unwrap(info.Holder()); std::vector misspelled_ranges = that->impl->CheckSpelling(text.data(), text.size()); - Local result = Nan::New(); std::vector::const_iterator iter = misspelled_ranges.begin(); for (; iter != misspelled_ranges.end(); ++iter) { size_t index = iter - misspelled_ranges.begin(); @@ -74,8 +84,6 @@ class Spellchecker : public Nan::ObjectWrap { misspelled_range->Set(Nan::New("end").ToLocalChecked(), Nan::New(end)); result->Set(index, misspelled_range); } - - info.GetReturnValue().Set(result); } static NAN_METHOD(Add) { From 7f601b716313f2ad8c9210060b5c2ffdfa01bbee Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 6 Jan 2016 19:54:06 -0800 Subject: [PATCH 11/16] In CheckSpelling, leave room for the terminating NULL --- src/main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.cc b/src/main.cc index 5cbe853..78b9c52 100644 --- a/src/main.cc +++ b/src/main.cc @@ -68,7 +68,7 @@ class Spellchecker : public Nan::ObjectWrap { return; } - std::vector text(string->Length()); + std::vector text(string->Length() + 1); string->Write(reinterpret_cast(text.data())); Spellchecker* that = Nan::ObjectWrap::Unwrap(info.Holder()); From 84fb4afdd639b0431b473604a19cfa9bc436fabb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 6 Jan 2016 19:54:18 -0800 Subject: [PATCH 12/16] Clean up hunspell CheckSpelling --- src/spellchecker_hunspell.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc index 4b0579e..8ff7a43 100644 --- a/src/spellchecker_hunspell.cc +++ b/src/spellchecker_hunspell.cc @@ -60,17 +60,15 @@ std::vector HunspellSpellchecker::CheckSpelling(const uint16_t std::vector utf8_buffer(256); char *utf8_word = utf8_buffer.data(); - bool within_word = false; size_t word_start = 0; - for (size_t i = 0; i <= utf16_length; i++) { - bool is_alpha = i < utf16_length && std::iswalpha(utf16_text[i]); - + bool within_word = false; + for (size_t i = 0; i < utf16_length; i++) { + bool is_word_character = iswalpha(utf16_text[i]); if (within_word) { - if (!is_alpha) { + if (!is_word_character) { within_word = false; const w_char *utf16_word = reinterpret_cast(utf16_text + word_start); u16_u8(utf8_word, utf8_buffer.size(), utf16_word, i - word_start); - if (hunspell->spell(utf8_word) == 0) { MisspelledRange range; range.start = word_start; @@ -78,7 +76,7 @@ std::vector HunspellSpellchecker::CheckSpelling(const uint16_t result.push_back(range); } } - } else if (is_alpha) { + } else if (is_word_character) { word_start = i; within_word = true; } From 9aff4964240854dfc1063743acbbe0755ed83b20 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 7 Jan 2016 11:05:04 -0800 Subject: [PATCH 13/16] Test hunspell implementation on Windows CI --- appveyor.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index c06414a..0ff9c7e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,10 @@ environment: nodejs_version: "0.10" + matrix: + - {} + - {SPELLCHECKER_PREFER_HUNSPELL: true} + install: - ps: Install-Product node $env:nodejs_version - npm --msvs_version=2013 install From 4262eb341ca958e413736cdc0001f81612602df0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 7 Jan 2016 11:24:59 -0800 Subject: [PATCH 14/16] Use std libraries for UTF16 -> UTF8 conversion in hunspell spellchecker --- binding.gyp | 7 +++-- src/spellchecker_hunspell.cc | 30 ++++++++++-------- src/spellchecker_hunspell.h | 2 ++ src/transcoder.h | 17 +++++++++++ src/transcoder_posix.cc | 59 ++++++++++++++++++++++++++++++++++++ src/transcoder_win.cc | 23 ++++++++++++++ 6 files changed, 124 insertions(+), 14 deletions(-) create mode 100644 src/transcoder.h create mode 100644 src/transcoder_posix.cc create mode 100644 src/transcoder_win.cc diff --git a/binding.gyp b/binding.gyp index 5216859..cf285eb 100644 --- a/binding.gyp +++ b/binding.gyp @@ -41,17 +41,20 @@ }], ['OS=="win"', { 'sources': [ - 'src/spellchecker_win.cc' + 'src/spellchecker_win.cc', + 'src/transcoder_win.cc', ], }], ['OS=="linux"', { 'sources': [ - 'src/spellchecker_linux.cc' + 'src/spellchecker_linux.cc', + 'src/transcoder_posix.cc', ], }], ['OS=="mac"', { 'sources': [ 'src/spellchecker_mac.mm', + 'src/transcoder_posix.cc', ], 'link_settings': { 'libraries': [ diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc index 8ff7a43..28a2586 100644 --- a/src/spellchecker_hunspell.cc +++ b/src/spellchecker_hunspell.cc @@ -2,16 +2,20 @@ #include #include #include "../vendor/hunspell/src/hunspell/hunspell.hxx" -#include "../vendor/hunspell/src/hunspell/csutil.hxx" #include "spellchecker_hunspell.h" namespace spellchecker { -HunspellSpellchecker::HunspellSpellchecker() : hunspell(NULL) { } +HunspellSpellchecker::HunspellSpellchecker() : hunspell(NULL), transcoder(NewTranscoder()) { } + HunspellSpellchecker::~HunspellSpellchecker() { if (hunspell) { delete hunspell; } + + if (transcoder) { + FreeTranscoder(transcoder); + } } bool HunspellSpellchecker::SetDictionary(const std::string& language, const std::string& dirname) { @@ -53,27 +57,29 @@ bool HunspellSpellchecker::IsMisspelled(const std::string& word) { std::vector HunspellSpellchecker::CheckSpelling(const uint16_t *utf16_text, size_t utf16_length) { std::vector result; - if (!hunspell) { + if (!hunspell || !transcoder) { return result; } std::vector utf8_buffer(256); - char *utf8_word = utf8_buffer.data(); size_t word_start = 0; bool within_word = false; for (size_t i = 0; i < utf16_length; i++) { - bool is_word_character = iswalpha(utf16_text[i]); + uint16_t c = utf16_text[i]; + bool is_word_character = iswalpha(c); if (within_word) { if (!is_word_character) { within_word = false; - const w_char *utf16_word = reinterpret_cast(utf16_text + word_start); - u16_u8(utf8_word, utf8_buffer.size(), utf16_word, i - word_start); - if (hunspell->spell(utf8_word) == 0) { - MisspelledRange range; - range.start = word_start; - range.end = i; - result.push_back(range); + + bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start); + if (converted) { + if (hunspell->spell(utf8_buffer.data()) == 0) { + MisspelledRange range; + range.start = word_start; + range.end = i; + result.push_back(range); + } } } } else if (is_word_character) { diff --git a/src/spellchecker_hunspell.h b/src/spellchecker_hunspell.h index fa1cf74..c8d74a2 100644 --- a/src/spellchecker_hunspell.h +++ b/src/spellchecker_hunspell.h @@ -2,6 +2,7 @@ #define SRC_SPELLCHECKER_HUNSPELL_H_ #include "spellchecker.h" +#include "transcoder.h" class Hunspell; @@ -21,6 +22,7 @@ class HunspellSpellchecker : public SpellcheckerImplementation { private: Hunspell* hunspell; + Transcoder *transcoder; }; } // namespace spellchecker diff --git a/src/transcoder.h b/src/transcoder.h new file mode 100644 index 0000000..eae93a1 --- /dev/null +++ b/src/transcoder.h @@ -0,0 +1,17 @@ +#ifndef SRC_TRANSCODER_H_ +#define SRC_TRANSCODER_H_ + +#include +#include + +namespace spellchecker { + +struct Transcoder; + +Transcoder *NewTranscoder(); +void FreeTranscoder(Transcoder *); +bool TranscodeUTF16ToUTF8(const Transcoder *, char *out, size_t out_length, const uint16_t *in, size_t in_length); + +} // namespace spellchecker + +#endif // SRC_TRANSCODER_H_ diff --git a/src/transcoder_posix.cc b/src/transcoder_posix.cc new file mode 100644 index 0000000..9656681 --- /dev/null +++ b/src/transcoder_posix.cc @@ -0,0 +1,59 @@ +#include +#include +#include + +namespace spellchecker { + +struct Transcoder { + iconv_t conversion; +}; + +static int IsBigEndian(void) { + union { + uint16_t integer; + char bytes[2]; + } two_byte_value; + + two_byte_value.integer = {0x0102}; + return two_byte_value.bytes[0] == 1; +} + +Transcoder *NewTranscoder() { + const char *to_encoding = "UTF-8"; + const char *from_encoding = IsBigEndian() ? "UTF-16BE" : "UTF-16LE"; + iconv_t conversion = iconv_open(to_encoding, from_encoding); + if (conversion == (iconv_t)-1) { + return NULL; + } + + Transcoder *result = new Transcoder(); + result->conversion = conversion; + return result; +} + +void FreeTranscoder(Transcoder *transcoder) { + iconv_close(transcoder->conversion); + delete transcoder; +} + +bool TranscodeUTF16ToUTF8(const Transcoder *transcoder, char *out, size_t out_bytes, const uint16_t *in, size_t in_length) { + char *utf16_word = reinterpret_cast(const_cast(in)); + size_t utf16_bytes = in_length * (sizeof(uint16_t) / sizeof(char)); + + size_t iconv_result = iconv( + transcoder->conversion, + &utf16_word, + &utf16_bytes, + &out, + &out_bytes + ); + + if (iconv_result == static_cast(-1)) { + return false; + } + + *out = '\0'; + return true; +} + +} // namespace spellchecker diff --git a/src/transcoder_win.cc b/src/transcoder_win.cc new file mode 100644 index 0000000..34d1da7 --- /dev/null +++ b/src/transcoder_win.cc @@ -0,0 +1,23 @@ +#include +#include +#include "transcoder.h" + +namespace spellchecker { + +struct Transcoder {}; + +Transcoder* NewTranscoder() { + return new Transcoder(); +} + +void FreeTranscoder(Transcoder *transcoder) { + delete transcoder; +} + +bool TranscodeUTF16ToUTF8(const Transcoder *transcoder, char *out, size_t out_length, const uint16_t *in, size_t in_length) { + int length = WideCharToMultiByte(CP_UTF8, 0, reinterpret_cast(in), in_length, out, out_length, NULL, NULL); + out[length] = '\0'; + return true; +} + +} // namespace spellchecker From 5f11ffdba9e5a5f7a049d68fae9d08faaf8c842a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 7 Jan 2016 13:03:40 -0800 Subject: [PATCH 15/16] In hunspell, handle apostrophes, ignore words w/ non-english letters --- spec/spellchecker-spec.coffee | 18 +++++++++++ src/spellchecker_hunspell.cc | 59 ++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/spec/spellchecker-spec.coffee b/spec/spellchecker-spec.coffee index 3448037..5396a08 100644 --- a/spec/spellchecker-spec.coffee +++ b/spec/spellchecker-spec.coffee @@ -35,6 +35,24 @@ describe "SpellChecker", -> {start: 20, end: 25}, ] + it "does not treat non-english letters as word boundaries", -> + SpellChecker.add("cliché") + expect(SpellChecker.checkSpelling("what cliché nonsense")).toEqual [] + + it "handles words with apostrophes", -> + string = "doesn't isn't aint hasn't" + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: string.indexOf("aint"), end: string.indexOf("aint") + 4} + ] + + string = "you say you're 'certain', but are you really?" + expect(SpellChecker.checkSpelling(string)).toEqual [] + + string = "you say you're 'sertan', but are you really?" + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: string.indexOf("sertan"), end: string.indexOf("',")} + ] + it "handles invalid inputs", -> expect(SpellChecker.checkSpelling("")).toEqual [] expect(-> SpellChecker.checkSpelling()).toThrow("Bad argument") diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc index 28a2586..eba60cb 100644 --- a/src/spellchecker_hunspell.cc +++ b/src/spellchecker_hunspell.cc @@ -63,28 +63,49 @@ std::vector HunspellSpellchecker::CheckSpelling(const uint16_t std::vector utf8_buffer(256); - size_t word_start = 0; - bool within_word = false; - for (size_t i = 0; i < utf16_length; i++) { + enum { + unknown, + in_separator, + in_word, + } state = in_separator; + + for (size_t word_start = 0, i = 0; i < utf16_length; i++) { uint16_t c = utf16_text[i]; - bool is_word_character = iswalpha(c); - if (within_word) { - if (!is_word_character) { - within_word = false; - - bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start); - if (converted) { - if (hunspell->spell(utf8_buffer.data()) == 0) { - MisspelledRange range; - range.start = word_start; - range.end = i; - result.push_back(range); + + switch (state) { + case unknown: + if (iswpunct(c) || iswspace(c)) { + state = in_separator; + } + break; + + case in_separator: + if (iswalpha(c)) { + word_start = i; + state = in_word; + } else if (!iswpunct(c) && !iswspace(c)) { + state = unknown; + } + break; + + case in_word: + if (c == '\'' && iswalpha(utf16_text[i + 1])) { + i++; + } else if (c == 0 || iswpunct(c) || iswspace(c)) { + state = in_separator; + bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start); + if (converted) { + if (hunspell->spell(utf8_buffer.data()) == 0) { + MisspelledRange range; + range.start = word_start; + range.end = i; + result.push_back(range); + } } + } else if (!iswalpha(c)) { + state = unknown; } - } - } else if (is_word_character) { - word_start = i; - within_word = true; + break; } } From b47f7060c536a980671d14022d54789ec3d2f1b2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 8 Jan 2016 10:52:53 -0800 Subject: [PATCH 16/16] 3.2.0-0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 52cc319..b0f2374 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "main": "./lib/spellchecker.js", "name": "spellchecker", "description": "Bindings to native spellchecker", - "version": "3.1.3", + "version": "3.2.0-0", "licenses": [ { "type": "MIT",