diff --git a/appveyor.yml b/appveyor.yml index c06414a..0ff9c7e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,10 @@ environment: nodejs_version: "0.10" + matrix: + - {} + - {SPELLCHECKER_PREFER_HUNSPELL: true} + install: - ps: Install-Product node $env:nodejs_version - npm --msvs_version=2013 install diff --git a/binding.gyp b/binding.gyp index 5216859..cf285eb 100644 --- a/binding.gyp +++ b/binding.gyp @@ -41,17 +41,20 @@ }], ['OS=="win"', { 'sources': [ - 'src/spellchecker_win.cc' + 'src/spellchecker_win.cc', + 'src/transcoder_win.cc', ], }], ['OS=="linux"', { 'sources': [ - 'src/spellchecker_linux.cc' + 'src/spellchecker_linux.cc', + 'src/transcoder_posix.cc', ], }], ['OS=="mac"', { 'sources': [ 'src/spellchecker_mac.mm', + 'src/transcoder_posix.cc', ], 'link_settings': { 'libraries': [ diff --git a/lib/spellchecker.js b/lib/spellchecker.js index ad94430..a8085e3 100644 --- a/lib/spellchecker.js +++ b/lib/spellchecker.js @@ -29,6 +29,12 @@ var isMisspelled = function() { return defaultSpellcheck.isMisspelled.apply(defaultSpellcheck, arguments); }; +var checkSpelling = function() { + ensureDefaultSpellCheck(); + + return defaultSpellcheck.checkSpelling.apply(defaultSpellcheck, arguments); +}; + var add = function() { ensureDefaultSpellCheck(); @@ -64,6 +70,7 @@ module.exports = { setDictionary: setDictionary, add: add, isMisspelled: isMisspelled, + checkSpelling: checkSpelling, getAvailableDictionaries: getAvailableDictionaries, getCorrectionsForMisspelling: getCorrectionsForMisspelling, Spellchecker: Spellchecker diff --git a/package.json b/package.json index 52cc319..b0f2374 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "main": "./lib/spellchecker.js", "name": "spellchecker", "description": "Bindings to native spellchecker", - "version": "3.1.3", + "version": "3.2.0-0", "licenses": [ { "type": "MIT", diff --git a/spec/spellchecker-spec.coffee b/spec/spellchecker-spec.coffee index 76664a4..5396a08 100644 --- a/spec/spellchecker-spec.coffee +++ b/spec/spellchecker-spec.coffee @@ -11,6 +11,54 @@ describe "SpellChecker", -> it "throws an exception when no word specified", -> expect(-> SpellChecker.isMisspelled()).toThrow() + describe ".checkSpelling(string)", -> + it "returns an array of character ranges of misspelled words", -> + string = "cat caat dog dooog" + + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: 4, end: 8}, + {start: 13, end: 18}, + ] + + it "accounts for UTF16 pairs", -> + string = "😎 cat caat dog dooog" + + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: 7, end: 11}, + {start: 16, end: 21}, + ] + + it "accounts for other non-word characters", -> + string = "'cat' (caat. :dooog)" + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: 7, end: 11}, + {start: 20, end: 25}, + ] + + it "does not treat non-english letters as word boundaries", -> + SpellChecker.add("cliché") + expect(SpellChecker.checkSpelling("what cliché nonsense")).toEqual [] + + it "handles words with apostrophes", -> + string = "doesn't isn't aint hasn't" + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: string.indexOf("aint"), end: string.indexOf("aint") + 4} + ] + + string = "you say you're 'certain', but are you really?" + expect(SpellChecker.checkSpelling(string)).toEqual [] + + string = "you say you're 'sertan', but are you really?" + expect(SpellChecker.checkSpelling(string)).toEqual [ + {start: string.indexOf("sertan"), end: string.indexOf("',")} + ] + + it "handles invalid inputs", -> + expect(SpellChecker.checkSpelling("")).toEqual [] + expect(-> SpellChecker.checkSpelling()).toThrow("Bad argument") + expect(-> SpellChecker.checkSpelling(null)).toThrow("Bad argument") + expect(-> SpellChecker.checkSpelling({})).toThrow("Bad argument") + describe ".getCorrectionsForMisspelling(word)", -> it "returns an array of possible corrections", -> corrections = SpellChecker.getCorrectionsForMisspelling('worrd') diff --git a/src/main.cc b/src/main.cc index 6ff1ab5..78b9c52 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,3 +1,4 @@ +#include #include "nan.h" #include "spellchecker.h" @@ -49,6 +50,42 @@ class Spellchecker : public Nan::ObjectWrap { info.GetReturnValue().Set(Nan::New(that->impl->IsMisspelled(word))); } + static NAN_METHOD(CheckSpelling) { + Nan::HandleScope scope; + if (info.Length() < 1) { + return Nan::ThrowError("Bad argument"); + } + + Handle string = Handle::Cast(info[0]); + if (!string->IsString()) { + return Nan::ThrowError("Bad argument"); + } + + Local result = Nan::New(); + info.GetReturnValue().Set(result); + + if (string->Length() == 0) { + return; + } + + std::vector text(string->Length() + 1); + string->Write(reinterpret_cast(text.data())); + + Spellchecker* that = Nan::ObjectWrap::Unwrap(info.Holder()); + std::vector misspelled_ranges = that->impl->CheckSpelling(text.data(), text.size()); + + std::vector::const_iterator iter = misspelled_ranges.begin(); + for (; iter != misspelled_ranges.end(); ++iter) { + size_t index = iter - misspelled_ranges.begin(); + uint32_t start = iter->start, end = iter->end; + + Local misspelled_range = Nan::New(); + misspelled_range->Set(Nan::New("start").ToLocalChecked(), Nan::New(start)); + misspelled_range->Set(Nan::New("end").ToLocalChecked(), Nan::New(end)); + result->Set(index, misspelled_range); + } + } + static NAN_METHOD(Add) { Nan::HandleScope scope; if (info.Length() < 1) { @@ -127,6 +164,7 @@ class Spellchecker : public Nan::ObjectWrap { Nan::SetMethod(tpl->InstanceTemplate(), "getAvailableDictionaries", Spellchecker::GetAvailableDictionaries); Nan::SetMethod(tpl->InstanceTemplate(), "getCorrectionsForMisspelling", Spellchecker::GetCorrectionsForMisspelling); Nan::SetMethod(tpl->InstanceTemplate(), "isMisspelled", Spellchecker::IsMisspelled); + Nan::SetMethod(tpl->InstanceTemplate(), "checkSpelling", Spellchecker::CheckSpelling); Nan::SetMethod(tpl->InstanceTemplate(), "add", Spellchecker::Add); exports->Set(Nan::New("Spellchecker").ToLocalChecked(), tpl->GetFunction()); diff --git a/src/spellchecker.h b/src/spellchecker.h index 5251dcb..0e052b9 100644 --- a/src/spellchecker.h +++ b/src/spellchecker.h @@ -3,9 +3,15 @@ #include #include +#include namespace spellchecker { +struct MisspelledRange { + size_t start; + size_t end; +}; + class SpellcheckerImplementation { public: virtual bool SetDictionary(const std::string& language, const std::string& path) = 0; @@ -17,6 +23,8 @@ class SpellcheckerImplementation { // Returns true if the word is misspelled. virtual bool IsMisspelled(const std::string& word) = 0; + virtual std::vector CheckSpelling(const uint16_t *text, size_t length) = 0; + // Adds a new word to the dictionary. // NB: When using Hunspell, this will not modify the .dic file; custom words must be added each // time the spellchecker is created. Use a custom dictionary file. diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc index c144757..eba60cb 100644 --- a/src/spellchecker_hunspell.cc +++ b/src/spellchecker_hunspell.cc @@ -1,15 +1,21 @@ #include +#include #include #include "../vendor/hunspell/src/hunspell/hunspell.hxx" #include "spellchecker_hunspell.h" namespace spellchecker { -HunspellSpellchecker::HunspellSpellchecker() : hunspell(NULL) { } +HunspellSpellchecker::HunspellSpellchecker() : hunspell(NULL), transcoder(NewTranscoder()) { } + HunspellSpellchecker::~HunspellSpellchecker() { if (hunspell) { delete hunspell; } + + if (transcoder) { + FreeTranscoder(transcoder); + } } bool HunspellSpellchecker::SetDictionary(const std::string& language, const std::string& dirname) { @@ -48,6 +54,64 @@ bool HunspellSpellchecker::IsMisspelled(const std::string& word) { return hunspell->spell(word.c_str()) == 0; } +std::vector HunspellSpellchecker::CheckSpelling(const uint16_t *utf16_text, size_t utf16_length) { + std::vector result; + + if (!hunspell || !transcoder) { + return result; + } + + std::vector utf8_buffer(256); + + enum { + unknown, + in_separator, + in_word, + } state = in_separator; + + for (size_t word_start = 0, i = 0; i < utf16_length; i++) { + uint16_t c = utf16_text[i]; + + switch (state) { + case unknown: + if (iswpunct(c) || iswspace(c)) { + state = in_separator; + } + break; + + case in_separator: + if (iswalpha(c)) { + word_start = i; + state = in_word; + } else if (!iswpunct(c) && !iswspace(c)) { + state = unknown; + } + break; + + case in_word: + if (c == '\'' && iswalpha(utf16_text[i + 1])) { + i++; + } else if (c == 0 || iswpunct(c) || iswspace(c)) { + state = in_separator; + bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start); + if (converted) { + if (hunspell->spell(utf8_buffer.data()) == 0) { + MisspelledRange range; + range.start = word_start; + range.end = i; + result.push_back(range); + } + } + } else if (!iswalpha(c)) { + state = unknown; + } + break; + } + } + + return result; +} + void HunspellSpellchecker::Add(const std::string& word) { if (hunspell) { hunspell->add(word.c_str()); diff --git a/src/spellchecker_hunspell.h b/src/spellchecker_hunspell.h index 67aa3e7..c8d74a2 100644 --- a/src/spellchecker_hunspell.h +++ b/src/spellchecker_hunspell.h @@ -2,6 +2,7 @@ #define SRC_SPELLCHECKER_HUNSPELL_H_ #include "spellchecker.h" +#include "transcoder.h" class Hunspell; @@ -16,10 +17,12 @@ class HunspellSpellchecker : public SpellcheckerImplementation { std::vector GetAvailableDictionaries(const std::string& path); std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); + std::vector CheckSpelling(const uint16_t *text, size_t length); void Add(const std::string& word); private: Hunspell* hunspell; + Transcoder *transcoder; }; } // namespace spellchecker diff --git a/src/spellchecker_mac.h b/src/spellchecker_mac.h index a118e81..fd2cdc2 100644 --- a/src/spellchecker_mac.h +++ b/src/spellchecker_mac.h @@ -17,6 +17,7 @@ class MacSpellchecker : public SpellcheckerImplementation { std::vector GetAvailableDictionaries(const std::string& path); std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); + std::vector CheckSpelling(const uint16_t *text, size_t length); void Add(const std::string& word); private: diff --git a/src/spellchecker_mac.mm b/src/spellchecker_mac.mm index 59596f8..6482d5a 100644 --- a/src/spellchecker_mac.mm +++ b/src/spellchecker_mac.mm @@ -52,6 +52,30 @@ return result; } +std::vector MacSpellchecker::CheckSpelling(const uint16_t *text, size_t length) { + std::vector result; + + @autoreleasepool { + NSData *data = [[NSData alloc] initWithBytesNoCopy:(void *)(text) length:(length * 2) freeWhenDone:NO]; + NSString* string = [[NSString alloc] initWithData:data encoding:NSUTF16LittleEndianStringEncoding]; + NSArray *misspellings = [this->spellChecker checkString:string + range:NSMakeRange(0, string.length) + types:NSTextCheckingTypeSpelling + options:nil + inSpellDocumentWithTag:0 + orthography:nil + wordCount:nil]; + for (NSTextCheckingResult *misspelling in misspellings) { + MisspelledRange range; + range.start = misspelling.range.location; + range.end = misspelling.range.location + misspelling.range.length; + result.push_back(range); + } + } + + return result; +} + void MacSpellchecker::Add(const std::string& word) { @autoreleasepool { NSString* newWord = [NSString stringWithUTF8String:word.c_str()]; diff --git a/src/spellchecker_win.cc b/src/spellchecker_win.cc index 9d959ec..0e32df4 100644 --- a/src/spellchecker_win.cc +++ b/src/spellchecker_win.cc @@ -78,7 +78,7 @@ WindowsSpellchecker::~WindowsSpellchecker() { this->currentSpellchecker->Release(); this->currentSpellchecker = NULL; } - + if (this->spellcheckerFactory) { this->spellcheckerFactory->Release(); this->spellcheckerFactory = NULL; @@ -187,6 +187,36 @@ bool WindowsSpellchecker::IsMisspelled(const std::string& word) { return ret; } +std::vector WindowsSpellchecker::CheckSpelling(const uint16_t *text, size_t length) { + std::vector result; + + if (this->currentSpellchecker == NULL) { + return result; + } + + IEnumSpellingError* errors = NULL; + std::wstring wtext(reinterpret_cast(text), length); + if (FAILED(this->currentSpellchecker->Check(wtext.c_str(), &errors))) { + return result; + } + + ISpellingError *error; + while (errors->Next(&error) == S_OK) { + ULONG start, length; + error->get_StartIndex(&start); + error->get_Length(&length); + + MisspelledRange range; + range.start = start; + range.end = start + length; + result.push_back(range); + error->Release(); + } + + errors->Release(); + return result; +} + void WindowsSpellchecker::Add(const std::string& word) { if (this->currentSpellchecker == NULL) { return; diff --git a/src/spellchecker_win.h b/src/spellchecker_win.h index 4875f1d..09e7cae 100644 --- a/src/spellchecker_win.h +++ b/src/spellchecker_win.h @@ -18,6 +18,7 @@ class WindowsSpellchecker : public SpellcheckerImplementation { std::vector GetCorrectionsForMisspelling(const std::string& word); bool IsMisspelled(const std::string& word); + std::vector CheckSpelling(const uint16_t *text, size_t length); void Add(const std::string& word); private: diff --git a/src/transcoder.h b/src/transcoder.h new file mode 100644 index 0000000..eae93a1 --- /dev/null +++ b/src/transcoder.h @@ -0,0 +1,17 @@ +#ifndef SRC_TRANSCODER_H_ +#define SRC_TRANSCODER_H_ + +#include +#include + +namespace spellchecker { + +struct Transcoder; + +Transcoder *NewTranscoder(); +void FreeTranscoder(Transcoder *); +bool TranscodeUTF16ToUTF8(const Transcoder *, char *out, size_t out_length, const uint16_t *in, size_t in_length); + +} // namespace spellchecker + +#endif // SRC_TRANSCODER_H_ diff --git a/src/transcoder_posix.cc b/src/transcoder_posix.cc new file mode 100644 index 0000000..9656681 --- /dev/null +++ b/src/transcoder_posix.cc @@ -0,0 +1,59 @@ +#include +#include +#include + +namespace spellchecker { + +struct Transcoder { + iconv_t conversion; +}; + +static int IsBigEndian(void) { + union { + uint16_t integer; + char bytes[2]; + } two_byte_value; + + two_byte_value.integer = {0x0102}; + return two_byte_value.bytes[0] == 1; +} + +Transcoder *NewTranscoder() { + const char *to_encoding = "UTF-8"; + const char *from_encoding = IsBigEndian() ? "UTF-16BE" : "UTF-16LE"; + iconv_t conversion = iconv_open(to_encoding, from_encoding); + if (conversion == (iconv_t)-1) { + return NULL; + } + + Transcoder *result = new Transcoder(); + result->conversion = conversion; + return result; +} + +void FreeTranscoder(Transcoder *transcoder) { + iconv_close(transcoder->conversion); + delete transcoder; +} + +bool TranscodeUTF16ToUTF8(const Transcoder *transcoder, char *out, size_t out_bytes, const uint16_t *in, size_t in_length) { + char *utf16_word = reinterpret_cast(const_cast(in)); + size_t utf16_bytes = in_length * (sizeof(uint16_t) / sizeof(char)); + + size_t iconv_result = iconv( + transcoder->conversion, + &utf16_word, + &utf16_bytes, + &out, + &out_bytes + ); + + if (iconv_result == static_cast(-1)) { + return false; + } + + *out = '\0'; + return true; +} + +} // namespace spellchecker diff --git a/src/transcoder_win.cc b/src/transcoder_win.cc new file mode 100644 index 0000000..34d1da7 --- /dev/null +++ b/src/transcoder_win.cc @@ -0,0 +1,23 @@ +#include +#include +#include "transcoder.h" + +namespace spellchecker { + +struct Transcoder {}; + +Transcoder* NewTranscoder() { + return new Transcoder(); +} + +void FreeTranscoder(Transcoder *transcoder) { + delete transcoder; +} + +bool TranscodeUTF16ToUTF8(const Transcoder *transcoder, char *out, size_t out_length, const uint16_t *in, size_t in_length) { + int length = WideCharToMultiByte(CP_UTF8, 0, reinterpret_cast(in), in_length, out, out_length, NULL, NULL); + out[length] = '\0'; + return true; +} + +} // namespace spellchecker