Skip to content
This repository has been archived by the owner on Dec 15, 2022. It is now read-only.

Add an API for finding all the misspelled words in a given string #27

Merged
merged 16 commits into from
Jan 11, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
environment:
nodejs_version: "0.10"

matrix:
- {}
- {SPELLCHECKER_PREFER_HUNSPELL: true}

install:
- ps: Install-Product node $env:nodejs_version
- npm --msvs_version=2013 install
Expand Down
7 changes: 5 additions & 2 deletions binding.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,20 @@
}],
['OS=="win"', {
'sources': [
'src/spellchecker_win.cc'
'src/spellchecker_win.cc',
'src/transcoder_win.cc',
],
}],
['OS=="linux"', {
'sources': [
'src/spellchecker_linux.cc'
'src/spellchecker_linux.cc',
'src/transcoder_posix.cc',
],
}],
['OS=="mac"', {
'sources': [
'src/spellchecker_mac.mm',
'src/transcoder_posix.cc',
],
'link_settings': {
'libraries': [
Expand Down
7 changes: 7 additions & 0 deletions lib/spellchecker.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ var isMisspelled = function() {
return defaultSpellcheck.isMisspelled.apply(defaultSpellcheck, arguments);
};

// Module-level convenience wrapper: forwards every argument to
// `checkSpelling` on the shared default spellchecker instance.
// `ensureDefaultSpellCheck()` runs first (presumably it creates that
// instance on first use -- its definition is not in this hunk).
var checkSpelling = function() {
  ensureDefaultSpellCheck();

  var checker = defaultSpellcheck;
  return checker.checkSpelling.apply(checker, arguments);
};

var add = function() {
ensureDefaultSpellCheck();

Expand Down Expand Up @@ -64,6 +70,7 @@ module.exports = {
setDictionary: setDictionary,
add: add,
isMisspelled: isMisspelled,
checkSpelling: checkSpelling,
getAvailableDictionaries: getAvailableDictionaries,
getCorrectionsForMisspelling: getCorrectionsForMisspelling,
Spellchecker: Spellchecker
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"main": "./lib/spellchecker.js",
"name": "spellchecker",
"description": "Bindings to native spellchecker",
"version": "3.1.3",
"version": "3.2.0-0",
"licenses": [
{
"type": "MIT",
Expand Down
48 changes: 48 additions & 0 deletions spec/spellchecker-spec.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,54 @@ describe "SpellChecker", ->
it "throws an exception when no word specified", ->
expect(-> SpellChecker.isMisspelled()).toThrow()

describe ".checkSpelling(string)", ->
it "returns an array of character ranges of misspelled words", ->
string = "cat caat dog dooog"

expect(SpellChecker.checkSpelling(string)).toEqual [
{start: 4, end: 8},
{start: 13, end: 18},
]

it "accounts for UTF16 pairs", ->
string = "😎 cat caat dog dooog"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Mac and Windows, this didn't require any extra work, because the NSRanges returned by NSSpellChecker (and probably all NSString APIs) refer to UTF-16 code unit indices, as opposed to logical character indices, and the same applies to the Windows spell-check APIs.

For Linux, the Hunspell library only provides a per-word spell-checking API; it doesn't handle arbitrary text. It also expects UTF8-encoded words. I deal with this by passing the string to the native spell-checkers in UTF16 (as V8 natively stores it), and for hunspell, transcoding to UTF8 one word at a time, so that I retain the UTF16 indices.


expect(SpellChecker.checkSpelling(string)).toEqual [
{start: 7, end: 11},
{start: 16, end: 21},
]

it "accounts for other non-word characters", ->
string = "'cat' (caat. <dog> :dooog)"
expect(SpellChecker.checkSpelling(string)).toEqual [
{start: 7, end: 11},
{start: 20, end: 25},
]

it "does not treat non-english letters as word boundaries", ->
SpellChecker.add("cliché")
expect(SpellChecker.checkSpelling("what cliché nonsense")).toEqual []

it "handles words with apostrophes", ->
string = "doesn't isn't aint hasn't"
expect(SpellChecker.checkSpelling(string)).toEqual [
{start: string.indexOf("aint"), end: string.indexOf("aint") + 4}
]

string = "you say you're 'certain', but are you really?"
expect(SpellChecker.checkSpelling(string)).toEqual []

string = "you say you're 'sertan', but are you really?"
expect(SpellChecker.checkSpelling(string)).toEqual [
{start: string.indexOf("sertan"), end: string.indexOf("',")}
]

it "handles invalid inputs", ->
expect(SpellChecker.checkSpelling("")).toEqual []
expect(-> SpellChecker.checkSpelling()).toThrow("Bad argument")
expect(-> SpellChecker.checkSpelling(null)).toThrow("Bad argument")
expect(-> SpellChecker.checkSpelling({})).toThrow("Bad argument")

describe ".getCorrectionsForMisspelling(word)", ->
it "returns an array of possible corrections", ->
corrections = SpellChecker.getCorrectionsForMisspelling('worrd')
Expand Down
38 changes: 38 additions & 0 deletions src/main.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <vector>
#include "nan.h"
#include "spellchecker.h"

Expand Down Expand Up @@ -49,6 +50,42 @@ class Spellchecker : public Nan::ObjectWrap {
info.GetReturnValue().Set(Nan::New(that->impl->IsMisspelled(word)));
}

// Native implementation of `checkSpelling(text)`.
//
// Throws "Bad argument" unless the first argument is a string. Otherwise the
// return value is an array of `{start, end}` objects, one per MisspelledRange
// reported by the spellchecker implementation, in the order reported. An empty
// string yields an empty array without consulting the implementation.
static NAN_METHOD(CheckSpelling) {
  Nan::HandleScope scope;

  // Validate the type before casting: casting an arbitrary value to String and
  // only then asking IsString() relies on the cast being unchecked.
  if (info.Length() < 1 || !info[0]->IsString()) {
    return Nan::ThrowError("Bad argument");
  }
  Handle<String> string = Handle<String>::Cast(info[0]);

  Local<Array> result = Nan::New<Array>();
  info.GetReturnValue().Set(result);

  if (string->Length() == 0) {
    return;
  }

  // The buffer holds one element more than the string's length (presumably
  // room for the terminator written by String::Write -- confirm against the
  // V8 API). That extra element is included in the length passed on below;
  // the value-initialized vector guarantees it is 0 either way.
  std::vector<uint16_t> text(string->Length() + 1);
  string->Write(text.data());

  Spellchecker* that = Nan::ObjectWrap::Unwrap<Spellchecker>(info.Holder());
  std::vector<MisspelledRange> misspelled_ranges = that->impl->CheckSpelling(text.data(), text.size());

  for (size_t i = 0; i < misspelled_ranges.size(); ++i) {
    const MisspelledRange& range = misspelled_ranges[i];
    uint32_t start = range.start, end = range.end;

    Local<Object> misspelled_range = Nan::New<Object>();
    misspelled_range->Set(Nan::New("start").ToLocalChecked(), Nan::New<Integer>(start));
    misspelled_range->Set(Nan::New("end").ToLocalChecked(), Nan::New<Integer>(end));
    result->Set(static_cast<uint32_t>(i), misspelled_range);
  }
}

static NAN_METHOD(Add) {
Nan::HandleScope scope;
if (info.Length() < 1) {
Expand Down Expand Up @@ -127,6 +164,7 @@ class Spellchecker : public Nan::ObjectWrap {
Nan::SetMethod(tpl->InstanceTemplate(), "getAvailableDictionaries", Spellchecker::GetAvailableDictionaries);
Nan::SetMethod(tpl->InstanceTemplate(), "getCorrectionsForMisspelling", Spellchecker::GetCorrectionsForMisspelling);
Nan::SetMethod(tpl->InstanceTemplate(), "isMisspelled", Spellchecker::IsMisspelled);
Nan::SetMethod(tpl->InstanceTemplate(), "checkSpelling", Spellchecker::CheckSpelling);
Nan::SetMethod(tpl->InstanceTemplate(), "add", Spellchecker::Add);

exports->Set(Nan::New("Spellchecker").ToLocalChecked(), tpl->GetFunction());
Expand Down
8 changes: 8 additions & 0 deletions src/spellchecker.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@

#include <string>
#include <vector>
#include <stdint.h>

namespace spellchecker {

// Location of one misspelled word inside the UTF-16 text handed to
// CheckSpelling. Both members are indices into that text, counted in 16-bit
// code units; the range is half-open: [start, end).
struct MisspelledRange {
// Index of the first code unit of the misspelled word.
size_t start;
// Index one past the last code unit of the misspelled word.
size_t end;
};

class SpellcheckerImplementation {
public:
virtual bool SetDictionary(const std::string& language, const std::string& path) = 0;
Expand All @@ -17,6 +23,8 @@ class SpellcheckerImplementation {
// Returns true if the word is misspelled.
virtual bool IsMisspelled(const std::string& word) = 0;

virtual std::vector<MisspelledRange> CheckSpelling(const uint16_t *text, size_t length) = 0;

// Adds a new word to the dictionary.
// NB: When using Hunspell, this will not modify the .dic file; custom words must be added each
// time the spellchecker is created. Use a custom dictionary file.
Expand Down
66 changes: 65 additions & 1 deletion src/spellchecker_hunspell.cc
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
#include <cstdio>
#include <cwctype>
#include <algorithm>
#include "../vendor/hunspell/src/hunspell/hunspell.hxx"
#include "spellchecker_hunspell.h"

namespace spellchecker {

HunspellSpellchecker::HunspellSpellchecker() : hunspell(NULL) { }
HunspellSpellchecker::HunspellSpellchecker() : hunspell(NULL), transcoder(NewTranscoder()) { }

// Releases the Hunspell instance and the transcoder owned by this object.
HunspellSpellchecker::~HunspellSpellchecker() {
  // `delete` on a null pointer is a no-op, so no guard is required here.
  delete hunspell;

  // FreeTranscoder's handling of NULL is not visible from here; keep the guard.
  if (transcoder != NULL) {
    FreeTranscoder(transcoder);
  }
}

bool HunspellSpellchecker::SetDictionary(const std::string& language, const std::string& dirname) {
Expand Down Expand Up @@ -48,6 +54,64 @@ bool HunspellSpellchecker::IsMisspelled(const std::string& word) {
return hunspell->spell(word.c_str()) == 0;
}

// Splits UTF-16 text into words and returns the range of every word that
// Hunspell reports as misspelled.
//
// utf16_text   - buffer of UTF-16 code units to scan.
// utf16_length - number of code units in the buffer.
//
// Returns half-open [start, end) ranges, expressed in code-unit indices into
// utf16_text. Returns an empty vector when no dictionary or transcoder is
// available.
//
// A word is only looked up once a terminating code unit (0, punctuation or
// whitespace) is seen inside the buffer, so a word that runs up to the very
// last element without such a terminator is not checked.
std::vector<MisspelledRange> HunspellSpellchecker::CheckSpelling(const uint16_t *utf16_text, size_t utf16_length) {
  std::vector<MisspelledRange> result;

  if (!hunspell || !transcoder) {
    return result;
  }

  // Scratch space for one word transcoded to UTF-8.
  // NOTE(review): words that do not fit presumably make TranscodeUTF16ToUTF8
  // return false (and are then skipped); also confirm that it NUL-terminates
  // its output, since the buffer is passed to spell() as a C string.
  std::vector<char> utf8_buffer(256);

  // Scanner state:
  //   in_separator - last unit was punctuation/whitespace; a word may start.
  //   in_word      - inside a run of alphabetic units (and inner apostrophes).
  //   unknown      - inside a run containing something else (e.g. a digit);
  //                  nothing is checked until the next separator.
  enum {
    unknown,
    in_separator,
    in_word,
  } state = in_separator;

  for (size_t word_start = 0, i = 0; i < utf16_length; i++) {
    uint16_t c = utf16_text[i];

    switch (state) {
      case unknown:
        if (iswpunct(c) || iswspace(c)) {
          state = in_separator;
        }
        break;

      case in_separator:
        if (iswalpha(c)) {
          word_start = i;
          state = in_word;
        } else if (!iswpunct(c) && !iswspace(c)) {
          state = unknown;
        }
        break;

      case in_word:
        // An apostrophe followed by a letter belongs to the word ("isn't").
        // The bounds check keeps the look-ahead inside the buffer; without it
        // an apostrophe in the last element read one unit past the end.
        if (c == '\'' && i + 1 < utf16_length && iswalpha(utf16_text[i + 1])) {
          i++;  // the following letter has already been classified
        } else if (c == 0 || iswpunct(c) || iswspace(c)) {
          // End of word: transcode just this word so that the reported
          // indices stay in UTF-16 code units.
          state = in_separator;
          bool converted = TranscodeUTF16ToUTF8(transcoder, utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start);
          if (converted) {
            if (hunspell->spell(utf8_buffer.data()) == 0) {
              MisspelledRange range;
              range.start = word_start;
              range.end = i;
              result.push_back(range);
            }
          }
        } else if (!iswalpha(c)) {
          state = unknown;
        }
        break;
    }
  }

  return result;
}

void HunspellSpellchecker::Add(const std::string& word) {
if (hunspell) {
hunspell->add(word.c_str());
Expand Down
3 changes: 3 additions & 0 deletions src/spellchecker_hunspell.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define SRC_SPELLCHECKER_HUNSPELL_H_

#include "spellchecker.h"
#include "transcoder.h"

class Hunspell;

Expand All @@ -16,10 +17,12 @@ class HunspellSpellchecker : public SpellcheckerImplementation {
std::vector<std::string> GetAvailableDictionaries(const std::string& path);
std::vector<std::string> GetCorrectionsForMisspelling(const std::string& word);
bool IsMisspelled(const std::string& word);
std::vector<MisspelledRange> CheckSpelling(const uint16_t *text, size_t length);
void Add(const std::string& word);

private:
Hunspell* hunspell;
Transcoder *transcoder;
};

} // namespace spellchecker
Expand Down
1 change: 1 addition & 0 deletions src/spellchecker_mac.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class MacSpellchecker : public SpellcheckerImplementation {
std::vector<std::string> GetAvailableDictionaries(const std::string& path);
std::vector<std::string> GetCorrectionsForMisspelling(const std::string& word);
bool IsMisspelled(const std::string& word);
std::vector<MisspelledRange> CheckSpelling(const uint16_t *text, size_t length);
void Add(const std::string& word);

private:
Expand Down
24 changes: 24 additions & 0 deletions src/spellchecker_mac.mm
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,30 @@
return result;
}

// Runs NSSpellChecker over UTF-16 text and returns the range of every
// misspelling it reports.
//
// text   - buffer of UTF-16 code units in host byte order.
// length - number of code units in the buffer.
//
// Returns half-open [start, end) ranges taken directly from the NSRange of
// each result (location, location + length).
std::vector<MisspelledRange> MacSpellchecker::CheckSpelling(const uint16_t *text, size_t length) {
  std::vector<MisspelledRange> result;

  @autoreleasepool {
    // Build the string straight from the code units. The previous
    // alloc/init NSData + NSString pair was never released, which leaks on
    // every call unless the file is compiled with ARC; this convenience
    // constructor is correct under either memory model and skips the
    // intermediate NSData.
    NSString *string = [NSString stringWithCharacters:reinterpret_cast<const unichar *>(text)
                                               length:length];
    NSArray *misspellings = [this->spellChecker checkString:string
                                                      range:NSMakeRange(0, string.length)
                                                      types:NSTextCheckingTypeSpelling
                                                    options:nil
                                     inSpellDocumentWithTag:0
                                                orthography:NULL
                                                  wordCount:NULL];
    for (NSTextCheckingResult *misspelling in misspellings) {
      NSRange misspelledRange = misspelling.range;

      MisspelledRange range;
      range.start = misspelledRange.location;
      range.end = misspelledRange.location + misspelledRange.length;
      result.push_back(range);
    }
  }

  return result;
}

void MacSpellchecker::Add(const std::string& word) {
@autoreleasepool {
NSString* newWord = [NSString stringWithUTF8String:word.c_str()];
Expand Down
32 changes: 31 additions & 1 deletion src/spellchecker_win.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ WindowsSpellchecker::~WindowsSpellchecker() {
this->currentSpellchecker->Release();
this->currentSpellchecker = NULL;
}

if (this->spellcheckerFactory) {
this->spellcheckerFactory->Release();
this->spellcheckerFactory = NULL;
Expand Down Expand Up @@ -187,6 +187,36 @@ bool WindowsSpellchecker::IsMisspelled(const std::string& word) {
return ret;
}

std::vector<MisspelledRange> WindowsSpellchecker::CheckSpelling(const uint16_t *text, size_t length) {
std::vector<MisspelledRange> result;

if (this->currentSpellchecker == NULL) {
return result;
}

IEnumSpellingError* errors = NULL;
std::wstring wtext(reinterpret_cast<const wchar_t *>(text), length);
if (FAILED(this->currentSpellchecker->Check(wtext.c_str(), &errors))) {
return result;
}

ISpellingError *error;
while (errors->Next(&error) == S_OK) {
ULONG start, length;
error->get_StartIndex(&start);
error->get_Length(&length);

MisspelledRange range;
range.start = start;
range.end = start + length;
result.push_back(range);
error->Release();
}

errors->Release();
return result;
}

void WindowsSpellchecker::Add(const std::string& word) {
if (this->currentSpellchecker == NULL) {
return;
Expand Down
Loading