Skip to content

Commit

Permalink
added rules, ignorePunct interface; changed the js interface to use an object for config
Browse files Browse the repository at this point in the history
  • Loading branch information
dotSlashLu committed Nov 15, 2013
1 parent 77914bd commit 71496a2
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 46 deletions.
11 changes: 8 additions & 3 deletions addon.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@ fs.readFile("./test_doc.txt", {
return console.error(err);
start = new Date().valueOf();
var scws = require("./build/Release/scws");
// console.log("going to segment:" + data);
var res = scws.segment(data, "utf8", "./dicts/dict.utf8.xdb:./dicts/dict_cht.utf8.xdb");
var res = scws.segment(data, {
charset: "utf8",
//dicts: "./dicts/dict.utf8.xdb:./dicts/dict_cht.utf8.xdb:./dicts/dict.test.txt",
dicts: "./dicts/dict.utf8.xdb",
rule: "./rules/rules.utf8.ini",
ignorePunct: true
});
end = new Date().valueOf();
console.log("time used: " + (end - start));
console.log("10 result: ", res.splice(0, 10));
console.log("10 results: ", res.splice(0, 10));
})
1 change: 1 addition & 0 deletions dicts/dict.test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
路塔石
106 changes: 63 additions & 43 deletions nodescws.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,73 +20,94 @@ Handle<Value> Split(const Arguments& args) {
HandleScope scope;

/*
* scws(text, charset, dict, ignore_punct, multi);
* */
* scws(text, charset, dict, rules, ignore_punct, multi);
*/
if (args.Length() < 1) {
ThrowException(Exception::TypeError(String::New("[scws] Wrong number of arguments")));
return scope.Close(Undefined());
}

if (!args[0]->IsString()) {
ThrowException(Exception::TypeError(String::New("[scws] Wrong arguments")));
ThrowException(Exception::TypeError(String::New("[scws] Argument 1 should be the string to segment")));
return scope.Close(Undefined());
}

/*
* setup scws
*/
scws_t ret = scws_new();

v8::String::Utf8Value charset_str(args[1]->ToString());
std::string charset_str_std = std::string(*charset_str);
char *charset = (char *)charset_str_std.c_str();
if (strcmp(charset, "utf8") != 0 && strcmp(charset, "gbk") != 0)
// get settings
v8::Local<v8::Object> Settings = args[1]->ToObject();

// setup charset
std::string Charset(*v8::String::Utf8Value(Settings->Get(String::New("charset"))));
char *charset = (char *)Charset.c_str();
if (strcmp(charset, "undefined") == 0) {
printf("[scws WARNING] charset not specified\n");
charset = "utf8";
}
else if (strcmp(charset, "utf8") != 0 && strcmp(charset, "gbk") != 0)
charset = "utf8";
printf("charset: %s\n", charset);
printf("[scws LOG] charset: %s\n", charset);
scws_set_charset(ret, charset);

// setup dict
v8::String::Utf8Value dicts_str(args[2]->ToString());
std::string dicts_str_std = std::string(*dicts_str);
char *dicts = (char *)dicts_str_std.c_str();
int dict_mode;
if (strchr(dicts, ':') != NULL) {
while (*dicts != '\0') {
char *dict = (char *)malloc(sizeof(char) * MAXDIRLEN);
int i = 0;
while (i < MAXDIRLEN && (*dicts != ':'))
dict[i++] = *dicts++;
if (*dicts != '\0')
dicts++; // skip the ':'
if (strstr(dict, ".txt") != NULL) {
dict_mode = SCWS_XDICT_TXT;
printf("setting dict: txt mode\n");
std::string Dicts(*v8::String::Utf8Value(Settings->Get(String::New("dicts"))));
char *dicts = (char *)Dicts.c_str();
if (strcmp(dicts, "undefined") == 0) {
std::clog<<"[scws WARNING] Dict not specified, loading from the default path\n";
int add_dict_ret = scws_add_dict(ret, "./dicts/dict.utf8.xdb", SCWS_XDICT_XDB);
}
else {
int dict_mode;
if (strchr(dicts, ':') != NULL) {
while (*dicts != '\0') {
char *dict = (char *)malloc(sizeof(char) * MAXDIRLEN);
int i = 0;
while (i < MAXDIRLEN && (*dicts != ':'))
dict[i++] = *dicts++;
dict[i++] = '\0';
if (*dicts != '\0')
dicts++; // skip the ':'
if (strstr(dict, ".txt") != NULL)
dict_mode = SCWS_XDICT_TXT;
else
dict_mode = SCWS_XDICT_XDB;
printf("[scws LOG] setting dict: %s\n", dict);
scws_add_dict(ret, dict, dict_mode);
free(dict);
}
}
else {
if (strstr(dicts, ".txt") != NULL)
dict_mode = SCWS_XDICT_TXT;
else
dict_mode = SCWS_XDICT_XDB;
printf("setting dict: %s\n", dict);
scws_add_dict(ret, dict, dict_mode);
free(dict);
}
printf("setting dict: %s\n", dicts);
scws_add_dict(ret, dicts, dict_mode);
}
}

// set rules
std::string Rule(*v8::String::Utf8Value(Settings->Get(String::New("rule"))));
char *rule = (char *)Rule.c_str();
if (strcmp(rule, "undefined") == 0) {
std::clog<<"[scws WARNING] Rule not specified, loading from the default path\n";
scws_set_rule(ret, "./rules/rules.utf8.ini");
}
else {
if (strstr(dicts, ".txt") != NULL)
dict_mode = SCWS_XDICT_TXT;
else
dict_mode = SCWS_XDICT_XDB;
printf("setting dict: %s\n", dicts);
scws_add_dict(ret, dicts, dict_mode);
scws_set_rule(ret, rule);
printf("[scws LOG] Setting specified rule %s\n", rule);
}

int add_dict_ret = scws_add_dict(ret, "./dicts/dict.utf8.xdb", SCWS_XDICT_XDB);
scws_set_rule(ret, "./rules/rules.utf8.ini");
scws_set_ignore(ret, 1);
if (add_dict_ret < 0)
ThrowException(Exception::Error(String::New("[scws] Can't load dict")));
// set ignore punctuation
Local<Boolean> IgnorePunct = Settings->Get(String::New("ignorePunct"))->ToBoolean();
if (IgnorePunct->BooleanValue())
scws_set_ignore(ret, 1);

v8::String::Utf8Value value_str(args[0]->ToString());
std::string value_str_std = std::string(*value_str);
char *text = (char *)value_str_std.c_str();

std::string Text(*v8::String::Utf8Value(args[0]->ToString()));
char *text = (char *)Text.c_str();
scws_send_text(ret, text, strlen(text));

scws_res_t res;
Expand All @@ -96,7 +117,6 @@ Handle<Value> Split(const Arguments& args) {
scws_result *results_raw = (scws_result *)malloc(memsize);

while ((res = scws_get_result(ret)) != NULL) {
// printf("word: %d\tlimit: %d\n", result_words_count, RESMEMSTEP * memsteps);
while (res != NULL) {
memcpy(&results_raw[result_words_count], res, sizeof(*res));
result_words_count++;
Expand Down
1 change: 1 addition & 0 deletions test_doc.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
我是路塔石,现在测试nodescws
美丽优于丑陋。
清楚优于含糊。
简单优于复杂。
Expand Down

0 comments on commit 71496a2

Please sign in to comment.