diff --git a/addon.js b/addon.js index 5a71293..fea8e5f 100644 --- a/addon.js +++ b/addon.js @@ -9,9 +9,14 @@ fs.readFile("./test_doc.txt", { return console.error(err); start = new Date().valueOf(); var scws = require("./build/Release/scws"); - // console.log("going to segment:" + data); - var res = scws.segment(data, "utf8", "./dicts/dict.utf8.xdb:./dicts/dict_cht.utf8.xdb"); + var res = scws.segment(data, { + charset: "utf8", + //dicts: "./dicts/dict.utf8.xdb:./dicts/dict_cht.utf8.xdb:./dicts/dict.test.txt", + dicts: "./dicts/dict.utf8.xdb", + rule: "./rules/rules.utf8.ini", + ignorePunct: true + }); end = new Date().valueOf(); console.log("time used: " + (end - start)); - console.log("10 result: ", res.splice(0, 10)); + console.log("10 results: ", res.splice(0, 10)); }) diff --git a/dicts/dict.test.txt b/dicts/dict.test.txt new file mode 100644 index 0000000..ec30a05 --- /dev/null +++ b/dicts/dict.test.txt @@ -0,0 +1 @@ +路塔石 diff --git a/nodescws.cc b/nodescws.cc index eeea067..22034af 100644 --- a/nodescws.cc +++ b/nodescws.cc @@ -20,15 +20,15 @@ Handle Split(const Arguments& args) { HandleScope scope; /* - * scws(text, charset, dict, ignore_punct, multi); - * */ + * scws(text, charset, dict, rules, ignore_punct, multi); + */ if (args.Length() < 1) { ThrowException(Exception::TypeError(String::New("[scws] Wrong number of arguments"))); return scope.Close(Undefined()); } if (!args[0]->IsString()) { - ThrowException(Exception::TypeError(String::New("[scws] Wrong arguments"))); + ThrowException(Exception::TypeError(String::New("[scws] Argument 1 should be the string to segment"))); return scope.Close(Undefined()); } @@ -36,57 +36,78 @@ Handle Split(const Arguments& args) { * setup scws */ scws_t ret = scws_new(); - - v8::String::Utf8Value charset_str(args[1]->ToString()); - std::string charset_str_std = std::string(*charset_str); - char *charset = (char *)charset_str_std.c_str(); - if (strcmp(charset, "utf8") != 0 && strcmp(charset, "gbk") != 0) + // get settings + v8::Local Settings = args[1]->ToObject(); + + // setup charset + std::string Charset(*v8::String::Utf8Value(Settings->Get(String::New("charset")))); + char *charset = (char *)Charset.c_str(); + if (strcmp(charset, "undefined") == 0) { + printf("[scws WARNING] charset not specified\n"); + charset = "utf8"; + } + else if (strcmp(charset, "utf8") != 0 && strcmp(charset, "gbk") != 0) charset = "utf8"; - printf("charset: %s\n", charset); + printf("[scws LOG] charset: %s\n", charset); scws_set_charset(ret, charset); // setup dict - v8::String::Utf8Value dicts_str(args[2]->ToString()); - std::string dicts_str_std = std::string(*dicts_str); - char *dicts = (char *)dicts_str_std.c_str(); - int dict_mode; - if (strchr(dicts, ':') != NULL) { - while (*dicts != '\0') { - char *dict = (char *)malloc(sizeof(char) * MAXDIRLEN); - int i = 0; - while (i < MAXDIRLEN && (*dicts != ':')) - dict[i++] = *dicts++; - if (*dicts != '\0') - dicts++; // skip the ':' - if (strstr(dict, ".txt") != NULL) { - dict_mode = SCWS_XDICT_TXT; - printf("setting dict: txt mode\n"); + std::string Dicts(*v8::String::Utf8Value(Settings->Get(String::New("dicts")))); + char *dicts = (char *)Dicts.c_str(); + if (strcmp(dicts, "undefined") == 0) { + std::clog<<"[scws WARNING] Dict not specified, loading from the default path\n"; + int add_dict_ret = scws_add_dict(ret, "./dicts/dict.utf8.xdb", SCWS_XDICT_XDB); + } + else { + int dict_mode; + if (strchr(dicts, ':') != NULL) { + while (*dicts != '\0') { + char *dict = (char *)malloc(sizeof(char) * MAXDIRLEN); + int i = 0; + while (i < MAXDIRLEN && (*dicts != ':')) + dict[i++] = *dicts++; + dict[i++] = '\0'; + if (*dicts != '\0') + dicts++; // skip the ':' + if (strstr(dict, ".txt") != NULL) + dict_mode = SCWS_XDICT_TXT; + else + dict_mode = SCWS_XDICT_XDB; + printf("[scws LOG] setting dict: %s\n", dict); + scws_add_dict(ret, dict, dict_mode); + free(dict); } + } + else { + if (strstr(dicts, ".txt") != NULL) + dict_mode = SCWS_XDICT_TXT; else dict_mode = SCWS_XDICT_XDB; - printf("setting dict: %s\n", dict); - scws_add_dict(ret, dict, dict_mode); - free(dict); - } + printf("setting dict: %s\n", dicts); + scws_add_dict(ret, dicts, dict_mode); + } + } + + // set rules + std::string Rule(*v8::String::Utf8Value(Settings->Get(String::New("rule")))); + char *rule = (char *)Rule.c_str(); + if (strcmp(rule, "undefined") == 0) { + std::clog<<"[scws WARNING] Rule not specified, loading from the default path\n"; + scws_set_rule(ret, "./rules/rules.utf8.ini"); } else { - if (strstr(dicts, ".txt") != NULL) - dict_mode = SCWS_XDICT_TXT; - else - dict_mode = SCWS_XDICT_XDB; - printf("setting dict: %s\n", dicts); - scws_add_dict(ret, dicts, dict_mode); + scws_set_rule(ret, rule); + printf("[scws LOG] Setting specified rule %s\n", rule); } - int add_dict_ret = scws_add_dict(ret, "./dicts/dict.utf8.xdb", SCWS_XDICT_XDB); - scws_set_rule(ret, "./rules/rules.utf8.ini"); - scws_set_ignore(ret, 1); - if (add_dict_ret < 0) - ThrowException(Exception::Error(String::New("[scws] Can't load dict"))); + // set ignore punctuation + Local IgnorePunct = Settings->Get(String::New("ignorePunct"))->ToBoolean(); + if (IgnorePunct->BooleanValue()) + scws_set_ignore(ret, 1); - v8::String::Utf8Value value_str(args[0]->ToString()); - std::string value_str_std = std::string(*value_str); - char *text = (char *)value_str_std.c_str(); + + std::string Text(*v8::String::Utf8Value(args[0]->ToString())); + char *text = (char *)Text.c_str(); scws_send_text(ret, text, strlen(text)); scws_res_t res; @@ -96,7 +117,6 @@ Handle Split(const Arguments& args) { scws_result *results_raw = (scws_result *)malloc(memsize); while ((res = scws_get_result(ret)) != NULL) { - // printf("word: %d\tlimit: %d\n", result_words_count, RESMEMSTEP * memsteps); while (res != NULL) { memcpy(&results_raw[result_words_count], res, sizeof(*res)); result_words_count++; diff --git a/test_doc.txt b/test_doc.txt index f1c7686..f1b7cc2 100644 --- a/test_doc.txt +++ b/test_doc.txt @@ -1,3 +1,4 @@ +我是路塔石,现在测试nodescws 美丽优于丑陋。 清楚优于含糊。 简单优于复杂。