Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add slf4j log and load user dict from InputStream #66

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,23 @@
<artifactId>commons-lang3</artifactId>
<version>3.3.1</version>
</dependency>
<!-- SLF4J API: compile-time logging facade used by WordDictionary/FinalSeg (@Slf4j) -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.25</version>
</dependency>
<!-- Simple SLF4J binding so log output is visible when running the unit tests;
     test scope keeps it off the published compile classpath -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.7.25</version>
<scope>test</scope>
</dependency>
<!-- Lombok generates the static `log` field for @Slf4j at compile time only,
     hence provided scope (no runtime dependency) -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.20</version>
<scope>provided</scope>
</dependency>
</dependencies>

<build>
Expand Down
104 changes: 64 additions & 40 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.huaban.analysis.jieba;

import lombok.extern.slf4j.Slf4j;

import java.io.BufferedReader;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
Expand All @@ -17,6 +19,7 @@
import java.util.Set;


@Slf4j
public class WordDictionary {
private static WordDictionary singleton;
private static final String MAIN_DICT = "/dict.txt";
Expand Down Expand Up @@ -54,7 +57,7 @@ public static WordDictionary getInstance() {
*/
public void init(Path configFile) {
String abspath = configFile.toAbsolutePath().toString();
System.out.println("initialize user dictionary:" + abspath);
log.info("initialize user dictionary: {}", abspath);
synchronized (WordDictionary.class) {
if (loadedPath.contains(abspath))
return;
Expand All @@ -63,14 +66,12 @@ public void init(Path configFile) {
try {
stream = Files.newDirectoryStream(configFile, String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX));
for (Path path: stream){
System.err.println(String.format(Locale.getDefault(), "loading dict %s", path.toString()));
log.info("loading dict {}", path);
singleton.loadUserDict(path);
}
loadedPath.add(abspath);
} catch (IOException e) {
// TODO Auto-generated catch block
// e.printStackTrace();
System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString()));
log.error("{}: load user dict failure", configFile, e);
}
}
}
Expand Down Expand Up @@ -110,19 +111,15 @@ public void loadDict() {
entry.setValue((Math.log(entry.getValue() / total)));
minFreq = Math.min(entry.getValue(), minFreq);
}
System.out.println(String.format(Locale.getDefault(), "main dict load finished, time elapsed %d ms",
System.currentTimeMillis() - s));
}
catch (IOException e) {
System.err.println(String.format(Locale.getDefault(), "%s load failure!", MAIN_DICT));
}
finally {
log.info("main dict load finished, time elapsed {} ms", System.currentTimeMillis() - s);
} catch (IOException e) {
log.error("{} load failure", MAIN_DICT, e);
} finally {
try {
if (null != is)
is.close();
}
catch (IOException e) {
System.err.println(String.format(Locale.getDefault(), "%s close failure!", MAIN_DICT));
} catch (IOException e) {
log.error("{} close failure!", MAIN_DICT, e);
}
}
}
Expand All @@ -143,38 +140,65 @@ public void loadUserDict(Path userDict) {
loadUserDict(userDict, StandardCharsets.UTF_8);
}


public void loadUserDict(Path userDict, Charset charset) {
public void loadUserDict(Path userDict, Charset charset) {
BufferedReader br = null;
try {
BufferedReader br = Files.newBufferedReader(userDict, charset);
long s = System.currentTimeMillis();
int count = 0;
while (br.ready()) {
String line = br.readLine();
String[] tokens = line.split("[\t ]+");

if (tokens.length < 1) {
// Ignore empty line
continue;
log.info("to read user dict {}", userDict);
br = Files.newBufferedReader(userDict, charset);
loadUserDict(br);
} catch (IOException e) {
log.error("load user dict {} failure!", userDict, e);
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
log.error("close BufferedReader failure!", e);
}

String word = tokens[0];

double freq = 3.0d;
if (tokens.length == 2)
freq = Double.valueOf(tokens[1]);
word = addWord(word);
freqs.put(word, Math.log(freq / total));
count++;
}
System.out.println(String.format(Locale.getDefault(), "user dict %s load finished, tot words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s));
br.close();
}
catch (IOException e) {
System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString()));
}

/**
 * Loads a user dictionary from an arbitrary stream (e.g. a classpath resource),
 * decoding it as UTF-8 and delegating the line parsing to
 * {@link #loadUserDict(BufferedReader)}.
 *
 * @param is source of dictionary lines; wrapped in a reader that this method
 *           always closes, and therefore the stream itself is closed too
 */
public void loadUserDict(InputStream is) {
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    try {
        log.info("to read user dict from InputStream");
        loadUserDict(reader);
    } catch (IOException readFailure) {
        // Best-effort load: a broken stream must not take the segmenter down.
        log.error("load user dict failure!", readFailure);
    } finally {
        try {
            reader.close();
        } catch (IOException closeFailure) {
            log.error("close BufferedReader failure!", closeFailure);
        }
    }
}

/**
 * Parses user-dictionary lines of the form {@code word [freq]} (whitespace- or
 * tab-separated) from the given reader and registers each word with its
 * log-scaled frequency. The reader is NOT closed here; callers own it.
 *
 * @param br reader positioned at the start of the dictionary content
 * @throws IOException if reading from the underlying stream fails
 */
public void loadUserDict(BufferedReader br) throws IOException {
    long start = System.currentTimeMillis();
    int count = 0;
    String line;
    // readLine() == null is the reliable EOF signal; BufferedReader.ready()
    // only says whether a read would block, so it can stop early on slow
    // streams (e.g. network InputStreams) and drop dictionary entries.
    while ((line = br.readLine()) != null) {
        // "".split("[\t ]+") returns [""] (length 1), so a length check can
        // never catch blank lines; test the line itself to avoid inserting
        // the empty string as a word.
        if (line.trim().isEmpty())
            continue;

        String[] tokens = line.split("[\t ]+");
        String word = tokens[0];

        double freq = 3.0d; // default frequency when the line omits one
        if (tokens.length == 2)
            freq = Double.parseDouble(tokens[1]); // parseDouble: no boxing
        word = addWord(word);
        freqs.put(word, Math.log(freq / total));
        count++;
    }
    log.info("user dict load finished, total words: {}, time elapsed: {} ms", count, System.currentTimeMillis() - start);
}


public DictSegment getTrie() {
return this._dict;
Expand Down
15 changes: 7 additions & 8 deletions src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
import com.huaban.analysis.jieba.CharacterUtil;
import com.huaban.analysis.jieba.Pair;
import com.huaban.analysis.jieba.Node;
import lombok.extern.slf4j.Slf4j;


@Slf4j
public class FinalSeg {
private static FinalSeg singleInstance;
private static final String PROB_EMIT = "/prob_emit.txt";
Expand Down Expand Up @@ -90,21 +92,18 @@ private void loadModel() {
values.put(tokens[0].charAt(0), Double.valueOf(tokens[1]));
}
}
}
catch (IOException e) {
System.err.println(String.format(Locale.getDefault(), "%s: load model failure!", PROB_EMIT));
}
finally {
} catch (IOException e) {
log.error("{}: load model failure!", PROB_EMIT, e);
} finally {
try {
if (null != is)
is.close();
}
catch (IOException e) {
System.err.println(String.format(Locale.getDefault(), "%s: close failure!", PROB_EMIT));
log.error("{}: close failure!", PROB_EMIT, e);
}
}
System.out.println(String.format(Locale.getDefault(), "model load finished, time elapsed %d ms.",
System.currentTimeMillis() - s));
log.info("model load finished, time elapsed {} ms.", System.currentTimeMillis() - s);
}


Expand Down
16 changes: 8 additions & 8 deletions src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
*/
package com.huaban.analysis.jieba;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.List;
Expand All @@ -14,13 +13,16 @@
import org.junit.Test;

import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
* @author matrix
*
*/
public class JiebaSegmenterTest extends TestCase {
private Logger logger = LoggerFactory.getLogger(JiebaSegmenterTest.class);
private JiebaSegmenter segmenter = new JiebaSegmenter();
String[] sentences =
new String[] {
Expand Down Expand Up @@ -133,7 +135,7 @@ protected void tearDown() throws Exception {
public void testCutForSearch() {
for (String sentence : sentences) {
List<SegToken> tokens = segmenter.process(sentence, SegMode.SEARCH);
System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString()));
logger.info("\n{}\n{}", sentence, tokens);
}
}

Expand All @@ -142,7 +144,7 @@ public void testCutForSearch() {
public void testCutForIndex() {
for (String sentence : sentences) {
List<SegToken> tokens = segmenter.process(sentence, SegMode.INDEX);
System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString()));
logger.info("\n{}\n{}", sentence, tokens);
}
}

Expand All @@ -159,7 +161,7 @@ public void testBugSentence() {
"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 " };
for (String sentence : bugs) {
List<SegToken> tokens = segmenter.process(sentence, SegMode.SEARCH);
System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString()));
logger.info("\n{}\n{}", sentence, tokens);
}
}

Expand All @@ -176,8 +178,7 @@ public void testSegmentSpeed() {
wordCount += sentence.length();
}
long elapsed = (System.currentTimeMillis() - start);
System.out.println(String.format(Locale.getDefault(), "time elapsed:%d, rate:%fkb/s, sentences:%.2f/s", elapsed,
(length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f), wordCount * 1000.0f / (elapsed * 1.0)));
logger.info("time elapsed: {}, rate: {}kb/s, sentences: {}/s", elapsed, length / 1024.0 / (elapsed / 1000.0), wordCount * 1000.0 / elapsed);
}


Expand All @@ -193,7 +194,6 @@ public void testLongTextSegmentSpeed() {
wordCount += sentence.length();
}
long elapsed = (System.currentTimeMillis() - start);
System.out.println(String.format(Locale.getDefault(), "time elapsed:%d, rate:%fkb/s, sentences:%.2f/s", elapsed,
(length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f), wordCount * 1000.0f / (elapsed * 1.0)));
logger.info("time elapsed: {}, rate: {}kb/s, sentences: {}/s", elapsed, length / 1024.0 / (elapsed / 1000.0), wordCount * 1000.0 / elapsed);
}
}