From 2022194a086de018d0ac326984654e13450fc376 Mon Sep 17 00:00:00 2001 From: lvli Date: Sat, 27 Jan 2018 16:12:14 +0800 Subject: [PATCH 1/2] change sys out -> log --- pom.xml | 17 +++++++ .../huaban/analysis/jieba/WordDictionary.java | 48 ++++++++++--------- .../analysis/jieba/viterbi/FinalSeg.java | 15 +++--- .../analysis/jieba/JiebaSegmenterTest.java | 16 +++---- 4 files changed, 58 insertions(+), 38 deletions(-) diff --git a/pom.xml b/pom.xml index 5b8bc06..5f4f1c7 100644 --- a/pom.xml +++ b/pom.xml @@ -57,6 +57,23 @@ commons-lang3 3.3.1 + + org.slf4j + slf4j-api + 1.7.25 + + + org.slf4j + slf4j-simple + 1.7.25 + test + + + org.projectlombok + lombok + 1.16.20 + provided + diff --git a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java index 8cb1e53..c4cb9e5 100644 --- a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java +++ b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java @@ -1,5 +1,7 @@ package com.huaban.analysis.jieba; +import lombok.extern.slf4j.Slf4j; + import java.io.BufferedReader; import java.nio.file.DirectoryStream; import java.nio.file.Files; @@ -17,6 +19,7 @@ import java.util.Set; +@Slf4j public class WordDictionary { private static WordDictionary singleton; private static final String MAIN_DICT = "/dict.txt"; @@ -54,7 +57,7 @@ public static WordDictionary getInstance() { */ public void init(Path configFile) { String abspath = configFile.toAbsolutePath().toString(); - System.out.println("initialize user dictionary:" + abspath); + log.info("initialize user dictionary: {}", abspath); synchronized (WordDictionary.class) { if (loadedPath.contains(abspath)) return; @@ -63,14 +66,12 @@ public void init(Path configFile) { try { stream = Files.newDirectoryStream(configFile, String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX)); for (Path path: stream){ - System.err.println(String.format(Locale.getDefault(), "loading dict %s", path.toString())); + log.info("loading dict {}", path); singleton.loadUserDict(path); } loadedPath.add(abspath); } catch (IOException e) { - // TODO Auto-generated catch block - // e.printStackTrace(); - System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString())); + log.error("{}: load user dict failure", configFile, e); } } } @@ -110,19 +111,15 @@ public void loadDict() { entry.setValue((Math.log(entry.getValue() / total))); minFreq = Math.min(entry.getValue(), minFreq); } - System.out.println(String.format(Locale.getDefault(), "main dict load finished, time elapsed %d ms", - System.currentTimeMillis() - s)); - } - catch (IOException e) { - System.err.println(String.format(Locale.getDefault(), "%s load failure!", MAIN_DICT)); - } - finally { + log.info("main dict load finished, time elapsed {} ms", System.currentTimeMillis() - s); + } catch (IOException e) { + log.error("{} load failure", MAIN_DICT, e); + } finally { try { if (null != is) is.close(); - } - catch (IOException e) { - System.err.println(String.format(Locale.getDefault(), "%s close failure!", MAIN_DICT)); + } catch (IOException e) { + log.error("{} close failure!", MAIN_DICT, e); } } } @@ -144,9 +141,10 @@ public void loadUserDict(Path userDict) { } - public void loadUserDict(Path userDict, Charset charset) { + public void loadUserDict(Path userDict, Charset charset) { + BufferedReader br = null; try { - BufferedReader br = Files.newBufferedReader(userDict, charset); + br = Files.newBufferedReader(userDict, charset); long s = System.currentTimeMillis(); int count = 0; while (br.ready()) { @@ -167,11 +165,17 @@ public void loadUserDict(Path userDict, Charset charset) { freqs.put(word, Math.log(freq / total)); count++; } - System.out.println(String.format(Locale.getDefault(), "user dict %s load finished, tot words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s)); - br.close(); - } - catch (IOException e) { - System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString())); + log.info("user dict {} load finished, total words: {}, time elapsed: {} ms", userDict, count, System.currentTimeMillis() - s); + } catch (IOException e) { + log.error("{}: load user dict failure!", userDict, e); + } finally { + if (br != null) { + try { + br.close(); + } catch (IOException e) { + log.error("{} close failure!", userDict, e); + } + } } } diff --git a/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java b/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java index 8a79eb5..9be5806 100644 --- a/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java +++ b/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java @@ -16,8 +16,10 @@ import com.huaban.analysis.jieba.CharacterUtil; import com.huaban.analysis.jieba.Pair; import com.huaban.analysis.jieba.Node; +import lombok.extern.slf4j.Slf4j; +@Slf4j public class FinalSeg { private static FinalSeg singleInstance; private static final String PROB_EMIT = "/prob_emit.txt"; @@ -90,21 +92,18 @@ private void loadModel() { values.put(tokens[0].charAt(0), Double.valueOf(tokens[1])); } } - } - catch (IOException e) { - System.err.println(String.format(Locale.getDefault(), "%s: load model failure!", PROB_EMIT)); - } - finally { + } catch (IOException e) { + log.error("{}: load model failure!", PROB_EMIT, e); + } finally { try { if (null != is) is.close(); } catch (IOException e) { - System.err.println(String.format(Locale.getDefault(), "%s: close failure!", PROB_EMIT)); + log.error("{}: close failure!", PROB_EMIT, e); } } - System.out.println(String.format(Locale.getDefault(), "model load finished, time elapsed %d ms.", - System.currentTimeMillis() - s)); + log.info("model load finished, time elapsed {} ms.", System.currentTimeMillis() - s); } diff --git a/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java b/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java index b31f98b..b91cab4 100644 --- a/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java +++ b/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java @@ -3,7 +3,6 @@ */ package com.huaban.analysis.jieba; -import java.io.File; import java.nio.charset.StandardCharsets; import java.nio.file.Paths; import java.util.List; @@ -14,6 +13,8 @@ import org.junit.Test; import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -21,6 +22,7 @@ * */ public class JiebaSegmenterTest extends TestCase { + private Logger logger = LoggerFactory.getLogger(JiebaSegmenterTest.class); private JiebaSegmenter segmenter = new JiebaSegmenter(); String[] sentences = new String[] { @@ -133,7 +135,7 @@ protected void tearDown() throws Exception { public void testCutForSearch() { for (String sentence : sentences) { List tokens = segmenter.process(sentence, SegMode.SEARCH); - System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString())); + logger.info("\n{}\n{}", sentence, tokens); } } @@ -142,7 +144,7 @@ public void testCutForSearch() { public void testCutForIndex() { for (String sentence : sentences) { List tokens = segmenter.process(sentence, SegMode.INDEX); - System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString())); + logger.info("\n{}\n{}", sentence, tokens); } } @@ -159,7 +161,7 @@ public void testBugSentence() { "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 " }; for (String sentence : bugs) { List tokens = segmenter.process(sentence, SegMode.SEARCH); - System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString())); + logger.info("\n{}\n{}", sentence, tokens); } } @@ -176,8 +178,7 @@ public void testSegmentSpeed() { wordCount += sentence.length(); } long elapsed = (System.currentTimeMillis() - start); - System.out.println(String.format(Locale.getDefault(), "time elapsed:%d, rate:%fkb/s, sentences:%.2f/s", elapsed, - (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f), wordCount * 1000.0f / (elapsed * 1.0))); + logger.info("time elapsed: {}, rate: {}kb/s, sentences: {}/s", elapsed, length / 1024.0 / (elapsed / 1000.0), wordCount * 1000.0 / elapsed); } @@ -193,7 +194,6 @@ public void testLongTextSegmentSpeed() { wordCount += sentence.length(); } long elapsed = (System.currentTimeMillis() - start); - System.out.println(String.format(Locale.getDefault(), "time elapsed:%d, rate:%fkb/s, sentences:%.2f/s", elapsed, - (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f), wordCount * 1000.0f / (elapsed * 1.0))); + logger.info("time elapsed: {}, rate: {}kb/s, sentences: {}/s", elapsed, length / 1024.0 / (elapsed / 1000.0), wordCount * 1000.0 / elapsed); } } From f68d62d529577db150cf86dcf76df65e124aa610 Mon Sep 17 00:00:00 2001 From: lvli Date: Sat, 27 Jan 2018 16:33:10 +0800 Subject: [PATCH 2/2] add read user dict from InputStream --- .../huaban/analysis/jieba/WordDictionary.java | 68 ++++++++++++------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java index c4cb9e5..a80b2a4 100644 --- a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java +++ b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java @@ -140,45 +140,65 @@ public void loadUserDict(Path userDict) { loadUserDict(userDict, StandardCharsets.UTF_8); } - public void loadUserDict(Path userDict, Charset charset) { BufferedReader br = null; try { + log.info("to read user dict {}", userDict); br = Files.newBufferedReader(userDict, charset); - long s = System.currentTimeMillis(); - int count = 0; - while (br.ready()) { - String line = br.readLine(); - String[] tokens = line.split("[\t ]+"); - - if (tokens.length < 1) { - // Ignore empty line - continue; - } - - String word = tokens[0]; - - double freq = 3.0d; - if (tokens.length == 2) - freq = Double.valueOf(tokens[1]); - word = addWord(word); - freqs.put(word, Math.log(freq / total)); - count++; - } - log.info("user dict {} load finished, total words: {}, time elapsed: {} ms", userDict, count, System.currentTimeMillis() - s); + loadUserDict(br); } catch (IOException e) { - log.error("{}: load user dict failure!", userDict, e); + log.error("load user dict {} failure!", userDict, e); } finally { if (br != null) { try { br.close(); } catch (IOException e) { - log.error("{} close failure!", userDict, e); + log.error("close BufferedReader failure!", e); } } } } + public void loadUserDict(InputStream is) { + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); + try { + log.info("to read user dict from InputStream"); + loadUserDict(br); + } catch (IOException e) { + log.error("load user dict failure!", e); + } finally { + try { + br.close(); + } catch (IOException e) { + log.error("close BufferedReader failure!", e); + } + } + } + + public void loadUserDict(BufferedReader br) throws IOException { + long s = System.currentTimeMillis(); + int count = 0; + while (br.ready()) { + String line = br.readLine(); + String[] tokens = line.split("[\t ]+"); + + if (tokens.length < 1) { + // Ignore empty line + continue; + } + + String word = tokens[0]; + + double freq = 3.0d; + if (tokens.length == 2) + freq = Double.valueOf(tokens[1]); + word = addWord(word); + freqs.put(word, Math.log(freq / total)); + count++; + } + log.info("user dict load finished, total words: {}, time elapsed: {} ms", count, System.currentTimeMillis() - s); + } + public DictSegment getTrie() { return this._dict;