Skip to content

Commit

Permalink
0.26 版
Browse files Browse the repository at this point in the history
  • Loading branch information
fukuball committed Nov 4, 2017
1 parent 0283cf6 commit ac88923
Show file tree
Hide file tree
Showing 16 changed files with 77 additions and 28 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
[![Codacy Badge](https://api.codacy.com/project/badge/grade/9360ebe8fc2d47d8a64f49f57d2f016f)](https://www.codacy.com/app/fukuball/jieba-php)
[![Made with Love](https://img.shields.io/badge/made%20with-%e2%9d%a4-ff69b4.svg)](http://www.fukuball.com)

"結巴"中文分詞:做最好的 PHP 中文分詞、中文斷詞組件,目前翻譯版本為 jieba-0.25 版本,未來再慢慢往上升級,效能也需要再改善,請有興趣的開發者一起加入開發!若想使用 Python 版本請前往 [fxsjy/jieba](https://github.com/fxsjy/jieba)
"結巴"中文分詞:做最好的 PHP 中文分詞、中文斷詞組件,目前翻譯版本為 jieba-0.26 版本,未來再慢慢往上升級,效能也需要再改善,請有興趣的開發者一起加入開發!若想使用 Python 版本請前往 [fxsjy/jieba](https://github.com/fxsjy/jieba)

現在已經可以支援繁體中文!只要將字典切換為 big 模式即可!

Expand Down
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"description": "結巴中文分詞(PHP 版本):做最好的 PHP 中文分詞、中文斷詞組件",
"keywords": ["Jieba", "PHP"],
"license": "MIT",
"version": "0.25",
"version": "0.26",
"authors": [
{
"name": "fukuball",
Expand Down
2 changes: 1 addition & 1 deletion src/class/Finalseg.php
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ public static function cut($sentence, $options = array())
$seg_list = array();

$re_cjk_pattern = '([\x{3040}-\x{309F}]+)|([\x{30A0}-\x{30FF}]+)|([\x{4E00}-\x{9FA5}]+)|([\x{AC00}-\x{D7AF}]+)';
$re_skip_pattern = '([a-zA-Z0-9+#\r\n]+)';
$re_skip_pattern = '([a-zA-Z0-9+&=#\r\n]+)';
preg_match_all(
'/('.$re_cjk_pattern.'|'.$re_skip_pattern.')/u',
$sentence,
Expand Down
8 changes: 5 additions & 3 deletions src/class/Jieba.php
Original file line number Diff line number Diff line change
Expand Up @@ -439,17 +439,19 @@ public static function cut($sentence, $cut_all = false, $options = array())
$seg_list = array();

$re_han_pattern = '([\x{4E00}-\x{9FA5}]+)';
$re_han_with_ascii_pattern = '([\x{4E00}-\x{9FA5}a-zA-Z0-9+#&=]+)';
$re_kanjikana_pattern = '([\x{3040}-\x{309F}\x{4E00}-\x{9FA5}]+)';
$re_katakana_pattern = '([\x{30A0}-\x{30FF}]+)';
$re_hangul_pattern = '([\x{AC00}-\x{D7AF}]+)';
$re_ascii_pattern = '([a-zA-Z0-9+#\r\n]+)';
$re_ascii_pattern = '([a-zA-Z0-9+#&=\r\n]+)';
$re_skip_pattern = '(\s+)';

if (self::$cjk_all) {
$filter_pattern = $re_kanjikana_pattern.
'|'.$re_katakana_pattern.
'|'.$re_hangul_pattern;
} else {
$filter_pattern = $re_han_pattern;
$filter_pattern = $re_han_with_ascii_pattern;
}

preg_match_all(
Expand All @@ -465,7 +467,7 @@ public static function cut($sentence, $cut_all = false, $options = array())
// skip korean
$filter_pattern = $re_kanjikana_pattern.'|'.$re_katakana_pattern;
} else {
$filter_pattern = $re_han_pattern;
$filter_pattern = $re_han_with_ascii_pattern;
}

if (preg_match('/'.$filter_pattern.'/u', $blk)) {
Expand Down
19 changes: 17 additions & 2 deletions src/class/JiebaAnalyse.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ class JiebaAnalyse

public static $idf_freq = array();
public static $max_idf = 0;
public static $median_idf = 0;
/**
 * English stop words, stored in lower case.
 *
 * extractTags() lower-cases each token and skips it when it is found in
 * this list, so every entry here must be lower case. Each word is listed
 * once; a duplicate entry would not change lookup results.
 */
public static $stop_words = [
    "the", "of", "is", "and", "to", "in", "that", "we",
    "for", "an", "are", "by", "be", "as", "on", "with",
    "can", "if", "from", "which", "you", "it", "this",
    "then", "at", "have", "all", "not", "one", "has",
    "or",
];

/**
* Static method init
Expand Down Expand Up @@ -57,8 +65,11 @@ public static function init($options = array())
}
fclose($content);

asort(self::$idf_freq);
$keys = array_keys(self::$idf_freq);
$middle_key = $keys[count(self::$idf_freq)/2];
self::$max_idf = max(self::$idf_freq);

self::$median_idf = self::$idf_freq[$middle_key];
}// end function init

/**
Expand Down Expand Up @@ -91,6 +102,10 @@ public static function extractTags($content, $top_k = 20, $options = array())
if (mb_strlen($w, 'UTF-8')<2) {
continue;
}

if (in_array(strtolower($w), self::$stop_words)) {
continue;
}
if (isset($freq[$w])) {
$freq[$w] = $freq[$w] + 1.0;
} else {
Expand All @@ -109,7 +124,7 @@ public static function extractTags($content, $top_k = 20, $options = array())
if (isset(self::$idf_freq[$k])) {
$idf_freq = self::$idf_freq[$k];
} else {
$idf_freq = self::$max_idf;
$idf_freq = self::$median_idf;
}
$tf_idf_list[$k] = $v * $idf_freq;
}
Expand Down
15 changes: 15 additions & 0 deletions src/cmd/demo.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,27 @@
Jieba::init(array('mode'=>'test','dict'=>'big'));
Finalseg::init();

$seg_list = Jieba::cut("AT&T是一件不错的公司,给你发offer了吗");
var_dump($seg_list);

$seg_list = Jieba::cut("张晓梅去人民医院做了个B超然后去买了件T恤");
var_dump($seg_list);

$seg_list = Jieba::cut("C++和c#是什么关系?11+122=133,是吗?");
var_dump($seg_list);

$seg_list = Jieba::cut("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究");
var_dump($seg_list);

$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
var_dump($seg_list);

$seg_list = Jieba::cut("憐香惜玉也得要看對象啊!");
var_dump($seg_list);

$seg_list = Jieba::cut("How are you? 我是 ABC!");
var_dump($seg_list);

echo "Full Mode: \n";
$seg_list = Jieba::cut("我来到北京清华大学", true);
var_dump($seg_list);
Expand Down
8 changes: 7 additions & 1 deletion src/dict/dict.big.txt
Original file line number Diff line number Diff line change
Expand Up @@ -608335,4 +608335,10 @@ B超 3 n
龜齡鶴算 3 n
龜龍片甲 3 nz
龜龍麟鳳 3 ns
臺灣 251 ns
臺灣 251 ns
T恤 4 n
C++ 3 nz
c++ 3 nz
C# 3 nz
c# 3 nz
AT&T 3 nz
2 changes: 1 addition & 1 deletion src/dict/dict.big.txt.cache.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/dict/dict.big.txt.json

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion src/dict/dict.small.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109747,4 +109747,10 @@
龙蛇混杂 10 i
龙骨山 10 nr
龚松林 10 nr
龟板 10 n
龟板 10 n
T恤 4 n
C++ 3 nz
c++ 3 nz
C# 3 nz
c# 3 nz
AT&T 3 nz
2 changes: 1 addition & 1 deletion src/dict/dict.small.txt.cache.json

Large diffs are not rendered by default.

Loading

0 comments on commit ac88923

Please sign in to comment.