Skip to content

Commit

Permalink
0.32 版
Browse files Browse the repository at this point in the history
  • Loading branch information
fukuball committed Nov 22, 2017
1 parent e8eb653 commit f876c82
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 12 deletions.
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"description": "結巴中文分詞(PHP 版本):做最好的 PHP 中文分詞、中文斷詞組件",
"keywords": ["Jieba", "PHP"],
"license": "MIT",
"version": "0.31",
"version": "0.32",
"authors": [
{
"name": "fukuball",
Expand Down
73 changes: 67 additions & 6 deletions src/class/Jieba.php
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,9 @@ public static function addWord($word, $freq, $tag = '', $options = array())
*
* @return array
*/
public static function tokenize($sentence, $options = array())
public static function tokenize($sentence, $options = array("HMM" => true))
{
$seg_list = self::cut($sentence);
$seg_list = self::cut($sentence, false, array("HMM" => $options["HMM"]));
$tokenize_list = [];
$start = 0;
$end = 0;
Expand Down Expand Up @@ -488,6 +488,60 @@ public static function __cutDAG($sentence, $options = array())
return $words;
}// end function __cutDAG

/**
 * Static method __cutDAGNoHMM
 *
 * Segments a sentence by walking the route computed from the dictionary
 * DAG, without any HMM-based handling of unknown words. Consecutive
 * single characters from the set [a-zA-Z+#] are buffered and emitted as
 * one token; every other route segment is emitted as-is.
 *
 * @param string $sentence # input sentence
 * @param array  $options  # other options (accepted for signature parity
 *                         # with the sibling cut methods; not read here)
 *
 * @return array $words
 */
public static function __cutDAGNoHMM($sentence, $options = array())
{
    $words = array();

    $N = mb_strlen($sentence, 'UTF-8');
    $DAG = self::getDAG($sentence);

    // NOTE(review): self::$route is presumably populated by calc();
    // it is read immediately below.
    self::calc($sentence, $DAG);

    $x = 0;
    $buf = '';

    // Exactly ONE character from the class. The previous unanchored
    // '[a-zA-Z+#]+' matched anywhere inside $l_word, so a multi-character
    // dictionary word containing a Latin letter was pushed into the buffer
    // and glued to neighbouring tokens. \z (not $) so that a trailing
    // newline cannot sneak through.
    $re_eng_pattern = '^[a-zA-Z+#]\z';

    while ($x < $N) {
        // The first key of the route entry at $x is used as the (inclusive)
        // end offset of the segment starting at $x.
        $current_route_keys = array_keys(self::$route[$x]);
        $y = $current_route_keys[0]+1;
        $l_word = mb_substr($sentence, $x, ($y-$x), 'UTF-8');
        $x = $y;

        if (preg_match('/'.$re_eng_pattern.'/u', $l_word)) {
            // Keep collecting the alphabetic run.
            $buf .= $l_word;
            continue;
        }

        // A non-alphabetic segment ends any pending run.
        if ($buf !== '') {
            $words[] = $buf;
            $buf = '';
        }
        $words[] = $l_word;
    }

    // Flush a run that reaches the end of the sentence.
    if ($buf !== '') {
        $words[] = $buf;
    }

    return $words;
}// end function __cutDAGNoHMM

/**
* Static method cut
*
Expand All @@ -497,7 +551,7 @@ public static function __cutDAG($sentence, $options = array())
*
* @return array $seg_list
*/
public static function cut($sentence, $cut_all = false, $options = array())
public static function cut($sentence, $cut_all = false, $options = array("HMM" => true))
{
$defaults = array(
'mode'=>'default'
Expand Down Expand Up @@ -538,6 +592,9 @@ public static function cut($sentence, $cut_all = false, $options = array())
$blocks = $matches[0];

foreach ($blocks as $blk) {
if (mb_strlen($blk, 'UTF-8')==0) {
continue;
}
if (self::$cjk_all) {
// skip korean
$filter_pattern = $re_kanjikana_pattern.'|'.$re_katakana_pattern;
Expand All @@ -549,7 +606,11 @@ public static function cut($sentence, $cut_all = false, $options = array())
if ($cut_all) {
$words = Jieba::__cutAll($blk);
} else {
$words = Jieba::__cutDAG($blk);
if ($options['HMM']) {
$words = Jieba::__cutDAG($blk);
} else {
$words = Jieba::__cutDAGNoHMM($blk);
}
}

foreach ($words as $word) {
Expand Down Expand Up @@ -595,7 +656,7 @@ public static function cut($sentence, $cut_all = false, $options = array())
*
* @return array $seg_list
*/
public static function cutForSearch($sentence, $options = array())
public static function cutForSearch($sentence, $options = array("HMM" => true))
{
$defaults = array(
'mode'=>'default'
Expand All @@ -605,7 +666,7 @@ public static function cutForSearch($sentence, $options = array())

$seg_list = array();

$cut_seg_list = Jieba::cut($sentence);
$cut_seg_list = Jieba::cut($sentence, false, array("HMM" => $options["HMM"]));

foreach ($cut_seg_list as $w) {
$len = mb_strlen($w, 'UTF-8');
Expand Down
87 changes: 84 additions & 3 deletions src/class/Posseg.php
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ public static function viterbi($sentence, $options = array())
*
* @return array $words
*/
public static function __cut($sentence, $options = array())
public static function __cut($sentence, $options = array("HMM" => true))
{
$defaults = array(
'mode'=>'default'
Expand Down Expand Up @@ -514,6 +514,83 @@ public static function __cutDAG($sentence, $options = array())
return $words;
}// end function __cutDAG

/**
 * Static method __cutDAGNoHMM
 *
 * Segments a sentence by walking the route computed from the dictionary
 * DAG, without any HMM-based handling of unknown words, and attaches a
 * part-of-speech tag to every token. Consecutive single characters from
 * the set [a-zA-Z+#] are buffered and emitted as one token. The tag is
 * taken from self::$word_tag when the token is present there, otherwise
 * it is "x".
 *
 * @param string $sentence # input sentence
 * @param array  $options  # other options (accepted for signature parity
 *                         # with the sibling cut methods; not read here)
 *
 * @return array $words # list of array("word"=>..., "tag"=>...)
 */
public static function __cutDAGNoHMM($sentence, $options = array())
{
    $words = array();

    // Single place for the tag lookup that was previously repeated three times.
    $make_pair = function ($token) {
        if (isset(self::$word_tag[$token])) {
            $token_tag = self::$word_tag[$token];
        } else {
            $token_tag = "x";
        }
        return array("word"=>$token, "tag"=>$token_tag);
    };

    $N = mb_strlen($sentence, 'UTF-8');
    $DAG = Jieba::getDAG($sentence);

    // NOTE(review): Jieba::$route is presumably populated by calc();
    // it is read immediately below.
    Jieba::calc($sentence, $DAG);

    $x = 0;
    $buf = '';

    // Exactly ONE character from the class. The previous unanchored
    // '[a-zA-Z+#]+' matched anywhere inside $l_word, so a multi-character
    // dictionary word containing a Latin letter was pushed into the buffer
    // and glued to neighbouring tokens. \z (not $) so that a trailing
    // newline cannot sneak through.
    $re_eng_pattern = '^[a-zA-Z+#]\z';

    while ($x < $N) {
        // The first key of the route entry at $x is used as the (inclusive)
        // end offset of the segment starting at $x.
        $current_route_keys = array_keys(Jieba::$route[$x]);
        $y = $current_route_keys[0]+1;
        $l_word = mb_substr($sentence, $x, ($y-$x), 'UTF-8');
        $x = $y;

        if (preg_match('/'.$re_eng_pattern.'/u', $l_word)) {
            // Keep collecting the alphabetic run.
            $buf .= $l_word;
            continue;
        }

        // A non-alphabetic segment ends any pending run.
        if ($buf !== '') {
            $words[] = $make_pair($buf);
            $buf = '';
        }
        $words[] = $make_pair($l_word);
    }

    // Flush a run that reaches the end of the sentence.
    if ($buf !== '') {
        $words[] = $make_pair($buf);
    }

    return $words;
}// end function __cutDAGNoHMM

/**
* Static method cut
*
Expand All @@ -522,7 +599,7 @@ public static function __cutDAG($sentence, $options = array())
*
* @return array $seg_list
*/
public static function cut($sentence, $options = array())
public static function cut($sentence, $options = array("HMM" => true))
{
$defaults = array(
'mode'=>'default'
Expand Down Expand Up @@ -550,7 +627,11 @@ public static function cut($sentence, $options = array())

foreach ($blocks as $blk) {
if (preg_match('/'.$re_han_pattern.'/u', $blk)) {
$words = Posseg::__cutDAG($blk);
if ($options['HMM']) {
$words = Posseg::__cutDAG($blk);
} else {
$words = Posseg::__cutDAGNoHMM($blk);
}

foreach ($words as $word) {
array_push($seg_list, $word);
Expand Down
8 changes: 7 additions & 1 deletion src/cmd/demo.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,13 @@
$seg_list = Jieba::cut("他来到了网易杭研大厦");
var_dump($seg_list);

$seg_list = Jieba::cut("他來到了網易杭研大廈");
$seg_list = Jieba::cut("他来到了网易杭研大厦", false, ['HMM' => false]);
var_dump($seg_list);

$seg_list = Jieba::cut("林志傑來到了網易杭研大廈");
var_dump($seg_list);

$seg_list = Jieba::cut("林志傑來到了網易杭研大廈", false, ['HMM' => false]);
var_dump($seg_list);

$seg_list = Jieba::cutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
Expand Down
6 changes: 6 additions & 0 deletions src/cmd/demo_posseg.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@
$seg_list = Posseg::cut("這是一個伸手不見五指的黑夜。我叫孫悟空,我愛北京,我愛Python和C++");
var_dump($seg_list);

$seg_list = Posseg::posTagReadable($seg_list);
var_dump($seg_list);

$seg_list = Posseg::cut("林志傑來到了網易杭研大廈", ['HMM' => false]);
var_dump($seg_list);

$seg_list = Posseg::posTagReadable($seg_list);
var_dump($seg_list);
?>
2 changes: 1 addition & 1 deletion src/model/pos/char_state.json
Original file line number Diff line number Diff line change
Expand Up @@ -52545,7 +52545,7 @@
"\u8dd6": ["('S', 'g')", "('M', 'n')", "('B', 'n')", "('E', 'nr')"],
"\u8dd7": ["('B', 'n')"],
"\u8dda": ["('E', 'v')", "('S', 'x')", "('E', 'l')", "('E', 'nr')", "('E', 'vn')"],
"\u8ddb": ["('B', 'n')", "('B', 'v')"],
"\u8ddb": ["('B', 'n')", "('B', 'v')", "('S', 'a')"],
"\u8ddd": ["('B', 'n')",
"('S', 'p')",
"('E', 'n')",
Expand Down

0 comments on commit f876c82

Please sign in to comment.