diff --git a/README.md b/README.md
index 884f244..0aa4a14 100644
--- a/README.md
+++ b/README.md
@@ -751,6 +751,70 @@ array(17) {
 }
 ```
 
+功能 7):返回詞語在原文的起止位置
+==============
+
+代碼示例 (Tutorial)
+
+```php
+ini_set('memory_limit', '1024M');
+
+require_once dirname(dirname(__FILE__))."/vendor/multi-array/MultiArray.php";
+require_once dirname(dirname(__FILE__))."/vendor/multi-array/Factory/MultiArrayFactory.php";
+require_once dirname(dirname(__FILE__))."/class/Jieba.php";
+require_once dirname(dirname(__FILE__))."/class/Finalseg.php";
+use Fukuball\Jieba\Jieba;
+use Fukuball\Jieba\Finalseg;
+Jieba::init(array('mode'=>'test','dict'=>'big'));
+Finalseg::init();
+
+$seg_list = Jieba::tokenize("永和服装饰品有限公司");
+var_dump($seg_list);
+```
+
+Output:
+
+```php
+array(4) {
+  [0] =>
+  array(3) {
+    'word' =>
+    string(6) "永和"
+    'start' =>
+    int(0)
+    'end' =>
+    int(2)
+  }
+  [1] =>
+  array(3) {
+    'word' =>
+    string(6) "服装"
+    'start' =>
+    int(2)
+    'end' =>
+    int(4)
+  }
+  [2] =>
+  array(3) {
+    'word' =>
+    string(6) "饰品"
+    'start' =>
+    int(4)
+    'end' =>
+    int(6)
+  }
+  [3] =>
+  array(3) {
+    'word' =>
+    string(12) "有限公司"
+    'start' =>
+    int(6)
+    'end' =>
+    int(10)
+  }
+}
+```
+
 其他詞典
 ========
 1) 佔用內容較小的詞典
diff --git a/composer.json b/composer.json
index 9cbd86d..4dbbb5a 100644
--- a/composer.json
+++ b/composer.json
@@ -4,7 +4,7 @@
     "description": "結巴中文分詞(PHP 版本):做最好的 PHP 中文分詞、中文斷詞組件",
     "keywords": ["Jieba", "PHP"],
     "license": "MIT",
-    "version": "0.29",
+    "version": "0.31",
     "authors": [
         {
             "name": "fukuball",
diff --git a/src/class/Jieba.php b/src/class/Jieba.php
index 0beeb86..db85079 100644
--- a/src/class/Jieba.php
+++ b/src/class/Jieba.php
@@ -224,6 +224,81 @@ public static function loadUserDict($f_name, $options = array())
         return self::$trie;
     }// end function loadUserDict
 
+    /**
+     * Static method addWord
+     *
+     * Records $freq for $word in self::$original_freq, adjusts self::$total,
+     * marks the word in self::$trie, then calls self::__calcFreq() and empties
+     * self::$dag_cache.
+     *
+     * @param string $word    word to add
+     * @param float  $freq    frequency to record for the word
+     * @param string $tag     accepted but not used by this method
+     * @param array  $options accepted but not used by this method
+     *
+     * @return object self::$trie
+     */
+    public static function addWord($word, $freq, $tag = '', $options = array())
+    {
+        // Re-adding a known word: take its old frequency out of the total first.
+        if (isset(self::$original_freq[$word])) {
+            self::$total -= self::$original_freq[$word];
+        }
+        self::$original_freq[$word] = $freq;
+        self::$total += $freq;
+        // The trie key is the word's characters joined with '.'.
+        $l = mb_strlen($word, 'UTF-8');
+        $word_c = array();
+        for ($i=0; $i<$l; $i++) {
+            $c = mb_substr($word, $i, 1, 'UTF-8');
+            array_push($word_c, $c);
+        }
+        $word_c_key = implode('.', $word_c);
+        self::$trie->set($word_c_key, array("end"=>""));
+        self::__calcFreq();
+        self::$dag_cache = array();
+        return self::$trie;
+    }// end function addWord
+
+    /**
+     * Static method tokenize
+     *
+     * Cuts $sentence with self::cut() and reports each segment together with
+     * its position in $sentence, counted in UTF-8 characters (not bytes).
+     * 'end' is exclusive.
+     *
+     * @param string $sentence text to segment
+     * @param array  $options  accepted but not used by this method
+     *
+     * @return array list of array('word' => ..., 'start' => ..., 'end' => ...)
+     */
+    public static function tokenize($sentence, $options = array())
+    {
+        $seg_list = self::cut($sentence);
+        $sentence_len = mb_strlen($sentence, 'UTF-8');
+        $tokenize_list = array();
+        $start = 0;
+        foreach ($seg_list as $seg) {
+            // cut() only returns text matched by its block patterns, so some
+            // characters of $sentence may be missing between two segments.
+            // Look the segment up from the cursor rather than assuming it
+            // begins exactly where the previous one ended.
+            if ($seg !== '' && $start < $sentence_len) {
+                $found = mb_strpos($sentence, $seg, $start, 'UTF-8');
+                if ($found !== false) {
+                    $start = $found;
+                }
+            }
+            $end = $start + mb_strlen($seg, 'UTF-8');
+            array_push(
+                $tokenize_list,
+                array('word' => $seg, 'start' => $start, 'end' => $end)
+            );
+            $start = $end;
+        }
+        return $tokenize_list;
+    }// end function tokenize
+
     /**
      * Static method __cutAll
      *
@@ -462,6 +537,9 @@ public static function cut($sentence, $cut_all = false, $options = array())
         if ($cut_all) {
             $re_skip_pattern = '([a-zA-Z0-9+#&=\._\r\n]+)';
         }
+        $re_punctuation_pattern = '([\x{ff5e}\x{ff01}\x{ff08}\x{ff09}\x{300e}'.
+                                  '\x{300c}\x{300d}\x{300f}\x{3001}\x{ff1a}\x{ff1b}'.
+                                  '\x{ff0c}\x{ff1f}\x{3002}]+)';
 
         if (self::$cjk_all) {
             $filter_pattern = $re_kanjikana_pattern.
@@ -472,7 +550,7 @@ public static function cut($sentence, $cut_all = false, $options = array())
         }
 
         preg_match_all(
-            '/('.$filter_pattern.'|'.$re_ascii_pattern.')/u',
+            '/('.$filter_pattern.'|'.$re_ascii_pattern.'|'.$re_punctuation_pattern.')/u',
             $sentence,
             $matches,
             PREG_PATTERN_ORDER
@@ -497,7 +575,7 @@ public static function cut($sentence, $cut_all = false, $options = array())
                 foreach ($words as $word) {
                     array_push($seg_list, $word);
                 }
-            } else {
+            } elseif (preg_match('/'.$re_skip_pattern.'/u', $blk)) {
                 preg_match_all(
                     '/('.$re_skip_pattern.')/u',
                     $blk,
@@ -521,6 +599,8 @@ public static function cut($sentence, $cut_all = false, $options = array())
                         }
                     }
                 }
+            } elseif (preg_match('/'.$re_punctuation_pattern.'/u', $blk)) {
+                array_push($seg_list, $blk);
             }// end else (preg_match('/'.$re_han_pattern.'/u', $blk))
         }// end foreach ($blocks as $blk)
 
diff --git a/src/cmd/demo.php b/src/cmd/demo.php
index ee7b48a..6a25f6f 100644
--- a/src/cmd/demo.php
+++ b/src/cmd/demo.php
@@ -71,4 +71,7 @@
 
 $seg_list = Jieba::cutForSearch("小明碩士畢業于中國科學院計算所,後在日本京都大學深造");
 var_dump($seg_list);
+
+$seg_list = Jieba::tokenize("永和服装饰品有限公司");
+var_dump($seg_list);
 ?>
\ No newline at end of file
diff --git a/src/cmd/demo_tokenize.php b/src/cmd/demo_tokenize.php
new file mode 100644
index 0000000..c18e69c
--- /dev/null
+++ b/src/cmd/demo_tokenize.php
@@ -0,0 +1,28 @@
+#!/usr/bin/php
+<?php
+/**
+ * demo_tokenize.php
+ *
+ * PHP version 5
+ *
+ * @category PHP
+ * @package  /src/cmd/
+ * @author   Fukuball Lin <fukuball@gmail.com>
+ * @license  MIT Licence
+ * @version  GIT: <fukuball/jieba-php>
+ * @link     https://github.com/fukuball/jieba-php
+ */
+ini_set('memory_limit', '1024M');
+
+require_once dirname(dirname(__FILE__))."/vendor/multi-array/MultiArray.php";
+require_once dirname(dirname(__FILE__))."/vendor/multi-array/Factory/MultiArrayFactory.php";
+require_once dirname(dirname(__FILE__))."/class/Jieba.php";
+require_once dirname(dirname(__FILE__))."/class/Finalseg.php";
+use Fukuball\Jieba\Jieba;
+use Fukuball\Jieba\Finalseg;
+Jieba::init(array('mode'=>'test','dict'=>'big'));
+Finalseg::init();
+
+$seg_list = Jieba::tokenize("永和服装饰品有限公司");
+var_dump($seg_list);
+?>
\ No newline at end of file
diff --git a/test/JiebaTest.php b/test/JiebaTest.php
index c90683f..0ca028b 100644
--- a/test/JiebaTest.php
+++ b/test/JiebaTest.php
@@ -44,7 +44,8 @@ public function testJiebaCut()
             "要",
             "看",
             "对象",
-            "啊"
+            "啊",
+            "!"
         );
 
         $seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
@@ -107,6 +108,7 @@ public function testJiebaCutForSearch()
             "中国科学院",
             "计算",
             "计算所",
+            ",",
             "后",
             "在",
             "日本",