Skip to content

Commit

Permalink
0.31 版
Browse files Browse the repository at this point in the history
  • Loading branch information
fukuball committed Nov 21, 2017
1 parent 05b6e7f commit 4a038e9
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 4 deletions.
64 changes: 64 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,70 @@ array(17) {
}
```

功能 7):返回詞語在原文的起止位置
==============

代碼示例 (Tutorial)

```php
ini_set('memory_limit', '1024M');

require_once dirname(dirname(__FILE__))."/vendor/multi-array/MultiArray.php";
require_once dirname(dirname(__FILE__))."/vendor/multi-array/Factory/MultiArrayFactory.php";
require_once dirname(dirname(__FILE__))."/class/Jieba.php";
require_once dirname(dirname(__FILE__))."/class/Finalseg.php";
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;
Jieba::init(array('mode'=>'test','dict'=>'big'));
Finalseg::init();

$seg_list = Jieba::tokenize("永和服装饰品有限公司");
var_dump($seg_list);
```

Output:

```php
array(4) {
[0] =>
array(3) {
'word' =>
string(6) "永和"
'start' =>
int(0)
'end' =>
int(2)
}
[1] =>
array(3) {
'word' =>
string(6) "服装"
'start' =>
int(2)
'end' =>
int(4)
}
[2] =>
array(3) {
'word' =>
string(6) "饰品"
'start' =>
int(4)
'end' =>
int(6)
}
[3] =>
array(3) {
'word' =>
string(12) "有限公司"
'start' =>
int(6)
'end' =>
int(10)
}
}
```

其他詞典
========
1) 佔用內存較小的詞典
Expand Down
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"description": "結巴中文分詞(PHP 版本):做最好的 PHP 中文分詞、中文斷詞組件",
"keywords": ["Jieba", "PHP"],
"license": "MIT",
"version": "0.29",
"version": "0.31",
"authors": [
{
"name": "fukuball",
Expand Down
64 changes: 62 additions & 2 deletions src/class/Jieba.php
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,61 @@ public static function loadUserDict($f_name, $options = array())
return self::$trie;
}// end function loadUserDict

/**
 * Static method addWord
 *
 * Records the frequency of a word, inserts the word into the trie,
 * recalculates the frequencies and empties the DAG cache.
 *
 * @param string $word    word to register
 * @param float  $freq    frequency stored for the word
 * @param string $tag     accepted but not used by this method
 * @param array  $options accepted but not used by this method
 *
 * @return object self::$trie after the word has been inserted
 */
public static function addWord($word, $freq, $tag='', $options = array())
{
    // A word that is already known must not be counted twice in the total,
    // so its previous frequency is removed before the new one is added.
    if (isset(self::$original_freq[$word])) {
        self::$total -= self::$original_freq[$word];
    }
    self::$total += $freq;
    self::$original_freq[$word] = $freq;

    // The trie key is the word's characters joined with dots.
    $chars = [];
    $char_count = mb_strlen($word, 'UTF-8');
    $pos = 0;
    while ($pos < $char_count) {
        $chars[] = mb_substr($word, $pos, 1, 'UTF-8');
        $pos++;
    }
    self::$trie->set(implode('.', $chars), ['end' => '']);

    // Frequencies and cached DAGs were derived from the old dictionary state.
    self::__calcFreq();
    self::$dag_cache = [];

    return self::$trie;
}

/**
 * Static method tokenize
 *
 * Segments the sentence with self::cut() and reports, for every segment,
 * where it sits in the original sentence.
 *
 * @param string $sentence text to segment
 * @param array  $options  accepted but not used by this method
 *
 * @return array list of array('word' => string, 'start' => int, 'end' => int);
 *               offsets are counted in UTF-8 characters and 'end' is exclusive
 */
public static function tokenize($sentence, $options = array())
{
    $tokenize_list = [];
    $sentence_length = mb_strlen($sentence, 'UTF-8');
    $cursor = 0;

    foreach (self::cut($sentence) as $seg) {
        // Default: the segment starts right where the previous one ended.
        $start = $cursor;

        // cut() may leave out characters that none of its patterns match;
        // summing segment lengths alone would then shift every later offset.
        // Re-anchor the segment on its first occurrence at or after the
        // cursor, and keep the cumulative offset when it cannot be found.
        if ($seg !== '' && $cursor < $sentence_length) {
            $found = mb_strpos($sentence, $seg, $cursor, 'UTF-8');
            if ($found !== false) {
                $start = $found;
            }
        }

        $end = $start + mb_strlen($seg, 'UTF-8');
        $tokenize_list[] = [
            'word' => $seg,
            'start' => $start,
            'end' => $end
        ];
        $cursor = $end;
    }

    return $tokenize_list;
}

/**
* Static method __cutAll
*
Expand Down Expand Up @@ -462,6 +517,9 @@ public static function cut($sentence, $cut_all = false, $options = array())
if ($cut_all) {
$re_skip_pattern = '([a-zA-Z0-9+#&=\._\r\n]+)';
}
$re_punctuation_pattern = '([\x{ff5e}\x{ff01}\x{ff08}\x{ff09}\x{300e}'.
'\x{300c}\x{300d}\x{300f}\x{3001}\x{ff1a}\x{ff1b}'.
'\x{ff0c}\x{ff1f}\x{3002}]+)';

if (self::$cjk_all) {
$filter_pattern = $re_kanjikana_pattern.
Expand All @@ -472,7 +530,7 @@ public static function cut($sentence, $cut_all = false, $options = array())
}

preg_match_all(
'/('.$filter_pattern.'|'.$re_ascii_pattern.')/u',
'/('.$filter_pattern.'|'.$re_ascii_pattern.'|'.$re_punctuation_pattern.')/u',
$sentence,
$matches,
PREG_PATTERN_ORDER
Expand All @@ -497,7 +555,7 @@ public static function cut($sentence, $cut_all = false, $options = array())
foreach ($words as $word) {
array_push($seg_list, $word);
}
} else {
} elseif (preg_match('/'.$re_skip_pattern.'/u', $blk)) {
preg_match_all(
'/('.$re_skip_pattern.')/u',
$blk,
Expand All @@ -521,6 +579,8 @@ public static function cut($sentence, $cut_all = false, $options = array())
}
}
}
} elseif (preg_match('/'.$re_punctuation_pattern.'/u', $blk)) {
array_push($seg_list, $blk);
}// end else (preg_match('/'.$re_han_pattern.'/u', $blk))
}// end foreach ($blocks as $blk)

Expand Down
3 changes: 3 additions & 0 deletions src/cmd/demo.php
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,7 @@

$seg_list = Jieba::cutForSearch("小明碩士畢業于中國科學院計算所,後在日本京都大學深造");
var_dump($seg_list);

$seg_list = Jieba::tokenize("永和服装饰品有限公司");
var_dump($seg_list);
?>
28 changes: 28 additions & 0 deletions src/cmd/demo_tokenize.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/php
<?php
/**
* demo_tokenize.php
*
* PHP version 5
*
* @category PHP
* @package /src/cmd/
* @author Fukuball Lin <[email protected]>
* @license MIT Licence
* @version GIT: <fukuball/jieba-php>
* @link https://github.com/fukuball/jieba-php
*/
// Raise the memory limit before the dictionary is loaded.
ini_set('memory_limit', '1024M');

// Every dependency is resolved from the parent of this script's directory.
$src_root = dirname(dirname(__FILE__));
require_once $src_root."/vendor/multi-array/MultiArray.php";
require_once $src_root."/vendor/multi-array/Factory/MultiArrayFactory.php";
require_once $src_root."/class/Jieba.php";
require_once $src_root."/class/Finalseg.php";
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;

// Initialise both components before segmenting.
Jieba::init(array('mode'=>'test','dict'=>'big'));
Finalseg::init();

// Dump each word together with its start/end position.
var_dump(Jieba::tokenize("永和服装饰品有限公司"));
?>
4 changes: 3 additions & 1 deletion test/JiebaTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ public function testJiebaCut()
"",
"",
"对象",
""
"",
""
);

$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
Expand Down Expand Up @@ -107,6 +108,7 @@ public function testJiebaCutForSearch()
"中国科学院",
"计算",
"计算所",
"",
"",
"",
"日本",
Expand Down

0 comments on commit 4a038e9

Please sign in to comment.