Skip to content

Commit

Permalink
0.33 版
Browse files Browse the repository at this point in the history
  • Loading branch information
fukuball committed Nov 22, 2017
1 parent 7859c9d commit a5980dc
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 23 deletions.
69 changes: 49 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ array(18) {
* JiebaAnalyse::extractTags($content, $top_k)
* content 為待提取的文本
* top_k 為返回幾個 TF/IDF 權重最大的關鍵詞,默認值為 20
* 可使用 setStopWords 增加自定義 stop words

代碼示例 (關鍵詞提取)

Expand All @@ -276,32 +277,60 @@ $content = file_get_contents("/path/to/your/dict/lyric.txt", "r");

$tags = JiebaAnalyse::extractTags($content, $top_k);

var_dump($tags);

JiebaAnalyse::setStopWords('/path/to/your/dict/stop_words.txt');

$tags = JiebaAnalyse::extractTags($content, $top_k);

var_dump($tags);
```

Output:
```php
array(10) {
["是否"]=>
float(1.2196321889395)
["一般"]=>
float(1.0032459890209)
["肌迫"]=>
float(0.64654314660465)
["怯懦"]=>
float(0.44762844339349)
["藉口"]=>
float(0.32327157330233)
["逼不得已"]=>
float(0.32327157330233)
["不安全感"]=>
float(0.26548304656279)
["同感"]=>
float(0.23929673812326)
["有把握"]=>
float(0.21043366018744)
["空洞"]=>
float(0.20598261709442)
'沒有' =>
double(1.0592831964595)
'所謂' =>
double(0.90795702553671)
'是否' =>
double(0.66385043195443)
'一般' =>
double(0.54607060161899)
'雖然' =>
double(0.30265234184557)
'來說' =>
double(0.30265234184557)
'肌迫' =>
double(0.30265234184557)
'退縮' =>
double(0.30265234184557)
'矯作' =>
double(0.30265234184557)
'怯懦' =>
double(0.24364586159392)
}
array(10) {
'所謂' =>
double(1.1569129841516)
'一般' =>
double(0.69579963754677)
'矯作' =>
double(0.38563766138387)
'來說' =>
double(0.38563766138387)
'退縮' =>
double(0.38563766138387)
'雖然' =>
double(0.38563766138387)
'肌迫' =>
double(0.38563766138387)
'怯懦' =>
double(0.31045198493419)
'隨便說說' =>
double(0.19281883069194)
'一場' =>
double(0.19281883069194)
}
```

Expand Down
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"description": "結巴中文分詞(PHP 版本):做最好的 PHP 中文分詞、中文斷詞組件",
"keywords": ["Jieba", "PHP"],
"license": "MIT",
"version": "0.32",
"version": "0.33",
"authors": [
{
"name": "fukuball",
Expand Down
29 changes: 28 additions & 1 deletion src/class/JiebaAnalyse.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,17 @@ public static function init($options = array())
{

$defaults = array(
'mode'=>'default'
'mode'=>'default',
'dict'=>'normal'
);

$options = array_merge($defaults, $options);

if ($options['dict']=='big') {
$f_name = "idf.big.txt";
} else {
$f_name = "idf.txt";
}
$content = fopen(dirname(dirname(__FILE__))."/dict/idf.txt", "r");

while (($line = fgets($content)) !== false) {
Expand All @@ -72,6 +78,27 @@ public static function init($options = array())
self::$median_idf = self::$idf_freq[$middle_key];
}// end function init

/**
 * Static method setStopWords
 *
 * Reads a stop-word file (one word per line) and appends every word that is
 * not already present to self::$stop_words. Each line is trimmed and
 * lower-cased before it is compared and stored; blank lines are ignored.
 *
 * If the file cannot be opened, fopen() reports its usual warning and the
 * stop-word list is left unchanged.
 *
 * @param string $stop_words_path path to the stop-word file
 * @param array  $options         not used by this method
 *
 * @return void
 */
public static function setStopWords($stop_words_path, $options = array())
{
    $content = fopen($stop_words_path, "r");

    // fopen() returns false on failure; handing that on to fgets()/fclose()
    // is an error (a TypeError on PHP 8), so stop here instead.
    if ($content === false) {
        return;
    }

    while (($line = fgets($content)) !== false) {
        $stop_word = strtolower(trim($line));

        // An empty or whitespace-only line would otherwise register ''
        // as a stop word.
        if ($stop_word === '') {
            continue;
        }

        // Strict comparison: words are strings, so avoid loose numeric-string
        // matches such as "1e3" == "1000".
        if (! in_array($stop_word, self::$stop_words, true)) {
            self::$stop_words[] = $stop_word;
        }
    }

    fclose($content);
}

/**
* Static method extractTags
*
Expand Down
6 changes: 5 additions & 1 deletion src/cmd/demo_extract_tags.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,16 @@
use Fukuball\Jieba\JiebaAnalyse;
Jieba::init(array('mode'=>'test','dict'=>'big'));
Finalseg::init();
JiebaAnalyse::init();
JiebaAnalyse::init(array('dict'=>'big'));

$top_k = 10;
$content = file_get_contents(dirname(dirname(__FILE__))."/dict/lyric.txt", "r");

$tags = JiebaAnalyse::extractTags($content, $top_k);
var_dump($tags);

JiebaAnalyse::setStopWords(dirname(dirname(__FILE__)).'/dict/stop_words.txt');

$tags = JiebaAnalyse::extractTags($content, $top_k);
var_dump($tags);
?>
51 changes: 51 additions & 0 deletions src/dict/stop_words.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
一個
沒有
我們
你們
妳們
他們
她們
是否

0 comments on commit a5980dc

Please sign in to comment.