fix: GSE数据文件从gse/dict目录加载
This commit is contained in:
15
gse/dict/README.md
Normal file
15
gse/dict/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
Some dict/zh data is from [github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)
|
||||
|
||||
Updated on 2023-11-16:
|
||||
|
||||
Added two new dictionary documents, which come from [github.com/GuocaiL/nlp_corpus](https://github.com/GuocaiL/nlp_corpus)
|
||||
|
||||
They were generated from `nlp_corpus/open_ner_data/boson/boson.txt`, `open_ner_data/people_daily/people_daily_ner.txt`, `open_ner_data/tianchi_yiyao/train.txt`, and `open_ner_data/ResumeNER/dev.txt`.
|
||||
|
||||
1. tf_idf.txt
|
||||
|
||||
The first column of this document is the term, the second column is the word frequency of that term, and the third column is the inverse document frequency of that term.
|
||||
|
||||
2. tf_idf_origin.txt
|
||||
|
||||
The original corpus text.
|
||||
0
gse/dict/en/dict.txt
Normal file
0
gse/dict/en/dict.txt
Normal file
1
gse/dict/jp/README.md
Normal file
1
gse/dict/jp/README.md
Normal file
@@ -0,0 +1 @@
|
||||
dict.txt 通过内部工具生成, Copyright 2017 ego authors. 商用和拷贝请注明来源和版权
|
||||
885298
gse/dict/jp/dict.txt
Normal file
885298
gse/dict/jp/dict.txt
Normal file
File diff suppressed because it is too large
Load Diff
270132
gse/dict/zh/idf.txt
Normal file
270132
gse/dict/zh/idf.txt
Normal file
File diff suppressed because it is too large
Load Diff
352279
gse/dict/zh/s_1.txt
Normal file
352279
gse/dict/zh/s_1.txt
Normal file
File diff suppressed because it is too large
Load Diff
1161
gse/dict/zh/stop_tokens.txt
Normal file
1161
gse/dict/zh/stop_tokens.txt
Normal file
File diff suppressed because it is too large
Load Diff
88
gse/dict/zh/stop_word.txt
Normal file
88
gse/dict/zh/stop_word.txt
Normal file
@@ -0,0 +1,88 @@
|
||||
,
|
||||
.
|
||||
?
|
||||
!
|
||||
"
|
||||
@
|
||||
,
|
||||
。
|
||||
、
|
||||
?
|
||||
!
|
||||
:
|
||||
“
|
||||
”
|
||||
;
|
||||
|
||||
(
|
||||
)
|
||||
《
|
||||
》
|
||||
~
|
||||
*
|
||||
<
|
||||
>
|
||||
/
|
||||
\
|
||||
|
|
||||
-
|
||||
_
|
||||
+
|
||||
=
|
||||
&
|
||||
^
|
||||
%
|
||||
#
|
||||
`
|
||||
;
|
||||
$
|
||||
¥
|
||||
‘
|
||||
’
|
||||
〉
|
||||
〈
|
||||
…
|
||||
>
|
||||
<
|
||||
@
|
||||
#
|
||||
$
|
||||
%
|
||||
︿
|
||||
&
|
||||
*
|
||||
+
|
||||
~
|
||||
|
|
||||
[
|
||||
]
|
||||
{
|
||||
}
|
||||
啊
|
||||
阿
|
||||
哎
|
||||
哎呀
|
||||
哎哟
|
||||
唉
|
||||
俺
|
||||
俺们
|
||||
按
|
||||
按照
|
||||
吧
|
||||
吧哒
|
||||
把
|
||||
罢了
|
||||
被
|
||||
本
|
||||
本着
|
||||
比
|
||||
比方
|
||||
比如
|
||||
鄙人
|
||||
彼
|
||||
彼此
|
||||
边
|
||||
别
|
||||
别的
|
||||
别说
|
||||
并
|
||||
236754
gse/dict/zh/t_1.txt
Normal file
236754
gse/dict/zh/t_1.txt
Normal file
File diff suppressed because it is too large
Load Diff
107536
gse/dict/zh/tf_idf.txt
Normal file
107536
gse/dict/zh/tf_idf.txt
Normal file
File diff suppressed because it is too large
Load Diff
33450
gse/dict/zh/tf_idf_origin.txt
Normal file
33450
gse/dict/zh/tf_idf_origin.txt
Normal file
File diff suppressed because one or more lines are too long
154
gse/gse.go
Normal file
154
gse/gse.go
Normal file
@@ -0,0 +1,154 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"context"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/go-ego/gse"
|
||||
"github.com/go-ego/gse/hmm/extracker"
|
||||
"github.com/go-ego/gse/hmm/segment"
|
||||
"github.com/gogf/gf/v2/os/glog"
|
||||
)
|
||||
|
||||
// 全局工具实例(不再自动初始化)
|
||||
var (
|
||||
GseTool *gseTool
|
||||
once sync.Once // 保证只初始化一次,线程安全
|
||||
)
|
||||
|
||||
func InitGseTool(ctx context.Context) error {
|
||||
var err error
|
||||
once.Do(func() {
|
||||
// 只执行一次初始化
|
||||
GseTool, err = newGseTool()
|
||||
})
|
||||
if err != nil {
|
||||
glog.Error(ctx, "gse 分词工具初始化失败:", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// gseTool 关键词提取工具(gse v1.0.2 标准)
|
||||
type gseTool struct {
|
||||
seg gse.Segmenter
|
||||
tfidf *extracker.TagExtracter
|
||||
tr *extracker.TextRanker
|
||||
}
|
||||
|
||||
// newGseTool 初始化工具(使用外部数据文件)
|
||||
func newGseTool() (tool *gseTool, err error) {
|
||||
// 1. 初始化分词器
|
||||
var seg gse.Segmenter
|
||||
|
||||
// gse数据文件在可执行文件同级的gse/dict目录下
|
||||
gseDataPath := "gse"
|
||||
dictPath := filepath.Join(gseDataPath, "dict", "zh")
|
||||
idfPath := filepath.Join(dictPath, "idf.txt")
|
||||
stopPath := filepath.Join(dictPath, "stop.txt")
|
||||
|
||||
// 加载词典
|
||||
err = seg.LoadDict(filepath.Join(dictPath, "dict.txt"))
|
||||
if err != nil {
|
||||
glog.Warning(context.Background(), "加载gse词典失败,尝试embed模式:", err)
|
||||
// 回退到embed模式
|
||||
err = seg.LoadDictEmbed()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
err = seg.LoadStopEmbed()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tfidf := &extracker.TagExtracter{}
|
||||
tfidf.WithGse(seg)
|
||||
err = tfidf.LoadIdf()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tr := &extracker.TextRanker{}
|
||||
tr.WithGse(seg)
|
||||
tool = &gseTool{
|
||||
seg: seg,
|
||||
tfidf: tfidf,
|
||||
tr: tr,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// 加载停用词
|
||||
err = seg.LoadStop(stopPath)
|
||||
if err != nil {
|
||||
glog.Warning(context.Background(), "加载停用词失败,继续:", err)
|
||||
}
|
||||
|
||||
// 2. 初始化 TF-IDF 提取器
|
||||
tfidf := &extracker.TagExtracter{}
|
||||
tfidf.WithGse(seg)
|
||||
err = tfidf.LoadIdf(idfPath)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// 3. 初始化 TextRank 提取器
|
||||
tr := &extracker.TextRanker{}
|
||||
tr.WithGse(seg)
|
||||
|
||||
tool = &gseTool{
|
||||
seg: seg,
|
||||
tfidf: tfidf,
|
||||
tr: tr,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Cut 分词(关键词提取唯一正确模式:精确模式 + HMM)
|
||||
func (k *gseTool) Cut(text string) []string {
|
||||
return k.seg.Cut(text, true)
|
||||
}
|
||||
|
||||
// Keyword is the final output unit of extraction: a term and its weight.
type Keyword struct {
	// Word is the keyword text, serialized under the JSON key "word".
	Word string `json:"word"`
	// Score is the keyword weight, serialized under the JSON key "score".
	Score float64 `json:"score"`
}
|
||||
|
||||
func (k *gseTool) Extract(text string, topN int) []Keyword {
|
||||
// 1. 提取 TF-IDF
|
||||
tfTags := k.extractTFIDF(text, topN)
|
||||
|
||||
// 2. 提取 TextRank
|
||||
trTags := k.extractTextRank(text, topN)
|
||||
|
||||
// 3. 合并成最终关键词(业务最常用)
|
||||
scoreMap := make(map[string]float64)
|
||||
for _, tag := range tfTags {
|
||||
scoreMap[tag.Text] = tag.Weight
|
||||
}
|
||||
for _, tag := range trTags {
|
||||
scoreMap[tag.Text] = tag.Weight
|
||||
}
|
||||
|
||||
// 转成切片并排序(高分在前)
|
||||
res := make([]Keyword, 0, len(scoreMap))
|
||||
for word, score := range scoreMap {
|
||||
res = append(res, Keyword{Word: word, Score: score})
|
||||
}
|
||||
|
||||
sort.Slice(res, func(i, j int) bool {
|
||||
return res[i].Score > res[j].Score
|
||||
})
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
// ExtractTFIDF TF-IDF 关键词(带权重)90% 业务:文章标签、搜索、关键词
|
||||
func (k *gseTool) extractTFIDF(text string, topN int) segment.Segments {
|
||||
return k.tfidf.ExtractTags(text, topN)
|
||||
}
|
||||
|
||||
// ExtractTextRank TextRank 关键词(带权重)长文本、摘要、语义理解
|
||||
func (k *gseTool) extractTextRank(text string, topN int) segment.Segments {
|
||||
return k.tr.TextRank(text, topN)
|
||||
}
|
||||
Reference in New Issue
Block a user