2026-04-08 14:21:12 +08:00
|
|
|
|
package utils
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"context"
|
2026-04-28 14:15:31 +08:00
|
|
|
|
"os"
|
|
|
|
|
|
"path/filepath"
|
2026-04-08 14:21:12 +08:00
|
|
|
|
"sort"
|
|
|
|
|
|
"sync"
|
|
|
|
|
|
|
|
|
|
|
|
"github.com/go-ego/gse"
|
|
|
|
|
|
"github.com/go-ego/gse/hmm/extracker"
|
|
|
|
|
|
"github.com/go-ego/gse/hmm/segment"
|
|
|
|
|
|
"github.com/gogf/gf/v2/os/glog"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// 全局工具实例(不再自动初始化)
|
|
|
|
|
|
var (
|
|
|
|
|
|
GseTool *gseTool
|
|
|
|
|
|
once sync.Once // 保证只初始化一次,线程安全
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
func InitGseTool(ctx context.Context) error {
|
|
|
|
|
|
var err error
|
|
|
|
|
|
once.Do(func() {
|
|
|
|
|
|
// 只执行一次初始化
|
|
|
|
|
|
GseTool, err = newGseTool()
|
|
|
|
|
|
})
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
glog.Error(ctx, "gse 分词工具初始化失败:", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
return err
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// gseTool 关键词提取工具(gse v1.0.2 标准)
|
|
|
|
|
|
type gseTool struct {
|
|
|
|
|
|
seg gse.Segmenter
|
|
|
|
|
|
tfidf *extracker.TagExtracter
|
|
|
|
|
|
tr *extracker.TextRanker
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// newGseTool 初始化工具(内置词典 + 停用词)
|
|
|
|
|
|
func newGseTool() (tool *gseTool, err error) {
|
|
|
|
|
|
// 1. 初始化分词器
|
|
|
|
|
|
var seg gse.Segmenter
|
|
|
|
|
|
// 内置词典(无外部文件)
|
|
|
|
|
|
err = seg.LoadDictEmbed()
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
// 内置停用词(v1.0.2 标准)
|
|
|
|
|
|
err = seg.LoadStopEmbed()
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 2. 初始化 TF-IDF 提取器
|
|
|
|
|
|
tfidf := &extracker.TagExtracter{}
|
|
|
|
|
|
tfidf.WithGse(seg)
|
2026-04-28 14:15:31 +08:00
|
|
|
|
|
|
|
|
|
|
// 尝试从默认路径加载 IDF 字典
|
|
|
|
|
|
idfPath := getIdfDictPath()
|
|
|
|
|
|
if idfPath != "" {
|
|
|
|
|
|
// 如果找到自定义路径,使用 LoadDict 方法加载
|
|
|
|
|
|
err = tfidf.LoadDict(idfPath)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
glog.Warningf(context.Background(), "加载自定义 IDF 字典失败 [%s]: %v,将使用默认字典", idfPath, err)
|
|
|
|
|
|
// 回退到默认加载方式
|
|
|
|
|
|
err = tfidf.LoadIdf()
|
|
|
|
|
|
} else {
|
|
|
|
|
|
glog.Infof(context.Background(), "成功加载自定义 IDF 字典: %s", idfPath)
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// 使用默认的 IDF 字典
|
|
|
|
|
|
err = tfidf.LoadIdf()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-04-08 14:21:12 +08:00
|
|
|
|
if err != nil {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 3. 初始化 TextRank 提取器
|
|
|
|
|
|
tr := &extracker.TextRanker{}
|
|
|
|
|
|
tr.WithGse(seg)
|
|
|
|
|
|
|
|
|
|
|
|
tool = &gseTool{
|
|
|
|
|
|
seg: seg,
|
|
|
|
|
|
tfidf: tfidf,
|
|
|
|
|
|
tr: tr,
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-04-28 14:15:31 +08:00
|
|
|
|
// getIdfDictPath returns the path of a custom IDF dictionary file, or "" when
// none is available and the default dictionary should be used.
//
// Candidates are probed in order:
//  1. /app/dict/zh/idf.txt, the default mount point inside the container
//     (Docker volume mapping);
//  2. dict/zh/idf.txt below the current working directory (skipped when the
//     working directory cannot be determined).
//
// A candidate is accepted only if it exists and is not a directory. A bind
// mount of a missing host file typically leaves an empty directory at the
// mount point; accepting it would hide a valid dictionary further down the
// list.
func getIdfDictPath() string {
	candidates := []string{"/app/dict/zh/idf.txt"}
	if workDir, err := os.Getwd(); err == nil {
		candidates = append(candidates, filepath.Join(workDir, "dict", "zh", "idf.txt"))
	}

	for _, path := range candidates {
		info, err := os.Stat(path)
		if err != nil || info.IsDir() {
			continue
		}
		return path
	}

	// No custom dictionary found; the caller falls back to the default one.
	return ""
}
|
|
|
|
|
|
|
2026-04-08 14:21:12 +08:00
|
|
|
|
// Cut 分词(关键词提取唯一正确模式:精确模式 + HMM)
|
|
|
|
|
|
func (k *gseTool) Cut(text string) []string {
|
|
|
|
|
|
return k.seg.Cut(text, true)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Keyword is one entry of the final extraction result: a word and its weight.
type Keyword struct {
	// Word is the keyword text (JSON field "word").
	Word string `json:"word"`
	// Score is the weight assigned to the keyword (JSON field "score").
	Score float64 `json:"score"`
}
|
|
|
|
|
|
|
|
|
|
|
|
func (k *gseTool) Extract(text string, topN int) []Keyword {
|
|
|
|
|
|
// 1. 提取 TF-IDF
|
|
|
|
|
|
tfTags := k.extractTFIDF(text, topN)
|
|
|
|
|
|
|
|
|
|
|
|
// 2. 提取 TextRank
|
|
|
|
|
|
trTags := k.extractTextRank(text, topN)
|
|
|
|
|
|
|
|
|
|
|
|
// 3. 合并成最终关键词(业务最常用)
|
|
|
|
|
|
scoreMap := make(map[string]float64)
|
|
|
|
|
|
for _, tag := range tfTags {
|
|
|
|
|
|
scoreMap[tag.Text] = tag.Weight
|
|
|
|
|
|
}
|
|
|
|
|
|
for _, tag := range trTags {
|
|
|
|
|
|
scoreMap[tag.Text] = tag.Weight
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 转成切片并排序(高分在前)
|
|
|
|
|
|
res := make([]Keyword, 0, len(scoreMap))
|
|
|
|
|
|
for word, score := range scoreMap {
|
|
|
|
|
|
res = append(res, Keyword{Word: word, Score: score})
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
sort.Slice(res, func(i, j int) bool {
|
|
|
|
|
|
return res[i].Score > res[j].Score
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
return res
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ExtractTFIDF TF-IDF 关键词(带权重)90% 业务:文章标签、搜索、关键词
|
|
|
|
|
|
func (k *gseTool) extractTFIDF(text string, topN int) segment.Segments {
|
|
|
|
|
|
return k.tfidf.ExtractTags(text, topN)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ExtractTextRank TextRank 关键词(带权重)长文本、摘要、语义理解
|
|
|
|
|
|
func (k *gseTool) extractTextRank(text string, topN int) segment.Segments {
|
|
|
|
|
|
return k.tr.TextRank(text, topN)
|
|
|
|
|
|
}
|