fix: GSE数据文件从gse/dict目录加载

2026-04-21 10:24:47 +08:00
parent f671096dbe
commit e051046f77
12 changed files with 1886750 additions and 47 deletions
--- a/gse/dict/README.md
+++ b/gse/dict/README.md
@@ -0,0 +1,15 @@
+Some dict/zh data is from [github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)
+
+Updated on 2023-11-16:
+
+Added two new dict documents, which come from [github.com/GuocaiL/nlp_corpus](https://github.com/GuocaiL/nlp_corpus).
+
+They were generated from `nlp_corpus/open_ner_data/boson/boson.txt`, `open_ner_data/people_daily/people_daily_ner.txt`, `open_ner_data/tianchi_yiyao/train.txt`, and `open_ner_data/ResumeNER/dev.txt`.
+
+1. tf_idf.txt
+
+The first column of this document is the term, the second column is the word frequency of that term, and the third column is the inverse document frequency of that term.
+
+2. tf_idf_origin.txt
+
+The original corpus text.
--- a/gse/dict/en/dict.txt
+++ b/gse/dict/en/dict.txt
--- a/gse/dict/jp/README.md
+++ b/gse/dict/jp/README.md
@@ -0,0 +1 @@
+dict.txt 通过内部工具生成, Copyright 2017 ego authors. 商用和拷贝请注明来源和版权
--- a/gse/dict/jp/dict.txt
+++ b/gse/dict/jp/dict.txt
--- a/gse/dict/zh/idf.txt
+++ b/gse/dict/zh/idf.txt
--- a/gse/dict/zh/s_1.txt
+++ b/gse/dict/zh/s_1.txt
--- a/gse/dict/zh/stop_tokens.txt
+++ b/gse/dict/zh/stop_tokens.txt
--- a/gse/dict/zh/stop_word.txt
+++ b/gse/dict/zh/stop_word.txt
@@ -0,0 +1,88 @@
+,
+.
+?
+!
+"
+@
+，
+。
+、
+？
+！
+：
+“
+”
+；
+　
+（
+）
+《
+》
+~
+*
+<
+>
+/
+\
+|
+-
+_
+
+=
+&
+^
+%
+#
+`
+;
+$
+￥
+‘
+’
+〉
+〈
+…
+＞
+＜
+＠
+＃
+＄
+％
+︿
+＆
+＊
+＋
+～
+｜
+［
+］
+｛
+｝
+啊
+阿
+哎
+哎呀
+哎哟
+唉
+俺
+俺们
+按
+按照
+吧
+吧哒
+把
+罢了
+被
+本
+本着
+比
+比方
+比如
+鄙人
+彼
+彼此
+边
+别
+别的
+别说
+并
--- a/gse/dict/zh/t_1.txt
+++ b/gse/dict/zh/t_1.txt
--- a/gse/dict/zh/tf_idf.txt
+++ b/gse/dict/zh/tf_idf.txt
--- a/gse/dict/zh/tf_idf_origin.txt
+++ b/gse/dict/zh/tf_idf_origin.txt
--- a/gse/gse.go
+++ b/gse/gse.go
@@ -0,0 +1,154 @@
+package utils
+
+import (
+	"context"
+	"path/filepath"
+	"sort"
+	"sync"
+
+	"github.com/go-ego/gse"
+	"github.com/go-ego/gse/hmm/extracker"
+	"github.com/go-ego/gse/hmm/segment"
+	"github.com/gogf/gf/v2/os/glog"
+)
+
+// Package-level state of the GSE tool. Nothing is initialized automatically;
+// InitGseTool must be called before GseTool is used.
+var (
+	// GseTool is the shared tool instance. It stays nil until InitGseTool
+	// has run successfully.
+	GseTool *gseTool
+	// once makes sure the initialization inside InitGseTool runs a single time.
+	once sync.Once
+	// gseInitErr holds the result of that single initialization so that every
+	// later InitGseTool call reports the same outcome.
+	gseInitErr error
+)
+
+// InitGseTool initializes the global GseTool exactly once and returns the
+// result of that initialization.
+//
+// The initialization error is kept in gseInitErr: if the first call fails,
+// every later call returns the same error instead of reporting success while
+// GseTool is still nil. The failure is logged once, using the ctx of the call
+// that actually performed the initialization.
+func InitGseTool(ctx context.Context) error {
+	once.Do(func() {
+		GseTool, gseInitErr = newGseTool()
+		if gseInitErr != nil {
+			glog.Error(ctx, "gse 分词工具初始化失败:", gseInitErr)
+		}
+	})
+	return gseInitErr
+}
+
+// gseTool is the keyword extraction tool. It bundles the gse segmenter with
+// the TF-IDF and TextRank extractors.
+// (The original note says it follows gse v1.0.2.)
+type gseTool struct {
+	seg   gse.Segmenter           // segmenter used by Cut
+	tfidf *extracker.TagExtracter // TF-IDF tag extractor
+	tr    *extracker.TextRanker   // TextRank extractor
+}
+
+// newGseTool builds a gseTool, preferring dictionary files on disk and
+// falling back to the data embedded in the gse module.
+//
+// Data files are looked up under the relative path gse/dict/zh, which is
+// resolved against the process working directory. The Chinese dictionary in
+// that directory is split into s_1.txt and t_1.txt, and the stop word list is
+// stop_word.txt.
+//
+// If the dictionary files cannot be loaded, a warning is logged and the
+// embedded dictionary, stop words and IDF table are used instead; any error
+// on that path is returned. When the files load, a stop word failure is only
+// logged, while an IDF failure is returned.
+func newGseTool() (tool *gseTool, err error) {
+	dictDir := filepath.Join("gse", "dict", "zh")
+	// gse takes several dictionary files as one comma-separated argument.
+	dictFiles := filepath.Join(dictDir, "s_1.txt") + "," + filepath.Join(dictDir, "t_1.txt")
+	idfPath := filepath.Join(dictDir, "idf.txt")
+	stopPath := filepath.Join(dictDir, "stop_word.txt")
+
+	var seg gse.Segmenter
+	if err = seg.LoadDict(dictFiles); err != nil {
+		glog.Warning(context.Background(), "加载gse词典失败，尝试embed模式:", err)
+		// Start again from a clean segmenter so that a partially loaded file
+		// dictionary is not mixed with the embedded one.
+		seg = gse.Segmenter{}
+		if err = seg.LoadDictEmbed(); err != nil {
+			return nil, err
+		}
+		if err = seg.LoadStopEmbed(); err != nil {
+			return nil, err
+		}
+		return assembleGseTool(seg)
+	}
+
+	// Stop words are best-effort: a failure is logged and initialization goes on.
+	if stopErr := seg.LoadStop(stopPath); stopErr != nil {
+		glog.Warning(context.Background(), "加载停用词失败，继续:", stopErr)
+	}
+
+	return assembleGseTool(seg, idfPath)
+}
+
+// assembleGseTool attaches seg to a TF-IDF extractor and a TextRank extractor
+// and returns the resulting tool. idfFiles is forwarded to LoadIdf unchanged;
+// newGseTool passes no file on the embedded fallback path.
+func assembleGseTool(seg gse.Segmenter, idfFiles ...string) (*gseTool, error) {
+	tfidf := &extracker.TagExtracter{}
+	tfidf.WithGse(seg)
+	if err := tfidf.LoadIdf(idfFiles...); err != nil {
+		return nil, err
+	}
+
+	tr := &extracker.TextRanker{}
+	tr.WithGse(seg)
+
+	return &gseTool{seg: seg, tfidf: tfidf, tr: tr}, nil
+}
+
+// Cut 分词（关键词提取唯一正确模式：精确模式 + HMM）
+func (k *gseTool) Cut(text string) []string {
+	return k.seg.Cut(text, true)
+}
+
+// Keyword is one entry of the final output: a keyword together with its weight.
+type Keyword struct {
+	Word  string  `json:"word"`  // keyword text
+	Score float64 `json:"score"` // weight assigned to the keyword
+}
+
+// Extract runs both TF-IDF and TextRank extraction on text, each limited to
+// topN tags, and merges the two results into one keyword list.
+//
+// The tags are merged by word. When a word is returned by both extractors its
+// TextRank weight replaces the TF-IDF weight, because the TextRank tags are
+// written into the map last. The merged list can therefore hold more than
+// topN entries.
+//
+// The result is sorted by descending score. Entries with equal scores are
+// ordered by word so that the output does not depend on map iteration order.
+func (k *gseTool) Extract(text string, topN int) []Keyword {
+	tfTags := k.extractTFIDF(text, topN)
+	trTags := k.extractTextRank(text, topN)
+
+	scoreMap := make(map[string]float64, len(tfTags)+len(trTags))
+	for _, tag := range tfTags {
+		scoreMap[tag.Text] = tag.Weight
+	}
+	for _, tag := range trTags {
+		scoreMap[tag.Text] = tag.Weight
+	}
+
+	res := make([]Keyword, 0, len(scoreMap))
+	for word, score := range scoreMap {
+		res = append(res, Keyword{Word: word, Score: score})
+	}
+
+	sort.Slice(res, func(i, j int) bool {
+		if res[i].Score != res[j].Score {
+			return res[i].Score > res[j].Score
+		}
+		return res[i].Word < res[j].Word
+	})
+
+	return res
+}
+
+// extractTFIDF returns up to topN weighted tags for text from the TF-IDF
+// extractor. The original note lists article tags, search and keywords as its
+// typical uses.
+func (k *gseTool) extractTFIDF(text string, topN int) segment.Segments {
+	tags := k.tfidf.ExtractTags(text, topN)
+	return tags
+}
+
+// extractTextRank returns up to topN weighted tags for text from the TextRank
+// extractor. The original note suggests it for long texts, summaries and
+// semantic analysis.
+func (k *gseTool) extractTextRank(text string, topN int) segment.Segments {
+	ranked := k.tr.TextRank(text, topN)
+	return ranked
+}
				`@@ -0,0 +1 @@`
				`dict.txt 通过内部工具生成, Copyright 2017 ego authors. 商用和拷贝请注明来源和版权`