fix: GSE数据文件从gse/dict目录加载

2026-04-21 10:24:47 +08:00
parent f671096dbe
commit e051046f77
12 changed files with 1886750 additions and 47 deletions
--- a/gse/dict/README.md
+++ b/gse/dict/README.md
@@ -0,0 +1,15 @@
+Some dict/zh data is from [github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)
+
+Updated on 2023-11-16:
+
+Added two new dict documents, which come from [github.com/GuocaiL/nlp_corpus](https://github.com/GuocaiL/nlp_corpus)
+
+Generated from `nlp_corpus/open_ner_data/boson/boson.txt`, `open_ner_data/people_daily/people_daily_ner.txt`, `open_ner_data/tianchi_yiyao/train.txt`, and `open_ner_data/ResumeNER/dev.txt`.
+
+1. tf_idf.txt
+
+The first column of this document is the term, the second column is the word frequency of the corresponding term, and the third column is the inverse document frequency of the corresponding term.
+
+2. tf_idf_origin.txt
+
+The original corpus text.
--- a/gse/dict/en/dict.txt
+++ b/gse/dict/en/dict.txt
--- a/gse/dict/jp/README.md
+++ b/gse/dict/jp/README.md
@@ -0,0 +1 @@
+dict.txt 通过内部工具生成, Copyright 2017 ego authors. 商用和拷贝请注明来源和版权
--- a/gse/dict/jp/dict.txt
+++ b/gse/dict/jp/dict.txt
--- a/gse/dict/zh/idf.txt
+++ b/gse/dict/zh/idf.txt
--- a/gse/dict/zh/s_1.txt
+++ b/gse/dict/zh/s_1.txt
--- a/gse/dict/zh/stop_tokens.txt
+++ b/gse/dict/zh/stop_tokens.txt
--- a/gse/dict/zh/stop_word.txt
+++ b/gse/dict/zh/stop_word.txt
@@ -0,0 +1,88 @@
+,
+.
+?
+!
+"
+@
+，
+。
+、
+？
+！
+：
+“
+”
+；
+　
+（
+）
+《
+》
+~
+*
+<
+>
+/
+\
+|
+-
+_
+
+=
+&
+^
+%
+#
+`
+;
+$
+￥
+‘
+’
+〉
+〈
+…
+＞
+＜
+＠
+＃
+＄
+％
+︿
+＆
+＊
+＋
+～
+｜
+［
+］
+｛
+｝
+啊
+阿
+哎
+哎呀
+哎哟
+唉
+俺
+俺们
+按
+按照
+吧
+吧哒
+把
+罢了
+被
+本
+本着
+比
+比方
+比如
+鄙人
+彼
+彼此
+边
+别
+别的
+别说
+并
--- a/gse/dict/zh/t_1.txt
+++ b/gse/dict/zh/t_1.txt
--- a/gse/dict/zh/tf_idf.txt
+++ b/gse/dict/zh/tf_idf.txt
--- a/gse/dict/zh/tf_idf_origin.txt
+++ b/gse/dict/zh/tf_idf_origin.txt
--- a/utils/gse.go
+++ b/utils/gse.go
@@ -2,7 +2,6 @@ package utils

 import (
 	"context"
-	"os"
 	"path/filepath"
 	"sort"
 	"sync"
@@ -38,78 +37,68 @@ type gseTool struct {
 	tr    *extracker.TextRanker
 }

-// newGseTool 初始化工具（内置词典 + 停用词）
+// newGseTool 初始化工具（使用外部数据文件）
 func newGseTool() (tool *gseTool, err error) {
 	// 1. 初始化分词器
 	var seg gse.Segmenter

-	// 获取GSE数据文件路径
-	gseDataPath := os.Getenv("GSE_DATA_PATH")
+	// gse数据文件在可执行文件同级的gse/dict目录下
+	gseDataPath := "gse"
+	dictPath := filepath.Join(gseDataPath, "dict", "zh")
+	idfPath := filepath.Join(dictPath, "idf.txt")
+	stopPath := filepath.Join(dictPath, "stop.txt")

-	if gseDataPath != "" {
-		// 使用外部数据文件
-		dictPath := filepath.Join(gseDataPath, "dict", "zh")
-		idfPath := filepath.Join(gseDataPath, "dict", "zh", "idf.txt")
-		stopPath := filepath.Join(gseDataPath, "dict", "zh", "stop.txt")
-
-		// 加载词典
-		err = seg.LoadDict(filepath.Join(dictPath, "dict.txt"))
-		if err != nil {
-			return
-		}
-
-		// 加载停用词
-		err = seg.LoadStop(stopPath)
-		if err != nil {
-			glog.Warning(context.Background(), "加载停用词失败，继续:", err)
-		}
-
-		// 2. 初始化 TF-IDF 提取器
-		tfidf := &extracker.TagExtracter{}
-		tfidf.WithGse(seg)
-		err = tfidf.LoadIdf(idfPath)
-		if err != nil {
-			return
-		}
-
-		// 3. 初始化 TextRank 提取器
-		tr := &extracker.TextRanker{}
-		tr.WithGse(seg)
-
-		tool = &gseTool{
-			seg:   seg,
-			tfidf: tfidf,
-			tr:    tr,
-		}
-	} else {
-		// 使用内置embed数据
+	// 加载词典
+	err = seg.LoadDict(filepath.Join(dictPath, "dict.txt"))
+	if err != nil {
+		glog.Warning(context.Background(), "加载gse词典失败，尝试embed模式:", err)
+		// 回退到embed模式
 		err = seg.LoadDictEmbed()
 		if err != nil {
 			return
 		}
-		// 内置停用词（v1.0.2 标准）
 		err = seg.LoadStopEmbed()
 		if err != nil {
 			return
 		}
-
-		// 2. 初始化 TF-IDF 提取器
 		tfidf := &extracker.TagExtracter{}
 		tfidf.WithGse(seg)
 		err = tfidf.LoadIdf()
 		if err != nil {
 			return
 		}
-
-		// 3. 初始化 TextRank 提取器
 		tr := &extracker.TextRanker{}
 		tr.WithGse(seg)
-
 		tool = &gseTool{
 			seg:   seg,
 			tfidf: tfidf,
 			tr:    tr,
 		}
+		return
+	}
+
+	// 加载停用词
+	err = seg.LoadStop(stopPath)
+	if err != nil {
+		glog.Warning(context.Background(), "加载停用词失败，继续:", err)
+	}
+
+	// 2. 初始化 TF-IDF 提取器
+	tfidf := &extracker.TagExtracter{}
+	tfidf.WithGse(seg)
+	err = tfidf.LoadIdf(idfPath)
+	if err != nil {
+		return
+	}
+
+	// 3. 初始化 TextRank 提取器
+	tr := &extracker.TextRanker{}
+	tr.WithGse(seg)
+
+	tool = &gseTool{
+		seg:   seg,
+		tfidf: tfidf,
+		tr:    tr,
 	}
 	return
 }
				`@@ -0,0 +1 @@`
				`dict.txt 通过内部工具生成, Copyright 2017 ego authors. 商用和拷贝请注明来源和版权`