From f85314f119329d1e677d6c59bcd43bcf28d7adf8 Mon Sep 17 00:00:00 2001
From: qhd <1766646056@qq.com>
Date: Sat, 28 Mar 2026 11:10:02 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20=E4=BC=98=E5=8C=96=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E5=BA=93=E6=9F=A5=E8=AF=A2=E6=9E=84=E5=BB=BA=E9=93=BE?=
 =?UTF-8?q?=E5=BC=8F=E8=B0=83=E7=94=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 db/gfdb/gfdb.go               |   2 +-
 rag/eino/document_semantic.go | 105 ++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 rag/eino/document_semantic.go

diff --git a/db/gfdb/gfdb.go b/db/gfdb/gfdb.go
index f3cfe6f..d3c0aaa 100644
--- a/db/gfdb/gfdb.go
+++ b/db/gfdb/gfdb.go
@@ -440,7 +440,7 @@ func (d *dataBase) Model(ctx context.Context, tableNameOrStruct ...any) *model {
 		m.Sharding(shardingConfig).ShardingValue(user.TenantId)
 	}
 
-	m.OmitNil().OmitEmpty().Hook(catchSQLHook())
+	m.OmitNil().Hook(catchSQLHook())
 	return &model{
 		Model: m,
 	}
diff --git a/rag/eino/document_semantic.go b/rag/eino/document_semantic.go
new file mode 100644
index 0000000..1fa53bd
--- /dev/null
+++ b/rag/eino/document_semantic.go
@@ -0,0 +1,105 @@
+package eino
+
+import (
+	"context"
+	"sync"
+
+	"github.com/cloudwego/eino-ext/components/document/transformer/splitter/recursive"
+	"github.com/cloudwego/eino-ext/components/document/transformer/splitter/semantic"
+	"github.com/cloudwego/eino/components/document"
+	"github.com/cloudwego/eino/schema"
+	"github.com/gogf/gf/v2/frame/g"
+)
+
+// Each splitting strategy keeps its own lazily built transformer. Sharing one
+// variable between them would let whichever function runs first decide the
+// strategy used by both. splitterMu serialises the lazy initialisation.
+var (
+	splitterMu        sync.Mutex
+	semanticSplitter  document.Transformer
+	recursiveSplitter document.Transformer
+)
+
+// SemanticSplitDocument splits docs with the embedding-based semantic splitter,
+// building and caching that splitter on first use. When construction fails the
+// error is returned and nothing is cached, so the next call tries again.
+func SemanticSplitDocument(ctx context.Context, docs []*schema.Document) (res []*schema.Document, err error) {
+	s, err := loadSemanticSplitter(ctx)
+	if err != nil {
+		return nil, err
+	}
+	return s.Transform(ctx, docs)
+}
+
+// loadSemanticSplitter returns the cached semantic splitter, creating it from
+// the eino.splitter.* configuration keys if it does not exist yet.
+func loadSemanticSplitter(ctx context.Context) (document.Transformer, error) {
+	splitterMu.Lock()
+	defer splitterMu.Unlock()
+	if semanticSplitter != nil {
+		return semanticSplitter, nil
+	}
+
+	// Default separators.
+	// NOTE(review): "!", "?" and ";" are listed twice; the first trio was
+	// presumably meant to be the full-width Chinese forms - confirm.
+	separators := []string{"\n\n", "\n", "。", "!", "?", ";", ".", "!", "?", ";"}
+	// Only batchSize gets a fallback here; bufferSize and percentile are
+	// forwarded exactly as read from configuration.
+	bufferSize := g.Cfg().MustGet(ctx, "eino.splitter.bufferSize").Int()
+	percentile := g.Cfg().MustGet(ctx, "eino.splitter.percentile").Float64()
+	batchSize := g.Cfg().MustGet(ctx, "eino.splitter.batchSize").Int()
+	if batchSize <= 0 {
+		batchSize = 10 // doubao-embedding-vision limits each batch to at most 10 items
+	}
+
+	// Wrap the embedder so inputs are embedded in batches of batchSize.
+	batchEmbedder := NewBatchEmbedder(Embedder, batchSize)
+
+	s, err := semantic.NewSplitter(ctx, &semantic.Config{
+		Embedding:  batchEmbedder,
+		BufferSize: bufferSize,
+		Percentile: percentile,
+		Separators: separators,
+	})
+	if err != nil {
+		return nil, err
+	}
+	semanticSplitter = s
+	return s, nil
+}
+
+// RecursiveSplitDocument splits docs with the recursive splitter (chunk size
+// 1500, overlap 300), building and caching it on first use. It keeps its own
+// cached splitter, independent of SemanticSplitDocument.
+func RecursiveSplitDocument(ctx context.Context, docs []*schema.Document) (res []*schema.Document, err error) {
+	s, err := loadRecursiveSplitter(ctx)
+	if err != nil {
+		return nil, err
+	}
+	return s.Transform(ctx, docs)
+}
+
+// loadRecursiveSplitter returns the cached recursive splitter, creating it
+// with the fixed chunking parameters below if it does not exist yet.
+func loadRecursiveSplitter(ctx context.Context) (document.Transformer, error) {
+	splitterMu.Lock()
+	defer splitterMu.Unlock()
+	if recursiveSplitter != nil {
+		return recursiveSplitter, nil
+	}
+
+	// Same separator list as the semantic splitter.
+	separators := []string{"\n\n", "\n", "。", "!", "?", ";", ".", "!", "?", ";"}
+	s, err := recursive.NewSplitter(ctx, &recursive.Config{
+		ChunkSize:   1500,
+		OverlapSize: 300,
+		KeepType:    recursive.KeepTypeNone,
+		Separators:  separators,
+	})
+	if err != nil {
+		return nil, err
+	}
+	recursiveSplitter = s
+	return s, nil
+}