2026-04-03 11:14:44 +08:00
|
|
|
|
package eino
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"context"
|
2026-04-16 15:47:37 +08:00
|
|
|
|
"rag/consts/model"
|
2026-04-03 11:14:44 +08:00
|
|
|
|
|
|
|
|
|
|
"github.com/cloudwego/eino-ext/components/document/transformer/splitter/recursive"
|
|
|
|
|
|
"github.com/cloudwego/eino-ext/components/document/transformer/splitter/semantic"
|
|
|
|
|
|
"github.com/cloudwego/eino/schema"
|
|
|
|
|
|
"github.com/gogf/gf/v2/frame/g"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// SemanticSplitDocument 语义分割文档
|
2026-04-16 15:47:37 +08:00
|
|
|
|
func SemanticSplitDocument(ctx context.Context, docs []*schema.Document, vectorModel model.ModelConfigType) (res []*schema.Document, err error) {
|
2026-04-03 11:14:44 +08:00
|
|
|
|
// 默认分隔符(支持中英文)
|
|
|
|
|
|
separators := []string{"\n\n", "\n", "。", "!", "?", ";", ".", "!", "?", ";"}
|
|
|
|
|
|
// 读取配置,使用合理的默认值
|
|
|
|
|
|
bufferSize := g.Cfg().MustGet(ctx, "eino.splitter.bufferSize").Int()
|
|
|
|
|
|
minChunkSize := g.Cfg().MustGet(ctx, "eino.splitter.minChunkSize").Int()
|
|
|
|
|
|
percentile := g.Cfg().MustGet(ctx, "eino.splitter.percentile").Float64()
|
|
|
|
|
|
batchSize := g.Cfg().MustGet(ctx, "eino.splitter.batchSize").Int()
|
|
|
|
|
|
|
2026-04-16 15:47:37 +08:00
|
|
|
|
embedder, err := GetTenantEmbedderByType(ctx, vectorModel)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return nil, err
|
2026-04-03 11:14:44 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
splitter, err := semantic.NewSplitter(ctx, &semantic.Config{
|
2026-04-16 15:47:37 +08:00
|
|
|
|
Embedding: NewBatchEmbedder(embedder, batchSize),
|
2026-04-03 11:14:44 +08:00
|
|
|
|
BufferSize: bufferSize,
|
|
|
|
|
|
MinChunkSize: minChunkSize,
|
|
|
|
|
|
Percentile: percentile,
|
|
|
|
|
|
Separators: separators,
|
|
|
|
|
|
})
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
return splitter.Transform(ctx, docs)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// RecursiveSplitDocument 递归分割文档
|
|
|
|
|
|
func RecursiveSplitDocument(ctx context.Context, docs []*schema.Document) (res []*schema.Document, err error) {
|
|
|
|
|
|
// 默认分隔符(支持中英文)
|
|
|
|
|
|
separators := []string{"\n\n", "\n", "。", "!", "?", ";", ".", "!", "?", ";"}
|
|
|
|
|
|
splitter, err := recursive.NewSplitter(ctx, &recursive.Config{
|
|
|
|
|
|
ChunkSize: 512,
|
|
|
|
|
|
OverlapSize: 100,
|
|
|
|
|
|
KeepType: recursive.KeepTypeNone,
|
|
|
|
|
|
Separators: separators,
|
|
|
|
|
|
})
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
return splitter.Transform(ctx, docs)
|
|
|
|
|
|
}
|