feat: 集成Eino文档解析与嵌入功能

新增Eino相关依赖，支持docx、pdf、xlsx等格式的文档加载与解析，并集成了Dashscope嵌入模型。同时修复了部分DAO查询中的OmitEmpty配置。
This commit is contained in:
2026-03-28 18:24:15 +08:00
parent f85314f119
commit bcbe6eba78
8 changed files with 414 additions and 197 deletions

View File

@@ -0,0 +1,51 @@
package eino
import (
"context"
"fmt"
"gitea.com/red-future/common/utils"
"github.com/cloudwego/eino-ext/components/document/loader/url"
"github.com/cloudwego/eino-ext/components/document/parser/docx"
"github.com/cloudwego/eino-ext/components/document/parser/pdf"
"github.com/cloudwego/eino-ext/components/document/parser/xlsx"
"github.com/cloudwego/eino/components/document"
"github.com/cloudwego/eino/components/document/parser"
"github.com/cloudwego/eino/schema"
)
// LoadDocument fetches a file by URL and parses it into eino documents.
//
// A parser for fileFormat is obtained from docsParser and handed to an eino
// URL loader. The URI that gets loaded is the prefix returned by
// utils.GetFileAddressPrefix immediately followed by filePath; no separator is
// inserted between the two.
//
// It returns the parsed documents, or a nil slice and a wrapped error when the
// parser or loader cannot be created, the address prefix lookup fails, or the
// load itself fails.
func LoadDocument(ctx context.Context, filePath, fileFormat string) (docs []*schema.Document, err error) {
	p, err := docsParser(ctx, fileFormat)
	if err != nil {
		return nil, fmt.Errorf("create %q parser: %w", fileFormat, err)
	}

	loader, err := url.NewLoader(ctx, &url.LoaderConfig{
		Parser: p,
	})
	if err != nil {
		// Must be checked here: the next assignment to err would otherwise
		// overwrite it and a failed loader would be used below.
		return nil, fmt.Errorf("create url loader: %w", err)
	}

	addrPrefix, err := utils.GetFileAddressPrefix(ctx)
	if err != nil {
		return nil, fmt.Errorf("get file address prefix: %w", err)
	}

	// Pass the caller's ctx so cancellation and deadlines reach the download.
	docs, err = loader.Load(ctx, document.Source{
		URI: fmt.Sprintf("%s%s", addrPrefix, filePath),
	})
	if err != nil {
		return nil, fmt.Errorf("load document %q: %w", filePath, err)
	}
	return docs, nil
}
// docsParser builds the eino document parser that matches fileFormat.
//
// Recognised formats are "docx", "pdf" and "xlsx" (compared case-sensitively).
// The docx parser is configured to split into sections and to include headers,
// footers and tables; the pdf and xlsx parsers use an empty config.
//
// For any other format both results are nil.
// NOTE(review): how the caller's loader treats a nil parser is not visible
// here — confirm that the fallback is intended.
func docsParser(ctx context.Context, fileFormat string) (p parser.Parser, err error) {
	switch fileFormat {
	case "docx":
		docxCfg := &docx.Config{
			ToSections:     true,
			IncludeHeaders: true,
			IncludeFooters: true,
			IncludeTables:  true,
		}
		p, err = docx.NewDocxParser(ctx, docxCfg)
		return p, err
	case "pdf":
		p, err = pdf.NewPDFParser(ctx, &pdf.Config{})
		return p, err
	case "xlsx":
		p, err = xlsx.NewXlsxParser(ctx, &xlsx.Config{})
		return p, err
	default:
		// Unknown format: no parser and no error.
		return p, err
	}
}

46
rag/eino/embedding.go Normal file
View File

@@ -0,0 +1,46 @@
package eino
import (
	"context"
	"errors"

	"github.com/cloudwego/eino-ext/components/embedding/dashscope"
	"github.com/gogf/gf/v2/frame/g"
	"github.com/golang/glog"
)
// Embedder is the package-level DashScope embedder, exported so that other
// modules can use it. It is assigned once by init and stays nil when the
// "eino.embedding" configuration entry is empty.
var Embedder *dashscope.Embedder
// init runs once at program start-up and builds the package-level Embedder.
//
// When the "eino.embedding" configuration entry is empty nothing is done and
// Embedder stays nil. Otherwise a DashScope embedder is created from
// "eino.embedding.apiKey" and "eino.embedding.model"; if creation fails the
// error is reported through glog.Fatalf.
func init() {
	ctx := context.Background()
	if g.Cfg().MustGet(ctx, "eino.embedding").IsEmpty() {
		return
	}

	cfg := &dashscope.EmbeddingConfig{
		APIKey: g.Cfg().MustGet(ctx, "eino.embedding.apiKey").String(),
		Model:  g.Cfg().MustGet(ctx, "eino.embedding.model").String(),
	}
	// APIType is currently not set. Draft for reading it from configuration
	// (supports "text_api" and "multi_modal_api"):
	//if apiType := g.Cfg().MustGet(ctx, "eino.embedding.apiType").String(); apiType != "" {
	//	apiTypeVal := dashscope.APIType(apiType)
	//	cfg.APIType = &apiTypeVal
	//}

	var err error
	Embedder, err = dashscope.NewEmbedder(ctx, cfg)
	if err != nil {
		// The message names dashscope: that is the embedder being built here.
		glog.Fatalf("NewEmbedder of dashscope error: %v", err)
	}
}
// ErrEmbedderNotInitialized is returned by EmbedStrings when the package-level
// Embedder is nil.
var ErrEmbedderNotInitialized = errors.New("eino: embedder is not initialized")

// EmbedStrings embeds texts with the package-level Embedder and returns what
// that embedder returns.
//
// init leaves Embedder nil when the "eino.embedding" configuration entry is
// empty; in that case ErrEmbedderNotInitialized is returned instead of calling
// a method on the nil embedder.
func EmbedStrings(ctx context.Context, texts []string) (embeddings [][]float64, err error) {
	if Embedder == nil {
		return nil, ErrEmbedderNotInitialized
	}
	return Embedder.EmbedStrings(ctx, texts)
}

View File

@@ -0,0 +1,47 @@
package eino
import (
"context"
"github.com/cloudwego/eino/components/embedding"
)
// BatchEmbedder wraps an embedding.Embedder and limits how many texts are
// passed to it in a single call.
type BatchEmbedder struct {
// embedder is the wrapped embedder that receives each batch.
embedder embedding.Embedder
// batchSize is the maximum number of texts sent per call to embedder.
batchSize int
}
// NewBatchEmbedder returns a BatchEmbedder that forwards to embedder in
// batches of at most batchSize texts. A batchSize of zero or less selects the
// default of 10 texts per batch.
func NewBatchEmbedder(embedder embedding.Embedder, batchSize int) *BatchEmbedder {
	wrapped := &BatchEmbedder{embedder: embedder, batchSize: batchSize}
	if wrapped.batchSize <= 0 {
		wrapped.batchSize = 10 // default: 10 texts per batch
	}
	return wrapped
}
// EmbedStrings embeds texts by calling the wrapped embedder in consecutive
// batches of at most batchSize texts and concatenating the results in input
// order. opts are forwarded unchanged to every underlying call.
//
// When texts fits in a single batch the wrapped embedder is called exactly
// once and its result is returned as is. If any batch fails, that error is
// returned together with a nil result; embeddings from earlier batches are
// discarded.
func (b *BatchEmbedder) EmbedStrings(ctx context.Context, texts []string, opts ...embedding.Option) ([][]float64, error) {
	size := b.batchSize
	if size <= 0 {
		// A BatchEmbedder not built by NewBatchEmbedder can carry a zero
		// batch size, which would make the loop below step by 0 forever.
		// Fall back to the same default the constructor uses.
		size = 10
	}

	if len(texts) <= size {
		return b.embedder.EmbedStrings(ctx, texts, opts...)
	}

	// Capacity hint only; presumably the embedder yields one vector per text.
	all := make([][]float64, 0, len(texts))
	for start := 0; start < len(texts); start += size {
		end := start + size
		if end > len(texts) {
			end = len(texts)
		}
		vectors, err := b.embedder.EmbedStrings(ctx, texts[start:end], opts...)
		if err != nil {
			return nil, err
		}
		all = append(all, vectors...)
	}
	return all, nil
}