feat: 添加文档处理API和配置更新
This commit is contained in:
10
config.yml
10
config.yml
@@ -62,9 +62,9 @@ jaeger:
|
|||||||
eino:
|
eino:
|
||||||
# 文件切分配置
|
# 文件切分配置
|
||||||
splitter:
|
splitter:
|
||||||
bufferSize: 1
|
bufferSize: 3 # 必须 >=3 才能识别上下文语义
|
||||||
minChunkSize: 64
|
minChunkSize: 1 # 避免切碎
|
||||||
percentile: 0.75
|
percentile: 0.75 # 保持不变
|
||||||
# 向量化配置
|
# 向量化配置
|
||||||
embedding:
|
embedding:
|
||||||
provider: "dashscope"
|
provider: "dashscope"
|
||||||
@@ -77,6 +77,10 @@ eino:
|
|||||||
provider: "dashscope"
|
provider: "dashscope"
|
||||||
apiKey: "sk-4a8b82770bf74bc490eb3e4c5a8e2be9"
|
apiKey: "sk-4a8b82770bf74bc490eb3e4c5a8e2be9"
|
||||||
model: "qwen-turbo"
|
model: "qwen-turbo"
|
||||||
|
rerank:
|
||||||
|
provider: "dashscope"
|
||||||
|
apiKey: "sk-4a8b82770bf74bc490eb3e4c5a8e2be9"
|
||||||
|
model: "qwen3-rerank"
|
||||||
|
|
||||||
# 文件上传服务地址,与oss模块minio中的endpoint一致
|
# 文件上传服务地址,与oss模块minio中的endpoint一致
|
||||||
filePrefix: "http://116.204.74.41:9000"
|
filePrefix: "http://116.204.74.41:9000"
|
||||||
|
|||||||
@@ -51,3 +51,18 @@ func (c *document) DocumentVector(ctx context.Context, req *dto.DocumentVectorRe
|
|||||||
err = service.Document.Vector(ctx, req)
|
err = service.Document.Vector(ctx, req)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// VectorSemanticSplit is the controller handler for dto.VectorSemanticSplitReq.
// It forwards ctx and req unchanged to service.Document.VectorSemanticSplit and
// returns whatever error that call produces. res is never assigned here, so the
// caller always receives a nil *beans.ResponseEmpty.
func (c *document) VectorSemanticSplit(ctx context.Context, req *dto.VectorSemanticSplitReq) (res *beans.ResponseEmpty, err error) {
|
||||||
|
err = service.Document.VectorSemanticSplit(ctx, req)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// SearchRecursiveSplit is the controller handler for dto.SearchRecursiveSplitReq.
// It forwards ctx and req unchanged to service.Document.SearchRecursiveSplit and
// returns whatever error that call produces. res is never assigned here, so the
// caller always receives a nil *beans.ResponseEmpty.
func (c *document) SearchRecursiveSplit(ctx context.Context, req *dto.SearchRecursiveSplitReq) (res *beans.ResponseEmpty, err error) {
|
||||||
|
err = service.Document.SearchRecursiveSplit(ctx, req)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// KeywordExtract is the controller handler for dto.KeywordExtractReq.
// It forwards ctx and req unchanged to service.Document.KeywordExtract and
// returns whatever error that call produces. res is never assigned here, so the
// caller always receives a nil *beans.ResponseEmpty.
func (c *document) KeywordExtract(ctx context.Context, req *dto.KeywordExtractReq) (res *beans.ResponseEmpty, err error) {
|
||||||
|
err = service.Document.KeywordExtract(ctx, req)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|||||||
2
go.mod
2
go.mod
@@ -3,7 +3,7 @@ module rag
|
|||||||
go 1.26.0
|
go 1.26.0
|
||||||
|
|
||||||
require (
|
require (
|
||||||
gitea.com/red-future/common v0.0.15
|
gitea.com/red-future/common v0.0.18
|
||||||
github.com/bjang03/gmq v0.0.1
|
github.com/bjang03/gmq v0.0.1
|
||||||
github.com/cloudwego/eino v0.8.6
|
github.com/cloudwego/eino v0.8.6
|
||||||
github.com/cloudwego/eino-ext/components/document/loader/file v0.0.0-20260416081055-0ebab92e14f2
|
github.com/cloudwego/eino-ext/components/document/loader/file v0.0.0-20260416081055-0ebab92e14f2
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -9,6 +9,8 @@ entgo.io/ent v0.14.3 h1:wokAV/kIlH9TeklJWGGS7AYJdVckr0DloWjIcO9iIIQ=
|
|||||||
entgo.io/ent v0.14.3/go.mod h1:aDPE/OziPEu8+OWbzy4UlvWmD2/kbRuWfK2A40hcxJM=
|
entgo.io/ent v0.14.3/go.mod h1:aDPE/OziPEu8+OWbzy4UlvWmD2/kbRuWfK2A40hcxJM=
|
||||||
gitea.com/red-future/common v0.0.15 h1:PcjjS7TpQHSlyGmfgWquxCoSWh1KMCu3DyXIhAgvvfg=
|
gitea.com/red-future/common v0.0.15 h1:PcjjS7TpQHSlyGmfgWquxCoSWh1KMCu3DyXIhAgvvfg=
|
||||||
gitea.com/red-future/common v0.0.15/go.mod h1:+El06tJ0E4SkWuWLLtP7t94CjG7Vqi8k1ladjWUvQx8=
|
gitea.com/red-future/common v0.0.15/go.mod h1:+El06tJ0E4SkWuWLLtP7t94CjG7Vqi8k1ladjWUvQx8=
|
||||||
|
gitea.com/red-future/common v0.0.18 h1:RwpnnWmDTCnFtKfmlp9BOnDd4r9eUnx7YT6Zst3VJqY=
|
||||||
|
gitea.com/red-future/common v0.0.18/go.mod h1:6/nqIucVzmjOyqDTIq71feYBXXFNBy0rFwzaQ0/Ueoo=
|
||||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||||
github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk=
|
github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk=
|
||||||
github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
||||||
|
|||||||
@@ -102,3 +102,21 @@ type DocumentVectorRPC struct {
|
|||||||
ContentHash string `json:"contentHash" dc:"内容hash"`
|
ContentHash string `json:"contentHash" dc:"内容hash"`
|
||||||
Vector pgvector.Vector `json:"vector" dc:"向量"`
|
Vector pgvector.Vector `json:"vector" dc:"向量"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// VectorSemanticSplitReq is the request payload for the vector-generation
// endpoint. Its g.Meta tag declares the route as POST /vectorSemanticSplit.
// Id is tagged as required by the validation rule in its `v` tag.
// NOTE(review): Id presumably identifies the document to process — confirm against the service layer.
type VectorSemanticSplitReq struct {
|
||||||
|
g.Meta `path:"/vectorSemanticSplit" method:"post" tags:"文件管理" summary:"向量化生成" dc:"向量化生成"`
|
||||||
|
|
||||||
|
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SearchRecursiveSplitReq is the request payload for the full-text-search
// generation endpoint. Its g.Meta tag declares the route as
// POST /searchRecursiveSplit. Id is tagged as required by the validation rule
// in its `v` tag.
// NOTE(review): Id presumably identifies the document to process — confirm against the service layer.
type SearchRecursiveSplitReq struct {
|
||||||
|
g.Meta `path:"/searchRecursiveSplit" method:"post" tags:"文件管理" summary:"全文检索生成" dc:"全文检索生成"`
|
||||||
|
|
||||||
|
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// KeywordExtractReq is the request payload for the keyword-extraction endpoint.
// Its g.Meta tag declares the route as POST /keywordExtract. Id is tagged as
// required by the validation rule in its `v` tag.
// NOTE(review): Id presumably identifies the document to process — confirm against the service layer.
type KeywordExtractReq struct {
|
||||||
|
g.Meta `path:"/keywordExtract" method:"post" tags:"文件管理" summary:"关键词提取" dc:"关键词提取"`
|
||||||
|
|
||||||
|
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||||
|
}
|
||||||
|
|||||||
@@ -80,13 +80,16 @@ func (s *documentService) Create(ctx context.Context, req *dto.CreateDocumentReq
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
res = &dto.CreateDocumentRes{Id: id}
|
res = &dto.CreateDocumentRes{Id: id}
|
||||||
// 写入任务进度待处理 任务类型为文档解析
|
// 写入任务进度已完成 任务类型为文档解析
|
||||||
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||||
TaskId: id,
|
TaskId: id,
|
||||||
TaskType: task.TaskTypeDocParse,
|
TaskType: task.TaskTypeDocParse,
|
||||||
Status: task.TaskStatusPending,
|
Status: task.TaskStatusCompleted,
|
||||||
Remark: "文档上传成功待解析: " + req.Title,
|
Remark: "文档上传完成",
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
return
|
return
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -171,8 +174,7 @@ func (s *documentService) List(ctx context.Context, req *dto.ListDocumentReq) (r
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Vector 处理文件(使用eino框架切分和向量化)
|
func (s *documentService) VectorSemanticSplit(ctx context.Context, req *dto.VectorSemanticSplitReq) (err error) {
|
||||||
func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq) (err error) {
|
|
||||||
// 1. 查询文件信息
|
// 1. 查询文件信息
|
||||||
documentReq := dto.GetDocumentReq{Id: req.Id}
|
documentReq := dto.GetDocumentReq{Id: req.Id}
|
||||||
doc, err := dao.Document.Get(ctx, &documentReq)
|
doc, err := dao.Document.Get(ctx, &documentReq)
|
||||||
@@ -182,8 +184,56 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq
|
|||||||
if g.IsEmpty(doc) {
|
if g.IsEmpty(doc) {
|
||||||
return errors.New("document not found")
|
return errors.New("document not found")
|
||||||
}
|
}
|
||||||
|
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||||
|
TaskId: req.Id,
|
||||||
|
TaskType: task.TaskTypeGenerateVector,
|
||||||
|
Status: task.TaskStatusRunning,
|
||||||
|
Remark: "向量化执行中",
|
||||||
|
})
|
||||||
|
return s.semanticSplitDocument(ctx, doc)
|
||||||
|
}
|
||||||
|
|
||||||
// 2. 更新文档状态为处理中
|
// SearchRecursiveSplit runs the recursive-split step for one document.
//
// It loads the document identified by req.Id, records a "running" task-progress
// entry of type task.TaskTypeFullTextSearch, and then hands the document to
// s.recursiveSplitDocument.
//
// It returns the lookup error, a "document not found" error when the lookup
// yields an empty result, the error from writing the task progress, or the
// error returned by s.recursiveSplitDocument.
func (s *documentService) SearchRecursiveSplit(ctx context.Context, req *dto.SearchRecursiveSplitReq) (err error) {
|
||||||
|
// 1. Look up the document record.
|
||||||
|
documentReq := dto.GetDocumentReq{Id: req.Id}
|
||||||
|
doc, err := dao.Document.Get(ctx, &documentReq)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if g.IsEmpty(doc) {
|
||||||
|
return errors.New("document not found")
|
||||||
|
}
|
||||||
|
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||||
|
TaskId: req.Id,
|
||||||
|
|
||||||
|
TaskType: task.TaskTypeFullTextSearch,
|
||||||
|
|
||||||
|
Status: task.TaskStatusRunning,
|
||||||
|
|
||||||
|
Remark: "全文检索执行中",
|
||||||
|
|
||||||
|
})
|
||||||
|
// Previously this error was overwritten by the return below and never
|
||||||
|
// surfaced; stop here so a failed progress write is reported, as Create does.
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return s.recursiveSplitDocument(ctx, doc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// KeywordExtract runs the keyword-extraction step for one document.
//
// It loads the document identified by req.Id, records a "running" task-progress
// entry of type task.TaskTypeExtractKeywords, and then hands the document to
// s.extractDocument.
//
// It returns the lookup error, a "document not found" error when the lookup
// yields an empty result, the error from writing the task progress, or the
// error returned by s.extractDocument.
func (s *documentService) KeywordExtract(ctx context.Context, req *dto.KeywordExtractReq) (err error) {
|
||||||
|
// 1. Look up the document record.
|
||||||
|
documentReq := dto.GetDocumentReq{Id: req.Id}
|
||||||
|
doc, err := dao.Document.Get(ctx, &documentReq)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if g.IsEmpty(doc) {
|
||||||
|
return errors.New("document not found")
|
||||||
|
}
|
||||||
|
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||||
|
TaskId: req.Id,
|
||||||
|
|
||||||
|
TaskType: task.TaskTypeExtractKeywords,
|
||||||
|
|
||||||
|
Status: task.TaskStatusRunning,
|
||||||
|
|
||||||
|
Remark: "提取关键词执行中",
|
||||||
|
|
||||||
|
})
|
||||||
|
// Previously this error was overwritten by the return below and never
|
||||||
|
// surfaced; stop here so a failed progress write is reported, as Create does.
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return s.extractDocument(ctx, doc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Vector 处理文件(使用eino框架切分和向量化)
|
||||||
|
func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq) (err error) {
|
||||||
|
// 更新文档状态为处理中
|
||||||
updateDocumentReq := new(dto.UpdateDocumentReq)
|
updateDocumentReq := new(dto.UpdateDocumentReq)
|
||||||
updateDocumentReq.Id = req.Id
|
updateDocumentReq.Id = req.Id
|
||||||
updateDocumentReq.VectorStatus = document.VectorStatusProcessing.Code()
|
updateDocumentReq.VectorStatus = document.VectorStatusProcessing.Code()
|
||||||
@@ -197,16 +247,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq
|
|||||||
})
|
})
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// 写入任务进度进行中 任务类型为文档解析
|
|
||||||
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
|
||||||
TaskId: req.Id,
|
|
||||||
TaskType: task.TaskTypeDocParse,
|
|
||||||
Status: task.TaskStatusRunning,
|
|
||||||
Remark: "文档解析开始",
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
user, err := utils.GetUserInfo(ctx)
|
user, err := utils.GetUserInfo(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -217,7 +258,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq
|
|||||||
// 任务1: 语义 切分文档
|
// 任务1: 语义 切分文档
|
||||||
grpool.Add(taskCtx, func(ctx context.Context) {
|
grpool.Add(taskCtx, func(ctx context.Context) {
|
||||||
g.TryCatch(ctx, func(ctx context.Context) {
|
g.TryCatch(ctx, func(ctx context.Context) {
|
||||||
if innerErr := s.semanticSplitDocument(ctx, doc); innerErr != nil {
|
if innerErr := s.VectorSemanticSplit(ctx, &dto.VectorSemanticSplitReq{Id: req.Id}); innerErr != nil {
|
||||||
cancel()
|
cancel()
|
||||||
}
|
}
|
||||||
}, func(ctx context.Context, err error) {
|
}, func(ctx context.Context, err error) {
|
||||||
@@ -228,7 +269,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq
|
|||||||
// 任务2: 递归 切分文档
|
// 任务2: 递归 切分文档
|
||||||
grpool.Add(taskCtx, func(ctx context.Context) {
|
grpool.Add(taskCtx, func(ctx context.Context) {
|
||||||
g.TryCatch(ctx, func(ctx context.Context) {
|
g.TryCatch(ctx, func(ctx context.Context) {
|
||||||
if innerErr := s.recursiveSplitDocument(ctx, doc); innerErr != nil {
|
if innerErr := s.SearchRecursiveSplit(ctx, &dto.SearchRecursiveSplitReq{Id: req.Id}); innerErr != nil {
|
||||||
cancel()
|
cancel()
|
||||||
}
|
}
|
||||||
}, func(ctx context.Context, err error) {
|
}, func(ctx context.Context, err error) {
|
||||||
@@ -239,7 +280,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq
|
|||||||
// 任务3: 提取文档
|
// 任务3: 提取文档
|
||||||
grpool.Add(taskCtx, func(ctx context.Context) {
|
grpool.Add(taskCtx, func(ctx context.Context) {
|
||||||
g.TryCatch(ctx, func(ctx context.Context) {
|
g.TryCatch(ctx, func(ctx context.Context) {
|
||||||
if innerErr := s.extractDocument(ctx, doc); innerErr != nil {
|
if innerErr := s.KeywordExtract(ctx, &dto.KeywordExtractReq{Id: req.Id}); innerErr != nil {
|
||||||
cancel()
|
cancel()
|
||||||
}
|
}
|
||||||
}, func(ctx context.Context, err error) {
|
}, func(ctx context.Context, err error) {
|
||||||
|
|||||||
Reference in New Issue
Block a user