refactor: 重构文档处理流程和任务管理

This commit is contained in:
2026-04-09 09:11:43 +08:00
parent b6896f3fb4
commit 7f894745e9
34 changed files with 1216 additions and 1056 deletions

View File

@@ -49,6 +49,7 @@ func (d *datasetIndexDao) InsertIndex(ctx context.Context, indexName string) (er
CREATE INDEX IF NOT EXISTS %s
ON %s
USING ivfflat (vector vector_cosine_ops)
WITH (lists = 100)
WHERE vector IS NOT NULL;
`, indexName, gfdb.TablePrefix+public.TableNameDocumentChunk)
_, err = db.Exec(ctx, sqlStr)

View File

@@ -2,12 +2,17 @@ package dao
import (
"context"
"fmt"
"rag/consts/public"
"rag/model/dto"
"rag/model/entity"
"gitea.com/red-future/common/db/gfdb"
"gitea.com/red-future/common/full-text-search/meilisearch"
"github.com/gogf/gf/v2/database/gdb"
"github.com/gogf/gf/v2/text/gstr"
"github.com/gogf/gf/v2/util/gconv"
"github.com/pgvector/pgvector-go"
)
// DocumentChunk is the package-level documentChunkDao instance used to call
// the document chunk data-access methods.
var DocumentChunk = new(documentChunkDao)
@@ -55,3 +60,56 @@ func (d *documentChunkDao) List(ctx context.Context, req *dto.ListDocumentChunkR
err = r.Structs(&res)
return
}
// GetAllByVector returns up to topK chunks belonging to the given datasets,
// ordered by ascending vector distance to queryVec.
//
// Each returned row carries id, content, dataset_id, document_id and the
// computed distance. Chunks whose vector is NULL are excluded. An empty
// datasetId or a non-positive topK yields an empty list without querying the
// database.
func (d *documentChunkDao) GetAllByVector(ctx context.Context, datasetId []int64, queryVec pgvector.Vector, topK int) (list gdb.List, err error) {
	// Nothing can match an empty IN list, and PostgreSQL rejects a negative
	// LIMIT, so short-circuit instead of issuing a query that cannot succeed.
	if len(datasetId) == 0 || topK <= 0 {
		return gdb.List{}, nil
	}

	// NOTE(review): `<->` is pgvector's L2 distance operator. An ivfflat index
	// created with vector_cosine_ops only serves the cosine operator `<=>`, so
	// confirm which metric is intended before relying on index acceleration.
	sql := `
		SELECT id, content, dataset_id, document_id,
		vector <-> ? AS distance
		FROM rag_vector_document_chunk
		WHERE dataset_id IN (?)
		AND vector IS NOT NULL
		ORDER BY distance ASC
		LIMIT ?
	`
	// Placeholder order: vector, dataset_id, topK.
	result, err := gfdb.DB(ctx, public.DbNameVector).GetAll(ctx, sql, queryVec, datasetId, topK)
	if err != nil {
		return nil, err
	}
	return result.List(), nil
}
// SearchByKeywords runs a full-text search for query against the document
// chunk index and returns the hits as a gdb.List.
//
// topK is passed to the search as the result limit. When datasetIds is
// non-empty the search is restricted with a "dataset_id IN [...]" filter;
// otherwise no filter is set. Hits keep the order in which the search call
// filled them in.
func (d *documentChunkDao) SearchByKeywords(ctx context.Context, query string, datasetIds []int64, topK int) (list gdb.List, err error) {
	// Build the meilisearch query parameters.
	searchParams := &meilisearch.SearchParams{
		Query: query,
		Limit: int64(topK),
	}

	// Restrict the search to the requested datasets. The IDs are numeric, so
	// they are joined as-is without quoting.
	if len(datasetIds) > 0 {
		ids := gconv.Strings(datasetIds)
		searchParams.Filter = fmt.Sprintf("dataset_id IN [%s]", gstr.Implode(", ", ids))
	}

	// Execute the search.
	var hits []map[string]interface{}
	if _, err = meilisearch.DB().Search(ctx, searchParams, public.IndexNameDocumentChunk, &hits); err != nil {
		return nil, err
	}

	// Convert the hits to a gdb.List; the result is non-nil even when there
	// are no hits.
	resultList := make(gdb.List, 0, len(hits))
	for _, hit := range hits {
		resultList = append(resultList, hit)
	}
	return resultList, nil
}

View File

@@ -82,6 +82,9 @@ func (d *keywordDao) List(ctx context.Context, req *dto.ListKeywordReq, fields .
if !g.IsEmpty(req.Keyword) {
model.WhereLike(entity.KeywordCol.Word, "%"+req.Keyword+"%")
}
model.WhereIn(entity.KeywordCol.Word, req.Words)
model.Where(entity.KeywordCol.DatasetId, req.DatasetId)
model.Where(entity.KeywordCol.DocumentId, req.DocumentId)
model.OrderDesc(entity.KeywordCol.Weight)
model.OrderDesc(entity.KeywordCol.CreatedAt)
if req.Page != nil {

58
dao/task.go Normal file
View File

@@ -0,0 +1,58 @@
package dao
import (
"context"
"rag/consts/public"
"rag/model/dto"
"rag/model/entity"
"gitea.com/red-future/common/db/gfdb"
"github.com/gogf/gf/v2/util/gconv"
)
// Task is the package-level taskDao instance used to call the task
// data-access methods.
var Task = new(taskDao)
// taskDao is a field-less receiver type for the task table data-access methods.
type taskDao struct{}
// Insert converts req into an entity.Task and inserts it into the task table.
//
// It returns the last insert ID reported for the statement. If the conversion
// or the insert fails, it returns 0 and that error.
func (d *taskDao) Insert(ctx context.Context, req *dto.CreateTaskReq) (id int64, err error) {
	var task *entity.Task
	if convErr := gconv.Struct(req, &task); convErr != nil {
		return 0, convErr
	}

	table := gfdb.DB(ctx, public.DbNameKnowledge).Model(ctx, public.TableNameTask)
	inserted, insertErr := table.Data(&task).Insert()
	if insertErr != nil {
		return 0, insertErr
	}
	return inserted.LastInsertId()
}
// Update writes req to the task rows selected by req.Id and req.TaskId and
// returns the number of rows affected.
//
// OmitEmpty is set on the statement; presumably empty fields and conditions
// are skipped — confirm against the gfdb model implementation.
func (d *taskDao) Update(ctx context.Context, req *dto.UpdateTaskReq) (rows int64, err error) {
	table := gfdb.DB(ctx, public.DbNameKnowledge).Model(ctx, public.TableNameTask)
	stmt := table.Data(&req).
		Where(entity.TaskCol.Id, req.Id).
		Where(entity.TaskCol.TaskId, req.TaskId).
		OmitEmpty()

	updated, updateErr := stmt.Update()
	if updateErr != nil {
		return 0, updateErr
	}
	return updated.RowsAffected()
}
// Get queries the task table filtered by req.Id, req.TaskId and req.TaskType
// and returns the matching tasks together with the count reported by the
// query.
//
// OmitEmpty is set on the statement; presumably conditions with empty values
// are skipped — confirm against the gfdb model implementation.
func (d *taskDao) Get(ctx context.Context, req *dto.GetTaskReq) (res []*entity.Task, total int, err error) {
	query := gfdb.DB(ctx, public.DbNameKnowledge).
		Model(ctx, public.TableNameTask).
		OmitEmpty().
		Where(entity.TaskCol.Id, req.Id).
		Where(entity.TaskCol.TaskId, req.TaskId).
		Where(entity.TaskCol.TaskType, req.TaskType)

	records, total, err := query.AllAndCount(false)
	if err != nil {
		return nil, total, err
	}

	// Scan the raw records into the typed result slice.
	err = records.Structs(&res)
	return res, total, err
}
// DeleteByTaskId deletes the task rows whose task ID column equals req.TaskId
// and returns the number of rows affected.
func (d *taskDao) DeleteByTaskId(ctx context.Context, req *dto.DeleteTaskByTaskIdReq) (rows int64, err error) {
	table := gfdb.DB(ctx, public.DbNameKnowledge).Model(ctx, public.TableNameTask)

	deleted, deleteErr := table.Where(entity.TaskCol.TaskId, req.TaskId).Delete()
	if deleteErr != nil {
		return 0, deleteErr
	}
	return deleted.RowsAffected()
}