feat: rag初始版
This commit is contained in:
@@ -1,5 +0,0 @@
|
||||
package service
|
||||
|
||||
// DatasetIndex is a package-level pointer to a datasetIndexService.
var DatasetIndex = new(datasetIndexService)
|
||||
|
||||
// datasetIndexService is an empty struct type; it declares no fields.
type datasetIndexService struct{}
|
||||
@@ -3,6 +3,8 @@ package service
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"rag/common/eino"
|
||||
"rag/common/gse"
|
||||
"rag/consts/document"
|
||||
"rag/consts/public"
|
||||
"rag/dao"
|
||||
@@ -16,8 +18,6 @@ import (
|
||||
"gitea.com/red-future/common/db/gfdb"
|
||||
"gitea.com/red-future/common/full-text-search/meilisearch"
|
||||
"gitea.com/red-future/common/http"
|
||||
"gitea.com/red-future/common/rag/eino"
|
||||
"gitea.com/red-future/common/rag/gse"
|
||||
"gitea.com/red-future/common/utils"
|
||||
gmq "github.com/bjang03/gmq/core/gmq"
|
||||
"github.com/bjang03/gmq/mq"
|
||||
@@ -251,7 +251,7 @@ func (s *documentService) sqlSplitDocument(ctx context.Context, doc *entity.Docu
|
||||
return
|
||||
}
|
||||
// 3. 组装向量文档
|
||||
var vectorDocs = make([]dto.VectorDocumentChunkMsg, 0)
|
||||
var docsChunk = make([]*schema.Document, 0)
|
||||
for i, t := range docsSplit {
|
||||
contentHash := gmd5.MustEncryptString(t.Content)
|
||||
// 检查是否重复
|
||||
@@ -263,27 +263,26 @@ func (s *documentService) sqlSplitDocument(ctx context.Context, doc *entity.Docu
|
||||
if !success {
|
||||
continue
|
||||
}
|
||||
vectorDocs = append(vectorDocs, dto.VectorDocumentChunkMsg{
|
||||
TenantId: doc.TenantId,
|
||||
Creator: doc.Creator,
|
||||
DatasetId: doc.DatasetId,
|
||||
DocumentId: doc.Id,
|
||||
Content: t.Content,
|
||||
ContentHash: contentHash,
|
||||
ChunkIndex: gconv.Int64(i),
|
||||
})
|
||||
|
||||
var metaData = make(map[string]any)
|
||||
metaData[entity.DocumentCol.TenantId] = doc.TenantId
|
||||
metaData[entity.DocumentCol.Creator] = doc.Creator
|
||||
metaData[entity.DocumentCol.DatasetId] = doc.DatasetId
|
||||
metaData[entity.DocumentChunkCol.DocumentId] = doc.Id
|
||||
metaData[entity.DocumentChunkCol.ContentHash] = contentHash
|
||||
metaData[entity.DocumentChunkCol.ChunkIndex] = gconv.Int64(i)
|
||||
t.MetaData = metaData
|
||||
docsChunk = append(docsChunk, t)
|
||||
}
|
||||
// 4. 发送消息到队列
|
||||
if len(vectorDocs) > 0 {
|
||||
if len(docsChunk) > 0 {
|
||||
err = gmq.GetGmq("primary").GmqPublish(ctx, &mq.RedisPubMessage{
|
||||
PubMessage: types.PubMessage{
|
||||
Topic: public.KnowledgeDocumentChunkTopic,
|
||||
Data: vectorDocs,
|
||||
Data: docsChunk,
|
||||
},
|
||||
})
|
||||
}
|
||||
vectorDocsCount = gconv.Int64(len(vectorDocs))
|
||||
vectorDocsCount = gconv.Int64(len(docsChunk))
|
||||
return
|
||||
}
|
||||
|
||||
@@ -318,12 +317,12 @@ func (s *documentService) esSplitDocument(ctx context.Context, doc *entity.Docum
|
||||
}
|
||||
// 构建Meilisearch文档
|
||||
meiliDocs = append(meiliDocs, map[string]interface{}{
|
||||
"id": contentHash,
|
||||
"datasetId": doc.DatasetId,
|
||||
"documentId": doc.Id,
|
||||
"content": t.Content,
|
||||
"contentHash": contentHash,
|
||||
"chunkIndex": i,
|
||||
entity.DocumentChunkCol.Id: contentHash,
|
||||
entity.DocumentChunkCol.DatasetId: doc.DatasetId,
|
||||
entity.DocumentChunkCol.DocumentId: doc.Id,
|
||||
entity.DocumentChunkCol.Content: t.Content,
|
||||
entity.DocumentChunkCol.ContentHash: contentHash,
|
||||
entity.DocumentChunkCol.ChunkIndex: i,
|
||||
})
|
||||
}
|
||||
// 4. 写入到meilisearch数据库中
|
||||
|
||||
@@ -2,23 +2,20 @@ package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"rag/common/eino"
|
||||
"rag/consts/document"
|
||||
"rag/consts/public"
|
||||
"rag/dao"
|
||||
"rag/model/dto"
|
||||
"rag/model/entity"
|
||||
|
||||
"gitea.com/red-future/common/beans"
|
||||
"gitea.com/red-future/common/rag/eino"
|
||||
gmq "github.com/bjang03/gmq/core/gmq"
|
||||
"github.com/bjang03/gmq/mq"
|
||||
"github.com/bjang03/gmq/types"
|
||||
"github.com/cloudwego/eino/components/indexer"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
"github.com/gogf/gf/v2/util/gconv"
|
||||
"github.com/pgvector/pgvector-go"
|
||||
)
|
||||
|
||||
// DocumentChunk is a package-level pointer to a documentChunkService, the
// receiver type of the chunk-related methods in this file.
var DocumentChunk = new(documentChunkService)
|
||||
@@ -49,114 +46,124 @@ func (s *documentChunkService) List(ctx context.Context, req *dto.ListDocumentCh
|
||||
}
|
||||
|
||||
func (s *documentChunkService) DocsChunkMsg(ctx context.Context, msg any) (err error) {
|
||||
var req = make([]*dto.VectorDocumentChunkMsg, 0)
|
||||
var docs = make([]*schema.Document, 0)
|
||||
msgMap := gconv.Map(msg)
|
||||
if err = gconv.Structs(msgMap["data"], &req); err != nil {
|
||||
if err = gconv.Structs(msgMap["data"], &docs); err != nil {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", err)
|
||||
return
|
||||
}
|
||||
if len(req) == 0 {
|
||||
if len(docs) == 0 {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", "msg is empty")
|
||||
return
|
||||
}
|
||||
|
||||
ctx = context.WithValue(ctx, "user", &beans.User{
|
||||
TenantId: req[0].TenantId,
|
||||
UserName: req[0].Creator,
|
||||
})
|
||||
//ctx = context.WithValue(ctx, "user", &beans.User{
|
||||
// TenantId: req[0].TenantId,
|
||||
// UserName: req[0].Creator,
|
||||
//})
|
||||
|
||||
// 调用eino接口获取向量
|
||||
var vectorDocsStr = make([]string, 0, len(req))
|
||||
for _, t := range req {
|
||||
vectorDocsStr = append(vectorDocsStr, t.Content)
|
||||
}
|
||||
embeddings, err := eino.EmbedStrings(ctx, vectorDocsStr)
|
||||
if err != nil {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", err)
|
||||
err = s.publishKnowledgeDocumentMsg(ctx, req[0].TenantId, req[0].Creator, req[0].DocumentId, document.VectorStatusFailed.Code())
|
||||
return
|
||||
}
|
||||
//var vectorDocsStr = make([]string, 0, len(req))
|
||||
//for _, t := range req {
|
||||
// vectorDocsStr = append(vectorDocsStr, t.Content)
|
||||
//}
|
||||
//embeddings, err := eino.EmbedStrings(ctx, vectorDocsStr)
|
||||
//if err != nil {
|
||||
// g.Log().Error(ctx, "DocsChunkMsg err:", err)
|
||||
// err = s.publishKnowledgeDocumentMsg(ctx, req[0].TenantId, req[0].Creator, req[0].DocumentId, document.VectorStatusFailed.Code())
|
||||
// return
|
||||
//}
|
||||
|
||||
// 获取向量维度
|
||||
dimension := 0
|
||||
if len(embeddings) > 0 {
|
||||
dimension = len(embeddings[0])
|
||||
}
|
||||
//dimension := 0
|
||||
//if len(embeddings) > 0 {
|
||||
// dimension = len(embeddings[0])
|
||||
//}
|
||||
|
||||
// 创建或更新DatasetIndex
|
||||
err = s.createOrUpdateDatasetIndex(ctx, req[0].DatasetId, dimension, int64(len(req)))
|
||||
if err != nil {
|
||||
g.Log().Error(ctx, "CreateOrUpdateDatasetIndex err:", err)
|
||||
err = s.publishKnowledgeDocumentMsg(ctx, req[0].TenantId, req[0].Creator, req[0].DocumentId, document.VectorStatusFailed.Code())
|
||||
return
|
||||
}
|
||||
//err = s.createOrUpdateDatasetIndex(ctx, req[0].DatasetId, dimension, int64(len(req)))
|
||||
//if err != nil {
|
||||
// g.Log().Error(ctx, "CreateOrUpdateDatasetIndex err:", err)
|
||||
// err = s.publishKnowledgeDocumentMsg(ctx, req[0].TenantId, req[0].Creator, req[0].DocumentId, document.VectorStatusFailed.Code())
|
||||
// return
|
||||
//}
|
||||
|
||||
// 更新向量文档
|
||||
for i, embedding := range embeddings {
|
||||
req[i].Vector = pgvector.NewVector(gconv.Float32s(embedding))
|
||||
req[i].VectorStatus = document.VectorStatusCompleted.Code()
|
||||
req[i].Status = document.StatusEnable.Code()
|
||||
}
|
||||
_, err = dao.DocumentChunk.BatchInsert(ctx, req)
|
||||
if err != nil {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", err)
|
||||
err = s.publishKnowledgeDocumentMsg(ctx, req[0].TenantId, req[0].Creator, req[0].DocumentId, document.VectorStatusFailed.Code())
|
||||
//for i, embedding := range embeddings {
|
||||
// req[i].Vector = pgvector.NewVector(gconv.Float32s(embedding))
|
||||
// req[i].VectorStatus = document.VectorStatusCompleted.Code()
|
||||
// req[i].Status = document.StatusEnable.Code()
|
||||
//}
|
||||
//_, err = dao.DocumentChunk.BatchInsert(ctx, req)
|
||||
//if err != nil {
|
||||
// g.Log().Error(ctx, "DocsChunkMsg err:", err)
|
||||
// err = s.publishKnowledgeDocumentMsg(ctx, req[0].TenantId, req[0].Creator, req[0].DocumentId, document.VectorStatusFailed.Code())
|
||||
// return
|
||||
//}
|
||||
idx := eino.NewPGVectorIndexer(&eino.PGVectorIndexerOptions{
|
||||
BatchSize: 10,
|
||||
})
|
||||
rows, err := idx.Store(ctx, docs, indexer.WithEmbedding(eino.EmbedderDashscope))
|
||||
if err != nil || rows == 0 {
|
||||
g.Log().Error(ctx, "DocsChunkMsg rows: , err:", rows, err)
|
||||
return
|
||||
}
|
||||
|
||||
err = s.publishKnowledgeDocumentMsg(ctx, req[0].TenantId, req[0].Creator, req[0].DocumentId, document.VectorStatusCompleted.Code())
|
||||
tenantId := docs[0].MetaData[entity.DocumentChunkCol.TenantId].(uint64)
|
||||
creator := docs[0].MetaData[entity.DocumentChunkCol.Creator].(string)
|
||||
documentId := docs[0].MetaData[entity.DocumentChunkCol.DocumentId].(int64)
|
||||
err = s.publishKnowledgeDocumentMsg(ctx, tenantId, creator, documentId, document.VectorStatusCompleted.Code())
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// createOrUpdateDatasetIndex 创建或更新数据集索引
|
||||
func (s *documentChunkService) createOrUpdateDatasetIndex(ctx context.Context, datasetId int64, dimension int, vectorCount int64) (err error) {
|
||||
// 查询数据集是否已有索引
|
||||
existIndex, err := dao.DatasetIndex.GetByDatasetId(ctx, datasetId)
|
||||
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||
return err
|
||||
}
|
||||
|
||||
// 已有索引 → 只更新数量
|
||||
if existIndex != nil {
|
||||
_ = dao.DatasetIndex.IncVectorCount(ctx, existIndex.Id, vectorCount)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ====================== 创建新索引 ======================
|
||||
indexName := fmt.Sprintf("idx_dataset_%d_vector", datasetId) // 真实PG索引名
|
||||
// 1. 插入索引配置
|
||||
index := &entity.DatasetIndex{
|
||||
DatasetId: datasetId,
|
||||
Name: indexName,
|
||||
Dimension: dimension,
|
||||
FieldType: "float",
|
||||
MetricType: "COSINE",
|
||||
Status: gconv.PtrInt8(1),
|
||||
VectorCount: vectorCount,
|
||||
Description: fmt.Sprintf("数据集%d向量索引", datasetId),
|
||||
}
|
||||
_, err = dao.DatasetIndex.Insert(ctx, index)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 2. 真正创建 PGVector 索引(唯一真实索引!)
|
||||
err = s.createRealPGVectorIndex(ctx, indexName)
|
||||
return err
|
||||
}
|
||||
|
||||
// createRealPGVectorIndex 真正在PostgreSQL创建向量索引(真实可用)
|
||||
func (s *documentChunkService) createRealPGVectorIndex(ctx context.Context, indexName string) error {
|
||||
// 执行真实建索引语句
|
||||
err := dao.DatasetIndex.InsertIndex(ctx, indexName)
|
||||
if err != nil {
|
||||
g.Log().Error(ctx, "创建向量索引失败:", err)
|
||||
return err
|
||||
}
|
||||
g.Log().Info(ctx, "PGVector真实索引创建成功:"+indexName)
|
||||
return nil
|
||||
}
|
||||
//// createOrUpdateDatasetIndex 创建或更新数据集索引
|
||||
//func (s *documentChunkService) createOrUpdateDatasetIndex(ctx context.Context, datasetId int64, dimension int, vectorCount int64) (err error) {
|
||||
// // 查询数据集是否已有索引
|
||||
// existIndex, err := dao.DatasetIndex.GetByDatasetId(ctx, datasetId)
|
||||
// if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||
// return err
|
||||
// }
|
||||
//
|
||||
// // 已有索引 → 只更新数量
|
||||
// if existIndex != nil {
|
||||
// _ = dao.DatasetIndex.IncVectorCount(ctx, existIndex.Id, vectorCount)
|
||||
// return nil
|
||||
// }
|
||||
//
|
||||
// // ====================== 创建新索引 ======================
|
||||
// indexName := fmt.Sprintf("idx_dataset_%d_vector", datasetId) // 真实PG索引名
|
||||
// // 1. 插入索引配置
|
||||
// index := &entity.DatasetIndex{
|
||||
// DatasetId: datasetId,
|
||||
// Name: indexName,
|
||||
// Dimension: dimension,
|
||||
// FieldType: "float",
|
||||
// MetricType: "COSINE",
|
||||
// Status: gconv.PtrInt8(1),
|
||||
// VectorCount: vectorCount,
|
||||
// Description: fmt.Sprintf("数据集%d向量索引", datasetId),
|
||||
// }
|
||||
// _, err = dao.DatasetIndex.Insert(ctx, index)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
//
|
||||
// // 2. 真正创建 PGVector 索引(唯一真实索引!)
|
||||
// err = s.createRealPGVectorIndex(ctx, indexName)
|
||||
// return err
|
||||
//}
|
||||
//
|
||||
//// createRealPGVectorIndex 真正在PostgreSQL创建向量索引(真实可用)
|
||||
//func (s *documentChunkService) createRealPGVectorIndex(ctx context.Context, indexName string) error {
|
||||
// // 执行真实建索引语句
|
||||
// err := dao.DatasetIndex.InsertIndex(ctx, indexName)
|
||||
// if err != nil {
|
||||
// g.Log().Error(ctx, "创建向量索引失败:", err)
|
||||
// return err
|
||||
// }
|
||||
// g.Log().Info(ctx, "PGVector真实索引创建成功:"+indexName)
|
||||
// return nil
|
||||
//}
|
||||
|
||||
// publishKnowledgeDocumentMsg 发布消息
|
||||
func (s *documentChunkService) publishKnowledgeDocumentMsg(ctx context.Context, tenantId uint64, creator string, documentId int64, vectorStatus document.VectorStatus) (err error) {
|
||||
|
||||
Reference in New Issue
Block a user