// Source listing: common/ragflow/document.go (Go, 276 lines, 9.2 KiB), last modified 2026-03-12 08:51:25 +08:00.
// The hosting page flagged this file as containing ambiguous Unicode characters, i.e. characters
// that might be confused with other characters. If that is intentional, the warning can be ignored.

// Package ragflow - RAGFlow document management.
// Features: uploading, listing, and deleting documents in a RAGFlow knowledge base.
package ragflow
import (
	"bytes"
	"context"
	"encoding/json"
	"mime/multipart"
	"net/url"
	"strings"

	commonHttp "gitee.com/red-future---jilin-g/common/http"
	"github.com/gogf/gf/v2/errors/gerror"
	"github.com/gogf/gf/v2/frame/g"
)
// Document describes a file stored inside a dataset. It is the element type
// decoded from the "docs" array of the list-documents response.
//
// File management within a dataset; reference:
// https://ragflow.com.cn/docs/dev/http_api_reference#数据集内文件管理
//
// NOTE(review): several JSON tags below (run_status, token_num, process_begin,
// process_du) may not match the field names the API actually returns; fields
// whose tag does not match are silently left at their zero value when decoding.
// Confirm against the API reference before relying on them.
type Document struct {
	Id string `json:"id"`
	DatasetId string `json:"dataset_id"`
	Name string `json:"name"`
	Size int64 `json:"size"`
	Location string `json:"location"`
	CreatedBy string `json:"created_by"`
	CreateTime int64 `json:"create_time"`
	Thumbnail string `json:"thumbnail"`
	Type string `json:"type"`
	RunStatus string `json:"run_status"` // Intended to hold the API's "run" field; the tag name may need confirming.
	Status string `json:"status"`
	ChunkMethod string `json:"chunk_method"`
	ParserConfig map[string]interface{} `json:"parser_config"`
	TokenNum int `json:"token_num"`
	ChunkCount int `json:"chunk_count"`
	ProcessBegin int64 `json:"process_begin"`
	ProcessDu int64 `json:"process_du"`
	Progress float64 `json:"progress"`
	ProgressMsg string `json:"progress_msg"`
}
// UploadDocumentReq is the request for uploading documents.
// Note: file uploads normally use multipart/form-data; this type only defines
// the structure, and the actual handling lives in the upload methods.
type UploadDocumentReq struct {
	FilePaths []string // List of local file paths.
}
// UploadDocumentRes is one entry of the upload-document response; it carries
// the identifier of the uploaded document.
type UploadDocumentRes struct {
	Id string `json:"id"` // Document ID.
}
// ListDocumentsReq holds the optional filters and paging options for listing
// the documents of a dataset. ListDocuments turns these fields into URL query
// parameters; zero-valued fields are omitted, except Desc (see below).
type ListDocumentsReq struct {
	Page int `json:"page,omitempty"` // Page number; default 1.
	PageSize int `json:"page_size,omitempty"` // Items per page; default 30.
	OrderBy string `json:"orderby,omitempty"` // Sort field: create_time (default) or update_time.
	Desc bool `json:"desc,omitempty"` // Sort descending; the API default is true, but ListDocuments always sends this value, so the zero value requests ascending order.
	Keywords string `json:"keywords,omitempty"` // Keyword filter (matched against the document title).
	Id string `json:"id,omitempty"` // Filter by document ID.
	Name string `json:"name,omitempty"` // Filter by document name.
	CreateTimeFrom int64 `json:"create_time_from,omitempty"` // Start of the creation-time range, Unix timestamp; 0 means no limit.
	CreateTimeTo int64 `json:"create_time_to,omitempty"` // End of the creation-time range, Unix timestamp; 0 means no limit.
	Suffix []string `json:"suffix,omitempty"` // File-suffix filter, e.g. ["pdf", "txt", "docx"].
	Run []string `json:"run,omitempty"` // Processing-status filter; accepts ["UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"] or the numeric form ["0", "1", "2", "3", "4"].
}
// ListDocumentsRes is the response of the list-documents call.
// Note: unlike the other List endpoints, "data" is an object rather than an array.
type ListDocumentsRes struct {
	Code int `json:"code"` // Status code; 0 means success.
	Data struct {
		Docs []*Document `json:"docs"` // Document list.
		TotalDatasets int `json:"total_datasets"` // Total number of documents.
	} `json:"data"`
}
// DeleteDocumentsReq is the request body for deleting documents; Ids lists the
// documents to delete and is serialized under the "ids" key.
type DeleteDocumentsReq struct {
	Ids []string `json:"ids"`
}
// ListDocuments lists the documents of the dataset identified by datasetId.
//
// It issues GET /api/v1/datasets/{datasetId}/documents, encoding the filters in
// req as query parameters. Zero-valued scalar filters are omitted, with one
// exception: "desc" is always sent, so a zero-valued Desc requests ascending
// order. A nil req is treated as an empty filter set.
//
// It returns an error when datasetId is empty, when the request itself fails,
// or when the response carries a non-zero code.
func (c *Client) ListDocuments(ctx context.Context, datasetId string, req *ListDocumentsReq) (*ListDocumentsRes, error) {
	if datasetId == "" {
		return nil, gerror.New("datasetId不能为空")
	}
	if req == nil {
		// No filters supplied: use the zero value instead of dereferencing nil.
		req = &ListDocumentsReq{}
	}

	// Scalar parameters go through the shared query-string builder.
	params := map[string]interface{}{}
	if req.Page > 0 {
		params["page"] = req.Page
	}
	if req.PageSize > 0 {
		params["page_size"] = req.PageSize
	}
	if req.OrderBy != "" {
		params["orderby"] = req.OrderBy
	}
	desc := "false"
	if req.Desc {
		desc = "true"
	}
	params["desc"] = desc
	if req.Keywords != "" {
		params["keywords"] = req.Keywords
	}
	if req.Id != "" {
		params["id"] = req.Id
	}
	if req.Name != "" {
		params["name"] = req.Name
	}
	if req.CreateTimeFrom > 0 {
		params["create_time_from"] = req.CreateTimeFrom
	}
	if req.CreateTimeTo > 0 {
		params["create_time_to"] = req.CreateTimeTo
	}

	queryParts := make([]string, 0, 1+len(req.Suffix)+len(req.Run))
	if query := buildQueryString(params); query != "" {
		queryParts = append(queryParts, query)
	}
	// Array parameters are sent by repeating the parameter name
	// (suffix=pdf&suffix=txt). Values are escaped so that characters such as
	// '&', '=' or spaces cannot corrupt the query string.
	for _, suffix := range req.Suffix {
		queryParts = append(queryParts, "suffix="+url.QueryEscape(suffix))
	}
	// The run filter accepts either the numeric form ("0"-"4") or the textual
	// form ("UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL").
	for _, run := range req.Run {
		queryParts = append(queryParts, "run="+url.QueryEscape(run))
	}

	path := "/api/v1/datasets/" + datasetId + "/documents"
	if len(queryParts) > 0 {
		path += "?" + strings.Join(queryParts, "&")
	}

	var res ListDocumentsRes
	if err := c.request(ctx, "GET", path, nil, &res); err != nil {
		return nil, err
	}
	if res.Code != 0 {
		return nil, gerror.Newf("list documents failed: code=%d", res.Code)
	}
	return &res, nil
}
// UploadDocumentFromText uploads in-memory text to a dataset as a single file
// and returns the ID of the first document reported in the response.
//
// The text is sent as a multipart/form-data "file" part named filename to
// POST {endpoint}/api/v1/datasets/{datasetId}/documents, authenticated with the
// client's API key as a bearer token. An empty filename defaults to
// "document.txt".
//
// It returns an error when datasetId or content is empty, when no endpoint is
// available, when the request fails, when the response is not valid JSON, when
// the response code is non-zero, or when the response contains no data entry.
func (c *Client) UploadDocumentFromText(ctx context.Context, datasetId, content, filename string) (documentId string, err error) {
	if datasetId == "" {
		return "", gerror.New("datasetId不能为空")
	}
	if content == "" {
		return "", gerror.New("文档内容不能为空")
	}
	if filename == "" {
		filename = "document.txt"
	}

	// The endpoint is chosen per call by getNextEndpoint (presumably rotating
	// across the configured endpoints for load balancing).
	endpoint := c.getNextEndpoint()
	if endpoint == "" {
		return "", gerror.New("RAGFlow endpoints not configured")
	}
	uploadURL := endpoint + "/api/v1/datasets/" + datasetId + "/documents"

	payload, contentType, err := buildTextUploadForm(content, filename)
	if err != nil {
		return "", err
	}

	// NOTE(review): the payload is handed to the HTTP client as raw bytes with a
	// custom Content-Type; confirm the client forwards such payloads verbatim
	// (e.g. that it does not reinterpret special markers inside the text).
	client := commonHttp.Httpclient.Clone()
	client.SetHeader("Authorization", "Bearer "+c.APIKey)
	client.SetHeader("Content-Type", contentType)
	resp, err := client.Post(ctx, uploadURL, payload)
	if err != nil {
		return "", gerror.Wrap(err, "上传文档请求失败")
	}
	defer resp.Close()

	// The upload endpoint returns its data as an array.
	var response struct {
		Code    int                 `json:"code"`
		Message string              `json:"message"`
		Data    []UploadDocumentRes `json:"data"`
	}
	respBody := resp.ReadAll()
	g.Log().Debugf(ctx, "RAGFlow上传文档响应: %s", string(respBody))
	if err = json.Unmarshal(respBody, &response); err != nil {
		g.Log().Errorf(ctx, "解析RAGFlow响应失败: %v, 原始响应: %s", err, string(respBody))
		// Wrap rather than re-format so the decode error stays in the chain.
		return "", gerror.Wrap(err, "json Decode failed")
	}

	// Check the status code before looking at the data.
	if response.Code != 0 {
		g.Log().Errorf(ctx, "RAGFlow返回错误: code=%d, message=%s", response.Code, response.Message)
		return "", gerror.Newf("上传文档失败 (code=%d): %s", response.Code, response.Message)
	}
	if len(response.Data) == 0 {
		g.Log().Errorf(ctx, "RAGFlow返回data为空, 完整响应: %s", string(respBody))
		return "", gerror.New("上传文档返回data为空")
	}
	return response.Data[0].Id, nil
}

// buildTextUploadForm encodes content as a multipart/form-data body containing
// a single "file" part named filename. It returns the encoded body together
// with the matching Content-Type header value (which carries the boundary).
func buildTextUploadForm(content, filename string) ([]byte, string, error) {
	var buf bytes.Buffer
	writer := multipart.NewWriter(&buf)

	part, err := writer.CreateFormFile("file", filename)
	if err != nil {
		return nil, "", gerror.Wrap(err, "创建form file失败")
	}
	if _, err = part.Write([]byte(content)); err != nil {
		return nil, "", gerror.Wrap(err, "写入文件内容失败")
	}
	// Close writes the trailing boundary; the body is incomplete without it.
	if err = writer.Close(); err != nil {
		return nil, "", gerror.Wrap(err, "关闭multipart writer失败")
	}
	return buf.Bytes(), writer.FormDataContentType(), nil
}
// UploadDocument is kept for compatibility only. Uploading from local files is
// not implemented: every call ignores its arguments and returns an error that
// points callers to UploadDocumentFromText.
func (c *Client) UploadDocument(ctx context.Context, datasetId string, filePaths []string) (err error) {
	err = gerror.New("upload document from file not implemented yet, use UploadDocumentFromText instead")
	return err
}
// ParseDocumentsReq is the request body for parsing documents; it is
// serialized with the IDs under the "document_ids" key.
type ParseDocumentsReq struct {
	DocumentIds []string `json:"document_ids"` // IDs of the documents to parse.
}
// ParseDocuments asks the server to parse the given documents of a dataset by
// POSTing their IDs to /api/v1/datasets/{datasetId}/chunks. Per the original
// author's note, parsing only starts once this endpoint is called after upload.
//
// It returns an error when datasetId or documentIds is empty, when the request
// fails, or when the response does not report success.
func (c *Client) ParseDocuments(ctx context.Context, datasetId string, documentIds []string) error {
	// Reject incomplete input before touching the network.
	switch {
	case datasetId == "":
		return gerror.New("datasetId不能为空")
	case len(documentIds) == 0:
		return gerror.New("documentIds不能为空")
	}

	result := new(CommonResponse)
	endpoint := "/api/v1/datasets/" + datasetId + "/chunks"
	payload := ParseDocumentsReq{DocumentIds: documentIds}

	reqErr := c.request(ctx, "POST", endpoint, payload, result)
	if reqErr != nil {
		return reqErr
	}
	if result.IsSuccess() {
		return nil
	}
	return gerror.Newf("解析文档失败: %s", result.Message)
}
// DeleteDocument deletes the documents identified by ids from the dataset
// identified by datasetId, via DELETE /api/v1/datasets/{datasetId}/documents.
//
// It returns an error when datasetId or ids is empty, when the request fails,
// or when the response does not report success.
func (c *Client) DeleteDocument(ctx context.Context, datasetId string, ids []string) (err error) {
	if datasetId == "" {
		return gerror.New("datasetId不能为空")
	}
	if len(ids) == 0 {
		// Refuse to send an empty/null ID list.
		// NOTE(review): the RAGFlow HTTP API reference describes an omitted
		// "ids" list as "delete every document in the dataset" — confirm for the
		// deployed version. Either way, an empty list is never a meaningful
		// request for this method.
		return gerror.New("ids不能为空")
	}

	req := DeleteDocumentsReq{Ids: ids}
	var res CommonResponse
	path := "/api/v1/datasets/" + datasetId + "/documents"
	if err = c.request(ctx, "DELETE", path, req, &res); err != nil {
		return err
	}
	if !res.IsSuccess() {
		return gerror.Newf("delete document failed: %s", res.Message)
	}
	return nil
}