Files
rag/update.sql

162 lines
10 KiB
MySQL
Raw Normal View History

2026-04-03 09:16:53 +08:00
-----------2025-06-16 15:00:00--------------
--------------------pgsql创建rag_knowledge_dataset表语句---------------------------
-- 数据集表RAG场景专用
CREATE TABLE IF NOT EXISTS rag_knowledge_dataset (
-- 基础字段(继承 SQLBaseCol 通用字段,与 SQLBaseDO 对齐)
id BIGINT PRIMARY KEY, -- 主键ID非自增
tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8类型
creator VARCHAR(64) NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updater VARCHAR(64) NOT NULL,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
deleted_at timestamp(6),
-- 数据集核心字段(调整为允许为空)
name VARCHAR(128) NOT NULL, -- 数据集名称(核心字段仍非空)
description TEXT DEFAULT '', -- 数据集描述(长文本,适配详细描述场景)
embedding VARCHAR(64), -- 向量模型名称(允许为空)
dimension INT, -- 向量维度(允许为空)
document_count BIGINT, -- 文件数量int64 映射为 BIGINT允许为空
document_size BIGINT -- 文件大小(字节)int64 映射为 BIGINT允许为空
);
-- 索引针对RAG数据集高频查询优化
CREATE INDEX idx_dataset_tenant_id ON rag_knowledge_dataset(tenant_id); -- 租户ID索引
CREATE INDEX idx_dataset_name ON rag_knowledge_dataset(name); -- 数据集名称模糊/精准查询
CREATE INDEX idx_dataset_embedding ON rag_knowledge_dataset(embedding); -- 按向量模型筛选允许空值索引自动忽略NULL
CREATE INDEX idx_dataset_deleted_at ON rag_knowledge_dataset(deleted_at); -- 软删字段索引
-- 唯一索引(保证数据集称唯一性,避免重复创建)
CREATE UNIQUE INDEX uk_dataset_name ON rag_knowledge_dataset(name) WHERE deleted_at IS NULL;
-- 表和字段注释
COMMENT ON TABLE rag_knowledge_dataset IS '数据集表RAG场景专用';
COMMENT ON COLUMN rag_knowledge_dataset.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_dataset.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_dataset.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_dataset.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_dataset.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_dataset.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_dataset.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_dataset.name IS '数据集名称';
COMMENT ON COLUMN rag_knowledge_dataset.description IS '数据集描述';
COMMENT ON COLUMN rag_knowledge_dataset.embedding IS '向量模型名称如text-embedding-ada-002允许为空';
COMMENT ON COLUMN rag_knowledge_dataset.dimension IS '向量维度对应embedding模型的输出维度允许为空';
COMMENT ON COLUMN rag_knowledge_dataset.document_count IS '数据集内文件数量(允许为空)';
COMMENT ON COLUMN rag_knowledge_dataset.document_size IS '数据集内文件总大小(字节,允许为空)';
--------------------pgsql创建rag_knowledge_dataset表语句---------------------------
--------------------pgsql创建rag_knowledge_document表语句---------------------------
-- RAG文件表存储原始文件及切分相关信息关联数据集
CREATE TABLE IF NOT EXISTS rag_knowledge_document (
-- 基础字段(继承 SQLBaseCol 通用字段)
id BIGINT PRIMARY KEY, -- 主键ID非自增
tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8类型
creator VARCHAR(64) NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updater VARCHAR(64) NOT NULL,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
deleted_at timestamp(6),
-- 核心关联字段
dataset_id BIGINT NOT NULL, -- 关联数据集ID新增非空
-- 文件核心字段
title VARCHAR(256) NOT NULL, -- 文件标题
content TEXT, -- 文件内容(长文本,允许为空:大文件内容可仅存路径,不存原文)
format VARCHAR(16) DEFAULT '', -- 文件格式: txt, md, pdf, docx, html
source VARCHAR(64) DEFAULT '', -- 来源(如:手动上传/爬虫/API导入
source_id VARCHAR(64) DEFAULT '', -- 来源ID爬虫任务ID/上传批次ID
status SMALLINT NOT NULL DEFAULT 1, -- 状态1启用/0停用
vector_status SMALLINT NOT NULL DEFAULT 1, -- 向量化状态: 1pending, 2processing, 3completed, 4failed,5partCompleted
chunk_count BIGINT, -- 切分后的块数量int64映射为BIGINT允许为空
file_size BIGINT, -- 文件大小(字节)int64映射为BIGINT允许为空
file_path VARCHAR(512) DEFAULT '', -- 文件存储路径如MinIO路径
metadata JSONB DEFAULT '{}'::JSONB -- 额外元数据嵌套Metadata结构体JSONB存储
);
-- 单独添加外键约束(避免表定义内写约束导致的语法兼容问题)
-- 注意:执行前确保 rag_knowledge_dataset 表已存在,否则注释此行
ALTER TABLE rag_knowledge_document ADD CONSTRAINT fk_document_dataset_id FOREIGN KEY (dataset_id) REFERENCES rag_knowledge_dataset(id) ON DELETE CASCADE;
-- 索引针对RAG文件高频查询+数据集关联优化)
CREATE INDEX idx_document_tenant_id ON rag_knowledge_document(tenant_id); -- 租户ID索引
CREATE INDEX idx_document_dataset_id ON rag_knowledge_document(dataset_id); -- 数据集关联查询(核心索引)
CREATE INDEX idx_document_title ON rag_knowledge_document(title); -- 标题模糊查询
CREATE INDEX idx_document_format ON rag_knowledge_document(format); -- 按文件格式筛选
CREATE INDEX idx_document_status ON rag_knowledge_document(status); -- 启用/停用筛选
CREATE INDEX idx_document_vector_status ON rag_knowledge_document(vector_status); -- 向量化状态筛选(核心:监控处理中/失败文件)
CREATE INDEX idx_document_source ON rag_knowledge_document(source, source_id); -- 来源+来源ID组合查询溯源场景
CREATE INDEX idx_document_deleted_at ON rag_knowledge_document(deleted_at); -- 软删字段索引
-- 表和字段注释
COMMENT ON TABLE rag_knowledge_document IS 'RAG文件表存储原始文件及切分、元数据相关信息关联数据集';
COMMENT ON COLUMN rag_knowledge_document.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_document.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_document.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_document.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_document.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_document.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_document.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_document.dataset_id IS '关联数据集ID';
COMMENT ON COLUMN rag_knowledge_document.title IS '文件标题';
COMMENT ON COLUMN rag_knowledge_document.content IS '文件内容(大文件建议仅存路径,不存储原文)';
COMMENT ON COLUMN rag_knowledge_document.format IS '文件格式txt/md/pdf/docx/html等';
COMMENT ON COLUMN rag_knowledge_document.source IS '文件来源(手动上传/爬虫/API导入等';
COMMENT ON COLUMN rag_knowledge_document.source_id IS '来源ID溯源标识';
COMMENT ON COLUMN rag_knowledge_document.status IS '文件状态1启用/0停用';
COMMENT ON COLUMN rag_knowledge_document.vector_status IS '向量化状状态1pending-待处理/2processing-处理中/3completed-完成/4failed-失败/5partCompleted';
COMMENT ON COLUMN rag_knowledge_document.chunk_count IS '文件切分后的块数量int64类型未切分时为空';
COMMENT ON COLUMN rag_knowledge_document.file_size IS '文件大小字节int64类型允许为空';
COMMENT ON COLUMN rag_knowledge_document.file_path IS '文件存储路径如MinIO对象存储路径';
COMMENT ON COLUMN rag_knowledge_document.metadata IS '文件元数据,结构:{"author":"作者","tags":["标签1","标签2"],"custom":{"key":"值"}}';
--------------------pgsql创建rag_knowledge_document表语句---------------------------
--------------------pgsql创建rag_knowledge_keyword表语句---------------------------
-- 关键词表(文档关键词+权重)
CREATE TABLE IF NOT EXISTS rag_knowledge_keyword (
-- 基础字段(完全对齐项目规范)
id BIGINT PRIMARY KEY, -- 主键ID非自增
tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8
creator VARCHAR(64) NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updater VARCHAR(64) NOT NULL,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
deleted_at timestamp(6),
-- 业务字段
dataset_id BIGINT NOT NULL, -- 数据集ID
document_id BIGINT NOT NULL, -- 文件ID
word VARCHAR(255) NOT NULL, -- 关键词
weight SMALLINT NOT NULL DEFAULT 0 -- 权重
);
-- 唯一索引:保证 租户 + 数据集 + 文档 + 关键词 全局唯一
CREATE UNIQUE INDEX uk_rag_knowledge_keyword_tenant_dataset_doc_word
ON rag_knowledge_keyword(tenant_id, dataset_id, document_id, word)
WHERE deleted_at IS NULL;
-- 索引(按业务高频查询)
CREATE INDEX idx_keyword_tenant_id ON rag_knowledge_keyword(tenant_id);
CREATE INDEX idx_keyword_dataset_id ON rag_knowledge_keyword(dataset_id);
CREATE INDEX idx_keyword_document_id ON rag_knowledge_keyword(document_id);
CREATE INDEX idx_keyword_word ON rag_knowledge_keyword(word);
CREATE INDEX idx_keyword_deleted_at ON rag_knowledge_keyword(deleted_at);
-- 表和字段注释
COMMENT ON TABLE rag_knowledge_keyword IS 'RAG关键词表文档关键词+权重)';
COMMENT ON COLUMN rag_knowledge_keyword.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_keyword.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_keyword.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_keyword.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_keyword.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_keyword.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_keyword.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_keyword.dataset_id IS '数据集ID';
COMMENT ON COLUMN rag_knowledge_keyword.document_id IS '文档ID';
COMMENT ON COLUMN rag_knowledge_keyword.word IS '关键词';
COMMENT ON COLUMN rag_knowledge_keyword.weight IS '权重';
--------------------pgsql创建rag_knowledge_keyword表语句---------------------------