feat: 新增操作日志、任务分页查询与模型失败重试优化

- 新增操作日志表(asynch_op_log)及对应DAO,记录任务创建等操作的审计信息
- 新增任务分页查询接口(ListTask)及对应DTO、Service和DAO方法
- 优化模型调用失败重试逻辑:支持配置重试排队策略(插队到队首或队尾)
- 新增临时文件存储机制,当模型调用成功但OSS上传失败时,下次仅重试OSS上传
- 模型配置新增retry_queue_max_seconds字段,控制失败重试排队策略
- 更新数据库表结构(asynch_models、asynch_task、新增asynch_op_log)及同步更新SQL
- 配置文件调整:超时单位改为秒,更新服务地址和轮询间隔
- 修复模型列表查询支持按名称模糊搜索
This commit is contained in:
2026-04-25 10:42:21 +08:00
parent 23b83cae39
commit f6c70a451e
22 changed files with 573 additions and 214 deletions

View File

@@ -20,7 +20,7 @@ func (d *taskDao) ClaimPendingGlobal(ctx context.Context, batchSize int) (tasks
}
err = gfdb.DB(ctx).Transaction(ctx, func(ctx context.Context, tx gdb.TX) error {
sql := fmt.Sprintf(
`SELECT id, tenant_id, model_name, task_id, input_ref, request_payload
`SELECT id, tenant_id, model_name, task_id, input_ref, request_payload, phase, tmp_file
FROM %s
WHERE deleted_at IS NULL AND state = 0
ORDER BY enqueue_at ASC
@@ -58,7 +58,7 @@ func (d *taskDao) ClaimPendingGlobal(ctx context.Context, batchSize int) (tasks
func (d *taskDao) UpdateSuccessGlobal(ctx context.Context, id int64, ossFile, fileType string, fileSize int64, expireAt *gtime.Time) error {
now := gtime.Now()
_, err := gfdb.DB(ctx).Exec(ctx,
fmt.Sprintf(`UPDATE %s SET state=2, oss_file=?, file_type=?, file_size=?, error_msg='', finished_at=?, expire_at=NULL, updated_at=? WHERE id=?`, public.TableNameTask),
fmt.Sprintf(`UPDATE %s SET state=2, oss_file=?, file_type=?, file_size=?, error_msg='', finished_at=?, expire_at=NULL, phase=0, tmp_file='', updated_at=? WHERE id=?`, public.TableNameTask),
ossFile, fileType, fileSize, now, now, id,
)
return err
@@ -67,12 +67,31 @@ func (d *taskDao) UpdateSuccessGlobal(ctx context.Context, id int64, ossFile, fi
func (d *taskDao) UpdateFailedGlobal(ctx context.Context, id int64, errorMsg string) error {
now := gtime.Now()
_, err := gfdb.DB(ctx).Exec(ctx,
fmt.Sprintf(`UPDATE %s SET state=3, error_msg=?, finished_at=?, updated_at=? WHERE id=?`, public.TableNameTask),
fmt.Sprintf(`UPDATE %s SET state=3, error_msg=?, finished_at=?, phase=0, tmp_file='', updated_at=? WHERE id=?`, public.TableNameTask),
errorMsg, now, now, id,
)
return err
}
// UpdateFailedKeepTmpGlobal OSS 上传失败:保留 phase/tmp_file下一轮仅重试 OSS 上传
func (d *taskDao) UpdateFailedKeepTmpGlobal(ctx context.Context, id int64, errorMsg string) error {
now := gtime.Now()
_, err := gfdb.DB(ctx).Exec(ctx,
fmt.Sprintf(`UPDATE %s SET state=3, error_msg=?, finished_at=?, phase=1, updated_at=? WHERE id=?`, public.TableNameTask),
errorMsg, now, now, id,
)
return err
}
// UpdateTmpAfterModelGlobal 模型调用成功后,写入临时文件路径并标记 phase=1
func (d *taskDao) UpdateTmpAfterModelGlobal(ctx context.Context, id int64, tmpFile string) error {
_, err := gfdb.DB(ctx).Exec(ctx,
fmt.Sprintf(`UPDATE %s SET phase=1, tmp_file=?, updated_at=NOW() WHERE id=?`, public.TableNameTask),
tmpFile, id,
)
return err
}
func (d *taskDao) SoftDeleteByTaskIDGlobal(ctx context.Context, taskID string) error {
_, err := gfdb.DB(ctx).Exec(ctx,
fmt.Sprintf(`UPDATE %s SET deleted_at=NOW(), updated_at=NOW() WHERE task_id=? AND deleted_at IS NULL`, public.TableNameTask),
@@ -113,7 +132,8 @@ func (d *taskDao) ListFailedRetryableGlobal(ctx context.Context, limit int) (lis
}
r, err := gfdb.DB(ctx).GetAll(ctx,
fmt.Sprintf(`
SELECT t.*
SELECT t.*,
m.retry_queue_max_seconds AS retry_queue_max_seconds
FROM %s t
JOIN %s m
ON t.tenant_id = m.tenant_id
@@ -132,11 +152,13 @@ SELECT t.*
return
}
// RequeueForRetryGlobal 将任务重新入队state=0enqueue_at=now),并将 retry_count +1
func (d *taskDao) RequeueForRetryGlobal(ctx context.Context, id int64) error {
// RequeueForRetryGlobal 将任务重新入队state=0并将 retry_count +1
// enqueueAt 用于控制重试任务在队列中的位置:
// - enqueueAt 越早越靠前ClaimPendingGlobal 按 enqueue_at ASC 抢占)
func (d *taskDao) RequeueForRetryGlobal(ctx context.Context, id int64, enqueueAt time.Time) error {
_, err := gfdb.DB(ctx).Exec(ctx,
fmt.Sprintf(`UPDATE %s SET state=0, retry_count=retry_count+1, enqueue_at=NOW(), updated_at=NOW() WHERE id=? AND state=3 AND deleted_at IS NULL`, public.TableNameTask),
id,
fmt.Sprintf(`UPDATE %s SET state=0, retry_count=retry_count+1, enqueue_at=?, updated_at=NOW() WHERE id=? AND state=3 AND deleted_at IS NULL`, public.TableNameTask),
enqueueAt, id,
)
return err
}