Files
model-gateway/service/job/cleaner.go

100 lines
3.4 KiB
Go
Raw Normal View History

package job
2026-04-29 15:54:14 +08:00
import (
"context"
"model-gateway/model/dto"
"model-gateway/service/queue"
"os"
2026-04-29 15:54:14 +08:00
"time"
"model-gateway/dao"
2026-04-29 15:54:14 +08:00
"github.com/gogf/gf/v2/frame/g"
)
// cleaner is a stateless receiver that groups the task clean-up routines of
// this package.
type cleaner struct{}

// Cleaner is the package-level cleaner instance that callers invoke.
var Cleaner = new(cleaner)
// RunOnce 由上层定时任务触发:执行一次清理/重试
func (c *cleaner) RunOnce(ctx context.Context) (res *dto.CleanWorkRes, err error) {
2026-04-29 15:54:14 +08:00
// 1) 清理已下载(state=4)且过期的任务(硬删除 + OSS
expired, err := dao.Task.ListExpiredDownloadedGlobal(ctx, 200)
if err != nil {
g.Log().Errorf(ctx, "[清理] 查询已下载过期任务失败: %v", err)
2026-04-29 15:54:14 +08:00
} else {
for _, t := range expired {
_ = os.Remove(t.TmpFile)
2026-04-29 15:54:14 +08:00
_ = dao.Task.HardDeleteByIDGlobal(ctx, t.Id)
}
g.Log().Infof(ctx, "[清理] 已下载过期任务清理完成, count=%d", len(expired))
2026-04-29 15:54:14 +08:00
}
// 2) 超时任务标失败
list, err := dao.Task.ListTimeoutTasksGlobal(ctx, 200)
if err != nil {
g.Log().Errorf(ctx, "[清理] 查询超时任务失败: %v", err)
2026-04-29 15:54:14 +08:00
} else {
for _, t := range list {
t.ErrorMsg = "任务超时自动失败"
_ = dao.Task.UpdateFailedGlobal(ctx, t)
queue.ReleaseQueueSlot(ctx, t.ModelName, t.TaskID)
2026-04-29 15:54:14 +08:00
}
g.Log().Infof(ctx, "[清理] 超时任务处理完成, count=%d", len(list))
2026-04-29 15:54:14 +08:00
}
// 3) 失败(state=3)的任务按模型配置 retry_times 重新入队(放到队尾)
retryable, err := dao.Task.ListFailedRetryableGlobal(ctx, 200)
if err != nil {
g.Log().Errorf(ctx, "[清理] 查询可重试任务失败: %v", err)
2026-04-29 15:54:14 +08:00
} else {
for _, t := range retryable {
// 失败任务重新入队state=3 -> 0先严格占用 queue_limit slot占用失败则留在失败态下一轮再尝试
// 获取模型配置以得到 queue_limit / expected_seconds
m, err := dao.Model.GetByModelNameForTenant(ctx, t.TenantId, t.ModelName)
if err != nil || m == nil {
continue
}
limit := queue.GetRuntimeQueueLimit(ctx, t.ModelName, m.QueueLimit)
2026-04-29 15:54:14 +08:00
if limit > 0 {
ok, _ := queue.AcquireQueueSlot(ctx, t.ModelName, t.TaskID, limit, m.ExpectedSeconds)
2026-04-29 15:54:14 +08:00
if !ok {
continue
}
}
// retry_queue_max_seconds 控制失败重试的排队策略:
// - =0失败重试插队到队首
// - >0当任务从创建到现在的排队时长 >= maxSeconds则插队到队首否则仍放到队尾
now := time.Now()
enqueueAt := now
maxSeconds := t.RetryQueueMaxSeconds
if maxSeconds == 0 {
enqueueAt = now.Add(-100 * 365 * 24 * time.Hour)
} else if maxSeconds > 0 && t.CreatedAt != nil {
if now.Sub(t.CreatedAt.Time) >= time.Duration(maxSeconds)*time.Second {
enqueueAt = now.Add(-100 * 365 * 24 * time.Hour)
}
}
_ = dao.Task.RequeueForRetryGlobal(ctx, t.Id, enqueueAt)
}
g.Log().Infof(ctx, "[清理] 可重试任务重新入队完成, count=%d", len(retryable))
2026-04-29 15:54:14 +08:00
}
// 4) 超过重试次数仍失败(state=3)的任务:硬删除
exhausted, err := dao.Task.ListFailedExhaustedGlobal(ctx, 200)
if err != nil {
g.Log().Errorf(ctx, "[清理] 查询重试耗尽任务失败: %v", err)
2026-04-29 15:54:14 +08:00
} else {
for _, t := range exhausted {
_ = os.Remove(t.TmpFile)
2026-04-29 15:54:14 +08:00
// 重试耗尽硬删除:释放闸门占位(兜底,若此前已释放则幂等)
queue.ReleaseQueueSlot(ctx, t.ModelName, t.TaskID)
2026-04-29 15:54:14 +08:00
_ = dao.Task.HardDeleteByIDGlobal(ctx, t.Id)
}
g.Log().Infof(ctx, "[清理] 重试耗尽任务清理完成, count=%d", len(exhausted))
2026-04-29 15:54:14 +08:00
}
return &dto.CleanWorkRes{
Ok: true,
}, nil
2026-04-29 15:54:14 +08:00
}