package audio

import (
	"context"
	"encoding/json"

	"gitea.com/red-future/common/beans"
	"github.com/gogf/gf/v2/frame/g"
	"github.com/gogf/gf/v2/net/ghttp"

	common "media/controller/common"
	dto "media/model/dto/audio"
	service "media/service/asr"
)

// audio is the stateless controller type that carries the audio HTTP handlers.
type audio struct{}

// AudioExtract is the package-level instance of the audio controller.
var AudioExtract = new(audio)

// safeResult is the recognition result as exposed to clients. It copies only
// the fields listed here, so internal details of dto.TranscribeResult (such as
// file paths) are never serialized.
type safeResult struct {
	Text          string               `json:"text"`
	Model         string               `json:"model"`
	Language      string               `json:"language"`
	AudioSize     int64                `json:"audioSize"`
	AudioDuration string               `json:"audioDuration"`
	Scenes        *dto.SceneSummaryDTO `json:"scenes,omitempty"`
}

// safeItem is the per-video entry of the response: the file name plus either a
// result or an error message.
type safeItem struct {
	FileName string      `json:"fileName"`
	Result   *safeResult `json:"result,omitempty"`
	Error    string      `json:"error,omitempty"`
}

// TranscribeHandler runs speech-to-text plus scene analysis for one or more
// videos.
//
// Two input modes are supported:
//  1. JSON body: {"video_urls":[...], "model":"medium", "language":"zh", "threshold":0.3}
//  2. File upload: the uploaded files are stored through common.SaveUploadedFiles,
//     with optional "model", "language" and "threshold" request parameters.
//
// In both modes an empty model/language falls back to the "whisper.model" /
// "whisper.language" configuration values, and a non-positive threshold falls
// back to 0.3.
//
// The response is always a JSON envelope {"code", "message", "data"}; the
// status is carried in "code" (200, 400 or 500) and this handler does not set
// the HTTP status line itself.
func (c *audio) TranscribeHandler(r *ghttp.Request) {
	const defaultThreshold = 0.3

	ctx := r.Context()
	// NOTE(review): every request runs as a hard-coded "admin" user, and the
	// context key is a plain string (linters such as staticcheck SA1029 flag
	// this). Both are left as-is because the code that reads the "user" key is
	// not visible from this file.
	ctx = context.WithValue(ctx, "user", &beans.User{UserName: "admin"})

	// Config-backed defaults, resolved lazily so the JSON path only touches the
	// configuration when a field is actually missing.
	cfgModel := func() string {
		return g.Cfg().MustGet(ctx, "whisper.model", "medium").String()
	}
	cfgLanguage := func() string {
		return g.Cfg().MustGet(ctx, "whisper.language", "zh").String()
	}

	// Try the JSON body (URL list mode) first.
	// NOTE(review): GetBody is also called for multipart uploads; presumably it
	// buffers the whole request body in memory - confirm for large videos.
	body := r.GetBody()
	if len(body) > 0 && body[0] == '{' {
		var req dto.TranscribeReq
		// A body that fails to parse, or that carries no URLs, deliberately falls
		// through to upload mode, which then answers with the generic 400 below.
		if err := json.Unmarshal(body, &req); err == nil && len(req.VideoURLs) > 0 {
			if req.Model == "" {
				req.Model = cfgModel()
			}
			if req.Language == "" {
				req.Language = cfgLanguage()
			}
			if req.Threshold <= 0 {
				req.Threshold = defaultThreshold
			}

			res, svcErr := service.VideoTranscribe.TranscribeWithURLs(ctx, &req)
			if svcErr != nil {
				r.Response.WriteJson(g.Map{"code": 500, "message": svcErr.Error()})
				return
			}
			r.Response.WriteJson(g.Map{"code": 200, "message": "success", "data": toSafeItems(res.Results)})
			return
		}
	}

	// File upload mode.
	savePaths, err := common.SaveUploadedFiles(r)
	if err != nil || len(savePaths) == 0 {
		r.Response.WriteJson(g.Map{"code": 400, "message": "请上传视频文件( multipart )或提供 video_urls( JSON )"})
		return
	}

	// Apply the same "empty / non-positive means unset" rules as the JSON path,
	// so both modes hand equivalent parameters to the service.
	fallbackModel := cfgModel()
	model := r.Get("model", fallbackModel).String()
	if model == "" {
		model = fallbackModel
	}
	fallbackLanguage := cfgLanguage()
	language := r.Get("language", fallbackLanguage).String()
	if language == "" {
		language = fallbackLanguage
	}
	threshold := r.Get("threshold", defaultThreshold).Float64()
	if threshold <= 0 {
		threshold = defaultThreshold
	}

	results := service.VideoTranscribe.TranscribeUpload(ctx, savePaths, model, language, threshold)
	r.Response.WriteJson(g.Map{"code": 200, "message": "success", "data": toSafeItems(results)})
}

// toSafeItems converts service results into the client-facing format, copying
// only the whitelisted fields of each result.
//
// The returned slice is never nil, so an empty input is serialized as a JSON
// array ([]) rather than null. Entries whose Result is absent, is not a
// *dto.TranscribeResult, or is a nil pointer are emitted without a "result"
// field.
func toSafeItems(results []dto.TranscribeItem) []safeItem {
	items := make([]safeItem, 0, len(results))
	for i := range results {
		item := &results[i]
		si := safeItem{FileName: item.FileName, Error: item.Error}
		// The comma-ok assertion already yields ok == false for a nil interface;
		// the extra nil check guards against a typed nil pointer stored in it.
		if tr, ok := item.Result.(*dto.TranscribeResult); ok && tr != nil {
			si.Result = &safeResult{
				Text:          tr.Text,
				Model:         tr.Model,
				Language:      tr.Language,
				AudioSize:     tr.AudioSize,
				AudioDuration: tr.AudioDuration,
				Scenes:        tr.Scenes,
			}
		}
		items = append(items, si)
	}
	return items
}