代码初始化
This commit is contained in:
232
service/asr/transcribe_service.go
Normal file
232
service/asr/transcribe_service.go
Normal file
@@ -0,0 +1,232 @@
|
||||
package asr
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
dto "media/model/dto/audio"
|
||||
serviceAudio "media/service/audio"
|
||||
serviceScene "media/service/scene"
|
||||
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
)
|
||||
|
||||
// VideoTranscribeReq describes a single video-to-text transcription request
// handled by TranscribeVideo.
type VideoTranscribeReq struct {
	// VideoPath is the path of the video file the audio track is extracted from.
	VideoPath string
	// Model is the speech-recognition model name forwarded to Whisper.
	Model string
	// Language is the language setting forwarded to Whisper.
	Language string
	// KeepAudio, when true, keeps the extracted audio file (and the .txt files
	// next to it) on disk after a successful transcription.
	KeepAudio bool
}
|
||||
|
||||
// VideoTranscribeRes is the outcome of a video-to-text transcription.
type VideoTranscribeRes struct {
	// Text is the recognized transcript.
	Text string `json:"text"`
	// Model is the model name reported by the recognizer.
	Model string `json:"model"`
	// Language is the language reported by the recognizer.
	Language string `json:"language"`
	// AudioPath is the path of the audio file extracted from the video.
	AudioPath string `json:"audioPath"`
	// AudioSize is the size reported by the audio extractor (presumably bytes).
	AudioSize int64 `json:"audioSize"`
	// AudioDuration is the duration string reported by the audio extractor.
	AudioDuration string `json:"audioDuration"`
}
|
||||
|
||||
// transcribeService groups the video transcription operations. It carries no
// state of its own.
type transcribeService struct{}

// VideoTranscribe is the package-level instance of the transcription service.
var VideoTranscribe = &transcribeService{}
|
||||
|
||||
// TranscribeWithURLs 从 URL 下载视频并转录
|
||||
func (s *transcribeService) TranscribeWithURLs(ctx context.Context, req *dto.TranscribeReq) (res *dto.TranscribeRes, err error) {
|
||||
if len(req.VideoURLs) == 0 {
|
||||
return nil, errors.New("video_urls 不能为空")
|
||||
}
|
||||
|
||||
tempDir := getTempDir(ctx)
|
||||
os.MkdirAll(tempDir, 0755)
|
||||
|
||||
var savePaths []string
|
||||
for _, videoURL := range req.VideoURLs {
|
||||
savePath, dlErr := downloadFromURL(ctx, videoURL, tempDir)
|
||||
if dlErr != nil {
|
||||
continue
|
||||
}
|
||||
savePaths = append(savePaths, savePath)
|
||||
}
|
||||
if len(savePaths) == 0 {
|
||||
return nil, errors.New("所有视频下载均失败")
|
||||
}
|
||||
|
||||
results := s.processVideos(ctx, savePaths, req.Model, req.Language, req.Threshold)
|
||||
res = &dto.TranscribeRes{Results: results}
|
||||
return
|
||||
}
|
||||
|
||||
// TranscribeUpload 从已保存的文件转录
|
||||
func (s *transcribeService) TranscribeUpload(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
|
||||
return s.processVideos(ctx, savePaths, model, language, threshold)
|
||||
}
|
||||
|
||||
// processVideos 逐个处理视频
|
||||
func (s *transcribeService) processVideos(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
|
||||
var results []dto.TranscribeItem
|
||||
|
||||
for _, savePath := range savePaths {
|
||||
fileName := filepath.Base(savePath)
|
||||
if idx := strings.Index(fileName, "_"); idx > 0 {
|
||||
fileName = fileName[idx+1:]
|
||||
}
|
||||
|
||||
// 场景分析
|
||||
var scenes *dto.SceneSummaryDTO
|
||||
sceneRes, sceneErr := serviceScene.SceneAnalyzer.Analyze(ctx, &serviceScene.SceneAnalyzeReq{
|
||||
VideoPaths: []string{savePath},
|
||||
Threshold: threshold,
|
||||
ExtractKeyframes: false,
|
||||
})
|
||||
if sceneErr == nil && len(sceneRes.Analyses) > 0 {
|
||||
scenes = toSceneDTO(&sceneRes.Analyses[0])
|
||||
}
|
||||
|
||||
// 语音转文字(内部删除视频文件)
|
||||
transRes, transErr := s.TranscribeVideo(ctx, &VideoTranscribeReq{
|
||||
VideoPath: savePath,
|
||||
Model: model,
|
||||
Language: language,
|
||||
})
|
||||
if transErr != nil {
|
||||
os.Remove(savePath)
|
||||
results = append(results, dto.TranscribeItem{FileName: fileName, Error: transErr.Error()})
|
||||
continue
|
||||
}
|
||||
|
||||
results = append(results, dto.TranscribeItem{
|
||||
FileName: fileName,
|
||||
Result: &dto.TranscribeResult{
|
||||
Text: transRes.Text,
|
||||
Model: transRes.Model,
|
||||
Language: transRes.Language,
|
||||
AudioPath: transRes.AudioPath,
|
||||
AudioSize: transRes.AudioSize,
|
||||
AudioDuration: transRes.AudioDuration,
|
||||
Scenes: scenes,
|
||||
},
|
||||
})
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
// TranscribeVideo 从视频提取音频并转为文字
|
||||
func (s *transcribeService) TranscribeVideo(ctx context.Context, req *VideoTranscribeReq) (res *VideoTranscribeRes, err error) {
|
||||
audioReq := &serviceAudio.ExtractAudioReq{VideoPath: req.VideoPath, Format: "mp3"}
|
||||
audioRes, err := serviceAudio.AudioExtract.Extract(ctx, audioReq)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("音频提取失败: %v", err)
|
||||
}
|
||||
|
||||
whisperRes, err := Whisper.Transcribe(ctx, &TranscribeReq{AudioPath: audioRes.AudioPath, Model: req.Model, Language: req.Language})
|
||||
if err != nil {
|
||||
os.Remove(audioRes.AudioPath)
|
||||
return nil, fmt.Errorf("语音识别失败: %v", err)
|
||||
}
|
||||
|
||||
os.Remove(req.VideoPath)
|
||||
if !req.KeepAudio {
|
||||
os.Remove(audioRes.AudioPath)
|
||||
baseName := strings.TrimSuffix(audioRes.AudioPath, filepath.Ext(audioRes.AudioPath))
|
||||
os.Remove(baseName + ".txt")
|
||||
os.Remove(baseName + "." + whisperRes.Model + ".txt")
|
||||
}
|
||||
|
||||
res = &VideoTranscribeRes{
|
||||
Text: whisperRes.Text,
|
||||
Model: whisperRes.Model,
|
||||
Language: whisperRes.Language,
|
||||
AudioPath: audioRes.AudioPath,
|
||||
AudioSize: audioRes.Size,
|
||||
AudioDuration: audioRes.Duration,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// downloadFromURL fetches rawURL with an HTTP GET and stores the response
// body in tempDir as "<unix-millis>_<name>", where name is the last segment of
// the URL path, or a generated "video_<unix-millis>.mp4" when that segment is
// empty.
//
// The request is bound to ctx, so cancelling ctx aborts the download; the
// client also enforces a 10 minute overall timeout. Any status other than
// 200 OK is reported as an error.
//
// It returns the path of the saved file. On any failure the returned path is
// empty and a partially written file is removed.
func downloadFromURL(ctx context.Context, rawURL, tempDir string) (string, error) {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}

	segments := strings.Split(parsedURL.Path, "/")
	fileName := segments[len(segments)-1]
	if fileName == "" {
		fileName = fmt.Sprintf("video_%d.mp4", time.Now().UnixMilli())
	} else {
		// The name comes from an untrusted URL. Reduce it to a base name so
		// OS-specific separators (e.g. "\" on Windows) cannot steer the file
		// outside tempDir.
		fileName = filepath.Base(fileName)
	}
	savePath := filepath.Join(tempDir, fmt.Sprintf("%d_%s", time.Now().UnixMilli(), fileName))

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", err
	}
	client := &http.Client{Timeout: 10 * time.Minute}
	resp, err := client.Do(httpReq)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	out, err := os.Create(savePath)
	if err != nil {
		return "", err
	}

	// Close explicitly (not via defer) so that a flush error is reported and
	// the file is no longer open when it has to be removed.
	_, err = io.Copy(out, resp.Body)
	if closeErr := out.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		_ = os.Remove(savePath) // best-effort cleanup of the partial file
		return "", err
	}
	return savePath, nil
}
|
||||
|
||||
func getTempDir(ctx context.Context) string {
|
||||
tempDir := g.Cfg().MustGet(ctx, "ffmpeg.temp_dir", "resource/temp").String()
|
||||
if tempDir == "" {
|
||||
tempDir = "resource/temp"
|
||||
}
|
||||
if !filepath.IsAbs(tempDir) {
|
||||
absDir, _ := filepath.Abs(tempDir)
|
||||
tempDir = absDir
|
||||
}
|
||||
return tempDir
|
||||
}
|
||||
|
||||
// toSceneDTO 将场景服务的原始结果转为 DTO 格式
|
||||
func toSceneDTO(analysis *serviceScene.VideoSceneAnalysis) *dto.SceneSummaryDTO {
|
||||
if analysis == nil {
|
||||
return nil
|
||||
}
|
||||
shots := make([]dto.SceneShotDTO, 0, len(analysis.Scenes))
|
||||
for _, s := range analysis.Scenes {
|
||||
shots = append(shots, dto.SceneShotDTO{
|
||||
SceneIndex: s.SceneIndex,
|
||||
StartTimeStr: s.StartTimeStr,
|
||||
EndTimeStr: s.EndTimeStr,
|
||||
DurationStr: s.DurationStr,
|
||||
ShotType: s.ShotType,
|
||||
Composition: s.Composition,
|
||||
NarrativePos: s.NarrativePos,
|
||||
Description: s.Description,
|
||||
})
|
||||
}
|
||||
return &dto.SceneSummaryDTO{
|
||||
TotalScenes: analysis.TotalScenes,
|
||||
DurationStr: analysis.DurationStr,
|
||||
AspectRatio: analysis.AspectRatio,
|
||||
Orientation: analysis.Orientation,
|
||||
Pacing: analysis.Summary.Pacing,
|
||||
ShotTypes: analysis.Summary.ShotTypeDist,
|
||||
Scenes: shots,
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user