代码初始化

2026-05-19 14:33:06 +08:00
commit 219b7e39c7
18 changed files with 3311 additions and 0 deletions
--- a/service/asr/transcribe_service.go
+++ b/service/asr/transcribe_service.go
@@ -0,0 +1,232 @@
+package asr
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	dto "media/model/dto/audio"
+	serviceAudio "media/service/audio"
+	serviceScene "media/service/scene"
+
+	"github.com/gogf/gf/v2/frame/g"
+)
+
+// VideoTranscribeReq describes a request to run speech recognition on one video file.
+type VideoTranscribeReq struct {
+	VideoPath string // path of the video; TranscribeVideo deletes this file after a successful transcription
+	Model     string // passed through to Whisper.Transcribe
+	Language  string // passed through to Whisper.Transcribe
+	KeepAudio bool   // when false, the extracted audio and its ".txt" side files are removed afterwards
+}
+
+// VideoTranscribeRes is the result of running speech recognition on one video file.
+type VideoTranscribeRes struct {
+	Text          string `json:"text"`          // text returned by Whisper.Transcribe
+	Model         string `json:"model"`         // model reported by Whisper.Transcribe
+	Language      string `json:"language"`      // language reported by Whisper.Transcribe
+	AudioPath     string `json:"audioPath"`     // path of the extracted audio; the file is already deleted unless KeepAudio was set
+	AudioSize     int64  `json:"audioSize"`     // size reported by the audio extractor (presumably bytes; confirm against serviceAudio)
+	AudioDuration string `json:"audioDuration"` // duration string reported by the audio extractor
+}
+
+// transcribeService groups the video transcription operations; it has no fields.
+type transcribeService struct{}
+
+// VideoTranscribe is the shared package-level instance of transcribeService.
+var VideoTranscribe = new(transcribeService)
+
+// TranscribeWithURLs downloads every URL in req.VideoURLs into the temp
+// directory and transcribes the videos that were fetched successfully.
+//
+// Results holds the items produced by processVideos for the downloaded files,
+// followed by one item per URL that could not be downloaded; such an item
+// carries the URL in FileName and the download error in Error, so partial
+// failures are visible to the caller instead of being silently dropped.
+//
+// An error is returned when req is nil or has no URLs, when the temp directory
+// cannot be created, or when every download fails (the last download error is
+// wrapped for diagnosis).
+func (s *transcribeService) TranscribeWithURLs(ctx context.Context, req *dto.TranscribeReq) (res *dto.TranscribeRes, err error) {
+	if req == nil || len(req.VideoURLs) == 0 {
+		return nil, errors.New("video_urls 不能为空")
+	}
+
+	tempDir := getTempDir(ctx)
+	// Without the directory every download would fail with a less helpful error.
+	if err = os.MkdirAll(tempDir, 0755); err != nil {
+		return nil, fmt.Errorf("创建临时目录失败: %w", err)
+	}
+
+	savePaths := make([]string, 0, len(req.VideoURLs))
+	var (
+		failed  []dto.TranscribeItem
+		lastErr error
+	)
+	for _, videoURL := range req.VideoURLs {
+		savePath, dlErr := downloadFromURL(ctx, videoURL, tempDir)
+		if dlErr != nil {
+			// Keep going with the remaining URLs, but remember what went wrong.
+			lastErr = dlErr
+			failed = append(failed, dto.TranscribeItem{FileName: videoURL, Error: dlErr.Error()})
+			continue
+		}
+		savePaths = append(savePaths, savePath)
+	}
+	if len(savePaths) == 0 {
+		return nil, fmt.Errorf("所有视频下载均失败: %w", lastErr)
+	}
+
+	results := s.processVideos(ctx, savePaths, req.Model, req.Language, req.Threshold)
+	results = append(results, failed...)
+	return &dto.TranscribeRes{Results: results}, nil
+}
+
+// TranscribeUpload transcribes video files that are already stored at
+// savePaths. It delegates to processVideos and returns its items unchanged.
+func (s *transcribeService) TranscribeUpload(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
+	items := s.processVideos(ctx, savePaths, model, language, threshold)
+	return items
+}
+
+// processVideos handles the videos one at a time and returns one item per
+// path, in input order (nil when savePaths is empty).
+//
+// For each video it runs scene analysis first and speech-to-text second. The
+// reported FileName is the base name of the path with everything up to and
+// including the first "_" removed, which drops the "<timestamp>_" prefix that
+// downloadFromURL puts on saved files. Scene analysis is best-effort: if it
+// fails or yields no analyses, Scenes is simply left nil. When transcription
+// fails the video file is removed here and the item carries only Error.
+func (s *transcribeService) processVideos(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
+	var items []dto.TranscribeItem
+
+	for _, videoPath := range savePaths {
+		displayName := filepath.Base(videoPath)
+		if pos := strings.IndexByte(displayName, '_'); pos >= 1 {
+			displayName = displayName[pos+1:]
+		}
+
+		// Scene analysis must run before transcription, because a successful
+		// TranscribeVideo call deletes the video file.
+		analyzeReq := &serviceScene.SceneAnalyzeReq{
+			VideoPaths:       []string{videoPath},
+			Threshold:        threshold,
+			ExtractKeyframes: false,
+		}
+		var sceneSummary *dto.SceneSummaryDTO
+		if analyzed, analyzeErr := serviceScene.SceneAnalyzer.Analyze(ctx, analyzeReq); analyzeErr == nil && len(analyzed.Analyses) > 0 {
+			sceneSummary = toSceneDTO(&analyzed.Analyses[0])
+		}
+
+		item := dto.TranscribeItem{FileName: displayName}
+		transcript, transcribeErr := s.TranscribeVideo(ctx, &VideoTranscribeReq{
+			VideoPath: videoPath,
+			Model:     model,
+			Language:  language,
+		})
+		if transcribeErr == nil {
+			item.Result = &dto.TranscribeResult{
+				Text:          transcript.Text,
+				Model:         transcript.Model,
+				Language:      transcript.Language,
+				AudioPath:     transcript.AudioPath,
+				AudioSize:     transcript.AudioSize,
+				AudioDuration: transcript.AudioDuration,
+				Scenes:        sceneSummary,
+			}
+		} else {
+			// TranscribeVideo only deletes the video on success, so clean up here.
+			os.Remove(videoPath)
+			item.Error = transcribeErr.Error()
+		}
+		items = append(items, item)
+	}
+	return items
+}
+
+// TranscribeVideo extracts the audio track of req.VideoPath as mp3 and turns it
+// into text with Whisper.
+//
+// On success the video file is deleted. Unless req.KeepAudio is set, the
+// extracted audio and the "<audio>.txt" / "<audio>.<model>.txt" side files are
+// deleted as well. All deletions are best-effort and their errors are ignored.
+// If audio extraction fails nothing is deleted; if recognition fails only the
+// extracted audio is removed and the video is left for the caller.
+//
+// Returned errors wrap the underlying cause, so errors.Is / errors.As can be
+// used on them. A nil req yields an error instead of a panic.
+func (s *transcribeService) TranscribeVideo(ctx context.Context, req *VideoTranscribeReq) (res *VideoTranscribeRes, err error) {
+	if req == nil {
+		return nil, errors.New("请求参数不能为空")
+	}
+
+	audioReq := &serviceAudio.ExtractAudioReq{VideoPath: req.VideoPath, Format: "mp3"}
+	audioRes, err := serviceAudio.AudioExtract.Extract(ctx, audioReq)
+	if err != nil {
+		return nil, fmt.Errorf("音频提取失败: %w", err)
+	}
+
+	whisperRes, err := Whisper.Transcribe(ctx, &TranscribeReq{AudioPath: audioRes.AudioPath, Model: req.Model, Language: req.Language})
+	if err != nil {
+		// The audio is useless without a transcript; drop it (best-effort).
+		os.Remove(audioRes.AudioPath)
+		return nil, fmt.Errorf("语音识别失败: %w", err)
+	}
+
+	// Best-effort cleanup of the inputs and intermediate files.
+	os.Remove(req.VideoPath)
+	if !req.KeepAudio {
+		os.Remove(audioRes.AudioPath)
+		baseName := strings.TrimSuffix(audioRes.AudioPath, filepath.Ext(audioRes.AudioPath))
+		os.Remove(baseName + ".txt")
+		os.Remove(baseName + "." + whisperRes.Model + ".txt")
+	}
+
+	// NOTE(review): AudioPath is reported even when the file was just deleted
+	// above (KeepAudio == false); callers must not assume it still exists.
+	return &VideoTranscribeRes{
+		Text:          whisperRes.Text,
+		Model:         whisperRes.Model,
+		Language:      whisperRes.Language,
+		AudioPath:     audioRes.AudioPath,
+		AudioSize:     audioRes.Size,
+		AudioDuration: audioRes.Duration,
+	}, nil
+}
+
+// downloadFromURL fetches rawURL with an HTTP GET and stores the body in
+// tempDir as "<unix-millis>_<name>", where <name> is the last segment of the
+// URL path (or "video_<unix-millis>.mp4" when the path ends in "/" or is
+// empty). It returns the path of the saved file.
+//
+// The request is bound to ctx, so cancelling ctx aborts the download, and it is
+// additionally capped at 10 minutes. Any response status other than 200 is an
+// error. On failure the returned path is empty and no partial file is left
+// behind.
+func downloadFromURL(ctx context.Context, rawURL, tempDir string) (string, error) {
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		return "", err
+	}
+	// Everything after the last "/" of the (already unescaped) URL path.
+	fileName := parsedURL.Path[strings.LastIndex(parsedURL.Path, "/")+1:]
+	if fileName == "" {
+		fileName = fmt.Sprintf("video_%d.mp4", time.Now().UnixMilli())
+	}
+	// The name comes from an untrusted URL; strip any OS-specific separators
+	// (e.g. "\" on Windows) so the file cannot land outside tempDir.
+	fileName = filepath.Base(fileName)
+	savePath := filepath.Join(tempDir, fmt.Sprintf("%d_%s", time.Now().UnixMilli(), fileName))
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
+	if err != nil {
+		return "", err
+	}
+	client := &http.Client{Timeout: 10 * time.Minute}
+	resp, err := client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
+	}
+
+	out, err := os.Create(savePath)
+	if err != nil {
+		return "", err
+	}
+
+	if _, err = io.Copy(out, resp.Body); err != nil {
+		// Close before removing: an open file cannot be deleted on Windows.
+		out.Close()
+		os.Remove(savePath)
+		return "", err
+	}
+	// Close can report a deferred write error; treat it like a failed copy.
+	if err = out.Close(); err != nil {
+		os.Remove(savePath)
+		return "", err
+	}
+	return savePath, nil
+}
+
+func getTempDir(ctx context.Context) string {
+	tempDir := g.Cfg().MustGet(ctx, "ffmpeg.temp_dir", "resource/temp").String()
+	if tempDir == "" {
+		tempDir = "resource/temp"
+	}
+	if !filepath.IsAbs(tempDir) {
+		absDir, _ := filepath.Abs(tempDir)
+		tempDir = absDir
+	}
+	return tempDir
+}
+
+// toSceneDTO maps a scene-service analysis onto the DTO shape: the per-video
+// summary fields are copied over and every scene becomes one SceneShotDTO, in
+// the same order. A nil analysis yields nil; an analysis without scenes yields
+// a summary whose Scenes slice is empty but non-nil.
+func toSceneDTO(analysis *serviceScene.VideoSceneAnalysis) *dto.SceneSummaryDTO {
+	if analysis == nil {
+		return nil
+	}
+
+	converted := make([]dto.SceneShotDTO, 0, len(analysis.Scenes))
+	for _, scene := range analysis.Scenes {
+		shot := dto.SceneShotDTO{
+			SceneIndex:   scene.SceneIndex,
+			StartTimeStr: scene.StartTimeStr,
+			EndTimeStr:   scene.EndTimeStr,
+			DurationStr:  scene.DurationStr,
+			ShotType:     scene.ShotType,
+			Composition:  scene.Composition,
+			NarrativePos: scene.NarrativePos,
+			Description:  scene.Description,
+		}
+		converted = append(converted, shot)
+	}
+
+	summary := new(dto.SceneSummaryDTO)
+	summary.TotalScenes = analysis.TotalScenes
+	summary.DurationStr = analysis.DurationStr
+	summary.AspectRatio = analysis.AspectRatio
+	summary.Orientation = analysis.Orientation
+	summary.Pacing = analysis.Summary.Pacing
+	summary.ShotTypes = analysis.Summary.ShotTypeDist
+	summary.Scenes = converted
+	return summary
+}