Files
media/service/scene/scene_service.go

658 lines
17 KiB
Go
Raw Normal View History

2026-05-19 14:33:06 +08:00
package scene
import (
"bufio"
"context"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
"sync"
"github.com/gogf/gf/v2/frame/g"
)
// SceneAnalyzerService is the scene analysis service. It carries no state;
// its methods drive ffmpeg/ffprobe to split videos into scenes.
type SceneAnalyzerService struct{}
// SceneAnalyzer is the package-level singleton of the scene analysis service.
var SceneAnalyzer = new(SceneAnalyzerService)
// KeyframeInfo describes a keyframe image extracted for a scene.
type KeyframeInfo struct {
Path string `json:"path"` // path of the keyframe image
TimeStr string `json:"timeStr"` // time point of the frame
Width int `json:"width"` // image width
Height int `json:"height"` // image height
}
// SceneInfo holds the information about a single scene.
type SceneInfo struct {
SceneIndex int `json:"sceneIndex"` // scene sequence number
StartTime float64 `json:"startTime"` // start time, precise to 3 decimal places
EndTime float64 `json:"endTime"` // end time (seconds)
Duration float64 `json:"duration"` // duration (seconds)
StartTimeStr string `json:"startTimeStr"` // HH:MM:SS.mmm
EndTimeStr string `json:"endTimeStr"`
DurationStr string `json:"durationStr"`
ShotType string `json:"shotType"` // shot type
MotionLevel string `json:"motionLevel"` // motion level
Composition string `json:"composition"` // composition type
NarrativePos string `json:"narrativePos"` // narrative position
Keyframe *KeyframeInfo `json:"keyframe,omitempty"` // keyframe (if one was extracted)
Description string `json:"description"` // scene description (for AI consumption)
}
// VideoSceneAnalysis is the scene analysis result of a single video.
type VideoSceneAnalysis struct {
FileName string `json:"fileName"`
FilePath string `json:"filePath"`
Duration float64 `json:"duration"`
DurationStr string `json:"durationStr"`
FrameRate float64 `json:"frameRate"`
Width int `json:"width"`
Height int `json:"height"`
AspectRatio string `json:"aspectRatio"` // aspect ratio of the picture
Orientation string `json:"orientation"` // "横屏" (landscape) or "竖屏" (portrait)
TotalScenes int `json:"totalScenes"`
Scenes []SceneInfo `json:"scenes"`
DetectParams DetectParams `json:"detectParams"`
Summary SceneSummary `json:"summary"` // scene overview
}
// SceneSummary is the overview computed across all scenes of one video.
type SceneSummary struct {
AvgShotDuration float64 `json:"avgShotDuration"` // average shot duration
MinShotDuration float64 `json:"minShotDuration"`
MaxShotDuration float64 `json:"maxShotDuration"`
ShotTypeDist map[string]int `json:"shotTypeDist"` // shot type distribution
MotionDist map[string]int `json:"motionDist"` // motion level distribution
CompositionDist map[string]int `json:"compositionDist"` // composition distribution
Pacing string `json:"pacing"` // editing pace
KeyframesDir string `json:"keyframesDir,omitempty"` // keyframe directory
}
// DetectParams records the detection parameters used for an analysis.
type DetectParams struct {
Threshold float64 `json:"threshold"`
Method string `json:"method"`
ExtractKeyframes bool `json:"extractKeyframes"`
}
// SceneAnalyzeReq is the scene analysis request.
type SceneAnalyzeReq struct {
VideoPaths []string // list of video file paths
Threshold float64 // scene detection threshold, 0.1-0.5, default 0.3 (Analyze substitutes 0.3 when the value is <= 0 or > 1)
ExtractKeyframes bool // whether to extract keyframe images
}
// SceneAnalyzeRes is the scene analysis response.
type SceneAnalyzeRes struct {
Analyses []VideoSceneAnalysis `json:"analyses"`
}
var (
// ptsTimeRegex captures the numeric value that follows "pts_time:" in a
// line of ffmpeg output; detectScenes uses it to read scene-change
// timestamps.
ptsTimeRegex = regexp.MustCompile(`pts_time:([\d.]+)`)
)
// Analyze runs scene analysis on every video in req.VideoPaths concurrently.
//
// The detection threshold falls back to 0.3 when req.Threshold is not within
// (0, 1] (this also covers NaN). Videos that fail are logged and skipped; the
// call only returns an error when req is nil, when no paths were supplied, or
// when every video failed. Successful analyses are returned in the same order
// as the corresponding entries of req.VideoPaths.
func (s *SceneAnalyzerService) Analyze(ctx context.Context, req *SceneAnalyzeReq) (res *SceneAnalyzeRes, err error) {
	if req == nil || len(req.VideoPaths) == 0 {
		return nil, fmt.Errorf("未提供待分析的视频")
	}

	threshold := req.Threshold
	if !(threshold > 0 && threshold <= 1) {
		threshold = 0.3
	}

	// Each goroutine writes only to its own slot, so no mutex is needed and
	// the result order no longer depends on which analysis finishes first.
	var (
		wg       sync.WaitGroup
		results  = make([]*VideoSceneAnalysis, len(req.VideoPaths))
		failures = make([]error, len(req.VideoPaths))
	)
	for i, videoPath := range req.VideoPaths {
		wg.Add(1)
		go func(idx int, vp string) {
			defer wg.Done()
			analysis, aErr := s.analyzeSingle(ctx, vp, threshold, req.ExtractKeyframes)
			if aErr != nil {
				failures[idx] = fmt.Errorf("分析失败 [%s]: %w", filepath.Base(vp), aErr)
				return
			}
			results[idx] = analysis
		}(i, videoPath)
	}
	wg.Wait()

	analyses := make([]VideoSceneAnalysis, 0, len(results))
	var errs []string
	for i, analysis := range results {
		if failures[i] != nil {
			errs = append(errs, failures[i].Error())
			continue
		}
		if analysis != nil {
			analyses = append(analyses, *analysis)
		}
	}

	if len(errs) > 0 {
		g.Log().Errorf(ctx, "部分视频分析失败: %s", strings.Join(errs, "; "))
	}
	if len(analyses) == 0 {
		return nil, fmt.Errorf("所有视频分析均失败: %s", strings.Join(errs, "; "))
	}
	return &SceneAnalyzeRes{Analyses: analyses}, nil
}
// analyzeSingle runs the complete scene analysis for one video file.
//
// It resolves the ffmpeg binary, reads the video metadata, detects the
// scene-change timestamps, splits the timeline into scenes and labels every
// scene (shot type, motion level, composition, narrative position). When
// extractKeyframes is true it also saves one JPEG per scene, taken at the
// scene's midpoint, into a "keyframes_<video file name>" directory next to
// the video.
//
// Keyframe extraction is best effort: a failure to create the directory or to
// extract an individual frame is logged and the analysis continues without
// that keyframe. Failures of the metadata or scene-detection steps are
// returned as errors.
func (s *SceneAnalyzerService) analyzeSingle(ctx context.Context, videoPath string, threshold float64, extractKeyframes bool) (*VideoSceneAnalysis, error) {
	ffmpegPath, err := s.getFFmpegPath()
	if err != nil {
		return nil, err
	}

	// 1. Video metadata.
	duration, frameRate, width, height, err := s.getVideoMeta(ctx, ffmpegPath, videoPath)
	if err != nil {
		return nil, fmt.Errorf("获取视频元数据失败: %w", err)
	}

	// 2. Scene detection.
	sceneChanges, err := s.detectScenes(ctx, ffmpegPath, videoPath, threshold)
	if err != nil {
		return nil, fmt.Errorf("场景检测失败: %w", err)
	}

	// 3. Scene list.
	rawScenes := s.buildScenes(sceneChanges, duration)

	// 4. Keyframe directory (only when requested). keyframesDir stays empty
	// when the directory cannot be created, which disables extraction below
	// and keeps a non-existent directory out of the summary.
	keyframesDir := ""
	if extractKeyframes {
		dir := filepath.Join(filepath.Dir(videoPath), "keyframes_"+filepath.Base(videoPath))
		if mkErr := os.MkdirAll(dir, 0755); mkErr != nil {
			g.Log().Warningf(ctx, "创建关键帧目录失败 [%s]: %v", dir, mkErr)
		} else {
			keyframesDir = dir
		}
	}

	// gcd returns 0 when both dimensions are 0 (no dimensions were parsed);
	// dividing by it would panic, so the ratio is left empty in that case.
	aspectRatio := ""
	if div := gcd(width, height); div != 0 {
		aspectRatio = fmt.Sprintf("%d:%d", width/div, height/div)
	}
	orientation := "横屏"
	if height > width {
		orientation = "竖屏"
	}

	// Drop everything up to and including the first underscore of the file
	// name (presumably a storage prefix added on upload — confirm with the
	// caller).
	fileName := filepath.Base(videoPath)
	if idx := strings.Index(fileName, "_"); idx > 0 {
		fileName = fileName[idx+1:]
	}

	totalScenes := len(rawScenes)
	scenes := make([]SceneInfo, totalScenes)
	shotDist := make(map[string]int)
	motionDist := make(map[string]int)
	compDist := make(map[string]int)

	for i, rs := range rawScenes {
		scene := SceneInfo{
			SceneIndex:   rs.SceneIndex,
			StartTime:    round3(rs.StartTime),
			EndTime:      round3(rs.EndTime),
			Duration:     round3(rs.Duration),
			StartTimeStr: rs.StartTimeStr,
			EndTimeStr:   rs.EndTimeStr,
			DurationStr:  rs.DurationStr,
		}

		scene.ShotType = classifyShotType(rs.Duration)
		shotDist[scene.ShotType]++

		scene.MotionLevel = classifyMotionLevel(rs.Duration, duration)
		motionDist[scene.MotionLevel]++

		scene.Composition = classifyComposition(rs.Duration, width, height)
		compDist[scene.Composition]++

		scene.NarrativePos = narrativePosition(rs.StartTime, duration)

		if keyframesDir != "" {
			midTime := (rs.StartTime + rs.EndTime) / 2
			kfPath := filepath.Join(keyframesDir, fmt.Sprintf("scene_%03d.jpg", rs.SceneIndex))
			if kfErr := s.extractKeyframe(ctx, ffmpegPath, videoPath, midTime, kfPath); kfErr != nil {
				g.Log().Warningf(ctx, "提取关键帧失败 [%s]: %v", kfPath, kfErr)
			} else {
				scene.Keyframe = &KeyframeInfo{
					Path:    kfPath,
					TimeStr: formatTime(midTime),
					Width:   width,
					Height:  height,
				}
			}
		}

		// The description is built last because it reads the labels set above.
		scene.Description = buildSceneDescription(scene)
		scenes[i] = scene
	}

	analysis := &VideoSceneAnalysis{
		FileName:    fileName,
		FilePath:    videoPath,
		Duration:    round3(duration),
		DurationStr: formatTime(duration),
		FrameRate:   round3(frameRate),
		Width:       width,
		Height:      height,
		AspectRatio: aspectRatio,
		Orientation: orientation,
		TotalScenes: totalScenes,
		Scenes:      scenes,
		DetectParams: DetectParams{
			Threshold:        threshold,
			Method:           "ffmpeg scene filter",
			ExtractKeyframes: extractKeyframes,
		},
		Summary: s.buildSummary(scenes, shotDist, motionDist, compDist, keyframesDir),
	}
	return analysis, nil
}

// narrativePosition maps the relative start of a scene (startTime divided by
// totalDuration) to a narrative stage label. A non-positive totalDuration is
// treated as ratio 0 so the division cannot produce NaN or Inf.
func narrativePosition(startTime, totalDuration float64) string {
	ratio := 0.0
	if totalDuration > 0 {
		ratio = startTime / totalDuration
	}
	switch {
	case ratio < 0.15:
		return "开头引入"
	case ratio < 0.35:
		return "前段发展"
	case ratio < 0.65:
		return "中段高潮"
	case ratio < 0.85:
		return "后段收束"
	default:
		return "结尾总结"
	}
}
// buildSummary aggregates per-scene durations into a SceneSummary: average,
// minimum and maximum shot duration, a pacing label derived from the average,
// the three distribution maps passed in by the caller, and the keyframe
// directory (left empty when kfDir is empty). An empty scene list yields the
// zero SceneSummary.
func (s *SceneAnalyzerService) buildSummary(scenes []SceneInfo, shotDist, motionDist, compDist map[string]int, kfDir string) SceneSummary {
	count := len(scenes)
	if count == 0 {
		return SceneSummary{}
	}

	shortest, longest, total := math.MaxFloat64, 0.0, 0.0
	for i := range scenes {
		d := scenes[i].Duration
		total += d
		if d < shortest {
			shortest = d
		}
		if d > longest {
			longest = d
		}
	}
	mean := total / float64(count)

	// Pacing is chosen purely from the average shot length in seconds.
	var pacing string
	switch {
	case mean < 2:
		pacing = "快节奏(快速剪辑)"
	case mean < 4:
		pacing = "适中节奏"
	case mean < 8:
		pacing = "舒缓节奏"
	default:
		pacing = "慢节奏(长镜头为主)"
	}

	return SceneSummary{
		AvgShotDuration: round3(mean),
		MinShotDuration: round3(shortest),
		MaxShotDuration: round3(longest),
		ShotTypeDist:    shotDist,
		MotionDist:      motionDist,
		CompositionDist: compDist,
		Pacing:          pacing,
		KeyframesDir:    kfDir,
	}
}
// getVideoMeta runs ffprobe on videoPath and extracts duration, frame rate,
// width and height from its JSON output.
//
// The ffprobe binary is looked up next to ffmpegPath; when no file exists
// there the bare command name "ffprobe" is used instead. The values are read
// with the text-scanning helpers parseJSONFloat, parseFrameRate and
// parseJSONInt, each of which yields 0 when its key cannot be read.
func (s *SceneAnalyzerService) getVideoMeta(ctx context.Context, ffmpegPath, videoPath string) (duration, frameRate float64, width, height int, err error) {
	probe := filepath.Join(filepath.Dir(ffmpegPath), "ffprobe")
	if _, statErr := os.Stat(probe); os.IsNotExist(statErr) {
		probe = "ffprobe"
	}

	probeArgs := []string{
		"-v", "quiet",
		"-print_format", "json",
		"-show_format",
		"-show_streams",
		videoPath,
	}
	raw, runErr := exec.CommandContext(ctx, probe, probeArgs...).Output()
	if runErr != nil {
		return 0, 0, 0, 0, fmt.Errorf("ffprobe 执行失败: %v", runErr)
	}

	report := string(raw)
	duration = parseJSONFloat(report, `"duration":`)
	frameRate = parseFrameRate(report)
	width = parseJSONInt(report, `"width":`)
	height = parseJSONInt(report, `"height":`)
	return duration, frameRate, width, height, nil
}
// detectScenes runs ffmpeg's scene filter over videoPath and returns the
// pts_time values (in seconds) that ffmpeg prints for the frames whose scene
// score is greater than threshold. Only timestamps greater than 0 are kept.
//
// An error is returned when ctx is done, when the output cannot be scanned,
// or when ffmpeg fails without printing any timestamp. If ffmpeg exits with
// an error after printing timestamps, the ones that were read are still
// returned.
func (s *SceneAnalyzerService) detectScenes(ctx context.Context, ffmpegPath, videoPath string, threshold float64) ([]float64, error) {
	// Precision -1 prints the shortest exact decimal form. A fixed single
	// decimal would turn 0.04 into "0.0" and 0.25 into "0.2", silently
	// changing the detection sensitivity.
	thresholdStr := strconv.FormatFloat(threshold, 'f', -1, 64)
	args := []string{
		"-i", videoPath,
		"-filter:v", fmt.Sprintf("select='gt(scene,%s)',showinfo", thresholdStr),
		"-f", "null",
		"-",
	}
	cmd := exec.CommandContext(ctx, ffmpegPath, args...)
	output, runErr := cmd.CombinedOutput()

	var timestamps []float64
	scanner := bufio.NewScanner(strings.NewReader(string(output)))
	// Allow a single line to be as long as the whole output. With the default
	// 64 KiB limit an over-long line stops the scan and drops every later
	// timestamp (ffmpeg's progress output is carriage-return separated, so
	// such lines are plausible on long inputs — verify against real logs).
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), len(output)+1)
	for scanner.Scan() {
		matches := ptsTimeRegex.FindStringSubmatch(scanner.Text())
		if len(matches) < 2 {
			continue
		}
		if ts, parseErr := strconv.ParseFloat(matches[1], 64); parseErr == nil && ts > 0 {
			timestamps = append(timestamps, ts)
		}
	}

	if ctxErr := ctx.Err(); ctxErr != nil {
		return nil, ctxErr
	}
	if scanErr := scanner.Err(); scanErr != nil {
		return nil, fmt.Errorf("读取 ffmpeg 输出失败: %w", scanErr)
	}
	if runErr != nil && len(timestamps) == 0 {
		return nil, fmt.Errorf("ffmpeg 执行失败: %w", runErr)
	}
	return timestamps, nil
}
// extractKeyframe asks ffmpeg to seek to timeSec (formatted with three
// decimals) in videoPath and write a single frame to outputPath, overwriting
// any existing file. It returns the error from running the command.
func (s *SceneAnalyzerService) extractKeyframe(ctx context.Context, ffmpegPath, videoPath string, timeSec float64, outputPath string) error {
	position := strconv.FormatFloat(timeSec, 'f', 3, 64)
	return exec.CommandContext(ctx, ffmpegPath,
		"-ss", position,
		"-i", videoPath,
		"-vframes", "1",
		"-q:v", "3",
		"-y",
		outputPath,
	).Run()
}
// buildScenes turns scene-change timestamps into consecutive scenes that
// cover the range from 0 to totalDuration.
//
// Timestamps that do not advance past the previous boundary, or that lie
// beyond totalDuration, are skipped. Scenes are numbered 1..n in the order
// they are emitted, so skipped timestamps never leave gaps or produce
// duplicate indices. The result always contains at least one scene: when
// nothing usable remains, a single scene spanning the whole video is
// returned.
func (s *SceneAnalyzerService) buildScenes(sceneChanges []float64, totalDuration float64) []SceneInfo {
	newScene := func(index int, start, end float64) SceneInfo {
		return SceneInfo{
			SceneIndex:   index,
			StartTime:    start,
			EndTime:      end,
			Duration:     end - start,
			StartTimeStr: formatTime(start),
			EndTimeStr:   formatTime(end),
			DurationStr:  formatTime(end - start),
		}
	}

	scenes := make([]SceneInfo, 0, len(sceneChanges)+1)
	startTime := 0.0
	for _, ts := range sceneChanges {
		if ts <= startTime || ts > totalDuration {
			continue
		}
		scenes = append(scenes, newScene(len(scenes)+1, startTime, ts))
		startTime = ts
	}

	// Close the timeline with the tail scene. The second condition covers a
	// non-positive totalDuration, where every timestamp was skipped.
	if startTime < totalDuration || len(scenes) == 0 {
		scenes = append(scenes, newScene(len(scenes)+1, startTime, totalDuration))
	}
	return scenes
}
// ---------- 镜头分类逻辑 ----------
// classifyShotType labels a shot by its duration in seconds. Each band is
// exclusive at its upper bound; durations of 15 seconds or more fall into the
// last label.
func classifyShotType(duration float64) string {
	bands := []struct {
		below float64
		label string
	}{
		{0.8, "极速闪切"},
		{1.5, "快速切换"},
		{2.5, "短镜头"},
		{4, "标准镜头"},
		{8, "中长镜头"},
		{15, "长镜头"},
	}
	for _, band := range bands {
		if duration < band.below {
			return band.label
		}
	}
	return "超长镜头"
}
// classifyMotionLevel infers a motion level label from the scene duration in
// seconds: the shorter the scene, the more dynamic the label. totalDuration
// is accepted for interface compatibility but is not consulted.
func classifyMotionLevel(duration, totalDuration float64) string {
	levels := []struct {
		below float64
		label string
	}{
		{1.0, "高动态(快速切换)"},
		{2.0, "中高动态"},
		{4.0, "中等动态"},
		{8.0, "低动态(平稳)"},
	}
	for _, lv := range levels {
		if duration < lv.below {
			return lv.label
		}
	}
	return "静态/固定机位"
}
// classifyComposition infers a composition label from the scene duration in
// seconds and the frame orientation. A frame counts as vertical only when
// height is strictly greater than width; vertical frames get the "竖屏"
// variants of the labels.
func classifyComposition(duration float64, width, height int) string {
	tiers := []struct {
		below      float64
		horizontal string
		vertical   string
	}{
		{1.2, "特写/细节", "竖屏特写/细节"},
		{2.5, "近景/中近景", "竖屏近景"},
		{5, "中景/半身", "竖屏中景"},
		{10, "全景/环境", "竖屏全景"},
	}

	horizontal, vertical := "远景/广角", "竖屏远景/固定机位"
	for _, tier := range tiers {
		if duration < tier.below {
			horizontal, vertical = tier.horizontal, tier.vertical
			break
		}
	}

	if height > width {
		return vertical
	}
	return horizontal
}
// buildSceneDescription renders a one-line, human-readable description of a
// scene (intended for AI consumption) from its index, time range, duration
// and the four classification labels.
//
// The fields are separated by punctuation. Without separators the scene index
// runs straight into the start timestamp (scene 1 at 00:00:00.000 would read
// "场景100:00:00.000"), which makes the description ambiguous.
func buildSceneDescription(scene SceneInfo) string {
	return fmt.Sprintf(
		"场景%d(%s - %s,时长%s):%s,%s,%s,%s",
		scene.SceneIndex,
		scene.StartTimeStr, scene.EndTimeStr,
		scene.DurationStr,
		scene.ShotType,
		scene.Composition,
		scene.MotionLevel,
		scene.NarrativePos,
	)
}
// ---------- 工具函数 ----------
// round3 rounds v to three decimal places.
func round3(v float64) float64 {
	const scale = 1000
	scaled := v * scale
	return math.Round(scaled) / scale
}
// gcd returns the greatest common divisor of a and b using Euclid's
// algorithm. gcd(a, 0) is a, so gcd(0, 0) is 0.
func gcd(a, b int) int {
	if b == 0 {
		return a
	}
	return gcd(b, a%b)
}
// getFFmpegPath resolves the ffmpeg executable. The "ffmpeg.path"
// configuration value is used when it is non-empty and os.Stat succeeds on
// it; otherwise "ffmpeg" is looked up on PATH. An error is returned when
// neither yields a binary.
func getFFmpegPath() (string, error) {
	configured := g.Cfg().MustGet(context.Background(), "ffmpeg.path", "").String()
	if configured != "" {
		_, statErr := os.Stat(configured)
		if statErr == nil {
			return configured, nil
		}
	}

	located, lookErr := exec.LookPath("ffmpeg")
	if lookErr == nil {
		return located, nil
	}
	return "", fmt.Errorf("未找到 ffmpeg")
}
// formatTime renders a number of seconds as "HH:MM:SS.mmm".
//
// The value is rounded to whole milliseconds first and then split into
// fields, so a fraction such as .9996 carries into the seconds ("00:00:02.000")
// instead of printing a four-digit millisecond part. Negative, NaN and
// infinite inputs are rendered as "00:00:00.000".
func formatTime(seconds float64) string {
	if math.IsNaN(seconds) || math.IsInf(seconds, 0) || seconds < 0 {
		seconds = 0
	}
	totalMs := int64(math.Round(seconds * 1000))
	ms := totalMs % 1000
	totalSec := totalMs / 1000
	h := totalSec / 3600
	m := (totalSec % 3600) / 60
	s := totalSec % 60
	return fmt.Sprintf("%02d:%02d:%02d.%03d", h, m, s, ms)
}
// parseJSONFloat finds the first occurrence of key in text, skips any spaces
// and double quotes that follow it, and parses the run of digits and dots
// found there as a float64. It returns 0 when the key is absent or no such
// run follows it. This is a plain text scan, not a JSON parser.
func parseJSONFloat(text, key string) float64 {
	pos := strings.Index(text, key)
	if pos < 0 {
		return 0
	}
	rest := strings.TrimLeft(text[pos+len(key):], ` "`)

	n := 0
	for n < len(rest) && (rest[n] == '.' || isDigit(rest[n])) {
		n++
	}
	if n == 0 {
		return 0
	}
	// A parse error is deliberately ignored: whatever value ParseFloat
	// returns alongside it is passed through unchanged.
	val, _ := strconv.ParseFloat(rest[:n], 64)
	return val
}
// parseJSONInt finds the first occurrence of key in text, skips any spaces
// and double quotes that follow it, and parses the run of digits found there
// as an int. It returns 0 when the key is absent or no digit follows it.
// This is a plain text scan, not a JSON parser.
func parseJSONInt(text, key string) int {
	pos := strings.Index(text, key)
	if pos < 0 {
		return 0
	}
	rest := strings.TrimLeft(text[pos+len(key):], ` "`)

	n := 0
	for n < len(rest) && isDigit(rest[n]) {
		n++
	}
	if n == 0 {
		return 0
	}
	// A conversion error is deliberately ignored: whatever value Atoi
	// returns alongside it is passed through unchanged.
	val, _ := strconv.Atoi(rest[:n])
	return val
}
// parseFrameRate extracts a frame rate from ffprobe-style JSON text.
//
// It looks at "r_frame_rate" first and "avg_frame_rate" second, and for each
// key walks through every occurrence in text until one holds a positive
// rate. Checking only the first occurrence would report 0 whenever a stream
// without a usable rate (for example one reporting "0/0") precedes the stream
// that has one. Values may be rationals ("30000/1001") or plain numbers
// ("25"). It returns 0 when no positive rate is found.
func parseFrameRate(text string) float64 {
	for _, key := range []string{`"r_frame_rate":`, `"avg_frame_rate":`} {
		rest := text
		for {
			idx := strings.Index(rest, key)
			if idx < 0 {
				break
			}
			// Skip the spaces and the opening quote before the value.
			rest = strings.TrimLeft(rest[idx+len(key):], ` "`)
			end := strings.IndexAny(rest, `",} `)
			if end < 0 {
				end = len(rest)
			}
			if rate := parseRateToken(rest[:end]); rate > 0 {
				return rate
			}
			rest = rest[end:]
		}
	}
	return 0
}

// parseRateToken converts a single rate token, either "num/den" or a plain
// number, to a float64. It returns 0 for malformed tokens and for rationals
// whose denominator is not positive.
func parseRateToken(token string) float64 {
	if slash := strings.Index(token, "/"); slash >= 0 {
		num, numErr := strconv.ParseFloat(token[:slash], 64)
		den, denErr := strconv.ParseFloat(token[slash+1:], 64)
		if numErr != nil || denErr != nil || den <= 0 {
			return 0
		}
		return num / den
	}
	val, err := strconv.ParseFloat(token, 64)
	if err != nil {
		return 0
	}
	return val
}
// isDigit reports whether b is an ASCII decimal digit ('0' through '9').
func isDigit(b byte) bool {
	// Byte subtraction wraps around, so anything below '0' becomes a large
	// value and fails the comparison.
	return b-'0' <= 9
}
// Cleanup removes every given path together with anything beneath it
// (intended for video and keyframe files). Removal is best effort: errors
// from os.RemoveAll are discarded.
func Cleanup(paths []string) {
	for i := 0; i < len(paths); i++ {
		_ = os.RemoveAll(paths[i])
	}
}
// getFFmpegPath returns the ffmpeg executable path by delegating to the
// package-level getFFmpegPath function; the receiver is not used.
func (s *SceneAnalyzerService) getFFmpegPath() (string, error) {
return getFFmpegPath()
}