// clapper/src/services/resolver/useResolver.ts
"use client"
import { create } from "zustand"
import { ClapEntity, ClapOutputType, ClapSegmentCategory, ClapSegmentFilteringMode, ClapSegmentStatus, filterSegments } from "@aitube/clap"
import { RenderingStrategy, TimelineStore, useTimeline, getAudioBuffer, SegmentVisibility, segmentVisibilityPriority, TimelineSegment } from "@aitube/timeline"
import { getVideoPrompt } from "@aitube/engine"
import { ResolverStore, ResolveRequest, ResolveRequestPrompts } from "@aitube/clapper-services"
import { getDefaultResolverState } from "./getDefaultResolverState"
import { useSettings } from "../settings"
import { DEFAULT_WAIT_TIME_IF_NOTHING_TO_DO_IN_MS } from "./constants"
export const useResolver = create<ResolverStore>((set, get) => ({
...getDefaultResolverState(),
startLoop: () => {
const {
isRunning,
runLoop
} = get()
console.log(`useResolver.startLoop() isRunning: ${isRunning}`)
if (isRunning) { return }
set({ isRunning: true })
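// schedule the first iteration asynchronously, so startLoop() returns immediately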
setTimeout(() => {
runLoop()
}, 0)
},
/**
* A loop which reconstructs the queue at each cycle
*
* this has to be dynamic since the user might be moving around
* inside the timeline
* @returns
*/
runLoop: async (): Promise<void> => {
const {
imageRenderingStrategy,
videoRenderingStrategy,
soundRenderingStrategy,
voiceRenderingStrategy,
musicRenderingStrategy,
} = useSettings.getState()
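// small helper to schedule the next iteration of the loop after a delay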
const runLoopAgain = (waitTimeIfNothingToDoInMs = DEFAULT_WAIT_TIME_IF_NOTHING_TO_DO_IN_MS) => {
setTimeout(() => {
get().runLoop()
}, waitTimeIfNothingToDoInMs)
}
// note: do not return early here, even if all strategies are "on demand",
// otherwise we won't be able to track the status of the current tasks
// console.log(`useResolver.runLoop()`)
const timelineState: TimelineStore = useTimeline.getState()
const { visibleSegments, loadedSegments, segments: allSegments, resolveSegment } = timelineState
// ------------------------------------------------------------------------------------------------
//
// - we modify the original objects in place to add the visibility setting
// - there is a priority order: the info that a segment is "visible" (on screen)
//   is more important, which is why it is applied after processing the "loaded" segments (the ones that are buffered because they are near the sliding window)
for (const s of loadedSegments) { (s as TimelineSegment).visibility = SegmentVisibility.BUFFERED }
for (const s of visibleSegments) { (s as TimelineSegment).visibility = SegmentVisibility.VISIBLE }
// sort segments by visibility:
// segments visible on screen are shown first,
// then those nearby, then the hidden ones
const segments: TimelineSegment[] = ([...allSegments] as TimelineSegment[]).sort((segment1, segment2) => {
const priority1 = (segmentVisibilityPriority as any)[segment1.visibility || SegmentVisibility.HIDDEN] || 0
const priority2 = (segmentVisibilityPriority as any)[segment2.visibility || SegmentVisibility.HIDDEN] || 0
return priority2 - priority1
})
//
// -------------------------------------------------------------------------
const { defaultParallelismQuotas, isPaused } = get()
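// each category starts with its full quota; it is decremented below for every in-progress or newly scheduled request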
let currentParallelismQuotaForVideo = defaultParallelismQuotas.video
let currentParallelismQuotaForImage = defaultParallelismQuotas.image
let currentParallelismQuotaForVoice = defaultParallelismQuotas.voice
let currentParallelismQuotaForSound = defaultParallelismQuotas.sound
let currentParallelismQuotaForMusic = defaultParallelismQuotas.music
// console.log(`useResolver.runLoop() parallelismQuotas = `, parallelismQuotas)
// we do not need to read currentParallelismQuotas here,
// as we are going to re-compute it
// (currentParallelismQuotas is only used in the UI
// to display the parallel request counter)
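// segments that will be scheduled for resolution during this cycle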
const segmentsToRender: TimelineSegment[] = []
// the following loop isn't the prettiest, but I think it presents
// the dynamic generation logic in a clear way, so let's keep it for now
for (const s of segments) {
if (s.category === ClapSegmentCategory.VIDEO) {
if (s.status !== ClapSegmentStatus.TO_GENERATE) {
// this is important: we found an in-progress task!
// it is thus vital to deduct it from the parallelism quota,
// to avoid triggering quota limits on the provider's side
if (s.status === ClapSegmentStatus.IN_PROGRESS) {
currentParallelismQuotaForVideo = Math.max(0, currentParallelismQuotaForVideo - 1)
}
continue
}
if (isPaused) { continue }
if (videoRenderingStrategy === RenderingStrategy.ON_DEMAND) {
continue
}
if (
s.visibility === SegmentVisibility.HIDDEN
&&
videoRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_ALL
) {
continue
} else if (
s.visibility === SegmentVisibility.BUFFERED
&&
videoRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_SURROUNDING
) {
continue
}
if (currentParallelismQuotaForVideo > 0) {
currentParallelismQuotaForVideo = Math.max(0, currentParallelismQuotaForVideo - 1)
segmentsToRender.push(s)
}
} else if (s.category === ClapSegmentCategory.STORYBOARD) {
// console.log(`useResolver.runLoop(): found a storyboard segment`)
if (s.status !== ClapSegmentStatus.TO_GENERATE) {
// console.log(`useResolver.runLoop(): found a storyboard segment that is not to_generate`)
// this is important: we found an in-progress task!
// it is thus vital to deduct it from the parallelism quota,
// to avoid triggering quota limits on the provider's side
if (s.status === ClapSegmentStatus.IN_PROGRESS) {
currentParallelismQuotaForImage = Math.max(0, currentParallelismQuotaForImage - 1)
}
continue
}
// console.log(`useResolver.runLoop(): found a storyboard segment that has to be generated`)
if (isPaused) { continue }
if (imageRenderingStrategy === RenderingStrategy.ON_DEMAND) {
continue
}
if (
s.visibility === SegmentVisibility.HIDDEN
&&
imageRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_ALL
) {
continue
} else if (
s.visibility === SegmentVisibility.BUFFERED
&&
imageRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_SURROUNDING
) {
continue
}
// console.log(`useResolver.runLoop(): strategy is good to go`)
if (currentParallelismQuotaForImage > 0) {
// console.log(`useResolver.runLoop(): quota is good to go`)
currentParallelismQuotaForImage = Math.max(0, currentParallelismQuotaForImage - 1)
segmentsToRender.push(s)
}
} else if (s.category === ClapSegmentCategory.DIALOGUE) {
if (s.status !== ClapSegmentStatus.TO_GENERATE) {
// this is important: we found an in-progress task!
// it is thus vital to deduct it from the parallelism quota,
// to avoid triggering quota limits on the provider's side
if (s.status === ClapSegmentStatus.IN_PROGRESS) {
currentParallelismQuotaForVoice = Math.max(0, currentParallelismQuotaForVoice - 1)
}
continue
}
if (isPaused) { continue }
if (voiceRenderingStrategy === RenderingStrategy.ON_DEMAND) {
continue
}
if (
s.visibility === SegmentVisibility.HIDDEN
&&
voiceRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_ALL
) {
continue
} else if (
s.visibility === SegmentVisibility.BUFFERED
&&
voiceRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_SURROUNDING
) {
continue
}
if (currentParallelismQuotaForVoice > 0) {
currentParallelismQuotaForVoice = Math.max(0, currentParallelismQuotaForVoice - 1)
segmentsToRender.push(s)
}
} else if (s.category === ClapSegmentCategory.SOUND) {
if (s.status !== ClapSegmentStatus.TO_GENERATE) {
// this is important: we found an in-progress task!
// it is thus vital to deduct it from the parallelism quota,
// to avoid triggering quota limits on the provider's side
if (s.status === ClapSegmentStatus.IN_PROGRESS) {
currentParallelismQuotaForSound = Math.max(0, currentParallelismQuotaForSound - 1)
}
continue
}
if (isPaused) { continue }
if (soundRenderingStrategy === RenderingStrategy.ON_DEMAND) {
continue
}
if (
s.visibility === SegmentVisibility.HIDDEN
&&
soundRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_ALL
) {
continue
} else if (
s.visibility === SegmentVisibility.BUFFERED
&&
soundRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_SURROUNDING
) {
continue
}
if (currentParallelismQuotaForSound > 0) {
currentParallelismQuotaForSound = Math.max(0, currentParallelismQuotaForSound - 1)
segmentsToRender.push(s)
}
} else if (s.category === ClapSegmentCategory.MUSIC) {
if (s.status !== ClapSegmentStatus.TO_GENERATE) {
// this is important: we found an in-progress task!
// it is thus vital to deduct it from the parallelism quota,
// to avoid triggering quota limits on the provider's side
if (s.status === ClapSegmentStatus.IN_PROGRESS) {
currentParallelismQuotaForMusic = Math.max(0, currentParallelismQuotaForMusic - 1)
}
continue
}
if (isPaused) { continue }
if (musicRenderingStrategy === RenderingStrategy.ON_DEMAND) {
continue
}
if (
s.visibility === SegmentVisibility.HIDDEN
&&
musicRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_ALL
) {
continue
} else if (
s.visibility === SegmentVisibility.BUFFERED
&&
musicRenderingStrategy !== RenderingStrategy.ON_SCREEN_THEN_SURROUNDING
) {
continue
}
if (currentParallelismQuotaForMusic > 0) {
currentParallelismQuotaForMusic = Math.max(0, currentParallelismQuotaForMusic - 1)
segmentsToRender.push(s)
}
} // else continue
}
// we don't want to do something like this:
// await Promise.allSettled(segmentsRenderingPromises)
// because that would limit us in terms of parallelism.
//
// the idea here is that we don't want to wait for all segments
// to finish before starting new ones.
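// the number of requests currently in flight is the default quota minus whatever remains of it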
const nbPendingRequestsForVideo = defaultParallelismQuotas.video - currentParallelismQuotaForVideo
const nbPendingRequestsForImage = defaultParallelismQuotas.image - currentParallelismQuotaForImage
const nbPendingRequestsForVoice = defaultParallelismQuotas.voice - currentParallelismQuotaForVoice
const nbPendingRequestsForSound = defaultParallelismQuotas.sound - currentParallelismQuotaForSound
const nbPendingRequestsForMusic = defaultParallelismQuotas.music - currentParallelismQuotaForMusic
const nbRequestsRunningInParallel =
nbPendingRequestsForVideo
+ nbPendingRequestsForImage
+ nbPendingRequestsForVoice
+ nbPendingRequestsForSound
+ nbPendingRequestsForMusic
const isBusyResolving = nbRequestsRunningInParallel > 0
set({
currentParallelismQuotaForVideo,
currentParallelismQuotaForImage,
currentParallelismQuotaForVoice,
currentParallelismQuotaForSound,
currentParallelismQuotaForMusic,
// just some aliases for convenience
nbPendingRequestsForVideo,
nbPendingRequestsForImage,
nbPendingRequestsForVoice,
nbPendingRequestsForSound,
nbPendingRequestsForMusic,
nbRequestsRunningInParallel,
isBusyResolving
})
// console.log(`useResolver.runLoop(): firing and forgetting ${segmentsToRender.length} new resolveSegment promises`)
// we fire and forget
segmentsToRender.forEach(segment => resolveSegment(segment))
return runLoopAgain()
},
togglePause: (isPaused?: boolean): boolean => {
const { isPaused: previouslyPaused } = get()
if (typeof isPaused === "boolean") {
set({ isPaused })
return isPaused
} else {
set({ isPaused: !previouslyPaused })
return !previouslyPaused
}
},
/**
* This resolves an entity (e.g. a character or a location)
*
* Note: while we return a clap entity, the original will be replaced, too
*
* @param entity
* @returns
*/
resolveEntity: async (entity: ClapEntity): Promise<ClapEntity> => {
// TODO: not implemented yet; for now, return the entity unchanged (stub)
return entity
},
/**
* This resolves a segment.
*
* Resolving means taking input parameters and generating an output (changes in the
* segment settings, typically the assetUrl, but other fields can change as well).
*
* This function returns the original segment, modified in place.
*
* Side-effects are propagated by using useTimeline.trackSilentChangeInSegment()
*
* @param segment
* @returns
*/
resolveSegment: async (segment: TimelineSegment): Promise<TimelineSegment> => {
const settings = useSettings.getState().getSettings()
const timeline: TimelineStore = useTimeline.getState()
// note: do NOT use the visibleSegments here
// that's because resolveSegment is 100% asynchronous,
// meaning it might be called on invisible segments too!
const { clap, segments: allSegments, trackSilentChangeInSegment } = timeline
if (!clap?.meta || !allSegments.length) {
return segment
// throw new Error(`please call setSegmentRender(...) first`)
}
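// collect all the segments that overlap the target segment in time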
const segments: TimelineSegment[] = filterSegments(
ClapSegmentFilteringMode.ANY,
segment,
allSegments
)
if (segment.status === ClapSegmentStatus.IN_PROGRESS) {
// console.log(`useResolver.resolveSegment(): warning: this segment is already being generated!`)
return segment
}
segment.status = ClapSegmentStatus.IN_PROGRESS
const entities = clap.entityIndex || {}
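// collect the ids of characters speaking (dialogue segments) or present (character segments) in this shot;
// the first one found is treated as the main character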
const speakingCharactersIds = segments.map(s =>
s.category === ClapSegmentCategory.DIALOGUE ? s.entityId : null
).filter(id => id) as string[]
const generalCharactersIds = segments.map(s =>
s.category === ClapSegmentCategory.CHARACTER ? s.entityId : null
).filter(id => id) as string[]
const mainCharacterId: string | undefined = speakingCharactersIds.at(0) || generalCharactersIds.at(0) || undefined
const mainCharacterEntity: ClapEntity | undefined = mainCharacterId ? (entities[mainCharacterId] || undefined) : undefined
const storyboard = segments.find(s => s.category === ClapSegmentCategory.STORYBOARD)
const dialogue = segments.find(s => s.category === ClapSegmentCategory.DIALOGUE)
const imagePrompt = getVideoPrompt(
segments,
entities
)
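// wrap the generated prompt with the user-configured prefix and suffix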
const positiveImagePrompt = [
settings.imagePromptPrefix,
imagePrompt,
settings.imagePromptSuffix,
].map(x => x.trim()).filter(x => x).join(", ")
const negativeImagePrompt = [
settings.imageNegativePrompt
].map(x => x.trim()).filter(x => x).join(", ")
// note: not all AI models will support those parameters.
// in 2024, even the "best" proprietary video models like Sora, Veo, Kling, Gen-3, Dream Machine etc..
// don't support voice input for lip syncing, for instance.
const prompts: ResolveRequestPrompts = {
image: {
// the "identification picture" of the character, if available
identity: `${mainCharacterEntity?.imageId || ""}`,
positive: positiveImagePrompt,
negative: negativeImagePrompt
},
video: {
// image to animate
image: `${storyboard?.assetUrl || ""}`,
// dialogue line to lip-sync
voice: `${dialogue?.assetUrl || ""}`,
},
voice: {
identity: `${mainCharacterEntity?.audioId || ""}`,
positive: "",
negative: ""
}
}
const serializableSegment = { ...segment }
// we delete things that cannot be serialized properly
delete serializableSegment.scene;
delete serializableSegment.audioBuffer;
serializableSegment.textures = {};
const request: ResolveRequest = {
settings,
segment: serializableSegment,
segments,
entities,
speakingCharactersIds,
generalCharactersIds,
mainCharacterId,
mainCharacterEntity,
meta: clap.meta,
prompts,
}
try {
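// the actual generation is delegated to the server-side /api/resolve route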
const res = await fetch("/api/resolve", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify(request)
})
// console.log(`useResolver.resolveSegment(): result from /api/resolve:`, res)
// note: this isn't really a "full" TimelineSegment,
// it will miss some data that cannot be serialized
const newSegmentData = (await res.json()) as TimelineSegment
// console.log(`useResolver.resolveSegment(): newSegmentData`, newSegmentData)
// note: this modifies the old object in-place
// it is super important as it helps preserve the reference
const newSegment = Object.assign(
segment,
newSegmentData,
// this step is super-important when rendering multiple segments at once:
// the position of the segment might have changed while it was being generated,
// so we need to preserve it
{
startTimeInMs: segment.startTimeInMs,
endTimeInMs: segment.endTimeInMs,
}
) as TimelineSegment
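// for audio outputs, decode the asset into an AudioBuffer right away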
if (newSegment.outputType === ClapOutputType.AUDIO) {
try {
newSegment.audioBuffer = await getAudioBuffer(newSegment.assetUrl)
} catch (err) {
console.error(`failed to load the audio file: ${err}`)
}
}
// after a segment has been resolved, it is possible that the size
// of its asset changed (e.g. a dialogue line longer than the segment's length)
//
// there are multiple ways to solve this, one approach could be to
// just add some more B-roll (more shots)
//
// or we can also extend it, which is the current simple solution
//
// for the other categories, such as MUSIC or SOUND,
// we assume it is okay if they are too short or too long,
// and that we can crop them etc
//
// note that video clips are also affected: we want them to fit perfectly
if (newSegment.category === ClapSegmentCategory.DIALOGUE) {
// by default fitSegmentToAssetDuration() will fit the segment to the asset duration without any gap,
// which can sound abrupt, so let's add a little delay
// (this assumes our dialogue lines have been properly cut)
await timeline.fitSegmentToAssetDuration(
newSegment,
typeof newSegment.assetDurationInMs === "number"
// this delay is arbitrary, could be another value (200, 500, 1200..)
? newSegment.assetDurationInMs + 700
: 2000
)
} else if (newSegment.category === ClapSegmentCategory.SOUND) {
await timeline.fitSegmentToAssetDuration(
newSegment,
typeof newSegment.assetDurationInMs === "number"
// no extra padding here; fall back to 2000 ms if the asset duration is unknown
? newSegment.assetDurationInMs
: 2000
)
} else if (newSegment.category === ClapSegmentCategory.MUSIC) {
await timeline.fitSegmentToAssetDuration(
newSegment,
typeof newSegment.assetDurationInMs === "number"
// no extra padding here; fall back to 2000 ms if the asset duration is unknown
? newSegment.assetDurationInMs
: 2000
)
} else if (newSegment.category === ClapSegmentCategory.VIDEO) {
await timeline.fitSegmentToAssetDuration(newSegment)
}
newSegment.status = ClapSegmentStatus.COMPLETED
trackSilentChangeInSegment(newSegment.id)
return newSegment
} catch (err) {
console.error(`useResolver.resolveSegment(): error: ${err}`)
segment.status = ClapSegmentStatus.TO_GENERATE
// we could do that in a future version to improve error tracking
// segment.status = ClapSegmentStatus.ERROR
}
return segment
}
}))
if (typeof window !== "undefined") {
(window as any).useResolver = useResolver
}