Commit 7249a2e
Parent(s): 6caeb80
Committed by jbilcke-hf

add support for whisper-turbo (base)
package-lock.json CHANGED
The diff for this file is too large to render.
 
package.json CHANGED
@@ -24,6 +24,7 @@
     "@radix-ui/react-separator": "^1.0.3",
     "@radix-ui/react-slot": "^1.0.2",
     "@radix-ui/react-switch": "^1.0.3",
+    "@radix-ui/react-toast": "^1.1.4",
     "@radix-ui/react-tooltip": "^1.0.6",
     "@react-pdf/renderer": "^3.1.12",
     "@types/node": "20.4.2",
@@ -59,7 +60,9 @@
     "tts-react": "^3.0.1",
     "typescript": "5.1.6",
     "usehooks-ts": "^2.9.1",
-    "uuid": "^9.0.0"
+    "uuid": "^9.0.0",
+    "webm-to-wav-converter": "^1.1.0",
+    "whisper-turbo": "^0.7.0"
   },
   "devDependencies": {
     "@types/sbd": "^1.0.3"
src/app/engine/listen.ts ADDED
@@ -0,0 +1,46 @@
+"use server"
+
+import { SoundAnalysisRequest, SoundAnalysisResponse } from "@/types"
+
+const apiUrl = `${process.env.RENDERING_ENGINE_API || ""}`
+
+export async function listen(sound: string): Promise<string> {
+  if (!sound?.length) {
+    console.log(`cannot call the API without a sound, aborting..`)
+    // throw new Error(`cannot call the API without a sound, aborting..`)
+    return ""
+  }
+
+  try {
+    const request = {
+      sound
+    } as SoundAnalysisRequest
+
+    console.log(`calling ${apiUrl}/listen with: `, {
+      sound: request.sound.slice(0, 20)
+    })
+
+    const res = await fetch(`${apiUrl}/listen`, {
+      method: "POST",
+      headers: {
+        Accept: "application/json",
+        "Content-Type": "application/json",
+        // Authorization: `Bearer ${process.env.VC_SECRET_ACCESS_TOKEN}`,
+      },
+      body: JSON.stringify(request),
+      cache: 'no-store',
+      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
+      // next: { revalidate: 1 }
+    })
+
+    if (res.status !== 200) {
+      throw new Error('Failed to fetch data')
+    }
+
+    const response = (await res.json()) as SoundAnalysisResponse
+    return response.result
+  } catch (err) {
+    console.error(err)
+    return ""
+  }
+}
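
Note on listen.ts: the server action POSTs the base64-encoded sound to the rendering engine and resolves to "" on any failure, so callers never need a try/catch. A minimal sketch of a hypothetical client-side consumer, assuming the sound is a base64 data URI such as the one produced by blobToBase64Uri (the useSoundAnalysis name is illustrative, not part of this commit):

"use client"

import { useTransition } from "react"
import { listen } from "@/app/engine/listen"

// hypothetical hook: run the sound analysis inside a React transition
export function useSoundAnalysis(onResult: (text: string) => void) {
  const [_isPending, startTransition] = useTransition()

  return (sound: string) =>
    startTransition(async () => {
      const result = await listen(sound) // resolves to "" on failure
      if (result) {
        onResult(result)
      }
    })
}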
src/app/engine/think.ts CHANGED
@@ -5,35 +5,38 @@ import { createLlamaPrompt } from "@/lib/createLlamaPrompt"
 
 import { predict } from "./predict"
 
-export const think = async ({
-  event = "",
-  observation = "",
-  history = "",
-}: {
-  event: string;
-  observation: string;
-  history: string;
-}): Promise<string> => {
+const internalHistory: {
+  role: string;
+  content: string;
+}[] = []
+
+export const think = async (event: string): Promise<string> => {
   if (!event) {
     throw new Error("missing event")
   }
+
+  internalHistory.push({
+    role: "user",
+    content: event,
+  })
+
+  if (internalHistory.length > 10) {
+    internalHistory.shift()
+  }
+
   const prompt = createLlamaPrompt([
     {
       role: "system",
       content: [
-        `You are a companion robot, very friendly, curious about the world.`,
-
-        // TODO: put the history here (from most recent to oldest)
-        `You have been presented some situation in the past, but you lost your memory.`,
-
         `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}.`,
-        , `You are currently observing this: ${observation}`,
+        `You are an android robot, very friendly, curious about the world.`,
+        `Your life goal is to help humans and interact with them in a natural way.`,
+        `You are going to see and hear various things, and you need to act in a very natural way.`,
+        `If you see someone through your eyes, you need to interact with them,`,
+        `you should be outgoing and open, ask questions, be curious, do jokes etc.`,
       ].filter(item => item).join("\n")
     },
-    {
-      role: "user",
-      content: event,
-    }
+    ...internalHistory,
   ])
 
 
@@ -56,5 +59,10 @@ export const think = async ({
   // llama-2 is too chatty, let's keep 3 sentences at most
   const sentences = sbd.sentences(result).slice(0, 3).join(" ").trim()
 
+  internalHistory.push({
+    role: "assistant",
+    content: sentences,
+  })
+
   return sentences
 }
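
Note on think.ts: the commit replaces the stateless { event, observation, history } signature with a module-level internalHistory buffer that keeps at most the 10 most recent turns (push, then shift once the length exceeds 10); since this module runs on the server, that buffer is shared by every request hitting the same instance. The same sliding-window pattern in isolation, as a sketch (hypothetical helper, not part of the commit):

type Turn = { role: string; content: string }

const history: Turn[] = []

// remember a turn, dropping the oldest once the window is full
function remember(turn: Turn, maxTurns = 10) {
  history.push(turn)
  while (history.length > maxTurns) {
    history.shift()
  }
}

remember({ role: "user", content: "hello" })
remember({ role: "assistant", content: "hi there!" })
// however many turns are added, history.length never exceeds maxTurns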
src/app/listen.tsx CHANGED
@@ -2,52 +2,297 @@
 
 import { useCallback, useEffect, useRef, useState, useTransition } from "react"
 import { useInterval } from "usehooks-ts"
+
+// TODO: try this? https://www.npmjs.com/package/react-audio-voice-recorder
 import { useRecorder } from "react-microphone-recorder"
+import { getWaveBlob } from "webm-to-wav-converter"
+import {
+  AvailableModels,
+  InferenceSession,
+  MicRecorder,
+  SessionManager,
+} from "whisper-turbo"
+
+import { useToast } from "@/components/ui/use-toast"
+// import { listen } from "@/app/engine/listen"
+import { blobToBase64Uri } from "@/lib/blobToBase64Uri"
 
 // import { listen } from "./engine/listen"
 
+export interface TSSegment {
+  text: string;
+  start: number;
+  stop: number;
+  last: boolean;
+}
+
+export interface TSTranscript {
+  segments: Array<TSSegment>;
+}
+
 export function Listen({
   onListen,
 }: {
   onListen: (recording: string) => void
 }) {
-  const [_isPending, startTransition] = useTransition()
-
-  const {
-    audioLevel,
-    startRecording,
-    pauseRecording,
-    stopRecording,
-    resetRecording,
-    audioURL,
-    recordingState,
-    isRecording,
-    audioFile
-  } = useRecorder()
-
-  const status = audioLevel > 18 ? "I hear something!" : "background noise"
+  const { toast } = useToast()
+
+  const [transcribing, setTranscribing] = useState(false)
+  const transcribingRef = useRef(transcribing)
+  useEffect(() => { transcribingRef.current = transcribing }, [transcribing])
+
+  // used to detect changes, signal when we can analyze the audio
+  const [audioDataFrame, setAudioDataFrame] = useState(0)
+  const audioDataFrameRef = useRef(audioDataFrame)
+  useEffect(() => { audioDataFrameRef.current = audioDataFrame }, [audioDataFrame])
+
+  const [transcriptBuffer, setTranscriptBuffer] = useState("")
+  useEffect(() => {
+    onListen(transcriptBuffer)
+  }, [transcriptBuffer])
+  /*
+  Available models: {
+    WHISPER_TINY: 'whisper-tiny',
+    WHISPER_BASE: 'whisper-base',
+    WHISPER_SMALL: 'whisper-small',
+    WHISPER_MEDIUM: 'whisper-medium',
+    WHISPER_LARGE: 'whisper-large'
+  }
+  */
+  const whisperModel: AvailableModels = AvailableModels.WHISPER_BASE
+
+  const listenerRef = useRef({
+    isListening: false,
+    startedListeningAt: 0,
+    stoppedListeningAt: 0,
+    durationInMs: 0,
+    hits: 0,
+    debugCanContinue: true, // used for debugging
+  })
+
+  // the background listener is not a CIA spy device, but a detector of changes in the
+  // background noise volume level. The goal is to detect whenever an interesting event is happening
+  const backgroundListener = useRecorder()
+
+  // the foreground listener is the actual sound sampler
+  // with our system, it will always lag a bit behind the background listener
+  // however there might be a fix (which I haven't tried yet):
+  // to take the last second of the background listener sample,
+  // and glue it to the beginning of the foreground listener sample
+  //
+  // or, alternatively, we could just try to use a shorter time window for the background listener,
+  // to make it more reactive
+  const foregroundListener = useRecorder()
+
+  // to detect voice, we use a combination of audio level and frequency sampling
+  const heardSomething = backgroundListener.audioLevel > 12 // 18
+
+  const status = heardSomething ? "I hear something!" : "background noise"
+
+  const session = useRef<InferenceSession | null>(null)
+  const [audioData, setAudioData] = useState<Uint8Array | null>(null)
+  const [audioMetadata, setAudioMetadata] = useState<File | null>(null)
+  const [loaded, setLoaded] = useState<boolean>(false)
+  const [progress, setProgress] = useState<number>(0)
+
+  const isLoadingModel = progress > 0
+  const hasLoadedModel = progress === 100
+
+  const loadModel = async () => {
+    console.log("loadModel")
+    if (session.current) {
+      session.current.destroy()
+    }
+    if (!whisperModel) {
+      console.error("No whisper model loaded")
+      return
+    }
+
+    try {
+      const manager = new SessionManager()
+      const loadResult = await manager.loadModel(
+        whisperModel,
+        () => {
+          setLoaded(true)
+        },
+        (p: number) => {
+          console.log("progress:", p)
+          setProgress(p)
+        }
+      )
+      if (loadResult.isErr) {
+        throw new Error(loadResult.error.message)
+      } else {
+        session.current = loadResult.value
+      }
+    } catch (err) {
+      const error = `failed to load the model: ${err}`
+      console.error(error)
+      toast({
+        title: "Error",
+        description: error,
+        variant: "destructive"
+      })
+    }
+  }
 
+  const runSession = async () => {
+    if (!loaded) {
+      console.log("runSession: aborting (model not loaded yet)")
+      return
+    }
+    if (!session.current) {
+      console.log("runSession: aborting (no model loaded)")
+      toast({
+        title: "Error",
+        description: "No model loaded",
+        variant: "destructive"
+      })
+      return
+    }
+    // console.log("debug:", { audioData, audioDataFrame })
+    if (!audioData) {
+      console.log("runSession: aborting (no audio file loaded)")
+      toast({
+        title: "Error",
+        description: "No audio file loaded",
+        variant: "destructive"
+      })
+      return
+    }
+
+    setTranscribing(transcribingRef.current = true)
+
+    try {
+      await session.current.transcribe(audioData, (s: any) => {
+        const segment = s as { text: string, start: number, stop: number, last: boolean }
+        const text = segment.text.trim()
+        console.log("text:", text)
+        if (text) {
+          setTranscriptBuffer(text)
+        }
+
+        if (s.last) {
+          console.log("IS LAST")
+          setTranscribing(transcribingRef.current = false)
+          return
+        }
+      })
+    } catch (err) {
+      const error = `transcription crashed: ${err}`
+      console.error(error)
+      toast({
+        title: "Error",
+        description: error,
+        variant: "destructive"
+      })
+    }
+  }
+
+  // let's disable the background recorder for now
   useInterval(() => {
-    console.log("let's stop, and start again")
-    stopRecording()
-    startRecording()
+    // console.log("let's stop, and start again")
+    backgroundListener.stopRecording()
+    backgroundListener.startRecording()
   }, 3000)
 
   useEffect(() => {
-    console.log("starting recording..")
-    startRecording()
+    const fn = async () => {
+      console.log("load model..")
+      await loadModel()
+
+      console.log("starting to listen to background noise to detect volume peaks..")
+      backgroundListener.startRecording()
+    }
 
-    startTransition(async () => {
-      // await listen()
-    })
+    fn()
   }, [])
 
-  return null
-  /*
+
+  useEffect(() => {
+    if (!audioData) {
+      console.log("no audio")
+    }
+    // console.log("audioDataFrame changed, need to process audioData!")
+    runSession()
+  }, [audioDataFrame])
+
+  useEffect(() => {
+    if (heardSomething) {
+      if (!listenerRef.current.isListening) {
+        console.log("recording..")
+        foregroundListener.startRecording()
+        listenerRef.current.hits = 0
+        listenerRef.current.isListening = true
+
+        // TODO: use a debouncer to detect when we started speaking
+        setTimeout(async () => {
+          foregroundListener.stopRecording()
+          listenerRef.current.isListening = false
+          listenerRef.current.stoppedListeningAt = Date.now()
+          listenerRef.current.durationInMs =
+            listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt
+
+          const hits = listenerRef.current.hits
+
+          console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)
+
+          if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
+            return
+          }
+
+          if (hits > 11) {
+            // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
+            // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily
+
+            console.log("got an interesting sample, sending for review")
+
+            // temporary, to prevent infinite loop
+            if (listenerRef.current.debugCanContinue) {
+              // to prevent the infinite loop, set this value to false
+              // listenerRef.current.debugCanContinue = false
+
+              try {
+                const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
+                const arrayBuffer = await blob.arrayBuffer()
+                const uint8Array = new Uint8Array(arrayBuffer)
+
+                setAudioData(uint8Array)
+                setAudioDataFrame(audioDataFrameRef.current + 1)
+              } catch (err) {
+                const error = `failed to convert the audio sample: ${err}`
+                console.error(error)
+                toast({
+                  title: "Error",
+                  description: error,
+                  variant: "destructive"
+                })
+              }
+            } else {
+              console.log("Julian: infinite loop temporarily disabled :D")
+            }
+          }
+        }, 3000)
+      } else {
+        // TODO: increase hits?
+        // listenerRef.current.hits = listenerRef.current.hits + 1
+      }
+    }
+  }, [heardSomething])
+
+  if (heardSomething && listenerRef.current.isListening) {
+    listenerRef.current.hits = listenerRef.current.hits + 1
+  }
+
   return (
-    <div className="fixed top-64 left-16 z-10 bg-gray-100 p-4">
-      <div>{status}</div>
+    <div className="fixed top-80 left-16 z-10 bg-gray-100 p-4">
+      {isLoadingModel && !hasLoadedModel
+        ? <p>Loading: {progress}%</p>
+        : <p>{
+          transcriptBuffer
+          || ""
+        }</p>
+      }
     </div>
   )
-  */
 }
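
Note on listen.tsx: the component chains three steps: a background useRecorder whose audioLevel acts as a crude voice-activity detector, a foreground recorder that captures a ~3 s sample once the level crosses the threshold, and a whisper-turbo session that transcribes the 16-bit wav produced by getWaveBlob. The same pipeline outside React, as a sketch that reuses only the whisper-turbo and webm-to-wav-converter calls already present in this diff (transcribeWebm is an illustrative name, and the segment concatenation is an assumption about how callers might accumulate output):

import { AvailableModels, SessionManager } from "whisper-turbo"
import { getWaveBlob } from "webm-to-wav-converter"

async function transcribeWebm(webm: Blob): Promise<string> {
  const manager = new SessionManager()
  const loadResult = await manager.loadModel(
    AvailableModels.WHISPER_BASE,
    () => console.log("model loaded"),
    (p: number) => console.log(`loading: ${p}%`)
  )
  if (loadResult.isErr) {
    throw new Error(loadResult.error.message)
  }
  const session = loadResult.value

  // webm -> 16-bit wav -> raw bytes, as in the component
  const wav = await getWaveBlob(webm, false)
  const audio = new Uint8Array(await wav.arrayBuffer())

  let transcript = ""
  await session.transcribe(audio, (segment: any) => {
    // the callback fires once per decoded segment
    transcript = `${transcript} ${segment.text}`.trim()
  })
  return transcript
}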
src/app/main.tsx CHANGED
@@ -10,6 +10,7 @@ import { think } from "./engine/think"
 import { Progress } from "./interface/progress"
 import { Listen } from "./listen"
 import { Speak } from "./speak"
+import { Toaster } from "@/components/ui/toaster"
 
 export default function Main() {
   const [_isPending, startTransition] = useTransition()
@@ -17,38 +18,25 @@ export default function Main() {
   const [lastRawObservation, setLastRawObservation] = useState<string>("")
   const [isLoadingAction, setLoadingAction] = useState(false)
 
-  const [observations, setObservations] = useState<string[]>([])
   const [action, setAction] = useState<string>("Nothing to say yet.")
 
-  // receive a new observation from what the agent is looking at
-  const handleOnObserve = (observation: string, image: string) => {
-    setLastRawObservation(observation)
-    setLastImage(image)
-
-    // last comes first
-    setObservations([
-      `On ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}, you saw: \"${observation}\".`
-    ].concat(observations))
-
-    // TODO: use llama-2 to summarize previous observations
-    const history = observations.slice(0, 3).join("\n")
-
-
+  const handleOnEvent = (event: string) => {
+    setLoadingAction(true)
     startTransition(async () => {
-      setLoadingAction(true)
-      const action = await think({
-        history,
-        observation,
-        event: "Please react in a natural way to the current situation, by interacting with the person or entity you are seeing.",
-      })
-
+      const action = await think(event)
       setAction(action)
      setLoadingAction(false)
    })
  }
+  // receive a new observation from what the agent is looking at
+  const handleOnObserve = (observation: string, image: string) => {
+    setLastRawObservation(observation)
+    setLastImage(image)
+    handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are seeing this: ${observation}`)
+  }
 
   const handleOnListen = (recording: string) => {
-    console.log("on listen")
+    handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are hearing this: ${recording}`)
   }
 
   return (
@@ -93,8 +81,9 @@ export default function Main() {
       </div>
 
       <Observe onObserve={handleOnObserve} />
-      {/*<Listen onListen={handleOnListen} />*/}
+      <Listen onListen={handleOnListen} />
       <Speak>{action}</Speak>
+      <Toaster />
 
       <Progress
         isLoading={isLoadingAction}
@@ -104,11 +93,12 @@ export default function Main() {
 
       <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
         <div className="full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
-          <p>🅿️ <span className="font-semibold">Informations: </span> This demo uses
-            <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold"> IDEFICS </a>
-            and
-            <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold"> Llama-2 </a>, and is provided for demonstration and research purposes.</p>
-          <p>⛔️ <span className="font-semibold">Limitations: </span> This demo is provided as-is, with no guarantee of factually correct results. In some cases, the models may return hallucinated or innapropriate responses.</p>
+          <p>🅿️ <span className="font-semibold">
+            </span>This multimodal demo allows
+            <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> to hear, see and talk.
+            You need to upgrade to a <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">browser with support for WebGPU</a> for speech recognition to work.
+            Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS </a></p>
+          <p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purposes only. As it demonstrates WebGPU technology, this demo will not support incompatible browsers and/or devices. No guarantee of factually correct results. In some cases, the models may return hallucinated or inappropriate responses.</p>
         </div>
       </div>
     </div>
src/app/observe.tsx CHANGED
@@ -66,7 +66,7 @@ export function Observe({
 
     setBusy(true)
 
-    console.log("Capturing new frame from webcam..")
+    // console.log("Capturing new frame from webcam..")
 
     startTransition(async () => {
       const imageBase64 = capture()
@@ -80,10 +80,10 @@
       }
       const prompt = `What do you see here?`
 
-      console.log("Calling IDEFICS..")
-      const newObservation = await see({ prompt, imageBase64 })
+      // console.log("Calling IDEFICS..")
+      const newObservation = "fake" // await see({ prompt, imageBase64 })
 
-      console.log("New observation: ", newObservation)
+      // console.log("New observation: ", newObservation)
       if (newObservation !== lastObservation) {
         // console.log("update!")
         setLastObservation(newObservation || "")
src/app/speak.tsx CHANGED
@@ -46,11 +46,14 @@ export function Speak({
     if (newMessage === playedMessage) { return }
     const synth = window.speechSynthesis
 
-    console.log(`Speaking "${newMessage}"`)
+    // console.log(`Speaking "${newMessage}"`)
     setPlayedMessage(newMessage)
     const utterance = new SpeechSynthesisUtterance(newMessage)
     utterance.voice = voice
-    synth.speak(utterance)
+
+    console.log("julian: voice disabled :D")
+    // synth.speak(utterance)
+
   }, [voice?.name, newMessage, playedMessage])
 
   return (
src/components/ui/dialog.tsx CHANGED
@@ -11,10 +11,9 @@ const Dialog = DialogPrimitive.Root
 const DialogTrigger = DialogPrimitive.Trigger
 
 const DialogPortal = ({
-  className,
   ...props
 }: DialogPrimitive.DialogPortalProps) => (
-  <DialogPrimitive.Portal className={cn(className)} {...props} />
+  <DialogPrimitive.Portal {...props} />
 )
 DialogPortal.displayName = DialogPrimitive.Portal.displayName
 
src/components/ui/toast.tsx ADDED
@@ -0,0 +1,127 @@
+import * as React from "react"
+import * as ToastPrimitives from "@radix-ui/react-toast"
+import { cva, type VariantProps } from "class-variance-authority"
+import { X } from "lucide-react"
+
+import { cn } from "@/lib/utils"
+
+const ToastProvider = ToastPrimitives.Provider
+
+const ToastViewport = React.forwardRef<
+  React.ElementRef<typeof ToastPrimitives.Viewport>,
+  React.ComponentPropsWithoutRef<typeof ToastPrimitives.Viewport>
+>(({ className, ...props }, ref) => (
+  <ToastPrimitives.Viewport
+    ref={ref}
+    className={cn(
+      "fixed top-0 z-[100] flex max-h-screen w-full flex-col-reverse p-4 sm:bottom-0 sm:right-0 sm:top-auto sm:flex-col md:max-w-[420px]",
+      className
+    )}
+    {...props}
+  />
+))
+ToastViewport.displayName = ToastPrimitives.Viewport.displayName
+
+const toastVariants = cva(
+  "group pointer-events-auto relative flex w-full items-center justify-between space-x-4 overflow-hidden rounded-md border border-stone-200 p-6 pr-8 shadow-lg transition-all data-[swipe=cancel]:translate-x-0 data-[swipe=end]:translate-x-[var(--radix-toast-swipe-end-x)] data-[swipe=move]:translate-x-[var(--radix-toast-swipe-move-x)] data-[swipe=move]:transition-none data-[state=open]:animate-in data-[state=closed]:animate-out data-[swipe=end]:animate-out data-[state=closed]:fade-out-80 data-[state=closed]:slide-out-to-right-full data-[state=open]:slide-in-from-top-full data-[state=open]:sm:slide-in-from-bottom-full dark:border-stone-800",
+  {
+    variants: {
+      variant: {
+        default: "border bg-white text-stone-950 dark:bg-stone-950 dark:text-stone-50",
+        destructive:
+          "destructive group border-red-500 bg-red-500 text-stone-50 dark:border-red-900 dark:bg-red-900 dark:text-stone-50",
+      },
+    },
+    defaultVariants: {
+      variant: "default",
+    },
+  }
+)
+
+const Toast = React.forwardRef<
+  React.ElementRef<typeof ToastPrimitives.Root>,
+  React.ComponentPropsWithoutRef<typeof ToastPrimitives.Root> &
+    VariantProps<typeof toastVariants>
+>(({ className, variant, ...props }, ref) => {
+  return (
+    <ToastPrimitives.Root
+      ref={ref}
+      className={cn(toastVariants({ variant }), className)}
+      {...props}
+    />
+  )
+})
+Toast.displayName = ToastPrimitives.Root.displayName
+
+const ToastAction = React.forwardRef<
+  React.ElementRef<typeof ToastPrimitives.Action>,
+  React.ComponentPropsWithoutRef<typeof ToastPrimitives.Action>
+>(({ className, ...props }, ref) => (
+  <ToastPrimitives.Action
+    ref={ref}
+    className={cn(
+      "inline-flex h-8 shrink-0 items-center justify-center rounded-md border border-stone-200 bg-transparent px-3 text-sm font-medium ring-offset-white transition-colors hover:bg-stone-100 focus:outline-none focus:ring-2 focus:ring-stone-950 focus:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 group-[.destructive]:border-stone-100/40 group-[.destructive]:hover:border-red-500/30 group-[.destructive]:hover:bg-red-500 group-[.destructive]:hover:text-stone-50 group-[.destructive]:focus:ring-red-500 dark:border-stone-800 dark:ring-offset-stone-950 dark:hover:bg-stone-800 dark:focus:ring-stone-300 dark:group-[.destructive]:border-stone-800/40 dark:group-[.destructive]:hover:border-red-900/30 dark:group-[.destructive]:hover:bg-red-900 dark:group-[.destructive]:hover:text-stone-50 dark:group-[.destructive]:focus:ring-red-900",
+      className
+    )}
+    {...props}
+  />
+))
+ToastAction.displayName = ToastPrimitives.Action.displayName
+
+const ToastClose = React.forwardRef<
+  React.ElementRef<typeof ToastPrimitives.Close>,
+  React.ComponentPropsWithoutRef<typeof ToastPrimitives.Close>
+>(({ className, ...props }, ref) => (
+  <ToastPrimitives.Close
+    ref={ref}
+    className={cn(
+      "absolute right-2 top-2 rounded-md p-1 text-stone-950/50 opacity-0 transition-opacity hover:text-stone-950 focus:opacity-100 focus:outline-none focus:ring-2 group-hover:opacity-100 group-[.destructive]:text-red-300 group-[.destructive]:hover:text-red-50 group-[.destructive]:focus:ring-red-400 group-[.destructive]:focus:ring-offset-red-600 dark:text-stone-50/50 dark:hover:text-stone-50",
      className
    )}
+    toast-close=""
+    {...props}
+  >
+    <X className="h-4 w-4" />
+  </ToastPrimitives.Close>
+))
+ToastClose.displayName = ToastPrimitives.Close.displayName
+
+const ToastTitle = React.forwardRef<
+  React.ElementRef<typeof ToastPrimitives.Title>,
+  React.ComponentPropsWithoutRef<typeof ToastPrimitives.Title>
+>(({ className, ...props }, ref) => (
+  <ToastPrimitives.Title
+    ref={ref}
+    className={cn("text-sm font-semibold", className)}
+    {...props}
+  />
+))
+ToastTitle.displayName = ToastPrimitives.Title.displayName
+
+const ToastDescription = React.forwardRef<
+  React.ElementRef<typeof ToastPrimitives.Description>,
+  React.ComponentPropsWithoutRef<typeof ToastPrimitives.Description>
+>(({ className, ...props }, ref) => (
+  <ToastPrimitives.Description
+    ref={ref}
+    className={cn("text-sm opacity-90", className)}
+    {...props}
+  />
+))
+ToastDescription.displayName = ToastPrimitives.Description.displayName
+
+type ToastProps = React.ComponentPropsWithoutRef<typeof Toast>
+
+type ToastActionElement = React.ReactElement<typeof ToastAction>
+
+export {
+  type ToastProps,
+  type ToastActionElement,
+  ToastProvider,
+  ToastViewport,
+  Toast,
+  ToastTitle,
+  ToastDescription,
+  ToastClose,
+  ToastAction,
+}
src/components/ui/toaster.tsx ADDED
@@ -0,0 +1,35 @@
+"use client"
+
+import {
+  Toast,
+  ToastClose,
+  ToastDescription,
+  ToastProvider,
+  ToastTitle,
+  ToastViewport,
+} from "@/components/ui/toast"
+import { useToast } from "@/components/ui/use-toast"
+
+export function Toaster() {
+  const { toasts } = useToast()
+
+  return (
+    <ToastProvider>
+      {toasts.map(function ({ id, title, description, action, ...props }) {
+        return (
+          <Toast key={id} {...props}>
+            <div className="grid gap-1">
+              {title && <ToastTitle>{title}</ToastTitle>}
+              {description && (
+                <ToastDescription>{description}</ToastDescription>
+              )}
+            </div>
+            {action}
+            <ToastClose />
+          </Toast>
+        )
+      })}
+      <ToastViewport />
+    </ToastProvider>
+  )
+}
src/components/ui/use-toast.ts ADDED
@@ -0,0 +1,192 @@
+// Inspired by react-hot-toast library
+import * as React from "react"
+
+import type {
+  ToastActionElement,
+  ToastProps,
+} from "@/components/ui/toast"
+
+const TOAST_LIMIT = 1
+const TOAST_REMOVE_DELAY = 1000000
+
+type ToasterToast = ToastProps & {
+  id: string
+  title?: React.ReactNode
+  description?: React.ReactNode
+  action?: ToastActionElement
+}
+
+const actionTypes = {
+  ADD_TOAST: "ADD_TOAST",
+  UPDATE_TOAST: "UPDATE_TOAST",
+  DISMISS_TOAST: "DISMISS_TOAST",
+  REMOVE_TOAST: "REMOVE_TOAST",
+} as const
+
+let count = 0
+
+function genId() {
+  count = (count + 1) % Number.MAX_VALUE
+  return count.toString()
+}
+
+type ActionType = typeof actionTypes
+
+type Action =
+  | {
+      type: ActionType["ADD_TOAST"]
+      toast: ToasterToast
+    }
+  | {
+      type: ActionType["UPDATE_TOAST"]
+      toast: Partial<ToasterToast>
+    }
+  | {
+      type: ActionType["DISMISS_TOAST"]
+      toastId?: ToasterToast["id"]
+    }
+  | {
+      type: ActionType["REMOVE_TOAST"]
+      toastId?: ToasterToast["id"]
+    }
+
+interface State {
+  toasts: ToasterToast[]
+}
+
+const toastTimeouts = new Map<string, ReturnType<typeof setTimeout>>()
+
+const addToRemoveQueue = (toastId: string) => {
+  if (toastTimeouts.has(toastId)) {
+    return
+  }
+
+  const timeout = setTimeout(() => {
+    toastTimeouts.delete(toastId)
+    dispatch({
+      type: "REMOVE_TOAST",
+      toastId: toastId,
+    })
+  }, TOAST_REMOVE_DELAY)
+
+  toastTimeouts.set(toastId, timeout)
+}
+
+export const reducer = (state: State, action: Action): State => {
+  switch (action.type) {
+    case "ADD_TOAST":
+      return {
+        ...state,
+        toasts: [action.toast, ...state.toasts].slice(0, TOAST_LIMIT),
+      }
+
+    case "UPDATE_TOAST":
+      return {
+        ...state,
+        toasts: state.toasts.map((t) =>
+          t.id === action.toast.id ? { ...t, ...action.toast } : t
+        ),
+      }
+
+    case "DISMISS_TOAST": {
+      const { toastId } = action
+
+      // ! Side effects ! - This could be extracted into a dismissToast() action,
+      // but I'll keep it here for simplicity
+      if (toastId) {
+        addToRemoveQueue(toastId)
+      } else {
+        state.toasts.forEach((toast) => {
+          addToRemoveQueue(toast.id)
+        })
+      }
+
+      return {
+        ...state,
+        toasts: state.toasts.map((t) =>
+          t.id === toastId || toastId === undefined
+            ? {
+                ...t,
+                open: false,
+              }
+            : t
+        ),
+      }
+    }
+    case "REMOVE_TOAST":
+      if (action.toastId === undefined) {
+        return {
+          ...state,
+          toasts: [],
+        }
+      }
+      return {
+        ...state,
+        toasts: state.toasts.filter((t) => t.id !== action.toastId),
+      }
+  }
+}
+
+const listeners: Array<(state: State) => void> = []
+
+let memoryState: State = { toasts: [] }
+
+function dispatch(action: Action) {
+  memoryState = reducer(memoryState, action)
+  listeners.forEach((listener) => {
+    listener(memoryState)
+  })
+}
+
+type Toast = Omit<ToasterToast, "id">
+
+function toast({ ...props }: Toast) {
+  const id = genId()
+
+  const update = (props: ToasterToast) =>
+    dispatch({
+      type: "UPDATE_TOAST",
+      toast: { ...props, id },
+    })
+  const dismiss = () => dispatch({ type: "DISMISS_TOAST", toastId: id })
+
+  dispatch({
+    type: "ADD_TOAST",
+    toast: {
+      ...props,
+      id,
+      open: true,
+      onOpenChange: (open) => {
+        if (!open) dismiss()
+      },
+    },
+  })
+
+  return {
+    id: id,
+    dismiss,
+    update,
+  }
+}
+
+function useToast() {
+  const [state, setState] = React.useState<State>(memoryState)
+
+  React.useEffect(() => {
+    listeners.push(setState)
+    return () => {
+      const index = listeners.indexOf(setState)
+      if (index > -1) {
+        listeners.splice(index, 1)
+      }
+    }
+  }, [state])
+
+  return {
+    ...state,
+    toast,
+    dismiss: (toastId?: string) => dispatch({ type: "DISMISS_TOAST", toastId }),
+  }
+}
+
+export { useToast, toast }
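
Note: use-toast.ts is the stock shadcn/ui toast store: a module-level memoryState plus a listener array, so toast() can be called from anywhere while every useToast() subscriber re-renders on dispatch. A usage sketch matching the calls made in listen.tsx (the component name is illustrative; a <Toaster /> must be mounted above, as main.tsx now does):

import { useToast } from "@/components/ui/use-toast"

function SaveButton() {
  const { toast } = useToast()

  return (
    <button onClick={() => toast({
      title: "Error",
      description: "something went wrong",
      variant: "destructive",
    })}>
      Save
    </button>
  )
}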
src/lib/blobToBase64Uri.ts ADDED
@@ -0,0 +1,18 @@
+export function blobToBase64Uri(blob?: Blob): Promise<string> {
+  return new Promise((resolve, reject) => {
+    if (!blob || typeof window === "undefined" || !window.FileReader) {
+      resolve("")
+      return
+    }
+
+    const reader = new window.FileReader()
+    reader.readAsDataURL(blob)
+    reader.onloadend = () => {
+      resolve(`${reader.result || ""}`)
+    }
+    reader.onerror = () => {
+      // reject("error while converting blob to base64")
+      resolve("")
+    }
+  })
+}
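
Note: blobToBase64Uri deliberately resolves to "" instead of rejecting, so callers can stay linear. A usage sketch (hypothetical call site):

const sound = await blobToBase64Uri(recording) // "" on error or outside the browser
if (sound) {
  // e.g. hand it to the server action: const text = await listen(sound)
}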
src/types.ts CHANGED
@@ -7,3 +7,14 @@ export interface ImageAnalysisResponse {
   result: string
   error?: string
 }
+
+
+export interface SoundAnalysisRequest {
+  sound: string // in base64
+  prompt: string
+}
+
+export interface SoundAnalysisResponse {
+  result: string
+  error?: string
+}
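
Note: listen.ts builds its request as `{ sound } as SoundAnalysisRequest`, so the cast silently omits the required prompt field; either prompt should become optional here or the call site should supply it. Illustrative shapes (all values made up):

const request: SoundAnalysisRequest = {
  sound: "data:audio/wav;base64,UklGRi...", // truncated example data URI
  prompt: "What do you hear?",
}

const response: SoundAnalysisResponse = {
  result: "a person speaking",
}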