Upload source code (#1)
Upload source code (3ba2cf63839483ebc96f123b39a535a5b2f0336a)
- whisper-speaker-diarization/.eslintrc.cjs +21 -0
- whisper-speaker-diarization/.gitignore +24 -0
- whisper-speaker-diarization/README.md +8 -0
- whisper-speaker-diarization/index.html +12 -0
- whisper-speaker-diarization/package.json +30 -0
- whisper-speaker-diarization/postcss.config.js +6 -0
- whisper-speaker-diarization/src/App.jsx +218 -0
- whisper-speaker-diarization/src/components/LanguageSelector.jsx +134 -0
- whisper-speaker-diarization/src/components/MediaInput.jsx +194 -0
- whisper-speaker-diarization/src/components/Progress.jsx +15 -0
- whisper-speaker-diarization/src/components/Transcript.jsx +125 -0
- whisper-speaker-diarization/src/index.css +25 -0
- whisper-speaker-diarization/src/main.jsx +10 -0
- whisper-speaker-diarization/src/worker.js +124 -0
- whisper-speaker-diarization/tailwind.config.js +12 -0
- whisper-speaker-diarization/vite.config.js +7 -0
whisper-speaker-diarization/.eslintrc.cjs
ADDED
@@ -0,0 +1,21 @@
module.exports = {
  root: true,
  env: { browser: true, es2020: true },
  extends: [
    'eslint:recommended',
    'plugin:react/recommended',
    'plugin:react/jsx-runtime',
    'plugin:react-hooks/recommended',
  ],
  ignorePatterns: ['dist', '.eslintrc.cjs'],
  parserOptions: { ecmaVersion: 'latest', sourceType: 'module' },
  settings: { react: { version: '18.2' } },
  plugins: ['react-refresh'],
  rules: {
    'react/jsx-no-target-blank': 'off',
    'react-refresh/only-export-components': [
      'warn',
      { allowConstantExport: true },
    ],
  },
}
whisper-speaker-diarization/.gitignore
ADDED
@@ -0,0 +1,24 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

node_modules
dist
dist-ssr
*.local

# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
whisper-speaker-diarization/README.md
ADDED
@@ -0,0 +1,8 @@
# React + Vite

This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.

Currently, two official plugins are available:

- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
whisper-speaker-diarization/index.html
ADDED
@@ -0,0 +1,12 @@
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Whisper Diarization</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.jsx"></script>
  </body>
</html>
whisper-speaker-diarization/package.json
ADDED
@@ -0,0 +1,30 @@
{
  "name": "whisper-speaker-diarization",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "vite build",
    "lint": "eslint . --ext js,jsx --report-unused-disable-directives --max-warnings 0",
    "preview": "vite preview"
  },
  "dependencies": {
    "@xenova/transformers": "github:xenova/transformers.js#v3",
    "react": "^18.3.1",
    "react-dom": "^18.3.1"
  },
  "devDependencies": {
    "@types/react": "^18.3.3",
    "@types/react-dom": "^18.3.0",
    "@vitejs/plugin-react": "^4.3.1",
    "autoprefixer": "^10.4.19",
    "eslint": "^8.57.0",
    "eslint-plugin-react": "^7.34.2",
    "eslint-plugin-react-hooks": "^4.6.2",
    "eslint-plugin-react-refresh": "^0.4.7",
    "postcss": "^8.4.38",
    "tailwindcss": "^3.4.4",
    "vite": "^5.3.1"
  }
}
whisper-speaker-diarization/postcss.config.js
ADDED
@@ -0,0 +1,6 @@
export default {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}
whisper-speaker-diarization/src/App.jsx
ADDED
@@ -0,0 +1,218 @@
import { useEffect, useState, useRef, useCallback } from 'react';

import Progress from './components/Progress';
import MediaInput from './components/MediaInput';
import Transcript from './components/Transcript';
import LanguageSelector from './components/LanguageSelector';


async function hasWebGPU() {
  if (!navigator.gpu) {
    return false;
  }
  try {
    const adapter = await navigator.gpu.requestAdapter();
    return !!adapter;
  } catch (e) {
    return false;
  }
}

function App() {

  // Create a reference to the worker object.
  const worker = useRef(null);

  // Model loading and progress
  const [status, setStatus] = useState(null);
  const [loadingMessage, setLoadingMessage] = useState('');
  const [progressItems, setProgressItems] = useState([]);

  const mediaInputRef = useRef(null);
  const [audio, setAudio] = useState(null);
  const [language, setLanguage] = useState('en');

  const [result, setResult] = useState(null);
  const [time, setTime] = useState(null);
  const [currentTime, setCurrentTime] = useState(0);

  const [device, setDevice] = useState('webgpu'); // Try to use WebGPU first
  const [modelSize, setModelSize] = useState('gpu' in navigator ? 196 : 77); // WebGPU=196MB, WebAssembly=77MB
  useEffect(() => {
    hasWebGPU().then((b) => {
      setModelSize(b ? 196 : 77);
      setDevice(b ? 'webgpu' : 'wasm');
    });
  }, []);

  // We use the `useEffect` hook to set up the worker as soon as the `App` component is mounted.
  useEffect(() => {
    if (!worker.current) {
      // Create the worker if it does not yet exist.
      worker.current = new Worker(new URL('./worker.js', import.meta.url), {
        type: 'module'
      });
    }

    // Create a callback function for messages from the worker thread.
    const onMessageReceived = (e) => {
      switch (e.data.status) {
        case 'loading':
          // Model file start load: add a new progress item to the list.
          setStatus('loading');
          setLoadingMessage(e.data.data);
          break;

        case 'initiate':
          setProgressItems(prev => [...prev, e.data]);
          break;

        case 'progress':
          // Model file progress: update one of the progress items.
          setProgressItems(
            prev => prev.map(item => {
              if (item.file === e.data.file) {
                return { ...item, ...e.data }
              }
              return item;
            })
          );
          break;

        case 'done':
          // Model file loaded: remove the progress item from the list.
          setProgressItems(
            prev => prev.filter(item => item.file !== e.data.file)
          );
          break;

        case 'loaded':
          // Pipeline ready: the worker is ready to accept messages.
          setStatus('ready');
          break;

        case 'complete':
          setResult(e.data.result);
          setTime(e.data.time);
          setStatus('ready');
          break;
      }
    };

    // Attach the callback function as an event listener.
    worker.current.addEventListener('message', onMessageReceived);

    // Define a cleanup function for when the component is unmounted.
    return () => {
      worker.current.removeEventListener('message', onMessageReceived);
    };
  }, []);

  const handleClick = useCallback(() => {
    setResult(null);
    setTime(null);
    if (status === null) {
      setStatus('loading');
      worker.current.postMessage({ type: 'load', data: { device } });
    } else {
      setStatus('running');
      worker.current.postMessage({
        type: 'run', data: { audio, language }
      });
    }
  }, [status, audio, language, device]);

  return (
    <div className="flex flex-col h-screen mx-auto text-gray-800 dark:text-gray-200 bg-white dark:bg-gray-900 max-w-[600px]">

      {status === 'loading' && (
        <div className="flex justify-center items-center fixed w-screen h-screen bg-black z-10 bg-opacity-[92%] top-0 left-0">
          <div className="w-[500px]">
            <p className="text-center mb-1 text-white text-md">{loadingMessage}</p>
            {progressItems.map(({ file, progress, total }, i) => (
              <Progress key={i} text={file} percentage={progress} total={total} />
            ))}
          </div>
        </div>
      )}
      <div className="my-auto">
        <div className="flex flex-col items-center mb-2 text-center">
          <h1 className="text-5xl font-bold mb-2">Whisper Diarization</h1>
          <h2 className="text-xl font-semibold">In-browser automatic speech recognition w/ <br />word-level timestamps and speaker segmentation</h2>
        </div>

        <div className="w-full min-h-[220px] flex flex-col justify-center items-center">
          {
            !audio && (
              <p className="mb-2">
                You are about to download <a href="https://huggingface.co/onnx-community/whisper-base_timestamped" target="_blank" rel="noreferrer" className="font-medium underline">whisper-base</a> and <a href="https://huggingface.co/onnx-community/pyannote-segmentation-3.0" target="_blank" rel="noreferrer" className="font-medium underline">pyannote-segmentation-3.0</a>,
                two powerful speech recognition models for generating word-level timestamps across 100 different languages and speaker segmentation, respectively.
                Once loaded, the models ({modelSize}MB + 6MB) will be cached and reused when you revisit the page.<br />
                <br />
                Everything runs locally in your browser using <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline">🤗 Transformers.js</a> and ONNX Runtime Web,
                meaning no API calls are made to a server for inference. You can even disconnect from the internet after the model has loaded!
              </p>
            )
          }

          <div className="flex flex-col w-full m-3 max-w-[520px]">
            <span className="text-sm mb-0.5">Input audio/video</span>
            <MediaInput
              ref={mediaInputRef}
              className="flex items-center border rounded-md cursor-pointer min-h-[100px] max-h-[500px] overflow-hidden"
              onInputChange={(audio) => {
                setResult(null);
                setAudio(audio);
              }}
              onTimeUpdate={(time) => setCurrentTime(time)}
            />
          </div>

          <div className="relative w-full flex justify-center items-center">
            <button
              className="border px-4 py-2 rounded-lg bg-blue-400 text-white hover:bg-blue-500 disabled:bg-blue-100 disabled:cursor-not-allowed select-none"
              onClick={handleClick}
              disabled={status === 'running' || (status !== null && audio === null)}
            >
              {status === null ? 'Load model' :
                status === 'running'
                  ? 'Running...'
                  : 'Run model'
              }
            </button>

            {status !== null &&
              <div className='absolute right-0 bottom-0'>
                <span className="text-xs">Language:</span>
                <br />
                <LanguageSelector className="border rounded-lg p-1 max-w-[100px]" language={language} setLanguage={setLanguage} />
              </div>
            }
          </div>

          {
            result && time && (
              <>
                <div className="w-full mt-4 border rounded-md">
                  <Transcript
                    className="p-2 max-h-[200px] overflow-y-auto scrollbar-thin select-none"
                    transcript={result.transcript}
                    segments={result.segments}
                    currentTime={currentTime}
                    setCurrentTime={(time) => {
                      setCurrentTime(time);
                      mediaInputRef.current.setMediaTime(time);
                    }}
                  />
                </div>
                <p className="text-sm text-gray-600 text-end p-1">Generation time: <span className="text-gray-800 font-semibold">{time.toFixed(2)}ms</span></p>
              </>
            )
          }
        </div>
      </div>
    </div>
  )
}

export default App
whisper-speaker-diarization/src/components/LanguageSelector.jsx
ADDED
@@ -0,0 +1,134 @@
function titleCase(str) {
  str = str.toLowerCase();
  return (str.match(/\w+.?/g) || [])
    .map((word) => {
      return word.charAt(0).toUpperCase() + word.slice(1);
    })
    .join("");
}

// List of supported languages:
// https://help.openai.com/en/articles/7031512-whisper-api-faq
// https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L79
const LANGUAGES = {
  en: "english",
  zh: "chinese",
  de: "german",
  es: "spanish/castilian",
  ru: "russian",
  ko: "korean",
  fr: "french",
  ja: "japanese",
  pt: "portuguese",
  tr: "turkish",
  pl: "polish",
  ca: "catalan/valencian",
  nl: "dutch/flemish",
  ar: "arabic",
  sv: "swedish",
  it: "italian",
  id: "indonesian",
  hi: "hindi",
  fi: "finnish",
  vi: "vietnamese",
  he: "hebrew",
  uk: "ukrainian",
  el: "greek",
  ms: "malay",
  cs: "czech",
  ro: "romanian/moldavian/moldovan",
  da: "danish",
  hu: "hungarian",
  ta: "tamil",
  no: "norwegian",
  th: "thai",
  ur: "urdu",
  hr: "croatian",
  bg: "bulgarian",
  lt: "lithuanian",
  la: "latin",
  mi: "maori",
  ml: "malayalam",
  cy: "welsh",
  sk: "slovak",
  te: "telugu",
  fa: "persian",
  lv: "latvian",
  bn: "bengali",
  sr: "serbian",
  az: "azerbaijani",
  sl: "slovenian",
  kn: "kannada",
  et: "estonian",
  mk: "macedonian",
  br: "breton",
  eu: "basque",
  is: "icelandic",
  hy: "armenian",
  ne: "nepali",
  mn: "mongolian",
  bs: "bosnian",
  kk: "kazakh",
  sq: "albanian",
  sw: "swahili",
  gl: "galician",
  mr: "marathi",
  pa: "punjabi/panjabi",
  si: "sinhala/sinhalese",
  km: "khmer",
  sn: "shona",
  yo: "yoruba",
  so: "somali",
  af: "afrikaans",
  oc: "occitan",
  ka: "georgian",
  be: "belarusian",
  tg: "tajik",
  sd: "sindhi",
  gu: "gujarati",
  am: "amharic",
  yi: "yiddish",
  lo: "lao",
  uz: "uzbek",
  fo: "faroese",
  ht: "haitian creole/haitian",
  ps: "pashto/pushto",
  tk: "turkmen",
  nn: "nynorsk",
  mt: "maltese",
  sa: "sanskrit",
  lb: "luxembourgish/letzeburgesch",
  my: "myanmar/burmese",
  bo: "tibetan",
  tl: "tagalog",
  mg: "malagasy",
  as: "assamese",
  tt: "tatar",
  haw: "hawaiian",
  ln: "lingala",
  ha: "hausa",
  ba: "bashkir",
  jw: "javanese",
  su: "sundanese",
};
function LanguageSelector({ language, setLanguage, ...props }) {
  const handleLanguageChange = (event) => {
    setLanguage(event.target.value);
  };

  const names = Object.values(LANGUAGES).map(titleCase);

  return (
    <select
      {...props}
      value={language} onChange={handleLanguageChange}>
      {Object.keys(LANGUAGES).map((key, i) => (
        <option key={key} value={key}>
          {names[i]}
        </option>
      ))}
    </select>
  );
}
export default LanguageSelector
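A quick note on `titleCase`: because the regex `/\w+.?/g` keeps the single character that follows each word (a space or slash), joining the matches with an empty string preserves the separators, so multi-name entries render naturally. Two examples of the expected behaviour (values taken from the LANGUAGES map above):

// titleCase("spanish/castilian")      -> "Spanish/Castilian"
// titleCase("haitian creole/haitian") -> "Haitian Creole/Haitian"
console.log(titleCase("haitian creole/haitian")); // "Haitian Creole/Haitian"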
whisper-speaker-diarization/src/components/MediaInput.jsx
ADDED
@@ -0,0 +1,194 @@
import { useState, forwardRef, useRef, useImperativeHandle, useEffect, useCallback } from 'react';

const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/hopper.webm';

const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, ...props }, ref) => {
  // UI states
  const [dragging, setDragging] = useState(false);
  const fileInputRef = useRef(null);

  // Create a reference to the audio and video elements
  const audioElement = useRef(null);
  const videoElement = useRef(null);

  const currentTimeRef = useRef(0);
  useImperativeHandle(ref, () => ({
    setMediaTime(time) {
      if (audioElement.current?.src) {
        audioElement.current.currentTime = time;
      } else if (videoElement.current?.src) {
        videoElement.current.currentTime = time;
      }
      currentTimeRef.current = time;
    }
  }));

  const onBufferLoad = (arrayBuffer, type) => {
    const blob = new Blob([arrayBuffer.slice(0)], { type: type });
    const url = URL.createObjectURL(blob);
    processFile(arrayBuffer);

    // Create a URL for the Blob
    if (type.startsWith('audio/')) {
      // Dispose the previous source
      videoElement.current.pause();
      videoElement.current.removeAttribute('src');
      videoElement.current.load();

      audioElement.current.src = url;
    } else if (type.startsWith('video/')) {
      // Dispose the previous source
      audioElement.current.pause();
      audioElement.current.removeAttribute('src');
      audioElement.current.load();

      videoElement.current.src = url;
    } else {
      alert(`Unsupported file type: ${type}`);
    }
  }

  const readFile = (file) => {
    if (!file) return;

    // file.type
    const reader = new FileReader();
    reader.onload = (e) => {
      onBufferLoad(e.target.result, file.type);
    }
    reader.readAsArrayBuffer(file);
  }

  const handleInputChange = (event) => {
    readFile(event.target.files[0]);
  };

  const handleDragOver = (event) => {
    event.preventDefault();
  };

  const handleDrop = (event) => {
    event.preventDefault();
    setDragging(false);
    readFile(event.dataTransfer.files[0]);
  };

  const handleClick = (e) => {
    if (e.target.tagName === 'VIDEO' || e.target.tagName === 'AUDIO') {
      e.preventDefault();
      fileInputRef.current.click();
    } else if (e.target.tagName === 'INPUT') {
      e.stopPropagation();
    } else {
      fileInputRef.current.click();
      e.stopPropagation();
    }
  };

  const processFile = async (buffer) => {
    const audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16_000 });

    try {
      const audioBuffer = await audioContext.decodeAudioData(buffer);
      let audio;
      if (audioBuffer.numberOfChannels === 2) {
        // Merge channels
        const SCALING_FACTOR = Math.sqrt(2);
        const left = audioBuffer.getChannelData(0);
        const right = audioBuffer.getChannelData(1);
        audio = new Float32Array(left.length);
        for (let i = 0; i < audioBuffer.length; ++i) {
          audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2;
        }
      } else {
        audio = audioBuffer.getChannelData(0);
      }
      onInputChange(audio);

    } catch (e) {
      alert(e);
    }
  };

  const requestRef = useRef();

  const updateTime = useCallback(() => {
    let elem;
    if (audioElement.current?.src) {
      elem = audioElement.current;

    } else if (videoElement.current?.src) {
      elem = videoElement.current;
    }

    if (elem && currentTimeRef.current !== elem.currentTime) {
      currentTimeRef.current = elem.currentTime;
      onTimeUpdate(elem.currentTime);
    }

    // Request the next frame
    requestRef.current = requestAnimationFrame(updateTime);
  }, [onTimeUpdate]);

  useEffect(() => {
    // Start the animation
    requestRef.current = requestAnimationFrame(updateTime);

    return () => {
      // Cleanup on component unmount
      cancelAnimationFrame(requestRef.current);
    };
  }, [updateTime]);
  return (
    <div
      {...props}
      onClick={handleClick}
      onDragOver={handleDragOver}
      onDrop={handleDrop}
      onDragEnter={(e) => setDragging(true)}
      onDragLeave={(e) => setDragging(false)}
    >
      <input
        type="file"
        accept="audio/*,video/*"
        onChange={handleInputChange}
        ref={fileInputRef}
        className="hidden"
      />
      {
        <audio
          ref={audioElement}
          controls
          style={{ display: audioElement.current?.src ? 'block' : 'none' }}
          className='w-full max-h-full'
        />
      }
      {
        <video
          ref={videoElement}
          controls
          style={{ display: videoElement.current?.src ? 'block' : 'none' }}
          className='w-full max-h-full'
        />
      }
      {
        !audioElement.current?.src && !videoElement.current?.src && (
          <div className="w-full flex flex-col items-center justify-center border-2 border-dashed border-gray-300 rounded-md h-[250px]"
            style={{ borderColor: dragging ? 'blue' : 'lightgray' }}
          >
            <span className="text-gray-600 text-center"><u>Drag & drop</u> or <u>click</u><br />to select media</span>
            <span className="text-gray-500 text-sm hover:text-gray-800 mt-2" onClick={async (e) => {
              e.stopPropagation();
              const buffer = await fetch(EXAMPLE_URL).then((r) => r.arrayBuffer());
              videoElement.current.src = URL.createObjectURL(new Blob([buffer], { type: 'video/mp4' }));
              onBufferLoad(buffer, 'video/mp4');
            }}>(or <u>try an example</u>)</span>
          </div>
        )
      }
    </div>
  );
});
MediaInput.displayName = 'MediaInput';

export default MediaInput;
whisper-speaker-diarization/src/components/Progress.jsx
ADDED
@@ -0,0 +1,15 @@
function formatBytes(size) {
  const i = size == 0 ? 0 : Math.floor(Math.log(size) / Math.log(1024));
  return +((size / Math.pow(1024, i)).toFixed(2)) * 1 + ['B', 'kB', 'MB', 'GB', 'TB'][i];
}

export default function Progress({ text, percentage, total }) {
  percentage ??= 0;
  return (
    <div className="w-full bg-gray-100 dark:bg-gray-700 text-left rounded-lg overflow-hidden mb-0.5">
      <div className="bg-blue-400 whitespace-nowrap px-1 text-sm" style={{ width: `${percentage}%` }}>
        {text} ({percentage.toFixed(2)}%{isNaN(total) ? '' : ` of ${formatBytes(total)}`})
      </div>
    </div>
  );
}
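As a quick sanity check of `formatBytes` and the props `Progress` expects (the file name and numbers below are hypothetical; App.jsx destructures `{ file, progress, total }` from each progress item emitted by the worker and passes them straight through):

// formatBytes(0)          -> "0B"
// formatBytes(77_000_000) -> "73.43MB"   (77e6 / 1024^2 ≈ 73.43)
<Progress text="decoder_model_merged.onnx" percentage={42.5} total={77_000_000} />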
whisper-speaker-diarization/src/components/Transcript.jsx
ADDED
@@ -0,0 +1,125 @@
import { useEffect, useMemo, useRef } from "react";

const Chunk = ({ chunk, currentTime, onClick, ...props }) => {
  const spanRef = useRef(null);
  const { text, timestamp } = chunk;
  const [start, end] = timestamp;

  const bolded = start <= currentTime && currentTime < end;

  useEffect(() => {
    if (spanRef.current && bolded) { // scroll into view
      spanRef.current.scrollIntoView({
        behavior: 'smooth',
        block: 'center',
        inline: 'center',
      });
    }
  }, [bolded]);

  return (
    <span {...props}>
      {text.startsWith(' ') ? " " : ""}
      <span
        ref={spanRef}
        onClick={onClick}
        className="text-md text-gray-600 cursor-pointer hover:text-red-600"
        title={timestamp.map(x => x.toFixed(2)).join(' → ')}
        style={{
          textDecoration: bolded ? 'underline' : 'none',
          textShadow: bolded ? '0 0 1px #000' : 'none',
        }}
      >{text.trim()}</span>
    </span>
  )
}

const Transcript = ({ transcript, segments, currentTime, setCurrentTime, ...props }) => {
  const jsonTranscript = useMemo(() => {
    return JSON.stringify({
      ...transcript,
      segments,
    }, null, 2)
      // post-process the JSON to make it more readable
      .replace(/( {4}"timestamp": )\[\s+(\S+)\s+(\S+)\s+\]/gm, "$1[$2 $3]");
  }, [transcript, segments]);

  // Post-process the transcript to highlight speaker changes
  const postProcessedTranscript = useMemo(() => {
    let prev = 0;
    const words = transcript.chunks;

    const result = [];
    for (const segment of segments) {
      const { label, end } = segment;
      if (label === 'NO_SPEAKER') continue;

      // Collect all words within this segment
      const segmentWords = [];
      for (let i = prev; i < words.length; ++i) {
        const word = words[i];
        if (word.timestamp[1] <= end) {
          segmentWords.push(word);
        } else {
          prev = i;
          break;
        }
      }
      if (segmentWords.length > 0) {
        result.push({
          ...segment,
          chunks: segmentWords,
        })
      }
    }
    return result;
  }, [transcript, segments]);

  const downloadTranscript = () => {
    const blob = new Blob([jsonTranscript], { type: 'application/json' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'transcript.json';
    a.click();
    URL.revokeObjectURL(url);
  }

  return (<>
    <div {...props}>
      {
        postProcessedTranscript.map(({ label, start, end, chunks }, i) => (
          <div className="border-t py-2" key={i}>
            <div className="flex justify-between">
              <label className="text-xs font-medium">{label}</label>
              <label className="text-xs">{start.toFixed(2)} → {end.toFixed(2)}</label>
            </div>
            <div>
              {chunks.map((chunk, j) =>
                <Chunk
                  key={j}
                  chunk={chunk}
                  currentTime={currentTime}
                  onClick={() => setCurrentTime(chunk.timestamp[0])} // Set to start of chunk
                />
              )}
            </div>
          </div>
        ))
      }
    </div>

    <div className="flex justify-center border-t text-sm text-gray-600 max-h-[150px] overflow-y-auto p-2 scrollbar-thin">
      <button
        className="flex items-center border px-2 py-1 rounded-lg bg-green-400 text-white hover:bg-green-500"
        onClick={downloadTranscript}
      >
        <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" strokeWidth={1.5} stroke="currentColor" className="size-6 mr-1">
          <path strokeLinecap="round" strokeLinejoin="round" d="M3 16.5v2.25A2.25 2.25 0 0 0 5.25 21h13.5A2.25 2.25 0 0 0 21 18.75V16.5M16.5 12 12 16.5m0 0L7.5 12m4.5 4.5V3" />
        </svg>
        Download transcript
      </button>
    </div>
  </>)
};
export default Transcript;
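For reference, a small sketch of the data shapes `Transcript` consumes, inferred from how the component reads them (all field values below are made up): `transcript.chunks` holds word-level entries with `[start, end]` timestamps, and `segments` holds speaker turns with `id`, `label`, `start`, `end`, and `confidence`, as attached by the worker.

// Hypothetical inputs, shaped the way Transcript and Chunk read them above.
const transcript = {
  text: ' Hello there.',
  chunks: [
    { text: ' Hello', timestamp: [0.00, 0.42] },
    { text: ' there.', timestamp: [0.42, 0.90] },
  ],
};
const segments = [
  { id: 1, label: 'SPEAKER_1', start: 0.00, end: 1.00, confidence: 0.94 },
  { id: 0, label: 'NO_SPEAKER', start: 1.00, end: 1.50, confidence: 0.88 }, // skipped by the component
];
// postProcessedTranscript would group both words under the first segment,
// since their end timestamps fall before that segment's `end`.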
whisper-speaker-diarization/src/index.css
ADDED
@@ -0,0 +1,25 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

@layer utilities {
  .scrollbar-thin::-webkit-scrollbar {
    @apply w-2;
  }

  .scrollbar-thin::-webkit-scrollbar-track {
    @apply rounded-full bg-gray-100 dark:bg-gray-700;
  }

  .scrollbar-thin::-webkit-scrollbar-thumb {
    @apply rounded-full bg-gray-300 dark:bg-gray-600;
  }

  .scrollbar-thin::-webkit-scrollbar-thumb:hover {
    @apply bg-gray-500;
  }
}

html {
  @apply scrollbar-thin;
}
whisper-speaker-diarization/src/main.jsx
ADDED
@@ -0,0 +1,10 @@
import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App.jsx'
import './index.css'

ReactDOM.createRoot(document.getElementById('root')).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>,
)
whisper-speaker-diarization/src/worker.js
ADDED
@@ -0,0 +1,124 @@
import { pipeline, AutoProcessor, AutoModelForAudioFrameClassification } from '@xenova/transformers';

const PER_DEVICE_CONFIG = {
  webgpu: {
    dtype: {
      encoder_model: 'fp32',
      decoder_model_merged: 'q4',
    },
    device: 'webgpu',
  },
  wasm: {
    dtype: 'q8',
    device: 'wasm',
  },
};

/**
 * This class uses the Singleton pattern to ensure that only one instance of the model is loaded.
 */
class PipelineSingeton {
  static asr_model_id = 'onnx-community/whisper-base_timestamped';
  static asr_instance = null;

  static segmentation_model_id = 'onnx-community/pyannote-segmentation-3.0';
  static segmentation_instance = null;
  static segmentation_processor = null;

  static async getInstance(progress_callback = null, device = 'webgpu') {
    this.asr_instance ??= pipeline('automatic-speech-recognition', this.asr_model_id, {
      ...PER_DEVICE_CONFIG[device],
      progress_callback,
    });

    this.segmentation_processor ??= AutoProcessor.from_pretrained(this.segmentation_model_id, {
      progress_callback,
    });
    this.segmentation_instance ??= AutoModelForAudioFrameClassification.from_pretrained(this.segmentation_model_id, {
      // NOTE: WebGPU is not currently supported for this model
      // See https://github.com/microsoft/onnxruntime/issues/21386
      device: 'wasm',
      dtype: 'fp32',
      progress_callback,
    });

    return Promise.all([this.asr_instance, this.segmentation_processor, this.segmentation_instance]);
  }
}

async function load({ device }) {
  self.postMessage({
    status: 'loading',
    data: `Loading models (${device})...`
  });

  // Load the pipeline and save it for future use.
  const [transcriber, segmentation_processor, segmentation_model] = await PipelineSingeton.getInstance(x => {
    // We also add a progress callback to the pipeline so that we can
    // track model loading.
    self.postMessage(x);
  }, device);

  if (device === 'webgpu') {
    self.postMessage({
      status: 'loading',
      data: 'Compiling shaders and warming up model...'
    });

    await transcriber(new Float32Array(16_000), {
      language: 'en',
    });
  }

  self.postMessage({ status: 'loaded' });
}

async function segment(processor, model, audio) {
  const inputs = await processor(audio);
  const { logits } = await model(inputs);
  const segments = processor.post_process_speaker_diarization(logits, audio.length)[0];

  // Attach labels
  for (const segment of segments) {
    segment.label = model.config.id2label[segment.id];
  }

  return segments;
}

async function run({ audio, language }) {
  const [transcriber, segmentation_processor, segmentation_model] = await PipelineSingeton.getInstance();

  const start = performance.now();

  // Run transcription and segmentation in parallel
  const [transcript, segments] = await Promise.all([
    transcriber(audio, {
      language,
      return_timestamps: 'word',
      chunk_length_s: 30,
    }),
    segment(segmentation_processor, segmentation_model, audio)
  ]);
  console.table(segments, ['start', 'end', 'id', 'label', 'confidence']);

  const end = performance.now();

  self.postMessage({ status: 'complete', result: { transcript, segments }, time: end - start });
}

// Listen for messages from the main thread
self.addEventListener('message', async (e) => {
  const { type, data } = e.data;

  switch (type) {
    case 'load':
      load(data);
      break;

    case 'run':
      run(data);
      break;
  }
});
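To summarize the protocol this worker implements: the page posts a `load` message, waits for a `loaded` status, then posts `run` messages and receives `complete` results (with `initiate`/`progress`/`done` updates while model files download). A minimal sketch of driving it from the main thread, mirroring what App.jsx does with React state; the placeholder `audio` here is simply one second of silence:

const worker = new Worker(new URL('./worker.js', import.meta.url), { type: 'module' });

// Placeholder input: in the app this comes from MediaInput as a 16 kHz mono Float32Array.
const audio = new Float32Array(16_000);

worker.addEventListener('message', (e) => {
  switch (e.data.status) {
    case 'loading':   // status text in e.data.data, e.g. "Loading models (webgpu)..."
      break;
    case 'initiate':  // a model file started downloading
    case 'progress':  // { file, progress, total } download updates
    case 'done':      // a model file finished downloading
      break;
    case 'loaded':    // models ready: safe to post a 'run' message
      worker.postMessage({ type: 'run', data: { audio, language: 'en' } });
      break;
    case 'complete':  // e.data.result = { transcript, segments }, e.data.time in ms
      console.log(e.data.result, e.data.time);
      break;
  }
});

worker.postMessage({ type: 'load', data: { device: 'webgpu' } });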
whisper-speaker-diarization/tailwind.config.js
ADDED
@@ -0,0 +1,12 @@
/** @type {import('tailwindcss').Config} */
export default {
  content: [
    "./index.html",
    "./src/**/*.{js,ts,jsx,tsx}",
  ],
  theme: {
    extend: {},
  },
  plugins: [],
}
whisper-speaker-diarization/vite.config.js
ADDED
@@ -0,0 +1,7 @@
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'

// https://vitejs.dev/config/
export default defineConfig({
  plugins: [react()],
})