#!/usr/bin/env bash
#
# Benchmark kotoba-whisper ggml models (full-precision vs. q5_0-quantized)
# with whisper.cpp on the kotoba-whisper-eval dataset, printing the elapsed
# wall-clock seconds for each audio clip.
# Requires: git, wget, ffmpeg, make + a C/C++ toolchain.
set -euo pipefail

# --- clone whisper.cpp ------------------------------------------------------
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp

# --- clone dataset ----------------------------------------------------------
git clone https://huggingface.co/datasets/kotoba-tech/kotoba-whisper-eval

# --- convert to 16 kHz mono 16-bit PCM WAV (whisper.cpp's input format) -----
for clip in long_interview_1 manzai1 manzai2 manzai3; do
  ffmpeg -i "kotoba-whisper-eval/audio/${clip}.mp3" \
    -ar 16000 -ac 1 -c:a pcm_s16le "kotoba-whisper-eval/audio/${clip}.wav"
done

# --- fetch weights (main + quantized) ---------------------------------------
wget https://huggingface.co/kotoba-tech/kotoba-whisper-v1.0-ggml/resolve/main/ggml-kotoba-whisper-v1.0.bin -P ./models
wget https://huggingface.co/kotoba-tech/kotoba-whisper-v1.0-ggml/resolve/main/ggml-kotoba-whisper-v1.0-q5_0.bin -P ./models

# Build ONCE, before any timing starts. The original script re-ran `make -j`
# inside every timed section, so compile time (especially the first, full
# build) polluted the benchmark numbers.
make -j

# bench MODEL AUDIO — transcribe AUDIO with MODEL; leaves the elapsed
# wall-clock seconds in the global ELAPSED (uses bash's SECONDS builtin).
bench() {
  SECONDS=0
  ./main -m "$1" -f "$2"
  ELAPSED=$SECONDS
}

# --- benchmark main model ---------------------------------------------------
MAIN=models/ggml-kotoba-whisper-v1.0.bin
bench "$MAIN" kotoba-whisper-eval/audio/long_interview_1.wav; TIME_INTERVIEW=$ELAPSED
bench "$MAIN" kotoba-whisper-eval/audio/manzai1.wav;          TIME_MANZAI1=$ELAPSED
bench "$MAIN" kotoba-whisper-eval/audio/manzai2.wav;          TIME_MANZAI2=$ELAPSED
bench "$MAIN" kotoba-whisper-eval/audio/manzai3.wav;          TIME_MANZAI3=$ELAPSED

# --- benchmark quantized model ----------------------------------------------
QUANT=models/ggml-kotoba-whisper-v1.0-q5_0.bin
bench "$QUANT" kotoba-whisper-eval/audio/long_interview_1.wav; TIME_INTERVIEW_Q=$ELAPSED
bench "$QUANT" kotoba-whisper-eval/audio/manzai1.wav;          TIME_MANZAI1_Q=$ELAPSED
bench "$QUANT" kotoba-whisper-eval/audio/manzai2.wav;          TIME_MANZAI2_Q=$ELAPSED
bench "$QUANT" kotoba-whisper-eval/audio/manzai3.wav;          TIME_MANZAI3_Q=$ELAPSED

# --- optional: benchmark openai large-v3 for comparison (kept disabled) -----
# bash ./models/download-ggml-model.sh large-v3
# bench models/ggml-large-v3.bin kotoba-whisper-eval/audio/long_interview_1.wav; TIME_INTERVIEW_L=$ELAPSED
# bench models/ggml-large-v3.bin kotoba-whisper-eval/audio/manzai1.wav;          TIME_MANZAI1_L=$ELAPSED
# bench models/ggml-large-v3.bin kotoba-whisper-eval/audio/manzai2.wav;          TIME_MANZAI2_L=$ELAPSED
# NOTE(review): the original passed --language ja for this one clip only —
# preserved for parity, but probably all runs should use the same flags.
# SECONDS=0; ./main -m models/ggml-large-v3.bin --language ja -f kotoba-whisper-eval/audio/manzai3.wav; TIME_MANZAI3_L=$SECONDS

# --- summary ----------------------------------------------------------------
echo "MAIN MODEL"
echo "interview: $TIME_INTERVIEW"
echo "manzai1 : $TIME_MANZAI1"
echo "manzai2 : $TIME_MANZAI2"
echo "manzai3 : $TIME_MANZAI3"
echo "QUANTIZED MODEL"
echo "interview: $TIME_INTERVIEW_Q"
echo "manzai1 : $TIME_MANZAI1_Q"
echo "manzai2 : $TIME_MANZAI2_Q"
echo "manzai3 : $TIME_MANZAI3_Q"

# Result on MacBookPro:
# - Apple M2 Pro
# - 32GB
# - 14-inch, 2023
# - OS Sonoma Version 14.4.1 (23E224)
# NOTE(review): numbers below were measured by the ORIGINAL script, whose
# timed sections included `make -j`; rerun with this version to refresh them.
# MAIN MODEL
# interview: 581
# manzai1 : 41
# manzai2 : 30
# manzai3 : 35
# QUANTIZED MODEL
# interview: 677
# manzai1 : 37
# manzai2 : 36
# manzai3 : 42