Upload parallel_phonemize.sh with huggingface_hub
Browse files- parallel_phonemize.sh +43 -0
parallel_phonemize.sh
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
#
# Phonemize the *.txt files found under $input_dir in parallel (GNU Parallel
# driving the `phonemize` CLI) and print a short timing report at the end.

# Directory that is searched (recursively, via find) for *.txt inputs.
input_dir="txt"
# Root directory under which the phonemized outputs are written.
output_dir="sangraha_hi_phonemized"
# Language code used for phonemization.
lang=hi
# Upper bound on how many input files are taken from the find results.
num_files=50000
# Passed to `parallel -j`; GNU Parallel reads a negative value -N as
# "number of CPU threads minus N", so -1 leaves one thread free.
num_jobs=-1

#######################################
# Phonemize a single text file with the `phonemize` CLI.
# Arguments:
#   $1 - path of the input text file
#   $2 - path of the output file; missing parent directories are created
#   $3 - (optional) language code handed to `phonemize -l`; defaults to "hi"
# Outputs:
#   "Processed: <input> -> <output>" on stdout when phonemize succeeds;
#   a "Failed ..." line on stderr when it does not
# Returns:
#   0 on success, otherwise the non-zero status of the step that failed
#######################################
process_file() {
  # Locals keep this function from overwriting same-named caller globals.
  local input_file="$1"
  local output_file="$2"
  local lang="${3:-hi}"
  local status=0

  # Create the output directory and its parent directories if they don't exist
  mkdir -p -- "$(dirname -- "$output_file")" || return

  phonemize --quiet -l "$lang" "$input_file" -o "$output_file" \
    --strip --language-switch remove-flags --preserve-punctuation || status=$?

  # Only report success when phonemize actually succeeded, and hand the
  # failure status back so the caller (e.g. GNU Parallel) can count it.
  if (( status != 0 )); then
    printf 'Failed (exit %d): %s\n' "$status" "$input_file" >&2
    return "$status"
  fi

  echo "Processed: $input_file -> $output_file"
}

# Export the function so the bash processes started by GNU Parallel can call it.
export -f process_file

# Fail early with a clear message instead of a confusing pipeline error.
if ! command -v parallel >/dev/null; then
  echo "error: GNU Parallel ('parallel') was not found in PATH" >&2
  exit 127
fi
if [[ ! -d "$input_dir" ]]; then
  echo "error: input directory not found: $input_dir" >&2
  exit 1
fi

# Start the timer
start_time=$(date +%s)

# Collect at most $num_files input paths. NUL delimiters keep paths that
# contain spaces or newlines intact.
files=()
while IFS= read -r -d '' file; do
  (( ${#files[@]} < num_files )) || break
  files+=("$file")
done < <(find "$input_dir" -type f -name "*.txt" -print0)

# Use GNU Parallel to process the collected files in parallel.
# {} is the full input path, so each output lands at
# ${output_dir}/phn_<input path> (for example .../phn_txt/sub/a.txt).
# A $(basename {}) written here would be expanded by this shell before
# parallel substitutes anything and would just yield a literal {}; parallel's
# own {/} gives the basename if a flat layout is ever wanted (beware of name
# collisions between sub-directories).
# The trailing "$lang" is an optional extra argument for process_file.
status=0
if (( ${#files[@]} > 0 )); then
  printf '%s\0' "${files[@]}" \
    | parallel -0 -j "$num_jobs" process_file {} "${output_dir}/phn_{}" "$lang" \
    || status=$?
fi

# End the timer
end_time=$(date +%s)

# Calculate the elapsed time
elapsed_time=$((end_time - start_time))

# Convert elapsed time to minutes and seconds
minutes=$((elapsed_time / 60))
seconds=$((elapsed_time % 60))

# Print the benchmark results (the file count is the number actually
# submitted, which can be lower than the $num_files cap).
echo "Benchmark Results:"
echo "Number of files processed: ${#files[@]}"
echo "Number of parallel jobs: $num_jobs"
echo "Elapsed time: $minutes minutes $seconds seconds"

if (( status != 0 )); then
  echo "warning: parallel exited with status $status (some jobs failed)" >&2
fi
exit "$status"