Alyosha11 committed on
Commit
5a5eaa3
1 Parent(s): 9c642b1

Upload parallel_phonemize.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. parallel_phonemize.sh +43 -0
parallel_phonemize.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Phonemize a directory tree of .txt files in parallel (GNU Parallel +
# the `phonemize` CLI) and print how long the run took.

# Directory that is searched (recursively, via find) for *.txt inputs.
input_dir="txt"

# Directory under which the phonemized outputs are written.
output_dir="sangraha_hi_phonemized"

# Language code for phonemize's -l option.
# NOTE: process_file assigns its own `lang`, because un-exported globals are
# not visible in the shells that GNU Parallel starts for each job.
lang=hi

# Upper bound on how many input files are handed to the workers.
num_files=50000

# Value passed to `parallel -j`. GNU Parallel reads a negative value -N as
# "number of CPU threads minus N", so -1 leaves one thread free; it does NOT
# mean "use every core".
num_jobs=-1
8
+
9
#######################################
# Phonemize a single text file with the `phonemize` CLI.
# Arguments:
#   $1 - path of the input text file
#   $2 - path of the output file; missing parent directories are created
# Outputs:
#   "Processed: <input> -> <output>" on stdout when phonemize succeeds;
#   a diagnostic on stderr otherwise.
# Returns:
#   0 on success, 1 if the output directory cannot be created, or
#   phonemize's own non-zero exit status if it fails.
#######################################
process_file() {
  local input_file=$1
  local output_file=$2
  # Fixed here (not read from the caller's globals): this function is run via
  # `export -f` in fresh shells, where un-exported variables do not exist.
  local lang=hi
  local out_dir
  local rc

  # Create the output directory and its parent directories if they don't exist
  out_dir=$(dirname -- "$output_file")
  if ! mkdir -p -- "$out_dir"; then
    printf 'Failed to create output directory: %s\n' "$out_dir" >&2
    return 1
  fi

  # Only report success when phonemize actually succeeded; otherwise pass its
  # exit status up so the caller (e.g. GNU Parallel) can count the failure.
  phonemize --quiet -l "$lang" "$input_file" -o "$output_file" \
    --strip --language-switch remove-flags --preserve-punctuation || {
    rc=$?
    printf 'Failed (exit %d): %s\n' "$rc" "$input_file" >&2
    return "$rc"
  }

  echo "Processed: $input_file -> $output_file"
}
20
+
21
# Make process_file available to the bash shells GNU Parallel spawns per job.
export -f process_file

if ! command -v parallel >/dev/null 2>&1; then
  printf 'error: GNU Parallel ("parallel") was not found on PATH\n' >&2
  exit 127
fi

# Start the timer
start_time=$(date +%s)

# Collect at most $num_files input paths up front so the report below can
# state how many files were really found, not just the configured limit.
# Paths are newline-delimited, so file names containing newlines are not
# supported (same as before).
txt_files=()
while IFS= read -r txt_path; do
  txt_files+=("$txt_path")
done < <(find "$input_dir" -type f -name '*.txt' | head -n "$num_files")
num_found=${#txt_files[@]}

# Use GNU Parallel to process the files in parallel.
# {}  -> the input path as read from stdin
# {/} -> its basename. (A shell-side "$(basename {})" would be expanded
#        before Parallel runs and just yield a literal "{}", i.e. the full
#        input path would end up glued onto "phn_".)
parallel_status=0
if ((num_found > 0)); then
  printf '%s\n' "${txt_files[@]}" \
    | parallel -j "$num_jobs" process_file {} "${output_dir}/phn_{/}" \
    || parallel_status=$?
fi

# End the timer
end_time=$(date +%s)

# Calculate the elapsed time
elapsed_time=$((end_time - start_time))

# Convert elapsed time to minutes and seconds
minutes=$((elapsed_time / 60))
seconds=$((elapsed_time % 60))

# Print the benchmark results
echo "Benchmark Results:"
echo "Number of files processed: $num_found"
echo "Number of parallel jobs: $num_jobs"
echo "Elapsed time: $minutes minutes $seconds seconds"

# Surface job failures instead of silently reporting a clean run.
if ((parallel_status != 0)); then
  printf 'warning: parallel exited with status %d; some files may not have been processed\n' \
    "$parallel_status" >&2
fi
exit "$parallel_status"