tan-z-tan committed on
Commit
9b5cb27
1 Parent(s): 6274b4a
Files changed (4) hide show
  1. app.py +20 -6
  2. lang_id.py +2 -1
  3. poetry.lock +0 -0
  4. pyproject.toml +20 -0
app.py CHANGED
@@ -33,7 +33,12 @@ def resample_audio(audio, orig_sr, target_sr=16000):
33
 
34
  def process_chunk(chunk, language_set) -> pd.DataFrame:
35
  print(f"Processing audio chunk of length {len(chunk)}")
36
- volume_norm = np.linalg.norm(chunk)
 
 
 
 
 
37
  length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
38
  s = datetime.now()
39
  selected_scores, all_scores = identify_languages(chunk, language_set)
@@ -55,8 +60,8 @@ def process_chunk(chunk, language_set) -> pd.DataFrame:
55
 
56
  return pd.DataFrame({
57
  "Length (s)": [length],
58
- "Volume": [volume_norm],
59
- "Japanese_English": [f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})"],
60
  "Language": [top3_languages],
61
  "Lang ID Time": [lang_id_time],
62
  "Transcribe Time": [transcribe_time],
@@ -80,9 +85,6 @@ def process_audio_stream(audio, chunk_duration, language_set):
80
  audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
81
  audio_sec = 0
82
 
83
- # 音量の正規化
84
- audio_data = normalize_audio(audio_data)
85
-
86
  current_chunk.append(audio_data)
87
 
88
  total_chunk = np.concatenate(current_chunk)
@@ -93,7 +95,14 @@ def process_audio_stream(audio, chunk_duration, language_set):
93
  total_chunk = total_chunk[SAMPLING_RATE * chunk_duration:]
94
  audio_sec += chunk_duration
95
 
 
 
 
 
 
96
  df = process_chunk(chunk, language_set)
 
 
97
  data_df = pd.concat([data_df, df], ignore_index=True)
98
 
99
  current_chunk = [total_chunk]
@@ -124,6 +133,11 @@ def process_audio(audio, chunk_duration, language_set):
124
  audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
125
  audio_sec = 0
126
 
 
 
 
 
 
127
  # 音量の正規化
128
  audio_data = normalize_audio(audio_data)
129
 
 
33
 
34
  def process_chunk(chunk, language_set) -> pd.DataFrame:
35
  print(f"Processing audio chunk of length {len(chunk)}")
36
+ rms = np.sqrt(np.mean(chunk**2))
37
+ db_level = 20 * np.log10(rms + 1e-9) # 加えた小さな値で-inf値を防ぐ
38
+
39
+ # 音量の正規化
40
+ chunk = normalize_audio(chunk)
41
+
42
  length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
43
  s = datetime.now()
44
  selected_scores, all_scores = identify_languages(chunk, language_set)
 
60
 
61
  return pd.DataFrame({
62
  "Length (s)": [length],
63
+ "db_level": [db_level],
64
+ "Japanese_English": [f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})"] if db_level > 50 else ["Silent"],
65
  "Language": [top3_languages],
66
  "Lang ID Time": [lang_id_time],
67
  "Transcribe Time": [transcribe_time],
 
85
  audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
86
  audio_sec = 0
87
 
 
 
 
88
  current_chunk.append(audio_data)
89
 
90
  total_chunk = np.concatenate(current_chunk)
 
95
  total_chunk = total_chunk[SAMPLING_RATE * chunk_duration:]
96
  audio_sec += chunk_duration
97
 
98
+ # Check if the audio in the window is too quiet
99
+ # rms = np.sqrt(np.mean(chunk**2))
100
+ # db_level = 20 * np.log10(rms + 1e-9) # 加えた小さな値で-inf値を防ぐ
101
+ # print(db_level)
102
+
103
  df = process_chunk(chunk, language_set)
104
+ # add db_level
105
+ # df["dB Level"] = db_level
106
  data_df = pd.concat([data_df, df], ignore_index=True)
107
 
108
  current_chunk = [total_chunk]
 
133
  audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
134
  audio_sec = 0
135
 
136
+ # Check if the audio in the window is too quiet
137
+ rms = np.sqrt(np.mean(audio_data**2))
138
+ db_level = 20 * np.log10(rms + 1e-9) # 加えた小さな値で-inf値を防ぐ
139
+ print(db_level)
140
+
141
  # 音量の正規化
142
  audio_data = normalize_audio(audio_data)
143
 
lang_id.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from speechbrain.inference.classifiers import EncoderClassifier
2
  import numpy as np
3
  import torch
@@ -42,7 +43,7 @@ def identify_languages(chunk: np.ndarray, languages: list[str] = ["Japanese", "E
42
  lang_scores, _, _, _ = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
43
 
44
  # 結果の整形
45
- all_scores = {INDEX_TO_LANG[i]: score for i, score in enumerate(lang_scores[0])}
46
  selected_scores = {lang: float(all_scores[lang]) for lang in languages}
47
 
48
  return selected_scores, all_scores
 
1
+ import math
2
  from speechbrain.inference.classifiers import EncoderClassifier
3
  import numpy as np
4
  import torch
 
43
  lang_scores, _, _, _ = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
44
 
45
  # 結果の整形
46
+ all_scores = {INDEX_TO_LANG[i]: 100 * math.exp(score) for i, score in enumerate(lang_scores[0])}
47
  selected_scores = {lang: float(all_scores[lang]) for lang in languages}
48
 
49
  return selected_scores, all_scores
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "speech-language-detection"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Makoto Tanji <[email protected]>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.10"
10
+ transformers = "^4.41.2"
11
+ gradio = "^4.36.1"
12
+ sounddevice = "^0.4.7"
13
+ numpy = "^2.0.0"
14
+ pandas = "^2.2.2"
15
+ speechbrain = "^1.0.0"
16
+
17
+
18
+ [build-system]
19
+ requires = ["poetry-core"]
20
+ build-backend = "poetry.core.masonry.api"