andito (HF staff) committed
Commit 96c4257
Parent(s): c77f359

Upload folder using huggingface_hub

STT/whisper_stt_handler.py CHANGED
@@ -111,6 +111,7 @@ class WhisperSTTHandler(BaseHandler):
 
     def process(self, spoken_prompt):
         logger.debug("infering whisper...")
+        console.print("infering whisper...")
 
         global pipeline_start
         pipeline_start = perf_counter()
@@ -121,6 +122,7 @@ class WhisperSTTHandler(BaseHandler):
 
         if language_code not in SUPPORTED_LANGUAGES:  # reprocess with the last language
             logger.warning("Whisper detected unsupported language:", language_code)
+            console.print("Whisper detected unsupported language:", language_code)
             gen_kwargs = copy(self.gen_kwargs)
             gen_kwargs['language'] = self.last_language
             language_code = self.last_language
@@ -135,6 +137,7 @@ class WhisperSTTHandler(BaseHandler):
 
         logger.debug("finished whisper inference")
         console.print(f"[yellow]USER: {pred_text}")
+        console.print(f"Language Code Whisper: {language_code}")
         logger.debug(f"Language Code Whisper: {language_code}")
 
         yield (pred_text, language_code)
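The added console.print calls mirror output that was previously only visible at DEBUG log level, so the Whisper flow shows up on a normally configured console. A minimal sketch of the pattern, assuming console is a shared rich Console instance (the import, the SUPPORTED_LANGUAGES values, and the helper name are assumptions; the diff only shows the call sites):

import logging

from rich.console import Console

logger = logging.getLogger(__name__)
console = Console()

SUPPORTED_LANGUAGES = {"en", "fr", "es"}  # hypothetical subset, for illustration only

def check_language(language_code: str) -> None:
    if language_code not in SUPPORTED_LANGUAGES:
        logger.warning("Whisper detected unsupported language: %s", language_code)
        # Unlike logger.warning, console.print joins positional args like built-in print
        console.print("Whisper detected unsupported language:", language_code)

check_language("xx")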
VAD/vad_handler.py CHANGED
@@ -53,10 +53,14 @@ class VADHandler(BaseHandler):
         audio_float32 = int2float(audio_int16)
         vad_output = self.iterator(torch.from_numpy(audio_float32))
         if vad_output is not None and len(vad_output) != 0:
+            console.print("VAD: end of speech detected")
             logger.debug("VAD: end of speech detected")
             array = torch.cat(vad_output).cpu().numpy()
             duration_ms = len(array) / self.sample_rate * 1000
             if duration_ms < self.min_speech_ms or duration_ms > self.max_speech_ms:
+                console.print(
+                    f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
+                )
                 logger.debug(
                     f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
                 )
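The duration guard that the new console.print mirrors is plain sample arithmetic: len(array) / sample_rate gives seconds, times 1000 gives milliseconds. A worked sketch under assumed thresholds (16 kHz matches the rate set in handler.py below; the min/max bounds are placeholders, not taken from the diff):

import numpy as np

sample_rate = 16000                        # Hz, as in handler.py below
min_speech_ms, max_speech_ms = 500, 30000  # placeholder thresholds

array = np.zeros(4000, dtype=np.float32)   # 4000 samples of audio
duration_ms = len(array) / sample_rate * 1000
print(duration_ms)                         # 250.0 ms, below min_speech_ms, so skipped
assert duration_ms < min_speech_ms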
audio_streaming_client.py CHANGED
@@ -60,10 +60,10 @@ class AudioStreamingClient:
             if len(buffer) >= self.args.chunk_size * 2:  # * 2 because of int16
                 self.send_request(buffer)
                 buffer = b''
-                time.sleep(4*self.args.chunk_size/self.args.sample_rate)
+                time.sleep(16*self.args.chunk_size/self.args.sample_rate)
         else:
             self.send_request()
-            time.sleep(4*self.args.chunk_size/self.args.sample_rate)
+            time.sleep(16*self.args.chunk_size/self.args.sample_rate)
 
     def send_request(self, audio_data=None):
         payload = {"input_type": "speech",
@@ -106,8 +106,8 @@ class AudioStreamingClient:
                 self.session_id = None
                 while not self.recv_queue.empty():
                     time.sleep(0.01)  # wait for the queue to empty
-                while not self.send_queue.empty():
-                    _ = self.send_queue.get()  # Clear the queue
+                with self.send_queue.mutex:
+                    self.send_queue.queue.clear()  # Clear the queue
 
         except Exception as e:
             print(f"Error sending request: {e}")
handler.py CHANGED
@@ -64,6 +64,7 @@ class EndpointHandler:
         self.sample_rate = 16000  # Set the expected sample rate
 
     def _process_audio_chunk(self, audio_data: bytes, session_id: str):
+        print('processing audio chunk')
         audio_array = np.frombuffer(audio_data, dtype=np.int16)
 
         # Ensure the audio is in chunks of the correct size
@@ -113,6 +114,8 @@ class EndpointHandler:
 
         input_type = data.get("input_type", "text")
         input_data = data.get("inputs", "")
+        console.print(f"input_type: {input_type}")
+        console.print(f"input_data: {input_data}")
 
         if input_type == "speech":
             audio_bytes = base64.b64decode(input_data)
@@ -129,6 +132,8 @@ class EndpointHandler:
 
     def _handle_continue_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
         session_id = data.get("session_id")
+        print(f"session_id: {session_id}")
+        print('continue request')
         if not session_id or session_id not in self.sessions:
             raise ValueError("Invalid or missing session_id")
 
@@ -136,8 +141,10 @@ class EndpointHandler:
 
         if not self.queues_and_events['should_listen'].is_set():
             session['status'] = 'processing'
+            print('should_listen is not set, processing')
         elif "inputs" in data:  # Handle additional input if provided
             input_data = data["inputs"]
+            print(f"input_data: {input_data}")
             audio_bytes = base64.b64decode(input_data)
             self._process_audio_chunk(audio_bytes, session_id)
 
@@ -145,6 +152,7 @@ class EndpointHandler:
         session['last_sent_index'] = len(session['chunks'])
 
         if chunks_to_send:
+            print('chunks_to_send')
            combined_audio = b''.join(chunks_to_send)
            base64_audio = base64.b64encode(combined_audio).decode('utf-8')
            return {
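The traced values are easier to read against the payload format: "inputs" carries base64-encoded int16 PCM, which _process_audio_chunk recovers with np.frombuffer. A round-trip sketch of what a client encodes and what the handler decodes (the payload shape is inferred from this diff, not from a documented API):

import base64

import numpy as np

# Client side: one second of 16 kHz int16 PCM, encoded for the JSON payload.
audio = (np.sin(np.linspace(0, 2 * np.pi * 440, 16000)) * 32767).astype(np.int16)
payload = {
    "input_type": "speech",
    "inputs": base64.b64encode(audio.tobytes()).decode("utf-8"),
}

# Handler side: mirrors the decode path shown in the diff.
audio_bytes = base64.b64decode(payload["inputs"])
audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
assert np.array_equal(audio, audio_array)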