kabita-choudhary committed
Commit 7478ded
1 Parent(s): 7c2cc3c

Update app.py

Files changed (1)
  1. app.py +38 -30
app.py CHANGED
@@ -16,63 +16,71 @@ import gradio as gr
 
 from sklearn.cluster import AgglomerativeClustering
 import numpy as np
+import os
+from transformers import pipeline
+summarizer = pipeline("summarization", model="kabita-choudhary/finetuned-bart-for-conversation-summary")
 
 
-num_speakers = 2
-
-language = 'English'
-
-model_size = 'medium'
-model = whisper.load_model(model_size)
-
-
-model_name = model_size
-audio = Audio()
-def segmentembedding(segment):
-  start = segment["start"]
-  end = min(duration, segment["end"])
-  clip = Segment(start, end)
-  waveform, sample_rate = audio.crop(path, clip)
-  return embedding_model(waveform[None])
 def time(secs):
   return datetime.timedelta(seconds=round(secs))
-from transformers import pipeline
-summarizer = pipeline("summarization", model="kabita-choudhary/finetuned-bart-for-conversation-summary")
 
-def translatetotext(path):
-  out=""
-  if path[-3:] != 'wav':
-    subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
-    path = 'audio.wav'
+
+def translatetotext(vpath,no_of_speaker):
+  num_speakers = no_of_speaker
+  language = 'English'
+  model_size = 'small'
+  model = whisper.load_model(model_size)
+  model_name = model_size
+  _,file_ending = os.path.splitext(f'{vpath}')
+  print(f'file enging is {file_ending}')
+  path = vpath.replace(file_ending, ".wav")
+  print("starting conversion to wav")
+  os.system(f'ffmpeg -i "{vpath}" -ar 16000 -ac 1 -c:a pcm_s16le "{path}"')
   result = model.transcribe(path)
   segments = result["segments"]
   print(segments)
+  duration=0
   with contextlib.closing(wave.open(path,'r')) as f:
     frames = f.getnframes()
     rate = f.getframerate()
     duration = frames / float(rate)
+  def segment_embedding(segment):
+    audio = Audio()
+    start = segment["start"]
+    # Whisper overshoots the end timestamp in the last segment
+    end = min(duration, segment["end"])
+    clip = Segment(start, end)
+    waveform, sample_rate = audio.crop(path, clip)
+    return embedding_model(waveform[None])
   embeddings = np.zeros(shape=(len(segments), 192))
+  print(duration)
   for i, segment in enumerate(segments):
-    embeddings[i] = segmentembedding(segment)
+    embeddings[i] = segment_embedding(segment)
   embeddings = np.nan_to_num(embeddings)
   clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
   labels = clustering.labels_
+  print(labels)
   for i in range(len(segments)):
     segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
   f = open("transcript.txt", "w")
+  out=""
   for (i, segment) in enumerate(segments):
     if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-      f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
-      out=out+segment["speaker"]
-    f.write(segment["text"][1:] + ' ')
+      f.write(segment["speaker"] + ' ' + str(time(segment["start"]))+' ' + str(time(segment["end"]))+' ')
+      out=out+segment["speaker"]+' ' + str(time(segment["start"]))+' ' + str(time(segment["end"]))+' '
+    f.write(segment["text"][1:] + '\n')
     out=out+segment["text"][1:] + '\n'
   f.close()
+
   summary = summarizer(out)
-  return out,summary
+  f = open("summary.txt", "w")
+  f.write(summary[0]["summary_text"])
+  f.close()
+  return out,summary[0]["summary_text"],"transcript.txt","summary.txt"
 
 demo = gr.Interface(
   fn=translatetotext,
-  inputs=gr.Audio(source="upload",type="filepath"),
-  outputs=["text","text"]
+  inputs=[gr.Video(source="upload",type="filepath"),gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True)],
+  outputs=["text","text",gr.File(label="Download transcript"),gr.File(label="Download summary")]
 )
 demo.launch(debug=True)
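
Note: both the old and the new code call Audio(), Segment, and embedding_model, none of which are defined inside this hunk; they are presumably set up in the first 15 lines of app.py alongside the whisper, wave, contextlib and datetime imports. Judging by the 192-dimensional embedding array, that setup is most likely a pyannote.audio speaker-embedding model. A minimal sketch of such a setup follows; the model name and device choice here are assumptions, not taken from this commit.

import torch
from pyannote.audio import Audio
from pyannote.core import Segment
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# Assumed speaker-embedding model: "speechbrain/spkrec-ecapa-voxceleb" produces
# 192-dimensional embeddings, matching np.zeros(shape=(len(segments), 192)) in the hunk.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)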
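
With this change, translatetotext takes the uploaded video path plus a speaker count and returns four values that line up one-to-one with the interface outputs: the speaker-labelled transcript text, the summary text, and the two files offered for download. A quick way to exercise the new signature outside Gradio (the input path below is a placeholder, not a file from this repository):

# "meeting.mp4" is a placeholder input path.
transcript, summary, transcript_file, summary_file = translatetotext("meeting.mp4", 2)
print(summary)                        # BART-generated conversation summary
print(open(transcript_file).read())   # same transcript that is written to transcript.txt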