Create utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def _fill_missing_word_fields(words, segment_speaker, segment_start, segment_end):
    """Return copies of *words* with 'speaker', 'start' and 'end' always set.

    The caller's dictionaries are left untouched; every word is shallow-copied
    before any key is added.

    Fill rules, applied left to right so that already-filled neighbours can be
    reused by later words:

    * speaker: previous word's speaker, else next word's speaker, else the
      segment-level speaker.
    * start: previous word's end, else the segment-level start.
    * end: next word's start, else (for the last word) the segment-level end,
      else the word's own start.

    Filled values may still be ``None`` when neither a neighbour nor the
    segment provides one.
    """
    filled = [dict(word_info) for word_info in words]
    last_index = len(filled) - 1

    for i, word_info in enumerate(filled):
        previous = filled[i - 1] if i > 0 else None
        following = filled[i + 1] if i < last_index else None

        if 'speaker' not in word_info:
            if previous is not None and 'speaker' in previous:
                word_info['speaker'] = previous['speaker']
            elif following is not None and 'speaker' in following:
                word_info['speaker'] = following['speaker']
            else:
                word_info['speaker'] = segment_speaker

        if 'start' not in word_info:
            if previous is not None and 'end' in previous:
                word_info['start'] = previous['end']
            else:
                word_info['start'] = segment_start

        if 'end' not in word_info:
            if following is not None and 'start' in following:
                word_info['end'] = following['start']
            elif i == last_index:
                word_info['end'] = segment_end
            else:
                # Next word has no start either: collapse to a zero-length span.
                word_info['end'] = word_info['start']

    return filled


def _format_speaker_run(speaker, start, end, texts):
    """Format one run of consecutive words spoken by the same speaker.

    The time range is included only when both *start* and *end* are known.
    """
    joined = " ".join(texts)
    if start is not None and end is not None:
        return f'{speaker} ({start} : {end}) : {joined}'
    return f'{speaker} : {joined}'


def convert_segments_object_to_text(data):
    """Render a segments/words transcript object as speaker-labelled text.

    Args:
        data: Mapping with a ``'segments'`` list. Each segment must have a
            ``'words'`` list and may have ``'speaker'``, ``'start'`` and
            ``'end'``. Each word may have ``'word'``, ``'speaker'``,
            ``'start'`` and ``'end'``; missing word fields are filled from
            neighbouring words or from the segment.
            (Presumably the output of a word-level transcription/diarization
            tool -- confirm against the caller.)

    Returns:
        One line per run of consecutive same-speaker words within a segment,
        joined with newlines. A line looks like
        ``"<speaker> (<start> : <end>) : <words>"`` or, when the run's start
        or end is unknown, ``"<speaker> : <words>"``. Returns an empty string
        when there are no words.

    Raises:
        KeyError: If ``data`` has no ``'segments'`` key or a segment has no
            ``'words'`` key.

    The input object is not modified.
    """
    result = []

    for segment in data['segments']:
        words = _fill_missing_word_fields(
            segment['words'],
            segment.get('speaker', None),
            segment.get('start', None),
            segment.get('end', None),
        )

        # An explicit flag marks "a run is open". Testing the speaker against
        # None is not enough, because None is itself a legitimate speaker
        # value when no diarization info is available.
        run_open = False
        current_speaker = None
        current_start = None
        current_end = None
        current_text = []

        for word_info in words:
            word = word_info.get('word', '')
            start = word_info.get('start', None)
            end = word_info.get('end', None)
            speaker = word_info.get('speaker', None)

            if run_open and speaker != current_speaker:
                # Speaker changed: flush the finished run.
                result.append(
                    _format_speaker_run(
                        current_speaker, current_start, current_end, current_text
                    )
                )
                run_open = False

            if not run_open:
                current_speaker = speaker
                current_start = start
                current_text = []
                run_open = True

            current_text.append(word)
            current_end = end

        # Flush the last run of the segment, if any.
        if run_open:
            result.append(
                _format_speaker_run(
                    current_speaker, current_start, current_end, current_text
                )
            )

    return '\n'.join(result)
|