Spaces: Running
Mahiruoshi committed • Commit c4e6b54 • Parent(s): 534942c
Update tools/sentence.py
Files changed: tools/sentence.py +262 -164
tools/sentence.py
CHANGED
@@ -1,173 +1,271 @@
[Old code removed by this commit. Most deleted lines were lost in page extraction; only the fragments below survive, with gaps marked "...".]

-import ...
-from ...
-...
-    sentences = new_sentences

     for sentence in sentences:
-...
-    start = 0
-    end = 0
-    sentences_list = []

-    for sentence in sentences:
-        new_sentences.extend(split_alpha_nonalpha(sentence))
-    sentences = new_sentences
-...
-    return sentences_list
-
-def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
-    # If this speaker only supports one language
-    if speaker_lang is not None and len(speaker_lang) == 1:
-        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
-            logging.debug(
-                f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}'
-            )
-            lang = speaker_lang[0]
-
-    sentences_list = []
-    if lang.upper() != "MIX":
-        if max <= 0:
-            sentences_list.append(
-                markup_language(text, speaker_lang)
-                if lang.upper() == "AUTO"
-                else f"[{lang.upper()}]{text}[{lang.upper()}]"
-            )
     else:
-...

 if __name__ == "__main__":
-    text = "
-    print(
-    print(
-    print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))
-
-    text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
-    print(split_by_language(text, ["zh", "ja", "en"]))
-
-    text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
-    print(split_by_language(text, ["zh", "ja", "en"]))
-    # output: [('vits', 'en'), ('和', 'ja'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
-
-    print(split_by_language(text, ["zh", "en"]))
-    # output: [('vits', 'en'), ('和', 'zh'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
-
-    text = "vits 和 Bert-VITS2 是 tts 模型。花费 3 days. 花费 3天。Take 3 days"
-    print(split_by_language(text, ["zh", "en"]))
-    # output: [('vits ', 'en'), ('和 ', 'zh'), ('Bert-VITS2 ', 'en'), ('是 ', 'zh'), ('tts ', 'en'), ('模型。花费 ', 'zh'), ('3 days. ', 'en'), ('花费 3天。', 'zh'), ('Take 3 days', 'en')]

[New code added by this commit:]

import re, os

from ebooklib import epub
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import jieba
import romajitable

def is_japanese(string):
    # True if any character falls in the kana range (kanji-only text is not detected)
    for ch in string:
        if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
            return True
    return False

def is_chinese(string):
    # True if any character is a CJK unified ideograph
    for ch in string:
        if '\u4e00' <= ch <= '\u9fff':
            return True
    return False

def is_single_language(sentence):
    # Check whether the sentence contains exactly one script: Chinese, Japanese kana, or English
    contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
    contains_japanese = re.search(r'[\u3040-\u30ff\u31f0-\u31ff]', sentence) is not None
    contains_english = re.search(r'[a-zA-Z]', sentence) is not None
    language_count = sum([contains_chinese, contains_japanese, contains_english])
    return language_count == 1

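A quick sanity check of these three detectors (hypothetical calls, not part of the commit). Note that is_japanese only looks at the kana range, so kanji-only Japanese text is classified as Chinese here:

# assert is_japanese("こんにちは")            # kana detected
# assert not is_japanese("日本語")            # kanji only: the kana check misses it
# assert is_chinese("你好")                   # CJK ideographs detected
# assert is_single_language("Hello")          # one script
# assert not is_single_language("Hello你好")  # two scripts present
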
def merge_scattered_parts(sentences):
    """Merge scattered fragments into adjacent sentences, keeping each result single-language."""
    merged_sentences = []
    buffer_sentence = ""

    for sentence in sentences:
        # Single-language and longer than one character (i.e. not a lone mark or word)?
        if is_single_language(sentence) and len(sentence) > 1:
            # Flush any buffered fragments to the list first
            if buffer_sentence:
                merged_sentences.append(buffer_sentence)
                buffer_sentence = ""
            merged_sentences.append(sentence)
        else:
            # A scattered fragment: accumulate it in the buffer
            buffer_sentence += sentence

    # Make sure the final buffer content is kept
    if buffer_sentence:
        merged_sentences.append(buffer_sentence)

    return merged_sentences

def is_only_punctuation(s):
    """Check whether the string contains nothing but punctuation."""
    # Common Chinese, Japanese, and English punctuation marks
    punctuation_pattern = re.compile(r'^[\s。*;,:“”()、!?《》\u3000\.,;:"\'?!()]+$')
    return punctuation_pattern.match(s) is not None

def split_mixed_language(sentence):
    # Split a mixed-language sentence into same-script runs, checking character by character.
    # Note: this definition is shadowed by the second split_mixed_language further down.
    sub_sentences = []
    current_language = None
    current_part = ""

    for char in sentence:
        if re.match(r'[\u4e00-\u9fff]', char):  # Chinese character
            if current_language != 'chinese':
                if current_part:
                    sub_sentences.append(current_part)
                current_part = char
                current_language = 'chinese'
            else:
                current_part += char
        elif re.match(r'[\u3040-\u30ff\u31f0-\u31ff]', char):  # Japanese character
            if current_language != 'japanese':
                if current_part:
                    sub_sentences.append(current_part)
                current_part = char
                current_language = 'japanese'
            else:
                current_part += char
        elif re.match(r'[a-zA-Z]', char):  # English character
            if current_language != 'english':
                if current_part:
                    sub_sentences.append(current_part)
                current_part = char
                current_language = 'english'
            else:
                current_part += char
        else:
            current_part += char  # Punctuation and other characters stay with the current run

    if current_part:
        sub_sentences.append(current_part)

    return sub_sentences

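For illustration, the character walk above groups consecutive same-script characters and keeps punctuation attached to the current run (hypothetical call, not in the commit):

# split_mixed_language("VITS是一个tts模型")
#   -> ['VITS', '是一个', 'tts', '模型']
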
def replace_quotes(text):
    # Replace Chinese/Japanese quotation marks and brackets with a plain double quote
    text = re.sub(r'[“”‘’『』「」()()]', '"', text)
    return text

def remove_numeric_annotations(text):
    # Numeric annotations: digits wrapped in “”, 【】 or 〔〕
    pattern = r'“\d+”|【\d+】|〔\d+〕'
    # Strip every such annotation from the text
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def merge_adjacent_japanese(sentences):
    """Merge runs of adjacent sentences that are all Japanese."""
    merged_sentences = []
    i = 0
    while i < len(sentences):
        current_sentence = sentences[i]
        if i + 1 < len(sentences) and is_japanese(current_sentence) and is_japanese(sentences[i + 1]):
            # The current and the next sentence are both Japanese: keep absorbing
            while i + 1 < len(sentences) and is_japanese(sentences[i + 1]):
                current_sentence += sentences[i + 1]
                i += 1
        merged_sentences.append(current_sentence)
        i += 1
    return merged_sentences

def extrac(text):
    text = replace_quotes(remove_numeric_annotations(text))  # Normalize quotes, drop numeric annotations
    text = re.sub("<[^>]*>", "", text)  # Remove HTML tags
    # Preliminary split on newlines and sentence-ending punctuation
    preliminary_sentences = re.split(r'([\n。;!?\.\?!])', text)
    final_sentences = []

    for piece in preliminary_sentences:
        if is_single_language(piece):
            final_sentences.append(piece)
        else:
            sub_sentences = split_mixed_language(piece)
            final_sentences.extend(sub_sentences)

    # Break up overlong sentences, tokenizing with jieba where needed
    split_sentences = []
    for sentence in final_sentences:
        split_sentences.extend(split_long_sentences(sentence))

    # Merge adjacent Japanese sentences
    merged_japanese_sentences = merge_adjacent_japanese(split_sentences)

    # Drop elements that contain only punctuation
    clean_sentences = [s for s in merged_japanese_sentences if not is_only_punctuation(s)]

    # Remove empty strings and strip stray quotes
    return [s.replace('"', '').strip() for s in clean_sentences if s]

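End to end, extrac() normalizes, segments, and filters. A hypothetical call, with the output reasoned from the functions above rather than taken from the commit:

# extrac('“1”你好。こんにちは。Hello world.')
#   -> ['你好', 'こんにちは', 'Hello world']
# The numeric annotation “1” is removed, the text splits on 。 and .,
# punctuation-only pieces are filtered out, and each language chunk survives intact.
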
def is_mixed_language(sentence):
    # True if the sentence mixes more than one of: Chinese, Japanese kana, English
    contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
    contains_japanese = re.search(r'[\u3040-\u30ff\u31f0-\u31ff]', sentence) is not None
    contains_english = re.search(r'[a-zA-Z]', sentence) is not None
    languages_count = sum([contains_chinese, contains_japanese, contains_english])
    return languages_count > 1

def split_mixed_language(sentence):
    # Split around quotes that sit at sentence boundaries.
    # This redefinition shadows the character-based split_mixed_language above,
    # so it is the version extrac() actually calls.
    sub_sentences = re.split(r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
    return [s.strip() for s in sub_sentences if s.strip()]

def seconds_to_ass_time(seconds):
    """Convert a number of seconds to ASS time format (H:MM:SS.CC)."""
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    milliseconds = int((seconds - int(seconds)) * 1000)  # Fractional part, taken before truncating
    seconds = int(seconds) % 60
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, seconds, int(milliseconds / 10))

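Computing the fractional part before truncating the seconds keeps centisecond precision, as ASS timestamps expect. A hypothetical check:

# seconds_to_ass_time(3725.5)  -> '1:02:05.50'
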
def extract_text_from_epub(file_path):
    # Collect the text of every HTML document inside the EPUB
    book = epub.read_epub(file_path)
    content = []
    for item in book.items:
        if isinstance(item, epub.EpubHtml):
            soup = BeautifulSoup(item.content, 'html.parser')
            content.append(soup.get_text())
    return '\n'.join(content)

def extract_text_from_pdf(file_path):
    # Concatenate the extracted text of every page
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        content = [page.extract_text() for page in reader.pages]
    return '\n'.join(content)

def remove_annotations(text):
    # Remove content in square brackets, angle brackets, and leftover 【oaicite】 citation artifacts
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\<.*?\>', '', text)
    text = re.sub(r'​``【oaicite:1】``​', '', text)
    return text

def extract_text_from_file(inputFile):
    file_extension = os.path.splitext(inputFile)[1].lower()
    if file_extension == ".epub":
        return extract_text_from_epub(inputFile)
    elif file_extension == ".pdf":
        return extract_text_from_pdf(inputFile)
    elif file_extension == ".txt":
        with open(inputFile, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")

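A plausible end-to-end usage of the file loaders together with the segmenter (the file path is a placeholder):

# raw = extract_text_from_file("book.epub")   # .epub, .pdf, or .txt
# sentences = extrac(raw)                     # cleaned, single-language segments
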
def split_by_punctuation(sentence):
    """Split a sentence on Chinese secondary punctuation."""
    # Common secondary separators: commas and semicolons (full- and half-width)
    parts = re.split(r'([,,;;])', sentence)
    # Re-attach each separator to the preceding part so punctuation never stands alone
    merged_parts = []
    for part in parts:
        if part and part not in ',,;;':
            merged_parts.append(part)
        elif merged_parts:
            merged_parts[-1] += part
    return merged_parts

def split_long_sentences(sentence, max_length=30):
    """Split overlong Chinese sentences: first on punctuation, then with jieba if still too long."""
    if len(sentence) > max_length and is_chinese(sentence):
        # First try the secondary punctuation marks
        preliminary_parts = split_by_punctuation(sentence)
        new_sentences = []

        for part in preliminary_parts:
            # Still too long: tokenize with jieba and pack words up to max_length
            if len(part) > max_length:
                words = jieba.lcut(part)
                current_sentence = ""
                for word in words:
                    if len(current_sentence) + len(word) > max_length:
                        new_sentences.append(current_sentence)
                        current_sentence = word
                    else:
                        current_sentence += word
                if current_sentence:
                    new_sentences.append(current_sentence)
            else:
                new_sentences.append(part)

        return new_sentences
    return [sentence]  # Short or non-Chinese sentences pass through unchanged

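To make the two-stage behavior concrete, a hypothetical trace (output reasoned from the code, not from the commit):

# split_by_punctuation("今天天气很好,我们去公园散步;然后回家")
#   -> ['今天天气很好,', '我们去公园散步;', '然后回家']
# split_long_sentences only reaches the jieba fallback when one of these
# comma-delimited parts still exceeds max_length characters.
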
def extract_and_convert(text):
    # Find all English words (\b marks a word boundary)
    english_parts = re.findall(r'\b[A-Za-z]+\b', text)

    # Convert each English word to katakana
    kana_parts = ['\n{}\n'.format(romajitable.to_kana(word).katakana) for word in english_parts]

    # Substitute the English parts back into the text, one occurrence at a time
    for eng, kana in zip(english_parts, kana_parts):
        text = text.replace(eng, kana, 1)

    return text

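For instance (a hypothetical call; the exact kana depends on romajitable's romaji mapping):

# extract_and_convert("これはpenです")
#   -> 'これは\nペン\nです'   # each English word is replaced by its katakana reading
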
if __name__ == "__main__":
    text = ",如“520”,【23】和〔83〕等。.我亲爱的读者,你也许在某一刻会遇上这样的情形,不禁对那著名哲学句子“那内在的就是那外在的,那外在的就是那内在的”“3”的正确性有了或多或少的怀疑。也许你自己就怀着某种秘密,对之你有着这样一种感觉:因为这秘密在它所具有的喜悦或者痛楚对你来说是太亲切了,以至于你不愿意让他人来和你共享它。也许你的生活使得你和一些人有所接触,对于他们你有着某种预感,隐约感觉到如此的某些事情是可能的,尽管你并不一定能够通过权力或者诱惑来揭示这隐秘。也许你感受到的这些情形并不对你和你的生活发生作用,然而你对这种怀疑却不陌生;它时而在你的思绪中像一种匆匆的形影飘忽而过。这样的一种怀疑来而又去,没有人知道它从哪里来或者它到什么地方去“4”。就我自己而言,我一直对哲学的这一点怀有一种异端的想法,并且因此也尽可能地习惯于自己去深思和考究;我从在这方面与我有同感的作家们那里听取了指导,简言之,我尽了我的努力来弥补那些哲学文本们所遗留下的匮乏。渐渐地,听觉对于我来说倒成了最亲密的感觉功能;因为,正如声音是那相对外在之物而言是无法比较的内在性的揭示,于是耳朵就是用来使这内在性得以被人领会的工具,而听觉就是用来获取这内在性的感觉功能的。每当我在我所见和所听之间发现一个矛盾时,我就觉得我的怀疑得到了强化,而我的观察愿望得到了放大。一个听忏悔的神父与忏悔者之间有窗格子隔开,这神父不看,他只是听。听着听着,他渐渐构想出一个与此相应的外在;这就是说,他不会进入矛盾。相反,在你同时看和听的时候则不同,你看着的是你和言述者之间的一道窗格子。就结果而言,我为在这方面进行观察而做出的努力是非常不同的。有时候我是幸运的,有时候则不,而想要在这些道路上赢得一些战利品,幸运总是一个必须被考虑进去的因素。然而我却从来没有失去继续进行我的调查研究的愿望。如果我真的在什么时候几乎对我的坚定感到了懊悔,那么一种意外幸运也就在这样的时候为我的努力进行了加冕。于是这就是一种意外的幸运,它以一种最奇怪的方式使得我拥有了这些文稿,因而我荣幸地在此向阅读着的关注者们展示这些文稿。在这些文稿中,我得到机会去审视进两个人的生活,这强化了我关于“那外在的不是那内在的”的怀疑。尤其是他们中的一个有着这样的情形。他的外在完全与他的内在相矛盾。而他们中另一个的情形在一定的程度上也是如此,只要他在一种较为无足轻重的外在之下隐藏起了一种更为意义重大的内在,那么他就是处在这样的矛盾中。也许,考虑到顺序,我最好还是先讲述一下,我是怎样获得这些文稿的。现在算来,差不多是在七年前,我在城里的一个旧货商家那里留意到一张文书写字柜“5”,一见之下,它就吸引了我的注意力。它不是出自现代的工艺,很陈旧,但它还是吸引住了我。要解说这一印象的依据,对于我来说是不可能的,但是大多数人在他们的生命中肯定也曾经历过类似的情形。我每天的路径使我经过那旧货商和他的柜桌,在任何一天经过那里时我都从不曾放过时机盯着它看。渐渐地,这个文书写字柜在我心中有了它的故事;看着它,对于我来说成了一种必然,到最后,即使是在我有必要走另一条路的时候,我也毫不犹豫地为它的缘故而绕一段远路。由于我总这样看它,它在我心中也渐渐唤醒一种想要拥有它的愿望。其实我完全能感觉到,这是一种奇怪的愿望,既然我并不需要这家具;对于我来说,买下它就是一种浪费。正如我们所知,愿望有着一种非常诡辩性的说服力。我去了那旧货商家,推说是询问一些别的东西,在我要离开的时候,我漫不经心地就那张文书写字柜问了一个非常低的价钱。我想着,那旧货商人可能会抬价。如果是那个价,那我就占了便宜。不管怎么说,我这样做不是为了钱的缘故,而是为了要在良心上说得过去。但没有成功,那旧货商人有着一种非同寻常的坚定。又是很长一段时间,我每天都去那里,然后以一种钟情着迷的目光看着这文书写字柜。你必须下决心,我寻思着,试想一下,如果它被卖掉了,那就太晚了;哪怕你终于又找到它,你也永远得不到对它的这种印象了。在我走进旧货商家的时候,我的心狂跳着。买下了它,付了钱。这是最后一次了,我想着,你这么浪费;对了,你买下它,这恰恰是一种幸运,因为你这么老是看着它,你就该想着你曾是多么浪费,以这个文书写字柜为起点,你生活中该有一个新的段落开始了。啊,愿望有着一种非常诡辩性的说服力,那些良好的意图总是现成地摆在那里。另外参看阿德勒尔(A.P.Adler)的《对黑格尔的客观逻辑的普及讲演》。“5”[文书写字柜(Secretair)] 法国式柜子,有着许多小的、有时是隐秘的抽屉用于保存文件,并且有一块垂直翻板可以拴出来并且当写字台用。"
    # print("原文本:", text)
    print("处理后的文本:", extrac(text))