harir committed on
Commit
8cb156b
1 Parent(s): 6f7e68b

add parser.py

Browse files
Files changed (1) hide show
  1. parser.py +28 -0
parser.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import random
3
+ import string
4
+ import nltk
5
+
6
+
7
def replacement1(review, regex_list):
    """Mask every regex match in ``review`` with a random placeholder.

    Args:
        review: Text to scan.
        regex_list: Iterable of regular-expression patterns. Each one is
            applied case-insensitively, in order, to the progressively
            masked text.

    Returns:
        A ``(masked_review, replaced_dict)`` tuple. ``replaced_dict`` maps
        each 10-character alphanumeric placeholder to the exact text it
        replaced, in the order the substitutions were made.
    """
    alphabet = string.ascii_letters + string.digits
    replaced_dict = {}

    def _mask(found):
        # Use the whole match, not a capture group: for patterns that contain
        # groups, the group text can be only a fragment of the span to hide
        # (or empty when an optional group did not participate).
        original = found.group(0)
        if not original:
            # Replacing an empty string would inject a placeholder between
            # every character of the text, so leave empty matches untouched.
            return original
        placeholder = ''.join(random.choices(alphabet, k=10))
        replaced_dict[placeholder] = original
        return placeholder

    for regex in regex_list:
        review = re.sub(regex, _mask, review, flags=re.IGNORECASE)
    return review, replaced_dict
16
+
17
def replacement2(sentences, replaced_dict):
    """Put the original text back in place of each placeholder.

    Args:
        sentences: List of strings; it is updated in place.
        replaced_dict: Mapping of placeholder string to the original text it
            stands for, in the order the substitutions were made.

    Returns:
        The same ``sentences`` list, with every placeholder occurrence
        replaced by its original text.
    """
    # Undo substitutions newest-first. A later substitution may have replaced
    # text inside an earlier placeholder; restoring in reverse order rebuilds
    # that earlier placeholder before its own turn comes, so it does not leak
    # into the output.
    restore_order = list(replaced_dict.items())[::-1]
    for i, sentence in enumerate(sentences):
        for randomized, original in restore_order:
            sentence = sentence.replace(randomized, original)
        sentences[i] = sentence
    return sentences
22
+
23
def parse_sentences(review):
    """Split ``review`` into sentences, shielding abbreviations and quotes.

    Spans matching the patterns below are swapped for random placeholders
    before ``nltk.sent_tokenize`` runs, so the periods they contain are not
    seen by the tokenizer. The placeholders are swapped back afterwards.

    Args:
        review: Text to split.

    Returns:
        The list of sentences produced by ``nltk.sent_tokenize``, with the
        masked spans restored.
    """
    regex_list = [
        # Literal dots are escaped so an abbreviation pattern cannot swallow
        # an arbitrary following character (e.g. the "?" in "fig?").
        r'et al\.',
        r'"(.*?)"',
        r"'(.*?)'",
        r'e\.g\.',
        r'Sec\.',
        r'Sec \d+(\.\d+)?\.',
        r'w\.r\.t\.',
        # NOTE(review): kept byte-identical; the "." here matches any
        # character. Possibly meant ``eq\.`` -- confirm before changing.
        r'e.q',
        r'fig\.',
    ]
    review, replaced_dict = replacement1(review, regex_list)
    sentences = nltk.sent_tokenize(review)
    sentences = replacement2(sentences, replaced_dict)
    return sentences