Spaces:

hungdungn47
/

MultiDocsSummarization

Running

App Files Community

hungdungn47 committed on Aug 7

Commit

31521f5

•

1 Parent(s): 74d656b

fix chdg infer

Browse files

Files changed (1) hide show

chdg_inference.py +14 -6

chdg_inference.py CHANGED Viewed

@@ -287,7 +287,7 @@ def meanTokenVecs(text):
             buffer, buffer_str = [], ''
         else:
             wordVecs[token[0]] = token[1]
     return torch.mean(torch.stack([vec for w, vec in wordVecs.items() if w not in string.punctuation]), dim=0)
 def getPositionEncoding(pos, d=768, n=10000):
@@ -299,7 +299,6 @@ def getPositionEncoding(pos, d=768, n=10000):
     return P
 PositionVec = torch.stack([torch.from_numpy(getPositionEncoding(i, d=768)) for i in range(200)], dim=0).float().to(device)
 stop_w = ['...']
 with open('./vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as f:
     for w in f.readlines():
@@ -348,9 +347,19 @@ def loadClusterData(docs_org, category): # docs_org: list of text for each docum
     for d, doc in enumerate(docs_org):
         seclist[d], sentTexts = divideSection(doc, category)
         docs.append(sentTexts)
     sents, sentVecs, secIDs, doc_lens = [], [], [], []
-    secnum = 4
     sentnum = sum([len(doc.values()) for doc in seclist.values()])
     doc_sec_mask = np.zeros((len(docs), secnum))
     sec_sen_mask = np.zeros((secnum, sentnum))
@@ -366,7 +375,7 @@ def loadClusterData(docs_org, category): # docs_org: list of text for each docum
             sentVecs.append(meanTokenVecs(sent))
             sec_sen_mask[seclist[d][s], cursent] = 1
             cursent += 1
     return Cluster(sents, sentVecs, doc_lens, doc_sec_mask, sec_sen_mask)
 def val_e2e(data, model, max_word_num=200, c_model=None):
@@ -414,7 +423,6 @@ c_model.load_state_dict(torch.load('./c_25_0.3071.mdl', map_location=device), st
 def infer(docs, category):
     # docs = [text.strip() for text in full_text.split('<><><><><>')]
     docs = [text.strip() for text in docs]
-    print(docs)
     data_tree = loadClusterData(docs, category)
     summ = val_e2e(data_tree, model, c_model=c_model, max_word_num=200)
     summ = re.sub(r'\s+([.,;:"?()/!?])', r'\1', summ.replace('_', ' '))

             buffer, buffer_str = [], ''
         else:
             wordVecs[token[0]] = token[1]
     return torch.mean(torch.stack([vec for w, vec in wordVecs.items() if w not in string.punctuation]), dim=0)
 def getPositionEncoding(pos, d=768, n=10000):
     return P
 PositionVec = torch.stack([torch.from_numpy(getPositionEncoding(i, d=768)) for i in range(200)], dim=0).float().to(device)
 stop_w = ['...']
 with open('./vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as f:
     for w in f.readlines():
     for d, doc in enumerate(docs_org):
         seclist[d], sentTexts = divideSection(doc, category)
         docs.append(sentTexts)
+    secnum = 0
+    for k, val_dict in seclist.items():
+        vals = set(val_dict.values())
+        for ki, vi in val_dict.items():
+            for i, v in enumerate(vals):
+                if vi == v:
+                    val_dict[ki] = i + secnum
+                    break
+        seclist[k] = val_dict
+        secnum += len(vals)
     sents, sentVecs, secIDs, doc_lens = [], [], [], []
     sentnum = sum([len(doc.values()) for doc in seclist.values()])
     doc_sec_mask = np.zeros((len(docs), secnum))
     sec_sen_mask = np.zeros((secnum, sentnum))
             sentVecs.append(meanTokenVecs(sent))
             sec_sen_mask[seclist[d][s], cursent] = 1
             cursent += 1
     return Cluster(sents, sentVecs, doc_lens, doc_sec_mask, sec_sen_mask)
 def val_e2e(data, model, max_word_num=200, c_model=None):
 def infer(docs, category):
     # docs = [text.strip() for text in full_text.split('<><><><><>')]
     docs = [text.strip() for text in docs]
     data_tree = loadClusterData(docs, category)
     summ = val_e2e(data_tree, model, c_model=c_model, max_word_num=200)
     summ = re.sub(r'\s+([.,;:"?()/!?])', r'\1', summ.replace('_', ' '))