rizgiak committed on
Commit
995955c
1 Parent(s): f6621ba

uncomment table detection

Browse files
Files changed (1) hide show
  1. app.py +48 -49
app.py CHANGED
@@ -640,61 +640,60 @@ class TableExtractionPipeline():
640
 
641
  caption_ocr_res = await asyncio.gather(*sequential_caption_img_list)
642
  flag_caption_pos = 0 # 0=top, 1=bottom
643
- for idx, caption_text in enumerate(caption_ocr_res):
644
- if caption_text == "" or "table" not in caption_text.lower() or "表" not in caption_text.lower():
645
- if idx%2==0:
646
- flag_caption_pos=1
647
- break
648
 
649
  for idx, caption_text in enumerate(caption_ocr_res):
650
  if idx%2==flag_caption_pos:
651
  c3.text(str(idx) + "_" + caption_text)
652
 
653
 
654
- # for idx, unpadded_table in enumerate(cropped_img_list):
655
-
656
- # table = self.add_padding(unpadded_table, padd_top, padd_right,
657
- # padd_bottom, padd_left)
658
- # # table = super_res(table)
659
- # # table = binarizeBlur_image(table)
660
- # # table = sharpen_image(table) # Test sharpen image next
661
- # # table = td_postprocess(table)
662
-
663
- # # table.save("result"+str(idx)+".png")
664
-
665
- # probas, bboxes_scaled = table_struct_recog(
666
- # table, THRESHOLD_PROBA=TSR_THRESHOLD)
667
- # rows, cols = self.generate_structure(c2, table_recognition_model,
668
- # table, probas, bboxes_scaled,
669
- # expand_rowcol_bbox_top,
670
- # expand_rowcol_bbox_bottom)
671
- # # st.write(len(rows), len(cols))
672
- # rows, cols = self.sort_table_featuresv2(rows, cols)
673
- # master_row, cols = self.individual_table_featuresv2(
674
- # table, rows, cols)
675
-
676
- # cells_img, max_cols, max_rows = self.object_to_cellsv2(
677
- # master_row, cols, expand_rowcol_bbox_top,
678
- # expand_rowcol_bbox_bottom, padd_left)
679
-
680
- # sequential_cell_img_list = []
681
- # for k, img_list in cells_img.items():
682
- # for img in img_list:
683
- # # img = super_res(img)
684
- # # img = sharpen_image(img) # Test sharpen image next
685
- # # img = binarizeBlur_image(img)
686
- # # img = self.add_padding(img, 10,10,10,10)
687
- # # plt.imshow(img)
688
- # # c3.pyplot()
689
- # sequential_cell_img_list.append(
690
- # pytess(cell_pil_img=img, threshold=OCR_THRESHOLD))
691
-
692
- # cell_ocr_res = await asyncio.gather(*sequential_cell_img_list)
693
-
694
- # self.create_dataframe(c3, cell_ocr_res, max_cols, max_rows)
695
- # st.write(
696
- # 'Errors in OCR is due to either quality of the image or performance of the OCR'
697
- # )
698
  # except:
699
  # st.write('Either incorrectly identified table or no table, to debug remove try/except')
700
  # break
 
640
 
641
  caption_ocr_res = await asyncio.gather(*sequential_caption_img_list)
642
  flag_caption_pos = 0 # 0=top, 1=bottom
643
+ if "table" in caption_ocr_res[0].lower() or "表" in caption_ocr_res[0]:
644
+ flag_caption_pos=0
645
+ else:
646
+ flag_caption_pos=1
 
647
 
648
  for idx, caption_text in enumerate(caption_ocr_res):
649
  if idx%2==flag_caption_pos:
650
  c3.text(str(idx) + "_" + caption_text)
651
 
652
 
653
+ for idx, unpadded_table in enumerate(cropped_img_list):
654
+
655
+ table = self.add_padding(unpadded_table, padd_top, padd_right,
656
+ padd_bottom, padd_left)
657
+ # table = super_res(table)
658
+ # table = binarizeBlur_image(table)
659
+ # table = sharpen_image(table) # Test sharpen image next
660
+ # table = td_postprocess(table)
661
+
662
+ # table.save("result"+str(idx)+".png")
663
+
664
+ probas, bboxes_scaled = table_struct_recog(
665
+ table, THRESHOLD_PROBA=TSR_THRESHOLD)
666
+ rows, cols = self.generate_structure(c2, table_recognition_model,
667
+ table, probas, bboxes_scaled,
668
+ expand_rowcol_bbox_top,
669
+ expand_rowcol_bbox_bottom)
670
+ # st.write(len(rows), len(cols))
671
+ rows, cols = self.sort_table_featuresv2(rows, cols)
672
+ master_row, cols = self.individual_table_featuresv2(
673
+ table, rows, cols)
674
+
675
+ cells_img, max_cols, max_rows = self.object_to_cellsv2(
676
+ master_row, cols, expand_rowcol_bbox_top,
677
+ expand_rowcol_bbox_bottom, padd_left)
678
+
679
+ sequential_cell_img_list = []
680
+ for k, img_list in cells_img.items():
681
+ for img in img_list:
682
+ # img = super_res(img)
683
+ # img = sharpen_image(img) # Test sharpen image next
684
+ # img = binarizeBlur_image(img)
685
+ # img = self.add_padding(img, 10,10,10,10)
686
+ # plt.imshow(img)
687
+ # c3.pyplot()
688
+ sequential_cell_img_list.append(
689
+ pytess(cell_pil_img=img, threshold=OCR_THRESHOLD))
690
+
691
+ cell_ocr_res = await asyncio.gather(*sequential_cell_img_list)
692
+
693
+ self.create_dataframe(c3, cell_ocr_res, max_cols, max_rows)
694
+ st.write(
695
+ 'Errors in OCR is due to either quality of the image or performance of the OCR'
696
+ )
697
  # except:
698
  # st.write('Either incorrectly identified table or no table, to debug remove try/except')
699
  # break