uncomment table detection
Browse files
app.py
CHANGED
@@ -640,61 +640,60 @@ class TableExtractionPipeline():
|
|
640 |
|
641 |
caption_ocr_res = await asyncio.gather(*sequential_caption_img_list)
|
642 |
flag_caption_pos = 0 # 0=top, 1=bottom
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
break
|
648 |
|
649 |
for idx, caption_text in enumerate(caption_ocr_res):
|
650 |
if idx%2==flag_caption_pos:
|
651 |
c3.text(str(idx) + "_" + caption_text)
|
652 |
|
653 |
|
654 |
-
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
# except:
|
699 |
# st.write('Either incorrectly identified table or no table, to debug remove try/except')
|
700 |
# break
|
|
|
640 |
|
641 |
caption_ocr_res = await asyncio.gather(*sequential_caption_img_list)
|
642 |
flag_caption_pos = 0 # 0=top, 1=bottom
|
643 |
+
if "table" in caption_ocr_res[0].lower() or "表" in caption_ocr_res[0]:
|
644 |
+
flag_caption_pos=0
|
645 |
+
else:
|
646 |
+
flag_caption_pos=1
|
|
|
647 |
|
648 |
for idx, caption_text in enumerate(caption_ocr_res):
|
649 |
if idx%2==flag_caption_pos:
|
650 |
c3.text(str(idx) + "_" + caption_text)
|
651 |
|
652 |
|
653 |
+
for idx, unpadded_table in enumerate(cropped_img_list):
|
654 |
+
|
655 |
+
table = self.add_padding(unpadded_table, padd_top, padd_right,
|
656 |
+
padd_bottom, padd_left)
|
657 |
+
# table = super_res(table)
|
658 |
+
# table = binarizeBlur_image(table)
|
659 |
+
# table = sharpen_image(table) # Test sharpen image next
|
660 |
+
# table = td_postprocess(table)
|
661 |
+
|
662 |
+
# table.save("result"+str(idx)+".png")
|
663 |
+
|
664 |
+
probas, bboxes_scaled = table_struct_recog(
|
665 |
+
table, THRESHOLD_PROBA=TSR_THRESHOLD)
|
666 |
+
rows, cols = self.generate_structure(c2, table_recognition_model,
|
667 |
+
table, probas, bboxes_scaled,
|
668 |
+
expand_rowcol_bbox_top,
|
669 |
+
expand_rowcol_bbox_bottom)
|
670 |
+
# st.write(len(rows), len(cols))
|
671 |
+
rows, cols = self.sort_table_featuresv2(rows, cols)
|
672 |
+
master_row, cols = self.individual_table_featuresv2(
|
673 |
+
table, rows, cols)
|
674 |
+
|
675 |
+
cells_img, max_cols, max_rows = self.object_to_cellsv2(
|
676 |
+
master_row, cols, expand_rowcol_bbox_top,
|
677 |
+
expand_rowcol_bbox_bottom, padd_left)
|
678 |
+
|
679 |
+
sequential_cell_img_list = []
|
680 |
+
for k, img_list in cells_img.items():
|
681 |
+
for img in img_list:
|
682 |
+
# img = super_res(img)
|
683 |
+
# img = sharpen_image(img) # Test sharpen image next
|
684 |
+
# img = binarizeBlur_image(img)
|
685 |
+
# img = self.add_padding(img, 10,10,10,10)
|
686 |
+
# plt.imshow(img)
|
687 |
+
# c3.pyplot()
|
688 |
+
sequential_cell_img_list.append(
|
689 |
+
pytess(cell_pil_img=img, threshold=OCR_THRESHOLD))
|
690 |
+
|
691 |
+
cell_ocr_res = await asyncio.gather(*sequential_cell_img_list)
|
692 |
+
|
693 |
+
self.create_dataframe(c3, cell_ocr_res, max_cols, max_rows)
|
694 |
+
st.write(
|
695 |
+
'Errors in OCR is due to either quality of the image or performance of the OCR'
|
696 |
+
)
|
697 |
# except:
|
698 |
# st.write('Either incorrectly identified table or no table, to debug remove try/except')
|
699 |
# break
|