Luke committed on
Commit 03b6d75
1 parent: 2d2df69

no message

IdentifyModel/cardModel.py CHANGED
@@ -1,26 +1,37 @@
+import re
 
-def parse_id_card(text, validation_type, entities=None):
+
+def parse_id_card(text, validation_type, entities):
     if validation_type == "身分證正面":
+        # Regular expression patterns
+        birthdate_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
+        issue_date_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日(\S+)(?:補發|換發)"
+        unified_id_pattern = r"[A-Za-z]\d{9}"
+
+        birthdate = re.search(birthdate_pattern, text)
+        issue_date = re.search(issue_date_pattern, text)
+        unified_id = re.search(unified_id_pattern, text)
+
         result = {
             "解析全文內容": text,
-            "姓名": entities.get('B-PER', '無法解析') if entities else '無法解析',
-            "出生年月日": entities.get('B-DATE', '無法解析') if entities else '無法解析',
-            "發證日期": entities.get('I-DATE', '無法解析') if entities else '無法解析',
-            "統一編號": entities.get('B-NUM', '無法解析') if entities else '無法解析'
+            "姓名": entities.get('B-PER', '無法解析'),
+            "出生年月日": birthdate.group() if birthdate else '無法解析',
+            "發證日期": issue_date.group() if issue_date else '無法解析',
+            "統一編號": unified_id.group() if unified_id else '無法解析'
         }
     elif validation_type == "身分證反面":
         result = {
             "解析全文內容": text,
-            "父": entities.get('B-FATHER', '無法解析') if entities else '無法解析',
-            "母": entities.get('B-MOTHER', '無法解析') if entities else '無法解析',
-            "配偶": entities.get('B-SPOUSE', '無法解析') if entities else '無法解析',
-            "出生地": entities.get('B-LOC', '無法解析') if entities else '無法解析',
-            "住址": entities.get('I-LOC', '無法解析') if entities else '無法解析',
-            "編號": entities.get('B-ID', '無法解析') if entities else '無法解析'
+            "父": entities.get('B-FATHER', '無法解析'),
+            "母": entities.get('B-MOTHER', '無法解析'),
+            "配偶": entities.get('B-SPOUSE', '無法解析'),
+            "出生地": entities.get('B-LOC', '無法解析'),
+            "住址": entities.get('I-LOC', '無法解析'),
+            "編號": entities.get('B-ID', '無法解析')
         }
     else:
         result = {
             "解析全文內容": text,
         }
 
-    return result
+    return result
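Reviewer note: `re.search` returns the first match in the text, so `birthdate_pattern` picks up the earlier of the two dates, while the issue-date pattern additionally requires a contiguous 補發/換發 suffix after the date. A minimal sanity check, assuming hypothetical OCR output like the sample string below:

    from IdentifyModel.cardModel import parse_id_card

    # Hypothetical OCR text and NER entities for an ID-card front.
    sample_text = "姓名 林小明 民國85年7月1日 民國103年6月5日(北市)補發 統一編號 A123456789"
    result = parse_id_card(sample_text, "身分證正面", {'B-PER': '林小明'})

    print(result["出生年月日"])  # 民國85年7月1日 — first date found
    print(result["發證日期"])    # 民國103年6月5日(北市)補發
    print(result["統一編號"])    # A123456789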
Plan/AiLLM.py CHANGED
@@ -1,14 +1,29 @@
-import os
 import pytesseract
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 from IdentifyModel.cardModel import parse_id_card
+from transformers import BertTokenizer, BertForTokenClassification
+from transformers import pipeline
+
+# Load the pretrained model and tokenizer
+tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
+model = BertForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
 
 # Initialize the Taiwanese BERT model
-tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese")
-model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
 
 
+def extract_entities(text):
+    ner_results = ner_pipeline(text)
+    entities = {}
+    for result in ner_results:
+        entity = result['entity']
+        word = result['word']
+        if entity not in entities:
+            entities[entity] = word
+        else:
+            entities[entity] += word
+    return entities
+
+
 def llm_recognition(image, validation_type, language):
     text = pytesseract.image_to_string(image, lang=language)
     ner_results = ner_pipeline(text)
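The new `extract_entities` helper folds the pipeline's token-level results into one string per raw BIO tag. An illustration of that fold — the sample dicts are simplified (real token-classification pipeline output also carries 'score', 'index', 'start', and 'end', and WordPiece continuations may be prefixed with '##'), and the tag names below simply mirror the keys cardModel.py looks up; the actual label set depends on the model's config:

    # Illustrative token-level NER output (simplified).
    sample_results = [
        {'entity': 'B-PER', 'word': '林'},
        {'entity': 'I-PER', 'word': '小'},
        {'entity': 'I-PER', 'word': '明'},
    ]

    entities = {}
    for result in sample_results:  # same fold as extract_entities
        entities[result['entity']] = entities.get(result['entity'], '') + result['word']

    print(entities)  # {'B-PER': '林', 'I-PER': '小明'} — B- and I- tags stay separate keys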
Plan/pytesseractOCR.py CHANGED
@@ -1,26 +1,14 @@
-# import cv2
-import os
 import pytesseract
 
 from IdentifyModel.cardModel import parse_id_card
-from Preprocess.preprocessImg import preprocess_image001
+from Plan.AiLLM import extract_entities
 
 
 def ocr_recognition(image, validation_type, language):
     try:
         custom_config = r'--oem 3 --psm 6'
         text = pytesseract.image_to_string(image, lang=language, config=custom_config)
-        return parse_id_card(text, validation_type)
+        entities = extract_entities(text)
+        return parse_id_card(text, validation_type, entities)
     except Exception as e:
         return str(e)
-
-# def ocr_recognition_2(image: str, lang: str = 'chi_tra') -> str:
-#     try:
-#         img = cv2.imread(image)
-#         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-#         threshold_img = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)[1]
-#         result = pytesseract.image_to_string(threshold_img, lang=lang)
-#         os.remove(image)
-#         return result
-#     except Exception as e:
-#         return str(e)
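With this change the OCR path runs its own text through the NER helper before parsing, so both buttons share one extraction flow. A usage sketch — the sample path is hypothetical:

    from PIL import Image

    from Plan.pytesseractOCR import ocr_recognition

    image = Image.open("samples/id_front.jpg")  # hypothetical sample file
    result = ocr_recognition(image, "身分證正面", "chi_tra")
    print(result)  # parsed dict on success; the exception message string on failure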
Preprocess/preprocessImg.py CHANGED
@@ -16,4 +16,14 @@ def preprocess_image001(image):
     _, binary = cv2.threshold(np.array(enhanced_image), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
     # Denoise
     denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)
-    return Image.fromarray(denoised)
+    return Image.fromarray(denoised)
+
+
+def preprocess_image002(image):
+    # Convert the PIL Image to a numpy array
+    image_np = np.array(image)
+    # Preprocess with OpenCV
+    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)  # grayscale
+    gray = cv2.bilateralFilter(gray, 11, 17, 17)  # bilateral-filter denoising
+    edged = cv2.Canny(gray, 30, 200)  # edge detection
+    return Image.fromarray(edged)
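Worth flagging that the two plans hand Tesseract very different inputs: `preprocess_image001` yields an Otsu-binarized, denoised page, while `preprocess_image002` yields a Canny edge map. A quick side-by-side sketch, assuming a hypothetical sample image:

    from PIL import Image

    from Preprocess.preprocessImg import preprocess_image001, preprocess_image002

    image = Image.open("samples/id_front.jpg")  # hypothetical sample file
    preprocess_image001(image).save("plan1_binarized.png")  # binarized + denoised page
    preprocess_image002(image).save("plan2_edges.png")      # edge map (outlines only)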
app.py CHANGED
@@ -1,80 +1,59 @@
 import os
 import gradio as gr
-import pytesseract
-
 from Plan.AiLLM import llm_recognition
 from Plan.pytesseractOCR import ocr_recognition
-from Preprocess.preprocessImg import preprocess_image001
-
-langs = []
-
-choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
-
-# If you don't have tesseract executable in your PATH, include the following:
-# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
-# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
-
-# Simple image to string
-# print(pytesseract.image_to_string(Image.open('eurotext.png')))
-
-# # French text image to string
-# print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra'))
-
-# # Get bounding box estimates
-# print(pytesseract.image_to_boxes(Image.open('test.png')))
-
-# # Get verbose data including boxes, confidences, line and page numbers
-# print(pytesseract.image_to_data(Image.open('test.png')))
-
-# # Get information about orientation and script detection
-# print(pytesseract.image_to_osd(Image.open('test.png'))
-
+from Preprocess.preprocessImg import preprocess_image001, preprocess_image002
 
 # Get the list of all available languages
 languages = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
 
-print(' ======================================================== ')
-# print(' ###### choices:' + choices)
-# print(' ###### GET ENV - TESSDATA_PREFIX:' + os.getenv('TESSDATA_PREFIX'))
-# print(' ###### OS - TESSDATA_PREFIX:' + os.environ['TESSDATA_PREFIX'])
-# os.environ['TESSDATA_PREFIX'] = os.getenv('TESSDATA_PREFIX')
-# print(' ###### Tesseract_Cmd:' + pytesseract.pytesseract.tesseract_cmd)
-# pytesseract.pytesseract.tesseract_cmd = os.getenv('TESSDATA_PREFIX')
-print(' ======================================================== ')
-
 
-def preprocess_and_ocr(image, validation_type, language):
-    preprocessed_image = preprocess_image001(image)
-    ocr_result = ocr_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, ocr_result
+def preprocess_and_ocr(image, valid_type, language):
+    # Plan 1
+    pre_img_001 = preprocess_image001(image)
+    ocr_result_001 = ocr_recognition(pre_img_001, valid_type, language)
+    # Plan 2
+    pre_img_002 = preprocess_image002(image)
+    ocr_result_002 = ocr_recognition(pre_img_002, valid_type, language)
+
+    return pre_img_001, pre_img_002, ocr_result_001, ocr_result_002
 
 
-def preprocess_and_llm(image, validation_type, language):
-    preprocessed_image = preprocess_image001(image)
-    llm_result = llm_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, llm_result
+def preprocess_and_llm(image, valid_type, language):
+    # Plan 1
+    pre_img_001 = preprocess_image001(image)
+    llm_result_001 = llm_recognition(pre_img_001, valid_type, language)
+    # Plan 2
+    pre_img_002 = preprocess_image002(image)
+    llm_result_002 = llm_recognition(pre_img_002, valid_type, language)
+
+    return pre_img_001, pre_img_002, llm_result_001, llm_result_002
 
 
 with gr.Blocks() as demo:
     with gr.Row():
         image_input = gr.Image(type="pil", label="上傳圖片")
-        preprocess_output = gr.Image(type="pil", label="預處理後的圖片")
+        preprocess_output_001 = gr.Image(type="pil", label="預處理後的圖片-方案一")
+        preprocess_output_002 = gr.Image(type="pil", label="預處理後的圖片-方案二")
 
     with gr.Row():
         validation_type = gr.Dropdown(choices=["身分證正面", "身分證反面"], label="驗證類別")
         language_dropdown = gr.Dropdown(choices=languages, value="chi_tra", label="語言")
+        # preprocessed_type = gr.Radio(["001", "002"], label="解析方案")
 
     with gr.Row():
         ocr_button = gr.Button("使用 OCR")
         llm_button = gr.Button("使用 AI LLM")
 
     with gr.Row():
-        ocr_output = gr.JSON(label="OCR 解析結果")
-        llm_output = gr.JSON(label="AI LLM 解析結果")
+        ocr_output_001 = gr.JSON(label="OCR-001-解析結果")
+        ocr_output_002 = gr.JSON(label="OCR-002-解析結果")
+        llm_output_001 = gr.JSON(label="AiLLM-001 解析結果")
+        llm_output_002 = gr.JSON(label="AiLLM-002 解析結果")
 
     ocr_button.click(preprocess_and_ocr, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[preprocess_output, ocr_output])
+                     outputs=[preprocess_output_001, preprocess_output_002, ocr_output_001, ocr_output_002])
     llm_button.click(preprocess_and_llm, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[preprocess_output, llm_output])
+                     outputs=[preprocess_output_001, preprocess_output_002, llm_output_001, llm_output_002])
 
 demo.launch(share=False)
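Note that Gradio maps each handler's returned 4-tuple onto its `outputs` list positionally, so the tuple order in `preprocess_and_ocr` and `preprocess_and_llm` must keep matching `[preprocess_output_001, preprocess_output_002, …]` exactly.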
requirements.txt CHANGED
@@ -4,4 +4,4 @@ transformers
 Pillow
 torch
 huggingface-hub
-opencv-python
+opencv-python