Luke committed
Commit 03b6d75 • 1 Parent: 2d2df69
Commit message: no message

Files changed:
- IdentifyModel/cardModel.py +23 -12
- Plan/AiLLM.py +19 -4
- Plan/pytesseractOCR.py +3 -15
- Preprocess/preprocessImg.py +11 -1
- app.py +27 -48
- requirements.txt +1 -1
IdentifyModel/cardModel.py CHANGED
@@ -1,26 +1,37 @@
+import re
+
+
+def parse_id_card(text, validation_type, entities):
     if validation_type == "身分證正面":
+        # 正則表達式
+        birthdate_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
+        issue_date_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日(\S+)(?:補發|換發)"
+        unified_id_pattern = r"[A-Za-z]\d{9}"
+
+        birthdate = re.search(birthdate_pattern, text)
+        issue_date = re.search(issue_date_pattern, text)
+        unified_id = re.search(unified_id_pattern, text)
+
         result = {
             "解析全文內容": text,
-            "姓名": entities.get('B-PER', '無法解析')
-            "出生年月日":
-            "發證日期":
-            "統一編號":
+            "姓名": entities.get('B-PER', '無法解析'),
+            "出生年月日": birthdate.group() if birthdate else '無法解析',
+            "發證日期": issue_date.group() if issue_date else '無法解析',
+            "統一編號": unified_id.group() if unified_id else '無法解析'
         }
     elif validation_type == "身分證反面":
         result = {
             "解析全文內容": text,
-            "父": entities.get('B-FATHER', '無法解析')
-            "母": entities.get('B-MOTHER', '無法解析')
-            "配偶": entities.get('B-SPOUSE', '無法解析')
-            "出生地": entities.get('B-LOC', '無法解析')
-            "住址": entities.get('I-LOC', '無法解析')
-            "編號": entities.get('B-ID', '無法解析')
+            "父": entities.get('B-FATHER', '無法解析'),
+            "母": entities.get('B-MOTHER', '無法解析'),
+            "配偶": entities.get('B-SPOUSE', '無法解析'),
+            "出生地": entities.get('B-LOC', '無法解析'),
+            "住址": entities.get('I-LOC', '無法解析'),
+            "編號": entities.get('B-ID', '無法解析')
         }
     else:
         result = {
             "解析全文內容": text,
         }
 
-return result
+    return result
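The regex-based fields are plain `re` searches, so they can be sanity-checked without OCR in the loop (the previous version of this dict also left 出生年月日, 發證日期 and 統一編號 without values, so the hunk is a syntax fix as much as a feature). A minimal check; the sample string and its values below are invented, not from the commit:

    import re

    birthdate_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
    issue_date_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日(\S+)(?:補發|換發)"
    unified_id_pattern = r"[A-Za-z]\d{9}"

    # Invented text resembling OCR output from the front of an ID card.
    sample = "姓名 王小明 民國 80 年 5 月 12 日 統一編號 A123456789 民國 100 年 3 月 1 日北市換發"

    print(re.search(birthdate_pattern, sample).group())   # 民國 80 年 5 月 12 日
    print(re.search(unified_id_pattern, sample).group())  # A123456789
    print(re.search(issue_date_pattern, sample).group())  # 民國 100 年 3 月 1 日北市換發

Note the issue-date pattern only fires when the issuing office sits between 日 and 補發/換發 with no intervening whitespace; if OCR inserts a space there, the field falls back to 無法解析.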
Plan/AiLLM.py CHANGED
@@ -1,14 +1,29 @@
-import os
 import pytesseract
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 from IdentifyModel.cardModel import parse_id_card
+from transformers import BertTokenizer, BertForTokenClassification
+from transformers import pipeline
+
+# 加載預訓練模型和分詞器
+tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
+model = BertForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
 
 # 初始化 Taiwanese BERT 模型
-tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese")
-model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
 
 
+def extract_entities(text):
+    ner_results = ner_pipeline(text)
+    entities = {}
+    for result in ner_results:
+        entity = result['entity']
+        word = result['word']
+        if entity not in entities:
+            entities[entity] = word
+        else:
+            entities[entity] += word
+    return entities
+
+
 def llm_recognition(image, validation_type, language):
     text = pytesseract.image_to_string(image, lang=language)
     ner_results = ner_pipeline(text)
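extract_entities folds the token-level output of the transformers NER pipeline into one accumulated string per tag. The folding can be exercised without downloading the model by running the same loop over a mocked result list (the tokens below are invented; real pipeline items also carry score/index/start/end fields). One caveat: tag names such as 'B-FATHER', 'B-MOTHER' and 'B-ID' expected by parse_id_card look application-specific, and an off-the-shelf ckiplab NER model may emit a different label set, in which case those fields would always fall back to 無法解析.

    # Invented token-level results in the shape the transformers NER pipeline returns.
    mock_ner_results = [
        {'entity': 'B-PER', 'word': '王'},
        {'entity': 'B-PER', 'word': '小'},
        {'entity': 'B-PER', 'word': '明'},
    ]

    # The same folding loop as extract_entities, applied to the mock.
    entities = {}
    for result in mock_ner_results:
        entity = result['entity']
        word = result['word']
        if entity not in entities:
            entities[entity] = word
        else:
            entities[entity] += word

    print(entities)  # {'B-PER': '王小明'}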
Plan/pytesseractOCR.py CHANGED
@@ -1,26 +1,14 @@
-# import cv2
-import os
 import pytesseract
 
 from IdentifyModel.cardModel import parse_id_card
-from
+from Plan.AiLLM import extract_entities
 
 
 def ocr_recognition(image, validation_type, language):
     try:
         custom_config = r'--oem 3 --psm 6'
         text = pytesseract.image_to_string(image, lang=language, config=custom_config)
-
+        entities = extract_entities(text)
+        return parse_id_card(text, validation_type, entities)
     except Exception as e:
         return str(e)
-
-# def ocr_recognition_2(image: str, lang: str = 'chi_tra') -> str:
-#     try:
-#         img = cv2.imread(image)
-#         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-#         threshold_img = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)[1]
-#         result = pytesseract.image_to_string(threshold_img, lang=lang)
-#         os.remove(image)
-#         return result
-#     except Exception as e:
-#         return str(e)
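The substantive fix in this hunk is the return path: ocr_recognition previously computed text and then fell off the end of the try block, returning None on success; it now extracts entities and returns the parsed dict. A hedged usage sketch, assuming Tesseract plus the chi_tra language pack are installed, and that id_front.jpg is a sample scan (a hypothetical file, not in the repo):

    from PIL import Image
    from Plan.pytesseractOCR import ocr_recognition

    img = Image.open('id_front.jpg')  # hypothetical sample scan
    result = ocr_recognition(img, '身分證正面', 'chi_tra')
    print(result)  # dict with 解析全文內容, 姓名, 出生年月日, 發證日期, 統一編號

Note the new import also means the OCR path now loads the BERT model via Plan.AiLLM at import time, so the OCR-only flow pays that startup cost even when no LLM recognition is requested.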
Preprocess/preprocessImg.py CHANGED
@@ -16,4 +16,14 @@ def preprocess_image001(image):
     _, binary = cv2.threshold(np.array(enhanced_image), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
     # 去雜訊
     denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)
-    return Image.fromarray(denoised)
+    return Image.fromarray(denoised)
+
+
+def preprocess_image002(image):
+    # 將 PIL Image 轉換為 numpy array
+    image_np = np.array(image)
+    # 使用 OpenCV 進行預處理
+    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)  # 灰階化
+    gray = cv2.bilateralFilter(gray, 11, 17, 17)  # 雙邊濾波去噪
+    edged = cv2.Canny(gray, 30, 200)  # 邊緣檢測
+    return Image.fromarray(edged)
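preprocess_image002 yields a Canny edge map rather than a binarized page. Two things worth flagging: PIL arrays are RGB, so cv2.COLOR_RGB2GRAY is the strictly matching flag (COLOR_BGR2GRAY merely swaps the channel weights, a minor effect for grayscale), and edge maps are generally harder for Tesseract to read than the Otsu-binarized output of preprocess_image001. A self-contained smoke test; the function body is copied from the diff and the test image is synthetic:

    import numpy as np
    import cv2
    from PIL import Image

    def preprocess_image002(image):  # copied from the diff for a standalone run
        image_np = np.array(image)
        gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
        gray = cv2.bilateralFilter(gray, 11, 17, 17)
        edged = cv2.Canny(gray, 30, 200)
        return Image.fromarray(edged)

    canvas = np.full((120, 320, 3), 255, dtype=np.uint8)  # white canvas
    cv2.putText(canvas, 'TEST 123', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 0), 3)
    out = preprocess_image002(Image.fromarray(canvas))
    print(out.size, out.mode)  # (320, 120) L — a 0/255 edge map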
app.py CHANGED
@@ -1,80 +1,59 @@
 import os
 import gradio as gr
-import pytesseract
-
 from Plan.AiLLM import llm_recognition
 from Plan.pytesseractOCR import ocr_recognition
-from Preprocess.preprocessImg import preprocess_image001
+from Preprocess.preprocessImg import preprocess_image001, preprocess_image002
-
-langs = []
-
-choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
-
-# If you don't have tesseract executable in your PATH, include the following:
-# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
-# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
-
-# Simple image to string
-# print(pytesseract.image_to_string(Image.open('eurotext.png')))
-
-# # French text image to string
-# print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra'))
-
-# # Get bounding box estimates
-# print(pytesseract.image_to_boxes(Image.open('test.png')))
-
-# # Get verbose data including boxes, confidences, line and page numbers
-# print(pytesseract.image_to_data(Image.open('test.png')))
-
-# # Get information about orientation and script detection
-# print(pytesseract.image_to_osd(Image.open('test.png'))
 
 # 取得所有語言清單
 languages = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
 
-print(' ======================================================== ')
-# print(' ###### choices:' + choices)
-# print(' ###### GET ENV - TESSDATA_PREFIX:' + os.getenv('TESSDATA_PREFIX'))
-# print(' ###### OS - TESSDATA_PREFIX:' + os.environ['TESSDATA_PREFIX'])
-# os.environ['TESSDATA_PREFIX'] = os.getenv('TESSDATA_PREFIX')
-# print(' ###### Tesseract_Cmd:' + pytesseract.pytesseract.tesseract_cmd)
-# pytesseract.pytesseract.tesseract_cmd = os.getenv('TESSDATA_PREFIX')
-print(' ======================================================== ')
 
-def preprocess_and_ocr(image, validation_type, language):
-    preprocessed_image = preprocess_image001(image)
-    ocr_result = ocr_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, ocr_result
+def preprocess_and_ocr(image, valid_type, language):
+    # 方案一
+    pre_img_001 = preprocess_image001(image)
+    ocr_result_001 = ocr_recognition(pre_img_001, valid_type, language)
+    # 方案二
+    pre_img_002 = preprocess_image002(image)
+    ocr_result_002 = ocr_recognition(pre_img_002, valid_type, language)
+
+    return pre_img_001, pre_img_002, ocr_result_001, ocr_result_002
 
 
-    preprocessed_image = preprocess_image001(image)
-    llm_result = llm_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, llm_result
+def preprocess_and_llm(image, valid_type, language):
+    # 方案一
+    pre_img_001 = preprocess_image001(image)
+    llm_result_001 = llm_recognition(pre_img_001, valid_type, language)
+    # 方案二
+    pre_img_002 = preprocess_image002(image)
+    llm_result_002 = llm_recognition(pre_img_002, valid_type, language)
+
+    return pre_img_001, pre_img_002, llm_result_001, llm_result_002
 
 
 with gr.Blocks() as demo:
     with gr.Row():
         image_input = gr.Image(type="pil", label="上傳圖片")
+        preprocess_output_001 = gr.Image(type="pil", label="預處理後的圖片-方案一")
+        preprocess_output_002 = gr.Image(type="pil", label="預處理後的圖片-方案二")
 
     with gr.Row():
         validation_type = gr.Dropdown(choices=["身分證正面", "身分證反面"], label="驗證類別")
         language_dropdown = gr.Dropdown(choices=languages, value="chi_tra", label="語言")
+        # preprocessed_type = gr.Radio(["001", "002"], label="解析方案")
 
     with gr.Row():
         ocr_button = gr.Button("使用 OCR")
         llm_button = gr.Button("使用 AI LLM")
 
     with gr.Row():
+        ocr_output_001 = gr.JSON(label="OCR-001-解析結果")
+        ocr_output_002 = gr.JSON(label="OCR-002-解析結果")
+        llm_output_001 = gr.JSON(label="AiLLM-001 解析結果")
+        llm_output_002 = gr.JSON(label="AiLLM-002 解析結果")
 
     ocr_button.click(preprocess_and_ocr, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[
+                     outputs=[preprocess_output_001, preprocess_output_002, ocr_output_001, ocr_output_002])
     llm_button.click(preprocess_and_llm, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[
+                     outputs=[preprocess_output_001, preprocess_output_002, llm_output_001, llm_output_002])
 
 demo.launch(share=False)
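Each handler now returns a 4-tuple (two preprocessed images, then two parse results) that Gradio maps positionally onto the outputs list, so the order in outputs=[...] must mirror the return order, as it does in both click wirings above. Separately, demo.launch(share=False) still runs at import time; a main guard, a suggested tweak rather than part of this commit, would keep app.py importable for headless testing of the handlers:

    # Suggested, not in the commit: guard the launch so `import app` doesn't start the server.
    if __name__ == '__main__':
        demo.launch(share=False)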
requirements.txt CHANGED
@@ -4,4 +4,4 @@ transformers
 Pillow
 torch
 huggingface-hub
-opencv-python
+opencv-python