Luke committed on
Commit 03b6d75
1 parent: 2d2df69

no message

IdentifyModel/cardModel.py CHANGED
@@ -1,26 +1,37 @@
+import re
 
-def parse_id_card(text, validation_type, entities=None):
+
+def parse_id_card(text, validation_type, entities):
     if validation_type == "身分證正面":
+        # Regular expression patterns
+        birthdate_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
+        issue_date_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日(\S+)(?:補發|換發)"
+        unified_id_pattern = r"[A-Za-z]\d{9}"
+
+        birthdate = re.search(birthdate_pattern, text)
+        issue_date = re.search(issue_date_pattern, text)
+        unified_id = re.search(unified_id_pattern, text)
+
         result = {
             "解析全文內容": text,
-            "姓名": entities.get('B-PER', '無法解析') if entities else '無法解析',
-            "出生年月日": entities.get('B-DATE', '無法解析') if entities else '無法解析',
-            "發證日期": entities.get('I-DATE', '無法解析') if entities else '無法解析',
-            "統一編號": entities.get('B-NUM', '無法解析') if entities else '無法解析'
+            "姓名": entities.get('B-PER', '無法解析'),
+            "出生年月日": birthdate.group() if birthdate else '無法解析',
+            "發證日期": issue_date.group() if issue_date else '無法解析',
+            "統一編號": unified_id.group() if unified_id else '無法解析'
         }
     elif validation_type == "身分證反面":
         result = {
             "解析全文內容": text,
-            "父": entities.get('B-FATHER', '無法解析') if entities else '無法解析',
-            "母": entities.get('B-MOTHER', '無法解析') if entities else '無法解析',
-            "配偶": entities.get('B-SPOUSE', '無法解析') if entities else '無法解析',
-            "出生地": entities.get('B-LOC', '無法解析') if entities else '無法解析',
-            "住址": entities.get('I-LOC', '無法解析') if entities else '無法解析',
-            "編號": entities.get('B-ID', '無法解析') if entities else '無法解析'
+            "父": entities.get('B-FATHER', '無法解析'),
+            "母": entities.get('B-MOTHER', '無法解析'),
+            "配偶": entities.get('B-SPOUSE', '無法解析'),
+            "出生地": entities.get('B-LOC', '無法解析'),
+            "住址": entities.get('I-LOC', '無法解析'),
+            "編號": entities.get('B-ID', '無法解析')
         }
     else:
         result = {
             "解析全文內容": text,
         }
 
-    return result
+    return result
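Reviewer note: `re.search` returns the first match in the text, so `birthdate_pattern` picks up the earlier of the two dates, while the issue-date pattern additionally requires a contiguous 補發/換發 suffix after the date. A minimal sanity check, assuming hypothetical OCR output like the sample string below:

    from IdentifyModel.cardModel import parse_id_card

    # Hypothetical OCR text and NER entities for an ID-card front.
    sample_text = "姓名 林小明 民國85年7月1日 民國103年6月5日(北市)補發 統一編號 A123456789"
    result = parse_id_card(sample_text, "身分證正面", {'B-PER': '林小明'})

    print(result["出生年月日"])  # 民國85年7月1日 — first date found
    print(result["發證日期"])    # 民國103年6月5日(北市)補發
    print(result["統一編號"])    # A123456789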
Plan/AiLLM.py CHANGED
@@ -1,14 +1,29 @@
-import os
 import pytesseract
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 from IdentifyModel.cardModel import parse_id_card
+from transformers import BertTokenizer, BertForTokenClassification
+from transformers import pipeline
+
+# Load the pretrained model and tokenizer
+tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
+model = BertForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
 
 # Initialize the Taiwanese BERT model
-tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese")
-model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
 
 
+def extract_entities(text):
+    ner_results = ner_pipeline(text)
+    entities = {}
+    for result in ner_results:
+        entity = result['entity']
+        word = result['word']
+        if entity not in entities:
+            entities[entity] = word
+        else:
+            entities[entity] += word
+    return entities
+
+
 def llm_recognition(image, validation_type, language):
     text = pytesseract.image_to_string(image, lang=language)
     ner_results = ner_pipeline(text)
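The new `extract_entities` helper folds the pipeline's token-level results into one string per raw BIO tag. An illustration of that fold — the sample dicts are simplified (real token-classification pipeline output also carries 'score', 'index', 'start', and 'end', and WordPiece continuations may be prefixed with '##'), and the tag names below simply mirror the keys cardModel.py looks up; the actual label set depends on the model's config:

    # Illustrative token-level NER output (simplified).
    sample_results = [
        {'entity': 'B-PER', 'word': '林'},
        {'entity': 'I-PER', 'word': '小'},
        {'entity': 'I-PER', 'word': '明'},
    ]

    entities = {}
    for result in sample_results:  # same fold as extract_entities
        entities[result['entity']] = entities.get(result['entity'], '') + result['word']

    print(entities)  # {'B-PER': '林', 'I-PER': '小明'} — B- and I- tags stay separate keys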
Plan/pytesseractOCR.py CHANGED
@@ -1,26 +1,14 @@
-# import cv2
-import os
 import pytesseract
 
 from IdentifyModel.cardModel import parse_id_card
-from Preprocess.preprocessImg import preprocess_image001
+from Plan.AiLLM import extract_entities
 
 
 def ocr_recognition(image, validation_type, language):
     try:
         custom_config = r'--oem 3 --psm 6'
         text = pytesseract.image_to_string(image, lang=language, config=custom_config)
-        return parse_id_card(text, validation_type)
+        entities = extract_entities(text)
+        return parse_id_card(text, validation_type, entities)
     except Exception as e:
         return str(e)
-
-# def ocr_recognition_2(image: str, lang: str = 'chi_tra') -> str:
-#     try:
-#         img = cv2.imread(image)
-#         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-#         threshold_img = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)[1]
-#         result = pytesseract.image_to_string(threshold_img, lang=lang)
-#         os.remove(image)
-#         return result
-#     except Exception as e:
-#         return str(e)
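With this change the OCR path runs its own text through the NER helper before parsing, so both buttons share one extraction flow. A usage sketch — the sample path is hypothetical:

    from PIL import Image

    from Plan.pytesseractOCR import ocr_recognition

    image = Image.open("samples/id_front.jpg")  # hypothetical sample file
    result = ocr_recognition(image, "身分證正面", "chi_tra")
    print(result)  # parsed dict on success; the exception message string on failure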
Preprocess/preprocessImg.py CHANGED
@@ -16,4 +16,14 @@ def preprocess_image001(image):
     _, binary = cv2.threshold(np.array(enhanced_image), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
     # Denoise
     denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)
-    return Image.fromarray(denoised)
+    return Image.fromarray(denoised)
+
+
+def preprocess_image002(image):
+    # Convert the PIL Image to a numpy array
+    image_np = np.array(image)
+    # Preprocess with OpenCV
+    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)  # grayscale
+    gray = cv2.bilateralFilter(gray, 11, 17, 17)  # bilateral-filter denoising
+    edged = cv2.Canny(gray, 30, 200)  # edge detection
+    return Image.fromarray(edged)
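Worth flagging that the two plans hand Tesseract very different inputs: `preprocess_image001` yields an Otsu-binarized, denoised page, while `preprocess_image002` yields a Canny edge map. A quick side-by-side sketch, assuming a hypothetical sample image:

    from PIL import Image

    from Preprocess.preprocessImg import preprocess_image001, preprocess_image002

    image = Image.open("samples/id_front.jpg")  # hypothetical sample file
    preprocess_image001(image).save("plan1_binarized.png")  # binarized + denoised page
    preprocess_image002(image).save("plan2_edges.png")      # edge map (outlines only)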
app.py CHANGED
@@ -1,80 +1,59 @@
 import os
 import gradio as gr
-import pytesseract
-
 from Plan.AiLLM import llm_recognition
 from Plan.pytesseractOCR import ocr_recognition
-from Preprocess.preprocessImg import preprocess_image001
-
-langs = []
-
-choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
-
-# If you don't have tesseract executable in your PATH, include the following:
-# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
-# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
-
-# Simple image to string
-# print(pytesseract.image_to_string(Image.open('eurotext.png')))
-
-# # French text image to string
-# print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra'))
-
-# # Get bounding box estimates
-# print(pytesseract.image_to_boxes(Image.open('test.png')))
-
-# # Get verbose data including boxes, confidences, line and page numbers
-# print(pytesseract.image_to_data(Image.open('test.png')))
-
-# # Get information about orientation and script detection
-# print(pytesseract.image_to_osd(Image.open('test.png'))
-
+from Preprocess.preprocessImg import preprocess_image001, preprocess_image002
 
 # Get the list of all available languages
 languages = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
 
-print(' ======================================================== ')
-# print(' ###### choices:' + choices)
-# print(' ###### GET ENV - TESSDATA_PREFIX:' + os.getenv('TESSDATA_PREFIX'))
-# print(' ###### OS - TESSDATA_PREFIX:' + os.environ['TESSDATA_PREFIX'])
-# os.environ['TESSDATA_PREFIX'] = os.getenv('TESSDATA_PREFIX')
-# print(' ###### Tesseract_Cmd:' + pytesseract.pytesseract.tesseract_cmd)
-# pytesseract.pytesseract.tesseract_cmd = os.getenv('TESSDATA_PREFIX')
-print(' ======================================================== ')
-
 
-def preprocess_and_ocr(image, validation_type, language):
-    preprocessed_image = preprocess_image001(image)
-    ocr_result = ocr_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, ocr_result
+def preprocess_and_ocr(image, valid_type, language):
+    # Plan 1
+    pre_img_001 = preprocess_image001(image)
+    ocr_result_001 = ocr_recognition(pre_img_001, valid_type, language)
+    # Plan 2
+    pre_img_002 = preprocess_image002(image)
+    ocr_result_002 = ocr_recognition(pre_img_002, valid_type, language)
+
+    return pre_img_001, pre_img_002, ocr_result_001, ocr_result_002
 
 
-def preprocess_and_llm(image, validation_type, language):
-    preprocessed_image = preprocess_image001(image)
-    llm_result = llm_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, llm_result
+def preprocess_and_llm(image, valid_type, language):
+    # Plan 1
+    pre_img_001 = preprocess_image001(image)
+    llm_result_001 = llm_recognition(pre_img_001, valid_type, language)
+    # Plan 2
+    pre_img_002 = preprocess_image002(image)
+    llm_result_002 = llm_recognition(pre_img_002, valid_type, language)
+
+    return pre_img_001, pre_img_002, llm_result_001, llm_result_002
 
 
 with gr.Blocks() as demo:
     with gr.Row():
         image_input = gr.Image(type="pil", label="上傳圖片")
-        preprocess_output = gr.Image(type="pil", label="預處理後的圖片")
+        preprocess_output_001 = gr.Image(type="pil", label="預處理後的圖片-方案一")
+        preprocess_output_002 = gr.Image(type="pil", label="預處理後的圖片-方案二")
 
     with gr.Row():
         validation_type = gr.Dropdown(choices=["身分證正面", "身分證反面"], label="驗證類別")
         language_dropdown = gr.Dropdown(choices=languages, value="chi_tra", label="語言")
+        # preprocessed_type = gr.Radio(["001", "002"], label="解析方案")
 
     with gr.Row():
         ocr_button = gr.Button("使用 OCR")
         llm_button = gr.Button("使用 AI LLM")
 
     with gr.Row():
-        ocr_output = gr.JSON(label="OCR 解析結果")
-        llm_output = gr.JSON(label="AI LLM 解析結果")
+        ocr_output_001 = gr.JSON(label="OCR-001-解析結果")
+        ocr_output_002 = gr.JSON(label="OCR-002-解析結果")
+        llm_output_001 = gr.JSON(label="AiLLM-001 解析結果")
+        llm_output_002 = gr.JSON(label="AiLLM-002 解析結果")
 
     ocr_button.click(preprocess_and_ocr, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[preprocess_output, ocr_output])
+                     outputs=[preprocess_output_001, preprocess_output_002, ocr_output_001, ocr_output_002])
     llm_button.click(preprocess_and_llm, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[preprocess_output, llm_output])
+                     outputs=[preprocess_output_001, preprocess_output_002, llm_output_001, llm_output_002])
 
 demo.launch(share=False)
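Note that Gradio maps each handler's returned 4-tuple onto its `outputs` list positionally, so the tuple order in `preprocess_and_ocr` and `preprocess_and_llm` must keep matching `[preprocess_output_001, preprocess_output_002, …]` exactly.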
requirements.txt CHANGED
@@ -4,4 +4,4 @@ transformers
 Pillow
 torch
 huggingface-hub
-opencv-python
+opencv-python