Spaces:

huggingchat
/

pdf-to-markdown

Running on Zero

App Files Files Community

Liam Dyer committed on May 22

Commit

312add7

•

1 Parent(s): 4199c92

rewrite with pypdf and ocrmypdf

Browse files

Files changed (3) hide show

app.py +18 -67
packages.txt +2 -0
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,88 +1,39 @@
 import spaces
 import gradio as gr
-import surya.detection as detection
-import surya.layout as layout
-import os
-import base64
-# Monkey patch to prevent spawning processes
-# Replaces surya's detection.batch_text_detection with a version that
-# post-processes every image in a plain loop.
-def batch_text_detection(images, model, processor, batch_size=None):
-    # Run detection for the whole batch in a single call.
-    preds, orig_sizes = detection.batch_detection(
-        images, model, processor, batch_size=batch_size
-    )
-    results = []
-    # Post-process each image's prediction with parallel_get_lines, one
-    # image at a time and in input order.
-    for i in range(len(images)):
-        result = detection.parallel_get_lines(preds[i], orig_sizes[i])
-        results.append(result)
-    # One result per input image.
-    return results
-# Install the replacement on the imported surya.detection module.
-detection.batch_text_detection = batch_text_detection
-# Layout counterpart of the text-detection patch: replaces
-# layout.batch_layout_detection with a version that post-processes every
-# image in a plain loop.
-def batch_layout_detection(
-    images, model, processor, detection_results=None, batch_size=None
-):
-    # Run layout detection for the whole batch in a single call.
-    preds, orig_sizes = layout.batch_detection(
-        images, model, processor, batch_size=batch_size
-    )
-    # Label mapping taken from the model's config and passed to every
-    # parallel_get_regions call below.
-    id2label = model.config.id2label
-    results = []
-    for i in range(len(images)):
-        result = layout.parallel_get_regions(
-            preds[i],
-            orig_sizes[i],
-            id2label,
-            # Pass the matching detection result when the caller supplied
-            # any; otherwise pass None.
-            detection_results[i] if detection_results else None,
-        )
-        results.append(result)
-    # One result per input image, in input order.
-    return results
-# Install the replacement on the imported surya.layout module.
-layout.batch_layout_detection = batch_layout_detection
-from marker.convert import convert_single_pdf
-from marker.models import load_all_models
-# Load the marker models once at import time; convert() reuses this list.
-model_list = load_all_models()
 @spaces.GPU
-def convert(pdf_file, extract_images):
-    # Convert a PDF with marker and return (full_text, out_meta, image_data).
-    # image_data maps each image filename to its base64-encoded PNG bytes and
-    # stays empty unless extract_images is truthy.
-    global model_list
-    full_text, images, out_meta = convert_single_pdf(
-        pdf_file, model_list, batch_multiplier=16
-    )
-    image_data = {}
-    if extract_images:
-        for filename, image in images.items():
-            # Round-trip through a file on disk: save as PNG, read the bytes
-            # back, base64-encode them, then delete the file.
-            image.save(filename, "PNG")
-            with open(filename, "rb") as f:
-                image_bytes = f.read()
-            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-            image_data[filename] = image_base64
-            os.remove(filename)
-    return full_text, out_meta, image_data
-# Gradio UI wired to convert(): a PDF upload and an "Extract Images"
-# checkbox as inputs; Markdown text, metadata JSON and images JSON as
-# outputs. Built and launched at module level.
 gr.Interface(
     convert,
     inputs=[
         gr.File(label="Upload PDF", type="filepath"),
-        gr.Checkbox(label="Extract Images"),
     ],
     outputs=[
         gr.Text(label="Markdown"),
         gr.JSON(label="Metadata"),
-        gr.JSON(label="Images"),
     ],
 ).launch()

 import spaces
 import gradio as gr
+from pypdf import PdfReader
+import ocrmypdf
 @spaces.GPU
+def convert(pdf_file):
+    reader = PdfReader(pdf_file)
+    # Check if there are any images
+    image_count = 0
+    for page in reader.pages:
+        image_count += len(page.images)
+    # If there are images, perform OCR on the document
+    if image_count > 0:
+        out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
+        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
+        pdf_file = out_pdf_file
+    # Extract text
+    full_text = ""
+    for idx, page in enumerate(reader.pages):
+        full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
+    return full_text, reader.metadata
+# Gradio UI wired to convert(): a single PDF upload as input; the extracted
+# text and the metadata JSON as outputs. Built and launched at module level.
 gr.Interface(
     convert,
     inputs=[
         gr.File(label="Upload PDF", type="filepath"),
     ],
     outputs=[
         gr.Text(label="Markdown"),
         gr.JSON(label="Metadata"),
     ],
 ).launch()

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ocrmypdf
2	+ tesseract-ocr-eng

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- marker-pdf==0.2.5


1	+ ocrmypdf==16.3.1
2	+ pypdf==4.2.0