|
import base64
import io
import os

import gradio as gr
from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.settings import Settings
|
|
|
|
|
# Load the models once at import time; the request handler below reuses
# this module-level list on every call instead of reloading it.
model_list = load_all_models()
|
|
|
def parse_pdf_and_return_markdown(pdf_file: str, extract_images: bool):
    """Convert an uploaded PDF and optionally return its images as base64.

    Args:
        pdf_file: Path of the uploaded PDF. The ``gr.File`` input wired to
            this function is declared with ``type="filepath"``, so a path
            string is what arrives here rather than raw bytes.
        extract_images: When true, every image returned by the converter is
            PNG-encoded and included in the result as a base64 string.

    Returns:
        A ``(full_text, out_meta, image_data)`` tuple. ``full_text`` and
        ``out_meta`` are passed through unchanged from
        ``convert_single_pdf``; ``image_data`` maps each image filename to
        its base64-encoded PNG bytes and is empty when ``extract_images``
        is false.

    Raises:
        ValueError: If no file was supplied (``None`` or an empty path).
    """
    # Submitting the form without an upload hands us None; fail with a clear
    # message instead of an obscure error from deep inside the converter.
    if not pdf_file:
        raise ValueError("No PDF file was provided.")

    full_text, images, out_meta = convert_single_pdf(pdf_file, model_list)

    image_data = {}
    if extract_images:
        for filename, image in images.items():
            # Encode in memory. Writing to the working directory under the
            # image's own name would leak files when an error occurs and
            # could collide between concurrent requests.
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            image_data[filename] = base64.b64encode(buffer.getvalue()).decode("utf-8")

    return full_text, out_meta, image_data
|
|
|
|
|
# Build the web UI: three explanatory markdown blurbs followed by an
# Interface that routes the uploaded PDF and the checkbox state to
# parse_pdf_and_return_markdown and shows its three return values.
with gr.Blocks() as server:
    gr.Markdown("# Marker: A PDF to Markdown Converter")
    gr.Markdown("This is a tool that converts a PDF file to markdown. It uses a combination of OCR and NLP to extract text and images from the PDF.")
    gr.Markdown("The images are returned as base64 encoded strings. You can use PIL to convert them back to images.")
    gr.Interface(
        fn=parse_pdf_and_return_markdown,
        # type="filepath" means the handler receives a path, not file bytes.
        inputs=[
            gr.File(label="Upload PDF", type="filepath"),
            gr.Checkbox(label="Extract Images"),
        ],
        # One output component per element of the handler's returned tuple.
        outputs=[
            gr.Text(label="Markdown"),
            gr.JSON(label="Metadata"),
            gr.JSON(label="Images"),
        ],
    )


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    server.launch()