Skip to main content

Overview

This tutorial walks you through how to parse a document, extract a subset of fields, and then connect the fields back to their original locations in the document. A single script covers the full workflow; running it saves a PNG image of each extracted field's location in the document.

Scenario and Materials

Scripts

import requests
import io
import pymupdf

# --- Step 1: Parse the document ---
# Auth header for the Landing AI ADE API (replace YOUR_API_KEY).
headers = {
    'Authorization': 'Bearer YOUR_API_KEY'
}

# Parsing endpoint.
url_parse = 'https://api.va.landing.ai/v1/ade/parse'

# Model selection for the parse call.
data_parse = {'model': 'dpt-2-latest'}

# Upload and parse the document. A context manager guarantees the file
# handle is closed even if the request raises (the original left it open).
with open('wire-transfer.pdf', 'rb') as document:
    files = {'document': document}
    response = requests.post(url_parse, files=files, data=data_parse, headers=headers)

# Fail fast with a clear HTTPError instead of a confusing KeyError
# below when the API returns an error payload.
response.raise_for_status()

# Decode the JSON response body.
response_parse = response.json()

# Fields used downstream: the document rendered as markdown, and the
# per-chunk metadata (chunk ids plus their grounding boxes).
markdown = response_parse['markdown']
chunks = response_parse['chunks']

# --- Step 2: Extract structured fields from the parsed markdown ---
# Extraction endpoint.
url_extract = 'https://api.va.landing.ai/v1/ade/extract'

# Read the extraction schema (a JSON document) as a plain string.
with open('schema-wire-transfer.json', 'r') as f:
    schema_content = f.read()

# The markdown from the parse step is uploaded as a file-like object;
# the schema string and model name travel in the form data.
files_extract = {'markdown': io.StringIO(markdown)}
data_extract = {'schema': schema_content, 'model': 'extract-latest'}

# Run extraction.
response_extract = requests.post(url_extract, files=files_extract, data=data_extract, headers=headers)

# Fail fast on HTTP errors instead of raising KeyError on a missing
# 'extraction' key further down.
response_extract.raise_for_status()

# Decode the extraction response.
response_extraction = response_extract.json()

# --- Step 3: Connect extracted fields back to their document locations ---
# Re-open the original PDF so field regions can be cropped out of its pages.
pdf = pymupdf.open('wire-transfer.pdf')
try:
    extraction = response_extraction['extraction']
    extraction_metadata = response_extraction['extraction_metadata']

    # For each extracted field, find the chunks it was derived from and
    # save a crop of each chunk's grounded region as a PNG.
    for field_name, field_value in extraction.items():
        # Chunk IDs that contain this field's data.
        refs = extraction_metadata[field_name]['references']
        # The parse-step chunks matching those IDs.
        ref_chunks = [chunk for chunk in chunks if chunk['id'] in refs]

        for chunk_index, chunk in enumerate(ref_chunks):
            # NOTE(review): assumes 'grounding' is a single mapping with
            # 'page' and normalized 'box' coordinates — confirm against
            # the parse response schema.
            grounding = chunk['grounding']
            # Render the page at 72 DPI (1 pixel per PDF point).
            page_image = pdf[grounding['page']].get_pixmap(dpi=72)

            # Convert normalized coordinates (0-1) to pixel coordinates.
            left = int(grounding['box']['left'] * page_image.width)
            right = int(grounding['box']['right'] * page_image.width)
            top = int(grounding['box']['top'] * page_image.height)
            bottom = int(grounding['box']['bottom'] * page_image.height)

            # Crop the region and save it. The chunk index is part of the
            # filename: the original used crop_{field_name}.png, so a field
            # referenced by several chunks kept only the last crop.
            chunk_crop = page_image.pil_image().crop((left, top, right, bottom))
            chunk_crop.save(f"crop_{field_name}_{chunk_index}.png")
finally:
    # Close the PDF even if a crop fails (the original skipped close on error).
    pdf.close()