> ## Documentation Index
> Fetch the complete documentation index at: https://docs.landing.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Save Parsed Chunks as Images

> Save each parsed chunk as a separate PNG to build datasets or analyze chunk quality.

export const companyName = 'LandingAI';

export const extract = 'ADE Extract';

export const parse = 'ADE Parse';

export const ade = 'Agentic Document Extraction';

## Overview

Use this script to extract and save each parsed chunk as a separate PNG. This is useful for building datasets, analyzing chunk quality, or processing individual document regions.

<Info>
  These examples require the [Python](./ade-python) or [TypeScript](./ade-typescript) client library. Before running a script, set your API key and install the library and any required dependencies.
</Info>

## Scripts

<CodeGroup>
  ```python Python [expandable] theme={null}
  from pathlib import Path
  from datetime import datetime
  from landingai_ade import LandingAIADE
  from PIL import Image
  import pymupdf

  def save_chunks_as_images(parse_response, document_path, output_base_dir="groundings"):
      """Crop every parsed chunk out of its source page and save it as a PNG.

      Images are written to
      ``<output_base_dir>/<document stem>_<timestamp>/page_<n>/<type>.<id>.png``.

      Args:
          parse_response: Parse result. Each item in ``parse_response.chunks`` must
              expose ``type``, ``id``, ``grounding.page`` and ``grounding.box``
              (``left``/``top``/``right``/``bottom`` as fractions of the page size).
          document_path: Path (``str`` or ``Path``) of the document that was parsed.
              A ``.pdf`` suffix is rendered page by page; anything else is opened
              as a single image and treated as page 0.
          output_base_dir: Directory under which the timestamped output folder
              is created.

      Returns:
          Path of the timestamped output directory.
      """
      # Accept plain strings as well as Path objects; ``.suffix`` is needed below.
      document_path = Path(document_path)

      # Create timestamped output directory
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
      output_dir = Path(output_base_dir) / f"{document_path.stem}_{timestamp}"

      def save_page_chunks(image, chunks, page_num):
          """Save all chunks that are grounded on ``page_num`` of ``image``."""
          img_width, img_height = image.size

          # Create page-specific directory
          page_dir = output_dir / f"page_{page_num}"
          page_dir.mkdir(parents=True, exist_ok=True)

          for chunk in chunks:
              # Check if chunk belongs to this page
              if chunk.grounding.page != page_num:
                  continue

              box = chunk.grounding.box

              # Convert normalized coordinates to pixel coordinates, clamped to
              # the page so a slightly out-of-range box cannot crop past the edge.
              x1 = min(max(int(box.left * img_width), 0), img_width)
              y1 = min(max(int(box.top * img_height), 0), img_height)
              x2 = min(max(int(box.right * img_width), 0), img_width)
              y2 = min(max(int(box.bottom * img_height), 0), img_height)

              # A box that collapses to zero width or height has no pixels to save.
              if x2 <= x1 or y2 <= y1:
                  print(f"Skipping empty chunk: {chunk.type}.{chunk.id}")
                  continue

              # Crop the chunk region
              chunk_img = image.crop((x1, y1, x2, y2))

              # Save with descriptive filename
              output_path = page_dir / f"{chunk.type}.{chunk.id}.png"
              chunk_img.save(output_path)

              print(f"Saved chunk: {output_path}")

      if document_path.suffix.lower() == '.pdf':
          pdf = pymupdf.open(document_path)
          try:
              for page_num in range(len(pdf)):
                  page = pdf[page_num]
                  pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2))  # 2x scaling for clarity
                  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                  # Save chunks for this page
                  save_page_chunks(img, parse_response.chunks, page_num)
          finally:
              # Release the PDF handle even if rendering or saving fails.
              pdf.close()
      else:
          # Load image file directly
          img = Image.open(document_path)
          if img.mode != "RGB":
              img = img.convert("RGB")

          # Save chunks for single page
          save_page_chunks(img, parse_response.chunks, 0)

      print(f"\nAll chunks saved to: {output_dir}")
      return output_dir

  # The client reads the API key from the VISION_AGENT_API_KEY environment variable
  client = LandingAIADE()

  # Point this at the document you want to process
  document_path = Path("/path/to/file/document")

  # Run the parse with the dpt-2-latest model
  print("Parsing document...")
  parse_response = client.parse(document=document_path, model="dpt-2-latest")
  print("Parsing complete!")

  # Write one PNG per parsed chunk
  save_chunks_as_images(parse_response, document_path)
  ```

  ```typescript TypeScript [expandable] theme={null}
  import LandingAIADE from "landingai-ade";
  import fs from "fs";
  import path from "path";
  import { createCanvas, loadImage } from "canvas";
  import { pdf } from "pdf-to-img";

  /**
   * Crop every parsed chunk out of its source page and save it as a PNG.
   *
   * Images are written to
   * `<outputBaseDir>/<document name>_<timestamp>/page_<n>/<type>.<id>.png`.
   *
   * @param parseResponse Parse result; each item in `parseResponse.chunks` must
   *   expose `type`, `id`, `grounding.page` and `grounding.box`
   *   (`left`/`top`/`right`/`bottom` as fractions of the page size).
   * @param documentPath Path of the document that was parsed. A `.pdf` extension
   *   is rendered page by page; anything else is read as a single image (page 0).
   * @param outputBaseDir Directory under which the timestamped folder is created.
   * @returns The path of the timestamped output directory.
   */
  async function saveChunksAsImages(
    parseResponse: any,
    documentPath: string,
    outputBaseDir: string = "groundings"
  ): Promise<string> {
    // Create timestamped output directory. toISOString() is UTC; ":" and "." are
    // replaced so the name is filesystem-safe, then the "-mmmZ" tail is dropped.
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, -5);
    const documentName = path.basename(documentPath, path.extname(documentPath));
    const outputDir = path.join(outputBaseDir, `${documentName}_${timestamp}`);

    // Keep a pixel coordinate inside [0, max].
    const clamp = (value: number, max: number): number =>
      Math.min(Math.max(value, 0), max);

    /** Save all chunks that are grounded on `pageNum` of the given page image. */
    async function savePageChunks(
      imageBuffer: Buffer,
      chunks: any[],
      pageNum: number
    ): Promise<void> {
      // Load the page image
      const image = await loadImage(imageBuffer);
      const imgWidth = image.width;
      const imgHeight = image.height;

      // Create page-specific directory (recursive mkdir is a no-op if it exists)
      const pageDir = path.join(outputDir, `page_${pageNum}`);
      fs.mkdirSync(pageDir, { recursive: true });

      // Process each chunk
      for (const chunk of chunks) {
        // Check if chunk belongs to this page
        if (chunk.grounding.page !== pageNum) {
          continue;
        }

        const box = chunk.grounding.box;

        // Convert normalized coordinates to pixel coordinates, clamped to the
        // page so a slightly out-of-range box cannot crop past the edge.
        const x1 = clamp(Math.floor(box.left * imgWidth), imgWidth);
        const y1 = clamp(Math.floor(box.top * imgHeight), imgHeight);
        const x2 = clamp(Math.floor(box.right * imgWidth), imgWidth);
        const y2 = clamp(Math.floor(box.bottom * imgHeight), imgHeight);

        // Calculate crop dimensions
        const width = x2 - x1;
        const height = y2 - y1;

        // A box that collapses to zero width or height has no pixels to save.
        if (width <= 0 || height <= 0) {
          console.log(`Skipping empty chunk: ${chunk.type}.${chunk.id}`);
          continue;
        }

        // Create canvas for cropped chunk
        const canvas = createCanvas(width, height);
        const ctx = canvas.getContext("2d");

        // Draw the cropped region
        ctx.drawImage(image, x1, y1, width, height, 0, 0, width, height);

        // Save with descriptive filename
        const outputPath = path.join(pageDir, `${chunk.type}.${chunk.id}.png`);
        fs.writeFileSync(outputPath, canvas.toBuffer("image/png"));

        console.log(`Saved chunk: ${outputPath}`);
      }
    }

    const fileExtension = path.extname(documentPath).toLowerCase();

    if (fileExtension === ".pdf") {
      // Convert PDF to images
      const document = await pdf(documentPath, { scale: 2.0 });

      let pageNum = 0;
      for await (const page of document) {
        console.log(`Processing page ${pageNum}...`);
        await savePageChunks(page, parseResponse.chunks, pageNum);
        pageNum++;
      }
    } else {
      // Load image file directly
      const imageBuffer = fs.readFileSync(documentPath);
      await savePageChunks(imageBuffer, parseResponse.chunks, 0);
    }

    console.log(`\nAll chunks saved to: ${outputDir}`);
    return outputDir;
  }

  // Initialize client (uses the API key from the VISION_AGENT_API_KEY environment variable)
  const client = new LandingAIADE();

  /**
   * Parse the document at the configured path, then save each parsed chunk
   * as a separate image.
   */
  async function extractChunks(): Promise<void> {
    // Replace with your file path
    const documentPath = "/path/to/file/document";

    // Parse the document
    console.log("Parsing document...");
    const parseResponse = await client.parse({
      document: fs.createReadStream(documentPath),
      model: "dpt-2-latest"
    });
    console.log("Parsing complete!");

    // Save each chunk as a separate image
    await saveChunksAsImages(parseResponse, documentPath);
  }

  // Handle rejections explicitly so a failed parse or save is reported with
  // context and the process exits with a failing status.
  extractChunks().catch((error: unknown) => {
    console.error("Failed to save chunks as images:", error);
    process.exitCode = 1;
  });
  ```
</CodeGroup>

## Directory Structure for Saved Images

Images are saved with this structure:

```
groundings/
└── document_TIMESTAMP/
    └── page_0/
        └── ChunkType.CHUNK_ID.png
```

Where:

* `TIMESTAMP` is the date and time the script saved the images (Python: `YYYYMMDD_HHMMSS` in local time; TypeScript: `YYYY-MM-DDTHH-MM-SS` in UTC)
* `page_0` is the zero-indexed page number
* `ChunkType` is the [chunk type](./ade-chunk-types)
* `CHUNK_ID` is the unique chunk identifier (UUID format)

Example output (Python timestamp format):

```
groundings/
└── document_20250117_143022/
    ├── page_0/
    │   ├── text.c5f81e1b-37d2-46bf-89e1-4983c1a36444.png
    │   ├── table.a2b91c3d-48e5-4f67-9123-5678abcdef12.png
    │   └── figure.e9f12345-6789-4abc-def0-123456789abc.png
    └── page_1/
        ├── text.f1a23456-7890-4bcd-ef12-3456789abcde.png
        └── marginalia.b3c45678-9012-4def-5678-90abcdef1234.png
```