Skip to main content

Extract Data from PDFs using Vectorize Iris

note

The API is currently in Beta.

Learn how to use the Vectorize API to extract text from unstructured data (PDFs, documents, images, and more) using Vectorize Iris.

Prerequisites

Before you begin, you'll need:

  1. A Vectorize account
  2. An API access token (how to create one)
  3. Your organization ID (see below)

Finding your Organization ID

Your organization ID is in the Vectorize platform URL:

https://platform.vectorize.io/organization/[YOUR-ORG-ID]

For example, if your URL is:

https://platform.vectorize.io/organization/ecf3fa1d-30d0-4df1-8af6-f4852bc851cb

Your organization ID is: ecf3fa1d-30d0-4df1-8af6-f4852bc851cb

Start the extraction

First, we need to upload the file that we want to extract text from.

import vectorize_client as v
import os
import urllib3

# Create API instances
files_api = v.FilesApi(apiClient)
extraction_api = v.ExtractionApi(apiClient)

# File to extract
file_path = str(actual_test_file)  # Use actual test file for testing
content_type = "application/pdf"

# Start file upload: the response carries the upload URL and the file ID
# that is passed to the extraction request further down.
start_file_upload_response = files_api.start_file_upload(
    organization_id,
    start_file_upload_request=v.StartFileUploadRequest(
        content_type=content_type,
        name="My file.pdf",
    ),
)

# Upload the file
http = urllib3.PoolManager()

with open(file_path, "rb") as f:
    response = http.request(
        "PUT",
        start_file_upload_response.upload_url,
        body=f,
        headers={
            "Content-Type": content_type,
            "Content-Length": str(os.path.getsize(file_path)),
        },
    )

# Stop on a failed upload: starting an extraction for a file whose content
# never arrived cannot succeed.
if response.status != 200:
    raise RuntimeError(f"Upload failed: {response.data!r}")
print("Upload successful")

# Start extraction
response = extraction_api.start_extraction(
    organization_id,
    start_extraction_request=v.StartExtractionRequest(
        file_id=start_file_upload_response.file_id,
    ),
)
extraction_id = response.extraction_id

Get the Extraction result

Extraction runs asynchronously. Use the extraction ID to check the status and retrieve your results.

import vectorize_client as v
import time

extraction_api = v.ExtractionApi(apiClient)

# Poll until the service reports the extraction as ready.
response = extraction_api.get_extraction_result(organization_id, extraction_id)
while not response.ready:
    print("Extraction in progress...")
    time.sleep(2)  # Wait 2 seconds before checking again
    response = extraction_api.get_extraction_result(organization_id, extraction_id)

# Ready: report either the extracted text or the error.
result = response.data
if result.success:
    print(result.text)
else:
    print("Extraction failed:", result.error)

Complete Example

Here's all the code from this guide combined into a complete, runnable example:

Required Environment Variables:
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`

Required Files:
• `document.pdf` — A PDF file to extract text from
#!/usr/bin/env python3
"""
Complete example for extracting text from PDF documents using Vectorize Iris.
This is a hand-written example that corresponds to the test file:
api-clients/python/tests/pipelines/extract_pdf_data_using_iris.py

IMPORTANT: Keep this file in sync with the test file's snippets!
"""

import os
import sys
import time
import tempfile
import urllib3
import vectorize_client as v


def get_api_config():
    """Get API configuration from environment variables.

    Reads ``VECTORIZE_ORGANIZATION_ID`` and ``VECTORIZE_API_KEY``. If either
    is unset or empty, prints setup instructions and exits the process with
    status 1.

    Returns:
        A ``(configuration, organization_id)`` tuple, where ``configuration``
        is a ``v.Configuration`` built with the production host and the API
        key as access token.
    """
    organization_id = os.environ.get("VECTORIZE_ORGANIZATION_ID")
    api_key = os.environ.get("VECTORIZE_API_KEY")

    if not organization_id or not api_key:
        print("🔑 Setup required:")
        print("1. Get your API key from: https://app.vectorize.io/settings")
        print("2. Set environment variables:")
        print(" export VECTORIZE_ORGANIZATION_ID='your-org-id'")
        print(" export VECTORIZE_API_KEY='your-api-key'")
        sys.exit(1)

    # Always use production API
    configuration = v.Configuration(
        host="https://api.vectorize.io/v1",
        access_token=api_key,
    )

    return configuration, organization_id


def create_sample_pdf():
    """Write the demo document to a temporary ``.txt`` file and return its path.

    The file is plain text standing in for a real PDF. It is created with
    ``delete=False``, so it stays on disk after being closed and the caller
    must remove it.
    """
    # A real PDF is not easy to generate here, so plain text is used instead.
    # In real usage, supply your own PDF file.
    document_text = """Sample PDF Document for Text Extraction

This is a sample document that demonstrates text extraction using Vectorize Iris.

Key Features:
- Automatic text extraction from PDF documents
- OCR capabilities for scanned documents
- Support for multi-page documents
- Structured data extraction

Benefits:
1. Fast processing times
2. High accuracy text extraction
3. Support for various document formats
4. Easy API integration

For more information about Vectorize Iris capabilities, visit our documentation.

Sample Data Section:
- Document Title: Sample PDF Document
- Author: Vectorize Example
- Pages: 1
- Created: 2024
"""

    handle = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
    with handle as out:
        out.write(document_text)

    # The name remains valid after the handle is closed.
    return handle.name


def start_extraction(api_client, organization_id, file_path,
                     content_type="text/plain", name="My file.pdf"):
    """Upload a file and start text extraction.

    Args:
        api_client: The ``v.ApiClient`` used to build the Files and
            Extraction API instances.
        organization_id: ID of the organization that owns the upload.
        file_path: Path of the local file to upload.
        content_type: MIME type declared for the upload. Defaults to
            ``"text/plain"`` because the demo uploads a text file; use
            ``"application/pdf"`` for a real PDF.
        name: File name registered with the upload request.

    Returns:
        The extraction ID, or ``None`` if the upload did not return
        HTTP 200.
    """
    # Create API instances
    files_api = v.FilesApi(api_client)
    extraction_api = v.ExtractionApi(api_client)

    # Start file upload: the response carries the upload URL and file ID.
    start_file_upload_response = files_api.start_file_upload(
        organization_id,
        start_file_upload_request=v.StartFileUploadRequest(
            content_type=content_type,
            name=name,
        ),
    )

    # Upload the file
    http = urllib3.PoolManager()

    with open(file_path, "rb") as f:
        upload_response = http.request(
            "PUT",
            start_file_upload_response.upload_url,
            body=f,
            headers={
                "Content-Type": content_type,
                "Content-Length": str(os.path.getsize(file_path)),
            },
        )

    # Without a successful upload there is nothing to extract.
    if upload_response.status != 200:
        print("Upload failed:", upload_response.data)
        return None
    print("Upload successful")

    # Start extraction
    response = extraction_api.start_extraction(
        organization_id,
        start_extraction_request=v.StartExtractionRequest(
            file_id=start_file_upload_response.file_id,
        ),
    )
    return response.extraction_id


def get_extraction_result(api_client, organization_id, extraction_id,
                          poll_interval=2, timeout=None):
    """Wait for and retrieve extraction results.

    Polls the extraction endpoint until the result is reported as ready.

    Args:
        api_client: The ``v.ApiClient`` used to build the Extraction API.
        organization_id: ID of the organization that owns the extraction.
        extraction_id: ID of the extraction to poll.
        poll_interval: Seconds to sleep between polls. Defaults to 2.
        timeout: Maximum number of seconds to keep polling, or ``None``
            (the default) to wait indefinitely.

    Returns:
        The extracted text on success, or ``None`` if the extraction
        finished with an error.

    Raises:
        TimeoutError: If ``timeout`` is set and the result is still not
            ready once that many seconds have elapsed.
    """
    extraction_api = v.ExtractionApi(api_client)
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        response = extraction_api.get_extraction_result(organization_id, extraction_id)
        if response.ready:
            if response.data.success:
                print(response.data.text)
                return response.data.text
            print("Extraction failed:", response.data.error)
            return None

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Extraction {extraction_id} not ready after {timeout} seconds"
            )

        print("Extraction in progress...")
        time.sleep(poll_interval)  # Wait before checking again


def main():
    """Main function demonstrating PDF text extraction using Vectorize Iris.

    Reads the API configuration, creates a sample document, uploads it,
    starts an extraction, waits for the result and prints it. The temporary
    sample file is removed on every exit path. Any unexpected exception is
    printed and the process exits with status 1.
    """
    print("=== PDF Text Extraction using Vectorize Iris ===\n")

    temp_file_path = None

    try:
        # Get configuration
        configuration, organization_id = get_api_config()

        print("⚙️ Configuration:")
        print(f" Organization ID: {organization_id}")
        print(f" Host: {configuration.host}\n")

        # Create sample file (in real usage, you'd have an actual PDF file)
        print("📄 Creating sample document...")
        temp_file_path = create_sample_pdf()
        print(f"✅ Created sample file at: {temp_file_path}\n")

        # Initialize API client
        with v.ApiClient(configuration) as api_client:
            # Start extraction process
            print("📤 Starting Text Extraction Process")
            print(" Step 1: Uploading file...")

            extraction_id = start_extraction(api_client, organization_id, temp_file_path)

            if not extraction_id:
                print("\n❌ Failed to start extraction")
                return

            print(f" ✅ Extraction started with ID: {extraction_id}\n")

            # Get extraction results
            print("⏳ Waiting for Extraction Results")
            extracted_text = get_extraction_result(api_client, organization_id, extraction_id)

            # None signals failure; an empty string is still a successful
            # extraction and must not be reported as an error.
            if extracted_text is None:
                print("\n❌ Text extraction failed")
                return

            print("\n📄 Extraction Results:")
            print("=" * 50)
            print(extracted_text)
            print("=" * 50)
            print("\n✅ Text extraction completed successfully!")
            print(f" 📝 Extracted {len(extracted_text)} characters")

    except ValueError as e:
        print(f"❌ Configuration Error: {e}")
        print("\n💡 Make sure to set the required environment variables:")
        print(" export VECTORIZE_ORGANIZATION_ID='your-org-id'")
        print(" export VECTORIZE_API_KEY='your-api-key'")

    except Exception as e:
        # Top-level boundary: report the error and signal failure to the shell.
        print(f"❌ Error: {e}")
        sys.exit(1)

    finally:
        # Clean up temp file (also runs on the early returns above)
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
            print("\n🧹 Cleaned up temporary file")


if __name__ == "__main__":
    main()

Was this page helpful?