Extract Data from PDFs using Vectorize Iris
note
The API is currently in Beta.
Learn how to use the Vectorize API to extract text from unstructured data (PDFs, documents, images, and more) using Vectorize Iris.
Prerequisites
Before you begin, you'll need:
- A Vectorize account
- An API access token (how to create one)
- Your organization ID (see below)
Finding your Organization ID
Your organization ID is in the Vectorize platform URL:
https://platform.vectorize.io/organization/[YOUR-ORG-ID]
For example, if your URL is:
https://platform.vectorize.io/organization/ecf3fa1d-30d0-4df1-8af6-f4852bc851cb
Your organization ID is: ecf3fa1d-30d0-4df1-8af6-f4852bc851cb
Start the extraction
First, we need to upload the file that we want to extract text from.
- Python
- Node.js
import vectorize_client as v
import os
import urllib3

# NOTE: `apiClient`, `organization_id` and `actual_test_file` are not defined in
# this snippet; they must already exist in the surrounding context.

# Create API instances
files_api = v.FilesApi(apiClient)
extraction_api = v.ExtractionApi(apiClient)

# File to extract
file_path = str(actual_test_file)  # Use actual test file for testing
content_type = "application/pdf"

# Start file upload: the response carries the upload URL and the file ID
start_file_upload_response = files_api.start_file_upload(
    organization_id,
    start_file_upload_request=v.StartFileUploadRequest(
        content_type=content_type,
        name="My file.pdf",
    )
)

# Upload the file with an HTTP PUT to the returned upload URL
http = urllib3.PoolManager()
with open(file_path, "rb") as f:
    response = http.request(
        "PUT",
        start_file_upload_response.upload_url,
        body=f,
        headers={
            "Content-Type": content_type,
            "Content-Length": str(os.path.getsize(file_path))
        }
    )

if response.status != 200:
    print("Upload failed:", response.data)
    # Stop here: starting an extraction for a file that was never uploaded
    # would only fail later with a less obvious error.
    raise RuntimeError(f"Upload failed with status {response.status}")
print("Upload successful")

# Start extraction for the uploaded file
response = extraction_api.start_extraction(
    organization_id,
    start_extraction_request=v.StartExtractionRequest(
        file_id=start_file_upload_response.file_id
    )
)

# Keep the ID: it is needed to poll for the extraction result
extraction_id = response.extraction_id
// This snippet uses async operations and should be run in an async context
(async () => {
  const vectorize = require('@vectorize-io/vectorize-client');
  const fs = require('fs');

  // NOTE: `apiClient` and `actualTestFile` are not defined in this snippet;
  // they must already exist in the surrounding context.

  // Create API instances
  const { FilesApi, ExtractionApi } = vectorize;
  const filesApi = new FilesApi(apiClient);
  const extractionApi = new ExtractionApi(apiClient);

  // File to extract
  const filePath = actualTestFile; // Use actual test file for testing
  const contentType = "application/pdf";

  // Start file upload: the response carries the upload URL and the file ID
  const startFileUploadResponse = await filesApi.startFileUpload({
    organizationId: "your-org-id",
    startFileUploadRequest: {
      contentType: contentType,
      name: "My file.pdf"
    }
  });

  // Upload the file with an HTTP PUT to the returned upload URL
  const fileBuffer = fs.readFileSync(filePath);
  const fileStats = fs.statSync(filePath);
  const uploadResponse = await fetch(startFileUploadResponse.uploadUrl, {
    method: 'PUT',
    body: fileBuffer,
    headers: {
      'Content-Type': contentType,
      'Content-Length': fileStats.size.toString()
    }
  });

  if (uploadResponse.status !== 200) {
    const errorText = await uploadResponse.text();
    console.log("Upload failed:", errorText);
    // Stop here: starting an extraction for a file that was never uploaded
    // would only fail later with a less obvious error.
    throw new Error(`Upload failed with status ${uploadResponse.status}`);
  }
  console.log("Upload successful");

  // Start extraction for the uploaded file
  const response = await extractionApi.startExtraction({
    organizationId: "your-org-id",
    startExtractionRequest: {
      fileId: startFileUploadResponse.fileId
    }
  });

  // Keep the ID: it is needed to poll for the extraction result
  const extractionId = response.extractionId;
})().catch((error) => {
  // Surface API/upload failures instead of leaving an unhandled rejection.
  console.error(error);
  process.exitCode = 1;
});
Get the Extraction result
Extraction runs asynchronously. Use the extraction ID to check the status and retrieve your results.
- Python
- Node.js
import vectorize_client as v
import time

# `apiClient`, `organization_id` and `extraction_id` must already be defined.
extraction_api = v.ExtractionApi(apiClient)

# Ask for the result, then keep asking every 2 seconds until it is ready.
response = extraction_api.get_extraction_result(organization_id, extraction_id)
while not response.ready:
    print("Extraction in progress...")
    time.sleep(2)  # Wait 2 seconds before checking again
    response = extraction_api.get_extraction_result(organization_id, extraction_id)

# The result is ready: report either the extracted text or the error.
if response.data.success:
    print(response.data.text)
else:
    print("Extraction failed:", response.data.error)
// This snippet uses async operations and should be run in an async context
(async () => {
  const vectorize = require('@vectorize-io/vectorize-client');
  const { ExtractionApi } = vectorize;
  // `apiClient` and `extractionId` must already be defined.
  const extractionApi = new ExtractionApi(apiClient);

  // One status request for the extraction started earlier.
  const fetchResult = () =>
    extractionApi.getExtractionResult({
      organizationId: "your-org-id",
      extractionId: extractionId
    });

  // Ask for the result, then keep asking every 2 seconds until it is ready.
  let response = await fetchResult();
  while (!response.ready) {
    console.log("Extraction in progress...");
    await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds
    response = await fetchResult();
  }

  // The result is ready: report either the extracted text or the error.
  if (response.data.success) {
    console.log(response.data.text);
  } else {
    console.log("Extraction failed:", response.data.error);
  }
})();
Complete Example
Here's all the code from this guide combined into a complete, runnable example:
- Python
- Node.js
Required Environment Variables:
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`
Required Files:
• `document.pdf` • A PDF file to extract text from
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`
Required Files:
• `document.pdf` • A PDF file to extract text from
#!/usr/bin/env python3
"""
Complete example for extracting text from PDF documents using Vectorize Iris.
This is a hand-written example that corresponds to the test file:
api-clients/python/tests/pipelines/extract_pdf_data_using_iris.py
IMPORTANT: Keep this file in sync with the test file's snippets!
"""
import os
import sys
import time
import tempfile
import urllib3
import vectorize_client as v
def get_api_config():
    """Get API configuration from environment variables.

    Reads VECTORIZE_ORGANIZATION_ID and VECTORIZE_API_KEY. When either is
    missing or empty, prints setup instructions and exits the process with
    status 1.

    Returns:
        A ``(configuration, organization_id)`` tuple, where ``configuration``
        is a ``v.Configuration`` pointing at the production API host.
    """
    organization_id = os.environ.get("VECTORIZE_ORGANIZATION_ID")
    api_key = os.environ.get("VECTORIZE_API_KEY")

    if not organization_id or not api_key:
        print("Setup required:")
        print("1. Get your API key from: https://app.vectorize.io/settings")
        print("2. Set environment variables:")
        print(" export VECTORIZE_ORGANIZATION_ID='your-org-id'")
        print(" export VECTORIZE_API_KEY='your-api-key'")
        sys.exit(1)

    # Always use production API
    configuration = v.Configuration(
        host="https://api.vectorize.io/v1",
        access_token=api_key
    )
    return configuration, organization_id
def create_sample_pdf():
    """Create a sample PDF-like text file for demonstration.

    Despite the name, this writes a plain-text ``.txt`` temporary file; in a
    real scenario you would provide your own PDF file instead.

    Returns:
        The path of the temporary file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    sample_content = """Sample PDF Document for Text Extraction
This is a sample document that demonstrates text extraction using Vectorize Iris.
Key Features:
- Automatic text extraction from PDF documents
- OCR capabilities for scanned documents
- Support for multi-page documents
- Structured data extraction
Benefits:
1. Fast processing times
2. High accuracy text extraction
3. Support for various document formats
4. Easy API integration
For more information about Vectorize Iris capabilities, visit our documentation.
Sample Data Section:
- Document Title: Sample PDF Document
- Author: Vectorize Example
- Pages: 1
- Created: 2024
"""
    # Create temporary file; the explicit encoding keeps the bytes on disk
    # independent of the platform's default text encoding.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    ) as f:
        f.write(sample_content)
        temp_path = f.name
    return temp_path
def start_extraction(api_client, organization_id, file_path):
    """Upload the file at ``file_path`` and kick off its text extraction.

    Returns the extraction ID, or ``None`` when the upload does not answer
    with HTTP status 200.
    """
    files_client = v.FilesApi(api_client)
    extraction_client = v.ExtractionApi(api_client)

    content_type = "text/plain"  # In real scenario, use "application/pdf"

    # Step 1: register the upload to obtain the upload URL and file ID.
    upload_request = v.StartFileUploadRequest(
        content_type=content_type,
        name="My file.pdf",
    )
    start_file_upload_response = files_client.start_file_upload(
        organization_id,
        start_file_upload_request=upload_request,
    )

    # Step 2: PUT the file contents to the upload URL.
    upload_headers = {
        "Content-Type": content_type,
        "Content-Length": str(os.path.getsize(file_path)),
    }
    pool = urllib3.PoolManager()
    with open(file_path, "rb") as source:
        upload_result = pool.request(
            "PUT",
            start_file_upload_response.upload_url,
            body=source,
            headers=upload_headers,
        )

    if upload_result.status != 200:
        print("Upload failed:", upload_result.data)
        return None
    print("Upload successful")

    # Step 3: start the extraction for the uploaded file.
    extraction_request = v.StartExtractionRequest(
        file_id=start_file_upload_response.file_id
    )
    started = extraction_client.start_extraction(
        organization_id,
        start_extraction_request=extraction_request,
    )
    return started.extraction_id
def get_extraction_result(api_client, organization_id, extraction_id,
                          poll_interval=2, max_attempts=None):
    """Wait for and retrieve extraction results.

    Polls the extraction until it is reported as ready.

    Args:
        api_client: API client passed to ``v.ExtractionApi``.
        organization_id: Organization that owns the extraction.
        extraction_id: ID of the extraction to poll.
        poll_interval: Seconds to sleep between polls (default 2).
        max_attempts: Maximum number of polls before giving up. ``None``
            (the default) polls until the extraction is ready.

    Returns:
        The extracted text on success; ``None`` when the extraction failed
        or ``max_attempts`` polls passed without the result being ready.
    """
    extraction_api = v.ExtractionApi(api_client)

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        response = extraction_api.get_extraction_result(organization_id, extraction_id)
        if response.ready:
            if response.data.success:
                print(response.data.text)
                return response.data.text
            print("Extraction failed:", response.data.error)
            return None
        print("Extraction in progress...")
        time.sleep(poll_interval)  # Wait before checking again
        attempts += 1

    # Only reachable when a finite max_attempts was exhausted.
    print(f"Extraction timed out after {max_attempts} attempts")
    return None
def main():
    """Main function demonstrating PDF text extraction using Vectorize Iris.

    Reads the API configuration, creates a temporary sample document, uploads
    it, starts an extraction, waits for the result and prints it. The
    temporary file is removed in all cases. Unexpected errors are printed and
    terminate the process with exit status 1.
    """
    print("=== PDF Text Extraction using Vectorize Iris ===\n")

    temp_file_path = None
    try:
        # Get configuration
        configuration, organization_id = get_api_config()
        print("⚙️ Configuration:")
        print(f" Organization ID: {organization_id}")
        print(f" Host: {configuration.host}\n")

        # Create sample file (in real usage, you'd have an actual PDF file)
        print("Creating sample document...")
        temp_file_path = create_sample_pdf()
        print(f"✅ Created sample file at: {temp_file_path}\n")

        # Initialize API client
        with v.ApiClient(configuration) as api_client:
            # Start extraction process
            print("📤 Starting Text Extraction Process")
            print(" Step 1: Uploading file...")
            extraction_id = start_extraction(api_client, organization_id, temp_file_path)

            if extraction_id:
                print(f" ✅ Extraction started with ID: {extraction_id}\n")

                # Get extraction results
                print("⏳ Waiting for Extraction Results")
                extracted_text = get_extraction_result(api_client, organization_id, extraction_id)

                if extracted_text:
                    print("\nExtraction Results:")
                    print("=" * 50)
                    print(extracted_text)
                    print("=" * 50)
                    print("\n✅ Text extraction completed successfully!")
                    print(f" Extracted {len(extracted_text)} characters")
                else:
                    print("\n❌ Text extraction failed")
            else:
                print("\n❌ Failed to start extraction")

    except ValueError as e:
        print(f"❌ Configuration Error: {e}")
        print("\n💡 Make sure to set the required environment variables:")
        print(" export VECTORIZE_ORGANIZATION_ID='your-org-id'")
        print(" export VECTORIZE_API_KEY='your-api-key'")
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
    finally:
        # Clean up temp file; runs on success, on failure and on sys.exit()
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
            print("\n🧹 Cleaned up temporary file")


if __name__ == "__main__":
    main()
Required Environment Variables:
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`
Required Files:
• `document.pdf` • A PDF file to extract text from
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`
Required Files:
• `document.pdf` • A PDF file to extract text from
#!/usr/bin/env node
/**
* Complete example for PDF extraction using Iris.
* This is a hand-written example that corresponds to the test file:
* api-clients/javascript/tests/pipelines/extract_pdf_data_using_iris.js
*
* IMPORTANT: Keep this file in sync with the test file's snippets!
*/
const vectorize = require('@vectorize-io/vectorize-client');
const fs = require('fs');
const path = require('path');
/**
 * Resolve the API configuration and organization ID for the example.
 *
 * When VECTORIZE_TEST_MODE is 'true' and ../common/test_config.js exists, the
 * values come from that module's getApiClient(). Otherwise they are built from
 * the VECTORIZE_ORGANIZATION_ID and VECTORIZE_API_KEY environment variables.
 *
 * @returns {{ apiClient: object, organizationId: string }}
 * @throws {Error} When either environment variable is missing or empty.
 */
function getApiConfig() {
  const inTestMode = process.env.VECTORIZE_TEST_MODE === 'true';

  if (inTestMode) {
    const testConfigPath = path.join(__dirname, '../common/test_config.js');
    if (fs.existsSync(testConfigPath)) {
      const { getApiClient } = require(testConfigPath);
      const testClient = getApiClient();
      return {
        apiClient: testClient.apiConfig,
        organizationId: testClient.config.organization_id
      };
    }
  }

  // Fall back to environment variables
  const {
    VECTORIZE_ORGANIZATION_ID: organizationId,
    VECTORIZE_API_KEY: apiKey
  } = process.env;

  if (!(organizationId && apiKey)) {
    throw new Error("Please set VECTORIZE_ORGANIZATION_ID and VECTORIZE_API_KEY environment variables");
  }

  const apiClient = new vectorize.Configuration({
    basePath: 'https://api.vectorize.io/v1',
    accessToken: apiKey
  });
  return { apiClient, organizationId };
}
/**
 * Run the complete example: pick a PDF, upload it, start an Iris extraction,
 * then poll for the result and print the extracted text.
 *
 * Returns early (without error) when no PDF file can be found.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the file upload does not answer with HTTP status 200,
 *   or when getApiConfig() or an API call fails.
 */
async function main() {
  // Initialize the API client
  const { apiClient: apiConfig, organizationId } = getApiConfig();

  // Prepare a test PDF file
  // In a real scenario, you would use your actual PDF file
  const testPdfPath = path.join(__dirname, 'test.pdf');
  let filePath = testPdfPath;

  // For testing, prefer the PDF from the test-data directory when it exists
  const testDataPath = path.join(__dirname, '../../../test-data/test.pdf');
  if (fs.existsSync(testDataPath)) {
    filePath = testDataPath;
    console.log(`Using test PDF from: ${filePath}`);
  } else if (!fs.existsSync(testPdfPath)) {
    console.log('⚠️ No PDF file found. Please provide a PDF file path.');
    console.log(' You can set filePath variable to point to your PDF file.');
    return;
  }

  // ============================================================================
  // SNIPPET: start_extraction
  // Upload a PDF file and start text extraction using Iris
  // ============================================================================
  console.log('📤 Uploading PDF and starting extraction...\n');

  let extractionId;
  {
    // Create API instances
    const { FilesApi, ExtractionApi } = vectorize;
    const filesApi = new FilesApi(apiConfig);
    const extractionApi = new ExtractionApi(apiConfig);

    // File to extract
    // filePath is already defined above
    const contentType = "application/pdf";

    // Start file upload
    const startFileUploadResponse = await filesApi.startFileUpload({
      organizationId: organizationId,
      startFileUploadRequest: {
        contentType: contentType,
        name: "My file.pdf"
      }
    });
    console.log(`File ID: ${startFileUploadResponse.fileId}`);

    // Upload the file
    const fileBuffer = fs.readFileSync(filePath);
    const fileStats = fs.statSync(filePath);
    const uploadResponse = await fetch(startFileUploadResponse.uploadUrl, {
      method: 'PUT',
      body: fileBuffer,
      headers: {
        'Content-Type': contentType,
        'Content-Length': fileStats.size.toString()
      }
    });

    if (uploadResponse.status !== 200) {
      const errorText = await uploadResponse.text();
      console.log("Upload failed:", errorText);
      throw new Error(`Upload failed with status ${uploadResponse.status}`);
    } else {
      console.log("✅ Upload successful");
    }

    // Start extraction
    const response = await extractionApi.startExtraction({
      organizationId: organizationId,
      startExtractionRequest: {
        fileId: startFileUploadResponse.fileId
      }
    });
    extractionId = response.extractionId;
    console.log(`Extraction started with ID: ${extractionId}\n`);
  }

  // Wait a moment for extraction to begin processing
  await new Promise(resolve => setTimeout(resolve, 2000));

  // ============================================================================
  // SNIPPET: get_extraction_result
  // Poll for extraction results and retrieve the extracted text
  // ============================================================================
  console.log('⏳ Waiting for extraction results...\n');

  {
    const { ExtractionApi } = vectorize;
    const extractionApi = new ExtractionApi(apiConfig);

    let response;
    let attempts = 0;
    const maxAttempts = 30; // Maximum 60 seconds (30 * 2 seconds)

    while (attempts < maxAttempts) {
      response = await extractionApi.getExtractionResult({
        organizationId: organizationId,
        extractionId: extractionId
      });

      if (response.ready) {
        if (response.data.success) {
          console.log("✅ Extraction completed successfully!\n");
          console.log("Extracted Text:\n");
          console.log("=".repeat(60));
          console.log(response.data.text);
          console.log("=".repeat(60));
        } else {
          console.log("❌ Extraction failed:", response.data.error);
        }
        break;
      }

      console.log("Extraction in progress...");
      await new Promise(resolve => setTimeout(resolve, 2000)); // Wait 2 seconds
      attempts++;
    }

    // `break` leaves the loop before the counter is incremented, so reaching
    // maxAttempts means every poll came back not ready.
    if (attempts >= maxAttempts) {
      console.log("⚠️ Extraction timed out after 60 seconds");
    }
  }

  console.log('\n✅ PDF extraction example completed!');
}

// Run the example only when this file is executed directly
if (require.main === module) {
  main().catch(error => {
    console.error('❌ Error:', error);
    process.exit(1);
  });
}

module.exports = { main };