Extract Data from PDFs using Vectorize Iris
Note: The API is currently in Beta.
Learn how to use the Vectorize API to extract text from unstructured data (PDFs, documents, images, and more) using Vectorize Iris.
Prerequisites
Before you begin, you'll need:
- A Vectorize account
- An API access token (how to create one)
- Your organization ID (see below)
Finding your Organization ID
Your organization ID is in the Vectorize platform URL:
https://platform.vectorize.io/organization/[YOUR-ORG-ID]
For example, if your URL is:
https://platform.vectorize.io/organization/ecf3fa1d-30d0-4df1-8af6-f4852bc851cb
Your organization ID is: ecf3fa1d-30d0-4df1-8af6-f4852bc851cb
Start the extraction
First, upload the file that you want to extract text from, then start an extraction for the uploaded file.
- Python
- Node.js
import os

import urllib3
import vectorize_client as v

# Create API instances.
# `apiClient` and `organization_id` are not created by this snippet and
# must already be defined before it runs.
files_api = v.FilesApi(apiClient)
extraction_api = v.ExtractionApi(apiClient)

# File to extract
file_path = "document.pdf"  # Replace with the path to your own file
content_type = "application/pdf"

# Start file upload: the response carries the upload URL and the file ID
start_file_upload_response = files_api.start_file_upload(
    organization_id,
    start_file_upload_request=v.StartFileUploadRequest(
        content_type=content_type,
        name="My file.pdf",
    )
)

# Upload the file with an HTTP PUT to the returned upload URL
http = urllib3.PoolManager()
with open(file_path, "rb") as f:
    response = http.request(
        "PUT",
        start_file_upload_response.upload_url,
        body=f,
        headers={
            "Content-Type": content_type,
            "Content-Length": str(os.path.getsize(file_path))
        }
    )

# Stop here if the upload failed: starting an extraction for a file that
# was never uploaded cannot succeed.
if response.status != 200:
    raise RuntimeError(f"Upload failed: {response.data}")
print("Upload successful")

# Start extraction for the uploaded file
response = extraction_api.start_extraction(
    organization_id,
    start_extraction_request=v.StartExtractionRequest(
        file_id=start_file_upload_response.file_id
    )
)

# Keep the extraction ID: it is needed to poll for the result
extraction_id = response.extraction_id
// This snippet uses async operations and should be run in an async context
(async () => {
  const vectorize = require('@vectorize-io/vectorize-client');
  const fs = require('fs');

  // Create API instances.
  // `apiClient` is not created by this snippet and must already be defined.
  const { FilesApi, ExtractionApi } = vectorize;
  const filesApi = new FilesApi(apiClient);
  const extractionApi = new ExtractionApi(apiClient);

  // File to extract
  const filePath = "document.pdf"; // Replace with the path to your own file
  const contentType = "application/pdf";

  // Start file upload: the response carries the upload URL and the file ID
  const startFileUploadResponse = await filesApi.startFileUpload({
    organizationId: "your-org-id",
    startFileUploadRequest: {
      contentType: contentType,
      name: "My file.pdf"
    }
  });

  // Upload the file with an HTTP PUT to the returned upload URL
  const fileBuffer = fs.readFileSync(filePath);
  const uploadResponse = await fetch(startFileUploadResponse.uploadUrl, {
    method: 'PUT',
    body: fileBuffer,
    headers: {
      'Content-Type': contentType,
      'Content-Length': String(fileBuffer.length)
    }
  });

  // Stop here if the upload failed: starting an extraction for a file that
  // was never uploaded cannot succeed.
  if (uploadResponse.status !== 200) {
    const errorText = await uploadResponse.text();
    console.log("Upload failed:", errorText);
    return;
  }
  console.log("Upload successful");

  // Start extraction for the uploaded file
  const response = await extractionApi.startExtraction({
    organizationId: "your-org-id",
    startExtractionRequest: {
      fileId: startFileUploadResponse.fileId
    }
  });

  // Keep the extraction ID: it is needed to poll for the result
  const extractionId = response.extractionId;
})();
Get the Extraction result
Extraction runs asynchronously. Use the extraction ID to check the status and retrieve your results.
- Python
- Node.js
import time

import vectorize_client as v

# `apiClient`, `organization_id` and `extraction_id` are not created by this
# snippet and must already be defined before it runs.
extraction_api = v.ExtractionApi(apiClient)

# Poll until the extraction reports that it is ready
response = extraction_api.get_extraction_result(organization_id, extraction_id)
while not response.ready:
    print("Extraction in progress...")
    time.sleep(2)  # Wait 2 seconds before checking again
    response = extraction_api.get_extraction_result(organization_id, extraction_id)

# The extraction has finished: print the text, or the error if it failed
if response.data.success:
    print(response.data.text)
else:
    print("Extraction failed:", response.data.error)
// This snippet uses async operations and should be run in an async context
(async () => {
  // `apiClient` and `extractionId` are not created by this snippet and must
  // already be defined.
  const { ExtractionApi } = require('@vectorize-io/vectorize-client');
  const extractionApi = new ExtractionApi(apiClient);

  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const fetchResult = () =>
    extractionApi.getExtractionResult({
      organizationId: "your-org-id",
      extractionId: extractionId
    });

  // Poll until the extraction reports that it is ready
  let result = await fetchResult();
  while (!result.ready) {
    console.log("Extraction in progress...");
    await pause(2000); // Wait 2 seconds
    result = await fetchResult();
  }

  // The extraction has finished: print the text, or the error if it failed
  if (result.data.success) {
    console.log(result.data.text);
  } else {
    console.log("Extraction failed:", result.data.error);
  }
})();
Complete Example
Here's all the code from this guide combined into a complete, runnable example:
- Python
- Node.js
Required Environment Variables:
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`
Required Files:
• `document.pdf` — A PDF file to extract text from
import os
import time
import urllib3
import vectorize_client as v
def main():
    """Upload a PDF, start an extraction for it, and print the extracted text.

    Reads the API key from VECTORIZE_API_KEY and the organization ID from
    VECTORIZE_ORGANIZATION_ID, and expects `document.pdf` in the current
    working directory.

    Raises:
        RuntimeError: if the file upload does not return HTTP 200.
    """
    # Initialize the API client
    apiClient = v.ApiClient(v.Configuration(
        api_key=os.environ.get("VECTORIZE_API_KEY"),
        host="https://api.vectorize.io"
    ))

    # Set organization ID from environment
    organization_id = os.environ.get("VECTORIZE_ORGANIZATION_ID")

    # Start Extraction
    # Create API instances
    files_api = v.FilesApi(apiClient)
    extraction_api = v.ExtractionApi(apiClient)

    # File to extract
    file_path = "document.pdf"
    content_type = "application/pdf"

    # Start file upload: the response carries the upload URL and the file ID
    start_file_upload_response = files_api.start_file_upload(
        organization_id,
        start_file_upload_request=v.StartFileUploadRequest(
            content_type=content_type,
            name="My file.pdf",
        )
    )

    # Upload the file with an HTTP PUT to the returned upload URL
    http = urllib3.PoolManager()
    with open(file_path, "rb") as f:
        response = http.request(
            "PUT",
            start_file_upload_response.upload_url,
            body=f,
            headers={
                "Content-Type": content_type,
                "Content-Length": str(os.path.getsize(file_path))
            }
        )

    # Stop here if the upload failed: starting an extraction for a file that
    # was never uploaded cannot succeed.
    if response.status != 200:
        raise RuntimeError(f"Upload failed: {response.data}")
    print("Upload successful")

    # Start extraction for the uploaded file
    response = extraction_api.start_extraction(
        organization_id,
        start_extraction_request=v.StartExtractionRequest(
            file_id=start_file_upload_response.file_id
        )
    )
    extraction_id = response.extraction_id

    # Get Extraction Result
    # Poll until the extraction reports that it is ready
    while True:
        response = extraction_api.get_extraction_result(organization_id, extraction_id)
        if response.ready:
            if response.data.success:
                print(response.data.text)
            else:
                print("Extraction failed:", response.data.error)
            break
        print("Extraction in progress...")
        time.sleep(2)  # Wait 2 seconds before checking again


if __name__ == "__main__":
    main()
Required Environment Variables:
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`
Required Files:
• `document.pdf` — A PDF file to extract text from
const vectorize = require('@vectorize-io/vectorize-client');
const fs = require('fs')
const path = require('path')
/**
 * Uploads a PDF, starts an extraction for it, and prints the extracted text.
 *
 * Reads the API key from VECTORIZE_API_KEY and the organization ID from
 * VECTORIZE_ORGANIZATION_ID, and expects `document.pdf` in the current
 * working directory.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the file upload does not return HTTP 200.
 */
async function main() {
  // Initialize the API client
  const apiClient = new vectorize.ApiClient(new vectorize.Configuration({
    basePath: "https://api.vectorize.io/api",
    accessToken: process.env.VECTORIZE_API_KEY
  }));

  // Set organization ID from environment
  const organizationId = process.env.VECTORIZE_ORGANIZATION_ID;

  // Start Extraction
  // Create API instances
  const { FilesApi, ExtractionApi } = vectorize;
  const filesApi = new FilesApi(apiClient);
  const extractionApi = new ExtractionApi(apiClient);

  // File to extract
  const filePath = "document.pdf";
  const contentType = "application/pdf";

  // Start file upload: the response carries the upload URL and the file ID
  const startFileUploadResponse = await filesApi.startFileUpload({
    organizationId,
    startFileUploadRequest: {
      contentType,
      name: "My file.pdf"
    }
  });

  // Upload the file with an HTTP PUT to the returned upload URL
  const fileBuffer = fs.readFileSync(filePath);
  const uploadResponse = await fetch(startFileUploadResponse.uploadUrl, {
    method: 'PUT',
    body: fileBuffer,
    headers: {
      'Content-Type': contentType,
      'Content-Length': String(fileBuffer.length)
    }
  });

  // Stop here if the upload failed: starting an extraction for a file that
  // was never uploaded cannot succeed.
  if (uploadResponse.status !== 200) {
    const errorText = await uploadResponse.text();
    throw new Error(`Upload failed: ${errorText}`);
  }
  console.log("Upload successful");

  // Start extraction for the uploaded file
  const startExtractionResponse = await extractionApi.startExtraction({
    organizationId,
    startExtractionRequest: {
      fileId: startFileUploadResponse.fileId
    }
  });
  const extractionId = startExtractionResponse.extractionId;

  // Get Extraction Result
  // Poll until the extraction reports that it is ready
  while (true) {
    const response = await extractionApi.getExtractionResult({
      organizationId,
      extractionId
    });
    if (response.ready) {
      if (response.data.success) {
        console.log(response.data.text);
      } else {
        console.log("Extraction failed:", response.data.error);
      }
      break;
    }
    console.log("Extraction in progress...");
    await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds
  }
}

// Run the example
main().catch(console.error);