Skip to main content

Extract Data from PDFs using Vectorize Iris

Note: The API is currently in Beta.

Learn how to use the Vectorize API to extract text from unstructured data (PDFs, documents, images, and more) using Vectorize Iris.

Prerequisites

Before you begin, you'll need:

  1. A Vectorize account
  2. An API access token (how to create one)
  3. Your organization ID (see below)

Finding your Organization ID

Your organization ID is in the Vectorize platform URL:

https://platform.vectorize.io/organization/[YOUR-ORG-ID]

For example, if your URL is:

https://platform.vectorize.io/organization/ecf3fa1d-30d0-4df1-8af6-f4852bc851cb

Your organization ID is: ecf3fa1d-30d0-4df1-8af6-f4852bc851cb

Start the extraction

First, we need to upload the file that we want to extract text from.

import vectorize_client as v
import os
import urllib3

# NOTE: this snippet expects `apiClient` (a configured `v.ApiClient`) and
# `organization_id` to be defined before it runs.

# Create API instances
files_api = v.FilesApi(apiClient)
extraction_api = v.ExtractionApi(apiClient)

# File to extract
file_path = "document.pdf"  # Local path of the file to upload
content_type = "application/pdf"

# Start file upload; the response carries the upload URL and the file ID
# that are used in the steps below.
start_file_upload_response = files_api.start_file_upload(
    organization_id,
    start_file_upload_request=v.StartFileUploadRequest(
        content_type=content_type,
        name="My file.pdf",
    ),
)

# Upload the file
http = urllib3.PoolManager()

with open(file_path, "rb") as f:
    response = http.request(
        "PUT",
        start_file_upload_response.upload_url,
        body=f,
        headers={
            "Content-Type": content_type,
            "Content-Length": str(os.path.getsize(file_path)),
        },
    )

if response.status != 200:
    # Stop here instead of starting an extraction for a file that was
    # not uploaded.
    raise RuntimeError(f"Upload failed: {response.data!r}")
print("Upload successful")

# Start extraction
response = extraction_api.start_extraction(
    organization_id,
    start_extraction_request=v.StartExtractionRequest(
        file_id=start_file_upload_response.file_id,
    ),
)
# Keep the ID: it is needed to poll for the extraction result.
extraction_id = response.extraction_id

Get the Extraction result

Extraction runs asynchronously. Use the extraction ID to check the status and retrieve your results.

import vectorize_client as v
import time

# Expects `apiClient`, `organization_id` and `extraction_id` to be defined
# already (the latter comes from the start-extraction call).
extraction_api = v.ExtractionApi(apiClient)

# Poll until the extraction reports that it is ready.
response = extraction_api.get_extraction_result(organization_id, extraction_id)
while not response.ready:
    print("Extraction in progress...")
    time.sleep(2)  # Pause between status checks
    response = extraction_api.get_extraction_result(organization_id, extraction_id)

# The result is ready: print the extracted text or the reported error.
result = response.data
if result.success:
    print(result.text)
else:
    print("Extraction failed:", result.error)

Complete Example

Here's all the code from this guide combined into a complete, runnable example:

Required Environment Variables:
• `VECTORIZE_API_KEY`
• `VECTORIZE_ORGANIZATION_ID`

Required Files:
• `document.pdf` — A PDF file to extract text from
import os
import time
import urllib3
import vectorize_client as v

def main():
    """Upload a PDF to Vectorize, start an extraction, and print the result.

    Reads the API key and organization ID from the ``VECTORIZE_API_KEY`` and
    ``VECTORIZE_ORGANIZATION_ID`` environment variables, and uploads
    ``document.pdf`` from the current working directory.

    Raises:
        RuntimeError: If the file upload does not return HTTP 200.
    """
    # Initialize the API client
    apiClient = v.ApiClient(v.Configuration(
        api_key=os.environ.get("VECTORIZE_API_KEY"),
        host="https://api.vectorize.io",
    ))

    # Set organization ID from environment
    organization_id = os.environ.get("VECTORIZE_ORGANIZATION_ID")

    # Start Extraction
    # Create API instances
    files_api = v.FilesApi(apiClient)
    extraction_api = v.ExtractionApi(apiClient)

    # File to extract (the file listed under "Required Files")
    file_path = "document.pdf"
    content_type = "application/pdf"

    # Start file upload; the response carries the upload URL and the file ID
    # that are used in the steps below.
    start_file_upload_response = files_api.start_file_upload(
        organization_id,
        start_file_upload_request=v.StartFileUploadRequest(
            content_type=content_type,
            name="My file.pdf",
        ),
    )

    # Upload the file
    http = urllib3.PoolManager()

    with open(file_path, "rb") as f:
        response = http.request(
            "PUT",
            start_file_upload_response.upload_url,
            body=f,
            headers={
                "Content-Type": content_type,
                "Content-Length": str(os.path.getsize(file_path)),
            },
        )

    if response.status != 200:
        # Stop here instead of starting an extraction for a file that was
        # not uploaded.
        raise RuntimeError(f"Upload failed: {response.data!r}")
    print("Upload successful")

    # Start extraction
    response = extraction_api.start_extraction(
        organization_id,
        start_extraction_request=v.StartExtractionRequest(
            file_id=start_file_upload_response.file_id,
        ),
    )
    extraction_id = response.extraction_id

    # Get Extraction Result (reuses the ExtractionApi instance created above)
    while True:
        response = extraction_api.get_extraction_result(organization_id, extraction_id)
        if response.ready:
            if response.data.success:
                print(response.data.text)
            else:
                print("Extraction failed:", response.data.error)
            break
        print("Extraction in progress...")
        time.sleep(2)  # Wait 2 seconds before checking again


if __name__ == "__main__":
    main()

Was this page helpful?