📢 Note: The API is currently in Beta.
In this guide, we will show you how to use the Vectorize Iris API to extract text from unstructured data (PDFs, documents, images, and more).
Make sure to include the code and imports from the Getting Started page.
First, we need to upload the file that we want to extract text from.
from pathlib import Path
import urllib3, os
# Upload a local file and start a text extraction for it.
# `v`, `api` and `org` come from the Getting Started page.
files_api = v.FilesApi(api)
content_type = "application/pdf"
file_path = "path/to/file.pdf"

# Step 1: register the upload; the response carries the URL to PUT the bytes
# to and the file id used when starting the extraction.
start_file_upload_response = files_api.start_file_upload(
    org,
    start_file_upload_request=v.StartFileUploadRequest(
        content_type=content_type,
        name="My file.pdf",
    ),
)

# Step 2: send the file content to the upload URL.
http = urllib3.PoolManager()
with open(file_path, "rb") as f:
    upload_response = http.request(
        "PUT",
        start_file_upload_response.upload_url,
        body=f,
        headers={
            # Reuse content_type so the header always matches what was
            # declared in the start_file_upload request.
            "Content-Type": content_type,
            "Content-Length": str(os.path.getsize(file_path)),
        },
    )

# Stop here on failure: starting an extraction for a file that was never
# uploaded cannot succeed.
if upload_response.status != 200:
    raise RuntimeError(f"Upload failed: {upload_response.data}")
print("Upload successful")

# Step 3: start the extraction and keep its id for polling the result.
extraction_api = v.ExtractionApi(api)
response = extraction_api.start_extraction(
    org,
    start_extraction_request=v.StartExtractionRequest(
        file_id=start_file_upload_response.file_id,
    ),
)
extraction_id = response.extraction_id
// Upload a local file and start a text extraction for it.
// `filesApi`, `extractionApi` and `org` come from the Getting Started page.
const contentType = "application/pdf";

// Step 1: register the upload to obtain the upload URL and the file id.
const uploadRequest = { name: "My File", contentType };
const startResponse = await filesApi.startFileUpload({
  organization: org,
  startFileUploadRequest: uploadRequest,
});

// Step 2: PUT the file bytes to the upload URL.
const fileBytes = fs.readFileSync("path/to/file.pdf");
const uploadOptions = {
  method: 'PUT',
  body: fileBytes,
  headers: { 'Content-Type': contentType },
};
const uploadResult = await fetch(startResponse.uploadUrl, uploadOptions);
if (uploadResult.ok === false) {
  throw new Error(`Failed to upload file: ${uploadResult.statusText}`);
}

// Step 3: start the extraction and keep its id for polling the result.
const extractionRequest = {
  fileId: startResponse.fileId,
  // the extraction will also chunk the file as it would do in a RAG pipeline
  chunkSize: 512,
};
const response = await extractionApi.startExtraction({
  organization: org,
  startExtractionRequest: extractionRequest,
});
const extractionId = response.extractionId;
Make sure to include the code and imports from the Getting Started page.
Now we need to poll the Extraction API to get the result.
import time

# Poll the Extraction API until the result is ready, then print either the
# extracted text or the reported error.
while True:
    response = extraction_api.get_extraction_result(org, extraction_id)
    if response.ready:
        if response.data.success:
            print(response.data.text)
        else:
            print("Extraction failed: ", response.data.error)
        break
    print("not ready")
    # Wait between polls so the loop does not send requests back-to-back
    # while the extraction is still running.
    time.sleep(2)
// Poll the Extraction API until the result is ready, then log either the
// extracted text or the reported error.
while (true) {
  const result = await extractionApi.getExtractionResult({
    organization: org,
    extractionId: extractionId,
  });
  if (result.ready) {
    if (result.data.success) {
      console.log(result.data.text);
    } else {
      console.log("Extraction failed: ", result.data.error);
    }
    break;
  }
  console.log("not ready");
  // Wait between polls so the loop does not send requests back-to-back
  // while the extraction is still running.
  await new Promise((resolve) => setTimeout(resolve, 2000));
}