Reranking top pages from PDF using LlamaParse and ZeroEntropy

In this guide, we'll build a simple workflow that parses PDF documents into text with LlamaParse, then queries and reranks the parsed pages with ZeroEntropy.


What You'll Learn

  • How to use LlamaParse to accurately convert PDF documents into plain text
  • How to use ZeroEntropy to semantically index and query the parsed documents
  • How to rerank your results using ZeroEntropy's reranker zerank-1 to boost accuracy

Setting up your ZeroEntropy Client and LlamaParse Client

First, install dependencies:

python
!pip install zeroentropy python-dotenv llama_cloud_services requests

Now load your API keys and initialize the clients:

python
# Get your API keys from the ZeroEntropy and LlamaParse websites
# https://dashboard.zeroentropy.dev/
# https://docs.cloud.llamaindex.ai/api_key
ZEROENTROPY_API_KEY = "your_api_key_here"
LLAMAPARSE_API_KEY = "your_api_key_here"
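
Alternatively, since `python-dotenv` is installed above, you can keep the keys out of the notebook. A minimal sketch, assuming a `.env` file in the working directory that defines `ZEROENTROPY_API_KEY` and `LLAMAPARSE_API_KEY`:

python
# Hypothetical alternative: load keys from a .env file instead of hardcoding them
import os

from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from .env into the environment
ZEROENTROPY_API_KEY = os.environ["ZEROENTROPY_API_KEY"]
LLAMAPARSE_API_KEY = os.environ["LLAMAPARSE_API_KEY"]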
python
from zeroentropy import AsyncZeroEntropy, ConflictError
from llama_cloud_services import LlamaParse
import os

# We initialize the AsyncZeroEntropy client so we can upload and query documents concurrently
# If you only need to handle a single document, the synchronous client works too
zclient = AsyncZeroEntropy(api_key=ZEROENTROPY_API_KEY)

# We initialize the llama_parse client to parse the PDF documents into text
llamaParser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    num_workers=1,  # if multiple files passed, split in `num_workers` API calls
    result_type="text",
    verbose=True,
    language="en",  # optionally define a language, default=en
)
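
As noted in the comments, the async client is only needed for concurrency. A minimal sketch of the synchronous alternative, assuming the SDK's usual sync/async pairing where the sync class is named `ZeroEntropy`:

python
# Synchronous alternative for single-document workflows
# (class name assumed from the SDK's sync/async pairing)
from zeroentropy import ZeroEntropy

sync_zclient = ZeroEntropy(api_key=ZEROENTROPY_API_KEY)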

Adding a collection to the ZeroEntropy client

python
collection_name = "my_collection"

# ConflictError is raised if the collection already exists, so re-running this cell is safe
try:
    await zclient.collections.add(collection_name=collection_name)
except ConflictError:
    print(f"Collection '{collection_name}' already exists")

Now define a function to download and extract PDF files from Dropbox directly to memory:

python
import requests
import zipfile
import asyncio
import io
from typing import List, Tuple


def download_and_extract_dropbox_zip_to_memory(
    url: str,
) -> List[Tuple[str, bytes]]:
    """Download and extract a zip file from Dropbox URL directly to memory.

    Returns:
        List of tuples containing (filename, file_content_bytes)
    """
    try:
        # Download the zip file
        print(f"Downloading zip file from: {url}")
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Read zip content into memory
        zip_content = io.BytesIO()
        for chunk in response.iter_content(chunk_size=8192):
            zip_content.write(chunk)
        zip_content.seek(0)

        # Extract files from zip in memory
        files_in_memory = []
        with zipfile.ZipFile(zip_content, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                if (
                    not file_info.is_dir()
                    and file_info.filename.lower().endswith(".pdf")
                ):
                    file_content = zip_ref.read(file_info.filename)
                    files_in_memory.append((file_info.filename, file_content))
                    print(
                        f"Loaded {file_info.filename} ({len(file_content)} bytes)"
                    )

        print(
            f"Successfully loaded {len(files_in_memory)} PDF files into memory"
        )
        return files_in_memory

    except Exception as e:
        print(f"Error downloading/extracting zip file: {e}")
        raise


# Download and extract files from Dropbox directly to memory
dropbox_url = "https://www.dropbox.com/scl/fi/oi6kf91gz8h76d2wt57mb/example_docs.zip?rlkey=mf21tvyb65tyrjkr1t2szt226&dl=1"
files_in_memory = download_and_extract_dropbox_zip_to_memory(dropbox_url)

Parsing PDFs using LlamaParse

The PDF files are already in memory, so let's parse them directly with LlamaParse:

python
# Create file-like objects for LlamaParse
file_objects = []
file_names = []

for filename, file_content in files_in_memory:
    # Create a file-like object from bytes
    file_obj = io.BytesIO(file_content)
    file_obj.name = filename  # Set the name attribute for LlamaParse
    file_objects.append(file_obj)
    file_names.append(filename)

# Parse all PDF files at once using LlamaParse
# When passing file-like objects, extra_info must include the file name
print(f"Parsing {len(file_objects)} PDF files...")

# Use async parsing to avoid nested event loop issues
text_data = await asyncio.gather(
    *[
        llamaParser.aparse(file_obj, extra_info={"file_name": name})
        for file_obj, name in zip(file_objects, file_names)
    ]
)
print(f"Successfully parsed {len(text_data)} documents")

Organizing your documents

Once parsed, we organize the results into a list of documents, where each document is the list of its pages' text.

python
docs = []

# Each document becomes a list of its pages' text
for doc in text_data:
    docs.append([page.text for page in doc.pages])

print(f"Organized {len(docs)} documents with pages")
if docs:
    print(f"First document has {len(docs[0])} pages")

Uploading documents to ZeroEntropy

We'll now define functions to upload the documents as text pages asynchronously.

python
import asyncio
from tqdm.asyncio import tqdm

sem = asyncio.Semaphore(16)


async def add_document_with_pages(
    collection_name: str, filename: str, pages: list, doc_index: int
):
    """Add a single document with multiple pages to the collection."""
    async with sem:  # Limit concurrent operations
        for retry in range(3):  # Retry logic
            try:
                response = await zclient.documents.add(
                    collection_name=collection_name,
                    path=filename,  # Use the actual filename as path
                    content={
                        "type": "text-pages",
                        "pages": pages,  # Send list of strings directly
                    },
                )
                return response
            except ConflictError:
                print(
                    f"Document '{filename}' already exists in collection '{collection_name}'"
                )
                break
            except Exception as e:
                if retry == 2:  # Last retry
                    print(f"Failed to add document '{filename}': {e}")
                    return None
                await asyncio.sleep(0.1 * (retry + 1))  # Linear backoff before retrying


async def upload_documents_async(
    docs: list, file_names: list, collection_name: str
):
    """
    Upload documents asynchronously to ZeroEntropy collection.

    Args:
        docs: 2D array where docs[i] contains the list of pages (strings) for document i
        file_names: Array where file_names[i] contains the path for document i
        collection_name: Name of the collection to add documents to
    """

    # Validate input arrays have same length
    if len(docs) != len(file_names):
        raise ValueError("docs and file_names must have the same length")

    # Print starting message
    print(f"Starting upload of {len(docs)} documents...")

    # Create tasks for all documents
    tasks = [
        add_document_with_pages(collection_name, file_names[i], docs[i], i)
        for i in range(len(docs))
    ]

    # Execute all tasks concurrently with progress bar
    results = await tqdm.gather(*tasks, desc="Uploading Documents")

    # Count successful uploads
    successful = sum(1 for result in results if result is not None)
    print(f"Successfully uploaded {successful}/{len(docs)} documents")

    return results

Querying documents with ZeroEntropy

First, we upload the documents:

python
await upload_documents_async(docs, file_names, collection_name)

Then query for the top 5 pages:

python
response = await zclient.queries.top_pages(
    collection_name=collection_name,
    query="What are the top 100 stocks in the S&P 500?",
    k=5,
)
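
Each result carries a `path`, `page_index`, and relevance `score` (the same fields the reranking function below reads), so you can print the initial ranking to compare against the reranked order:

python
# Display the initial (pre-rerank) ranking
print("Initial Results:")
for i, result in enumerate(response.results, 1):
    print(
        f"Rank {i}: {result.path} (Page {result.page_index}) - Score: {result.score:.4f}"
    )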

Now let's define a function to rerank the pages in the response:

python
async def rerank_top_pages_with_metadata(
    query: str, top_pages_response, collection_name: str
):
    """
    Rerank the results from a top_pages query and return re-ordered list with metadata.

    Args:
        query: The query string to use for reranking
        top_pages_response: The response object from zclient.queries.top_pages()
        collection_name: Name of the collection to fetch page content from

    Returns:
        List of dicts with 'path', 'page_index', and 'rerank_score' in reranked order
    """

    # Fetch page content and store metadata for each result
    documents = []
    metadata = []

    for result in top_pages_response.results:
        # Fetch the actual page content
        page_info = await zclient.documents.get_page_info(
            collection_name=collection_name,
            path=result.path,
            page_index=result.page_index,
            include_content=True,
        )

        # Use the page content, falling back to a placeholder for empty pages
        page_content = (page_info.page.content or "").strip()
        documents.append(page_content if page_content else "No content available")
        metadata.append(
            {
                "path": result.path,
                "page_index": result.page_index,
                "original_score": result.score,
            }
        )

    if not documents:
        raise ValueError("No documents found to rerank")

    # Perform reranking
    rerank_response = await zclient.models.rerank(
        model="zerank-1", query=query, documents=documents
    )

    # Create re-ordered list with metadata
    reranked_results = []
    for rerank_result in rerank_response.results:
        original_metadata = metadata[rerank_result.index]
        reranked_results.append(
            {
                "path": original_metadata["path"],
                "page_index": original_metadata["page_index"],
                "rerank_score": rerank_result.relevance_score,
            }
        )

    return reranked_results

Run the function and see the results!

python
reranked_results = await rerank_top_pages_with_metadata(
    query="What are the top 100 stocks in the S&P 500?",
    top_pages_response=response,
    collection_name=collection_name,
)

# Display results
print("Reranked Results with Metadata:")
for i, result in enumerate(reranked_results, 1):
    print(
        f"Rank {i}: {result['path']} (Page {result['page_index']}) - Score: {result['rerank_score']:.4f}"
    )

✅ That's It!

You've now built a working semantic search engine that processes PDF files entirely in memory using ZeroEntropy and LlamaParse — no local file storage required!