Reranking top pages from PDF using LlamaParse and ZeroEntropy

In this guide, we'll build a simple workflow that parses PDF documents into text with LlamaParse, then queries and reranks the parsed pages with ZeroEntropy.


What You'll Learn

  • How to use LlamaParse to accurately convert PDF documents into plain text
  • How to use ZeroEntropy to semantically index and query the parsed documents
  • How to rerank your results using ZeroEntropy's reranker zerank-1 to boost accuracy

Setting up your ZeroEntropy Client and LlamaParse Client

First, install dependencies:

python
!pip install zeroentropy python-dotenv llama_cloud_services requests

Now load your API keys and initialize the clients:

python
# Get your API keys from the ZeroEntropy and LlamaParse websites
# https://dashboard.zeroentropy.dev/
# https://docs.cloud.llamaindex.ai/api_key
ZEROENTROPY_API_KEY = "your_api_key_here"
LLAMAPARSE_API_KEY = "your_api_key_here"
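
Alternatively, since `python-dotenv` is installed above, you can keep the keys out of the notebook. A minimal sketch, assuming a `.env` file in the working directory that defines `ZEROENTROPY_API_KEY` and `LLAMAPARSE_API_KEY`:

python
# Hypothetical alternative: load keys from a .env file instead of hardcoding them
import os

from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from .env into the environment
ZEROENTROPY_API_KEY = os.environ["ZEROENTROPY_API_KEY"]
LLAMAPARSE_API_KEY = os.environ["LLAMAPARSE_API_KEY"]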
python
from zeroentropy import AsyncZeroEntropy, ConflictError
from llama_cloud_services import LlamaParse
import os

# We initialize the AsyncZeroEntropy client so we can upload and query documents concurrently
# If you only need to handle a single document, the synchronous client works too
zclient = AsyncZeroEntropy(api_key=ZEROENTROPY_API_KEY)

# We initialize the llama_parse client to parse the PDF documents into text
llamaParser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    num_workers=1,  # if multiple files passed, split in `num_workers` API calls
    result_type="text",
    verbose=True,
    language="en",  # optionally define a language, default=en
)
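
As noted in the comments, the async client is only needed for concurrency. A minimal sketch of the synchronous alternative, assuming the SDK's usual sync/async pairing where the sync class is named `ZeroEntropy`:

python
# Synchronous alternative for single-document workflows
# (class name assumed from the SDK's sync/async pairing)
from zeroentropy import ZeroEntropy

sync_zclient = ZeroEntropy(api_key=ZEROENTROPY_API_KEY)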

Adding a collection to the ZeroEntropy client

python
collection_name = "my_collection"

# ConflictError is raised if the collection already exists, so re-running this cell is safe
try:
    await zclient.collections.add(collection_name=collection_name)
except ConflictError:
    print(f"Collection '{collection_name}' already exists")

Now define a function to download and extract PDF files from Dropbox directly to memory:

python
import requests
import zipfile
import asyncio
import io
from typing import List, Tuple


def download_and_extract_dropbox_zip_to_memory(
    url: str,
) -> List[Tuple[str, bytes]]:
    """Download and extract a zip file from Dropbox URL directly to memory.

    Returns:
        List of tuples containing (filename, file_content_bytes)
    """
    try:
        # Download the zip file
        print(f"Downloading zip file from: {url}")
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Read zip content into memory
        zip_content = io.BytesIO()
        for chunk in response.iter_content(chunk_size=8192):
            zip_content.write(chunk)
        zip_content.seek(0)

        # Extract files from zip in memory
        files_in_memory = []
        with zipfile.ZipFile(zip_content, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                if (
                    not file_info.is_dir()
                    and file_info.filename.lower().endswith(".pdf")
                ):
                    file_content = zip_ref.read(file_info.filename)
                    files_in_memory.append((file_info.filename, file_content))
                    print(
                        f"Loaded {file_info.filename} ({len(file_content)} bytes)"
                    )

        print(
            f"Successfully loaded {len(files_in_memory)} PDF files into memory"
        )
        return files_in_memory

    except Exception as e:
        print(f"Error downloading/extracting zip file: {e}")
        raise


# Download and extract files from Dropbox directly to memory
dropbox_url = "https://www.dropbox.com/scl/fi/oi6kf91gz8h76d2wt57mb/example_docs.zip?rlkey=mf21tvyb65tyrjkr1t2szt226&dl=1"
files_in_memory = download_and_extract_dropbox_zip_to_memory(dropbox_url)

Parsing PDFs using LlamaParse

The PDF files are already in memory, so let's parse them directly with LlamaParse:

python
# Create file-like objects for LlamaParse
file_objects = []
file_names = []

for filename, file_content in files_in_memory:
    # Create a file-like object from bytes
    file_obj = io.BytesIO(file_content)
    file_obj.name = filename  # Set the name attribute for LlamaParse
    file_objects.append(file_obj)
    file_names.append(filename)

# Parse all PDF files at once using LlamaParse
# When passing file-like objects, extra_info must include the file name
print(f"Parsing {len(file_objects)} PDF files...")

# Use async parsing to avoid nested event loop issues
text_data = await asyncio.gather(
    *[
        llamaParser.aparse(file_obj, extra_info={"file_name": name})
        for file_obj, name in zip(file_objects, file_names)
    ]
)
print(f"Successfully parsed {len(text_data)} documents")

Organizing your documents

Once parsed, we organize the results into a list of documents, where each document is the list of its pages' text.

python
docs = []

# Each document becomes a list of its pages' text
for doc in text_data:
    docs.append([page.text for page in doc.pages])

print(f"Organized {len(docs)} documents with pages")
if docs:
    print(f"First document has {len(docs[0])} pages")

Uploading documents to ZeroEntropy

We'll now define functions to upload the documents as text pages asynchronously.

python
import asyncio
from tqdm.asyncio import tqdm

sem = asyncio.Semaphore(16)


async def add_document_with_pages(
    collection_name: str, filename: str, pages: list, doc_index: int
):
    """Add a single document with multiple pages to the collection."""
    async with sem:  # Limit concurrent operations
        for retry in range(3):  # Retry logic
            try:
                response = await zclient.documents.add(
                    collection_name=collection_name,
                    path=filename,  # Use the actual filename as path
                    content={
                        "type": "text-pages",
                        "pages": pages,  # Send list of strings directly
                    },
                )
                return response
            except ConflictError:
                print(
                    f"Document '{filename}' already exists in collection '{collection_name}'"
                )
                break
            except Exception as e:
                if retry == 2:  # Last retry
                    print(f"Failed to add document '{filename}': {e}")
                    return None
                await asyncio.sleep(0.1 * (retry + 1))  # Linear backoff before retrying


async def upload_documents_async(
    docs: list, file_names: list, collection_name: str
):
    """
    Upload documents asynchronously to ZeroEntropy collection.

    Args:
        docs: 2D array where docs[i] contains the list of pages (strings) for document i
        file_names: Array where file_names[i] contains the path for document i
        collection_name: Name of the collection to add documents to
    """

    # Validate input arrays have same length
    if len(docs) != len(file_names):
        raise ValueError("docs and file_names must have the same length")

    # Print starting message
    print(f"Starting upload of {len(docs)} documents...")

    # Create tasks for all documents
    tasks = [
        add_document_with_pages(collection_name, file_names[i], docs[i], i)
        for i in range(len(docs))
    ]

    # Execute all tasks concurrently with progress bar
    results = await tqdm.gather(*tasks, desc="Uploading Documents")

    # Count successful uploads
    successful = sum(1 for result in results if result is not None)
    print(f"Successfully uploaded {successful}/{len(docs)} documents")

    return results

Querying documents with ZeroEntropy

First, we upload the documents:

python
await upload_documents_async(docs, file_names, collection_name)

Then query for the top 5 pages:

python
response = await zclient.queries.top_pages(
    collection_name=collection_name,
    query="What are the top 100 stocks in the S&P 500?",
    k=5,
)
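
Each result carries a `path`, `page_index`, and relevance `score` (the same fields the reranking function below reads), so you can print the initial ranking to compare against the reranked order:

python
# Display the initial (pre-rerank) ranking
print("Initial Results:")
for i, result in enumerate(response.results, 1):
    print(
        f"Rank {i}: {result.path} (Page {result.page_index}) - Score: {result.score:.4f}"
    )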

Now let's define a function to rerank the pages in the response:

python
async def rerank_top_pages_with_metadata(
    query: str, top_pages_response, collection_name: str
):
    """
    Rerank the results from a top_pages query and return re-ordered list with metadata.

    Args:
        query: The query string to use for reranking
        top_pages_response: The response object from zclient.queries.top_pages()
        collection_name: Name of the collection to fetch page content from

    Returns:
        List of dicts with 'path', 'page_index', and 'rerank_score' in reranked order
    """

    # Fetch page content and store metadata for each result
    documents = []
    metadata = []

    for result in top_pages_response.results:
        # Fetch the actual page content
        page_info = await zclient.documents.get_page_info(
            collection_name=collection_name,
            path=result.path,
            page_index=result.page_index,
            include_content=True,
        )

        # Use the page content, falling back to a placeholder for empty pages
        page_content = (page_info.page.content or "").strip()
        documents.append(page_content if page_content else "No content available")
        metadata.append(
            {
                "path": result.path,
                "page_index": result.page_index,
                "original_score": result.score,
            }
        )

    if not documents:
        raise ValueError("No documents found to rerank")

    # Perform reranking
    rerank_response = await zclient.models.rerank(
        model="zerank-1", query=query, documents=documents
    )

    # Create re-ordered list with metadata
    reranked_results = []
    for rerank_result in rerank_response.results:
        original_metadata = metadata[rerank_result.index]
        reranked_results.append(
            {
                "path": original_metadata["path"],
                "page_index": original_metadata["page_index"],
                "rerank_score": rerank_result.relevance_score,
            }
        )

    return reranked_results

Run the function and see the results!

python
reranked_results = await rerank_top_pages_with_metadata(
    query="What are the top 100 stocks in the S&P 500?",
    top_pages_response=response,
    collection_name=collection_name,
)

# Display results
print("Reranked Results with Metadata:")
for i, result in enumerate(reranked_results, 1):
    print(
        f"Rank {i}: {result['path']} (Page {result['page_index']}) - Score: {result['rerank_score']:.4f}"
    )

✅ That's It!

You've now built a working semantic search engine that processes PDF files entirely in memory using ZeroEntropy and LlamaParse — no local file storage required!