Back to Llama Index

Document360

llama-index-integrations/readers/llama-index-readers-document360/examples/document360.ipynb

0.14.214.4 KB
Original Source

Document360

Simple Example

python
from llama_index.readers.document360 import Document360Reader

# Placeholder credential: replace with your real Document360 API key.
api_key = "document360_api_key"

# Only the API key is passed here; no optional callbacks are supplied.
reader = Document360Reader(api_key=api_key)

# Fetch the documents through the reader.
documents = reader.load_data()

# Print the text of every returned document.
for d in documents:
    print(d.text)

Customize Document360Reader Example

Filter entities to process

python
import logging

from llama_index.readers.document360.entities import (
    Article,
    ArticleSlim,
    Category,
    ProjectVersion,
)

from llama_index.readers.document360 import Document360Reader


def should_process_project_version(project_version: ProjectVersion) -> bool:
    """Decide whether a project version should be processed.

    Args:
        project_version: The candidate project version. Only the value
            returned by its ``get_id()`` method is inspected.

    Returns:
        True when the project version id is in the list of ids of interest,
        False otherwise.
    """
    # Placeholder id: replace with the ids of the project versions to load.
    project_versions_of_interest = ["document360_project_version_id"]

    return project_version.get_id() in project_versions_of_interest

# The parent categories of the current category are passed in as the `parent_categories` argument.
def should_process_category(
    category: Category, parent_categories: list[Category]
):
    """Report whether a category is one of the categories to process.

    Args:
        category: The category under consideration; matched by ``get_id()``.
        parent_categories: The parent categories of ``category``. This
            example accepts the argument but does not consult it.

    Returns:
        True if the category id appears in the allow-list below.
    """
    # Placeholder id: list the category ids you actually want here.
    wanted_category_ids = ["document360_category_id"]
    category_id = category.get_id()

    return category_id in wanted_category_ids

def should_process_article(article: ArticleSlim) -> bool:
    """Decide whether an article should be processed.

    Args:
        article: The candidate article. Only the value returned by its
            ``get_title()`` method is inspected.

    Returns:
        False for the article whose title is exactly
        "Do not process this article", True for every other title.
    """
    # Python spells inequality as `!=`; `!==` is not a valid operator.
    return article.get_title() != "Do not process this article"


# Initialize the Document360Reader with the three filter callbacks defined
# above. `api_key` is not assigned in this snippet; it must already be
# defined (the simple example assigns it).
reader = Document360Reader(
    api_key=api_key,
    should_process_project_version=should_process_project_version,
    should_process_category=should_process_category,
    should_process_article=should_process_article,
)

# Run the load; the returned value is not kept in this example.
reader.load_data()

Customizing Error Handling

python
import logging
from typing import Union

from llama_index.readers.document360.entities import (
    Article,
    ArticleSlim,
)

from llama_index.readers.document360 import Document360Reader


def handle_rate_limit_error():
    """Emit an error-level log entry reporting that the rate limit was hit.

    Takes no arguments and returns nothing; the only effect is the log call.
    """
    message = "Rate limit exceeded. Retrying..."
    logging.error(message)


def handle_request_http_error(e: Exception):
    """Emit an error-level log entry for a failed HTTP request.

    Args:
        e: The exception to report; its string form is appended to the
            log message.
    """
    message = f"HTTP Request failed. {e}"
    logging.error(message)


def handle_article_processing_error(
    e: Exception, article: Union[Article, ArticleSlim]
):
    """Log an error that occurred while an article was being processed.

    Args:
        e: The exception to report; its string form ends the log message.
        article: The article involved, in either its full (``Article``) or
            slim (``ArticleSlim``) form. It is interpolated into the log
            message as-is.
    """
    # `Union` in the signature comes from the `typing` import at the top of
    # this snippet.
    logging.error(f"Failed to process {article}: {e}")


def handle_load_data_error(e: Exception):
    """Emit an error-level log entry for a failure reported by the load.

    Args:
        e: The exception to report; its string form is appended to the
            log message.
    """
    message = f"Load data error: {e}"
    logging.error(message)


# Initialize the Document360Reader with the four error-handling callbacks
# defined above. `api_key` is not assigned in this snippet; it must already
# be defined (the simple example assigns it).
reader = Document360Reader(
    api_key=api_key,
    handle_rate_limit_error=handle_rate_limit_error,
    handle_request_http_error=handle_request_http_error,
    handle_article_processing_error=handle_article_processing_error,
    handle_load_data_error=handle_load_data_error,
)

# Run the load; the returned value is not kept in this example.
reader.load_data()

Hook into the Document360Reader Lifecycle

python
import logging

from llama_index.readers.document360.entities import Article, Category

from llama_index.readers.document360 import Document360Reader


def handle_batch_finished():
    """Emit an info-level log entry noting that a batch has finished.

    Takes no arguments and returns nothing; the only effect is the log call.
    """
    message = "Batch finished processing"
    logging.info(message)


def handle_category_processing_started(category: Category):
    """Emit an info-level log entry naming the category being started.

    Args:
        category: The category to report; it is interpolated into the log
            message as-is.
    """
    message = f"Started processing category: {category}"
    logging.info(message)


def handle_article_processing_started(article: Article):
    """Emit an info-level log entry naming the article being processed.

    Args:
        article: The article to report; it is interpolated into the log
            message as-is.
    """
    message = f"Processing article: {article}"
    logging.info(message)


# Initialize the Document360Reader with the three lifecycle callbacks
# defined above. `api_key` is not assigned in this snippet; it must already
# be defined (the simple example assigns it).
reader = Document360Reader(
    api_key=api_key,
    handle_batch_finished=handle_batch_finished,
    handle_category_processing_started=handle_category_processing_started,
    handle_article_processing_started=handle_article_processing_started,
)

# Run the load; the returned value is not kept in this example.
reader.load_data()

Create a custom llama_index Document from Document360 Article

python
import logging

from llama_index.readers.document360.entities import Article
from llama_index.readers.document360 import Document360Reader

# Your class that handles how to process your Document360 article
from your_module import ProcessedArticle, LinkExtractor


def article_to_custom_document(article: Article):
    """Build a custom llama_index ``Document`` from a Document360 article.

    The article is wrapped in the user-supplied ``ProcessedArticle`` so that
    its links can be extracted, then converted to a ``Document`` whose
    metadata carries the article's attributes together with those links.

    Args:
        article: The Document360 article to convert.

    Returns:
        A ``Document`` whose id is the article id, whose text is the
        article's HTML content passed through ``strip_html``, and whose
        ``extra_info`` holds the title, category id, project version id,
        author, creation/modification times, URL and extracted links.
    """
    # Imported here so the function does not depend on an import elsewhere
    # in the notebook to resolve `Document`.
    from llama_index.core import Document

    processed_article = ProcessedArticle(article=article)

    # for example, you might want to extract links on the article page
    processed_article.extract_links(LinkExtractor())
    links = processed_article.get_links()

    # NOTE(review): `strip_html` is neither defined nor imported in this
    # example; supply your own HTML-to-text helper under that name.
    return Document(
        doc_id=article.get_id(),
        text=strip_html(article.get_html_content()),
        extra_info={
            "title": article.get_title(),
            "category_id": article.get_category_id(),
            "project_version_id": article.get_project_version_id(),
            "created_by": article.get_created_by(),
            "created_at": article.get_created_at(),
            "modified_at": article.get_modified_at(),
            "url": article.get_url(),
            "links": links,
        },
    )


# Initialize the Document360Reader with the custom article-to-document
# converter defined above. `api_key` is not assigned in this snippet; it
# must already be defined (the simple example assigns it).
reader = Document360Reader(
    api_key=api_key,
    article_to_custom_document=article_to_custom_document,
)

# Run the load; the returned value is not kept in this example.
reader.load_data()