llama-index-integrations/readers/llama-index-readers-github/README.md
pip install llama-index-readers-github
The GitHub readers package consists of three separate readers: a repository reader, an issues reader, and a collaborators reader.
The readers support two authentication methods: a personal access token, or GitHub App authentication.
Generate a token under your account settings at https://github.com/settings/tokens
from llama_index.readers.github import GithubClient
# Direct token
client = GithubClient(github_token="ghp_your_token_here")
# Or via environment variable
import os
os.environ["GITHUB_TOKEN"] = "ghp_your_token_here"
client = GithubClient() # Automatically uses GITHUB_TOKEN
For better security, rate limits, and organization-level access, use GitHub App authentication:
from llama_index.readers.github import GithubClient, GitHubAppAuth
# Load your GitHub App private key
with open("path/to/private-key.pem", "r") as f:
private_key = f.read()
# Create GitHub App auth handler
app_auth = GitHubAppAuth(
app_id="123456", # Your GitHub App ID
private_key=private_key, # Private key content (PEM format)
installation_id="789012", # Installation ID for the target org/repo
)
# Use with any client
client = GithubClient(github_app_auth=app_auth)
Installation for GitHub App support:
pip install llama-index-readers-github[github-app]
Benefits of GitHub App authentication:
This reader will read through a repo, with options to specifically filter directories, file extensions, file paths, and custom processing logic.
from llama_index.readers.github import GithubRepositoryReader, GithubClient
# `github_token` is your personal access token (see the authentication
# section above for how to obtain one).
client = github_client = GithubClient(github_token=github_token, verbose=False)

# Restrict the crawl to the "docs" directory and skip images, JSON files,
# and notebooks.
reader = GithubRepositoryReader(
    github_client=github_client,
    owner="run-llama",
    repo="llama_index",
    use_parser=False,
    verbose=True,
    filter_directories=(
        ["docs"],
        GithubRepositoryReader.FilterType.INCLUDE,
    ),
    filter_file_extensions=(
        [
            ".png",
            ".jpg",
            ".jpeg",
            ".gif",
            ".svg",
            ".ico",
            # Written with the leading dot, like every other entry in this
            # list; a bare "json" would not line up with the other
            # extensions.
            ".json",
            ".ipynb",
        ],
        GithubRepositoryReader.FilterType.EXCLUDE,
    ),
)

documents = reader.load_data(branch="main")
# Include only specific files
reader = GithubRepositoryReader(
github_client=github_client,
owner="run-llama",
repo="llama_index",
filter_file_paths=(
["README.md", "src/main.py", "docs/guide.md"],
GithubRepositoryReader.FilterType.INCLUDE,
),
)
# Exclude specific files
reader = GithubRepositoryReader(
github_client=github_client,
owner="run-llama",
repo="llama_index",
filter_file_paths=(
["tests/test_file.py", "temp/cache.txt"],
GithubRepositoryReader.FilterType.EXCLUDE,
),
)
def process_file_callback(file_path: str, file_size: int) -> tuple[bool, str]:
    """Custom logic to determine if a file should be processed.

    Args:
        file_path: The full path to the file
        file_size: The size of the file in bytes

    Returns:
        Tuple of (should_process: bool, reason: str). ``reason`` is an empty
        string when the file should be processed.
    """
    # Skip large files
    if file_size > 1024 * 1024:  # 1MB
        return False, f"File too large: {file_size} bytes"

    # Lowercase once so both path checks below are case-insensitive.
    lowered_path = file_path.lower()

    # Skip test files
    if "test" in lowered_path:
        return False, "Skipping test files"

    # Skip binary files by extension. Matching on the lowercased path means
    # "TOOL.EXE" is skipped just like "tool.exe"; str.endswith accepts a
    # tuple of suffixes, so no explicit loop is needed.
    binary_extensions = (".exe", ".bin", ".so", ".dylib")
    if lowered_path.endswith(binary_extensions):
        return False, "Skipping binary files"

    return True, ""
reader = GithubRepositoryReader(
github_client=github_client,
owner="run-llama",
repo="llama_index",
process_file_callback=process_file_callback,
fail_on_error=False, # Continue processing if callback fails
)
from llama_index.core.readers.base import BaseReader
# Custom parser for specific file types
class CustomMarkdownParser(BaseReader):
    """Skeleton of a custom parser that can be registered for ``.md`` files."""

    def load_data(self, file_path, extra_info=None):
        """Parse the file at ``file_path`` and return the resulting documents.

        Args:
            file_path: Location of the file to parse.
            extra_info: Optional extra metadata supplied by the caller.

        Returns:
            A list of documents. This placeholder returns an empty list.
        """
        # Custom parsing logic here
        # NOTE(review): returns an empty list rather than an implicit None so
        # that a caller iterating over the result does not fail; presumably
        # BaseReader.load_data is expected to return a list of Document
        # objects - confirm against the base class.
        return []
reader = GithubRepositoryReader(
github_client=github_client,
owner="run-llama",
repo="llama_index",
use_parser=True,
custom_parsers={".md": CustomMarkdownParser()},
custom_folder="/tmp/github_processing", # Custom temp directory
)
The reader integrates with LlamaIndex's instrumentation system to provide detailed events during processing:
from llama_index.core.instrumentation import get_dispatcher
from llama_index.core.instrumentation.event_handlers import BaseEventHandler
from llama_index.readers.github.repository.event import (
GitHubFileProcessedEvent,
GitHubFileSkippedEvent,
GitHubFileFailedEvent,
GitHubRepositoryProcessingStartedEvent,
GitHubRepositoryProcessingCompletedEvent,
)
class GitHubEventHandler(BaseEventHandler):
    """Print a one-line progress message for each GitHub reader event."""

    def handle(self, event, **kwargs):
        """Report ``event`` on stdout according to its type.

        Args:
            event: The dispatched event. Events of types other than the five
                checked below are ignored.
            **kwargs: Extra keyword arguments are accepted and ignored.
                NOTE(review): presumably the base handler signature also
                takes keyword arguments - confirm against BaseEventHandler.
        """
        if isinstance(event, GitHubRepositoryProcessingStartedEvent):
            print(f"Started processing repository: {event.repository_name}")
        elif isinstance(event, GitHubFileProcessedEvent):
            print(
                f"Processed file: {event.file_path} ({event.file_size} bytes)"
            )
        elif isinstance(event, GitHubFileSkippedEvent):
            print(f"Skipped file: {event.file_path} - {event.reason}")
        elif isinstance(event, GitHubFileFailedEvent):
            print(f"Failed to process file: {event.file_path} - {event.error}")
        elif isinstance(event, GitHubRepositoryProcessingCompletedEvent):
            print(
                f"Completed processing. Total documents: {event.total_documents}"
            )
# Register the event handler
dispatcher = get_dispatcher()
handler = GitHubEventHandler()
dispatcher.add_event_handler(handler)
# Use the reader - events will be automatically dispatched
reader = GithubRepositoryReader(
github_client=github_client,
owner="run-llama",
repo="llama_index",
)
documents = reader.load_data(branch="main")
The following events are dispatched during repository processing:
- `GitHubRepositoryProcessingStartedEvent`: Fired when repository processing begins
  - `repository_name`: Name of the repository (owner/repo)
  - `branch_or_commit`: Branch name or commit SHA being processed
- `GitHubRepositoryProcessingCompletedEvent`: Fired when repository processing completes
  - `repository_name`: Name of the repository
  - `branch_or_commit`: Branch name or commit SHA
  - `total_documents`: Number of documents created
- `GitHubTotalFilesToProcessEvent`: Fired with the total count of files to be processed
  - `repository_name`: Name of the repository
  - `branch_or_commit`: Branch name or commit SHA
  - `total_files`: Total number of files found
- `GitHubFileProcessingStartedEvent`: Fired when individual file processing starts
  - `file_path`: Path to the file being processed
  - `file_type`: File extension
- `GitHubFileProcessedEvent`: Fired when a file is successfully processed
  - `file_path`: Path to the processed file
  - `file_type`: File extension
  - `file_size`: Size of the file in bytes
  - `document`: The created Document object
- `GitHubFileSkippedEvent`: Fired when a file is skipped
  - `file_path`: Path to the skipped file
  - `file_type`: File extension
  - `reason`: Reason why the file was skipped
- `GitHubFileFailedEvent`: Fired when file processing fails
  - `file_path`: Path to the failed file
  - `file_type`: File extension
  - `error`: Error message describing the failure

from llama_index.readers.github import (
GitHubRepositoryIssuesReader,
GitHubIssuesClient,
)
github_client = GitHubIssuesClient(github_token=github_token, verbose=True)
reader = GitHubRepositoryIssuesReader(
github_client=github_client,
owner="moncho",
repo="dry",
verbose=True,
)
documents = reader.load_data(
state=GitHubRepositoryIssuesReader.IssueState.ALL,
labelFilters=[("bug", GitHubRepositoryIssuesReader.FilterType.INCLUDE)],
)
from llama_index.readers.github import (
GitHubRepositoryCollaboratorsReader,
GitHubCollaboratorsClient,
)
github_client = GitHubCollaboratorsClient(
github_token=github_token, verbose=True
)
reader = GitHubRepositoryCollaboratorsReader(
github_client=github_client,
owner="moncho",
repo="dry",
verbose=True,
)
documents = reader.load_data()
To create and configure a GitHub App for authentication:
Under Repository permissions, set:
Store the downloaded private key `.pem` file securely.

After installation, you'll be redirected to a URL like:
https://github.com/settings/installations/12345678
The number 12345678 is your installation ID. You can also find it via the API:
curl -H "Authorization: Bearer YOUR_JWT_TOKEN" \
https://api.github.com/app/installations
from llama_index.readers.github import GithubClient, GitHubAppAuth
# Load private key
with open("path/to/your-app-private-key.pem", "r") as f:
private_key = f.read()
# Create auth handler
app_auth = GitHubAppAuth(
app_id="YOUR_APP_ID",
private_key=private_key,
installation_id="YOUR_INSTALLATION_ID",
)
# Use with any client
client = GithubClient(github_app_auth=app_auth)
The GitHubAppAuth class automatically:
You can manually invalidate a token if needed:
app_auth.invalidate_token() # Forces refresh on next request
"Failed to get installation token: 401"
"Failed to get installation token: 404"
"Import PyJWT failed"
pip install llama-index-readers-github[github-app]