Back to Graphrag

Copyright (c) 2024 Microsoft Corporation.

docs/examples_notebooks/index_migration_to_v1.ipynb

3.0.97.4 KB
Original Source
python
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

Index Migration (pre-v1 to v1)

This notebook is used to maintain data model parity with older indexes for version 1.0 of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.

NOTE: we recommend regenerating your settings.yaml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. It also ensures that you have a default vector store config, which is now required — without it, indexing will fail.

WARNING: This will overwrite your parquet files, you may want to make a backup!

python
# This is the directory that has your settings.yaml
# NOTE: much older indexes may have been output with a timestamped directory
# if this is the case, you will need to make sure the storage.base_dir in settings.yaml points to it correctly
# Replace the placeholder below with your actual project path before running.
PROJECT_DIRECTORY = "<your project directory>"
python
from pathlib import Path

from graphrag.config.load_config import load_config
from graphrag.storage.factory import StorageFactory

# Load the GraphRAG config (settings.yaml) from the project directory.
config = load_config(Path(PROJECT_DIRECTORY))
# Dump the output-storage section to a plain dict so its fields can be
# forwarded to the storage factory as kwargs (presumably a pydantic model;
# model_dump is the pydantic v2 serializer).
storage_config = config.output.model_dump()
# Create the storage backend (e.g. file/blob) that holds the parquet tables.
storage = StorageFactory().create_storage(
    storage_type=storage_config["type"],
    kwargs=storage_config,
)
python
def remove_columns(df, columns):
    """Drop *columns* from *df* in place, silently skipping any that are absent."""
    df.drop(columns=columns, errors="ignore", inplace=True)
python
def get_community_parent(nodes):
    """Compute the parent community using the node membership as a lookup."""
    # Work on a copy of the three columns we need; the caller's frame is untouched.
    base = nodes.loc[:, ["level", "community", "title"]].copy()

    # Build a (level + 1, title) -> parent lookup: a node's community at
    # level N - 1 is the parent of its community at level N.
    lookup = base.copy()
    lookup["level"] = lookup["level"] + 1  # Shift levels for parent relationship
    lookup = lookup.rename(columns={"community": "parent"})

    # Attach each node's parent community via the shifted lookup.
    merged = base.merge(lookup, on=["level", "title"], how="left")

    # Top-level communities have no parent: default to -1.
    merged["parent"] = merged["parent"].fillna(-1).astype(int)

    # Collapse to one row per (community, level, parent) triple.
    grouped = (
        merged.groupby(["community", "level", "parent"])
        .agg({"title": list})
        .reset_index()
    )
    # Drop the synthetic "-1" community and keep only the mapping columns.
    return grouped.loc[grouped["community"] > -1, ["community", "parent"]]
python
from uuid import uuid4

from graphrag_storage.tables.parquet_table_provider import ParquetTableProvider

# Create a table provider over the storage configured above so parquet
# tables can be read and written by logical name.
table_provider = ParquetTableProvider(storage)

# First we'll go through any parquet files that had model changes and update them
# The new data model may have removed excess columns as well, but we will only make the minimal changes required for compatibility

final_documents = await table_provider.read_dataframe("create_final_documents")
final_text_units = await table_provider.read_dataframe("create_final_text_units")
final_entities = await table_provider.read_dataframe("create_final_entities")
final_nodes = await table_provider.read_dataframe("create_final_nodes")
final_relationships = await table_provider.read_dataframe("create_final_relationships")
final_communities = await table_provider.read_dataframe("create_final_communities")
final_community_reports = await table_provider.read_dataframe(
    "create_final_community_reports"
)


# Documents renames raw_content to "text" for consistency with the other tables
if "raw_content" in final_documents.columns:
    final_documents.rename(columns={"raw_content": "text"}, inplace=True)
# 1-based human-readable id derived from row position
final_documents["human_readable_id"] = final_documents.index + 1

# Text units just get a human_readable_id for consistency
final_text_units["human_readable_id"] = final_text_units.index + 1

# We renamed "name" to "title" for consistency with the rest of the tables
if "name" in final_entities.columns:
    final_entities.rename(columns={"name": "title"}, inplace=True)
# Embedding columns are no longer stored in the entities table
remove_columns(
    final_entities, ["name_embedding", "graph_embedding", "description_embedding"]
)

# Final nodes uses community for joins, which is now an int everywhere;
# nodes without a community get the sentinel -1
final_nodes["community"] = final_nodes["community"].fillna(-1)
final_nodes["community"] = final_nodes["community"].astype(int)
# Drop columns removed from the v1 nodes model (safe if already absent)
remove_columns(
    final_nodes,
    [
        "type",
        "description",
        "source_id",
        "graph_embedding",
        "entity_type",
        "top_level_node_id",
        "size",
    ],
)

# Relationships renames "rank" to "combined_degree" to be clear what the default ranking is
if "rank" in final_relationships.columns:
    final_relationships.rename(columns={"rank": "combined_degree"}, inplace=True)


# Compute the parents for each community, to add to communities and reports
# NOTE: this must run after the final_nodes community column is coerced to int above
parent_df = get_community_parent(final_nodes)

# Communities previously used the "id" field for the Leiden id, but we've moved this
# to the community field and use a uuid for id like the others
if "community" not in final_communities.columns:
    final_communities["community"] = final_communities["id"].astype(int)
    final_communities["human_readable_id"] = final_communities["community"]
    final_communities["id"] = [str(uuid4()) for _ in range(len(final_communities))]
# Add the parent column computed above if this index predates it
if "parent" not in final_communities.columns:
    final_communities = final_communities.merge(parent_df, on="community", how="left")
# Add entity_ids (the node ids belonging to each community) if missing
if "entity_ids" not in final_communities.columns:
    node_mapping = (
        final_nodes
        .loc[:, ["community", "id"]]
        .groupby("community")
        .agg(entity_ids=("id", list))
    )
    final_communities = final_communities.merge(
        node_mapping, on="community", how="left"
    )
remove_columns(final_communities, ["raw_community"])

# We need int for community and the human_readable_id copy for consistency
final_community_reports["community"] = final_community_reports["community"].astype(int)
final_community_reports["human_readable_id"] = final_community_reports["community"]
if "parent" not in final_community_reports.columns:
    final_community_reports = final_community_reports.merge(
        parent_df, on="community", how="left"
    )

# Persist the migrated tables, overwriting the originals in place
# (see the WARNING above about backing up your parquet files first)
await table_provider.write_dataframe("create_final_documents", final_documents)
await table_provider.write_dataframe("create_final_text_units", final_text_units)
await table_provider.write_dataframe("create_final_entities", final_entities)
await table_provider.write_dataframe("create_final_nodes", final_nodes)
await table_provider.write_dataframe("create_final_relationships", final_relationships)
await table_provider.write_dataframe("create_final_communities", final_communities)
await table_provider.write_dataframe(
    "create_final_community_reports", final_community_reports
)
python
from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
from graphrag.index.workflows.generate_text_embeddings import generate_text_embeddings
from graphrag_cache import create_cache

# We only need to re-run the embeddings workflow, to ensure that embeddings
# for all required search fields are in place.
# We pass in the table_provider created earlier so that generate_text_embeddings
# reads the migrated tables we just wrote.

# No-op callbacks: a one-off migration doesn't need progress reporting.
callbacks = NoopWorkflowCallbacks()
# LLM-response cache built from the project's cache config, so repeated runs
# don't re-pay for embeddings already generated.
cache = create_cache(config.cache)

await generate_text_embeddings(
    config=config,
    table_provider=table_provider,
    cache=cache,
    callbacks=callbacks,
)