docs/examples/node_postprocessor/PII.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/node_postprocessor/PII.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙.
%pip install llama-index-llms-openai
%pip install llama-index-llms-huggingface
!pip install llama-index
import logging
import sys
# Configure root logging: INFO and above, written to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Additionally attach an explicit stdout handler to the root logger.
stdout_handler = logging.StreamHandler(stream=sys.stdout)
logging.getLogger().addHandler(stdout_handler)
from llama_index.core.postprocessor import (
PIINodePostprocessor,
NERPIINodePostprocessor,
)
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.schema import TextNode
# Build a single in-memory node from a sample sentence that contains PII
# (a person's name, a credit card number and a street address).
text = (
    "\n"
    "Hello Paulo Santos. The latest statement for your credit card account "
    "1111-0000-1111-0000 was mailed to 123 Any Street, Seattle, WA 98109."
    "\n"
)
node = TextNode(text=text)
Use a Hugging Face NER model for PII Masking
from llama_index.core.schema import NodeWithScore

# NER-based PII postprocessor, constructed with its defaults.
processor = NERPIINodePostprocessor()

# postprocess_nodes() is given scored nodes, so wrap the raw node first.
scored_nodes = [NodeWithScore(node=node)]
new_nodes = processor.postprocess_nodes(scored_nodes)

# Text of the first returned node, with PII redacted.
masked_node = new_nodes[0].node
masked_node.get_text()

# Mapping stored under the "__pii_node_info__" metadata key.
# NOTE: this is not sent to the LLM!
masked_node.metadata["__pii_node_info__"]
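NOTE: You should use a local LLM for PII masking, so that the unredacted text is never sent to a third-party service. The example below uses OpenAI for brevity, but normally you would use an LLM running locally, for example one from Hugging Face. See the LlamaIndex documentation on using local LLMs for examples.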
from llama_index.core.schema import NodeWithScore
from llama_index.llms.openai import OpenAI

# LLM-backed PII postprocessor; here the LLM is a default-constructed OpenAI client.
pii_llm = OpenAI()
processor = PIINodePostprocessor(llm=pii_llm)

# Wrap the raw node in a NodeWithScore and run it through the postprocessor.
scored_nodes = [NodeWithScore(node=node)]
new_nodes = processor.postprocess_nodes(scored_nodes)

# Text of the first returned node, with PII redacted.
masked_node = new_nodes[0].node
masked_node.get_text()

# Mapping stored under the "__pii_node_info__" metadata key.
# NOTE: this is not sent to the LLM!
masked_node.metadata["__pii_node_info__"]
Use Presidio to identify and anonymize PII
# Sample text for the Presidio example. It contains several kinds of PII:
# a person's name, a credit card number, a location, an IBAN, a US social
# security number and an e-mail address.
# The trailing backslashes join the lines into one sentence run without
# embedded newlines.
text = """
Hello Paulo Santos. The latest statement for your credit card account \
4095-2609-9393-4932 was mailed to Seattle, WA 98109. \
IBAN GB90YNTU67299444055881 and social security number is 474-49-7577 were verified on the system. \
Further communications will be sent to paulo@presidio.site
"""
presidio_node = TextNode(text=text)
from llama_index.core.schema import NodeWithScore
from llama_index.postprocessor.presidio import PresidioPIINodePostprocessor

# Presidio-based PII postprocessor, constructed with its defaults.
processor = PresidioPIINodePostprocessor()

# Wrap the Presidio sample node in a NodeWithScore and run it through the postprocessor.
presidio_scored_nodes = [NodeWithScore(node=presidio_node)]
presidio_new_nodes = processor.postprocess_nodes(presidio_scored_nodes)

# Text of the first returned node, with PII redacted.
presidio_masked_node = presidio_new_nodes[0].node
presidio_masked_node.get_text()

# Mapping stored under the "__pii_node_info__" metadata key.
# NOTE: this is not sent to the LLM!
presidio_masked_node.metadata["__pii_node_info__"]
# Build a vector index over the postprocessed nodes held in `new_nodes`,
# then ask a question whose answer involves the masked details.
indexed_nodes = [scored.node for scored in new_nodes]
index = VectorStoreIndex(indexed_nodes)

query_engine = index.as_query_engine()
response = query_engine.query("What address was the statement mailed to?")

# print() renders the response through str().
print(response)