llama-index-integrations/postprocessor/llama-index-postprocessor-longllmlingua/examples/longllmlingua2.ipynb
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'pg_essay.txt'
We load the essay and build an index over it using the VectorStoreIndex abstraction.
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.postprocessor.longllmlingua import LongLLMLinguaPostprocessor
documents = SimpleDirectoryReader(input_files=["pg_essay.txt"]).load_data()
index = VectorStoreIndex.from_documents(documents)
retriever = index.as_retriever(similarity_top_k=8)
query = "What did the author do during his time at Y Combinator?"
nodes = retriever.retrieve(query)
nodes
LLMLingua-2's claim to fame is achieving performant compression with a small model, trained via data distillation from GPT-4 by framing prompt compression as token classification! On top of that, it runs 3x-6x faster than the original LLMLingua.
compressor_llmlingua2 = LongLLMLinguaPostprocessor(
    model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
    device_map="mps",  # Mac users rejoice! (use "cuda" or "cpu" elsewhere)
    use_llmlingua2=True,  # opt into LLMLingua-2 instead of the original LLMLingua
)
from llama_index.core.schema import QueryBundle
results = compressor_llmlingua2.postprocess_nodes(
    nodes, query_bundle=QueryBundle(query_str=query)
)
from IPython.display import display, Markdown
display(Markdown(results[0].text))
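To get a feel for how aggressive the compression is, we can compare token counts before and after. A rough sketch (the whitespace split below stands in for a real tokenizer, so treat the ratio as approximate):
original_tokens = sum(len(n.text.split()) for n in nodes)  # retrieved context
compressed_tokens = len(results[0].text.split())  # after LLMLingua-2
print(f"~{original_tokens} -> ~{compressed_tokens} tokens "
      f"(~{original_tokens / compressed_tokens:.1f}x compression)")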
query_engine1 = index.as_query_engine(
    similarity_top_k=8, node_postprocessors=[compressor_llmlingua2]
)
response = query_engine1.query(query)
display(Markdown(str(response)))
response.metadata
# For comparison: the original LLMLingua (use_llmlingua2 defaults to False)
compressor_llmlingua1 = LongLLMLinguaPostprocessor(
    device_map="mps",  # Mac users rejoice!
)
results = compressor_llmlingua1.postprocess_nodes(
    nodes, query_bundle=QueryBundle(query_str=query)
)
results
query_engine_llmlingua1 = index.as_query_engine(
    similarity_top_k=8, node_postprocessors=[compressor_llmlingua1]
)
response = query_engine_llmlingua1.query(query)
display(Markdown(str(response)))
response.metadata
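To sanity-check the 3x-6x speedup claim on your own hardware, time both postprocessors over the same retrieved nodes. A minimal sketch; absolute numbers will vary with device, model size, and warm-up:
import time

for name, compressor in [
    ("llmlingua2", compressor_llmlingua2),
    ("llmlingua1", compressor_llmlingua1),
]:
    start = time.perf_counter()
    compressor.postprocess_nodes(nodes, query_bundle=QueryBundle(query_str=query))
    print(f"{name}: {time.perf_counter() - start:.2f}s")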