llama-index-integrations/readers/llama-index-readers-semanticscholar/examples/demo_s2.ipynb
from llama_hub.semanticscholar.base import SemanticScholarReader
import os
import openai
from llama_index.llms import OpenAI
from llama_index.query_engine import CitationQueryEngine
from llama_index import (
VectorStoreIndex,
StorageContext,
load_index_from_storage,
ServiceContext,
)
from llama_index.response.notebook_utils import display_response
# initialize the SemanticScholarReader
s2reader = SemanticScholarReader()
# initialize the service context
openai.api_key = os.environ["OPENAI_API_KEY"]
service_context = ServiceContext.from_defaults(
llm=OpenAI(model="gpt-3.5-turbo", temperature=0)
)
query_space = "large language models"
full_text = True
# be careful with the total_papers when full_text = True
# it can take a long time to download
total_papers = 50
persist_dir = (
"./citation_" + query_space + "_" + str(total_papers) + "_" + str(full_text)
)
if not os.path.exists(persist_dir):
# Load data from Semantic Scholar
documents = s2reader.load_data(query_space, total_papers, full_text=full_text)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
index.storage_context.persist(persist_dir=persist_dir)
else:
index = load_index_from_storage(
StorageContext.from_defaults(persist_dir=persist_dir),
service_context=service_context,
)
# initialize the citation query engine
query_engine = CitationQueryEngine.from_args(
index,
similarity_top_k=3,
citation_chunk_size=512,
)
query_string = "limitations of using large language models"
# query the citation query engine
response = query_engine.query(query_string)
display_response(
response, show_source=True, source_length=100, show_source_metadata=True
)
query_space = "covid 19 vaccine"
query_string = "List the efficacy numbers of the covid 19 vaccines"
full_text = True
# be careful with the total_papers when full_text = True
# it can take a long time to download
total_papers = 50
persist_dir = (
"./citation_" + query_space + "_" + str(total_papers) + "_" + str(full_text)
)
if not os.path.exists(persist_dir):
# Load data from Semantic Scholar
documents = s2reader.load_data(query_space, total_papers, full_text=full_text)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
index.storage_context.persist(persist_dir=persist_dir)
else:
index = load_index_from_storage(
StorageContext.from_defaults(persist_dir=persist_dir),
service_context=service_context,
)
# initialize the citation query engine
query_engine = CitationQueryEngine.from_args(
index,
similarity_top_k=3,
citation_chunk_size=512,
)
# query the citation query engine
response = query_engine.query(query_string)
display_response(
response, show_source=True, source_length=100, show_source_metadata=True
)