Semantic Scholar Loader in llama-index

python

from llama_hub.semanticscholar.base import SemanticScholarReader
import os
import openai
from llama_index.llms import OpenAI
from llama_index.query_engine import CitationQueryEngine
from llama_index import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    ServiceContext,
)
from llama_index.response.notebook_utils import display_response

# Reader used below to load papers from Semantic Scholar.
s2reader = SemanticScholarReader()

# The OpenAI key is read from the environment; a missing OPENAI_API_KEY
# raises KeyError at this point.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Service context (gpt-3.5-turbo, temperature 0) passed to both index
# construction and index loading.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
service_context = ServiceContext.from_defaults(llm=llm)

python

# Topic searched on Semantic Scholar and the question asked of the index.
query_space = "large language models"
query_string = "limitations of using large language models"

# Downloading full text can take a long time, so keep total_papers modest
# whenever full_text is True.
full_text = True
total_papers = 50

# The persist directory name encodes the topic, paper count and
# full-text flag.
persist_dir = f"./citation_{query_space}_{total_papers}_{full_text}"

if os.path.exists(persist_dir):
    # An index was already persisted for these settings: load it from disk.
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    index = load_index_from_storage(
        storage_context, service_context=service_context
    )
else:
    # Nothing persisted yet: load the papers from Semantic Scholar, build
    # the vector index, and persist it under persist_dir.
    documents = s2reader.load_data(query_space, total_papers, full_text=full_text)
    index = VectorStoreIndex.from_documents(
        documents, service_context=service_context
    )
    index.storage_context.persist(persist_dir=persist_dir)

# Build the citation query engine on top of the index.
citation_options = {"similarity_top_k": 3, "citation_chunk_size": 512}
query_engine = CitationQueryEngine.from_args(index, **citation_options)

# Run the query and display the response together with its sources.
response = query_engine.query(query_string)
display_response(
    response,
    show_source=True,
    source_length=100,
    show_source_metadata=True,
)

python

# Topic searched on Semantic Scholar and the question asked of the index.
query_space = "covid 19 vaccine"
query_string = "List the efficacy numbers of the covid 19 vaccines"

# Full-text download can take a long time; be careful with total_papers
# when full_text is True.
total_papers = 50
full_text = True

# Persist directory: "./citation_" followed by the topic, the paper count
# and the full-text flag joined with underscores.
persist_dir = "./citation_" + "_".join(
    [query_space, str(total_papers), str(full_text)]
)

index_is_persisted = os.path.exists(persist_dir)
if index_is_persisted:
    # Load the previously persisted index from persist_dir.
    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=persist_dir),
        service_context=service_context,
    )
else:
    # Load the papers from Semantic Scholar, index them, then persist
    # the index under persist_dir.
    documents = s2reader.load_data(
        query_space,
        total_papers,
        full_text=full_text,
    )
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context,
    )
    index.storage_context.persist(persist_dir=persist_dir)

# Citation query engine built over the index.
query_engine = CitationQueryEngine.from_args(
    index, similarity_top_k=3, citation_chunk_size=512
)

# Ask the question and display the response with its sources and metadata.
response = query_engine.query(query_string)
display_response(
    response,
    show_source=True,
    source_length=100,
    show_source_metadata=True,
)