python/examples/experimental/NLP_Summarization.ipynb
🚩 Create a free WhyLabs account to get more value out of whylogs!
Did you know you can store, visualize, and monitor whylogs profiles with the WhyLabs Observability Platform? Sign up for a free WhyLabs account to leverage the power of whylogs and WhyLabs together!
In this example, we'll look at how we might use whylogs to monitor a document summarization task.
We'll use NLTK and BeautifulSoup to do some of the basic NLP tasks, so let's install the packages we'll need now.
%pip install nltk
%pip install bs4
%pip install whylogs[embeddings]
We'll use the NLTK Reuters corpus as the documents to summarize. As a trivial summarization algorithm, we'll pull out the sentence that contains a document's highest log-entropy weighted term as its summary. Let's start by computing the term-frequency index for the corpus and the term global frequencies and entropies. We'll use NLTK's stemming, stopping, and tokenization for those calculations, but return the unaltered sentence as the summary.
from typing import Any, Dict, List, Optional, Set
import nltk
import numpy as np
from nltk.corpus import reuters
from bs4 import BeautifulSoup
# Fetch the NLTK data this notebook needs: the Reuters corpus, the "punkt"
# tokenizer data, and the stopword lists.
nltk.download("reuters")
nltk.download("punkt")
nltk.download("stopwords")

STEMMER = nltk.stem.PorterStemmer()

# The NLTK tokenizer produces some junk tokens (stray punctuation and quote
# marks), so they are treated as stopwords alongside the English list.
STOPWORDS = set(nltk.corpus.stopwords.words("english")) | {
    ".",
    ",",
    "<",
    ">",
    "'s",
    "''",
    "``",
}
def delete_headline(text: str) -> str:
    """Drop the first line (the headline) of an article.

    NLTK's sentence tokenizer includes the headline in the first sentence
    if we don't manually exclude it.  Text that contains no newline at all
    is returned unchanged.
    """
    _headline, newline, body = text.partition("\n")
    # No newline means there is only a single line, so nothing is removed.
    if not newline:
        return text
    return body
def global_freq(A: np.ndarray) -> np.ndarray:
    """Return the global frequency of every term.

    ``A`` is the 2-D term-frequency index with one row per term and one
    column per document.  Each entry of the result is the total of one row,
    i.e. the number of times that term occurs across all documents.

    Args:
        A: term-frequency matrix of shape ``(n_terms, n_docs)``.

    Returns:
        A float array of length ``n_terms`` holding the row totals.
    """
    # A single vectorized reduction replaces the nested Python loops;
    # dtype=float keeps the float result the loop-based version produced.
    return np.asarray(A).sum(axis=1, dtype=float)
def entropy(A: np.ndarray, gf: np.ndarray) -> np.ndarray:
    """Compute the entropy weight of every term.

    For term ``i`` with per-document proportions ``p_ij = A[i, j] / gf[i]``
    the weight is ``1 + sum_j(p_ij * log(p_ij)) / log(n_docs)``, where terms
    with ``p_ij == 0`` contribute nothing to the sum.

    Args:
        A: term-frequency matrix of shape ``(n_terms, n_docs)``.
        gf: global frequency of each term (length ``n_terms``).

    Returns:
        A float array of length ``n_terms`` with the entropy weights.
    """
    A = np.asarray(A, dtype=float)
    totals = np.asarray(gf, dtype=float).reshape(-1, 1)
    n_terms, n_docs = A.shape

    # With fewer than two documents log(n_docs) is not a usable divisor;
    # the entropy sum is zero there, so the weight is defined as 1.
    if n_docs <= 1:
        return np.ones(n_terms)

    # Terms with zero global frequency would yield 0/0; leave their
    # proportions at zero so they do not inject NaN into the result.
    p = np.divide(A, totals, out=np.zeros_like(A), where=totals != 0)

    # Only non-zero proportions contribute, which also avoids log(0).
    p_log_p = np.zeros_like(p)
    nonzero = p != 0
    p_log_p[nonzero] = p[nonzero] * np.log(p[nonzero])

    return 1.0 + p_log_p.sum(axis=1) / np.log(n_docs)
def get_raw_tokens(file) -> List[str]:
    """Return the case-folded, stopword-filtered tokens of a Reuters article.

    The raw NLTK documents contain a few HTML entities, so BeautifulSoup is
    used to decode them before the NLTK word tokenizer is applied.  The
    headline is removed first.
    """
    body = delete_headline(reuters.raw(file))
    text = BeautifulSoup(body, "html.parser").get_text()

    kept: List[str] = []
    for token in nltk.word_tokenize(text):
        folded = token.casefold()
        if folded not in STOPWORDS:
            kept.append(folded)
    return kept
def get_vocabulary(file) -> Set[str]:
    """Return the set of stemmed terms in the specified Reuters article.

    The headline is excluded.  Tokens come from ``get_raw_tokens`` already
    case-folded and stopword-filtered, so they are passed straight to the
    stemmer.
    """
    return {STEMMER.stem(token) for token in get_raw_tokens(file)}
# Work with (at most) the first 500 articles whose id starts with "train".
file_ids = reuters.fileids()
train_files = [fid for fid in file_ids if fid.startswith("train")]
train_files = train_files[:500]

# Accumulate the stemmed vocabulary over every selected article.
vocab: Set[str] = set()
for file in train_files:
    vocab |= get_vocabulary(file)

ndocs, vocab_size = len(train_files), len(vocab)
print(f"{ndocs} articles {vocab_size} vocabulary")
It will also be handy to have mappings back and forth between each term (as a string) and the term's row in the term-frequency matrix. Let's build those up.
# rev_map[row] is the term stored in that row of the term-frequency matrix;
# vocab_map is the inverse lookup.  Sorting the vocabulary makes the row
# assignment reproducible: iterating the set directly would give an order
# that can change from one interpreter run to the next.
rev_map: List[str] = sorted(vocab)
vocab_map: Dict[str, int] = {term: row for row, term in enumerate(rev_map)}

# Term-frequency index: one row per term, one column per article.
index = np.zeros((vocab_size, ndocs))
for col, file_id in enumerate(train_files):  # file_id: avoid shadowing builtin id()
    for term in (STEMMER.stem(t) for t in get_raw_tokens(file_id)):
        index[vocab_map[term], col] += 1

# Per-term global frequencies and entropy weights over the whole corpus.
gf = global_freq(index)
g = entropy(index, gf)
Now we have the inputs we need to compute the term weights, so we can implement our summarization algorithm. But since we want to monitor our summarization process with whylogs, we'll need to do a little whylogs setup before we start summarizing.
By default, whylogs uses a TransientLogger that produces a new profile for every log() call. For our example, it's nicer to aggregate all the logging into a single profile. So we'll create a simple PersistentLogger to do that.
from whylogs.api.logger.logger import Logger
from whylogs.core import DatasetProfile, DatasetSchema
from whylogs.core.configs import SummaryConfig
from whylogs.core.dataset_profile import logger as dp_logger # because it doesn't like vectors
from whylogs.core.preprocessing import ListView, PreprocessedColumn
from whylogs.core.resolvers import MetricSpec, ResolverSpec, STANDARD_RESOLVER
from whylogs.core.schema import DeclarativeSchema
from whylogs.core.stubs import pd
from whylogs.core.view.column_profile_view import ColumnProfileView
from whylogs.experimental.extras.nlp_metric import BagOfWordsMetric
class PersistentLogger(Logger):
    """Logger that aggregates every ``log()`` call into one ``DatasetProfile``.

    A single profile is created when the logger is instantiated and that same
    profile is handed back for every logging request, so results accumulate
    instead of starting a fresh profile per call.
    """

    def __init__(self, schema: Optional[DatasetSchema] = None):
        """Create the logger and the one profile it will keep updating.

        Args:
            schema: optional schema forwarded to the base ``Logger``.
        """
        super().__init__(schema)
        # NOTE(review): relies on the base Logger exposing the schema as
        # ``self._schema`` after ``__init__`` -- confirm against whylogs.
        self._current_profile = DatasetProfile(schema=self._schema)

    def _get_matching_profiles(
        self,
        obj: Any = None,
        *,
        pandas: Optional[pd.DataFrame] = None,
        row: Optional[Dict[str, Any]] = None,
        schema: Optional[DatasetSchema] = None,
    ) -> List[DatasetProfile]:
        """Return the single persistent profile, whatever the input is.

        ``obj``, ``pandas`` and ``row`` are accepted but not inspected here.

        Raises:
            ValueError: if ``schema`` is given and is not the very schema
                object this logger was created with.
        """
        if schema and schema is not self._schema:
            raise ValueError(
                "You cannot pass a DatasetSchema to an instance of PersistentLogger.log(), "
                "because schema is set once when instantiated, please use TimedRollingLogger(schema) instead."
            )
        return [self._current_profile]
We also need to attach the BagOfWordsMetric to the columns that represent our input articles and output summaries. We log each document as a list of its tokens.
from logging import ERROR

# Only let ERROR and above through from the dataset-profile logger.
dp_logger.setLevel(ERROR)

# Attach a BagOfWordsMetric to the article and summary columns, on top of
# the standard resolvers.
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name=name, metrics=[MetricSpec(BagOfWordsMetric)])
    for name in ("article_bow", "summary_bow")
]

schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)
Now we're finally ready to do some summarization! We'll compute the log entropy weighted term vector for each article as a whole, then use NLTK's sentence tokenizer to split it into sentences. The first sentence that contains the word with the highest weight in the document will be our summary.
# Summarize every training article: the summary is the first sentence that
# contains the article's highest log-entropy weighted term.  Each article and
# its summary are logged (as token lists) to the persistent whylogs profile.
profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), "html.parser").get_text()
    # print(raw.split('\n')[0])  # print article headline
    # print(raw)  # print the whole input article
    raw = delete_headline(raw)

    # Case-fold once per token, then drop stopwords.
    tokens = [t for t in (w.casefold() for w in nltk.word_tokenize(raw)) if t not in STOPWORDS]
    stemmed = [STEMMER.stem(t) for t in tokens]
    if not stemmed:
        # No terms means no sentence can contain a top term: nothing to log.
        continue

    # Raw term counts for this article, indexed by vocabulary row.
    doc_vec = np.zeros(vocab_size)
    for term in stemmed:
        doc_vec[vocab_map[term]] += 1

    # Log-entropy weighting over the whole vocabulary at once; argmax picks
    # the first row holding the maximum weight.
    doc_vec = g * np.log(doc_vec + 1.0)
    max_row = int(np.argmax(doc_vec))
    max_weight = doc_vec[max_row]
    max_term = rev_map[max_row]

    max_sentence = ""
    for sentence in nltk.sent_tokenize(raw):
        tokenized = [t for t in (w.casefold() for w in nltk.word_tokenize(sentence)) if t not in STOPWORDS]
        # A separate name keeps the article-level ``stemmed`` list intact.
        sentence_stems = {STEMMER.stem(t) for t in tokenized}
        if max_term in sentence_stems:
            max_sentence = sentence
            profile = why.log(obj={"article_bow": tokens, "summary_bow": tokenized})
            break
    # max_sentence = max_sentence.replace("\n", " ")
    # print(f"{max_weight} {max_term}: {max_sentence}")
We've logged the full articles as the article_bow column and the summaries as the summary_bow column. Now let's grab the profile from the logger and take a look at it.
def dump_summary(view: ColumnProfileView) -> None:
    """Print selected bag-of-words statistics from a column profile view.

    For both the document-length and term-length components this prints the
    count, mean, standard deviation, max, min and median taken from the
    view's summary dictionary, followed by up to ten frequent terms.
    """
    summary = view.to_summary_dict()
    stats = (
        "counts/n",
        "distribution/mean",
        "distribution/stddev",
        "distribution/max",
        "distribution/min",
        "distribution/median",
    )
    for component in ("doc_length", "term_length"):
        for stat in stats:
            key = f"nlp_bow/{component}:{stat}"
            print(f" {key}: {summary[key]}")

    top_terms = summary['nlp_bow/frequent_terms:frequent_items/frequent_strings'][:10]
    print(f" frequent terms: {[t.value for t in top_terms]}")
# Take a view of the accumulated profile and print the bag-of-words summary
# of each column, separated by a blank line.
view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(view=col_view)
    print()
As expected, we see that the summary documents are shorter than the original articles. We also see some differences and overlap in the most frequent words in the whole articles and the summaries.
# Fresh schema and logger for the second experiment: a BagOfWordsMetric on
# the original-sentence column and on the split-sentence column.
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name=name, metrics=[MetricSpec(BagOfWordsMetric)])
    for name in ("original_bow", "split_bow")
]

schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)
import random
# Log every sentence as "original_bow".  Sentences containing commas are also
# cut in two at a randomly chosen comma and both halves logged as "split_bow";
# sentences without commas are logged to "split_bow" unchanged.
profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), "html.parser").get_text()
    raw = delete_headline(raw)
    for sentence in nltk.sent_tokenize(raw):
        tokens = [t.casefold() for t in nltk.word_tokenize(sentence)]
        why.log(obj={"original_bow": np.array(tokens)})

        phrases = sentence.split(",")
        if len(phrases) > 1:
            # randint is inclusive at both ends, so restrict it to interior
            # cut points: each half then keeps at least one phrase.  The name
            # ``split_at`` also avoids overwriting the term-frequency matrix
            # that is stored in ``index``.
            split_at = random.randint(1, len(phrases) - 1)
            left = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[:split_at]) + ".")]
            right = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[split_at:]))]
            why.log(obj={"split_bow": left})
            profile = why.log(obj={"split_bow": right})
        else:
            profile = why.log(obj={"split_bow": tokens})
# Take a view of the second profile and print the bag-of-words summary of
# each column, separated by a blank line.
view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(view=col_view)
    print()