Back to Whylogs

Document Summarization Example

python/examples/experimental/NLP_Summarization.ipynb

1.6.4 · 10.9 KB
Original Source

🚩 Create a free WhyLabs account to get more value out of whylogs!

Did you know you can store, visualize, and monitor whylogs profiles with the WhyLabs Observability Platform? Sign up for a free WhyLabs account to leverage the power of whylogs and WhyLabs together!

Document Summarization Example

In this example, we'll look at how we might use whylogs to monitor a document summarization task.

We'll use NLTK and BeautifulSoup to do some of the basic NLP tasks, so let's install the packages we'll need now.

python
%pip install nltk
%pip install bs4
%pip install whylogs[embeddings]

We'll use the NLTK Reuters corpus as the documents to summarize. As a trivial summarization algorithm, we'll pull out the sentence that contains a document's highest log-entropy weighted term as its summary. Let's start by computing the term-frequency index for the corpus, along with each term's global frequency and entropy. We'll use NLTK's stemming, stopping, and tokenization for those calculations, but return the unaltered sentence as the summary.

python
from typing import Any, Dict, List, Optional, Set

import nltk
import numpy as np

from nltk.corpus import reuters
from bs4 import BeautifulSoup

# Fetch the NLTK data used below: the Reuters corpus, the punkt tokenizer
# data, and the stopword lists.
for resource in ('reuters', 'punkt', 'stopwords'):
    nltk.download(resource)

STEMMER = nltk.stem.PorterStemmer()

# The NLTK tokenizer produces some junk tokens, so treat them as stopwords too.
STOPWORDS = set(nltk.corpus.stopwords.words("english")) | {
    ".",
    ",",
    "<",
    ">",
    "'s",
    "''",
    "``",
}


def delete_headline(text: str) -> str:
    '''
    Drop the first line (the headline) of an article.

    NLTK's sentence tokenizer folds the headline into the first sentence
    unless it is manually excluded. Text that contains no newline at all
    is returned unchanged.
    '''
    _headline, newline, body = text.partition("\n")
    return body if newline else text


def global_freq(A: np.ndarray) -> np.ndarray:
    '''
    Compute each term's global frequency.

    A is the 2-D term-frequency index. The result has one float entry per
    row of A, holding the sum of that row across all of its columns.
    '''
    # Cast to float so the result dtype does not depend on the dtype of A.
    return np.asarray(A, dtype=float).sum(axis=1)


def entropy(A: np.ndarray, gf: np.ndarray) -> np.ndarray:
    '''
    Compute the entropy weight of every term.

    A is the 2-D term-frequency index and gf holds one global frequency per
    row of A. For row i the weight is

        g[i] = 1 + sum_j(p_ij * log(p_ij)) / log(N),   p_ij = A[i, j] / gf[i]

    where N is the number of columns of A and 0 * log(0) is taken as 0.
    Counts are assumed to be non-negative.

    Degenerate inputs get a defined value instead of NaN:
      * a row with gf[i] == 0 (the term never occurs) gets weight 0;
      * with a single column (log(N) == 0) every occurring term gets weight 1.
    '''
    A = np.asarray(A, dtype=float)
    gf = np.asarray(gf, dtype=float)
    n_terms, n_docs = A.shape

    # p[i, j] = A[i, j] / gf[i]; rows whose global frequency is 0 stay all-zero
    # instead of triggering a 0/0 division.
    denom = gf[:, np.newaxis]
    p = np.divide(A, denom, out=np.zeros_like(A), where=denom != 0)

    # p * log(p), skipping zero entries so that 0 * log(0) contributes 0.
    plogp = np.zeros_like(p)
    positive = p > 0
    plogp[positive] = p[positive] * np.log(p[positive])

    if n_docs > 1:
        g = 1.0 + plogp.sum(axis=1) / np.log(n_docs)
    else:
        # log(N) is 0 (or undefined) here, so the normalisation cannot be applied.
        g = np.ones(n_terms)
    g[gf == 0] = 0.0
    return g


def get_raw_tokens(file) -> List[str]:
    '''
    Return the case-folded, stopword-free word tokens of a Reuters article body.

    The raw NLTK documents contain a few HTML entities, so BeautifulSoup decodes
    them before the NLTK word tokenizer is applied. The headline is skipped.
    '''
    body = delete_headline(reuters.raw(file))
    text = BeautifulSoup(body, "html.parser").get_text()
    folded = (token.casefold() for token in nltk.word_tokenize(text))
    return [token for token in folded if token not in STOPWORDS]


def get_vocabulary(file) -> Set[str]:
    '''
    Returns the set of stemmed terms in the specified Reuters article (excluding headline).
    '''
    # get_raw_tokens() already case-folds its output, so the tokens are stemmed
    # as-is, exactly as the term-frequency index is built.
    return {STEMMER.stem(token) for token in get_raw_tokens(file)}


# Work with the first 500 articles whose file ID starts with "train".
file_ids = reuters.fileids()
train_files = [fid for fid in file_ids if fid.startswith("train")][:500]

# The corpus vocabulary is the union of the per-article stemmed vocabularies.
vocab: Set[str] = set()
for file in train_files:
    vocab |= get_vocabulary(file)

ndocs = len(train_files)
vocab_size = len(vocab)
print(f"{ndocs} articles   {vocab_size} vocabulary")

It will also be handy to have mappings back and forth between each term (as a string) and the term's row in the term-frequency matrix. Let's build those up.

python
# Assign rows in sorted term order. Iterating the vocabulary set directly would
# give an order that can change between interpreter runs (string hashing is
# randomized), which makes the row assignment, and any tie-breaking that
# depends on it, irreproducible.
rev_map: List[str] = sorted(vocab)                                         # row -> term
vocab_map: Dict[str, int] = {term: row for row, term in enumerate(rev_map)}  # term -> row

# Term-frequency index: one row per term, one column per training article.
index = np.zeros((vocab_size, ndocs))
for col, file_id in enumerate(train_files):
    for token in get_raw_tokens(file_id):
        index[vocab_map[STEMMER.stem(token)], col] += 1

# Per-term global frequencies and entropy weights derived from the index.
gf = global_freq(index)
g = entropy(index, gf)

Now we have the inputs we need to compute the term weights, so we can implement our summarization algorithm. But since we want to monitor our summarization process with whylogs, we'll need to do a little whylogs setup before we start summarizing.

By default, whylogs uses a TransientLogger that produces a new profile for every log() call. For our example, it's nicer to aggregate all the logging into a single profile. So we'll create a simple PersistentLogger to do that.

python
from whylogs.api.logger.logger import Logger
from whylogs.core import DatasetProfile, DatasetSchema
from whylogs.core.configs import SummaryConfig
from whylogs.core.dataset_profile import logger as dp_logger  # because it doesn't like vectors
from whylogs.core.preprocessing import ListView, PreprocessedColumn
from whylogs.core.resolvers import MetricSpec, ResolverSpec, STANDARD_RESOLVER
from whylogs.core.schema import DeclarativeSchema
from whylogs.core.stubs import pd
from whylogs.core.view.column_profile_view import ColumnProfileView
from whylogs.experimental.extras.nlp_metric import BagOfWordsMetric

class PersistentLogger(Logger):
    '''
    Logger that keeps one DatasetProfile for its whole lifetime.

    The profile is created once in __init__ and is the only profile ever
    returned by _get_matching_profiles(), so everything logged through this
    instance is aggregated into that single profile.
    '''

    def __init__(self, schema: Optional[DatasetSchema] = None):
        super().__init__(schema)
        # NOTE(review): self._schema is presumably set by Logger.__init__ -- confirm.
        self._current_profile = DatasetProfile(schema=self._schema)

    def _get_matching_profiles(
        self,
        obj: Any = None,
        *,
        pandas: Optional[pd.DataFrame] = None,
        row: Optional[Dict[str, Any]] = None,
        schema: Optional[DatasetSchema] = None,
    ) -> List[DatasetProfile]:
        '''
        Return the single persistent profile, whatever data is being logged.

        NOTE(review): presumably the hook Logger.log() uses to choose which
        profiles to update -- confirm against the whylogs Logger base class.

        Raises:
            ValueError: if a schema is passed that is not the one this logger
                was created with.
        '''
        if schema and schema is not self._schema:
            raise ValueError(
                "You cannot pass a DatasetSchema to an instance of PersistentLogger.log(), "
                "because schema is set once when instantiated, please use TimedRollingLogger(schema) instead."
            )
        return [self._current_profile]

We also need to attach the BagOfWordsMetric to the columns that represent our input articles and output summaries. We log each document as a list of its tokens.

python
from logging import ERROR

# Only let ERROR-level (and above) messages through from the dataset-profile logger.
dp_logger.setLevel(ERROR)

# On top of the standard resolvers, attach a BagOfWordsMetric to the article
# column and to the summary column.
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name=bow_column, metrics=[MetricSpec(BagOfWordsMetric)])
    for bow_column in ("article_bow", "summary_bow")
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)

Now we're finally ready to do some summarization! We'll compute the log entropy weighted term vector for each article as a whole, then use NLTK's sentence tokenizer to split it into sentences. The first sentence that contains the word with the highest weight in the document will be our summary.

python
profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    # print(raw.split('\n')[0])   # print article headline
    # print(raw)  # print the whole input article
    raw = delete_headline(raw)
    # Case-fold each token once, then drop stopwords.
    tokens = [t for t in (w.casefold() for w in nltk.word_tokenize(raw)) if t not in STOPWORDS]
    stemmed = [STEMMER.stem(t) for t in tokens]

    # Raw term counts for this article, one slot per vocabulary row.
    doc_vec = np.zeros(vocab_size)
    for term in stemmed:
        doc_vec[vocab_map[term]] += 1

    # Log-entropy weights for the whole vocabulary in one vectorized step.
    doc_vec = g * np.log(doc_vec + 1.0)
    # np.argmax returns the first index of the maximum, i.e. the same row a
    # left-to-right scan with a strict ">" comparison would select.
    max_row = int(np.argmax(doc_vec))
    max_weight = doc_vec[max_row]
    max_term = rev_map[max_row]

    # The summary is the first sentence containing the top-weighted term.
    # Articles in which no sentence matches are not logged at all.
    sentences = nltk.sent_tokenize(raw)
    max_sentence = ""
    for sentence in sentences:
        tokenized = [t for t in (w.casefold() for w in nltk.word_tokenize(sentence)) if t not in STOPWORDS]
        stemmed = [STEMMER.stem(t) for t in tokenized]
        if max_term in stemmed:
            max_sentence = sentence
            profile = why.log(obj={"article_bow": tokens, "summary_bow": tokenized})
            break
    # max_sentence = max_sentence.replace("\n", " ")
    # print(f"{max_weight} {max_term}:   {max_sentence}")

We've logged the full articles as the article_bow column and the summaries as the summary_bow column. Now let's grab the profile from the logger and take a look at it.

python
def dump_summary(view: ColumnProfileView) -> None:
    '''
    Print selected bag-of-words statistics from a column profile view.

    Reads the view's summary dict and prints the count and the distribution
    statistics for document length and for term length, followed by the
    values of the first ten entries of the frequent-terms list.
    '''
    stats = view.to_summary_dict()
    report_keys = (
        "nlp_bow/doc_length:counts/n",
        "nlp_bow/doc_length:distribution/mean",
        "nlp_bow/doc_length:distribution/stddev",
        "nlp_bow/doc_length:distribution/max",
        "nlp_bow/doc_length:distribution/min",
        "nlp_bow/doc_length:distribution/median",
        "nlp_bow/term_length:counts/n",
        "nlp_bow/term_length:distribution/mean",
        "nlp_bow/term_length:distribution/stddev",
        "nlp_bow/term_length:distribution/max",
        "nlp_bow/term_length:distribution/min",
        "nlp_bow/term_length:distribution/median",
    )
    for stat_key in report_keys:
        value = stats[stat_key]
        print(f"    {stat_key}: {value}")

    top_terms = stats['nlp_bow/frequent_terms:frequent_items/frequent_strings'][:10]
    print(f"    frequent terms: {[item.value for item in top_terms]}")


# Take a view of the accumulated profile and dump each column's statistics.
view = profile.view()
columns = view.get_columns()
for col_name in columns:
    col_view = columns[col_name]
    print(f"{col_name}:")
    dump_summary(col_view)
    print()

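As expected, we see that the summary documents are shorter than the original articles. We also see some differences and overlap in the most frequent words in the whole articles and the summaries.

As a second experiment, we'll log every sentence of each article as the original_bow column, and the same sentence split in two at a randomly chosen comma as the split_bow column, then look at the resulting profile in the same way.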

python
# Fresh schema and logger for the second experiment: a BagOfWordsMetric on the
# original-sentence column and on the split-sentence column.
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name=bow_column, metrics=[MetricSpec(BagOfWordsMetric)])
    for bow_column in ("original_bow", "split_bow")
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)

import random

# Log every sentence once as "original_bow", and as "split_bow" either whole
# (no comma) or as two documents cut at a randomly chosen comma.
profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    raw = delete_headline(raw)
    sentences = nltk.sent_tokenize(raw)
    for sentence in sentences:
        tokens = [t.casefold() for t in nltk.word_tokenize(sentence)]
        why.log(obj={"original_bow": np.array(tokens)})
        phrases = sentence.split(",")
        if len(phrases) > 1:
            # random.randint() includes both endpoints, so cut at 1..len-1 to
            # guarantee that each side keeps at least one phrase. The cut point
            # gets its own name so the term-frequency matrix `index` survives.
            split_at = random.randint(1, len(phrases) - 1)
            left = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[:split_at]) + ".")]
            right = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[split_at:]))]
            why.log(obj={"split_bow": left})
            profile = why.log(obj={"split_bow": right})
        else:
            profile = why.log(obj={"split_bow": tokens})

# Take a view of the second profile and dump each column's statistics.
view = profile.view()
columns = view.get_columns()
for col_name in columns:
    col_view = columns[col_name]
    print(f"{col_name}:")
    dump_summary(col_view)
    print()