examples/nbc-headlines/1_scrape.ipynb
This notebooks implements a scraper for NBC News headlines. It uses this sitemap, which provides a list of article headlines + URLs for every month for the past few years.
This dataset is mostly to get a simple, real-world small text dataset for testing embeddings. They're small pieces of text (~dozen words), have a wide range of semantic meaning, and are more "real-world" than some other embeddings datasets out there.
This notebook uses Deno, linkedom, and a few
SQLite extensions to scrape the headlines for a given date range. It creates a single SQL table, articles,
with a few columns like headline and url. By default it will get all article headlines from January 2024 -> present
and save them to a database called headlines-2024.db. Feel free to copy+paste this code into your own custom scraper.
This notebook also just scrapes the data into a SQLite database, it does NOT do any embeddings + vector search.
For examples of those, see ./2_build.ipynb and ./3_search.ipynb.
import { Database, Statement } from "jsr:@db/[email protected]";
import { parseHTML } from "npm:linkedom";
import * as d3 from "npm:d3-time";
import * as sqlitePath from "npm:sqlite-path";
import * as sqliteUrl from "npm:sqlite-url";
import * as sqliteRegex from "npm:sqlite-regex";
// Lowercase month names in calendar order; (index + 1) gives the 1-based
// month number used in the `articles.month` column.
const months = [
  "january", "february", "march", "april",
  "may", "june", "july", "august",
  "september", "october", "november", "december",
];
/**
 * Thin wrapper around a SQLite database holding the `articles` table.
 *
 * Loads the sqlite-path / sqlite-url / sqlite-regex extensions so the
 * prepared insert statement can derive `slug`, `slug_id`, and category
 * columns directly from each article URL inside SQL.
 */
class Db {
  db: Database;
  #stmtInsertArticle: Statement;

  /** @param path SQLite database path, or ":memory:" for an in-memory DB. */
  constructor(path: string) {
    this.db = new Database(path);
    // Extensions provide path_at(), url_path(), and regex_capture(),
    // all used by the insert statement below. Loading is re-disabled
    // immediately afterwards as a precaution.
    this.db.enableLoadExtension = true;
    this.db.loadExtension(sqlitePath.getLoadablePath());
    this.db.loadExtension(sqliteUrl.getLoadablePath());
    this.db.loadExtension(sqliteRegex.getLoadablePath());
    this.db.enableLoadExtension = false;
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS articles(
        id integer primary key autoincrement,
        year integer,
        month integer,
        slug TEXT,
        slug_id TEXT,
        headline TEXT,
        url TEXT,
        category1 TEXT,
        category2 TEXT
      )
    `);
    // The last URL path segment looks like "<slug>-<id>"; regex_capture
    // splits it. category1/category2 come from the leading path segments.
    this.#stmtInsertArticle = this.db.prepare(`
      insert into articles(year, month, slug, slug_id, headline, url, category1, category2)
      select
        :year as year,
        :month as month,
        regex_capture(
          '(?P<slug>.+)-(?P<id>[^-]+)$',
          path_at(url_path(:url), -1),
          'slug'
        ) as slug,
        regex_capture(
          '(?P<slug>.+)-(?P<id>[^-]+)$',
          path_at(url_path(:url), -1),
          'id'
        ) as slug_id,
        :headline as headline,
        :url as url,
        path_at(url_path(:url), 0) as category1,
        iif(
          path_length(url_path(:url)) > 2,
          path_at(url_path(:url), 1),
          null
        ) as category2
    `);
  }

  /**
   * Insert a batch of scraped articles in a single transaction.
   *
   * @param year 4-digit year the articles were published in.
   * @param month 1-based month number (1 = January). NOTE: was previously
   *   typed `text`, which is not a valid TypeScript type; callers pass a number.
   * @param articles Headline + URL pairs scraped from one archive page.
   */
  insertArticles(
    year: number,
    month: number,
    articles: { headline: string; url: string }[],
  ) {
    const tx = this.db.transaction(
      (year: number, month: number, articles: { headline: string; url: string }[]) => {
        for (const article of articles) {
          this.#stmtInsertArticle.run({ ...article, year, month });
        }
      },
    );
    tx(year, month, articles);
  }
}
/**
 * Scrape every headline for one archive month and insert them into the DB.
 *
 * Fetches https://www.nbcnews.com/archive/articles/<year>/<month> and walks
 * the "next" pagination links until no more pages remain.
 *
 * @param db Destination database wrapper.
 * @param year 4-digit year to scrape.
 * @param month Lowercase English month name (e.g. "january"). NOTE: the
 *   original `text` annotation is not a valid TypeScript type.
 */
async function insertMonth(db: Db, year: number, month: string) {
  let url = `https://www.nbcnews.com/archive/articles/${year}/${month}`;
  while (true) {
    const monthPage = await fetch(url).then((r) => r.text());
    const { document: monthPageDoc } = parseHTML(monthPage);
    // linkedom's querySelectorAll returns an array-like supporting .map —
    // TODO confirm against the linkedom version in use.
    const monthEntries = monthPageDoc
      .querySelectorAll(".MonthPage a")
      .map((a) => ({ headline: a.innerText, url: a.getAttribute("href") }));
    // Convert the month name back to its 1-based number for storage.
    db.insertArticles(year, months.findIndex((m) => m === month) + 1, monthEntries);
    // Follow pagination; the "next" link is absent on the last page.
    const next = monthPageDoc.querySelector("a.Pagination__next.Pagination__enable");
    if (!next) {
      break;
    }
    url = `https://www.nbcnews.com${next.getAttribute("href")}`;
  }
}
async function backfill(db, start: Date, end: Date) {
const targets = d3.timeMonths(start, end)
.map(date => ({year: date.getFullYear(), monthIndex: date.getMonth()}));
for(const target of targets) {
console.log(`${target.year} ${target.monthIndex}`)
await insertMonth(db, target.year, months[target.monthIndex]);
}
}
// Scrape headlines from January 2024 through today into an in-memory
// database, then persist a compact on-disk copy with VACUUM INTO.
const db = new Db(":memory:");
const start = new Date("2024-01-01");
const end = new Date();
await backfill(db, start, end);
db.db.exec("vacuum into 'headlines-2024.db'");