examples/nbc-headlines/1_scrape.ipynb
This notebooks implements a scraper for NBC News headlines. It uses this sitemap, which provides a list of article headlines + URLs for every month for the past few years.
This dataset is mostly to get a simple, real-world small text dataset for testing embeddings. They're small pieces of text (~dozen words), have a wide range of semantic meaning, and are more "real-world" than some other embeddings datasets out there.
This notebook uses Deno, linkedom, and a few
SQLite extensions to scrape the headlines for a given date range. It creates a single SQL table, articles,
with a few columns like headline and url. By default it will get all article headlines from January 2024 -> present
and save them to a database called headlines-2024.db. Feel free to copy+paste this code into your own custom scraper.
This notebook also just scrapes the data into a SQLite database, it does NOT do any embeddings + vector search.
For examples of those, see ./2_build.ipynb and ./3_search.ipynb.
import { Database, Statement } from "jsr:@db/[email protected]";
import { parseHTML } from "npm:linkedom";
import * as d3 from "npm:d3-time";
import * as sqlitePath from "npm:sqlite-path";
import * as sqliteUrl from "npm:sqlite-url";
import * as sqliteRegex from "npm:sqlite-regex";
// Lowercase month names in calendar order; (index + 1) gives the 1-based
// month number used in the `articles.month` column.
const months = [
  "january", "february", "march", "april",
  "may", "june", "july", "august",
  "september", "october", "november", "december",
];
/**
 * Thin wrapper around a SQLite database holding the `articles` table.
 *
 * Loads the sqlite-path / sqlite-url / sqlite-regex extensions so the
 * prepared insert statement can derive `slug`, `slug_id`, and category
 * columns directly from each article URL inside SQL.
 */
class Db {
  db: Database;
  #stmtInsertArticle: Statement;

  /** @param path SQLite database path, or ":memory:" for an in-memory DB. */
  constructor(path: string) {
    this.db = new Database(path);
    // Extensions provide path_at(), url_path(), and regex_capture(),
    // all used by the insert statement below. Loading is re-disabled
    // immediately afterwards as a precaution.
    this.db.enableLoadExtension = true;
    this.db.loadExtension(sqlitePath.getLoadablePath());
    this.db.loadExtension(sqliteUrl.getLoadablePath());
    this.db.loadExtension(sqliteRegex.getLoadablePath());
    this.db.enableLoadExtension = false;
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS articles(
        id integer primary key autoincrement,
        year integer,
        month integer,
        slug TEXT,
        slug_id TEXT,
        headline TEXT,
        url TEXT,
        category1 TEXT,
        category2 TEXT
      )
    `);
    // The last URL path segment looks like "<slug>-<id>"; regex_capture
    // splits it. category1/category2 come from the leading path segments.
    this.#stmtInsertArticle = this.db.prepare(`
      insert into articles(year, month, slug, slug_id, headline, url, category1, category2)
      select
        :year as year,
        :month as month,
        regex_capture(
          '(?P<slug>.+)-(?P<id>[^-]+)$',
          path_at(url_path(:url), -1),
          'slug'
        ) as slug,
        regex_capture(
          '(?P<slug>.+)-(?P<id>[^-]+)$',
          path_at(url_path(:url), -1),
          'id'
        ) as slug_id,
        :headline as headline,
        :url as url,
        path_at(url_path(:url), 0) as category1,
        iif(
          path_length(url_path(:url)) > 2,
          path_at(url_path(:url), 1),
          null
        ) as category2
    `);
  }

  /**
   * Insert a batch of scraped articles in a single transaction.
   *
   * @param year 4-digit year the articles were published in.
   * @param month 1-based month number (1 = January). NOTE: was previously
   *   typed `text`, which is not a valid TypeScript type; callers pass a number.
   * @param articles Headline + URL pairs scraped from one archive page.
   */
  insertArticles(
    year: number,
    month: number,
    articles: { headline: string; url: string }[],
  ) {
    const tx = this.db.transaction(
      (year: number, month: number, articles: { headline: string; url: string }[]) => {
        for (const article of articles) {
          this.#stmtInsertArticle.run({ ...article, year, month });
        }
      },
    );
    tx(year, month, articles);
  }
}
/**
 * Scrape every headline for one archive month and insert them into the DB.
 *
 * Fetches https://www.nbcnews.com/archive/articles/<year>/<month> and walks
 * the "next" pagination links until no more pages remain.
 *
 * @param db Destination database wrapper.
 * @param year 4-digit year to scrape.
 * @param month Lowercase English month name (e.g. "january"). NOTE: the
 *   original `text` annotation is not a valid TypeScript type.
 */
async function insertMonth(db: Db, year: number, month: string) {
  let url = `https://www.nbcnews.com/archive/articles/${year}/${month}`;
  while (true) {
    const monthPage = await fetch(url).then((r) => r.text());
    const { document: monthPageDoc } = parseHTML(monthPage);
    // linkedom's querySelectorAll returns an array-like supporting .map —
    // TODO confirm against the linkedom version in use.
    const monthEntries = monthPageDoc
      .querySelectorAll(".MonthPage a")
      .map((a) => ({ headline: a.innerText, url: a.getAttribute("href") }));
    // Convert the month name back to its 1-based number for storage.
    db.insertArticles(year, months.findIndex((m) => m === month) + 1, monthEntries);
    // Follow pagination; the "next" link is absent on the last page.
    const next = monthPageDoc.querySelector("a.Pagination__next.Pagination__enable");
    if (!next) {
      break;
    }
    url = `https://www.nbcnews.com${next.getAttribute("href")}`;
  }
}
async function backfill(db, start: Date, end: Date) {
const targets = d3.timeMonths(start, end)
.map(date => ({year: date.getFullYear(), monthIndex: date.getMonth()}));
for(const target of targets) {
console.log(`${target.year} ${target.monthIndex}`)
await insertMonth(db, target.year, months[target.monthIndex]);
}
}
// Scrape headlines from January 2024 through today into an in-memory
// database, then persist a compact on-disk copy with VACUUM INTO.
const db = new Db(":memory:");
const start = new Date("2024-01-01");
const end = new Date();
await backfill(db, start, end);
db.db.exec("vacuum into 'headlines-2024.db'");