Files
vyndr/scripts/scrape-sports-reference.js
T

227 lines
7.7 KiB
JavaScript

#!/usr/bin/env node
/**
* Sports-Reference scraper — monthly refresh.
*
* Pulls referee stats and coach career data from Basketball Reference's
* public HTML pages. Polite by design:
* - 1 request per 5 seconds (well under the rate they tolerate)
* - User-Agent identifies us so they can email us if anything's off
* - --dry-run flag for safe local experiments
*
* Usage:
* node scripts/scrape-sports-reference.js refs # refresh ref profiles
* node scripts/scrape-sports-reference.js coaches # refresh coach profiles
* node scripts/scrape-sports-reference.js --dry-run # parse + log, no DB writes
* node scripts/scrape-sports-reference.js --yes # skip confirmation prompt
*
* Sources:
* https://www.basketball-reference.com/referees/
* https://www.basketball-reference.com/coaches/
*
* If network access from your host is blocked, this script accepts a saved
* HTML fixture via REF_HTML_FILE or COACH_HTML_FILE env vars (used by the
* unit test that ships with this codebase).
*/
// This file is a command-line entry point: refuse to load when it is pulled in
// through require() instead of being executed directly with node.
// NOTE(review): because this throws on import, the module.exports assignment at
// the bottom of the file cannot be reached by another module — confirm whether
// the parsers are meant to be importable.
const invokedDirectly = require.main === module;
if (!invokedDirectly) {
  throw new Error('Run directly: node scripts/scrape-sports-reference.js');
}
const fs = require('fs');
const path = require('path');
const readline = require('readline');
const axios = require('axios');
const cheerio = require('cheerio');
require('dotenv').config({ path: path.join(__dirname, '..', '.env') });
const { getSupabaseServiceClient } = require('../src/utils/supabase');
// Command-line arguments after the node binary and the script path.
const args = process.argv.slice(2);
// --dry-run: main() prints a sample of the parsed profiles and skips the upsert.
const dryRun = args.includes('--dry-run');
// --yes: confirm() returns true without prompting.
const skipConfirm = args.includes('--yes');
// First argument that is not a --flag; falls back to 'refs' when none is given.
const target = args.find((a) => !a.startsWith('--')) || 'refs';
// Sent as the User-Agent header by fetchPage().
const USER_AGENT = 'VYNDR Research Bot (contact@vyndr.app)';
// Delay in milliseconds that main() waits before it starts scraping.
const THROTTLE_MS = 5_000;
// Per-request timeout in milliseconds handed to axios by fetchPage().
const HTTP_TIMEOUT_MS = 20_000;
// Index pages scraped for referee and coach rows respectively.
const REF_INDEX_URL = 'https://www.basketball-reference.com/referees/';
const COACH_INDEX_URL = 'https://www.basketball-reference.com/coaches/';
/**
 * Pause for a number of milliseconds.
 * @param {number} ms - Delay passed straight to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Return the HTML for an index page, either from a local fixture or over HTTP.
 *
 * The fixture path is read from REF_HTML_FILE when `url` is REF_INDEX_URL and
 * from COACH_HTML_FILE for every other URL. When that variable is set the file
 * is read synchronously and no network request is made.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<string>} File contents, or the axios response body.
 */
async function fetchPage(url) {
  const envKey = url === REF_INDEX_URL ? 'REF_HTML_FILE' : 'COACH_HTML_FILE';
  const fixturePath = process.env[envKey];
  if (!fixturePath) {
    const response = await axios.get(url, {
      timeout: HTTP_TIMEOUT_MS,
      headers: { 'User-Agent': USER_AGENT, Accept: 'text/html' },
    });
    return response.data;
  }
  return fs.readFileSync(fixturePath, 'utf8');
}
// Basketball-Reference tables share one layout: <table id="..."> wrapping
// <thead>/<tbody>/<tr>, where each <th>/<td> cell carries a data-stat
// attribute. That attribute is the key everything below parses on.
/**
 * Turn the body rows of one table into plain objects keyed by data-stat.
 *
 * Rows with the class `thead` or `rowSep` are skipped, cells without a
 * data-stat attribute are ignored, and rows that yield no keys are dropped.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} tableSelector - Selector for the table, e.g. 'table#refs'.
 * @returns {Array<Object<string, string>>} One object per kept row; values are
 *   the trimmed cell text.
 */
function parseTable($, tableSelector) {
  const isSeparator = ($row) => $row.hasClass('thead') || $row.hasClass('rowSep');
  const parsed = [];
  $(`${tableSelector} tbody tr`).each((_rowIndex, rowEl) => {
    const $row = $(rowEl);
    if (isSeparator($row)) return;
    const record = {};
    $row.find('th, td').each((_cellIndex, cellEl) => {
      const $cell = $(cellEl);
      const stat = $cell.attr('data-stat');
      if (stat) record[stat] = $cell.text().trim();
    });
    if (Object.keys(record).length > 0) parsed.push(record);
  });
  return parsed;
}
/**
 * Convert a scraped cell value to a finite number, or null when it is missing
 * or not numeric.
 *
 * Only numbers and strings are accepted. Strings are trimmed first because
 * Number('   ') evaluates to 0, which would turn a blank cell into a made-up
 * zero. Other types (booleans, arrays, objects) also coerce to numbers under
 * Number(), so they are rejected outright.
 *
 * @param {*} v - Raw value, normally the text of a table cell.
 * @returns {number|null} The finite number, or null.
 */
function num(v) {
  if (typeof v === 'number') return Number.isFinite(v) ? v : null;
  if (typeof v !== 'string') return null;
  const text = v.trim();
  if (text === '') return null;
  const parsed = Number(text);
  return Number.isFinite(parsed) ? parsed : null;
}
/**
 * Map raw referee table rows onto ref profile records.
 *
 * Expected data-stat keys: ref, g (games), fouls_per_g, ft_per_g, with the
 * alternates player/name, games, fouls_per_game and ft_per_game accepted as
 * fallbacks. Rows without a usable name are dropped.
 *
 * @param {Array<Object<string, string>>} rows - Output of parseTable.
 * @returns {Array<Object>} Ref profile records.
 */
function parseRefRows(rows) {
  return rows.flatMap((row) => {
    const refName = row.ref ?? row.player ?? row.name ?? null;
    if (!refName) return [];
    return [{
      ref_name: refName,
      games_reffed: num(row.g ?? row.games),
      avg_fouls_per_game: num(row.fouls_per_g ?? row.fouls_per_game),
      avg_free_throws_per_game: num(row.ft_per_g ?? row.ft_per_game),
      // BR does not publish pace_impact or home_whistle_bias. Both are derived
      // later by a SQL view over historical games, so the scrape deliberately
      // emits null for them.
      pace_impact: null,
      home_whistle_bias: null,
    }];
  });
}
/**
 * Map raw coach table rows onto coach profile records.
 *
 * Only career-level fields are produced here; team, current_team_pace and
 * primary_player are added by a separate enrichment pass because BR's coaches
 * index only carries career totals. Rows without a usable name are dropped.
 *
 * @param {Array<Object<string, string>>} rows - Output of parseTable.
 * @returns {Array<{coach_name: string, career_avg_pace: ?number, tenure_games: ?number}>}
 */
function parseCoachRows(rows) {
  const profiles = [];
  rows.forEach((row) => {
    const coachName = row.coach ?? row.coaches ?? row.name ?? null;
    if (!coachName) return;
    profiles.push({
      coach_name: coachName,
      career_avg_pace: num(row.pace ?? row.pace_p100),
      tenure_games: num(row.g),
    });
  });
  return profiles;
}
/**
 * Fetch the referee index page and parse its `table#refs` rows into profiles.
 * @returns {Promise<Array<Object>>} Output of parseRefRows.
 */
async function scrapeRefs() {
  const $ = cheerio.load(await fetchPage(REF_INDEX_URL));
  return parseRefRows(parseTable($, 'table#refs'));
}
/**
 * Fetch the coach index page and parse its `table#coaches` rows into profiles.
 * @returns {Promise<Array<Object>>} Output of parseCoachRows.
 */
async function scrapeCoaches() {
  const markup = await fetchPage(COACH_INDEX_URL);
  const coachRows = parseTable(cheerio.load(markup), 'table#coaches');
  const coachProfiles = parseCoachRows(coachRows);
  return coachProfiles;
}
/**
 * Upsert referee profiles into `ref_profiles` in batches of 50, keyed on
 * `ref_name`.
 *
 * Profiles are de-duplicated by `ref_name` first (the last occurrence wins).
 * Postgres rejects an INSERT ... ON CONFLICT DO UPDATE statement that would
 * touch the same row twice, so a single repeated name would otherwise fail the
 * whole batch it lands in.
 *
 * A failed batch is logged and counted; later batches are still attempted.
 *
 * @param {Array<Object>} profiles - Records produced by parseRefRows.
 * @returns {Promise<{captured: number, errored: number}>} Rows written and
 *   rows belonging to failed batches.
 */
async function upsertRefs(profiles) {
  const byName = new Map();
  for (const profile of profiles) {
    byName.set(profile.ref_name, profile);
  }
  const unique = [...byName.values()];
  const dropped = profiles.length - unique.length;
  if (dropped > 0) {
    console.warn(`[scraper] refs: dropped ${dropped} duplicate ref_name row(s) before upsert`);
  }

  const supabase = getSupabaseServiceClient();
  // One timestamp for the whole run so every row shares the same last_updated.
  const stamp = new Date().toISOString();
  const batchSize = 50;
  let captured = 0;
  let errored = 0;
  for (let i = 0; i < unique.length; i += batchSize) {
    const batch = unique.slice(i, i + batchSize).map((p) => ({ ...p, last_updated: stamp }));
    const { error } = await supabase
      .from('ref_profiles')
      .upsert(batch, { onConflict: 'ref_name' });
    if (error) {
      console.warn(`[scraper] refs batch ${i / batchSize} failed: ${error.message}`);
      errored += batch.length;
    } else {
      captured += batch.length;
    }
  }
  return { captured, errored };
}
/**
 * Upsert coach profiles into `coach_profiles`, one row per request.
 *
 * The conflict target is (coach_name, team, sport), but BR's index page has no
 * team or sport columns. Each row is therefore written with the placeholder
 * team 'UNK' and sport 'nba', leaving the real values to manual or follow-up
 * enrichment. A missing tenure_games is stored as 0.
 *
 * A failed row is logged and counted; later rows are still attempted.
 *
 * @param {Array<Object>} profiles - Records produced by parseCoachRows.
 * @returns {Promise<{captured: number, errored: number}>} Rows written and
 *   rows that failed.
 */
async function upsertCoaches(profiles) {
  const client = getSupabaseServiceClient();
  const updatedAt = new Date().toISOString();
  const tally = { captured: 0, errored: 0 };
  for (const profile of profiles) {
    const result = await client.from('coach_profiles').upsert(
      {
        coach_name: profile.coach_name,
        team: 'UNK',
        sport: 'nba',
        career_avg_pace: profile.career_avg_pace,
        tenure_games: profile.tenure_games || 0,
        last_updated: updatedAt,
      },
      { onConflict: 'coach_name,team,sport' },
    );
    if (result.error) {
      console.warn(`[scraper] coach upsert failed for ${profile.coach_name}: ${result.error.message}`);
      tally.errored += 1;
    } else {
      tally.captured += 1;
    }
    // Short pause between rows to pace the database writes.
    await sleep(50);
  }
  return tally;
}
/**
 * Ask a yes/no question on the terminal.
 *
 * Returns true immediately when --yes was passed. Otherwise the answer is
 * accepted only if it is "y" or "yes" (case-insensitive, surrounding
 * whitespace ignored).
 *
 * If stdin closes before an answer arrives (EOF from a pipe or a
 * non-interactive run), readline never invokes the question callback. The
 * 'close' event is therefore treated as an empty answer, i.e. "no", so the
 * promise always settles.
 *
 * @param {string} question - Prompt text written to stdout.
 * @returns {Promise<boolean>} True when the user confirmed.
 */
async function confirm(question) {
  if (skipConfirm) return true;
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  try {
    const answer = await new Promise((resolve) => {
      rl.once('close', () => resolve(''));
      rl.question(question, resolve);
    });
    return /^y(es)?$/i.test(answer.trim());
  } finally {
    // Always release stdin, even if reading the answer throws.
    rl.close();
  }
}
/**
 * CLI entry point: validate the target, confirm with the user, scrape, then
 * either print a sample (--dry-run) or upsert the parsed profiles.
 *
 * Failure signalling:
 * - An unknown target prints usage and exits with status 2.
 * - Declining the confirmation prints 'aborted' and exits with status 0.
 * - Zero parsed rows throws, so an unreadable or changed page is not reported
 *   as a successful refresh.
 * - When the upsert summary reports errored rows, process.exitCode is set to 1
 *   so schedulers can see the partial failure; the summary is still printed.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the scrape yields no profiles.
 */
async function main() {
  if (target !== 'refs' && target !== 'coaches') {
    console.error('Usage: scrape-sports-reference.js refs|coaches [--dry-run] [--yes]');
    process.exit(2);
  }
  if (!dryRun) {
    const ok = await confirm(`This will upsert ${target} profiles into ${process.env.SUPABASE_URL || '(unknown)'}. Continue? (y/n) `);
    if (!ok) {
      console.log('aborted');
      process.exit(0);
    }
  }
  await sleep(THROTTLE_MS);

  // Both targets follow the same scrape -> report -> upsert sequence.
  const job = target === 'refs'
    ? { label: 'ref', scrape: scrapeRefs, upsert: upsertRefs }
    : { label: 'coach', scrape: scrapeCoaches, upsert: upsertCoaches };

  const profiles = await job.scrape();
  console.log(`[scraper] parsed ${profiles.length} ${job.label} profiles`);
  if (profiles.length === 0) {
    throw new Error(`no ${job.label} rows parsed from the index page; refusing to report success`);
  }
  if (dryRun) {
    console.log(JSON.stringify(profiles.slice(0, 5), null, 2));
    return;
  }
  const summary = await job.upsert(profiles);
  console.log(`[scraper] ${target} upsert summary:`, summary);
  if (summary.errored > 0) {
    process.exitCode = 1;
  }
}
/**
 * Last-resort handler: print the failure message and exit with status 1.
 * @param {Error} err - Rejection reason from main().
 */
function reportFatal(err) {
  console.error('[scraper] fatal:', err.message);
  process.exit(1);
}

// Start the run as soon as the file is executed.
main().catch(reportFatal);

// Pure parsing helpers exposed for reuse.
// NOTE(review): the require.main guard near the top of this file throws on
// import, so these exports are currently unreachable from other modules —
// confirm which of the two is intended.
module.exports = { parseTable, parseRefRows, parseCoachRows };