#!/usr/bin/env node
/**
 * Sports-Reference scraper — monthly refresh.
 *
 * Pulls referee stats and coach career data from Basketball Reference's
 * public HTML pages. Polite by design:
 *   - 1 request per 5 seconds (well under the rate they tolerate)
 *   - User-Agent identifies us so they can email us if anything's off
 *   - --dry-run flag for safe local experiments
 *
 * Usage:
 *   node scripts/scrape-sports-reference.js refs       # refresh ref profiles
 *   node scripts/scrape-sports-reference.js coaches    # refresh coach profiles
 *   node scripts/scrape-sports-reference.js --dry-run  # parse + log, no DB writes
 *   node scripts/scrape-sports-reference.js --yes      # skip confirmation prompt
 *
 * Sources:
 *   https://www.basketball-reference.com/referees/
 *   https://www.basketball-reference.com/coaches/
 *
 * If network access from your host is blocked, this script accepts a saved
 * HTML fixture via REF_HTML_FILE or COACH_HTML_FILE env vars (used by the
 * unit test that ships with this codebase).
 */

// NOTE(review): this guard makes the module throw when it is require()'d, so
// the `module.exports` at the bottom of the file cannot be consumed by an
// importer. Kept as-is to preserve existing behavior — confirm which of the
// two is intended.
if (require.main !== module) {
  throw new Error('Run directly: node scripts/scrape-sports-reference.js');
}

const fs = require('fs');
const path = require('path');
const readline = require('readline');
const axios = require('axios');
const cheerio = require('cheerio');

require('dotenv').config({ path: path.join(__dirname, '..', '.env') });

const { getSupabaseServiceClient } = require('../src/utils/supabase');

const args = process.argv.slice(2);
const dryRun = args.includes('--dry-run');
const skipConfirm = args.includes('--yes');
// First non-flag argument selects the target; defaults to refs.
const target = args.find((a) => !a.startsWith('--')) || 'refs';

const USER_AGENT = 'VYNDR Research Bot (contact@vyndr.app)';
const THROTTLE_MS = 5_000;
const HTTP_TIMEOUT_MS = 20_000;

const REF_INDEX_URL = 'https://www.basketball-reference.com/referees/';
const COACH_INDEX_URL = 'https://www.basketball-reference.com/coaches/';

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((r) => setTimeout(r, ms));
}

/**
 * Return the HTML for `url`.
 *
 * If the matching fixture env var is set (REF_HTML_FILE for the referee
 * index, COACH_HTML_FILE for any other URL) the file is read from disk and no
 * network request is made. Otherwise the page is fetched over HTTP after
 * waiting THROTTLE_MS.
 *
 * @param {string} url
 * @returns {Promise<string>} page HTML
 */
async function fetchPage(url) {
  const envKey = url === REF_INDEX_URL ? 'REF_HTML_FILE' : 'COACH_HTML_FILE';
  const fileOverride = process.env[envKey];
  if (fileOverride) {
    return fs.readFileSync(fileOverride, 'utf8');
  }

  // Throttle only real network requests; reading a local fixture does not
  // touch the remote host, so there is nothing to be polite about.
  await sleep(THROTTLE_MS);

  const res = await axios.get(url, {
    headers: { 'User-Agent': USER_AGENT, Accept: 'text/html' },
    timeout: HTTP_TIMEOUT_MS,
  });
  return res.data;
}

// Basketball-Reference tables follow a stable structure: <table id="..."> with
// <thead>/<tbody>/<tr>. Each row's cells are <th> or <td> with data-stat
// attributes — that's our primary parser key.
/**
 * Convert the body rows of one table into plain objects keyed by each cell's
 * `data-stat` attribute.
 *
 * @param {Function} $ - cheerio instance loaded with the page HTML
 * @param {string} tableSelector - CSS selector for the table element
 * @returns {Array<Object<string, string>>} one object per non-empty row;
 *   values are the trimmed cell text
 */
function parseTable($, tableSelector) {
  const rows = [];
  $(`${tableSelector} tbody tr`).each((_, tr) => {
    const $tr = $(tr);
    // Rows carrying these classes are skipped (presumably repeated header /
    // separator rows inside the body).
    if ($tr.hasClass('thead') || $tr.hasClass('rowSep')) return;

    const row = {};
    $tr.find('th, td').each((__, cell) => {
      const $cell = $(cell);
      const key = $cell.attr('data-stat');
      if (!key) return;
      row[key] = $cell.text().trim();
    });

    if (Object.keys(row).length) rows.push(row);
  });
  return rows;
}

/**
 * Parse a cell value as a finite number.
 *
 * @param {*} v - raw cell value (normally a string)
 * @returns {number|null} the number, or null when the value is missing,
 *   blank, or not a finite number
 */
function num(v) {
  if (v == null) return null;
  // Number('   ') is 0, so whitespace-only strings must be rejected up front
  // or a blank cell would be reported as a real zero.
  if (typeof v === 'string' && v.trim() === '') return null;
  const n = Number(v);
  return Number.isFinite(n) ? n : null;
}

/**
 * Map parsed referee table rows onto the profile shape written to
 * `ref_profiles`. Rows without a referee name are dropped.
 *
 * @param {Array<Object<string, string>>} rows - output of parseTable
 * @returns {Array<Object>} referee profiles
 */
function parseRefRows(rows) {
  // Expected data-stat keys: ref, g (games), fouls_per_g, ft_per_g, …
  return rows
    .map((r) => ({
      ref_name: r.ref ?? r.player ?? r.name ?? null,
      games_reffed: num(r.g ?? r.games),
      avg_fouls_per_game: num(r.fouls_per_g ?? r.fouls_per_game),
      avg_free_throws_per_game: num(r.ft_per_g ?? r.ft_per_game),
      // pace_impact and home_whistle_bias are NOT directly in BR. They get
      // computed downstream by a follow-up SQL view over historical games.
      // Leaving these null on initial scrape is intentional.
      pace_impact: null,
      home_whistle_bias: null,
    }))
    .filter((r) => r.ref_name);
}

/**
 * Map parsed coach table rows onto the fields this scraper can fill.
 * Rows without a coach name are dropped.
 *
 * @param {Array<Object<string, string>>} rows - output of parseTable
 * @returns {Array<Object>} coach profiles
 */
function parseCoachRows(rows) {
  return rows
    .map((r) => ({
      coach_name: r.coach ?? r.coaches ?? r.name ?? null,
      career_avg_pace: num(r.pace ?? r.pace_p100),
      tenure_games: num(r.g),
      // The team / current_team_pace / primary_player columns get added by a
      // separate enrichment pass; BR's coaches index only carries career totals.
    }))
    .filter((r) => r.coach_name);
}

/**
 * Fetch and parse the referee index page.
 * @returns {Promise<Array<Object>>} referee profiles
 */
async function scrapeRefs() {
  const html = await fetchPage(REF_INDEX_URL);
  const $ = cheerio.load(html);
  const rows = parseTable($, 'table#refs');
  return parseRefRows(rows);
}

/**
 * Fetch and parse the coach index page.
 * @returns {Promise<Array<Object>>} coach profiles
 */
async function scrapeCoaches() {
  const html = await fetchPage(COACH_INDEX_URL);
  const $ = cheerio.load(html);
  const rows = parseTable($, 'table#coaches');
  return parseCoachRows(rows);
}

/**
 * Upsert referee profiles into `ref_profiles` in batches of 50, keyed on
 * `ref_name`. A failed batch is logged and counted; later batches still run.
 *
 * @param {Array<Object>} profiles
 * @returns {Promise<{captured: number, errored: number}>} row counts
 */
async function upsertRefs(profiles) {
  const supabase = getSupabaseServiceClient();
  const stamp = new Date().toISOString();
  let captured = 0;
  let errored = 0;
  const batchSize = 50;

  for (let i = 0; i < profiles.length; i += batchSize) {
    const batch = profiles
      .slice(i, i + batchSize)
      .map((p) => ({ ...p, last_updated: stamp }));
    const { error } = await supabase
      .from('ref_profiles')
      .upsert(batch, { onConflict: 'ref_name' });
    if (error) {
      console.warn(`[scraper] refs batch ${i / batchSize} failed: ${error.message}`);
      errored += batch.length;
    } else {
      captured += batch.length;
    }
  }
  return { captured, errored };
}

/**
 * Upsert coach profiles into `coach_profiles` one row at a time, keyed on
 * (coach_name, team, sport). A failed row is logged and counted; later rows
 * still run.
 *
 * @param {Array<Object>} profiles
 * @returns {Promise<{captured: number, errored: number}>} row counts
 */
async function upsertCoaches(profiles) {
  const supabase = getSupabaseServiceClient();
  const stamp = new Date().toISOString();
  let captured = 0;
  let errored = 0;

  // Coaches need (coach_name, team, sport) to match the unique index — but
  // BR's index page doesn't have those columns. We write what we have and
  // leave the team/sport columns to be filled by manual or follow-up
  // enrichment.
  for (const p of profiles) {
    const row = {
      coach_name: p.coach_name,
      team: 'UNK',
      sport: 'nba',
      career_avg_pace: p.career_avg_pace,
      tenure_games: p.tenure_games ?? 0,
      last_updated: stamp,
    };
    const { error } = await supabase
      .from('coach_profiles')
      .upsert(row, { onConflict: 'coach_name,team,sport' });
    if (error) {
      console.warn(`[scraper] coach upsert failed for ${p.coach_name}: ${error.message}`);
      errored += 1;
    } else {
      captured += 1;
    }
    await sleep(50); // gentle DB pacing
  }
  return { captured, errored };
}

/**
 * Ask a yes/no question on stdin unless --yes was passed.
 *
 * @param {string} question - prompt text
 * @returns {Promise<boolean>} true for "y"/"yes" (case-insensitive) or when
 *   --yes was given; false otherwise, including when stdin closes before an
 *   answer is read
 */
async function confirm(question) {
  if (skipConfirm) return true;

  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  try {
    const answer = await new Promise((resolve) => {
      // If stdin reaches EOF before a line is read, readline emits 'close'
      // without ever invoking the question callback. Resolve with '' so that
      // case is treated as "no" rather than leaving the promise pending.
      rl.once('close', () => resolve(''));
      rl.question(question, resolve);
    });
    return /^y(es)?$/i.test(answer.trim());
  } finally {
    rl.close();
  }
}

/**
 * CLI entry point: validate the target, confirm (unless dry-run), scrape,
 * then either print a sample (dry-run) or upsert and print a summary.
 */
async function main() {
  if (target !== 'refs' && target !== 'coaches') {
    console.error('Usage: scrape-sports-reference.js refs|coaches [--dry-run] [--yes]');
    process.exit(2);
  }

  if (!dryRun) {
    const ok = await confirm(
      `This will upsert ${target} profiles into ${process.env.SUPABASE_URL || '(unknown)'}. Continue? (y/n) `
    );
    if (!ok) {
      console.log('aborted');
      process.exit(0);
    }
  }

  const isRefs = target === 'refs';
  const label = isRefs ? 'ref' : 'coach';

  const profiles = isRefs ? await scrapeRefs() : await scrapeCoaches();
  console.log(`[scraper] parsed ${profiles.length} ${label} profiles`);
  if (profiles.length === 0) {
    // Nothing matched the table selector or every row lacked a name; surface
    // it so an empty run is not mistaken for a successful refresh.
    console.warn(`[scraper] no ${label} rows were parsed — check the page markup / table id`);
  }

  if (dryRun) {
    console.log(JSON.stringify(profiles.slice(0, 5), null, 2));
    return;
  }

  const summary = isRefs ? await upsertRefs(profiles) : await upsertCoaches(profiles);
  console.log(`[scraper] ${target} upsert summary:`, summary);
}

main().catch((err) => {
  console.error('[scraper] fatal:', err.message);
  process.exit(1);
});

module.exports = { parseTable, parseRefRows, parseCoachRows };