Sessions 5-7a: 955 tests, deployment ready
This commit is contained in:
@@ -0,0 +1,226 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Sports-Reference scraper — monthly refresh.
|
||||
*
|
||||
* Pulls referee stats and coach career data from Basketball Reference's
|
||||
* public HTML pages. Polite by design:
|
||||
* - 1 request per 5 seconds (well under the rate they tolerate)
|
||||
* - User-Agent identifies us so they can email us if anything's off
|
||||
* - --dry-run flag for safe local experiments
|
||||
*
|
||||
* Usage:
|
||||
* node scripts/scrape-sports-reference.js refs # refresh ref profiles
|
||||
* node scripts/scrape-sports-reference.js coaches # refresh coach profiles
|
||||
* node scripts/scrape-sports-reference.js --dry-run # parse + log, no DB writes
|
||||
* node scripts/scrape-sports-reference.js --yes # skip confirmation prompt
|
||||
*
|
||||
* Sources:
|
||||
* https://www.basketball-reference.com/referees/
|
||||
* https://www.basketball-reference.com/coaches/
|
||||
*
|
||||
* If network access from your host is blocked, this script accepts a saved
|
||||
* HTML fixture via REF_HTML_FILE or COACH_HTML_FILE env vars (used by the
|
||||
* unit test that ships with this codebase).
|
||||
*/
|
||||
|
||||
// This file is a CLI entry point: loading it through require() is rejected
// up front, before any of the dependencies below are loaded.
// NOTE(review): because of this guard, the `module.exports` assignment at the
// bottom of the file can never be consumed by another module — confirm
// whether the guard or the exports should stay.
const invokedAsScript = require.main === module;
if (!invokedAsScript) {
  throw new Error('Run directly: node scripts/scrape-sports-reference.js');
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const readline = require('readline');
|
||||
const axios = require('axios');
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
require('dotenv').config({ path: path.join(__dirname, '..', '.env') });
|
||||
|
||||
const { getSupabaseServiceClient } = require('../src/utils/supabase');
|
||||
|
||||
// CLI parsing: tokens starting with `--` are flags; the first token that is
// not a flag selects the target and falls back to 'refs' when absent.
const args = process.argv.slice(2);
const dryRun = args.some((token) => token === '--dry-run');
const skipConfirm = args.some((token) => token === '--yes');
const target = args.filter((token) => !token.startsWith('--'))[0] || 'refs';
|
||||
|
||||
// Sent as the User-Agent header on every outgoing page request.
const USER_AGENT = 'VYNDR Research Bot (contact@vyndr.app)';

// Pause applied before scraping starts, and the per-request HTTP timeout.
const THROTTLE_MS = 5 * 1000;
const HTTP_TIMEOUT_MS = 20 * 1000;

// Index pages the scraper reads.
const BR_ORIGIN = 'https://www.basketball-reference.com';
const REF_INDEX_URL = `${BR_ORIGIN}/referees/`;
const COACH_INDEX_URL = `${BR_ORIGIN}/coaches/`;
|
||||
|
||||
/**
 * Resolves (with no value) once `ms` milliseconds have elapsed.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Returns the HTML of an index page.
 *
 * A fixture file takes precedence over the network: the path is read from
 * REF_HTML_FILE when `url` is the referee index, and from COACH_HTML_FILE for
 * any other URL. Without a fixture the page is fetched with axios, sending
 * the scraper's User-Agent and applying the HTTP timeout.
 *
 * @param {string} url - Page to load.
 * @returns {Promise<string>} Fixture contents (UTF-8) or the response body.
 */
async function fetchPage(url) {
  const overrideVar = url === REF_INDEX_URL ? 'REF_HTML_FILE' : 'COACH_HTML_FILE';
  const fixturePath = process.env[overrideVar];
  if (fixturePath) {
    return fs.readFileSync(fixturePath, 'utf8');
  }

  const requestOptions = {
    headers: { 'User-Agent': USER_AGENT, Accept: 'text/html' },
    timeout: HTTP_TIMEOUT_MS,
  };
  const { data } = await axios.get(url, requestOptions);
  return data;
}
|
||||
|
||||
/**
 * Turns the body rows of an HTML table into plain objects.
 *
 * Basketball-Reference tables follow a stable structure: <table id="...">
 * with <thead>/<tbody>/<tr>, where each <th>/<td> carries a `data-stat`
 * attribute. That attribute is used as the object key and the trimmed cell
 * text as the value. Rows with class `thead` or `rowSep` (presumably repeated
 * header / separator rows) are skipped, as are rows that yield no keyed cell.
 *
 * @param {Function} $ - Cheerio root for the loaded document.
 * @param {string} tableSelector - CSS selector of the table, e.g. `table#refs`.
 * @returns {Array<Object<string, string>>} One object per data row.
 */
function parseTable($, tableSelector) {
  const isSkippedRow = ($row) => $row.hasClass('thead') || $row.hasClass('rowSep');

  const readRow = ($row) => {
    const record = {};
    for (const cell of $row.find('th, td').toArray()) {
      const $cell = $(cell);
      const stat = $cell.attr('data-stat');
      if (stat) {
        record[stat] = $cell.text().trim();
      }
    }
    return record;
  };

  const parsed = [];
  for (const tr of $(`${tableSelector} tbody tr`).toArray()) {
    const $row = $(tr);
    if (isSkippedRow($row)) continue;
    const record = readRow($row);
    if (Object.keys(record).length > 0) {
      parsed.push(record);
    }
  }
  return parsed;
}
|
||||
|
||||
/**
 * Coerces a scraped cell value to a finite number.
 *
 * @param {*} v - Raw value, typically a trimmed cell string.
 * @returns {number|null} `Number(v)` when it is finite; `null` for
 *   null/undefined, the empty string, and anything that does not coerce to a
 *   finite number.
 */
function num(v) {
  const isBlank = v === null || v === undefined || v === '';
  if (isBlank) return null;

  const parsed = Number(v);
  if (!Number.isFinite(parsed)) return null;
  return parsed;
}
|
||||
|
||||
/**
 * Maps raw table rows (keyed by `data-stat`) to referee profile records.
 *
 * Expected keys: ref, g (games), fouls_per_g, ft_per_g, … with a few
 * alternative spellings accepted as fallbacks. Rows without a usable name are
 * dropped.
 *
 * @param {Array<Object<string, string>>} rows - Rows as produced by parseTable.
 * @returns {Array<Object>} Records with ref_name, games_reffed,
 *   avg_fouls_per_game, avg_free_throws_per_game, pace_impact and
 *   home_whistle_bias.
 */
function parseRefRows(rows) {
  return rows.flatMap((row) => {
    const refName = row.ref ?? row.player ?? row.name ?? null;
    if (!refName) return [];

    return [{
      ref_name: refName,
      games_reffed: num(row.g ?? row.games),
      avg_fouls_per_game: num(row.fouls_per_g ?? row.fouls_per_game),
      avg_free_throws_per_game: num(row.ft_per_g ?? row.ft_per_game),
      // pace_impact and home_whistle_bias are NOT directly in BR. They get
      // computed downstream by a follow-up SQL view over historical games,
      // so leaving them null on the initial scrape is intentional.
      pace_impact: null,
      home_whistle_bias: null,
    }];
  });
}
|
||||
|
||||
/**
 * Maps raw table rows (keyed by `data-stat`) to coach profile records.
 *
 * Only coach_name, career_avg_pace and tenure_games are produced here. The
 * team / current_team_pace / primary_player columns get added by a separate
 * enrichment pass; BR's coaches index only carries career totals. Rows
 * without a usable name are dropped.
 *
 * @param {Array<Object<string, string>>} rows - Rows as produced by parseTable.
 * @returns {Array<{coach_name: string, career_avg_pace: (number|null), tenure_games: (number|null)}>}
 */
function parseCoachRows(rows) {
  const coaches = [];
  rows.forEach((row) => {
    const coachName = row.coach ?? row.coaches ?? row.name ?? null;
    if (!coachName) return;

    coaches.push({
      coach_name: coachName,
      career_avg_pace: num(row.pace ?? row.pace_p100),
      tenure_games: num(row.g),
    });
  });
  return coaches;
}
|
||||
|
||||
/**
 * Loads the referee index page and parses its `table#refs` table into
 * referee profile records.
 *
 * @returns {Promise<Array<Object>>} Output of parseRefRows for that table.
 */
async function scrapeRefs() {
  const indexHtml = await fetchPage(REF_INDEX_URL);
  return parseRefRows(parseTable(cheerio.load(indexHtml), 'table#refs'));
}
|
||||
|
||||
/**
 * Loads the coach index page and parses its `table#coaches` table into coach
 * profile records.
 *
 * @returns {Promise<Array<Object>>} Output of parseCoachRows for that table.
 */
async function scrapeCoaches() {
  const markup = await fetchPage(COACH_INDEX_URL);
  const root = cheerio.load(markup);
  const tableRows = parseTable(root, 'table#coaches');
  const coaches = parseCoachRows(tableRows);
  return coaches;
}
|
||||
|
||||
/**
 * Upserts referee profiles into the `ref_profiles` table, 50 rows per
 * request, using `ref_name` as the conflict target.
 *
 * Every row written in one call carries the same `last_updated` timestamp.
 * A failed batch is logged and counted as errored; the remaining batches are
 * still attempted.
 *
 * @param {Array<Object>} profiles - Records as produced by parseRefRows.
 * @returns {Promise<{captured: number, errored: number}>} Row counts for
 *   batches that succeeded / failed.
 */
async function upsertRefs(profiles) {
  const BATCH_SIZE = 50;
  const db = getSupabaseServiceClient();
  const refreshedAt = new Date().toISOString();
  const summary = { captured: 0, errored: 0 };

  for (
    let offset = 0, batchNo = 0;
    offset < profiles.length;
    offset += BATCH_SIZE, batchNo += 1
  ) {
    const payload = profiles
      .slice(offset, offset + BATCH_SIZE)
      .map((profile) => ({ ...profile, last_updated: refreshedAt }));

    const { error } = await db
      .from('ref_profiles')
      .upsert(payload, { onConflict: 'ref_name' });

    if (error) {
      console.warn(`[scraper] refs batch ${batchNo} failed: ${error.message}`);
      summary.errored += payload.length;
      continue;
    }
    summary.captured += payload.length;
  }

  return summary;
}
|
||||
|
||||
/**
 * Upserts coach profiles into the `coach_profiles` table one row at a time,
 * using (coach_name, team, sport) as the conflict target.
 *
 * Coaches need (coach_name, team, sport) to match the unique index — but BR's
 * index page doesn't have those columns. Each row is therefore written with
 * team 'UNK' and sport 'nba', leaving the real values to manual or follow-up
 * enrichment. A failed row is logged and counted; the loop keeps going and
 * pauses 50 ms after every row.
 *
 * @param {Array<Object>} profiles - Records as produced by parseCoachRows.
 * @returns {Promise<{captured: number, errored: number}>} Counts of rows that
 *   succeeded / failed.
 */
async function upsertCoaches(profiles) {
  const db = getSupabaseServiceClient();
  const refreshedAt = new Date().toISOString();
  const tally = { captured: 0, errored: 0 };

  for (const { coach_name, career_avg_pace, tenure_games } of profiles) {
    const record = {
      coach_name,
      team: 'UNK',
      sport: 'nba',
      career_avg_pace,
      tenure_games: tenure_games || 0,
      last_updated: refreshedAt,
    };

    const { error } = await db
      .from('coach_profiles')
      .upsert(record, { onConflict: 'coach_name,team,sport' });

    if (error) {
      console.warn(`[scraper] coach upsert failed for ${coach_name}: ${error.message}`);
      tally.errored += 1;
    } else {
      tally.captured += 1;
    }

    await sleep(50); // gentle DB pacing
  }

  return tally;
}
|
||||
|
||||
/**
 * Asks a yes/no question on the terminal.
 *
 * Returns true immediately when the `--yes` flag was given. Otherwise the
 * question is printed and the answer read from stdin; only "y" or "yes"
 * (case-insensitive, surrounding whitespace ignored) counts as consent.
 *
 * If stdin ends before an answer arrives (non-interactive run, closed pipe),
 * readline never invokes the question callback. Without the 'close' handler
 * below the promise would stay pending forever and the process would exit
 * quietly without doing anything; here that case is treated as "no".
 *
 * @param {string} question - Prompt text written to stdout.
 * @returns {Promise<boolean>} Whether the user agreed.
 */
async function confirm(question) {
  if (skipConfirm) return true;

  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  try {
    const answer = await new Promise((resolve) => {
      // Registered before asking so an immediate EOF cannot be missed.
      // Resolving twice is harmless: the first settlement wins.
      rl.once('close', () => resolve(''));
      rl.question(question, resolve);
    });
    return /^y(es)?$/i.test(answer.trim());
  } finally {
    // Always release stdin, even if reading the answer throws.
    rl.close();
  }
}
|
||||
|
||||
/**
 * CLI driver: validates the target, asks for confirmation unless this is a
 * dry run, waits THROTTLE_MS, then scrapes the selected index page.
 *
 * In dry-run mode the first five parsed profiles are printed as JSON and
 * nothing is written. Otherwise the profiles are upserted and the
 * captured/errored summary is logged.
 *
 * Exits with status 2 on an unknown target and status 0 when the user
 * declines the confirmation.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const wantsRefs = target === 'refs';
  if (!wantsRefs && target !== 'coaches') {
    console.error('Usage: scrape-sports-reference.js refs|coaches [--dry-run] [--yes]');
    process.exit(2);
  }

  if (!dryRun) {
    const destination = process.env.SUPABASE_URL || '(unknown)';
    const proceed = await confirm(`This will upsert ${target} profiles into ${destination}. Continue? (y/n) `);
    if (!proceed) {
      console.log('aborted');
      process.exit(0);
    }
  }

  await sleep(THROTTLE_MS);

  if (!wantsRefs) {
    const coaches = await scrapeCoaches();
    console.log(`[scraper] parsed ${coaches.length} coach profiles`);
    if (dryRun) {
      console.log(JSON.stringify(coaches.slice(0, 5), null, 2));
      return;
    }
    const coachSummary = await upsertCoaches(coaches);
    console.log('[scraper] coaches upsert summary:', coachSummary);
    return;
  }

  const refs = await scrapeRefs();
  console.log(`[scraper] parsed ${refs.length} ref profiles`);
  if (dryRun) {
    console.log(JSON.stringify(refs.slice(0, 5), null, 2));
    return;
  }
  const refSummary = await upsertRefs(refs);
  console.log('[scraper] refs upsert summary:', refSummary);
}
|
||||
|
||||
// Any rejection escaping main() is reported by message only and turned into
// exit status 1.
function reportFatal(err) {
  console.error('[scraper] fatal:', err.message);
  process.exit(1);
}

main().catch(reportFatal);

// NOTE(review): the guard at the top of this file throws whenever the module
// is loaded via require(), so these exports are currently unreachable from
// other modules — confirm whether they are still needed.
module.exports = { parseTable, parseRefRows, parseCoachRows };
|
||||
Reference in New Issue
Block a user