#!/usr/bin/env node
/**
 * Daily soccer intelligence prefetch — run once per day.
 *
 *   cron: 0 5 * * *   (5am UTC, ~midnight ET — before US fixtures)
 *   call: node scripts/soccer-data-prefetch.js [--leagues=WC,PL] [--dry-run]
 *           [--source=all|api-football|footapi|football-data]
 *           [--max-players=N] [--season=YYYY]
 *   env:  SOCCER_LEAGUES — comma-separated league codes, consulted only
 *         when --leagues is not passed.
 *
 * Why: football-data.org caps at 10 req/min and ~10/day for some
 * endpoints. We can't read these on the user request path. This script
 * batches the reads, transforms them into the per-player / per-team
 * aggregates the feature extractor consumes, and persists them to
 * Redis with conservative TTLs.
 *
 * Writes:
 *   soccer:{league}:standings                    — raw standings from API
 *   soccer:{league}:scorers                      — top-scorers list (projected)
 *   soccer:player:{normalizedName}               — per-player aggregate (per-90 rates)
 *   soccer:teamdefense:{league}:{team}           — team defensive aggregate + normalized rank
 *   apifootball:player_by_name:{normalizedName}  — api-football per-player aggregate
 *   footapi:referee_by_name:{name}               — referee aggregate (written by the
 *                                                  exported referee helper)
 *
 * Does NOT write next-match / last-fixture pointers — those are the
 * job of the poller (poller/soccer.js), which runs more frequently
 * since fixture state changes faster.
 *
 * xG data (`xg_per_90`, `xg_delta`) is left null on Day 1 — sourcing
 * it requires a soccerdata-Python bridge that's a follow-up. The
 * downstream feature extractor handles null xG gracefully.
 *
 * No DB writes. Graceful exit (code 0) when API keys are missing — the
 * script logs "skipped" and the feature extractor continues with the
 * static-data-only path.
 */

const fbd = require('../src/services/adapters/footballDataAdapter');
const apif = require('../src/services/adapters/apiFootballAdapter');
const footapi = require('../src/services/adapters/footApiAdapter');
const { cacheSet } = require('../src/utils/redis');
const { normalizeName } = require('../src/utils/normalize');

// Redis TTLs, in seconds, for each family of keys this script writes.
const PLAYER_TTL_SEC = 24 * 3600; // per-player aggregates
const STANDINGS_TTL_SEC = 12 * 3600; // raw standings payload
const SCORERS_TTL_SEC = 6 * 3600; // raw top-scorers payload
const DEFENSE_TTL_SEC = 12 * 3600; // per-team defensive aggregates
const REFEREE_TTL_SEC = 7 * 24 * 3600; // per-referee aggregates

// Session 10 — Map football-data competition codes to api-football
// league IDs so the prefetch can ask api-football for the matching
// season's data.
// Add codes here as more leagues come online.
const APIFOOTBALL_LEAGUE_MAP = Object.freeze({
  WC: 1, // FIFA World Cup
  PL: 39, // English Premier League
  PD: 140, // La Liga
  BL1: 78, // Bundesliga
  SA: 135, // Serie A
  FL1: 61, // Ligue 1
  CL: 2, // UEFA Champions League
  MLS: 253, // MLS
});

/**
 * Parse the CLI arguments (and the SOCCER_LEAGUES env fallback) into an
 * options object.
 *
 * Recognised flags: `--leagues=A,B`, `--dry-run`, `--source=<name>`,
 * `--max-players=<n>`, `--season=<year>`. Unknown flags are ignored and
 * invalid values leave the corresponding default untouched.
 *
 * @param {string[]} [argv=process.argv] - Full argv; the first two
 *   entries (runtime and script path) are skipped.
 * @returns {{leagues: string[], dryRun: boolean, source: string,
 *   maxPlayers: number, season: number}} Parsed options.
 */
function parseArgs(argv = process.argv) {
  // Sources controls which adapters get called. `all` (default) tries
  // every configured adapter; the explicit single-source values are
  // useful for debugging or for skipping a misbehaving source.
  const VALID_SOURCES = new Set(['all', 'api-football', 'footapi', 'football-data']);

  // Split a comma-separated list into trimmed, upper-cased, non-empty codes.
  const parseLeagueList = (raw) =>
    raw
      .split(',')
      .map((s) => s.trim().toUpperCase())
      .filter(Boolean);

  const args = {
    leagues: ['WC'],
    dryRun: false,
    source: 'all',
    maxPlayers: 80,
    season: 2026,
  };

  let leaguesFlagSeen = false;
  for (const a of argv.slice(2)) {
    if (a.startsWith('--leagues=')) {
      leaguesFlagSeen = true;
      const leagues = parseLeagueList(a.slice('--leagues='.length));
      // An empty list would make the whole run a silent no-op; keep the default.
      if (leagues.length > 0) args.leagues = leagues;
    } else if (a === '--dry-run') {
      args.dryRun = true;
    } else if (a.startsWith('--source=')) {
      const src = a.slice('--source='.length).trim().toLowerCase();
      args.source = VALID_SOURCES.has(src) ? src : 'all';
    } else if (a.startsWith('--max-players=')) {
      const n = Number(a.slice('--max-players='.length));
      if (Number.isFinite(n) && n > 0) args.maxPlayers = Math.floor(n);
    } else if (a.startsWith('--season=')) {
      const n = Number(a.slice('--season='.length));
      if (Number.isInteger(n) && n > 1900) args.season = n;
    }
  }

  // The env fallback applies only when the flag is absent from the argv
  // we were asked to parse — not from the real process.argv, which can
  // differ when a caller supplies its own argv.
  if (!leaguesFlagSeen) {
    const env = process.env.SOCCER_LEAGUES;
    if (env) {
      const leagues = parseLeagueList(env);
      if (leagues.length > 0) args.leagues = leagues;
    }
  }
  return args;
}

/**
 * Decide whether a given source adapter should run for these options.
 *
 * @param {{source?: string}|null|undefined} args - Parsed options.
 * @param {string} source - Source name to test, e.g. 'api-football'.
 * @returns {boolean} True when `args.source` is 'all', unset, or equal
 *   to `source`.
 */
function shouldRunSource(args, source) {
  // Default to 'all' so callers (and existing tests) that don't set
  // `source` explicitly get the legacy "run every source" behavior.
  const requested = args && args.source ? args.source : 'all';
  return requested === 'all' || requested === source;
}

// Project a single team's standings row into the defensive aggregate
// the feature extractor reads.
// defensive_rank_norm is on a 0..1 scale
// (0 = best defense, 1 = worst) so it slots into engine1's opp_rank_stat.

// Round `value` to `decimals` decimal places.
function roundTo(value, decimals) {
  const factor = 10 ** decimals;
  return Math.round(value * factor) / factor;
}

// Convert a raw total into a per-90-minutes rate (3 decimals), or null
// when no minutes were played.
function ratePer90(total, minutes) {
  return minutes > 0 ? roundTo(total / (minutes / 90), 3) : null;
}

/**
 * Build the defensive aggregate for one standings row.
 *
 * @param {object} standingsRow - Row with `playedGames` (or `played`),
 *   `goalsAgainst` and optionally `cleanSheets`.
 * @param {object[]} allRows - Every row of the table; used to rank this
 *   team's goals-conceded rate against its peers.
 * @returns {object|null} The aggregate, or null when the row has no
 *   games played or no goals-against figure.
 */
function aggregateTeamDefense(standingsRow, allRows) {
  const playedGames = standingsRow.playedGames || standingsRow.played || 0;
  const goalsAgainst = standingsRow.goalsAgainst ?? null;
  if (!playedGames || goalsAgainst == null) return null;

  const goalsConcededPerGame = goalsAgainst / playedGames;

  // Normalize against the rest of the table — defensive_rank_norm = the
  // team's goals-conceded percentile (0 best, 1 worst). Rows that lack
  // games or a goals-against figure are left out of the ranking rather
  // than counted as "0 conceded", which would rank them as the best
  // defense and push every real team down.
  const allRates = (Array.isArray(allRows) ? allRows : [])
    .map((r) => {
      const pg = r?.playedGames || r?.played || 0;
      if (!pg || r.goalsAgainst == null) return null;
      return r.goalsAgainst / pg;
    })
    .filter((v) => Number.isFinite(v))
    .sort((a, b) => a - b);

  let rank = allRates.findIndex((v) => v >= goalsConcededPerGame);
  // Not found: the rate is worse than every peer (or there are no peers).
  // Clamp at 0 so an empty table cannot produce a rank of 0 / index -1.
  if (rank === -1) rank = Math.max(allRates.length - 1, 0);
  const rankNorm = allRates.length > 1 ? rank / (allRates.length - 1) : 0;

  // Clean sheets (not on the football-data row in the free tier — null is OK).
  const cleanSheets = standingsRow.cleanSheets ?? null;
  const cleanSheetRate = cleanSheets != null && playedGames > 0 ? cleanSheets / playedGames : null;

  return {
    goals_conceded_per_game: roundTo(goalsConcededPerGame, 3),
    clean_sheet_rate: cleanSheetRate,
    defensive_rank: rank + 1, // 1-indexed for human reasoning
    defensive_rank_norm: rankNorm, // 0..1 for engine1
    played_games: playedGames,
  };
}

/**
 * Project a single scorer row into the per-player aggregate.
 *
 * @param {object} scorerRow - Row with `goals`, `assists`,
 *   `playedMatches`, optional `minutesPlayed`, plus `team`, `position`
 *   and `nationality`, which are copied through unchanged.
 * @returns {object} Per-player aggregate; rate fields are null when
 *   neither minutes nor matches played are available.
 */
function aggregatePlayerFromScorer(scorerRow) {
  // Number(null) is 0 — explicit null check so a missing minutes field
  // doesn't pretend the player played 0 minutes (which would still
  // satisfy Number.isFinite and break the per-90 fallback).
  const minutes = scorerRow.minutesPlayed == null ? null : Number(scorerRow.minutesPlayed);
  const hasMinutes = Number.isFinite(minutes);
  const goals = Number(scorerRow.goals) || 0;
  const assists = Number(scorerRow.assists) || 0;
  const played = Number(scorerRow.playedMatches) || 0;

  // Per-90 rates need minutes. The free tier sometimes omits minutes —
  // fall back to (total / played), i.e. a per-match rate, when missing.
  const rateFor = (total) => {
    if (hasMinutes && minutes > 0) return ratePer90(total, minutes);
    return played > 0 ? roundTo(total / played, 3) : null;
  };
  const goalsPer90 = rateFor(goals);
  const assistsPer90 = rateFor(assists);
  const minutesPerGame = hasMinutes && played > 0 ? Math.round(minutes / played) : null;

  return {
    team: scorerRow.team,
    position: scorerRow.position,
    nationality: scorerRow.nationality,
    goals,
    assists,
    played,
    minutes: hasMinutes ? minutes : null,
    goals_per_90: goalsPer90,
    assists_per_90: assistsPer90,
    minutes_per_game: minutesPerGame,
    // Day 1 — no rolling 5-match form, no xG. The feature extractor
    // falls back to season_per_90 when recent_form_per_90 is null.
    recent_form_per_90: null,
    season_per_90: goalsPer90,
    start_rate: null,
    xg_per_90: null,
    xa_per_90: null,
    xg_delta: null,
  };
}

// Only walk FINISHED fixtures — in-progress games have partial stats
// that would skew the per-90 rates. api-football's `status` short
// code is 'FT' / 'AET' / 'PEN' for finished, 'NS' / 'TBD' for not
// started, '1H' / '2H' / 'HT' / 'ET' / 'BT' / 'P' / 'SUSP' for live.
// 'AWD' and 'WO' are also treated as finished here.
const FINISHED_FIXTURE_STATUSES = new Set(['FT', 'AET', 'PEN', 'AWD', 'WO']);

// Budget cap — each fixture costs one api-football call.
const MAX_FIXTURES_PER_RUN = 16;

// Write cap used when the caller's options carry no usable maxPlayers.
const DEFAULT_MAX_PLAYERS = 80;

// Fresh zeroed accumulator for one player, seeded from their first stats row.
function newPlayerAccumulator(row) {
  return {
    name: row.name,
    team: row.team,
    playerId: row.playerId,
    position: row.position,
    appearances: 0,
    starts: 0,
    minutes: 0,
    goals: 0,
    assists: 0,
    shots_total: 0,
    shots_on: 0,
    tackles_total: 0,
    yellow: 0,
    red: 0,
    rating_sum: 0,
    rating_count: 0,
  };
}

// Fold one per-fixture stats row into a player's accumulator (mutates `agg`).
function addFixtureRow(agg, row) {
  agg.appearances += 1;
  if (!row.substitute) agg.starts += 1;
  agg.minutes += Number(row.minutes) || 0;
  agg.goals += Number(row.goals) || 0;
  agg.assists += Number(row.assists) || 0;
  agg.shots_total += Number(row.shots_total) || 0;
  agg.shots_on += Number(row.shots_on) || 0;
  agg.tackles_total += Number(row.tackles_total) || 0;
  agg.yellow += Number(row.yellow) || 0;
  agg.red += Number(row.red) || 0;
  const rating = Number(row.rating);
  if (Number.isFinite(rating) && rating > 0) {
    agg.rating_sum += rating;
    agg.rating_count += 1;
  }
}

// Collapse an accumulator into the profile object that gets cached.
function buildApiFootballProfile(agg) {
  const goalsPer90 = ratePer90(agg.goals, agg.minutes);
  return {
    name: agg.name,
    team: agg.team,
    playerId: agg.playerId,
    position: agg.position,
    appearances: agg.appearances,
    starts: agg.starts,
    minutes: agg.minutes,
    goals: agg.goals,
    assists: agg.assists,
    // Cascade-canonical fields.
    goals_per_90: goalsPer90,
    assists_per_90: ratePer90(agg.assists, agg.minutes),
    minutes_per_game: agg.appearances > 0 ? Math.round(agg.minutes / agg.appearances) : null,
    start_rate: agg.appearances > 0 ? roundTo(agg.starts / agg.appearances, 2) : null,
    // Soccer-specific overlays.
    shots_per_90: ratePer90(agg.shots_total, agg.minutes),
    shots_on_per_90: ratePer90(agg.shots_on, agg.minutes),
    tackles_per_90: ratePer90(agg.tackles_total, agg.minutes),
    yellow_per_90: ratePer90(agg.yellow, agg.minutes),
    avg_rating: agg.rating_count > 0 ? roundTo(agg.rating_sum / agg.rating_count, 2) : null,
    // xG fields still null (see comment at top of file) — when an
    // api-football endpoint that exposes xG goes live, fill here.
    xg_per_90: null,
    xa_per_90: null,
    xg_delta: null,
    // Aliases for the legacy reader.
    recent_form_per_90: null,
    season_per_90: goalsPer90,
  };
}

/**
 * Session 10 — pull finished fixtures for `league` from api-football and
 * aggregate per-player season stats across them. Writes
 * `apifootball:player_by_name:{normalizedName}` so the cascade hits
 * PRIMARY for these players instead of falling through to
 * football-data. Hard-capped at `maxPlayers` writes per run.
 *
 * Players are keyed by normalized name, so two players whose names
 * normalize identically are merged into one profile.
 *
 * @param {string} league - football-data competition code (e.g. 'WC').
 * @param {{season?: number, maxPlayers?: number, dryRun?: boolean}} args
 *   Parsed options. A missing or invalid `maxPlayers` falls back to 80
 *   so the write cap always holds.
 * @returns {Promise<object>} `{ skipped, players: 0 }` when nothing could
 *   be fetched, otherwise `{ players, fixturesProcessed }` where
 *   `fixturesProcessed` counts fixtures whose stats were actually read.
 *   Adapter errors are not caught here and propagate to the caller.
 */
async function enrichFromApiFootball(league, args) {
  if (!apif.hasApiKey()) {
    return { skipped: 'no_key', players: 0 };
  }
  const leagueId = APIFOOTBALL_LEAGUE_MAP[league];
  if (!leagueId) {
    return { skipped: 'unmapped_league', players: 0 };
  }

  const { season, dryRun = false } = args ?? {};
  const maxPlayers =
    Number.isFinite(args?.maxPlayers) && args.maxPlayers > 0 ? args.maxPlayers : DEFAULT_MAX_PLAYERS;

  const fixtures = await apif.getFixtures({ league: leagueId, season });
  if (!Array.isArray(fixtures) || fixtures.length === 0) {
    return { skipped: 'no_fixtures', players: 0 };
  }
  const finished = fixtures.filter((f) => FINISHED_FIXTURE_STATUSES.has(f?.status));

  // Index by player name across all finished fixtures. We accumulate
  // raw stats then collapse into per-90 rates at the end.
  const byPlayer = new Map();
  let fixturesProcessed = 0;
  for (const fixture of finished.slice(0, MAX_FIXTURES_PER_RUN)) {
    // Headroom: stop spending API calls once twice the write budget of
    // distinct players has been indexed.
    if (byPlayer.size >= maxPlayers * 2) break;
    const playerStats = await apif.getFixturePlayerStats(fixture.id);
    if (!Array.isArray(playerStats)) continue;
    fixturesProcessed += 1;
    for (const row of playerStats) {
      if (!row?.name) continue;
      const key = normalizeName(row.name);
      if (!key) continue; // nothing usable to key the cache entry on
      const agg = byPlayer.get(key) || newPlayerAccumulator(row);
      addFixtureRow(agg, row);
      byPlayer.set(key, agg);
    }
  }

  // Collapse and persist (within maxPlayers budget). A dry run still
  // counts the profiles it would have written.
  let written = 0;
  for (const [normalized, agg] of byPlayer) {
    if (written >= maxPlayers) break;
    const profile = buildApiFootballProfile(agg);
    if (!dryRun) {
      await cacheSet(`apifootball:player_by_name:${normalized}`, profile, PLAYER_TTL_SEC);
    }
    written += 1;
  }

  return { players: written, fixturesProcessed };
}

// Session 10 — enrich the per-referee cache via FootApi. Referees
// move slowly so a 7-day TTL is fine.
This pass is best-effort: if // no key, skip; if a specific referee 404s, log + continue. async function enrichRefereesFromFootApi(refereeIds, args) { if (!footapi.hasApiKey()) return { skipped: 'no_key', referees: 0 }; if (!Array.isArray(refereeIds) || refereeIds.length === 0) return { referees: 0 }; let written = 0; for (const { id, name } of refereeIds) { if (!id || !name) continue; const stats = await footapi.getRefereeStatistics(id); if (!Array.isArray(stats) || stats.length === 0) continue; // Find the WC-2026 row if present, else collapse across tournaments. const wc = stats.find((s) => s.tournamentId === 16) || stats[0]; const payload = { name, cards_per_game: wc.yellowCardsPerGame, penalties_per_game: null, // FootApi schema doesn't expose this directly appearances: wc.appearances, yellow_cards: wc.yellowCards, red_cards: wc.redCards, }; if (!args.dryRun) { await cacheSet(`footapi:referee_by_name:${name}`, payload, REFEREE_TTL_SEC); } written += 1; } return { referees: written }; } async function processLeague(league, args) { const { dryRun } = args; const summary = { league, standings: 0, scorers: 0, players: 0, teamDefense: 0, apiFootballPlayers: 0, apiFootballSkipped: null, skipped: false, }; const [standings, scorers] = await Promise.all([ fbd.getLeagueStandings(league), fbd.getLeagueScorers(league), ]); // Either null means "API unavailable" — log + bail for this league. if (standings === null && scorers === null) { summary.skipped = true; return summary; } // ---- Standings → team defensive aggregates ---- // football-data wraps standings in groups (type === 'TOTAL' has the // table). Flatten all `table` rows so a competition with multiple // groups (e.g. World Cup group stage) feeds one combined rank table. 
if (Array.isArray(standings)) { const allRows = []; for (const group of standings) { if (Array.isArray(group?.table)) { for (const row of group.table) { if (row?.team?.name) allRows.push({ ...row, teamName: row.team.name }); } } } summary.standings = allRows.length; for (const row of allRows) { const agg = aggregateTeamDefense(row, allRows); if (!agg) continue; const key = `soccer:teamdefense:${league.toLowerCase()}:${row.teamName}`; if (!dryRun) await cacheSet(key, agg, DEFENSE_TTL_SEC); summary.teamDefense += 1; } if (!dryRun) await cacheSet(`soccer:${league.toLowerCase()}:standings`, standings, STANDINGS_TTL_SEC); } // ---- Scorers → per-player aggregates (football-data, TERTIARY) ---- // Always write the legacy soccer:player:* keys so the cascade has a // working fallback even when api-football is rate-limited or // misconfigured. These rows are thinner (no per-match minutes, no // rating) but they keep the engine producing non-null features. if (Array.isArray(scorers) && shouldRunSource(args, 'football-data')) { summary.scorers = scorers.length; for (const s of scorers) { if (!s?.name) continue; const profile = aggregatePlayerFromScorer(s); const key = `soccer:player:${normalizeName(s.name)}`; if (!dryRun) await cacheSet(key, profile, PLAYER_TTL_SEC); summary.players += 1; } if (!dryRun) await cacheSet(`soccer:${league.toLowerCase()}:scorers`, scorers, SCORERS_TTL_SEC); } // ---- api-football enrichment (PRIMARY cascade write) ---- if (shouldRunSource(args, 'api-football')) { const apifResult = await enrichFromApiFootball(league, args); summary.apiFootballPlayers = apifResult.players || 0; if (apifResult.skipped) summary.apiFootballSkipped = apifResult.skipped; } return summary; } async function main(argv = process.argv) { const args = parseArgs(argv); const startTs = Date.now(); console.log(`[soccer-prefetch] starting — leagues=${args.leagues.join(',')} source=${args.source} max_players=${args.maxPlayers} dry_run=${args.dryRun}`); // Skip only if EVERY 
configured source is unavailable. Previously // we bailed when football-data was unset, but now api-football can // carry the load on its own. const fbdReady = fbd.hasApiKey() && shouldRunSource(args, 'football-data'); const apifReady = apif.hasApiKey() && shouldRunSource(args, 'api-football'); const footapiReady = footapi.hasApiKey() && shouldRunSource(args, 'footapi'); if (!fbdReady && !apifReady && !footapiReady) { console.warn('[soccer-prefetch] no source keys configured — nothing to fetch. Static data + poller OSS fallback continue to work.'); return { skipped: true }; } const results = []; for (const league of args.leagues) { try { const r = await processLeague(league, args); results.push(r); console.log(`[soccer-prefetch] ${league}: standings=${r.standings} scorers=${r.scorers} players=${r.players} teamDefense=${r.teamDefense} apifootball=${r.apiFootballPlayers || 0}${r.apiFootballSkipped ? `(${r.apiFootballSkipped})` : ''} ${r.skipped ? '(skipped: no_data)' : ''}`); } catch (err) { console.warn(`[soccer-prefetch] ${league} failed:`, err.message); results.push({ league, error: err.message }); } } const elapsed = Math.round((Date.now() - startTs) / 1000); console.log(`[soccer-prefetch] done in ${elapsed}s — ${results.length} leagues processed`); return { results, elapsedSec: elapsed }; } if (require.main === module) { main().then(() => process.exit(0)).catch((err) => { console.error('[soccer-prefetch] fatal:', err); process.exit(1); }); } module.exports = { main, __internals: { parseArgs, shouldRunSource, aggregateTeamDefense, aggregatePlayerFromScorer, enrichFromApiFootball, enrichRefereesFromFootApi, processLeague, APIFOOTBALL_LEAGUE_MAP, PLAYER_TTL_SEC, STANDINGS_TTL_SEC, SCORERS_TTL_SEC, DEFENSE_TTL_SEC, REFEREE_TTL_SEC, }, };