460 lines
18 KiB
JavaScript
460 lines
18 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* Daily soccer intelligence prefetch — run once per day.
|
|
*
|
|
* cron: 0 5 * * * (5am UTC, ~midnight ET — before US fixtures)
|
|
* call: node scripts/soccer-data-prefetch.js [--leagues=WC,PL] [--dry-run] [--source=all|api-football|footapi|football-data] [--max-players=N] [--season=YYYY]
|
|
*
|
|
* Why: football-data.org caps at 10 req/min and ~10/day for some
|
|
* endpoints. We can't read these on the user request path. This script
|
|
* batches the reads, transforms them into the per-player / per-team
|
|
* aggregates the feature extractor consumes, and persists them to
|
|
* Redis with conservative TTLs.
|
|
*
|
|
* Writes:
|
|
* soccer:{league}:standings — raw standings from API
|
|
* soccer:{league}:scorers — top-scorers list (projected)
|
|
* soccer:player:{normalizedName} — per-player aggregate (per-90 rates)
|
|
* soccer:teamdefense:{league}:{team} — team defensive aggregate + normalized rank
|
|
*
|
|
* Does NOT write next-match / last-fixture pointers — those are the
|
|
* job of the poller (poller/soccer.js), which runs more frequently
|
|
* since fixture state changes faster.
|
|
*
|
|
* xG data (`xg_per_90`, `xg_delta`) is left null on Day 1 — sourcing
|
|
* it requires a soccerdata-Python bridge that's a follow-up. The
|
|
* downstream feature extractor handles null xG gracefully.
|
|
*
|
|
* No DB writes. Graceful exit (code 0) when API keys are missing — the
|
|
* script logs "skipped" and the feature extractor continues with the
|
|
* static-data-only path.
|
|
*/
|
|
|
|
const fbd = require('../src/services/adapters/footballDataAdapter');
|
|
const apif = require('../src/services/adapters/apiFootballAdapter');
|
|
const footapi = require('../src/services/adapters/footApiAdapter');
|
|
const { cacheSet } = require('../src/utils/redis');
|
|
const { normalizeName } = require('../src/utils/normalize');
|
|
|
|
// Cache lifetimes, in seconds, passed as the third argument to cacheSet.
const PLAYER_TTL_SEC = 24 * 60 * 60; // soccer:player:* and apifootball:player_by_name:*
const STANDINGS_TTL_SEC = 12 * 60 * 60; // soccer:{league}:standings
const SCORERS_TTL_SEC = 6 * 60 * 60; // soccer:{league}:scorers
const DEFENSE_TTL_SEC = 12 * 60 * 60; // soccer:teamdefense:{league}:{team}
const REFEREE_TTL_SEC = 7 * 24 * 60 * 60; // footapi:referee_by_name:*
|
|
|
|
// Session 10 — football-data competition code → api-football league ID.
// The prefetch uses this to request the matching season's data from
// api-football; a code that is absent here makes enrichFromApiFootball
// report `unmapped_league`. Frozen so the table cannot be mutated at
// runtime. Add codes here as more leagues come online.
const APIFOOTBALL_LEAGUE_MAP = Object.freeze({
  // International / continental competitions
  WC: 1, // FIFA World Cup
  // Domestic leagues
  PL: 39, // English Premier League
  PD: 140, // La Liga
  BL1: 78, // Bundesliga
  SA: 135, // Serie A
  FL1: 61, // Ligue 1
  CL: 2, // UEFA Champions League
  MLS: 253, // MLS
});
|
|
|
|
/**
 * Parse the command-line flags for a prefetch run.
 *
 * Recognized flags (anything else is ignored):
 *   --leagues=WC,PL    comma-separated competition codes; trimmed and upper-cased
 *   --dry-run          compute everything but skip cache writes
 *   --source=<name>    all | api-football | footapi | football-data;
 *                      an unrecognized value falls back to `all`
 *   --max-players=N    positive number, floored to an integer
 *   --season=YYYY      number greater than 1900
 *
 * When `argv` carries no --leagues flag, a non-empty SOCCER_LEAGUES
 * environment variable (same comma-separated format) replaces the default.
 *
 * @param {string[]} argv - full argv-style array; the first two entries
 *   (runtime and script path) are skipped.
 * @returns {{leagues: string[], dryRun: boolean, source: string, maxPlayers: number, season: number}}
 */
function parseArgs(argv) {
  // Sources controls which adapters get called. `all` (default) tries
  // every configured adapter; the explicit single-source values are
  // useful for debugging or for skipping a misbehaving source.
  const VALID_SOURCES = new Set(['all', 'api-football', 'footapi', 'football-data']);
  const toLeagueCodes = (raw) => raw.split(',').map((s) => s.trim().toUpperCase()).filter(Boolean);

  const args = {
    leagues: ['WC'],
    dryRun: false,
    source: 'all',
    maxPlayers: 80,
    season: 2026,
  };

  const flags = argv.slice(2);
  for (const a of flags) {
    if (a.startsWith('--leagues=')) {
      args.leagues = toLeagueCodes(a.slice('--leagues='.length));
    } else if (a === '--dry-run') {
      args.dryRun = true;
    } else if (a.startsWith('--source=')) {
      const src = a.slice('--source='.length).trim().toLowerCase();
      args.source = VALID_SOURCES.has(src) ? src : 'all';
    } else if (a.startsWith('--max-players=')) {
      const n = Number(a.slice('--max-players='.length));
      if (Number.isFinite(n) && n > 0) args.maxPlayers = Math.floor(n);
    } else if (a.startsWith('--season=')) {
      const n = Number(a.slice('--season='.length));
      if (Number.isFinite(n) && n > 1900) args.season = n;
    }
  }

  // The env fallback must look at the argv we were handed, not at
  // process.argv: callers such as main(argv) pass their own array, and an
  // explicit --leagues flag there has to win over SOCCER_LEAGUES.
  if (!flags.some((a) => a.startsWith('--leagues='))) {
    const env = process.env.SOCCER_LEAGUES;
    if (env) args.leagues = toLeagueCodes(env);
  }
  return args;
}
|
|
|
|
/**
 * Decide whether the adapter identified by `source` should run.
 *
 * A missing `args` or a missing/empty `args.source` counts as `all`, so
 * callers (and existing tests) that never set `source` keep the legacy
 * "run every source" behavior.
 *
 * @param {{source?: string}|null|undefined} args - parsed CLI arguments.
 * @param {string} source - adapter name to test.
 * @returns {boolean} true when every source is requested or `source` is the requested one.
 */
function shouldRunSource(args, source) {
  const wanted = args?.source || 'all';
  if (wanted === 'all') return true;
  return wanted === source;
}
|
|
|
|
/**
 * Project a single team's standings row into the defensive aggregate
 * the feature extractor reads.
 *
 * `defensive_rank_norm` is on a 0..1 scale (0 = best defense, 1 = worst)
 * so it slots into engine1's opp_rank_stat.
 *
 * @param {object} standingsRow - one table row; reads `playedGames` (or
 *   `played`), `goalsAgainst` and the optional `cleanSheets`.
 * @param {object[]} allRows - every row of the table to rank against. A
 *   non-array value ranks the team against itself only.
 * @returns {{goals_conceded_per_game: number, clean_sheet_rate: (number|null),
 *   defensive_rank: number, defensive_rank_norm: number, played_games: number}|null}
 *   null when the row has no games played or no goals-against figure.
 */
function aggregateTeamDefense(standingsRow, allRows) {
  const playedGames = standingsRow.playedGames || standingsRow.played || 0;
  const goalsAgainst = standingsRow.goalsAgainst ?? null;
  if (!playedGames || goalsAgainst == null) return null;

  const goalsConcededPerGame = goalsAgainst / playedGames;

  // Goals conceded per game for every rankable row, ascending (best first).
  // Rows without a goals-against figure are left out: counting them as 0
  // conceded would rank teams with missing data as the best defense and
  // push every real team down the table.
  const table = Array.isArray(allRows) ? allRows : [standingsRow];
  const allRates = table
    .map((r) => {
      const pg = r.playedGames || r.played || 0;
      if (!pg || r.goalsAgainst == null) return null;
      return r.goalsAgainst / pg;
    })
    .filter((v) => Number.isFinite(v))
    .sort((a, b) => a - b);

  // Position of the first rate that is at least this team's rate; tied
  // teams therefore share the better rank. A team worse than every listed
  // rate takes the last slot, and an empty table never yields a negative index.
  let rank = allRates.findIndex((v) => v >= goalsConcededPerGame);
  if (rank === -1) rank = Math.max(allRates.length - 1, 0);
  const rankNorm = allRates.length > 1 ? rank / (allRates.length - 1) : 0;

  // Clean sheets (not on the football-data row in the free tier — null is OK).
  const cleanSheets = standingsRow.cleanSheets ?? null;
  const cleanSheetRate = cleanSheets != null && playedGames > 0
    ? cleanSheets / playedGames
    : null;

  return {
    goals_conceded_per_game: Math.round(goalsConcededPerGame * 1000) / 1000,
    clean_sheet_rate: cleanSheetRate,
    defensive_rank: rank + 1, // 1-indexed for human reasoning
    defensive_rank_norm: rankNorm, // 0..1 for engine1
    played_games: playedGames,
  };
}
|
|
|
|
/**
 * Project a single scorer row into the per-player aggregate.
 *
 * Rates are per 90 minutes when the row carries a positive minutes figure.
 * The free tier sometimes omits minutes; the rates then fall back to a
 * per-match figure (count / playedMatches), or null when no matches are
 * recorded either.
 *
 * @param {object} scorerRow - reads `team`, `position`, `nationality`,
 *   `goals`, `assists`, `playedMatches` and `minutesPlayed`.
 * @returns {object} aggregate with raw counts, rounded rates, and the
 *   form / xG fields that are not sourced yet set to null.
 */
function aggregatePlayerFromScorer(scorerRow) {
  const round3 = (value) => Math.round(value * 1000) / 1000;
  const countOf = (raw) => Number(raw) || 0;

  // Number(null) is 0, so a missing minutes field is kept as null instead
  // of being read as "played 0 minutes".
  const rawMinutes = scorerRow.minutesPlayed;
  const minutes = rawMinutes == null ? null : Number(rawMinutes);
  const minutesKnown = Number.isFinite(minutes);
  const canUseMinutes = minutesKnown && minutes > 0;

  const goals = countOf(scorerRow.goals);
  const assists = countOf(scorerRow.assists);
  const played = countOf(scorerRow.playedMatches);

  const rateFor = (count) => {
    if (canUseMinutes) return round3(count / (minutes / 90));
    if (played > 0) return round3(count / played);
    return null;
  };

  const goalsPer90 = rateFor(goals);
  const assistsPer90 = rateFor(assists);

  let minutesPerGame = null;
  if (minutesKnown && played > 0) {
    minutesPerGame = Math.round(minutes / played);
  }

  return {
    team: scorerRow.team,
    position: scorerRow.position,
    nationality: scorerRow.nationality,
    goals,
    assists,
    played,
    minutes: minutesKnown ? minutes : null,
    goals_per_90: goalsPer90,
    assists_per_90: assistsPer90,
    minutes_per_game: minutesPerGame,
    // Day 1 — no rolling 5-match form, no xG. The feature extractor
    // falls back to season_per_90 when recent_form_per_90 is null.
    recent_form_per_90: null,
    season_per_90: goalsPer90,
    start_rate: null,
    xg_per_90: null,
    xa_per_90: null,
    xg_delta: null,
  };
}
|
|
|
|
/**
 * Session 10 — pull a league's finished fixtures from api-football and
 * aggregate per-player season stats across them.
 *
 * Writes `apifootball:player_by_name:{normalizedName}` so the cascade hits
 * PRIMARY for these players instead of falling through to football-data.
 * At most `args.maxPlayers` profiles are written per call, and at most 16
 * fixtures are fetched (each fixture costs one api-football call).
 *
 * @param {string} league - football-data competition code (a key of
 *   APIFOOTBALL_LEAGUE_MAP).
 * @param {{season: number, maxPlayers: number, dryRun?: boolean}} args
 * @returns {Promise<{players: number, skipped?: string, fixturesProcessed?: number}>}
 *   `skipped` is `no_key`, `unmapped_league` or `no_fixtures` when nothing
 *   was attempted; otherwise `fixturesProcessed` is the number of fixtures
 *   whose player stats were actually requested.
 */
async function enrichFromApiFootball(league, args) {
  if (!apif.hasApiKey()) {
    return { skipped: 'no_key', players: 0 };
  }
  const leagueId = APIFOOTBALL_LEAGUE_MAP[league];
  if (!leagueId) {
    return { skipped: 'unmapped_league', players: 0 };
  }
  const fixtures = await apif.getFixtures({ league: leagueId, season: args.season });
  if (!Array.isArray(fixtures) || fixtures.length === 0) {
    return { skipped: 'no_fixtures', players: 0 };
  }

  // Only walk FINISHED fixtures — in-progress games have partial stats
  // that would skew the per-90 rates. Treated as finished: 'FT' / 'AET' /
  // 'PEN' plus 'AWD' / 'WO'. Everything else (not started, live,
  // suspended, ...) is ignored.
  const finishedStatuses = new Set(['FT', 'AET', 'PEN', 'AWD', 'WO']);
  const finished = fixtures.filter((f) => finishedStatuses.has(f?.status));

  // Index by normalized player name across all finished fixtures. We
  // accumulate raw stats then collapse into per-90 rates at the end.
  const byPlayer = new Map();
  const fixtureBudget = Math.min(finished.length, 16); // budget cap — each fixture is 1 api-football call
  let fixturesProcessed = 0;

  for (const fixture of finished.slice(0, fixtureBudget)) {
    // Stop fetching once we hold twice as many players as we may write.
    if (byPlayer.size >= args.maxPlayers * 2) break;
    const playerStats = await apif.getFixturePlayerStats(fixture.id);
    // Count the call itself, so the reported figure reflects the fixtures
    // really fetched even when the player cap ends the loop early.
    fixturesProcessed += 1;
    if (!Array.isArray(playerStats)) continue;
    for (const row of playerStats) {
      if (!row?.name) continue;
      const key = normalizeName(row.name);
      const agg = byPlayer.get(key) || {
        name: row.name,
        team: row.team,
        playerId: row.playerId,
        position: row.position,
        appearances: 0,
        starts: 0,
        minutes: 0,
        goals: 0,
        assists: 0,
        shots_total: 0,
        shots_on: 0,
        tackles_total: 0,
        yellow: 0,
        red: 0,
        rating_sum: 0,
        rating_count: 0,
      };
      // NOTE(review): every returned row counts as an appearance; this
      // assumes the adapter does not return unused substitutes — confirm.
      agg.appearances += 1;
      if (!row.substitute) agg.starts += 1;
      agg.minutes += Number(row.minutes) || 0;
      agg.goals += Number(row.goals) || 0;
      agg.assists += Number(row.assists) || 0;
      agg.shots_total += Number(row.shots_total) || 0;
      agg.shots_on += Number(row.shots_on) || 0;
      agg.tackles_total += Number(row.tackles_total) || 0;
      agg.yellow += Number(row.yellow) || 0;
      agg.red += Number(row.red) || 0;
      // Missing ratings coerce to 0 or NaN and are left out of the average.
      const rating = Number(row.rating);
      if (Number.isFinite(rating) && rating > 0) {
        agg.rating_sum += rating;
        agg.rating_count += 1;
      }
      byPlayer.set(key, agg);
    }
  }

  // Rate per 90 minutes rounded to 3 decimals; null without playing time.
  const per90 = (count, minutes) => (
    minutes > 0 ? Math.round((count / (minutes / 90)) * 1000) / 1000 : null
  );

  // Collapse and persist (within maxPlayers budget), in first-seen order.
  let written = 0;
  for (const [normalized, agg] of byPlayer) {
    if (written >= args.maxPlayers) break;
    const goalsPer90 = per90(agg.goals, agg.minutes);
    const profile = {
      name: agg.name,
      team: agg.team,
      playerId: agg.playerId,
      position: agg.position,
      appearances: agg.appearances,
      starts: agg.starts,
      minutes: agg.minutes,
      goals: agg.goals,
      assists: agg.assists,
      // Cascade-canonical fields.
      goals_per_90: goalsPer90,
      assists_per_90: per90(agg.assists, agg.minutes),
      minutes_per_game: agg.appearances > 0 ? Math.round(agg.minutes / agg.appearances) : null,
      start_rate: agg.appearances > 0 ? Math.round((agg.starts / agg.appearances) * 100) / 100 : null,
      // Soccer-specific overlays.
      shots_per_90: per90(agg.shots_total, agg.minutes),
      shots_on_per_90: per90(agg.shots_on, agg.minutes),
      tackles_per_90: per90(agg.tackles_total, agg.minutes),
      yellow_per_90: per90(agg.yellow, agg.minutes),
      avg_rating: agg.rating_count > 0 ? Math.round((agg.rating_sum / agg.rating_count) * 100) / 100 : null,
      // xG fields still null — when an api-football endpoint that
      // exposes xG goes live, fill here.
      xg_per_90: null,
      xa_per_90: null,
      xg_delta: null,
      // Aliases for the legacy reader.
      recent_form_per_90: null,
      season_per_90: goalsPer90,
    };
    if (!args.dryRun) {
      await cacheSet(`apifootball:player_by_name:${normalized}`, profile, PLAYER_TTL_SEC);
    }
    written += 1;
  }
  return { players: written, fixturesProcessed };
}
|
|
|
|
/**
 * Session 10 — enrich the per-referee cache via FootApi.
 *
 * Referees move slowly so a 7-day TTL is fine. This pass is best-effort:
 * with no API key it is skipped, and a referee whose statistics lookup
 * fails or comes back empty is logged/skipped without aborting the rest.
 *
 * @param {Array<{id: *, name: string}>} refereeIds - referees to look up;
 *   entries without both `id` and `name` are ignored.
 * @param {{dryRun?: boolean}} args - when `dryRun` is set nothing is written.
 * @returns {Promise<{referees: number, skipped?: string}>} number of
 *   referee payloads produced (`skipped: 'no_key'` when FootApi has no key).
 */
async function enrichRefereesFromFootApi(refereeIds, args) {
  if (!footapi.hasApiKey()) return { skipped: 'no_key', referees: 0 };
  if (!Array.isArray(refereeIds) || refereeIds.length === 0) return { referees: 0 };

  let written = 0;
  for (const referee of refereeIds) {
    // Tolerate null/undefined entries instead of throwing on destructure.
    const { id, name } = referee ?? {};
    if (!id || !name) continue;

    let stats;
    try {
      stats = await footapi.getRefereeStatistics(id);
    } catch (err) {
      // One failing referee (e.g. a 404) must not abort the whole pass.
      console.warn(`[soccer-prefetch] referee ${id} (${name}) failed:`, err?.message ?? err);
      continue;
    }
    if (!Array.isArray(stats) || stats.length === 0) continue;

    // Prefer the row for tournamentId 16 (the WC-2026 row per the original
    // author's note); otherwise use the first row returned. Rows are NOT
    // merged across tournaments.
    const wc = stats.find((s) => s.tournamentId === 16) || stats[0];
    const payload = {
      name,
      cards_per_game: wc.yellowCardsPerGame,
      penalties_per_game: null, // FootApi schema doesn't expose this directly
      appearances: wc.appearances,
      yellow_cards: wc.yellowCards,
      red_cards: wc.redCards,
    };
    if (!args?.dryRun) {
      // NOTE(review): the key uses the raw name, unlike the player keys
      // which go through normalizeName — confirm what the reader expects.
      await cacheSet(`footapi:referee_by_name:${name}`, payload, REFEREE_TTL_SEC);
    }
    written += 1;
  }
  return { referees: written };
}
|
|
|
|
/**
 * Prefetch one league: football-data standings and scorers, then the
 * api-football player enrichment.
 *
 * Cache writes (all skipped when `args.dryRun` is set):
 *   soccer:teamdefense:{league}:{team}  one per ranked team
 *   soccer:{league}:standings           raw standings
 *   soccer:player:{normalizedName}      one per named scorer
 *   soccer:{league}:scorers             raw scorers list
 * plus whatever enrichFromApiFootball writes.
 *
 * @param {string} league - competition code, e.g. `WC`.
 * @param {{dryRun?: boolean, source?: string}} args - parsed CLI arguments.
 * @returns {Promise<object>} summary counters; `skipped` is true only when
 *   football-data returned nothing AND api-football produced no players.
 */
async function processLeague(league, args) {
  const { dryRun } = args;
  const leagueKey = league.toLowerCase();
  const summary = {
    league, standings: 0, scorers: 0, players: 0, teamDefense: 0,
    apiFootballPlayers: 0, apiFootballSkipped: null, skipped: false,
  };

  const [standings, scorers] = await Promise.all([
    fbd.getLeagueStandings(league),
    fbd.getLeagueScorers(league),
  ]);

  // BOTH null means football-data is unavailable for this league. We no
  // longer return here: api-football enrichment below can still carry the
  // league on its own.
  const footballDataUnavailable = standings === null && scorers === null;

  // ---- Standings → team defensive aggregates ----
  // football-data wraps standings in entries with a `table`. Only entries
  // of type 'TOTAL' (or with no type at all) are used, so HOME/AWAY splits
  // cannot double-count a team or overwrite its key with partial figures.
  // All qualifying tables are flattened so a competition with multiple
  // groups (e.g. World Cup group stage) feeds one combined rank table.
  if (Array.isArray(standings)) {
    const allRows = [];
    for (const group of standings) {
      if (!Array.isArray(group?.table)) continue;
      if (group.type != null && group.type !== 'TOTAL') continue;
      for (const row of group.table) {
        if (row?.team?.name) allRows.push({ ...row, teamName: row.team.name });
      }
    }
    summary.standings = allRows.length;

    for (const row of allRows) {
      const agg = aggregateTeamDefense(row, allRows);
      if (!agg) continue;
      const key = `soccer:teamdefense:${leagueKey}:${row.teamName}`;
      if (!dryRun) await cacheSet(key, agg, DEFENSE_TTL_SEC);
      summary.teamDefense += 1;
    }
    if (!dryRun) await cacheSet(`soccer:${leagueKey}:standings`, standings, STANDINGS_TTL_SEC);
  }

  // ---- Scorers → per-player aggregates (football-data, TERTIARY) ----
  // Always write the legacy soccer:player:* keys so the cascade has a
  // working fallback even when api-football is rate-limited or
  // misconfigured. These rows are thinner (no per-match minutes, no
  // rating) but they keep the engine producing non-null features.
  if (Array.isArray(scorers) && shouldRunSource(args, 'football-data')) {
    summary.scorers = scorers.length;
    for (const s of scorers) {
      if (!s?.name) continue;
      const profile = aggregatePlayerFromScorer(s);
      const key = `soccer:player:${normalizeName(s.name)}`;
      if (!dryRun) await cacheSet(key, profile, PLAYER_TTL_SEC);
      summary.players += 1;
    }
    if (!dryRun) await cacheSet(`soccer:${leagueKey}:scorers`, scorers, SCORERS_TTL_SEC);
  }

  // ---- api-football enrichment (PRIMARY cascade write) ----
  if (shouldRunSource(args, 'api-football')) {
    const apifResult = await enrichFromApiFootball(league, args);
    summary.apiFootballPlayers = apifResult.players || 0;
    if (apifResult.skipped) summary.apiFootballSkipped = apifResult.skipped;
  }

  // "Skipped" now means no source delivered anything for this league.
  summary.skipped = footballDataUnavailable && summary.apiFootballPlayers === 0;
  return summary;
}
|
|
|
|
/**
 * Entry point: parse flags, then prefetch each requested league in turn.
 *
 * A league whose processing throws is logged and recorded as
 * `{ league, error }`; the remaining leagues still run.
 *
 * @param {string[]} [argv=process.argv] - argv-style array handed to parseArgs.
 * @returns {Promise<{skipped: true}|{results: object[], elapsedSec: number}>}
 *   `{ skipped: true }` when no requested source has an API key.
 */
async function main(argv = process.argv) {
  const args = parseArgs(argv);
  const startTs = Date.now();

  console.log(`[soccer-prefetch] starting — leagues=${args.leagues.join(',')} source=${args.source} max_players=${args.maxPlayers} dry_run=${args.dryRun}`);

  // Skip only if EVERY configured source is unavailable. Previously
  // we bailed when football-data was unset, but now api-football can
  // carry the load on its own.
  const adapters = [
    [fbd, 'football-data'],
    [apif, 'api-football'],
    [footapi, 'footapi'],
  ];
  const readiness = adapters.map(
    ([adapter, sourceName]) => adapter.hasApiKey() && shouldRunSource(args, sourceName),
  );
  if (!readiness.some(Boolean)) {
    console.warn('[soccer-prefetch] no source keys configured — nothing to fetch. Static data + poller OSS fallback continue to work.');
    return { skipped: true };
  }

  // Leagues are processed one at a time, in the order given.
  const results = [];
  for (const league of args.leagues) {
    try {
      const r = await processLeague(league, args);
      results.push(r);
      const apifSuffix = r.apiFootballSkipped ? `(${r.apiFootballSkipped})` : '';
      const skipNote = r.skipped ? '(skipped: no_data)' : '';
      console.log(`[soccer-prefetch] ${league}: standings=${r.standings} scorers=${r.scorers} players=${r.players} teamDefense=${r.teamDefense} apifootball=${r.apiFootballPlayers || 0}${apifSuffix} ${skipNote}`);
    } catch (err) {
      console.warn(`[soccer-prefetch] ${league} failed:`, err.message);
      results.push({ league, error: err.message });
    }
  }

  const elapsed = Math.round((Date.now() - startTs) / 1000);
  console.log(`[soccer-prefetch] done in ${elapsed}s — ${results.length} leagues processed`);
  return { results, elapsedSec: elapsed };
}
|
|
|
|
// Run the prefetch only when this file is executed directly (not when it
// is required by another module). Exit code 0 on completion, 1 on an
// unexpected rejection from main().
if (require.main === module) {
  const exitOk = () => process.exit(0);
  const exitFatal = (err) => {
    console.error('[soccer-prefetch] fatal:', err);
    process.exit(1);
  };
  main().then(exitOk).catch(exitFatal);
}
|
|
|
|
module.exports = {
|
|
main,
|
|
__internals: {
|
|
parseArgs,
|
|
shouldRunSource,
|
|
aggregateTeamDefense,
|
|
aggregatePlayerFromScorer,
|
|
enrichFromApiFootball,
|
|
enrichRefereesFromFootApi,
|
|
processLeague,
|
|
APIFOOTBALL_LEAGUE_MAP,
|
|
PLAYER_TTL_SEC,
|
|
STANDINGS_TTL_SEC,
|
|
SCORERS_TTL_SEC,
|
|
DEFENSE_TTL_SEC,
|
|
REFEREE_TTL_SEC,
|
|
},
|
|
};
|