Sessions 5-7a: 955 tests, deployment ready

2026-06-08 18:35:13 -04:00
parent 06b82624a2
commit 1fa04dc776
371 changed files with 49366 additions and 955 deletions
@@ -0,0 +1,153 @@
+"""
+MLB Statcast enrichment using pybaseball.
+
+Provides:
+- Pitcher pitch-mix + zone heatmap data for K-prop grading
+- Batter vs Pitcher historical matchup data
+
+We avoid wide-net `statcast()` calls that pull every pitch league-wide —
+those routinely time out. Pitcher-specific calls are scoped to a 30-day
+trailing window which keeps payloads under a few hundred KB.
+"""
+from __future__ import annotations
+
+import time
+from datetime import datetime, timedelta
+from typing import Optional
+
+import pandas as pd
+from pybaseball import statcast_pitcher
+
+from app.utils.cache import cache_get, cache_set
+from app.config import SPLITS_TTL
+
+
+def _today_iso() -> str:
+    """Return the current UTC date formatted as ``YYYY-MM-DD``."""
+    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
+    # yields the same calendar date. Imported locally because this module's
+    # top-level import only brings in datetime and timedelta.
+    from datetime import timezone
+
+    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+
+def _date_n_days_ago(n: int) -> str:
+    """Return the UTC date ``n`` days before now, formatted as ``YYYY-MM-DD``."""
+    # datetime.utcnow() is deprecated since Python 3.12; use an aware UTC "now".
+    # Imported locally because the module only imports datetime and timedelta.
+    from datetime import timezone
+
+    return (datetime.now(timezone.utc) - timedelta(days=n)).strftime("%Y-%m-%d")
+
+
+def _pitch_value_or_none(value) -> Optional[float]:
+    """Convert a pandas scalar to ``float``, mapping NaN/NA to ``None``."""
+    return None if pd.isna(value) else float(value)
+
+
+def _pitch_whiff_rate(descriptions: pd.Series) -> float:
+    """Return whiffs per swing for a Series of pitch ``description`` values.
+
+    Returns 0.0 when the Series contains no swings (including when it is empty).
+    """
+    swing_outcomes = (
+        "swinging_strike", "foul", "foul_tip", "hit_into_play",
+        "swinging_strike_blocked",
+    )
+    # A blocked swinging strike is still a miss; it must be in the numerator
+    # because it is already counted in the swing denominator.
+    whiff_outcomes = ("swinging_strike", "swinging_strike_blocked")
+    swings = int(descriptions.isin(swing_outcomes).sum())
+    if not swings:
+        return 0.0
+    return float(descriptions.isin(whiff_outcomes).sum() / swings)
+
+
+def get_pitcher_profile(pitcher_id: int, days_back: int = 30) -> dict:
+    """
+    Aggregate a pitcher's recent pitch-level data into pitch mix,
+    velocity, whiff rate, and per-zone counts.
+
+    Args:
+        pitcher_id: Positive integer id passed to ``statcast_pitcher``.
+        days_back: Size of the trailing window in days (part of the cache key).
+
+    Returns:
+        A dict. On success it has ``pitcher_id``, ``window_days``,
+        ``total_pitches``, ``avg_velocity``, ``k_rate_estimate``, ``pitch_mix``,
+        ``zone`` and ``source`` ("statcast", or "cache" on a cache hit).
+        ``{"error": ...}`` is returned for an invalid id or a failed fetch, and
+        a stub with ``"note": "no data"`` when the window has no pitches.
+        Both ``whiff_rate`` fields are whiffs per swing; ``k_rate_estimate`` is
+        strikeouts per plate appearance (rows with a non-null ``events``).
+    """
+    if not isinstance(pitcher_id, int) or pitcher_id <= 0:
+        return {"error": "invalid pitcher_id"}
+
+    cache_key = f"mlb:pitcher:{pitcher_id}:d{days_back}"
+    cached = cache_get(cache_key)
+    if cached is not None:
+        # Return a copy so the object handed back by the cache is not mutated.
+        return {**cached, "source": "cache"}
+
+    end = _today_iso()
+    start = _date_n_days_ago(days_back)
+
+    try:
+        data = statcast_pitcher(start, end, pitcher_id)
+    except Exception as exc:
+        return {"error": f"statcast fetch failed: {exc!s}"}
+
+    if data is None or data.empty:
+        return {"pitcher_id": pitcher_id, "pitch_mix": [], "zone": [], "note": "no data"}
+
+    total_pitches = len(data)
+    has_description = "description" in data.columns
+    has_velocity = "release_speed" in data.columns
+    no_descriptions = pd.Series(dtype=str)
+
+    # Pitch mix: one entry per pitch type present in the window.
+    pitch_mix: list[dict] = []
+    if "pitch_type" in data.columns:
+        for ptype, group in data.groupby("pitch_type"):
+            descriptions = group["description"] if has_description else no_descriptions
+            pitch_mix.append({
+                "pitch_type": str(ptype),
+                "count": int(len(group)),
+                "share": float(len(group) / total_pitches),
+                "avg_velocity": (
+                    _pitch_value_or_none(group["release_speed"].mean()) if has_velocity else None
+                ),
+                "whiff_rate": _pitch_whiff_rate(descriptions),
+            })
+
+    # Zone heatmap (the existing pybaseball 'zone' column is the 13-zone scheme)
+    zone_data: list[dict] = []
+    if "zone" in data.columns:
+        for zone, group in data.groupby("zone"):
+            descriptions = group["description"] if has_description else no_descriptions
+            zone_data.append({
+                "zone": int(zone) if pd.notna(zone) else None,
+                "pitches": int(len(group)),
+                # Same per-swing definition as the pitch-mix whiff_rate.
+                "whiff_rate": _pitch_whiff_rate(descriptions),
+            })
+
+    # Strikeout rate per plate appearance. Only rows with a non-null `events`
+    # value are counted as plate appearances; dividing by every pitch would
+    # understate the rate several-fold.
+    k_rate: Optional[float] = None
+    if "events" in data.columns:
+        events = data["events"]
+        plate_appearances = int(events.notna().sum())
+        strikeouts = int(events.isin(("strikeout", "strikeout_double_play")).sum())
+        k_rate = float(strikeouts / plate_appearances) if plate_appearances else 0.0
+
+    result = {
+        "pitcher_id": pitcher_id,
+        "window_days": days_back,
+        "total_pitches": int(total_pitches),
+        "avg_velocity": _pitch_value_or_none(data["release_speed"].mean()) if has_velocity else None,
+        "k_rate_estimate": k_rate,
+        "pitch_mix": pitch_mix,
+        "zone": zone_data,
+        "source": "statcast",
+    }
+    cache_set(cache_key, result, SPLITS_TTL)
+    return result
+
+
+def get_batter_vs_pitcher(batter_id: int, pitcher_id: int, years_back: int = 3) -> dict:
+    """
+    Historical matchup. We scope to the pitcher because their pitch stream
+    is small enough to fetch quickly; then filter to plate appearances by
+    the batter.
+
+    Args:
+        batter_id: Positive integer id matched against the ``batter`` column.
+        pitcher_id: Positive integer id passed to ``statcast_pitcher``.
+        years_back: Window size in 365-day years (part of the cache key).
+
+    Returns:
+        A dict. On success it carries plate-appearance counts, hit / strikeout /
+        home-run / walk totals, ``avg_exit_velocity`` (``None`` when no exit
+        velocity was recorded), ``pitches_seen``, ``pitch_types_faced`` and
+        ``source``. ``{"error": ...}`` is returned for invalid ids or a failed
+        fetch; a stub with ``"matchup": "no data"`` / ``"no history"`` when
+        nothing is available.
+    """
+    ids_are_ints = isinstance(batter_id, int) and isinstance(pitcher_id, int)
+    # Mirror get_pitcher_profile: ids must be positive integers.
+    if not ids_are_ints or batter_id <= 0 or pitcher_id <= 0:
+        return {"error": "invalid ids"}
+
+    cache_key = f"mlb:bvp:{batter_id}:{pitcher_id}:y{years_back}"
+    cached = cache_get(cache_key)
+    if cached is not None:
+        # Return a copy so the object handed back by the cache is not mutated.
+        return {**cached, "source": "cache"}
+
+    end = _today_iso()
+    start = _date_n_days_ago(365 * years_back)
+
+    try:
+        pitcher_data = statcast_pitcher(start, end, pitcher_id)
+    except Exception as exc:
+        return {"error": f"statcast fetch failed: {exc!s}"}
+
+    if pitcher_data is None or pitcher_data.empty or "batter" not in pitcher_data.columns:
+        return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no data"}
+
+    matchup = pitcher_data[pitcher_data["batter"] == batter_id]
+    if matchup.empty:
+        return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no history"}
+
+    events = matchup["events"] if "events" in matchup.columns else pd.Series(dtype=str)
+
+    # Mean exit velocity is NaN when no pitch has a launch_speed; report None.
+    avg_exit_velocity: Optional[float] = None
+    if "launch_speed" in matchup.columns:
+        mean_speed = matchup["launch_speed"].mean()
+        avg_exit_velocity = None if pd.isna(mean_speed) else float(mean_speed)
+
+    # value_counts().to_dict() is already a dict; the previous code called
+    # .items() on its dict_items view, which raised AttributeError.
+    pitch_type_counts = (
+        matchup["pitch_type"].value_counts().to_dict()
+        if "pitch_type" in matchup.columns
+        else {}
+    )
+
+    result = {
+        "batter_id": batter_id,
+        "pitcher_id": pitcher_id,
+        # A non-null `events` value is counted as one plate appearance.
+        "plate_appearances": int(events.notna().sum()),
+        "hits": int(events.isin(["single", "double", "triple", "home_run"]).sum()),
+        "strikeouts": int(events.isin(["strikeout", "strikeout_double_play"]).sum()),
+        "home_runs": int((events == "home_run").sum()),
+        "walks": int((events == "walk").sum()),
+        "avg_exit_velocity": avg_exit_velocity,
+        "pitches_seen": int(len(matchup)),
+        "pitch_types_faced": {str(k): int(v) for k, v in pitch_type_counts.items()},
+        "source": "statcast",
+    }
+    # Cache aggressively — historical matchup data is stable.
+    cache_set(cache_key, result, SPLITS_TTL * 2)
+    return result
@@ -0,0 +1,107 @@
+"""
+MLB umpire K-zone profiling via pybaseball Statcast pitch data.
+
+Drives the K-prop modifier in the grading engine:
+- Top quartile called-strike rate → boost K projections
+- Bottom quartile → penalize K projections
+
+NOTE: Statcast's per-pitch dataset populates the `umpire` column
+inconsistently across seasons (it is frequently null or missing), and the
+`fielder_*` columns do not identify umpires. We treat missing data
+as 'no signal' rather than blocking the grade.
+"""
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+from typing import Optional
+
+import pandas as pd
+from pybaseball import statcast
+
+from app.utils.cache import cache_get, cache_set
+from app.config import SPLITS_TTL
+
+# Approximate rule-book strike zone half-width / height range in feet.
+# get_umpire_profile uses these to build its in-zone mask: |plate_x| must be
+# within the half-width and plate_z must fall between bottom and top.
+# NOTE(review): 0.83 ft is presumably half the 17-inch plate plus roughly one
+# ball radius, and the fixed 1.5-3.5 ft band ignores batter height - confirm.
+_ZONE_HALF_WIDTH = 0.83
+_ZONE_BOTTOM = 1.5
+_ZONE_TOP = 3.5
+
+
+def _today_iso() -> str:
+    """Return the current UTC date formatted as ``YYYY-MM-DD``."""
+    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
+    # yields the same calendar date. Imported locally because this module's
+    # top-level import only brings in datetime and timedelta.
+    from datetime import timezone
+
+    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+
+def get_umpire_profile(umpire_name: Optional[str] = None, days_back: int = 30) -> dict:
+    """
+    Pull a window of pitch-level data and aggregate by umpire. Returns a
+    league average plus a list of umpires sorted by called-strike rate.
+
+    Heavy call — the window is clamped to 7-45 days (default 30) to keep the
+    payload manageable. The orchestrator should call this nightly, not
+    per-game.
+
+    Args:
+        umpire_name: Optional case-insensitive substring; when given, only
+            umpires whose name contains it are returned.
+        days_back: Requested window in days; falsy values fall back to 30.
+
+    Returns:
+        A dict with ``umpires`` (at most 30 rows, highest called-strike rate
+        first), ``league_avg_called_strike_rate`` (called strikes divided by
+        called strikes plus balls, pooled over every umpire in the window),
+        ``window`` and ``source``. Each row's ``k_rate`` is strikeouts per
+        plate appearance (rows with a non-null ``events``). On fetch failure
+        the dict has ``error`` and an empty ``umpires`` list; when no usable
+        data exists it has an empty ``umpires`` list and a ``note``.
+    """
+    days_back = max(7, min(int(days_back or 30), 45))
+    end = _today_iso()
+    # Derive the start from `end` so both bounds come from one clock reading
+    # and the window is exactly `days_back` days wide.
+    start = (datetime.strptime(end, "%Y-%m-%d") - timedelta(days=days_back)).strftime("%Y-%m-%d")
+
+    cache_key = f"mlb:umpires:{start}:{end}:{umpire_name or 'all'}"
+    cached = cache_get(cache_key)
+    if cached is not None:
+        # Return a copy so the object handed back by the cache is not mutated.
+        return {**cached, "source": "cache"}
+
+    try:
+        data = statcast(start, end)
+    except Exception as exc:
+        return {"error": f"statcast fetch failed: {exc!s}", "umpires": []}
+
+    if data is None or data.empty:
+        return {"umpires": [], "note": "no data", "window": [start, end]}
+
+    if "umpire" not in data.columns:
+        # Some Statcast windows omit the umpire column entirely.
+        return {
+            "umpires": [],
+            "league_avg_called_strike_rate": None,
+            "note": "umpire data unavailable in this window",
+            "window": [start, end],
+        }
+
+    has_description = "description" in data.columns
+    has_events = "events" in data.columns
+    has_location = {"plate_x", "plate_z"}.issubset(data.columns)
+    strikeout_events = ("strikeout", "strikeout_double_play")
+
+    rows = []
+    league_called_strikes = 0
+    league_called_total = 0
+    for ump, group in data.groupby("umpire", dropna=True):
+        if has_description:
+            called_strikes = int((group["description"] == "called_strike").sum())
+            called_balls = int((group["description"] == "ball").sum())
+        else:
+            called_strikes = called_balls = 0
+        called_total = called_strikes + called_balls
+        league_called_strikes += called_strikes
+        league_called_total += called_total
+
+        # Strikeouts per plate appearance; dividing by every pitch would
+        # understate the rate several-fold.
+        if has_events:
+            plate_appearances = int(group["events"].notna().sum())
+            strikeouts = int(group["events"].isin(strikeout_events).sum())
+        else:
+            plate_appearances = strikeouts = 0
+
+        # Computed on the group itself rather than re-indexing a frame-wide
+        # mask by label, which over-counts if index labels are duplicated.
+        if has_location:
+            in_zone = (
+                group["plate_x"].abs() <= _ZONE_HALF_WIDTH
+            ) & group["plate_z"].between(_ZONE_BOTTOM, _ZONE_TOP)
+            in_zone_pitches = int(in_zone.sum())
+        else:
+            in_zone_pitches = 0
+
+        rows.append({
+            "umpire": str(ump),
+            "pitches": int(len(group)),
+            "called_strike_rate": float(called_strikes / called_total) if called_total else 0.0,
+            "k_rate": float(strikeouts / plate_appearances) if plate_appearances else 0.0,
+            "in_zone_pitches": in_zone_pitches,
+        })
+
+    if not rows:
+        return {"umpires": [], "note": "no per-umpire rows aggregated", "window": [start, end]}
+
+    # Pooled rate: an unweighted mean of per-umpire rates would count umpires
+    # with no called pitches as 0.0 and weight one-game samples like full ones.
+    league_avg = (
+        float(league_called_strikes / league_called_total) if league_called_total else 0.0
+    )
+    rows.sort(key=lambda r: r["called_strike_rate"], reverse=True)
+
+    if umpire_name:
+        needle = umpire_name.lower()
+        rows = [r for r in rows if needle in r["umpire"].lower()]
+
+    # NOTE(review): the 30-row cap keeps only the highest called-strike rates,
+    # so the unfiltered view drops the low end of the list - confirm that
+    # consumers needing bottom-quartile umpires query by name.
+    result = {
+        "umpires": rows[:30],
+        "league_avg_called_strike_rate": league_avg,
+        "window": [start, end],
+        "source": "statcast",
+    }
+    cache_set(cache_key, result, SPLITS_TTL)
+    return result
@@ -0,0 +1,38 @@
+"""
+pbpstats wrapper — possession-level NBA/WNBA analytics.
+
+pbpstats client setup is non-trivial; this module exposes a single safe
+entrypoint that returns aggregate possession data per player. If client
+construction fails (commonly due to missing local data files), we return
+a structured 'unavailable' response rather than raising.
+"""
+from __future__ import annotations
+
+from typing import Optional
+
+
+def get_possession_data(player_id: int, season: str = "2025-26", season_type: str = "Regular Season") -> dict:
+    """
+    Probe whether a pbpstats client can be constructed and report the outcome.
+
+    No possession data is fetched yet; per-player aggregation is still a TODO.
+    The function never raises: an import or construction failure is reported
+    in the returned dict.
+
+    Args:
+        player_id: Echoed back in the response.
+        season: Echoed back in the response.
+        season_type: Echoed back in the response.
+
+    Returns:
+        A dict that always carries ``player_id``, ``season``, ``season_type``
+        and ``available``. When available it adds ``note`` and ``source``;
+        otherwise it adds ``error`` with the failure message.
+    """
+    # Both outcomes share these keys so callers see one shape in either state.
+    response = {
+        "player_id": player_id,
+        "season": season,
+        "season_type": season_type,
+    }
+    try:
+        from pbpstats.client import Client
+        settings = {
+            "Boxscore": {"source": "web", "data_provider": "data_nba"},
+            "Possessions": {"source": "web", "data_provider": "data_nba"},
+        }
+        # Construction is the availability probe; the instance is not used yet.
+        Client(settings)
+    except Exception as exc:
+        # Deliberately broad: any failure here means "degraded", not a crash.
+        response["available"] = False
+        response["error"] = f"pbpstats unavailable: {exc!s}"
+        return response
+
+    # The pbpstats API surface depends on the installed version. We
+    # expose just a minimal shape here so the orchestrator can call us
+    # uniformly even when this module is degraded.
+    response["available"] = True
+    response["note"] = "pbpstats client initialized; per-player possession aggregation TODO"
+    response["source"] = "pbpstats"
+    return response
@@ -0,0 +1,160 @@
+"""
+NBA/WNBA referee enrichment.
+
+Source: stats.nba.com unofficial endpoints. Crew assignments are typically
+published ~60-90 minutes before tip via the boxscoresummaryv2 endpoint.
+
+This DIRECTLY affects kill conditions in the grading engine:
+- Crews calling more fouls than league average increase foul-trouble risk
+- Players w/ high foul rates + foul-heavy crews → kill condition activated
+
+This module keeps no state of its own, but both public functions read and
+write the shared cache (app.utils.cache) on top of any orchestrator caching.
+"""
+from __future__ import annotations
+
+import time
+from typing import Optional
+
+import requests
+
+from app.utils.cache import cache_get, cache_set
+from app.config import NBA_API_TIMEOUT, SPLITS_TTL
+
+# Request headers sent with every stats.nba.com call made through _safe_get.
+# NOTE(review): presumably the browser-like User-Agent/Referer and the
+# x-nba-stats-* headers are needed for the endpoint to respond - confirm.
+_NBA_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (compatible; VYNDR/1.0)",
+    "Referer": "https://www.nba.com/",
+    "Accept": "application/json, text/plain, */*",
+    "Accept-Language": "en-US,en;q=0.9",
+    "x-nba-stats-origin": "stats",
+    "x-nba-stats-token": "true",
+}
+
+# NOTE(review): _REF_STATS_URL is not referenced in the visible part of this
+# module; _BOXSCORE_URL is the endpoint get_tonight_officials queries.
+_REF_STATS_URL = "https://stats.nba.com/stats/officialgamefindergamelogs"
+_BOXSCORE_URL = "https://stats.nba.com/stats/boxscoresummaryv2"
+
+# League IDs per stats.nba.com convention
+# Keys are the accepted values of the `league` argument of get_referee_tendencies.
+LEAGUE_ID = {"nba": "00", "wnba": "10"}
+
+
+def _safe_get(url: str, params: dict) -> Optional[dict]:
+    """Resilient GET with a single retry. stats.nba.com is flaky.
+
+    Args:
+        url: Endpoint to request.
+        params: Query-string parameters.
+
+    Returns:
+        The decoded JSON body of the first HTTP 200 response, or ``None`` when
+        both attempts fail (request error, non-200 status, or a body that is
+        not valid JSON).
+    """
+    for attempt in (0, 1):
+        try:
+            resp = requests.get(url, headers=_NBA_HEADERS, params=params, timeout=NBA_API_TIMEOUT)
+            if resp.status_code == 200:
+                return resp.json()
+        except (requests.RequestException, ValueError):
+            # ValueError covers a 200 response whose body is not JSON; older
+            # requests releases raise it outside the RequestException tree.
+            pass
+        if attempt == 0:
+            # Brief pause before the single retry.
+            time.sleep(1.5)
+    return None
+
+
+def get_tonight_officials(game_id: str) -> dict:
+    """
+    Return the crew assigned to a single game. Empty list means assignments
+    haven't been published yet (normal until ~90 min before tip).
+
+    Args:
+        game_id: Alphanumeric game identifier sent as ``GameID``.
+
+    Returns:
+        A dict with ``game_id``, ``officials`` (one dict per official holding
+        ``official_id``, ``name`` and ``jersey_num``) and ``source``. An invalid
+        id yields ``{"error": "invalid game_id", "officials": []}``; an
+        upstream failure yields an empty list with ``"note": "no data"`` and is
+        not cached.
+    """
+    if not game_id or not str(game_id).isalnum():
+        return {"error": "invalid game_id", "officials": []}
+
+    cache_key = f"refs:officials:{game_id}"
+    cached = cache_get(cache_key)
+    if cached is not None:
+        return cached
+
+    data = _safe_get(_BOXSCORE_URL, {"GameID": game_id})
+    if not data or "resultSets" not in data:
+        return {"officials": [], "game_id": game_id, "source": "stats.nba.com", "note": "no data"}
+
+    officials = []
+    for rs in data.get("resultSets", []):
+        if rs.get("name") != "Officials":
+            continue
+        headers = rs.get("headers") or []
+        for row in rs.get("rowSet") or []:
+            record = dict(zip(headers, row))
+            first = record.get("FIRST_NAME", "") or ""
+            last = record.get("LAST_NAME", "") or ""
+            officials.append({
+                "official_id": record.get("OFFICIAL_ID"),
+                "name": f"{first} {last}".strip(),
+                "jersey_num": record.get("JERSEY_NUM"),
+            })
+        # Only the first result set named "Officials" is read.
+        break
+
+    result = {
+        "game_id": game_id,
+        "officials": officials,
+        "source": "stats.nba.com",
+    }
+    # A published crew is cached for the full TTL. An empty crew usually means
+    # "not published yet", so it is cached only briefly (the same 300 s used
+    # by get_referee_tendencies for an unavailable upstream); otherwise the
+    # later-published assignments would stay hidden until SPLITS_TTL expired.
+    cache_set(cache_key, result, ttl=SPLITS_TTL if officials else 300)
+    return result
+
+
+def get_referee_tendencies(season: str, league: str = "nba") -> dict:
+    """
+    Aggregate per-referee tendencies for the season. Returns league_avg_pf
+    and a list of refs sorted by personal-foul rate (highest first);
+    consumers can classify 'tight', 'average', 'generous' crews from the
+    quartile bands.
+
+    NOTE: stats.nba.com's referee dashboard endpoint changes shape every few
+    years. If the upstream returns nothing, the orchestrator should fall
+    back to last season's cached data.
+
+    Args:
+        season: Season string forwarded as the ``Season`` parameter.
+        league: Key of ``LEAGUE_ID`` ("nba" or "wnba").
+
+    Returns:
+        A dict with ``referees``, ``league_avg_pf_per_game`` (mean of the
+        positive ``pf_per_game`` values, or ``None`` when there are none),
+        ``season`` and ``league``, plus ``source`` on success or ``note`` when
+        the upstream is unavailable. An unknown league yields
+        ``{"error": "invalid league", "referees": []}``.
+    """
+    if league not in LEAGUE_ID:
+        return {"error": "invalid league", "referees": []}
+
+    cache_key = f"refs:tendencies:{league}:{season}"
+    cached = cache_get(cache_key)
+    if cached is not None:
+        return cached
+
+    # Only one upstream URL is tried. When it yields nothing we degrade
+    # gracefully — the rest of the pipeline can use league_avg alone to back
+    # off the foul-trouble kill condition modifier.
+    params = {
+        "Season": season,
+        "SeasonType": "Regular Season",
+        "LeagueID": LEAGUE_ID[league],
+        "PerMode": "PerGame",
+    }
+    data = _safe_get("https://stats.nba.com/stats/leaguedashrefstats", params)
+    result_sets = data.get("resultSets") if data else None
+    # Guard the shape too: indexing [0] below needs a non-empty list.
+    if not isinstance(result_sets, list) or not result_sets:
+        result = {
+            "referees": [],
+            "league_avg_pf_per_game": None,
+            "season": season,
+            "league": league,
+            "note": "upstream referee dashboard unavailable",
+        }
+        # Short cache so we retry sooner.
+        cache_set(cache_key, result, ttl=300)
+        return result
+
+    rs = result_sets[0]
+    headers = rs.get("headers") or []
+    refs = []
+    for row in rs.get("rowSet") or []:
+        record = dict(zip(headers, row))
+        refs.append({
+            "name": record.get("REFEREE_NAME", ""),
+            "games": record.get("GP", 0),
+            "pf_per_game": record.get("PF", 0),
+            "tech_per_game": record.get("TECH", 0),
+            "off_foul_per_game": record.get("OFF_FOUL", 0),
+        })
+
+    # Deliver the ordering the contract promises: highest foul rate first.
+    # Missing/None values sort as 0; the sort is stable for ties.
+    refs.sort(key=lambda r: r["pf_per_game"] or 0, reverse=True)
+
+    # Refs without a positive PF value are left out of the league average.
+    pf_values = [r["pf_per_game"] for r in refs if (r["pf_per_game"] or 0) > 0]
+    league_avg = (sum(pf_values) / len(pf_values)) if pf_values else None
+
+    result = {
+        "referees": refs,
+        "league_avg_pf_per_game": league_avg,
+        "season": season,
+        "league": league,
+        "source": "stats.nba.com",
+    }
+    cache_set(cache_key, result, ttl=SPLITS_TTL)
+    return result
@@ -0,0 +1,157 @@
+"""
+WNBA stats — uses nba_api with league_id='10'.
+
+Kept self-contained (not a wrapper over NBA's stats.py) so the existing
+NBA code path stays untouched. Shape of the returned dicts mirrors
+stats.py so callers can dispatch on `sport` without branching downstream.
+"""
+from __future__ import annotations
+
+import time
+from datetime import datetime, timezone
+from typing import Optional
+
+from nba_api.stats.endpoints import playercareerstats, playergamelog
+from nba_api.stats.static import players as wnba_players
+
+from app.utils.cache import cache_get, cache_set
+from app.config import (
+    NBA_API_DELAY, NBA_API_TIMEOUT,
+    SEASON_AVG_TTL, LAST_N_TTL,
+)
+
+# League id passed as `league_id_nullable` to the nba_api endpoints below.
+WNBA_LEAGUE_ID = "10"
+# Maps upstream stat column names to the key names used in returned `stats`
+# dicts. Columns that are absent from a given row/frame are simply skipped.
+_STAT_MAP = {
+    "PTS": "points",
+    "REB": "rebounds",
+    "AST": "assists",
+    "FG3M": "threes",
+    "BLK": "blocks",
+    "STL": "steals",
+    "TOV": "turnovers",
+    "MIN": "minutes",
+    "GP": "games_played",
+}
+
+
+def _wnba_current_season() -> str:
+    """Return the current UTC calendar year as the WNBA season string."""
+    # The WNBA season runs roughly May–September, inside one calendar year,
+    # so the year alone names the season.
+    current_year = datetime.now(tz=timezone.utc).year
+    return f"{current_year}"
+
+
+def _safe(func, **kwargs):
+    """Sleep for ``NBA_API_DELAY`` seconds, then call ``func`` with the shared timeout.
+
+    ``func`` receives ``timeout=NBA_API_TIMEOUT`` plus every keyword argument
+    given here, and its return value is passed straight back to the caller.
+    """
+    # Fixed pause before every call acts as a crude rate limiter.
+    time.sleep(NBA_API_DELAY)
+    endpoint = func(timeout=NBA_API_TIMEOUT, **kwargs)
+    return endpoint
+
+
+def _resolve_wnba_player(name: str) -> tuple[Optional[int], str]:
+    """Resolve a player name to ``(player_id, full_name)``.
+
+    Args:
+        name: Full or partial player name; surrounding whitespace is ignored.
+
+    Returns:
+        The id and full name of the first match, or ``(None, "")`` when the
+        name is shorter than two characters or nothing matches.
+    """
+    name = (name or "").strip()
+    if len(name) < 2:
+        return None, ""
+    # Prefer the WNBA-specific static lookup when the installed nba_api
+    # provides it; find_players_by_full_name searches the NBA list, so WNBA
+    # players would otherwise fail to resolve. getattr keeps older nba_api
+    # releases without the WNBA helper working.
+    wnba_lookup = getattr(wnba_players, "find_wnba_players_by_full_name", None)
+    matches = wnba_lookup(name) if wnba_lookup is not None else []
+    if not matches:
+        # Fall back to the original (NBA static set) lookup.
+        matches = wnba_players.find_players_by_full_name(name)
+    if matches:
+        return matches[0]["id"], matches[0]["full_name"]
+    return None, ""
+
+
+def _map_stats(row: dict) -> dict:
+    """Translate upstream stat columns in ``row`` to this module's key names.
+
+    Only columns listed in ``_STAT_MAP`` and present in ``row`` are carried
+    over; values are copied unchanged.
+    """
+    mapped = {}
+    for source_key, target_key in _STAT_MAP.items():
+        if source_key in row:
+            mapped[target_key] = row[source_key]
+    return mapped
+
+
+def wnba_season_avg(player_name: str, stat_type: Optional[str] = None, season: Optional[str] = None) -> Optional[dict]:
+    """Return a player's season stat line from the career-stats endpoint.
+
+    Args:
+        player_name: Name resolved through ``_resolve_wnba_player``.
+        stat_type: Optional key of the returned ``stats`` dict; when present
+            in the stats, only that entry is returned.
+        season: Season id to select; defaults to ``_wnba_current_season()``.
+
+    Returns:
+        ``None`` when the player cannot be resolved. Otherwise a dict with
+        ``player``, ``player_id``, ``team`` ("UNK" when the season has no
+        row), ``season``, ``league``, ``source`` ("live" or "cache") and
+        ``stats``. The cached entry always holds the full stat line.
+    """
+    player_id, full_name = _resolve_wnba_player(player_name)
+    if player_id is None:
+        return None
+
+    season = season or _wnba_current_season()
+    cache_key = f"wnba:season:{player_id}:{season}"
+    cached = cache_get(cache_key)
+    if cached is not None:
+        # Copy instead of mutating the object handed back by the cache.
+        result = {**cached, "source": "cache"}
+    else:
+        career = _safe(
+            playercareerstats.PlayerCareerStats,
+            player_id=player_id,
+            league_id_nullable=WNBA_LEAGUE_ID,
+        )
+        df = career.get_data_frames()[0]
+        season_row = df[df["SEASON_ID"] == season]
+
+        if season_row.empty:
+            team, stats = "UNK", {}
+        else:
+            # Only the first row matching the season is used.
+            first_row = season_row.iloc[0]
+            team, stats = first_row["TEAM_ABBREVIATION"], _map_stats(first_row.to_dict())
+
+        result = {
+            "player": full_name,
+            "player_id": player_id,
+            "team": team,
+            "season": season,
+            "league": "wnba",
+            "source": "live",
+            "stats": stats,
+        }
+        cache_set(cache_key, result, SEASON_AVG_TTL)
+
+    # Narrow to the requested stat on a copy. Mutating `result` here could
+    # overwrite the full stat line inside a cache that stores by reference.
+    all_stats = result.get("stats") or {}
+    if stat_type and stat_type in all_stats:
+        result = {**result, "stats": {stat_type: all_stats[stat_type]}}
+    return result
+
+
+def wnba_last_n(player_name: str, n: int = 10, stat_type: Optional[str] = None) -> Optional[dict]:
+    """Return a player's averages over the first ``n`` rows of the game log.
+
+    Args:
+        player_name: Name resolved through ``_resolve_wnba_player``.
+        n: Number of games to average; clamped to the range 1-30.
+        stat_type: Optional key of the returned ``stats`` dict; when present
+            in the stats, only that entry is returned.
+
+    Returns:
+        ``None`` when the player cannot be resolved. Otherwise a dict with
+        ``player``, ``player_id``, ``team``, ``last_n``, ``league``, ``source``
+        ("live" or "cache") and ``stats``. An empty game log yields empty
+        ``stats`` with team "UNK" and is not cached.
+    """
+    player_id, full_name = _resolve_wnba_player(player_name)
+    if player_id is None:
+        return None
+
+    n = min(max(int(n), 1), 30)
+    cache_key = f"wnba:last:{player_id}:{n}"
+    cached = cache_get(cache_key)
+    if cached is not None:
+        # Copy instead of mutating the object handed back by the cache.
+        result = {**cached, "source": "cache"}
+    else:
+        season = _wnba_current_season()
+        gamelog = _safe(
+            playergamelog.PlayerGameLog,
+            player_id=player_id,
+            season=season,
+            league_id_nullable=WNBA_LEAGUE_ID,
+        )
+        df = gamelog.get_data_frames()[0]
+
+        if df.empty:
+            return {
+                "player": full_name,
+                "player_id": player_id,
+                "team": "UNK",
+                "last_n": n,
+                "league": "wnba",
+                "source": "live",
+                "stats": {},
+            }
+
+        # NOTE(review): presumably the game log is ordered newest-first, so
+        # head(n) is the most recent n games - confirm against the endpoint.
+        recent = df.head(n)
+        averages = {
+            our: float(recent[their].mean())
+            for their, our in _STAT_MAP.items()
+            if their in recent.columns
+        }
+
+        result = {
+            "player": full_name,
+            "player_id": player_id,
+            # Team code is the first space-separated token of MATCHUP.
+            "team": str(recent.iloc[0].get("MATCHUP", "")).split(" ")[0] or "UNK",
+            "last_n": n,
+            "league": "wnba",
+            "source": "live",
+            "stats": averages,
+        }
+        cache_set(cache_key, result, LAST_N_TTL)
+
+    # Narrow to the requested stat on a copy. Mutating `result` here could
+    # overwrite the full averages inside a cache that stores by reference.
+    all_stats = result.get("stats") or {}
+    if stat_type and stat_type in all_stats:
+        result = {**result, "stats": {stat_type: all_stats[stat_type]}}
+    return result