""" MLB umpire K-zone profiling via pybaseball Statcast pitch data. Drives the K-prop modifier in the grading engine: - Top quartile called-strike rate → boost K projections - Bottom quartile → penalize K projections NOTE: Statcast's per-pitch dataset includes umpires under the `umpire` and `fielder_*` columns inconsistently across seasons. We treat missing data as 'no signal' rather than blocking the grade. """ from __future__ import annotations from datetime import datetime, timedelta from typing import Optional import pandas as pd from pybaseball import statcast from app.utils.cache import cache_get, cache_set from app.config import SPLITS_TTL # Approximate rule-book strike zone half-width / height range in feet. _ZONE_HALF_WIDTH = 0.83 _ZONE_BOTTOM = 1.5 _ZONE_TOP = 3.5 def _today_iso() -> str: return datetime.utcnow().strftime("%Y-%m-%d") def get_umpire_profile(umpire_name: Optional[str] = None, days_back: int = 30) -> dict: """ Pull a window of pitch-level data and aggregate by umpire. Returns a league average plus a list of umpires sorted by called-strike rate. Heavy call — capped at 30 days to keep the payload manageable. The orchestrator should call this nightly, not per-game. """ days_back = max(7, min(int(days_back or 30), 45)) end = _today_iso() start = (datetime.utcnow() - timedelta(days=days_back)).strftime("%Y-%m-%d") cache_key = f"mlb:umpires:{start}:{end}:{umpire_name or 'all'}" cached = cache_get(cache_key) if cached is not None: cached["source"] = "cache" return cached try: data = statcast(start, end) except Exception as exc: return {"error": f"statcast fetch failed: {exc!s}", "umpires": []} if data is None or data.empty: return {"umpires": [], "note": "no data", "window": [start, end]} if "umpire" not in data.columns: # Some Statcast windows omit the umpire column entirely. return { "umpires": [], "league_avg_called_strike_rate": None, "note": "umpire data unavailable in this window", "window": [start, end], } in_zone = ( data["plate_x"].abs() <= _ZONE_HALF_WIDTH ) & ( data["plate_z"].between(_ZONE_BOTTOM, _ZONE_TOP) ) if {"plate_x", "plate_z"}.issubset(data.columns) else pd.Series(False, index=data.index) grouped = data.groupby("umpire", dropna=True) rows = [] for ump, g in grouped: d = g["description"] if "description" in g.columns else pd.Series(dtype=str) called_strikes = int((d == "called_strike").sum()) called_balls = int((d == "ball").sum()) called_total = called_strikes + called_balls events = g["events"] if "events" in g.columns else pd.Series(dtype=str) rows.append({ "umpire": str(ump), "pitches": int(len(g)), "called_strike_rate": float(called_strikes / called_total) if called_total else 0.0, "k_rate": float((events == "strikeout").mean()) if not events.empty else 0.0, "in_zone_pitches": int(in_zone[g.index].sum()) if not in_zone.empty else 0, }) if not rows: return {"umpires": [], "note": "no per-umpire rows aggregated"} league_avg = sum(r["called_strike_rate"] for r in rows) / len(rows) rows.sort(key=lambda r: r["called_strike_rate"], reverse=True) if umpire_name: needle = umpire_name.lower() rows = [r for r in rows if needle in r["umpire"].lower()] result = { "umpires": rows[:30], "league_avg_called_strike_rate": league_avg, "window": [start, end], "source": "statcast", } cache_set(cache_key, result, SPLITS_TTL) return result