Files

108 lines
3.7 KiB
Python

"""
MLB umpire K-zone profiling via pybaseball Statcast pitch data.
Drives the K-prop modifier in the grading engine:
- Top quartile called-strike rate → boost K projections
- Bottom quartile → penalize K projections
NOTE: Statcast's per-pitch dataset reportedly carries umpires under the
`umpire` and `fielder_*` columns inconsistently across seasons; this module
reads only the `umpire` column. Missing data is treated as 'no signal'
rather than blocking the grade.
"""
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Optional

import pandas as pd
from pybaseball import statcast

from app.config import SPLITS_TTL
from app.utils.cache import cache_get, cache_set

# Approximate rule-book strike zone half-width / height range in feet.
# One fixed zone is applied to every pitch: get_umpire_profile compares these
# thresholds against the Statcast `plate_x` / `plate_z` columns to count
# in-zone pitches. All three bounds are inclusive.
_ZONE_HALF_WIDTH = 0.83  # largest |plate_x| still counted as in the zone
_ZONE_BOTTOM = 1.5  # lowest plate_z counted as in the zone
_ZONE_TOP = 3.5  # highest plate_z counted as in the zone
def _today_iso() -> str:
    """Return the current UTC calendar date formatted as ``YYYY-MM-DD``."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # formats to the same date string.
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")


def _aggregate_umpire_counts(data: pd.DataFrame) -> pd.DataFrame:
    """Sum per-pitch indicator flags for each umpire.

    Returns a frame indexed by umpire (sorted by name, null umpires dropped)
    with integer columns ``pitches``, ``called_strikes``, ``called_balls``,
    ``plate_appearances``, ``strikeouts`` and ``in_zone_pitches``. Any source
    column missing from *data* is treated as all-null, so its counts are 0.
    """
    # Copy only the columns we need and renumber the rows: everything below is
    # then positional, so duplicate index labels in the source frame cannot
    # inflate the counts.
    frame = data.reindex(
        columns=["umpire", "description", "events", "plate_x", "plate_z"]
    ).reset_index(drop=True)

    def as_flag(mask: pd.Series) -> pd.Series:
        # Nullable dtypes propagate <NA>; an unknown value never counts.
        return mask.fillna(False).astype(bool)

    description = frame["description"]
    events = frame["events"]
    in_zone = frame["plate_x"].abs().le(_ZONE_HALF_WIDTH) & frame["plate_z"].between(
        _ZONE_BOTTOM, _ZONE_TOP
    )

    flags = pd.DataFrame(
        {
            "umpire": frame["umpire"],
            "pitches": 1,
            "called_strikes": as_flag(description.eq("called_strike")),
            "called_balls": as_flag(description.eq("ball")),
            # `events` is only populated on the pitch that ends a plate
            # appearance, so non-null rows are the plate-appearance count.
            "plate_appearances": as_flag(events.notna() & events.ne("")),
            "strikeouts": as_flag(
                events.isin(("strikeout", "strikeout_double_play"))
            ),
            "in_zone_pitches": as_flag(in_zone),
        }
    )
    return flags.groupby("umpire", dropna=True).sum()


def get_umpire_profile(umpire_name: Optional[str] = None, days_back: int = 30) -> dict:
    """
    Pull a window of pitch-level data and aggregate it by umpire.

    Heavy call: it downloads every pitch in the window. The orchestrator
    should call this nightly, not per-game.

    Args:
        umpire_name: Optional case-insensitive substring; when given, only
            umpires whose name contains it are returned.
        days_back: Window length in days ending today (UTC). Falsy values
            fall back to 30; the value is clamped to the range 7-45.

    Returns:
        A dict that always has ``umpires`` (list of per-umpire dicts sorted by
        ``called_strike_rate`` descending, at most 30 entries),
        ``league_avg_called_strike_rate`` and ``window`` (``[start, end]``).
        On success ``source`` is ``"statcast"`` or ``"cache"``. When no
        profile can be built, ``umpires`` is empty, the league average is
        ``None`` and a ``note`` or ``error`` key explains why; these
        "no signal" results are never cached.

        Per-umpire keys: ``umpire``, ``pitches``, ``called_strike_rate``
        (called strikes / (called strikes + balls)), ``k_rate`` (strikeouts
        per completed plate appearance) and ``in_zone_pitches``.
    """
    days_back = max(7, min(int(days_back or 30), 45))
    # Read the clock once so start and end cannot straddle midnight.
    now = datetime.now(timezone.utc)
    end = now.strftime("%Y-%m-%d")
    start = (now - timedelta(days=days_back)).strftime("%Y-%m-%d")
    window = [start, end]

    def no_signal(**extra) -> dict:
        # Uniform shape for every early exit so callers can rely on the keys.
        return {
            "umpires": [],
            "league_avg_called_strike_rate": None,
            "window": window,
            **extra,
        }

    cache_key = f"mlb:umpires:{start}:{end}:{umpire_name or 'all'}"
    cached = cache_get(cache_key)
    if cached is not None:
        # Return a copy: tagging the cached object in place would leak the
        # "cache" marker into the stored entry for in-memory backends.
        return {**cached, "source": "cache"}

    try:
        data = statcast(start, end)
    except Exception as exc:
        # Deliberately broad: any upstream failure degrades to 'no signal'
        # instead of blocking the grade.
        return no_signal(error=f"statcast fetch failed: {exc!s}")

    if data is None or data.empty:
        return no_signal(note="no data")
    if "umpire" not in data.columns:
        # Some Statcast windows omit the umpire column entirely.
        return no_signal(note="umpire data unavailable in this window")

    # NOTE(review): Baseball Savant's CSV docs appear to mark `umpire` as a
    # deprecated field; if it is always null this returns the note below.
    counts = _aggregate_umpire_counts(data)
    if counts.empty:
        return no_signal(note="no per-umpire rows aggregated")

    rows = []
    for rec in counts.itertuples():
        called_total = rec.called_strikes + rec.called_balls
        rows.append({
            "umpire": str(rec.Index),
            "pitches": int(rec.pitches),
            "called_strike_rate": (
                float(rec.called_strikes / called_total) if called_total else 0.0
            ),
            "k_rate": (
                float(rec.strikeouts / rec.plate_appearances)
                if rec.plate_appearances
                else 0.0
            ),
            "in_zone_pitches": int(rec.in_zone_pitches),
        })

    # Pool all called pitches for the league figure: a plain mean of
    # per-umpire rates lets tiny samples and the 0.0 placeholder of umpires
    # without called pitches skew the average.
    league_strikes = int(counts["called_strikes"].sum())
    league_called = league_strikes + int(counts["called_balls"].sum())
    league_avg = league_strikes / league_called if league_called else 0.0

    # Stable sort: ties keep the alphabetical order produced by the groupby.
    rows.sort(key=lambda r: r["called_strike_rate"], reverse=True)
    if umpire_name:
        needle = umpire_name.lower()
        rows = [r for r in rows if needle in r["umpire"].lower()]

    result = {
        "umpires": rows[:30],
        "league_avg_called_strike_rate": league_avg,
        "window": window,
        "source": "statcast",
    }
    cache_set(cache_key, result, SPLITS_TTL)
    return result