108 lines
3.7 KiB
Python
108 lines
3.7 KiB
Python
"""
|
|
MLB umpire K-zone profiling via pybaseball Statcast pitch data.
|
|
|
|
Drives the K-prop modifier in the grading engine:
|
|
- Top quartile called-strike rate → boost K projections
|
|
- Bottom quartile → penalize K projections
|
|
|
|
NOTE: Statcast's per-pitch dataset includes umpires under the `umpire` and
|
|
`fielder_*` columns inconsistently across seasons. We treat missing data
|
|
as 'no signal' rather than blocking the grade.
|
|
"""
|
|
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Optional

import pandas as pd
from pybaseball import statcast

from app.utils.cache import cache_get, cache_set
from app.config import SPLITS_TTL
|
|
|
|
# Approximate rule-book strike zone, in feet. A pitch is counted as in-zone
# when |plate_x| <= _ZONE_HALF_WIDTH and _ZONE_BOTTOM <= plate_z <= _ZONE_TOP
# (both vertical bounds inclusive).
_ZONE_HALF_WIDTH = 0.83  # largest |plate_x| still treated as in-zone
_ZONE_BOTTOM = 1.5  # lowest plate_z still treated as in-zone
_ZONE_TOP = 3.5  # highest plate_z still treated as in-zone
|
|
|
|
|
|
def _today_iso() -> str:
    """Return the current UTC calendar date formatted as ``YYYY-MM-DD``."""
    # datetime.utcnow() is deprecated since Python 3.12 and yields a naive
    # value; an explicitly UTC-aware "now" formats to the same date string.
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
|
|
|
|
|
def _summarize_umpire(umpire: object, pitches: pd.DataFrame, in_zone_pitches: int) -> dict:
    """Build one summary row from the pitches grouped under a single umpire."""
    if "description" in pitches.columns:
        description = pitches["description"]
        called_strikes = int((description == "called_strike").sum())
        called_balls = int((description == "ball").sum())
    else:
        called_strikes = called_balls = 0
    called_total = called_strikes + called_balls

    k_rate = 0.0
    if "events" in pitches.columns:
        events = pitches["events"]
        # Per the Statcast CSV documentation, `events` is only populated on
        # the pitch that ends a plate appearance. Averaging over every pitch
        # would give strikeouts-per-pitch, so divide by completed plate
        # appearances (rows with a non-empty event) instead.
        completed = events.notna() & (events != "")
        plate_appearances = int(completed.sum())
        if plate_appearances:
            strikeouts = int(events.isin(("strikeout", "strikeout_double_play")).sum())
            k_rate = strikeouts / plate_appearances

    return {
        "umpire": str(umpire),
        "pitches": int(len(pitches)),
        "called_strike_rate": called_strikes / called_total if called_total else 0.0,
        "k_rate": float(k_rate),
        "in_zone_pitches": in_zone_pitches,
    }


def get_umpire_profile(umpire_name: Optional[str] = None, days_back: int = 30) -> dict:
    """
    Pull a window of pitch-level data and aggregate it by umpire.

    Heavy call: the look-back window is clamped to 7..45 days to keep the
    payload manageable. The orchestrator should call this nightly, not
    per-game.

    Args:
        umpire_name: Optional case-insensitive substring; when given, only
            umpires whose name contains it are returned. The league average
            is still computed over every umpire in the window.
        days_back: Requested window length in days (falsy values mean 30).

    Returns:
        A dict that always has ``"umpires"`` (at most 30 rows, sorted by
        called-strike rate, highest first). Successful results also carry
        ``"league_avg_called_strike_rate"`` (unweighted mean of the
        per-umpire rates), ``"window"`` (``[start, end]``) and ``"source"``
        (``"statcast"`` or ``"cache"``). Degraded results carry ``"error"``
        or ``"note"`` instead of raising, so missing data never blocks a
        grade.
    """
    days_back = max(7, min(int(days_back or 30), 45))
    end = _today_iso()
    # Derive the start from the same date string so the window is exactly
    # `days_back` wide even if the clock crosses midnight during the call.
    start = (datetime.strptime(end, "%Y-%m-%d") - timedelta(days=days_back)).strftime("%Y-%m-%d")
    window = [start, end]

    cache_key = f"mlb:umpires:{start}:{end}:{umpire_name or 'all'}"
    cached = cache_get(cache_key)
    if cached is not None:
        # Return a copy: tagging the cached object in place would leak the
        # "cache" marker into whatever the cache backend still holds.
        return {**cached, "source": "cache"}

    try:
        data = statcast(start, end)
    except Exception as exc:  # best-effort boundary: report, don't crash the grade
        return {"error": f"statcast fetch failed: {exc!s}", "umpires": [], "window": window}

    if data is None or data.empty:
        return {"umpires": [], "note": "no data", "window": window}

    if "umpire" not in data.columns:
        # Some Statcast windows omit the umpire column entirely.
        return {
            "umpires": [],
            "league_avg_called_strike_rate": None,
            "note": "umpire data unavailable in this window",
            "window": window,
        }

    # The fetched frame may carry repeated index labels (e.g. if it was
    # assembled from several chunks); label-based selection below would then
    # count rows more than once, so make the labels unique first.
    data = data.reset_index(drop=True)

    if {"plate_x", "plate_z"}.issubset(data.columns):
        in_zone = (data["plate_x"].abs() <= _ZONE_HALF_WIDTH) & data["plate_z"].between(
            _ZONE_BOTTOM, _ZONE_TOP
        )
    else:
        in_zone = pd.Series(False, index=data.index)

    rows = [
        _summarize_umpire(ump, pitches, int(in_zone.loc[pitches.index].sum()))
        for ump, pitches in data.groupby("umpire", dropna=True)
    ]

    if not rows:
        # Reached when the umpire column exists but holds no usable names.
        return {
            "umpires": [],
            "league_avg_called_strike_rate": None,
            "note": "no per-umpire rows aggregated",
            "window": window,
        }

    # Average over all umpires before any name filter is applied.
    league_avg = sum(r["called_strike_rate"] for r in rows) / len(rows)
    rows.sort(key=lambda r: r["called_strike_rate"], reverse=True)

    if umpire_name:
        needle = umpire_name.lower()
        rows = [r for r in rows if needle in r["umpire"].lower()]

    result = {
        "umpires": rows[:30],
        "league_avg_called_strike_rate": league_avg,
        "window": window,
        "source": "statcast",
    }
    cache_set(cache_key, result, SPLITS_TTL)
    return result
|