Sessions 5-7a: 955 tests, deployment ready
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
MLB umpire K-zone profiling via pybaseball Statcast pitch data.
|
||||
|
||||
Drives the K-prop modifier in the grading engine:
|
||||
- Top quartile called-strike rate → boost K projections
|
||||
- Bottom quartile → penalize K projections
|
||||
|
||||
NOTE: Statcast's per-pitch dataset populates umpire information (the `umpire`
and `fielder_*` columns) inconsistently across seasons. This module reads only
the `umpire` column and treats missing data as 'no signal' rather than
blocking the grade.
|
||||
"""
|
||||
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Optional

import pandas as pd
from pybaseball import statcast

from app.config import SPLITS_TTL
from app.utils.cache import cache_get, cache_set
|
||||
|
||||
# Approximate rule-book strike zone, in feet. Used to flag pitches whose
# plate_x / plate_z location falls inside the zone (bounds are inclusive).
_ZONE_HALF_WIDTH = 0.83  # largest |plate_x| still counted as in-zone
_ZONE_BOTTOM = 1.5  # lowest plate_z still counted as in-zone
_ZONE_TOP = 3.5  # highest plate_z still counted as in-zone
|
||||
|
||||
|
||||
def _today_iso() -> str:
    """Return the current UTC calendar date formatted as ``YYYY-MM-DD``."""
    # datetime.utcnow() is deprecated since Python 3.12 and yields a naive
    # value; an aware "now" in UTC formats to exactly the same date string.
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def _aggregate_umpire_rows(data: pd.DataFrame) -> list[dict]:
    """Collapse pitch-level rows into one summary dict per umpire.

    ``data`` must contain an ``umpire`` column. ``description``, ``events``,
    ``plate_x`` and ``plate_z`` are optional; when one is missing the
    statistics that depend on it come back as zero.

    Returns a list of dicts with keys ``umpire``, ``pitches``,
    ``called_strike_rate``, ``k_rate`` and ``in_zone_pitches``, ordered by
    the groupby key. Rows whose ``umpire`` is null are dropped.
    """
    if {"plate_x", "plate_z"}.issubset(data.columns):
        inside_horizontally = data["plate_x"].abs() <= _ZONE_HALF_WIDTH
        inside_vertically = data["plate_z"].between(_ZONE_BOTTOM, _ZONE_TOP)
        in_zone = inside_horizontally & inside_vertically
    else:
        in_zone = pd.Series(False, index=data.index)

    # Build a slim frame with a fresh positional index: only the columns we
    # aggregate are carried into the groupby, and the in-zone flag travels
    # with its row, so a non-unique index on ``data`` cannot inflate counts
    # the way a label-based ``in_zone[g.index]`` lookup would.
    slim = pd.DataFrame({"umpire": data["umpire"].to_numpy()})
    for column in ("description", "events"):
        slim[column] = data[column].to_numpy() if column in data.columns else None
    slim["in_zone"] = in_zone.to_numpy()

    rows = []
    for ump, group in slim.groupby("umpire", dropna=True):
        description = group["description"]
        called_strikes = int((description == "called_strike").sum())
        called_balls = int((description == "ball").sum())
        called_total = called_strikes + called_balls

        # `events` is only populated on the pitch that ends a plate
        # appearance, so rate strikeouts against those rows rather than
        # against every pitch the umpire saw.
        events = group["events"].dropna()
        plate_appearances = events[events != ""]
        if len(plate_appearances):
            k_rate = float((plate_appearances == "strikeout").mean())
        else:
            k_rate = 0.0

        rows.append({
            "umpire": str(ump),
            "pitches": int(len(group)),
            "called_strike_rate": float(called_strikes / called_total) if called_total else 0.0,
            "k_rate": k_rate,
            "in_zone_pitches": int(group["in_zone"].sum()),
        })
    return rows


def get_umpire_profile(umpire_name: Optional[str] = None, days_back: int = 30) -> dict:
    """
    Pull a window of pitch-level data and aggregate by umpire. Returns a
    league average plus a list of umpires sorted by called-strike rate.

    Heavy call — the window is clamped to 7-45 days to keep the payload
    manageable. The orchestrator should call this nightly, not per-game.

    Args:
        umpire_name: Optional case-insensitive substring; when given, only
            umpires whose name contains it are returned. The league average
            is still computed over every umpire in the window.
        days_back: Requested window length in days. Falsy values fall back
            to 30; the result is clamped to the 7-45 range.

    Returns:
        A dict that always carries ``umpires`` (at most 30 rows, highest
        called-strike rate first), ``league_avg_called_strike_rate`` (the
        unweighted mean of per-umpire rates, or ``None`` when nothing could
        be aggregated) and ``window`` (``[start, end]`` ISO dates).
        ``source`` is ``"statcast"`` for a fresh pull and ``"cache"`` for a
        cache hit; ``note`` or ``error`` explains an empty result. Only
        successful aggregations are cached.
    """
    days_back = max(7, min(int(days_back or 30), 45))
    end = _today_iso()
    # Derive the start date from `end` so both bounds come from a single
    # clock read and cannot straddle a UTC midnight.
    start = (datetime.strptime(end, "%Y-%m-%d") - timedelta(days=days_back)).strftime("%Y-%m-%d")

    cache_key = f"mlb:umpires:{start}:{end}:{umpire_name or 'all'}"
    cached = cache_get(cache_key)
    if cached is not None:
        # Return a copy: tagging the stored object in place would leak the
        # "cache" marker into whatever the cache backend still holds.
        return {**cached, "source": "cache"}

    try:
        data = statcast(start, end)
    except Exception as exc:
        # Best-effort boundary: a failed fetch must not block grading.
        return {
            "error": f"statcast fetch failed: {exc!s}",
            "umpires": [],
            "league_avg_called_strike_rate": None,
            "window": [start, end],
        }

    if data is None or data.empty:
        return {
            "umpires": [],
            "league_avg_called_strike_rate": None,
            "note": "no data",
            "window": [start, end],
        }

    if "umpire" not in data.columns:
        # Some Statcast windows omit the umpire column entirely.
        return {
            "umpires": [],
            "league_avg_called_strike_rate": None,
            "note": "umpire data unavailable in this window",
            "window": [start, end],
        }

    rows = _aggregate_umpire_rows(data)
    if not rows:
        return {
            "umpires": [],
            "league_avg_called_strike_rate": None,
            "note": "no per-umpire rows aggregated",
            "window": [start, end],
        }

    # League average is taken before any name filter so a single-umpire
    # lookup is still compared against the whole window.
    league_avg = sum(r["called_strike_rate"] for r in rows) / len(rows)
    rows.sort(key=lambda r: r["called_strike_rate"], reverse=True)

    if umpire_name:
        needle = umpire_name.lower()
        rows = [r for r in rows if needle in r["umpire"].lower()]

    result = {
        "umpires": rows[:30],
        "league_avg_called_strike_rate": league_avg,
        "window": [start, end],
        "source": "statcast",
    }
    cache_set(cache_key, result, SPLITS_TTL)
    return result
|
||||
Reference in New Issue
Block a user