# vyndr/nba-service/app/services/mlb_umpire.py

"""
MLB umpire K-zone profiling via pybaseball Statcast pitch data.

Drives the K-prop modifier in the grading engine:
- Top quartile called-strike rate → boost K projections
- Bottom quartile → penalize K projections

NOTE: Statcast's per-pitch dataset includes umpires under the `umpire` and
`fielder_*` columns inconsistently across seasons. We treat missing data
as 'no signal' rather than blocking the grade.
"""
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Optional

import pandas as pd
from pybaseball import statcast

from app.config import SPLITS_TTL
from app.utils.cache import cache_get, cache_set

# Approximate rule-book strike zone, expressed in feet. These bounds are
# compared against the Statcast `plate_x` / `plate_z` columns to flag in-zone
# pitches. The same fixed box is applied to every pitch (no per-batter
# adjustment of the top/bottom edges).
_ZONE_HALF_WIDTH = 0.83  # a pitch is horizontally in-zone when |plate_x| <= this
_ZONE_BOTTOM = 1.5  # lowest plate_z still counted as in-zone (inclusive)
_ZONE_TOP = 3.5  # highest plate_z still counted as in-zone (inclusive)


def _today_iso() -> str:
    """Return the current UTC calendar date as a ``YYYY-MM-DD`` string.

    Built from a timezone-aware ``datetime.now(timezone.utc)`` rather than the
    naive ``datetime.utcnow()``, which is deprecated as of Python 3.12. The
    formatted string is identical.
    """
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")


def _flag_equal(series: pd.Series, value: str) -> pd.Series:
    """Return a plain-bool mask of ``series == value``; missing values count as False."""
    return series.eq(value).fillna(False).astype(bool)


def _no_signal_result(start: str, end: str, **extra) -> dict:
    """Build the payload shared by every early-exit ('no signal') path."""
    return {
        "umpires": [],
        "league_avg_called_strike_rate": None,
        "window": [start, end],
        **extra,
    }


def _aggregate_umpire_rows(data: pd.DataFrame) -> tuple:
    """Collapse pitch-level rows into one summary dict per umpire.

    ``data`` must contain an ``umpire`` column; ``description``, ``events``,
    ``plate_x`` and ``plate_z`` are optional and contribute zero counts when
    absent.

    Returns ``(rows, league_avg)`` where ``rows`` is ordered by umpire key and
    ``league_avg`` is the mean called-strike rate over umpires that have at
    least one called pitch (0.0 when there are none).
    """
    wanted = [
        col
        for col in ("umpire", "description", "events", "plate_x", "plate_z")
        if col in data.columns
    ]
    # Work on a positional index. The incoming frame's index labels are not
    # guaranteed to be unique, and label-based lookups on a duplicated index
    # return extra rows, which would inflate the per-umpire counts.
    pitches = data[wanted].reset_index(drop=True)
    all_false = pd.Series(False, index=pitches.index)

    if "description" in pitches.columns:
        called_strike = _flag_equal(pitches["description"], "called_strike")
        called_ball = _flag_equal(pitches["description"], "ball")
    else:
        called_strike = called_ball = all_false

    if "events" in pitches.columns:
        # `events` is only populated on the pitch that ends a plate
        # appearance, so K rate is measured against those rows rather than
        # against every pitch thrown.
        plate_appearance = pitches["events"].notna()
        strikeout = _flag_equal(pitches["events"], "strikeout")
    else:
        plate_appearance = strikeout = all_false

    if {"plate_x", "plate_z"}.issubset(pitches.columns):
        in_zone = (
            (pitches["plate_x"].abs() <= _ZONE_HALF_WIDTH)
            & pitches["plate_z"].between(_ZONE_BOTTOM, _ZONE_TOP)
        ).fillna(False).astype(bool)
    else:
        in_zone = all_false

    counts = pd.DataFrame(
        {
            "pitches": 1,
            "called_strikes": called_strike.astype(int),
            "called_balls": called_ball.astype(int),
            "plate_appearances": plate_appearance.astype(int),
            "strikeouts": strikeout.astype(int),
            "in_zone": in_zone.astype(int),
        },
        index=pitches.index,
    )
    # Rows with a missing umpire are dropped; groups come back sorted by key.
    totals = counts.groupby(pitches["umpire"], dropna=True).sum()

    rows = []
    rated = []  # called-strike rates of umpires that actually have called pitches
    for rec in totals.itertuples():
        called_total = int(rec.called_strikes) + int(rec.called_balls)
        plate_appearances = int(rec.plate_appearances)
        if called_total:
            cs_rate = int(rec.called_strikes) / called_total
            rated.append(cs_rate)
        else:
            # Kept at 0.0 for backward compatibility, but left out of the
            # league average so a data gap does not drag the baseline down.
            cs_rate = 0.0
        rows.append({
            "umpire": str(rec.Index),
            "pitches": int(rec.pitches),
            "called_strike_rate": float(cs_rate),
            "k_rate": (
                float(int(rec.strikeouts) / plate_appearances)
                if plate_appearances
                else 0.0
            ),
            "in_zone_pitches": int(rec.in_zone),
        })

    league_avg = sum(rated) / len(rated) if rated else 0.0
    return rows, league_avg


def get_umpire_profile(umpire_name: Optional[str] = None, days_back: int = 30) -> dict:
    """
    Pull a window of pitch-level Statcast data and aggregate it by umpire.

    Heavy call: the whole window is downloaded on every cache miss. The
    orchestrator should call this nightly, not per-game.

    Args:
        umpire_name: Optional case-insensitive substring; when given, only
            umpires whose name contains it are returned. The league average
            is still computed over all umpires in the window.
        days_back: Size of the window in days, ending today (UTC). Falsy
            values default to 30; the value is clamped to the range 7-45.

    Returns:
        On success, a dict with:
          - ``umpires``: up to 30 per-umpire dicts (``umpire``, ``pitches``,
            ``called_strike_rate``, ``k_rate``, ``in_zone_pitches``) sorted by
            called-strike rate, highest first.
          - ``league_avg_called_strike_rate``: mean called-strike rate over
            umpires with at least one called pitch.
          - ``window``: ``[start, end]`` as ``YYYY-MM-DD`` strings.
          - ``source``: ``"statcast"`` for a fresh pull, ``"cache"`` otherwise.
        When no signal is available (fetch failure, empty window, missing
        ``umpire`` column, or nothing to aggregate) ``umpires`` is empty,
        ``league_avg_called_strike_rate`` is ``None`` and an ``error`` or
        ``note`` key explains why. These results are not cached.

    NOTE(review): the unfiltered list is truncated to the 30 highest
    called-strike rates, so low-rate umpires can fall off the end when more
    than 30 worked in the window. Confirm that this is what consumers of the
    bottom-quartile signal expect.
    """
    days_back = max(7, min(int(days_back or 30), 45))
    end = _today_iso()
    # Derive the start date from `end` so both bounds come from a single clock
    # read and cannot straddle midnight.
    start = (
        datetime.strptime(end, "%Y-%m-%d") - timedelta(days=days_back)
    ).strftime("%Y-%m-%d")

    cache_key = f"mlb:umpires:{start}:{end}:{umpire_name or 'all'}"
    cached = cache_get(cache_key)
    if cached is not None:
        # Return a copy so the object held by the cache is never mutated.
        return {**cached, "source": "cache"}

    try:
        data = statcast(start, end)
    except Exception as exc:
        # Best-effort boundary around the third-party fetch: a failed pull is
        # reported as 'no signal' instead of blocking the grade.
        return _no_signal_result(
            start, end, error=f"statcast fetch failed: {exc!s}"
        )

    if data is None or data.empty:
        return _no_signal_result(start, end, note="no data")

    if "umpire" not in data.columns:
        # Some Statcast windows omit the umpire column entirely.
        return _no_signal_result(
            start, end, note="umpire data unavailable in this window"
        )

    rows, league_avg = _aggregate_umpire_rows(data)
    if not rows:
        return _no_signal_result(start, end, note="no per-umpire rows aggregated")

    # Stable sort: ties keep the umpire-key order produced by the aggregation.
    rows.sort(key=lambda r: r["called_strike_rate"], reverse=True)

    if umpire_name:
        needle = umpire_name.lower()
        rows = [r for r in rows if needle in r["umpire"].lower()]

    result = {
        "umpires": rows[:30],
        "league_avg_called_strike_rate": league_avg,
        "window": [start, end],
        "source": "statcast",
    }
    cache_set(cache_key, result, SPLITS_TTL)
    return result