""" MLB Statcast enrichment using pybaseball. Provides: - Pitcher pitch-mix + zone heatmap data for K-prop grading - Batter vs Pitcher historical matchup data We avoid wide-net `statcast()` calls that pull every pitch league-wide — those routinely time out. Pitcher-specific calls are scoped to a 30-day trailing window which keeps payloads under a few hundred KB. """ from __future__ import annotations import time from datetime import datetime, timedelta from typing import Optional import pandas as pd from pybaseball import statcast_pitcher from app.utils.cache import cache_get, cache_set from app.config import SPLITS_TTL def _today_iso() -> str: return datetime.utcnow().strftime("%Y-%m-%d") def _date_n_days_ago(n: int) -> str: return (datetime.utcnow() - timedelta(days=n)).strftime("%Y-%m-%d") def get_pitcher_profile(pitcher_id: int, days_back: int = 30) -> dict: """ Aggregate a pitcher's recent pitch-level data into pitch mix, velocity, whiff/chase, and zone heatmap counts. """ if not isinstance(pitcher_id, int) or pitcher_id <= 0: return {"error": "invalid pitcher_id"} cache_key = f"mlb:pitcher:{pitcher_id}:d{days_back}" cached = cache_get(cache_key) if cached is not None: cached["source"] = "cache" return cached end = _today_iso() start = _date_n_days_ago(days_back) try: data = statcast_pitcher(start, end, pitcher_id) except Exception as exc: return {"error": f"statcast fetch failed: {exc!s}"} if data is None or data.empty: return {"pitcher_id": pitcher_id, "pitch_mix": [], "zone": [], "note": "no data"} # Pitch mix description_col = data["description"] if "description" in data.columns else pd.Series(dtype=str) pitch_mix_grouped = data.groupby("pitch_type") if "pitch_type" in data.columns else None pitch_mix: list[dict] = [] if pitch_mix_grouped is not None: for ptype, g in pitch_mix_grouped: total = len(g) d = g["description"] if "description" in g.columns else pd.Series(dtype=str) swings = d.isin([ "swinging_strike", "foul", "foul_tip", "hit_into_play", "swinging_strike_blocked", ]).sum() if not d.empty else 0 whiffs = (d == "swinging_strike").sum() if not d.empty else 0 pitch_mix.append({ "pitch_type": str(ptype), "count": int(total), "share": float(total / len(data)) if len(data) else 0.0, "avg_velocity": float(g["release_speed"].mean()) if "release_speed" in g.columns else None, "whiff_rate": float(whiffs / swings) if swings else 0.0, }) # Zone heatmap (the existing pybaseball 'zone' column is the 13-zone scheme) zone_data: list[dict] = [] if "zone" in data.columns: for zone, g in data.groupby("zone"): d = g["description"] if "description" in g.columns else pd.Series(dtype=str) zone_data.append({ "zone": int(zone) if pd.notna(zone) else None, "pitches": int(len(g)), "whiff_rate": float((d == "swinging_strike").mean()) if not d.empty else 0.0, }) result = { "pitcher_id": pitcher_id, "window_days": days_back, "total_pitches": int(len(data)), "avg_velocity": float(data["release_speed"].mean()) if "release_speed" in data.columns else None, "k_rate_estimate": float((data["events"] == "strikeout").mean()) if "events" in data.columns else None, "pitch_mix": pitch_mix, "zone": zone_data, "source": "statcast", } cache_set(cache_key, result, SPLITS_TTL) return result def get_batter_vs_pitcher(batter_id: int, pitcher_id: int, years_back: int = 3) -> dict: """ Historical matchup. We scope to the pitcher because their pitch stream is small enough to fetch quickly; then filter to plate appearances by the batter. """ if not isinstance(batter_id, int) or not isinstance(pitcher_id, int): return {"error": "invalid ids"} cache_key = f"mlb:bvp:{batter_id}:{pitcher_id}:y{years_back}" cached = cache_get(cache_key) if cached is not None: cached["source"] = "cache" return cached end = _today_iso() start = _date_n_days_ago(365 * years_back) try: pitcher_data = statcast_pitcher(start, end, pitcher_id) except Exception as exc: return {"error": f"statcast fetch failed: {exc!s}"} if pitcher_data is None or pitcher_data.empty or "batter" not in pitcher_data.columns: return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no data"} matchup = pitcher_data[pitcher_data["batter"] == batter_id] if matchup.empty: return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no history"} events = matchup["events"] if "events" in matchup.columns else pd.Series(dtype=str) result = { "batter_id": batter_id, "pitcher_id": pitcher_id, "plate_appearances": int(events.notna().sum()), "hits": int(events.isin(["single", "double", "triple", "home_run"]).sum()), "strikeouts": int((events == "strikeout").sum()), "home_runs": int((events == "home_run").sum()), "walks": int((events == "walk").sum()), "avg_exit_velocity": float(matchup["launch_speed"].mean()) if "launch_speed" in matchup.columns else None, "pitches_seen": int(len(matchup)), "pitch_types_faced": { str(k): int(v) for k, v in (matchup["pitch_type"].value_counts().to_dict().items() if "pitch_type" in matchup.columns else {}).items() }, "source": "statcast", } # Cache aggressively — historical matchup data is stable. cache_set(cache_key, result, SPLITS_TTL * 2) return result