Files
vyndr/nba-service/app/services/mlb_statcast.py
T

154 lines
5.8 KiB
Python

"""
MLB Statcast enrichment using pybaseball.
Provides:
- Pitcher pitch-mix + zone heatmap data for K-prop grading
- Batter vs Pitcher historical matchup data
We avoid wide-net `statcast()` calls that pull every pitch league-wide —
those routinely time out. Every fetch is scoped to a single pitcher: profile
calls default to a 30-day trailing window, which keeps payloads under a few
hundred KB, while matchup lookups span a multi-year window (3 years by default).
"""
from __future__ import annotations

import time
from datetime import datetime, timedelta, timezone
from typing import Optional

import pandas as pd
from pybaseball import statcast_pitcher

from app.utils.cache import cache_get, cache_set
from app.config import SPLITS_TTL
def _today_iso() -> str:
    """Return the current UTC calendar date formatted as ``YYYY-MM-DD``."""
    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware
    # "now" in UTC produces the same calendar date.
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
def _date_n_days_ago(n: int) -> str:
    """Return the UTC calendar date *n* days before now as ``YYYY-MM-DD``."""
    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware
    # "now" in UTC produces the same calendar date.
    return (datetime.now(timezone.utc) - timedelta(days=n)).strftime("%Y-%m-%d")
# Statcast `description` values treated as a swing, and the subset of those
# that are swings-and-misses.
_SWING_DESCRIPTIONS = (
    "swinging_strike", "foul", "foul_tip", "hit_into_play",
    "swinging_strike_blocked",
)
_WHIFF_DESCRIPTIONS = ("swinging_strike", "swinging_strike_blocked")
# Statcast `events` values that record a strikeout.
_STRIKEOUT_EVENTS = ("strikeout", "strikeout_double_play")


def _finite_mean(frame: pd.DataFrame, column: str) -> Optional[float]:
    """Return the mean of *column*, or None if it is absent or has no values."""
    if column not in frame.columns:
        return None
    mean = frame[column].mean()
    # An all-NaN column averages to NaN, which is not valid strict JSON.
    return float(mean) if pd.notna(mean) else None


def _whiff_rate(frame: pd.DataFrame) -> float:
    """Return whiffs per swing for *frame*, or 0.0 when there are no swings."""
    if "description" not in frame.columns:
        return 0.0
    descriptions = frame["description"]
    swings = int(descriptions.isin(_SWING_DESCRIPTIONS).sum())
    if not swings:
        return 0.0
    whiffs = int(descriptions.isin(_WHIFF_DESCRIPTIONS).sum())
    return float(whiffs / swings)


def _strikeout_rate(frame: pd.DataFrame) -> Optional[float]:
    """Return strikeouts per recorded event, or None when none are recorded."""
    if "events" not in frame.columns:
        return None
    # `events` is only populated on the pitch that ends a plate appearance,
    # so averaging over every pitch would give strikeouts per pitch instead.
    outcomes = frame["events"].dropna()
    if outcomes.empty:
        return None
    return float(outcomes.isin(_STRIKEOUT_EVENTS).mean())


def get_pitcher_profile(pitcher_id: int, days_back: int = 30) -> dict:
    """
    Aggregate a pitcher's recent pitch-level data into pitch mix,
    velocity, whiff rate, and zone heatmap counts.

    Args:
        pitcher_id: Positive integer player id passed to ``statcast_pitcher``.
        days_back: Length of the trailing window, in days, ending today (UTC).

    Returns:
        On success, a dict with ``pitcher_id``, ``window_days``,
        ``total_pitches``, ``avg_velocity``, ``k_rate_estimate``,
        ``pitch_mix`` (one entry per pitch type), ``zone`` (one entry per
        zone) and ``source`` ("statcast" for a fresh fetch, "cache" for a
        cache hit). Every ``whiff_rate`` is whiffs per swing. Averages are
        ``None`` when the underlying column is missing or has no values.
        Returns ``{"error": ...}`` for an invalid id or a failed fetch, and a
        stub with empty ``pitch_mix``/``zone`` and ``"note": "no data"`` when
        the fetch returns nothing. Only successful results are cached.
    """
    # bool is a subclass of int, so exclude it explicitly.
    if isinstance(pitcher_id, bool) or not isinstance(pitcher_id, int) or pitcher_id <= 0:
        return {"error": "invalid pitcher_id"}

    cache_key = f"mlb:pitcher:{pitcher_id}:d{days_back}"
    cached = cache_get(cache_key)
    if cached is not None:
        # Return a copy so the object held by the cache layer is not mutated.
        return {**cached, "source": "cache"}

    end = _today_iso()
    start = _date_n_days_ago(days_back)
    try:
        data = statcast_pitcher(start, end, pitcher_id)
    except Exception as exc:
        # Boundary with an external scraper: report any failure to the caller
        # as an error payload rather than raising.
        return {"error": f"statcast fetch failed: {exc!s}"}

    if data is None or data.empty:
        return {"pitcher_id": pitcher_id, "pitch_mix": [], "zone": [], "note": "no data"}

    total_pitches = len(data)

    # Pitch mix: one entry per pitch type. `share` is relative to all pitches
    # in the window, including any whose pitch type is missing.
    pitch_mix: list[dict] = []
    if "pitch_type" in data.columns:
        pitch_mix = [
            {
                "pitch_type": str(pitch_type),
                "count": int(len(pitches)),
                "share": float(len(pitches) / total_pitches),
                "avg_velocity": _finite_mean(pitches, "release_speed"),
                "whiff_rate": _whiff_rate(pitches),
            }
            for pitch_type, pitches in data.groupby("pitch_type")
        ]

    # Zone heatmap (the existing pybaseball 'zone' column is the 13-zone scheme)
    zone_data: list[dict] = []
    if "zone" in data.columns:
        zone_data = [
            {
                "zone": int(zone) if pd.notna(zone) else None,
                "pitches": int(len(pitches)),
                "whiff_rate": _whiff_rate(pitches),
            }
            for zone, pitches in data.groupby("zone")
        ]

    result = {
        "pitcher_id": pitcher_id,
        "window_days": days_back,
        "total_pitches": int(total_pitches),
        "avg_velocity": _finite_mean(data, "release_speed"),
        "k_rate_estimate": _strikeout_rate(data),
        "pitch_mix": pitch_mix,
        "zone": zone_data,
        "source": "statcast",
    }
    cache_set(cache_key, result, SPLITS_TTL)
    return result
def get_batter_vs_pitcher(batter_id: int, pitcher_id: int, years_back: int = 3) -> dict:
    """
    Historical matchup. We scope to the pitcher because their pitch stream
    is small enough to fetch quickly; then filter to plate appearances by
    the batter.

    Args:
        batter_id: Positive integer player id of the batter.
        pitcher_id: Positive integer player id of the pitcher.
        years_back: Size of the trailing window, in 365-day years.

    Returns:
        On success, a dict of matchup counts (``plate_appearances``, ``hits``,
        ``strikeouts``, ``home_runs``, ``walks``, ``pitches_seen``),
        ``avg_exit_velocity`` (``None`` when no exit velocity was recorded),
        ``pitch_types_faced`` (pitch type -> pitch count) and ``source``
        ("statcast" for a fresh fetch, "cache" for a cache hit).
        Returns ``{"error": ...}`` for invalid ids or a failed fetch, and a
        stub with ``"matchup": "no data"`` / ``"no history"`` when the pitcher
        has no data or never faced the batter. Only successful results are
        cached.
    """
    # bool is a subclass of int, so exclude it explicitly; ids must be positive.
    for player_id in (batter_id, pitcher_id):
        if isinstance(player_id, bool) or not isinstance(player_id, int) or player_id <= 0:
            return {"error": "invalid ids"}

    cache_key = f"mlb:bvp:{batter_id}:{pitcher_id}:y{years_back}"
    cached = cache_get(cache_key)
    if cached is not None:
        # Return a copy so the object held by the cache layer is not mutated.
        return {**cached, "source": "cache"}

    end = _today_iso()
    start = _date_n_days_ago(365 * years_back)
    try:
        pitcher_data = statcast_pitcher(start, end, pitcher_id)
    except Exception as exc:
        # Boundary with an external scraper: report any failure to the caller
        # as an error payload rather than raising.
        return {"error": f"statcast fetch failed: {exc!s}"}

    if pitcher_data is None or pitcher_data.empty or "batter" not in pitcher_data.columns:
        return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no data"}

    matchup = pitcher_data[pitcher_data["batter"] == batter_id]
    if matchup.empty:
        return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no history"}

    # `events` is only populated on the pitch that ends a plate appearance.
    events = matchup["events"] if "events" in matchup.columns else pd.Series(dtype=object)

    # With no recorded exit velocity the mean is NaN, which is not valid
    # strict JSON; report None instead.
    avg_exit_velocity = None
    if "launch_speed" in matchup.columns:
        mean_exit_velocity = matchup["launch_speed"].mean()
        if pd.notna(mean_exit_velocity):
            avg_exit_velocity = float(mean_exit_velocity)

    # Iterate the Series directly: calling .items() on the result of
    # dict.items() raises AttributeError.
    pitch_types_faced: dict = {}
    if "pitch_type" in matchup.columns:
        pitch_types_faced = {
            str(pitch_type): int(count)
            for pitch_type, count in matchup["pitch_type"].value_counts().items()
        }

    result = {
        "batter_id": batter_id,
        "pitcher_id": pitcher_id,
        "plate_appearances": int(events.notna().sum()),
        "hits": int(events.isin(["single", "double", "triple", "home_run"]).sum()),
        # Statcast records a strikeout that starts a double play separately.
        "strikeouts": int(events.isin(["strikeout", "strikeout_double_play"]).sum()),
        "home_runs": int((events == "home_run").sum()),
        "walks": int((events == "walk").sum()),
        "avg_exit_velocity": avg_exit_velocity,
        "pitches_seen": int(len(matchup)),
        "pitch_types_faced": pitch_types_faced,
        "source": "statcast",
    }
    # Cache aggressively — historical matchup data is stable.
    cache_set(cache_key, result, SPLITS_TTL * 2)
    return result