# vyndr/nba-service/app/services/refs.py

"""
NBA/WNBA referee enrichment.

Source: stats.nba.com unofficial endpoints. Crew assignments are typically
published ~60-90 minutes before tip via the boxscoresummaryv2 endpoint.

This DIRECTLY affects kill conditions in the grading engine:
- Crews calling more fouls than league average increase foul-trouble risk
- Players w/ high foul rates + foul-heavy crews → kill condition activated

We intentionally keep no in-process state in this module: lookups are
memoized through app.utils.cache, and the orchestrator caches results too.
"""
from __future__ import annotations

import time
from typing import Optional

import requests

from app.utils.cache import cache_get, cache_set
from app.config import NBA_API_TIMEOUT, SPLITS_TTL

# Request headers sent with every call made through _safe_get.
# NOTE(review): presumably the browser-like User-Agent/Referer and the
# x-nba-stats-* fields are what the unofficial endpoints expect — confirm
# before trimming any of them.
_NBA_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; VYNDR/1.0)",
    "Referer": "https://www.nba.com/",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.9",
    "x-nba-stats-origin": "stats",
    "x-nba-stats-token": "true",
}

# NOTE(review): _REF_STATS_URL is not referenced by any function visible in
# this file (get_referee_tendencies spells out its own URL inline) — confirm
# whether it is still needed elsewhere.
_REF_STATS_URL = "https://stats.nba.com/stats/officialgamefindergamelogs"
# Box score summary endpoint; get_tonight_officials reads its "Officials"
# result set.
_BOXSCORE_URL = "https://stats.nba.com/stats/boxscoresummaryv2"

# League IDs per stats.nba.com convention. The keys are the only `league`
# values get_referee_tendencies accepts.
LEAGUE_ID = {"nba": "00", "wnba": "10"}


def _safe_get(url: str, params: dict) -> Optional[dict]:
    """
    Resilient GET with a single retry. stats.nba.com is flaky.

    Makes at most two attempts, sleeping 1.5 s between them. An attempt is
    treated as failed when the request raises, the status code is not 200,
    the body is not valid JSON, or the decoded JSON is not an object.

    Returns the decoded JSON object, or None when both attempts fail. Network
    and decoding errors are swallowed on purpose: callers treat None as
    "upstream unavailable" and degrade gracefully.
    """
    for attempt in (0, 1):
        try:
            resp = requests.get(url, headers=_NBA_HEADERS, params=params, timeout=NBA_API_TIMEOUT)
            if resp.status_code == 200:
                payload = resp.json()
                # Callers use dict methods on the result, so a 200 carrying a
                # JSON array/scalar is as useless as a failure.
                if isinstance(payload, dict):
                    return payload
        except (requests.RequestException, ValueError):
            # ValueError covers JSON decode failures on requests versions
            # whose decode error is not a RequestException subclass (e.g. an
            # HTML error page served with status 200).
            pass
        if attempt == 0:
            time.sleep(1.5)
    return None


def get_tonight_officials(game_id: str) -> dict:
    """
    Return the crew assigned to a single game. Empty list means assignments
    haven't been published yet (normal until ~90 min before tip).

    The result dict carries "game_id", "officials" (a list of dicts with
    "official_id", "name" and "jersey_num") and "source". Two degraded shapes
    exist:
    - invalid `game_id` (empty or not alphanumeric): {"error": ..., "officials": []}
    - upstream returned nothing usable: empty "officials" plus "note": "no data"
      (this case is not cached).

    A populated crew is cached for SPLITS_TTL. An empty crew is cached only
    briefly so that a lookup made before publication does not mask the real
    assignment once it appears.
    """
    if not game_id or not str(game_id).isalnum():
        return {"error": "invalid game_id", "officials": []}

    cache_key = f"refs:officials:{game_id}"
    cached = cache_get(cache_key)
    if cached is not None:
        return cached

    data = _safe_get(_BOXSCORE_URL, {"GameID": game_id})
    if not data or "resultSets" not in data:
        return {"officials": [], "game_id": game_id, "source": "stats.nba.com", "note": "no data"}

    officials = []
    # `or []` guards against an explicit `"resultSets": null` in the payload.
    for rs in data.get("resultSets") or []:
        if rs.get("name") != "Officials":
            continue
        headers = rs.get("headers") or []
        for row in rs.get("rowSet") or []:
            record = dict(zip(headers, row))
            first = record.get("FIRST_NAME", "") or ""
            last = record.get("LAST_NAME", "") or ""
            officials.append({
                "official_id": record.get("OFFICIAL_ID"),
                "name": f"{first} {last}".strip(),
                "jersey_num": record.get("JERSEY_NUM"),
            })
        # Only the first "Officials" result set is read.
        break

    result = {
        "game_id": game_id,
        "officials": officials,
        "source": "stats.nba.com",
    }
    if officials:
        # Officials assignments don't change once published, but TTL keeps the cache fresh.
        cache_set(cache_key, result, ttl=SPLITS_TTL)
    else:
        # Not published yet: cache briefly (same 300 s retry window used for
        # unavailable referee tendencies) so we re-poll before tip-off.
        cache_set(cache_key, result, ttl=300)
    return result


def get_referee_tendencies(season: str, league: str = "nba") -> dict:
    """
    Aggregate per-referee tendencies for the season. Returns league_avg_pf
    and a list of refs sorted by personal-foul rate, highest first;
    consumers can classify 'tight', 'average', 'generous' crews from the
    quartile bands.

    The result dict carries "referees" (dicts with "name", "games",
    "pf_per_game", "tech_per_game", "off_foul_per_game"),
    "league_avg_pf_per_game", "season", "league" and "source".
    "league_avg_pf_per_game" is the unweighted mean of the positive
    per-referee PF values, or None when there are none.

    Degraded shapes:
    - unknown `league`: {"error": "invalid league", "referees": []}
    - upstream returned nothing: empty "referees", None average and a "note";
      cached for 300 s only so the next call retries soon.

    NOTE: stats.nba.com's referee dashboard endpoint changes shape every few
    years. If the upstream returns nothing, the orchestrator should fall
    back to last season's cached data.
    """
    if league not in LEAGUE_ID:
        return {"error": "invalid league", "referees": []}

    cache_key = f"refs:tendencies:{league}:{season}"
    cached = cache_get(cache_key)
    if cached is not None:
        return cached

    # The upstream endpoint moved around 2024. Only the current URL is tried;
    # when it yields nothing we degrade gracefully — the rest of the pipeline
    # can back off the foul-trouble kill condition modifier.
    params = {
        "Season": season,
        "SeasonType": "Regular Season",
        "LeagueID": LEAGUE_ID[league],
        "PerMode": "PerGame",
    }
    data = _safe_get("https://stats.nba.com/stats/leaguedashrefstats", params)
    if not data or not data.get("resultSets"):
        result = {
            "referees": [],
            "league_avg_pf_per_game": None,
            "season": season,
            "league": league,
            "note": "upstream referee dashboard unavailable",
        }
        # Short cache so we retry sooner.
        cache_set(cache_key, result, ttl=300)
        return result

    # Only the first result set is read.
    rs = data["resultSets"][0]
    headers = rs.get("headers") or []
    refs = []
    for row in rs.get("rowSet") or []:
        record = dict(zip(headers, row))
        refs.append({
            "name": record.get("REFEREE_NAME", ""),
            "games": record.get("GP", 0),
            "pf_per_game": record.get("PF", 0),
            "tech_per_game": record.get("TECH", 0),
            "off_foul_per_game": record.get("OFF_FOUL", 0),
        })

    # Deliver the ordering the docstring promises. A missing/None PF sorts as
    # 0 (i.e. last); list.sort is stable, so ties keep upstream order.
    refs.sort(key=lambda r: r["pf_per_game"] or 0, reverse=True)

    # Refs with no (or zero) PF value are excluded from the average.
    pf_values = [r["pf_per_game"] for r in refs if (r["pf_per_game"] or 0) > 0]
    league_avg = (sum(pf_values) / len(pf_values)) if pf_values else None

    result = {
        "referees": refs,
        "league_avg_pf_per_game": league_avg,
        "season": season,
        "league": league,
        "source": "stats.nba.com",
    }
    cache_set(cache_key, result, ttl=SPLITS_TTL)
    return result