Back to Blog

Using Rate Limits to Control Spend

Credit cards have spending limits. Not because you can't afford more, but because a stolen card number shouldn't drain your account. The limit is protection against unauthorized use.

LLM rate limits serve the same purpose. They protect against bugs that generate millions of requests, compromised API keys, and users who accidentally build infinite loops. The limit is your safety net.

The Rate Limiting Hierarchy

def rate_limit_hierarchy():
    """Describe the layered rate-limit tiers, from coarsest to finest scope."""
    tiers = [
        ("global", "Protect infrastructure from total overload",
         "1M tokens/minute total", "Hard stop, queue or reject"),
        ("per_organization", "Prevent one org from starving others",
         "100K tokens/minute per org", "429 to that org only"),
        ("per_user", "Prevent one user from overusing",
         "10K tokens/minute per user", "429 to that user"),
        ("per_feature", "Control cost by endpoint",
         "Expensive /analyze has lower limit than /chat", "429 for that feature"),
        ("per_api_key", "Isolate keys if one is compromised",
         "Each key has its own budget", "Revoke or limit specific key"),
    ]
    # Same mapping as before; insertion order preserved by the list above.
    return {
        scope: {"purpose": purpose, "example": example, "trigger": trigger}
        for scope, purpose, example, trigger in tiers
    }

Token-Based Rate Limiting

class TokenRateLimiter:
    """
    Sliding-window rate limiter that counts tokens, not requests.

    Per-key usage is stored as (timestamp, tokens) pairs; entries older
    than the window are pruned lazily on each check, so no background
    cleanup task is needed.
    """

    def __init__(self, tokens_per_minute: int):
        self.limit = tokens_per_minute
        self.window_seconds = 60
        self.usage = {}  # key -> list of (timestamp, tokens)

    def check_and_record(self, key: str, tokens: int) -> tuple[bool, dict]:
        """
        Return (allowed, info) for a request costing `tokens`.

        The usage is recorded only when the request is allowed; a rejected
        request does not consume any of the window's budget.
        """
        now = time.time()
        window_start = now - self.window_seconds

        # Prune entries that have aged out of the window (also initializes
        # the key on first sight via .get()).
        self.usage[key] = [
            (ts, t) for ts, t in self.usage.get(key, [])
            if ts > window_start
        ]

        current_usage = sum(t for _, t in self.usage[key])

        if current_usage + tokens > self.limit:
            return False, {
                "allowed": False,
                "current_usage": current_usage,
                "limit": self.limit,
                "retry_after": self.calculate_retry_after(key),
            }

        # Record this usage
        self.usage[key].append((now, tokens))
        return True, {
            "allowed": True,
            "current_usage": current_usage + tokens,
            "remaining": self.limit - current_usage - tokens,
        }

    def calculate_retry_after(self, key: str) -> float:
        """
        Seconds until the oldest recorded entry ages out of the window.

        BUG FIX: `check_and_record` called this method but it was never
        defined, so every over-limit request raised AttributeError instead
        of returning the 429 info dict. Returns 0.0 when no usage is
        recorded (the caller may retry immediately).
        """
        entries = self.usage.get(key)
        if not entries:
            return 0.0
        oldest_ts = min(ts for ts, _ in entries)
        return max(0.0, oldest_ts + self.window_seconds - time.time())

Budget-Based Limits

class BudgetLimiter:
    """
    Enforce spend (USD) limits, not just token limits.

    NOTE(review): this class calls `self.get_daily_spend` and
    `self.get_monthly_spend`, which are not defined here — presumably a
    subclass or the surrounding service supplies them from a usage store.
    Confirm before using standalone.
    """

    def __init__(self, budget_config: dict):
        """
        budget_config keys:
            daily_usd   -- hard daily cap in USD (required)
            monthly_usd -- hard monthly cap in USD (required)
            alert_at    -- fraction of budget that triggers a soft alert
                           (optional, default 0.8)
        """
        self.daily_budget = budget_config["daily_usd"]
        self.monthly_budget = budget_config["monthly_usd"]
        self.alert_threshold = budget_config.get("alert_at", 0.8)

    def check_budget(self, org_id: str, estimated_cost: float) -> dict:
        """
        Decide whether a request costing `estimated_cost` USD fits within
        the org's daily and monthly budgets.

        Returns {"allowed": False, "reason": ...} on a hard-limit breach,
        otherwise {"allowed": True, ...} with remaining headroom and any
        soft-threshold alerts.
        """
        daily_spend = self.get_daily_spend(org_id)
        monthly_spend = self.get_monthly_spend(org_id)

        # Hard limits: reject outright when the request would bust a cap.
        if daily_spend + estimated_cost > self.daily_budget:
            return {"allowed": False, "reason": "daily_budget_exceeded"}

        if monthly_spend + estimated_cost > self.monthly_budget:
            return {"allowed": False, "reason": "monthly_budget_exceeded"}

        # Soft alerts once spend crosses the threshold fraction of a budget.
        # BUG FIX: guard against a zero budget, which previously raised
        # ZeroDivisionError on these checks.
        alerts = []
        if self.daily_budget and daily_spend / self.daily_budget > self.alert_threshold:
            alerts.append("approaching_daily_limit")
        if self.monthly_budget and monthly_spend / self.monthly_budget > self.alert_threshold:
            alerts.append("approaching_monthly_limit")

        return {
            "allowed": True,
            "daily_remaining": self.daily_budget - daily_spend,
            "monthly_remaining": self.monthly_budget - monthly_spend,
            "alerts": alerts,
        }

Preventing Runaway Bugs

def bug_protection_strategies():
    """Catalog of defenses against runaway-cost bugs: what to watch for,
    what to do, and a sketch of the enforcement code for each."""
    strategies = {}
    strategies["request_rate_spike"] = {
        "detection": "Requests/second 10x above baseline",
        "action": "Automatic throttling + alert",
        "implementation": """
            if requests_per_second > baseline * 10:
                enable_throttling(user_id)
                alert("Unusual request rate", user_id)
            """,
    }
    strategies["single_request_bomb"] = {
        "detection": "max_tokens set to very high value",
        "action": "Cap at reasonable maximum",
        "implementation": """
            request.max_tokens = min(request.max_tokens, 4096)
            """,
    }
    strategies["retry_storm"] = {
        "detection": "Same request retried many times",
        "action": "Exponential backoff enforcement",
        "implementation": """
            if recent_retries(request_hash) > 5:
                return RateLimitResponse(retry_after=300)
            """,
    }
    strategies["prompt_size_bomb"] = {
        "detection": "Extremely long prompts",
        "action": "Reject or truncate",
        "implementation": """
            if len(prompt) > MAX_PROMPT_CHARS:
                return ErrorResponse("Prompt too long")
            """,
    }
    return strategies

Response Headers

def rate_limit_headers():
    """Document the response headers that expose rate-limit state to clients."""
    header_docs = {
        "X-RateLimit-Limit": "Tokens allowed per window",
        "X-RateLimit-Remaining": "Tokens remaining",
        "X-RateLimit-Reset": "Unix timestamp when window resets",
        "Retry-After": "Seconds to wait (on 429)",
    }
    example = """
        def add_rate_limit_headers(response, limiter_result):
            response.headers["X-RateLimit-Limit"] = str(limiter_result.limit)
            response.headers["X-RateLimit-Remaining"] = str(limiter_result.remaining)
            response.headers["X-RateLimit-Reset"] = str(limiter_result.reset_time)

            if not limiter_result.allowed:
                response.status_code = 429
                response.headers["Retry-After"] = str(limiter_result.retry_after)
        """
    return {"headers": header_docs, "implementation": example}

Graceful Degradation Under Limits

def graceful_degradation():
    """Strategies for serving something useful when a rate limit is hit,
    rather than failing outright."""

    # Small local constructor keeps the three entries uniform.
    def entry(description, implementation, use_case):
        return {
            "description": description,
            "implementation": implementation,
            "use_case": use_case,
        }

    return {
        "queue_not_reject": entry(
            "Queue requests instead of rejecting",
            """
            if rate_limit_exceeded:
                queue.add(request)
                return AcceptedResponse(
                    status=202,
                    message="Queued, will process shortly"
                )
            """,
            "Batch jobs that can wait",
        ),
        "reduce_quality": entry(
            "Use smaller model when at limit",
            """
            if at_rate_limit and request.allows_degradation:
                return small_model.generate(request)
            """,
            "Interactive requests that need some response",
        ),
        "cached_response": entry(
            "Return cached response if available",
            """
            if at_rate_limit:
                cached = cache.get(request.hash)
                if cached:
                    return cached.with_warning("Rate limited, cached response")
            """,
            "Common queries with stable answers",
        ),
    }

Monitoring and Alerting

def rate_limit_monitoring():
    """What to measure, when to page, and what the dashboard should show
    for rate-limit health."""
    metrics = [
        "Rate limit hit rate by user/org",
        "Tokens consumed by user/org/feature",
        "Budget utilization percentage",
        "429 response rate",
    ]
    alert_rules = [
        ("daily_spend > 80% of budget", "Warn team, review top consumers"),
        ("single_user > 50% of org limit", "Investigate, may be abuse or bug"),
        ("429_rate > 5% of requests", "Review limits, may need adjustment"),
    ]
    dashboard = """
        Rate Limit Dashboard:
        - Top consumers by tokens
        - Budget burn rate
        - Rate limit triggers by endpoint
        - Trend: requests vs limits over time
        """
    return {
        "metrics": metrics,
        "alerts": [
            {"condition": cond, "action": act} for cond, act in alert_rules
        ],
        "dashboard": dashboard,
    }

Rate limits are guardrails, not punishments. A well-set limit protects your budget, ensures fair access, and catches bugs before they become expensive incidents. Set them before you need them.