Cost Control

Dollar Spend Enforcement

Map LLM API costs to RLAAS units and enforce a hard daily spend cap per tenant — in real time, before the bill arrives.

Before & After

Without RLAAS

Discover the $800 Overspend on Next Month's Invoice

  • No real-time spend tracking — billing data arrives 24–48 hours late
  • One tenant's heavy usage silently eats into your margin
  • Quota alerts fire after you've already exceeded the budget
  • Changing a spend cap requires a code change and redeploy
# ✗ No guardrail — request goes through regardless of cost
def call_llm(tenant_id: str, prompt: str) -> str:
    # No spend check — billing event arrives tomorrow
    completion = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
With RLAAS

Hard Cap Enforced in Real Time — Change Budgets via API

  • Estimate cost in tokens, convert to RLAAS units, check before every call
  • Budget exhausted → 402 response with days left until reset
  • Record actual spend after the call to keep the bucket accurate
  • Raise or lower a tenant budget live via PATCH — no redeploy
# ✓ Check estimated cost before calling; record actual after.
# Policy "llm-spend-daily" defines 1 RLAAS unit = 1 USD cent
# (limit 1000 = $10/day), so cents convert 1:1 to units.
COST_PER_1K_TOKENS = 3  # cents per 1K tokens — adjust per model
UNITS_PER_CENT = 1      # policy unit IS the USD cent

def call_llm(tenant_id: str, prompt: str) -> str:
    """Call the LLM only if the tenant's daily spend budget allows it.

    Raises:
        BudgetExhausted: when the tenant's daily budget is spent; carries
            the time remaining until the quota window resets.
    """
    # Rough estimate: ~1.3 tokens per whitespace-separated word.
    est_tokens = len(prompt.split()) * 1.3
    # Round up so a non-empty prompt never checks out at 0 units.
    est_units = max(1, round(est_tokens / 1000 * COST_PER_1K_TOKENS * UNITS_PER_CENT))
    decision = client.check(CheckRequest(
        user_id=tenant_id,
        resource="llm:spend",
        cost=est_units,
    ))
    if not decision.allowed:
        raise BudgetExhausted(tenant_id=tenant_id, reset_at=decision.retry_after)

    response = openai.chat.completions.create(...)

    # Reconcile with the real token count so the bucket stays accurate.
    actual_units = round(
        response.usage.total_tokens / 1000 * COST_PER_1K_TOKENS * UNITS_PER_CENT
    )
    client.record(RecordRequest(user_id=tenant_id, resource="llm:spend", units=actual_units))
    return response.choices[0].message.content

How It Works

Policy Configuration

# Daily cost quota - 1 unit = 1 USD cent
{
  "id": "llm-spend-daily",
  "resource": "llm:spend",
  "algorithm": "quota",
  "config": {
    "limit": 1000,
    "window_seconds": 86400
  },
  "action_deny": "reject",
  "metadata": {
    "description": "$10/day per tenant",
    "unit": "usd_cent"
  }
}

Request Flow

  1. Estimate cost before the call — convert estimated tokens to RLAAS units (e.g. cents)
  2. Check RLAAS quota — if the bucket doesn't have enough units, return 402 immediately
  3. Call the LLM API — proceeds only when the check passes
  4. Record actual cost — use response.usage.total_tokens to deduct the real amount
  5. Change budgets live — PATCH the policy limit; takes effect without any restart

SDK Examples

Pre-flight cost check and post-call actual deduction, shown across multiple language SDKs.

// 1 unit = 1 USD cent; limit: 1000 = $10/day per tenant const (costPer1KTokens = 3; unitsPerCent = 100) func (s *LLMService) CallLLM(ctx context.Context, tenantID, prompt string) (string, error) { estTokens := float64(len(strings.Fields(prompt))) * 1.3 estUnits := int64(estTokens / 1000 * costPer1KTokens * unitsPerCent) decision, err := s.rlaas.Check(ctx, &rlaas.CheckRequest{ UserID: tenantID, Resource: "llm:spend", Cost: estUnits, }) if err != nil { return "", err } if !decision.Allowed { return "", ErrBudgetExhausted } resp, err := s.openai.CreateChatCompletion(ctx, ...) if err != nil { return "", err } actualUnits := int64(float64(resp.Usage.TotalTokens) / 1000 * costPer1KTokens * unitsPerCent) _ = s.rlaas.Record(ctx, &rlaas.RecordRequest{ UserID: tenantID, Resource: "llm:spend", Units: actualUnits, }) return resp.Choices[0].Message.Content, nil }
# 1 unit = 1 USD cent; $10/day budget = limit 1000
COST_PER_1K = 3     # cents per 1K tokens
UNITS_PER_CENT = 1  # policy unit IS the USD cent

def call_llm(tenant_id: str, prompt: str) -> str:
    """Pre-flight budget check, then record the actual spend after the call.

    Raises:
        BudgetExhausted: when the daily budget is spent; carries the time
            until the quota window resets.
    """
    # Rough estimate: ~1.3 tokens per whitespace-separated word.
    est_tokens = len(prompt.split()) * 1.3
    # Round up so a non-empty prompt never checks out at 0 units.
    est_units = max(1, round(est_tokens / 1000 * COST_PER_1K * UNITS_PER_CENT))
    decision = client.check(CheckRequest(
        user_id=tenant_id,
        resource="llm:spend",
        cost=est_units,
    ))
    if not decision.allowed:
        raise BudgetExhausted(reset_at=decision.retry_after)

    response = openai.chat.completions.create(...)

    # Reconcile with the real token count so the bucket stays accurate.
    actual_units = round(
        response.usage.total_tokens / 1000 * COST_PER_1K * UNITS_PER_CENT
    )
    client.record(RecordRequest(
        user_id=tenant_id,
        resource="llm:spend",
        units=actual_units,
    ))
    return response.choices[0].message.content
// 1 unit = 1 USD cent; $10/day = limit 1000 const COST_PER_1K_CENTS = 3; async function callLLM(tenantId: string, prompt: string): Promise<string> { const estTokens = prompt.split(' ').length * 1.3; const estUnits = Math.ceil(estTokens / 1000 * COST_PER_1K_CENTS); const decision = await rlaas.check({ userId: tenantId, resource: 'llm:spend', cost: estUnits, }); if (!decision.allowed) { throw new BudgetExhausted({ tenantId, resetAt: decision.retryAfter }); } const response = await openai.chat.completions.create({...}); const actualUnits = Math.ceil( response.usage.total_tokens / 1000 * COST_PER_1K_CENTS ); await rlaas.record({ userId: tenantId, resource: 'llm:spend', units: actualUnits }); return response.choices[0].message.content; }
// pre-flight cost check and post-call deduction import io.rlaas.sdk.RlaasClient; import io.rlaas.sdk.model.*; RlaasClient rlaas = new RlaasClient("http://rlaas:8080"); String callLLM(String tenantId, String prompt) throws Exception { int estTokens = (int) (prompt.split(" ").length * 1.3); int estUnits = estTokens / 1000 * 3; Decision decision = rlaas.checkLimit(new CheckRequest(tenantId, "llm:spend", estUnits)); if (!decision.isAllowed()) throw new BudgetExhaustedException(decision.getRetryAfter()); var response = openAi.createChatCompletion(...); int actualUnits = response.getUsage().getTotalTokens() / 1000 * 3; rlaas.record(new RecordRequest(tenantId, "llm:spend", actualUnits)); return response.getChoices().get(0).getMessage().getContent(); }
// Pre-flight cost check and post-call deduction.
// 1 unit = 1 USD cent; 3 cents per 1K tokens.
using Rlaas.Sdk;
using Rlaas.Sdk.Models;

var rlaas = new RlaasClient("http://rlaas:8080");

const double CostCentsPer1K = 3.0;

async Task<string> CallLLMAsync(string tenantId, string prompt)
{
    // Rough estimate: ~1.3 tokens per whitespace-separated word.
    var estTokens = (int)(prompt.Split(' ').Length * 1.3);
    // Use floating-point math + ceiling: the integer form (estTokens / 1000 * 3)
    // truncates to 0 for any prompt estimated under 1000 tokens, letting it
    // bypass the budget check entirely.
    var estUnits = (int)Math.Ceiling(estTokens / 1000.0 * CostCentsPer1K);
    var decision = await rlaas.CheckLimitAsync(
        new CheckRequest(tenantId, "llm:spend", estUnits));
    if (!decision.Allowed)
        throw new BudgetExhaustedException(decision.RetryAfter);

    var response = await openAi.CreateChatCompletionAsync(...);

    // Reconcile with the real token count reported by the API.
    var actualUnits = (int)Math.Ceiling(
        response.Usage.TotalTokens / 1000.0 * CostCentsPer1K);
    await rlaas.RecordAsync(
        new RecordRequest(tenantId, "llm:spend", actualUnits));
    return response.Choices[0].Message.Content;
}
// Cost budget enforcement (Node.js)
const { RlaasClient } = require('@rlaas/node-sdk');

const client = new RlaasClient('http://rlaas:8080');

async function costAwareLLMCall(userId, prompt, estimatedCost) {
  // Pre-flight: refuse before spending if the budget is exhausted.
  const decision = await client.check({
    user_id: userId,
    resource: 'ai:spend-usd',
    cost: estimatedCost,
  });
  if (!decision.allowed) {
    throw new Error(`Budget exhausted – retry after ${decision.retry_after}s`);
  }

  const result = await callLLM(prompt);

  // Post-call: deduct what the call actually cost.
  await client.record({
    user_id: userId,
    resource: 'ai:spend-usd',
    cost: result.actual_cost,
  });
  return result;
}
// cost budget enforcement (C++) #include "rlaas/client.h" rlaas::Client client("http://rlaas:8080"); LLMResult cost_aware_llm_call(const std::string& user_id, const std::string& prompt, double estimated_cost) { rlaas::CheckRequest req; req.user_id = user_id; req.resource = "ai:spend-usd"; req.cost = estimated_cost; auto decision = client.check(req); if (!decision.allowed) throw BudgetExhausted(decision.retry_after_ms); auto result = call_llm(prompt); client.record(user_id, "ai:spend-usd", result.actual_cost); return result; }
// cost budget enforcement (Rust) use rlaas_sdk::{Client, CheckRequest}; let client = Client::new("http://rlaas:8080"); async fn cost_aware_llm_call( client: &Client, user_id: &str, prompt: &str, est_cost: f64, ) -> Result<LLMResult> { let decision = client.check(&CheckRequest { user_id: user_id.into(), resource: "ai:spend-usd".into(), cost: Some(est_cost), ..Default::default() }).await?; if !decision.allowed { return Err(anyhow!("Budget exhausted")); } let result = call_llm(prompt).await?; client.record(user_id, "ai:spend-usd", result.actual_cost).await?; Ok(result) }
# Cost budget enforcement (Ruby)
require 'rlaas_sdk'

# Use a constant: a top-level local (`client = ...`) is NOT visible inside
# `def` bodies in Ruby, so the original would raise NameError at call time.
CLIENT = Rlaas::Client.new('http://rlaas:8080')

# Pre-flight budget check, then record the actual spend after the call.
def cost_aware_llm_call(user_id, prompt, estimated_cost)
  # Pre-flight: check the budget before spending.
  decision = CLIENT.check(
    user_id: user_id,
    resource: 'ai:spend-usd',
    cost: estimated_cost
  )
  raise BudgetExhausted unless decision.allowed

  result = call_llm(prompt)
  # Post-call: deduct what the call actually cost.
  CLIENT.record(user_id: user_id, resource: 'ai:spend-usd', cost: result.actual_cost)
  result
end