Check the token budget with RLaaS before every LLM call. Equivalent snippets follow for Go, Python, TypeScript, Java, C#, Node.js, C++, Rust, and Ruby — pick the one for your language.
// check token budget before calling LLM
decision, err := client.Check(ctx, &rlaas.CheckRequest{
UserID: userID,
Resource: "gpt-4",
Quantity: int64(estimatedTokens), // real token cost
})
if err != nil {
return "", err
}
if !decision.Allowed {
return "", fmt.Errorf("token budget exhausted, retry in %ds",
decision.RetryAfter)
}
// make LLM call
resp, err := openaiClient.CreateChatCompletion(ctx, req)
# Reserve token budget with RLaaS before spending it on the LLM.
from rlaas_sdk import RlaasClient, CheckRequest

client = RlaasClient(base_url="http://rlaas:8080")
budget_request = CheckRequest(
    user_id=user_id,
    resource="gpt-4",
    quantity=estimated_tokens,
)
decision = client.check(budget_request)
if not decision.allowed:
    # Surface the server-provided backoff so callers can retry sensibly.
    raise RateLimitError(retry_after=decision.retry_after)
# Budget approved — make the LLM call.
response = openai_client.chat.completions.create(
    model="gpt-4",
    messages=messages,
)
// Reserve token budget with RLaaS before spending it on the LLM.
import { RlaasClient } from '@rlaas/sdk';

const rlaas = new RlaasClient({ baseUrl: 'http://rlaas:8080' });

const verdict = await rlaas.check({
  userId,
  resource: 'gpt-4',
  quantity: estimatedTokens,
});
if (!verdict.allowed) {
  // Propagate the server-provided backoff so callers can retry sensibly.
  throw new RateLimitError({ retryAfter: verdict.retryAfter });
}

// Budget approved — make the LLM call.
const response = await openai.chat.completions.create({ model: 'gpt-4', messages });
// Reserve token budget with RLaaS before spending it on the LLM.
import io.rlaas.sdk.RlaasClient;
import io.rlaas.sdk.model.*;

RlaasClient rlaas = new RlaasClient("http://rlaas:8080");
CheckRequest budgetRequest = new CheckRequest(userId, "gpt-4", estimatedTokens);
Decision decision = rlaas.checkLimit(budgetRequest);
if (!decision.isAllowed()) {
    // Surface the server-provided backoff so callers can retry sensibly.
    throw new RateLimitException(
        "Token budget exhausted, retry in " + decision.getRetryAfter() + "s");
}
// Budget approved — make the LLM call.
var resp = openAiClient.createChatCompletion(req);
// Reserve token budget with RLaaS before spending it on the LLM.
using Rlaas.Sdk;
using Rlaas.Sdk.Models;

var rlaas = new RlaasClient("http://rlaas:8080");
var budgetRequest = new CheckRequest(userId, "gpt-4", estimatedTokens);
var decision = await rlaas.CheckLimitAsync(budgetRequest);
if (!decision.Allowed)
{
    // Surface the server-provided backoff so callers can retry sensibly.
    throw new RateLimitException(
        $"Token budget exhausted, retry in {decision.RetryAfter}s");
}

// Budget approved — make the LLM call.
var resp = await openAi.CreateChatCompletionAsync(req);
// check token budget before calling LLM (Node.js)
const { RlaasClient } = require('@rlaas/node-sdk');
const client = new RlaasClient('http://rlaas:8080');
const decision = await client.check({
user_id: userId,
resource: 'gpt-4',
quantity: estimatedTokens,
});
if (!decision.allowed) {
res.status(429).json({ error: 'Token budget exhausted', retry_after: decision.retry_after });
return;
}
const response = await openai.chat.completions.create({ model: 'gpt-4', messages });
// Reserve token budget with RLaaS before spending it on the LLM (C++).
#include "rlaas/client.h"

rlaas::Client client("http://rlaas:8080");

rlaas::CheckRequest budget_req;
budget_req.user_id = user_id;
budget_req.resource = "gpt-4";
budget_req.quantity = estimated_tokens;

const auto decision = client.check(budget_req);
if (!decision.allowed) {
  // NOTE(review): this SDK appears to report the wait in milliseconds
  // (retry_after_ms), unlike the second-based fields elsewhere — confirm.
  throw std::runtime_error("Token budget exhausted, retry in " +
                           std::to_string(decision.retry_after_ms) + "ms");
}

// Budget approved — make the LLM call.
auto resp = openai_client.create_chat_completion(chat_req);
// Reserve token budget with RLaaS before spending it on the LLM (Rust).
use rlaas_sdk::{Client, CheckRequest};

let client = Client::new("http://rlaas:8080");
let budget_request = CheckRequest {
    user_id: user_id.into(),
    resource: "gpt-4".into(),
    quantity: estimated_tokens as i64,
    ..Default::default()
};
let decision = client.check(&budget_request).await?;
if !decision.allowed {
    // Surface the server-provided backoff so callers can retry sensibly.
    return Err(anyhow!("token budget exhausted, retry in {}s", decision.retry_after));
}
// Budget approved — make the LLM call.
let resp = openai.create_chat_completion(req).await?;
# Reserve token budget with RLaaS before spending it on the LLM (Ruby).
require 'rlaas_sdk'

client = Rlaas::Client.new('http://rlaas:8080')
decision = client.check(
  user_id: user_id,
  resource: 'gpt-4',
  quantity: estimated_tokens
)
# Surface the server-provided backoff so callers can retry sensibly.
raise RateLimitError, "Token budget exhausted, retry in #{decision.retry_after}s" unless decision.allowed

# Budget approved — make the LLM call.
response = openai_client.chat(model: 'gpt-4', messages: messages)