Check the per-model rate limit before making the LLM call; when the request is denied, fall back to the model named in the policy's `downgrade_to` hint (or a hard-coded default) instead of failing outright.
// decide which model to use before calling the LLM
// Ask the rate limiter whether this user may call gpt-4 right now.
decision, err := client.Check(ctx, &rlaas.CheckRequest{
UserID: userID,
Resource: "gpt-4",
})
// Transport/service failure checking the limit: propagate to the caller.
// NOTE(review): `return "", err` implies the enclosing function returns
// (string, error) — its signature is outside this snippet; confirm.
if err != nil {
return "", err
}
// Default to the requested model; replace it only when the limiter denies.
model := "gpt-4"
if !decision.Allowed {
// read downgrade hint from policy metadata
if fallback, ok := decision.Metadata["downgrade_to"]; ok {
model = fallback
} else {
// No hint in the policy: fall back to a hard-coded cheaper model.
model = "gpt-3.5-turbo"
}
// Structured key/value log so downgrades are observable in production.
log.Info("downgrading model", "from", "gpt-4", "to", model)
}
// Call the LLM with whichever model survived the check.
resp, err := openaiClient.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
Model: model,
Messages: messages,
})
from rlaas_sdk import RlaasClient, CheckRequest
client = RlaasClient(base_url="http://rlaas:8080")

# Ask the rate limiter whether this user may call gpt-4 right now.
decision = client.check(CheckRequest(
    user_id=user_id,
    resource="gpt-4",
))

# Keep gpt-4 when allowed; otherwise honor the policy's downgrade hint,
# defaulting to gpt-3.5-turbo when no hint is present in the metadata.
model = (
    "gpt-4"
    if decision.allowed
    else decision.metadata.get("downgrade_to", "gpt-3.5-turbo")
)

# Call the LLM with whichever model survived the check.
response = openai_client.chat.completions.create(
    model=model,
    messages=messages,
)
import { RlaasClient } from '@rlaas/sdk';
const rlaas = new RlaasClient({ baseUrl: 'http://rlaas:8080' });
const decision = await rlaas.check({
userId: userId,
resource: 'gpt-4',
});
const model = decision.allowed
? 'gpt-4'
: (decision.metadata?.downgrade_to ?? 'gpt-3.5-turbo');
const response = await openai.chat.completions.create({ model, messages });
// auto-downgrade model on rate limit
import io.rlaas.sdk.RlaasClient;
import io.rlaas.sdk.model.CheckRequest;
import io.rlaas.sdk.model.Decision;
RlaasClient rlaas = new RlaasClient("http://rlaas:8080");

// Ask the rate limiter whether this user may call gpt-4o right now.
Decision decision = rlaas.checkLimit(new CheckRequest(userId, "gpt-4o"));

// Keep gpt-4o when allowed. When denied, honor the policy's downgrade hint
// (e.g. "gpt-4o-mini"); with no hint there is nothing to fall back to, so
// surface the rate limit to the caller via the retry-after value.
String model = "gpt-4o";
if (!decision.isAllowed()) {
    String fallback = decision.getMetadata().get("downgrade_to");
    if (fallback == null) {
        throw new RateLimitException(decision.getRetryAfter());
    }
    model = fallback;
}

// call LLM with selected model
var response = openAi.createChatCompletion(model, messages);
// auto-downgrade model on rate limit
using Rlaas.Sdk;
using Rlaas.Sdk.Models;
var rlaas = new RlaasClient("http://rlaas:8080");

// Ask the rate limiter whether this user may call gpt-4o right now.
var decision = await rlaas.CheckLimitAsync(new CheckRequest(userId, "gpt-4o"));

// Keep gpt-4o when allowed. When denied, honor the policy's downgrade hint
// (e.g. "gpt-4o-mini"); with no hint there is nothing to fall back to, so
// surface the rate limit to the caller via the retry-after value.
var model = "gpt-4o";
if (!decision.Allowed)
{
    model = decision.Metadata.TryGetValue("downgrade_to", out var fallback)
        ? fallback
        : throw new RateLimitException(decision.RetryAfter);
}

// call LLM with selected model
var response = await openAi.CreateChatCompletionAsync(model, messages);
// auto-downgrade: try models in order (Node.js)
const { RlaasClient } = require('@rlaas/node-sdk');
const client = new RlaasClient('http://rlaas:8080');

// Preference order: most capable first, cheapest last.
const models = ['gpt-4o', 'gpt-4o-mini', 'gpt-3.5-turbo'];

// Walk the preference list and chat with the first model the limiter
// permits for this user. Throws when every model is rate-limited.
async function chatWithFallback(userId, messages) {
  for (let i = 0; i < models.length; i += 1) {
    const model = models[i];
    const verdict = await client.check({ user_id: userId, resource: model });
    if (!verdict.allowed) continue;
    return await openai.chat.completions.create({ model, messages });
  }
  throw new Error('All models rate-limited');
}
// auto-downgrade: try models in order (C++)
#include "rlaas/client.h"
rlaas::Client client("http://rlaas:8080");

// Preference order: most capable first, cheapest last.
std::vector<std::string> models = {"gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"};

// Walk the preference list and return the completion from the first model
// the limiter permits for this user. Throws when every model is limited.
// NOTE(review): relies on `openai` and `messages` being in scope here —
// presumably file-level globals; confirm against the full file.
std::string chat_with_fallback(const std::string& user_id) {
    for (std::size_t i = 0; i < models.size(); ++i) {
        rlaas::CheckRequest req;
        req.user_id = user_id;
        req.resource = models[i];
        const auto decision = client.check(req);
        if (decision.allowed) {
            return openai.chat(models[i], messages);
        }
    }
    throw std::runtime_error("All models rate-limited");
}
// auto-downgrade: try models in order (Rust)
use rlaas_sdk::{Client, CheckRequest};
// Example setup (inside main or another function — a bare `let` is not
// valid at module scope in Rust):
let client = Client::new("http://rlaas:8080");

/// Try models in preference order (most capable first) and chat with the
/// first one the limiter permits for this user. Returns the completion,
/// or an error when every model is rate-limited.
async fn chat_with_fallback(client: &Client, user_id: &str) -> Result<String> {
    // Keep the preference list inside the function: the original snippet
    // declared it with a module-level `let`, which does not compile at
    // module scope and would not be visible inside this fn anyway.
    const MODELS: [&str; 3] = ["gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"];
    for model in MODELS {
        let decision = client
            .check(&CheckRequest {
                user_id: user_id.into(),
                resource: model.to_string(),
                ..Default::default()
            })
            .await?;
        if decision.allowed {
            // NOTE(review): `openai` and `messages` are not in scope in
            // this snippet — presumably provided by surrounding code;
            // confirm against the full file.
            return openai.chat(model, &messages).await;
        }
    }
    Err(anyhow!("All models rate-limited"))
}
# auto-downgrade: try models in order (Ruby)
require 'rlaas_sdk'
# Use constants: a local variable assigned at the top level is NOT visible
# inside a `def` body in Ruby (methods open a fresh local scope), so the
# original `client`/`models` locals would raise NameError on first call.
CLIENT = Rlaas::Client.new('http://rlaas:8080')

# Preference order: most capable first, cheapest last.
MODELS = ['gpt-4o', 'gpt-4o-mini', 'gpt-3.5-turbo'].freeze

# Walk the preference list and chat with the first model the limiter
# permits for this user. Raises when every model is rate-limited.
# NOTE(review): assumes an `openai` method/helper is reachable from here
# (a bare top-level local would not be) — confirm against the full file.
def chat_with_fallback(user_id, messages)
  MODELS.each do |model|
    decision = CLIENT.check(user_id: user_id, resource: model)
    return openai.chat(model: model, messages: messages) if decision.allowed
  end
  raise 'All models rate-limited'
end