The flow has three phases: pre-check the budget against the input token count, count output tokens chunk by chunk while streaming, then record the actual output total once the stream completes. The same pattern is shown below in Go, Python, TypeScript, Java, C#, Node.js, C++, Rust, and Ruby.
// two-phase token accounting for streaming responses (Go)
func (s *LLMService) StreamCompletion(ctx context.Context, userID, prompt string, out io.Writer) error {
    // Phase 1: pre-check the budget against the input token count
    inputTokens := CountTokens(prompt)
    decision, err := s.rlaas.Check(ctx, &rlaas.CheckRequest{
        UserID:   userID,
        Resource: "llm:tokens",
        Cost:     int64(inputTokens),
    })
    if err != nil {
        return err
    }
    if !decision.Allowed {
        return ErrTokenBudgetExceeded
    }

    // Phase 2: stream and count output tokens per chunk
    stream, err := s.openai.CreateChatCompletionStream(ctx, ...)
    if err != nil {
        return err
    }
    defer stream.Close()

    var outputTokens int
    for {
        chunk, err := stream.Recv()
        if errors.Is(err, io.EOF) {
            break
        }
        if err != nil {
            return err
        }
        delta := chunk.Choices[0].Delta.Content
        outputTokens += CountTokens(delta)
        fmt.Fprint(out, delta)
    }

    // Phase 3: deduct the actual output tokens (best-effort; an accounting
    // failure should not fail a stream the user has already received)
    _ = s.rlaas.Record(ctx, &rlaas.RecordRequest{
        UserID:   userID,
        Resource: "llm:tokens",
        Units:    int64(outputTokens),
    })
    return nil
}
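# two-phase token accounting for streaming responses (Python)
# `client` is assumed to be a pre-configured rlaas SDK client and `openai` an AsyncOpenAI instance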
import tiktoken

enc = tiktoken.encoding_for_model("gpt-4o")

async def stream_response(user_id: str, prompt: str):
    # Phase 1 — pre-check on input tokens
    input_tokens = len(enc.encode(prompt))
    decision = client.check(CheckRequest(
        user_id=user_id, resource="llm:tokens", cost=input_tokens
    ))
    if not decision.allowed:
        raise TokenBudgetExceeded(retry_after=decision.retry_after)

    # Phase 2 — stream and count output
    output_tokens = 0
    stream = await openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    async for chunk in stream:
        delta = chunk.choices[0].delta.content or ""
        output_tokens += len(enc.encode(delta))
        yield delta

    # Phase 3 — deduct actual output tokens
    client.record(RecordRequest(
        user_id=user_id, resource="llm:tokens", units=output_tokens
    ))
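// two-phase token accounting for streaming responses (TypeScript)
// `rlaas` and `openai` are assumed to be pre-configured client instances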
import { encode } from 'gpt-tokenizer';
async function* streamCompletion(userId: string, prompt: string) {
  // Phase 1 — pre-check on input tokens
  const inputTokens = encode(prompt).length;
  const decision = await rlaas.check({
    userId, resource: 'llm:tokens', cost: inputTokens,
  });
  if (!decision.allowed) {
    throw new TokenBudgetExceeded({ userId, retryAfter: decision.retryAfter });
  }

  // Phase 2 — stream and count output tokens
  let outputTokens = 0;
  const stream = await openai.chat.completions.create({
    model: 'gpt-4o', messages: [{ role: 'user', content: prompt }], stream: true,
  });
  for await (const chunk of stream) {
    const delta = chunk.choices[0]?.delta?.content ?? '';
    outputTokens += encode(delta).length;
    yield delta;
  }

  // Phase 3 — deduct actual output
  await rlaas.record({ userId, resource: 'llm:tokens', units: outputTokens });
}
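For reference, a minimal sketch of a caller draining the TypeScript generator. The `run` helper and the use of `process.stdout` are illustrative assumptions, not part of the SDK.

// sketch: consuming streamCompletion and surfacing a budget rejection (hypothetical caller)
async function run(userId: string, prompt: string) {
  try {
    for await (const delta of streamCompletion(userId, prompt)) {
      process.stdout.write(delta); // forward each chunk as it arrives
    }
  } catch (err) {
    if (err instanceof TokenBudgetExceeded) {
      // the pre-check rejected the request before any output tokens were spent
      console.error(`token budget exceeded for ${userId}`);
      return;
    }
    throw err;
  }
}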
// two-phase token accounting for streaming responses (Java)
import io.rlaas.sdk.RlaasClient;
import io.rlaas.sdk.model.*;
import java.io.Writer;

RlaasClient rlaas = new RlaasClient("http://rlaas:8080");

void streamCompletion(String userId, String prompt, Writer out) throws Exception {
    int inputTokens = countTokens(prompt);
    // Phase 1 — pre-check input tokens
    Decision decision = rlaas.checkLimit(
        new CheckRequest(userId, "llm:tokens", inputTokens));
    if (!decision.isAllowed()) {
        throw new TokenBudgetExceededException(decision.getRetryAfter());
    }

    // Phase 2 — stream and count output
    var stream = openAi.createChatCompletionStream(...);
    int outputTokens = 0;
    for (var chunk : stream) {
        String delta = chunk.getChoices().get(0).getDelta().getContent();
        if (delta == null) {
            continue; // some chunks (e.g. the final one) carry no content
        }
        outputTokens += countTokens(delta);
        out.write(delta);
        out.flush();
    }

    // Phase 3 — deduct actual output tokens
    rlaas.record(new RecordRequest(userId, "llm:tokens", outputTokens));
}
// two-phase token accounting for streaming responses (C#)
using Rlaas.Sdk;
using Rlaas.Sdk.Models;

var rlaas = new RlaasClient("http://rlaas:8080");

async IAsyncEnumerable<string> StreamCompletionAsync(string userId, string prompt)
{
    var inputTokens = CountTokens(prompt);
    // Phase 1 — pre-check input tokens
    var decision = await rlaas.CheckLimitAsync(
        new CheckRequest(userId, "llm:tokens", inputTokens));
    if (!decision.Allowed)
        throw new TokenBudgetExceededException(decision.RetryAfter);

    // Phase 2 — stream and count output tokens
    var outputTokens = 0;
    await foreach (var chunk in openAi.StreamChatCompletionAsync(...))
    {
        var delta = chunk.Choices[0].Delta.Content ?? "";
        outputTokens += CountTokens(delta);
        yield return delta;
    }

    // Phase 3 — deduct actual output tokens
    await rlaas.RecordAsync(
        new RecordRequest(userId, "llm:tokens", outputTokens));
}
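The remaining examples show the same three-phase flow against a generic `llm` streaming client; they use an `ai:tokens` resource and a single `tokens` field for both the pre-check cost and the recorded output.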
// streaming token accounting (Node.js)
const { RlaasClient } = require('@rlaas/node-sdk');
const client = new RlaasClient('http://rlaas:8080');
async function streamWithAccounting(userId, messages) {
  const inputTokens = countTokens(messages);
  const decision = await client.check({
    user_id: userId,
    resource: 'ai:tokens',
    tokens: inputTokens,
  });
  if (!decision.allowed) throw new Error('Token budget exceeded');

  let outputTokens = 0;
  const stream = await llm.chatStream(messages);
  for await (const chunk of stream) {
    outputTokens += chunk.token_count;
    emit(chunk.text);
  }

  await client.record({
    user_id: userId,
    resource: 'ai:tokens',
    tokens: outputTokens,
  });
}
// streaming token accounting (C++)
#include "rlaas/client.h"
rlaas::Client client("http://rlaas:8080");
void stream_with_accounting(const std::string& user_id,
                            const Messages& messages) {
  int input_tokens = count_tokens(messages);
  rlaas::CheckRequest req;
  req.user_id = user_id;
  req.resource = "ai:tokens";
  req.tokens = input_tokens;
  auto decision = client.check(req);
  if (!decision.allowed) throw TokenBudgetExceeded();

  int output_tokens = 0;
  llm.chat_stream(messages, [&](const Chunk& c) {
    output_tokens += c.token_count;
    emit(c.text);
  });

  client.record(user_id, "ai:tokens", output_tokens);
}
// streaming token accounting (Rust)
use anyhow::{anyhow, Result};
use futures::StreamExt;
use rlaas_sdk::{Client, CheckRequest};

// constructed once at startup, e.g.: let client = Client::new("http://rlaas:8080");
async fn stream_with_accounting(
    client: &Client, user_id: &str, messages: &[Message],
) -> Result<()> {
    let input_tokens = count_tokens(messages);
    let decision = client.check(&CheckRequest {
        user_id: user_id.into(),
        resource: "ai:tokens".into(),
        tokens: Some(input_tokens),
        ..Default::default()
    }).await?;
    if !decision.allowed {
        return Err(anyhow!("Token budget exceeded"));
    }

    let mut output_tokens = 0;
    let mut stream = llm.chat_stream(messages).await?;
    while let Some(chunk) = stream.next().await {
        output_tokens += chunk.token_count;
        emit(&chunk.text);
    }

    client.record(user_id, "ai:tokens", output_tokens).await?;
    Ok(())
}
# streaming token accounting (Ruby)
require 'rlaas_sdk'

# a method body cannot see top-level locals, so keep the client in a constant
CLIENT = Rlaas::Client.new('http://rlaas:8080')

def stream_with_accounting(user_id, messages)
  input_tokens = count_tokens(messages)
  decision = CLIENT.check(
    user_id: user_id, resource: 'ai:tokens',
    tokens: input_tokens
  )
  raise TokenBudgetExceeded unless decision.allowed

  output_tokens = 0
  llm.chat_stream(messages) do |chunk|
    output_tokens += chunk.token_count
    emit(chunk.text)
  end

  CLIENT.record(user_id: user_id,
                resource: 'ai:tokens', tokens: output_tokens)
end