GPU / Training

GPU Slot Fairness

Cap concurrent GPU training jobs per org. Each slot is held with acquire/release for the full job duration — and automatically reclaimed via TTL if the job crashes.

Before & After

Without RLAAS

One Team Monopolises Every GPU Slot

  • No concurrency cap — one org submits 20 jobs and saturates the cluster
  • Other teams queue indefinitely with no ETA or feedback
  • Crashed jobs leave slots permanently occupied — reclaiming them requires manual intervention
  • Fairness rules are ad hoc scheduler configs that are hard to change
# ✗ No cap — first org to submit wins everything
import os

def submit_training_job(org_id: str, job: Job):
    # No concurrency check — GPU cluster may already be full
    gpu_scheduler.submit(job)  # throws after a long queue wait

# Crashed jobs need manual slot reclaim:
os.system("kubectl delete pod --field-selector=status.phase=Failed")
With RLAAS

Fair Concurrency Cap — Crash-Safe TTL

  • Maximum 4 concurrent GPU jobs per org — enforced before scheduling
  • Slot is held for the exact job duration via AcquireLease / ReleaseLease
  • TTL auto-releases orphaned slots if the job process crashes
  • Change the cap per org live via API — no scheduler restart
# ✓ RLAAS controls concurrency with crash-safe TTL
from rlaas_sdk import RlaasClient, LeaseRequest, ReleaseRequest

client = RlaasClient(base_url="http://rlaas:8080")

def submit_training_job(org_id: str, job: Job):
    slot = client.acquire_lease(LeaseRequest(
        user_id=org_id,
        resource="gpu-slot",
        ttl_seconds=3600,  # auto-release if job crashes
    ))
    if not slot.acquired:
        return {"queued": True, "current": slot.current}
    try:
        gpu_scheduler.submit(job)
    finally:
        client.release_lease(ReleaseRequest(user_id=org_id, resource="gpu-slot"))

How It Works

Policy Configuration

# POST /rlaas/v1/policies
{
  "id": "gpu-concurrency-per-org",
  "resource": "gpu-slot",
  "algorithm": "concurrency",
  "config": {
    "limit": 4
  },
  "action_deny": "reject",
  "metadata": {
    "description": "4 concurrent GPU jobs per org",
    "default_ttl_seconds": "3600"
  }
}
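
The cap can also be changed while the service is running, which is what the "no scheduler restart" bullet above refers to. A minimal sketch, assuming the policies endpoint accepts an update addressed by policy id — the verb and path here are assumptions, so verify them against your RLAAS deployment:

# PUT /rlaas/v1/policies/gpu-concurrency-per-org
{
  "config": {
    "limit": 6
  }
}

Since the limit is evaluated on each AcquireLease call, slots already held should be unaffected; only new acquisitions are checked against the updated cap.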

Acquire / Release Flow

  1. Call AcquireLease before submitting — pass resource: "gpu-slot" and a TTL matching your max expected job duration
  2. If not acquired — queue the job and return current count to show the user their position
  3. Job runs — RLAAS holds the slot; if the job may outlive the TTL, renew the lease as a heartbeat (see the sketch after this list)
  4. Call ReleaseLease on completion — slot is returned immediately so the next job can begin
  5. Crash safety — if the process dies without releasing, the TTL auto-reclaims the slot after expiry
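
For step 3, long jobs can outlive the initial TTL. Below is a minimal heartbeat sketch in Python; renew_lease and RenewRequest are assumed names (no renewal call is shown elsewhere on this page), so substitute whatever renew/heartbeat method your SDK version actually exposes:

import threading

from rlaas_sdk import RlaasClient, RenewRequest  # RenewRequest is an assumed name

client = RlaasClient(base_url="http://rlaas:8080")

def start_heartbeat(org_id: str, interval_seconds: int = 600) -> threading.Event:
    """Periodically re-extend the lease TTL while the job runs."""
    stop = threading.Event()

    def beat() -> None:
        while not stop.wait(interval_seconds):
            # assumed renewal call — resets the 3600s TTL on the held slot
            client.renew_lease(RenewRequest(
                user_id=org_id,
                resource="gpu-slot",
                ttl_seconds=3600,
            ))

    threading.Thread(target=beat, daemon=True).start()
    return stop  # call stop.set() right before release_lease

Keeping the heartbeat interval well under the TTL (here 600s against 3600s) leaves several missed beats of headroom before the slot is reclaimed.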

SDK Examples

Acquire a GPU slot before scheduling, release it when done.

// acquire GPU slot before submitting the job
slot, err := client.AcquireLease(ctx, &rlaas.LeaseRequest{
    UserID:     orgID,
    Resource:   "gpu-slot",
    TTLSeconds: 3600, // auto-release if process crashes
})
if err != nil {
    return err
}
if !slot.Acquired {
    log.Info("GPU at capacity", "current", slot.Current, "limit", slot.Limit)
    return ErrGPUAtCapacity
}
defer client.ReleaseLease(ctx, &rlaas.ReleaseRequest{
    UserID:   orgID,
    Resource: "gpu-slot",
})

// submit the job — slot is held until defer runs
return gpuScheduler.Submit(ctx, job)
from rlaas_sdk import RlaasClient, LeaseRequest, ReleaseRequest

client = RlaasClient(base_url="http://rlaas:8080")

def run_training_job(org_id: str, job) -> None:
    slot = client.acquire_lease(LeaseRequest(
        user_id=org_id,
        resource="gpu-slot",
        ttl_seconds=3600,
    ))
    if not slot.acquired:
        raise CapacityError(f"GPU at capacity ({slot.current}/{slot.limit})")
    try:
        gpu_scheduler.submit(job)
    finally:
        client.release_lease(ReleaseRequest(
            user_id=org_id,
            resource="gpu-slot",
        ))
import { RlaasClient } from '@rlaas/sdk';

const rlaas = new RlaasClient({ baseUrl: 'http://rlaas:8080' });

async function runTrainingJob(orgId: string, job: Job) {
  const slot = await rlaas.acquireLease({
    userId: orgId,
    resource: 'gpu-slot',
    ttlSeconds: 3600,
  });
  if (!slot.acquired) {
    throw new CapacityError(`GPU at capacity (${slot.current}/${slot.limit})`);
  }
  try {
    await gpuScheduler.submit(job);
  } finally {
    await rlaas.releaseLease({ userId: orgId, resource: 'gpu-slot' });
  }
}
// acquire/release GPU slot with concurrency limiter
import io.rlaas.sdk.RlaasClient;
import java.util.Map;

RlaasClient rlaas = new RlaasClient("http://rlaas:8080");

Map<String, Object> lease = rlaas.acquire(Map.of(
    "user_id", orgId,
    "resource", "gpu-slot",
    "ttl_seconds", 3600
));
String leaseId = (String) lease.get("lease_id");
try {
    runTrainingJob(jobConfig);
} finally {
    rlaas.release(leaseId);
}
// acquire/release GPU slot with concurrency limiter
using Rlaas.Sdk;

var rlaas = new RlaasClient("http://rlaas:8080");

var lease = await rlaas.AcquireAsync(new Dictionary<string, object>
{
    ["user_id"] = orgId,
    ["resource"] = "gpu-slot",
    ["ttl_seconds"] = 3600
});
var leaseId = lease["lease_id"].ToString();
try
{
    await RunTrainingJobAsync(jobConfig);
}
finally
{
    await rlaas.ReleaseAsync(leaseId);
}
// acquire GPU lease, train, release (Node.js)
const { RlaasClient } = require('@rlaas/node-sdk');

const client = new RlaasClient('http://rlaas:8080');

const lease = await client.acquire({
  user_id: 'training-pipeline',
  resource: 'gpu:a100',
  quantity: 4,
  ttl: 3600,
});
try {
  await runTraining(lease.lease_id);
} finally {
  await client.release(lease.lease_id);
}
// acquire GPU lease, train, release (C++)
#include "rlaas/client.h"

rlaas::Client client("http://rlaas:8080");

auto lease = client.acquire({
    .user_id = "training-pipeline",
    .resource = "gpu:a100",
    .quantity = 4,
    .ttl = 3600,
});
try {
    run_training(lease.lease_id);
} catch (...) {
    client.release(lease.lease_id);
    throw;
}
client.release(lease.lease_id);
// acquire GPU lease, train, release (Rust)
use rlaas_sdk::{Client, AcquireRequest};

let client = Client::new("http://rlaas:8080");

let lease = client.acquire(&AcquireRequest {
    user_id: "training-pipeline".into(),
    resource: "gpu:a100".into(),
    quantity: 4,
    ttl: 3600,
    ..Default::default()
}).await?;

// release the lease even if training fails
let _guard = scopeguard::guard((), |_| {
    let _ = client.release(&lease.lease_id);
});
run_training(&lease.lease_id).await?;
# acquire GPU lease, train, release (Ruby)
require 'rlaas_sdk'

client = Rlaas::Client.new('http://rlaas:8080')

lease = client.acquire(
  user_id: 'training-pipeline',
  resource: 'gpu:a100',
  quantity: 4,
  ttl: 3600
)
begin
  run_training(lease.lease_id)
ensure
  client.release(lease.lease_id)
end