ray/examples/config/ray.sub1b.cax11.public.json at main · razroo/ray · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
{
  "profile": "sub1b",
  "server": {
    "host": "127.0.0.1",
    "port": 3000,
    "requestBodyLimitBytes": 48000
  },
  "model": {
    "id": "qwen2.5-0.5b-instruct-q4_k_m",
    "family": "qwen2.5",
    "quantization": "q4_k_m",
    "contextWindow": 8192,
    "warmOnBoot": true,
    "maxOutputTokens": 256,
    "adapter": {
      "kind": "llama.cpp",
      "baseUrl": "http://127.0.0.1:8081",
      "modelRef": "qwen2.5-0.5b-instruct-q4_k_m",
      "timeoutMs": 18000,
      "cachePrompt": true,
      "slotStateTtlMs": 200,
      "promptScaffoldCacheEntries": 256,
      "launchProfile": {
        "preset": "single-vps-sub1b-cax11",
        "binaryPath": "/usr/local/bin/llama-server",
        "modelPath": "/var/lib/ray/models/qwen2.5-0.5b-instruct-q4_k_m.gguf",
        "host": "127.0.0.1",
        "port": 8081,
        "alias": "qwen2.5-0.5b-instruct-q4_k_m",
        "ctxSize": 3072,
        "parallel": 1,
        "threads": 2,
        "threadsHttp": 2,
        "batchSize": 192,
        "ubatchSize": 96,
        "cachePrompt": true,
        "cacheReuse": 256,
        "cacheRamMiB": 512,
        "continuousBatching": true,
        "enableMetrics": true,
        "exposeSlots": true,
        "warmup": true,
        "enableUnifiedKv": true,
        "cacheIdleSlots": true,
        "contextShift": true
      }
    }
  },
  "scheduler": {
    "concurrency": 1,
    "maxQueue": 48,
    "maxQueuedTokens": 18000,
    "maxInflightTokens": 3072,
    "requestTimeoutMs": 22000,
    "dedupeInflight": true,
    "batchWindowMs": 5,
    "affinityLookahead": 16,
    "shortJobMaxTokens": 96
  },
  "asyncQueue": {
    "enabled": true,
    "storageDir": "/var/lib/ray/async-queue",
    "pollIntervalMs": 1000,
    "dispatchConcurrency": 1,
    "maxAttempts": 3,
    "callbackTimeoutMs": 5000,
    "maxCallbackAttempts": 5
  },
  "cache": {
    "enabled": true,
    "maxEntries": 256,
    "ttlMs": 90000,
    "keyStrategy": "input+params"
  },
  "gracefulDegradation": {
    "enabled": true,
    "queueDepthThreshold": 12,
    "maxPromptChars": 6000,
    "degradeToMaxTokens": 128
  },
  "adaptiveTuning": {
    "enabled": true,
    "sampleSize": 32,
    "queueLatencyThresholdMs": 450,
    "minCompletionTokensPerSecond": 10,
    "maxOutputReductionRatio": 0.5,
    "minOutputTokens": 64,
    "learnedFamilyCapEnabled": true,
    "familyHistorySize": 64,
    "learnedCapMinSamples": 8,
    "draftPercentile": 0.95,
    "shortPercentile": 0.9,
    "learnedCapHeadroomTokens": 24
  },
  "telemetry": {
    "serviceName": "ray-gateway",
    "logLevel": "info",
    "includeDebugMetrics": true,
    "slowRequestThresholdMs": 1500
  },
  "auth": {
    "enabled": true,
    "apiKeyEnv": "RAY_API_KEYS"
  },
  "rateLimit": {
    "enabled": true,
    "windowMs": 60000,
    "maxRequests": 90,
    "keyStrategy": "ip+api-key",
    "trustProxyHeaders": true
  },
  "tags": {
    "hardware": "hetzner-cax11-class",
    "engine": "llama.cpp",
    "exposure": "public"
  }
}