#!/bin/bash
# Source: tetrateio/inference-in-a-box, scripts/demo-observability.sh (main branch)

# Observability Demo
# Enterprise monitoring and observability stack validation

# Abort the script as soon as an unguarded command fails.
set -o errexit

# ANSI colour sequences. They are stored as literal backslash escapes and
# only become real escape characters when printed with `echo -e` / `%b`.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
PURPLE="\033[0;35m"
CYAN="\033[0;36m"
WHITE="\033[1;37m"
NC="\033[0m" # reset to the terminal default

# Professional logging functions
# Print a timestamped INFO line in blue on stdout.
#   $1 - message text (backslash escapes in it are interpreted)
log() {
    local message=$1
    printf '%b\n' "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')] INFO: ${message}${NC}"
}

# Print a timestamped SUCCESS line in green on stdout.
#   $1 - message text (backslash escapes in it are interpreted)
success() {
    local message=$1
    printf '%b\n' "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')] SUCCESS: ${message}${NC}"
}

# Print a timestamped ERROR line in red. Note: this goes to stdout, like the
# other logging helpers in this script.
#   $1 - message text (backslash escapes in it are interpreted)
error() {
    local message=$1
    printf '%b\n' "${RED}[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: ${message}${NC}"
}

# Print a timestamped WARNING line in yellow on stdout.
#   $1 - message text (backslash escapes in it are interpreted)
warn() {
    local message=$1
    printf '%b\n' "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')] WARNING: ${message}${NC}"
}

# Print a timestamped INFO line in cyan on stdout (same label as log(), but a
# different colour).
#   $1 - message text (backslash escapes in it are interpreted)
info() {
    local message=$1
    printf '%b\n' "${CYAN}[$(date '+%Y-%m-%d %H:%M:%S')] INFO: ${message}${NC}"
}

# Print separator
# Print a full-width horizontal rule of '=' characters in cyan.
separator() {
    local rule="================================================================================="
    printf '%b\n' "${CYAN}${rule}${NC}"
}

# Print section header
# Print a banner for a demo section: a blank line, a rule, the title
# prefixed with "OBSERVABILITY DEMO: ", another rule, and a blank line.
#   $1 - section title
section_header() {
    local title=$1
    printf '\n'
    separator
    printf '%b\n' "${WHITE}OBSERVABILITY DEMO: ${title}${NC}"
    separator
    printf '\n'
}

# Function to validate monitoring stack components
# Print the "available" report for one monitoring component.
#   $1 - display name            $2 - pod label selector
#   $3 - service name            $4 - service port shown to the user
# The status shown is the third column of the first line that
# `kubectl get pods --no-headers` prints for the selector (empty if none).
_report_available_component() {
    local label=$1 selector=$2 service=$3 port=$4
    local pod_state
    pod_state=$(kubectl get pods -n monitoring -l "$selector" --no-headers | awk '{print $3}' | head -1)
    echo -e "${GREEN}  ✓ ${label}: Available (Status: ${pod_state})${NC}"
    echo -e "${WHITE}    Service: ${service}.monitoring${NC}"
    echo -e "${WHITE}    Port: ${port}${NC}"
}

# Check which observability services exist in the "monitoring" namespace and
# print a status report for each.
# Grafana and Prometheus are mandatory: a missing service aborts with status 1.
# Kiali and Jaeger are optional: a missing service only produces a notice.
# Returns: 0 when both mandatory services exist, 1 otherwise.
validate_monitoring_stack() {
    info "Validating observability stack component availability"

    echo -e "${WHITE}Component Status Analysis:${NC}"

    # Mandatory: Grafana
    if ! kubectl get svc prometheus-grafana -n monitoring &>/dev/null; then
        echo -e "${RED}  ✗ Grafana: Not available${NC}"
        return 1
    fi
    _report_available_component "Grafana" "app.kubernetes.io/name=grafana" \
        "prometheus-grafana" 80

    # Mandatory: Prometheus
    if ! kubectl get svc prometheus-kube-prometheus-prometheus -n monitoring &>/dev/null; then
        echo -e "${RED}  ✗ Prometheus: Not available${NC}"
        return 1
    fi
    _report_available_component "Prometheus" "app.kubernetes.io/name=prometheus" \
        "prometheus-kube-prometheus-prometheus" 9090

    # Optional: Kiali
    if kubectl get svc kiali -n monitoring &>/dev/null; then
        _report_available_component "Kiali" "app=kiali" "kiali" 20001
    else
        echo -e "${YELLOW}  ! Kiali: Not available${NC}"
    fi

    # Optional: Jaeger
    if kubectl get svc jaeger-query -n monitoring &>/dev/null; then
        _report_available_component "Jaeger" "app=jaeger" "jaeger-query" 16686
    else
        echo -e "${YELLOW}  ! Jaeger: Not deployed in current configuration${NC}"
    fi

    echo ""
    success "Monitoring stack validation completed"
    return 0
}

# Function to establish observability tool connections
# Start background `kubectl port-forward` processes for the observability UIs
# and report which of them actually came up.
#
# Globals written:
#   PF_GRAFANA     PID of the Grafana forward (localhost:3000)
#   PF_PROMETHEUS  PID of the Prometheus forward (localhost:9090)
#   PF_KIALI       PID of the Kiali forward (localhost:20001), or "" when the
#                  Kiali service is absent or its forward did not stay up
# Returns: always 0; failures are reported on stdout so the caller can
#          continue with whatever tools are reachable.
establish_monitoring_connections() {
    info "Establishing port-forward connections to observability tools"

    # Terminate stale kubectl port-forwards that may still hold our local
    # ports. The pattern is limited to kubectl so unrelated processes whose
    # command line merely contains "port-forward" are left alone.
    pkill -f 'kubectl .*port-forward' || true
    sleep 2

    echo -e "${WHITE}Port-forward Configuration:${NC}"

    kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 >/dev/null 2>&1 &
    PF_GRAFANA=$!

    kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 >/dev/null 2>&1 &
    PF_PROMETHEUS=$!

    # Kiali is optional; only forward it when the service exists.
    PF_KIALI=""
    local kiali_deployed=0
    if kubectl get svc kiali -n monitoring &>/dev/null; then
        kiali_deployed=1
        kubectl port-forward -n monitoring svc/kiali 20001:20001 >/dev/null 2>&1 &
        PF_KIALI=$!
    fi

    # Give the forwards a moment to bind (or to fail) before checking them.
    sleep 3

    # A forward that could not bind its port or reach the service exits
    # quickly, so a PID that is still alive is treated as "established".
    local failed=0

    if kill -0 "$PF_GRAFANA" 2>/dev/null; then
        echo -e "${GREEN}  ✓ Grafana: localhost:3000${NC}"
    else
        echo -e "${RED}  ✗ Grafana: port-forward to localhost:3000 failed${NC}"
        failed=1
    fi

    if kill -0 "$PF_PROMETHEUS" 2>/dev/null; then
        echo -e "${GREEN}  ✓ Prometheus: localhost:9090${NC}"
    else
        echo -e "${RED}  ✗ Prometheus: port-forward to localhost:9090 failed${NC}"
        failed=1
    fi

    if [ "$kiali_deployed" -eq 0 ]; then
        echo -e "${YELLOW}  ! Kiali: Service not available${NC}"
    elif kill -0 "$PF_KIALI" 2>/dev/null; then
        echo -e "${GREEN}  ✓ Kiali: localhost:20001${NC}"
    else
        echo -e "${RED}  ✗ Kiali: port-forward to localhost:20001 failed${NC}"
        # Clear the PID so callers do not advertise an unreachable console.
        PF_KIALI=""
        failed=1
    fi

    if [ "$failed" -eq 0 ]; then
        success "Monitoring tool connections established"
    else
        warn "Some monitoring tool connections could not be established"
    fi
    echo ""
    return 0
}

# Function to generate observability traffic
# Send a burst of inference requests so the monitoring stack has data to show.
#
# Runs 15 cycles two seconds apart. Each cycle fires one background request
# for tenant-a and, when a tenant-c token was supplied, one for tenant-c.
# Request output and failures are discarded on purpose: the tenant-c model
# may not be deployed, and only the generated telemetry matters here.
#
# Arguments:
#   $1 - bearer token for tenant-a
#   $2 - bearer token for tenant-c (empty string skips tenant-c requests)
# Returns: 0
generate_observability_traffic() {
    local token_a=$1
    local token_c=$2
    local -a request_pids=()
    local i

    info "Generating traffic for observability data collection"

    echo -e "${WHITE}Traffic Generation Configuration:${NC}"
    echo -e "${WHITE}  Pattern: Mixed workload across multiple tenants${NC}"
    echo -e "${WHITE}  Duration: 30 seconds${NC}"
    echo -e "${WHITE}  Request Rate: ~2 requests/second${NC}"
    echo -e "${WHITE}  Models: sklearn-iris (tenant-a), pytorch-resnet (tenant-c)${NC}"
    echo ""

    log "Executing traffic generation for metrics and tracing"

    for i in {1..15}; do
        # Tenant A request
        curl -s -H "Authorization: Bearer $token_a" \
            http://sklearn-iris-predictor.tenant-a.127.0.0.1.sslip.io:8080/v1/models/sklearn-iris:predict \
            -d '{"instances": [[5.1, 3.5, 1.4, 0.2]]}' >/dev/null 2>&1 &
        request_pids+=("$!")

        # Tenant C request (may fail if the model is not available). Skipped
        # when no token was obtained, instead of sending an empty bearer token.
        if [ -n "$token_c" ]; then
            curl -s -H "Authorization: Bearer $token_c" \
                http://pytorch-resnet-predictor.tenant-c.127.0.0.1.sslip.io:8080/v1/models/pytorch-resnet:predict \
                -d '{"instances": [[[0.1, 0.2, 0.3]]]}' >/dev/null 2>&1 &
            request_pids+=("$!")
        fi

        sleep 2

        if [ $((i % 5)) -eq 0 ]; then
            log "Traffic generation progress: $i/15 cycles completed"
        fi
    done

    # Wait only for the requests started above. A bare `wait` would also
    # block on every other background child of this shell, such as
    # long-running port-forwards, and never return. The status of the last
    # request is ignored: failed requests are expected.
    if [ "${#request_pids[@]}" -gt 0 ]; then
        wait "${request_pids[@]}" || true
    fi

    success "Traffic generation completed - metrics should be available in monitoring tools"
    echo ""
}

# Helper function to get AI Gateway service name
# Resolve the name of the Service that fronts the "ai-inference-gateway"
# Gateway in the envoy-gateway-system namespace.
#
# Outputs: the service name, and nothing else, on stdout so callers can
#          capture it with $(...). The informational log line goes to stderr.
#          Falls back to "envoy-ai-gateway" when the lookup fails or returns
#          nothing.
# Returns: 0
get_ai_gateway_service() {
    local service_name
    service_name=$(kubectl get svc -n envoy-gateway-system \
        -l gateway.envoyproxy.io/owning-gateway-name=ai-inference-gateway \
        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || service_name=""

    if [ -z "$service_name" ]; then
        service_name="envoy-ai-gateway"
    fi

    # Logged on stderr: stdout is this function's return channel.
    info "Discovered AI Gateway service: $service_name" >&2
    echo "$service_name"
}

# Helper function to get JWT tokens from server
# Fetch the tenant JWT tokens from the in-cluster jwt-server.
#
# Opens a temporary port-forward (localhost:8081 -> svc/jwt-server:8080 in
# the default namespace), requests /tokens, and closes the forward again.
#
# Outputs: the raw response body of /tokens on stdout and nothing else, so
#          callers can capture it with $(...) and parse it. All progress and
#          error messages go to stderr.
# Returns: 0 on success; 1 when the jwt-server service does not exist or the
#          response is empty.
get_jwt_tokens() {
    log "Establishing connection to JWT authentication server" >&2

    # Check if JWT server is available
    if ! kubectl get svc jwt-server -n default &>/dev/null; then
        error "JWT server not available. Platform may not be properly bootstrapped." >&2
        return 1
    fi

    success "JWT server verified in default namespace" >&2

    # Port-forward to JWT server
    log "Creating port-forward to JWT server on port 8081" >&2
    kubectl port-forward -n default svc/jwt-server 8081:8080 >/dev/null 2>&1 &
    local jwt_pf=$!
    sleep 2

    # Get tokens from server. A failed request is handled by the empty-check
    # below, so curl's exit status is deliberately not fatal here.
    info "Retrieving JWT tokens from authentication server" >&2
    local tokens
    tokens=$(curl -s http://localhost:8081/tokens 2>/dev/null) || tokens=""

    # Clean up port-forward (it may already have exited on its own)
    kill "$jwt_pf" 2>/dev/null || true

    if [ -z "$tokens" ]; then
        error "Failed to retrieve JWT tokens from authentication server" >&2
        return 1
    fi

    success "JWT tokens retrieved successfully" >&2
    echo "$tokens"
}

# Observability Demo
# Terminate every port-forward the demo may have started (gateway, Grafana,
# Prometheus, Kiali, Jaeger). Unset or empty PIDs are skipped and failures to
# signal an already-exited process are ignored, so it is safe to call at any
# point and more than once.
_stop_observability_port_forwards() {
    local pid
    for pid in "${GATEWAY_PF:-}" "${PF_GRAFANA:-}" "${PF_PROMETHEUS:-}" "${PF_KIALI:-}" "${PF_JAEGER:-}"; do
        if [ -n "$pid" ]; then
            kill "$pid" 2>/dev/null || true
        fi
    done
}

# Run the end-to-end observability demo:
#   1. validate the monitoring stack,
#   2. open port-forwards to the monitoring tools and the AI gateway,
#   3. obtain tenant JWT tokens and generate authenticated traffic,
#   4. print access information and dashboard guidance,
#   5. idle until the user presses Ctrl+C, then tear everything down and
#      print a summary.
#
# Globals written: AI_GATEWAY_SERVICE, GATEWAY_PF, PF_JAEGER, TOKEN_A, TOKEN_C
# Globals read:    PF_GRAFANA, PF_PROMETHEUS, PF_KIALI (set by
#                  establish_monitoring_connections)
# Returns: 0 after a completed demo; 1 when validation, the gateway
#          port-forward, or token acquisition fails. Port-forwards that were
#          already started are stopped before returning 1.
demo_observability() {
    PF_JAEGER=""

    section_header "OBSERVABILITY STACK VALIDATION"

    log "Demo Scope: Enterprise monitoring and observability capabilities validation"
    echo -e "${WHITE}Objectives:${NC}"
    echo -e "${WHITE}  - Validate monitoring stack component availability${NC}"
    echo -e "${WHITE}  - Establish connections to observability tools${NC}"
    echo -e "${WHITE}  - Generate metrics and tracing data${NC}"
    echo -e "${WHITE}  - Demonstrate monitoring dashboards and analytics${NC}"
    echo -e "${WHITE}  - Analyze service mesh observability features${NC}"
    echo ""

    # Monitoring stack validation.
    # Tested directly in the `if`: under `set -e` a separate `$?` check after
    # the call would never run, because the failing call aborts the script.
    section_header "MONITORING STACK VALIDATION"
    if ! validate_monitoring_stack; then
        error "Monitoring stack validation failed - cannot proceed with observability demo"
        return 1
    fi

    # Establish monitoring connections
    section_header "MONITORING TOOL CONNECTION ESTABLISHMENT"
    establish_monitoring_connections

    # Gateway connection setup
    section_header "AI GATEWAY CONNECTION ESTABLISHMENT"
    log "Establishing connection to Envoy AI Gateway for traffic generation"
    AI_GATEWAY_SERVICE=$(get_ai_gateway_service)

    info "Initiating port-forward to $AI_GATEWAY_SERVICE on port 8080"
    kubectl port-forward -n envoy-gateway-system "svc/${AI_GATEWAY_SERVICE}" 8080:80 >/dev/null 2>&1 &
    GATEWAY_PF=$!
    sleep 3

    # Verify gateway connection: a forward that failed to start has exited.
    if kill -0 "$GATEWAY_PF" 2>/dev/null; then
        success "AI Gateway port-forward established successfully"
        info "AI Gateway accessible at http://localhost:8080"
    else
        error "Failed to establish port-forward to AI Gateway"
        _stop_observability_port_forwards
        return 1
    fi
    echo ""

    # Authentication setup.
    # Declaration and assignment are separate on purpose: `local v=$(cmd)`
    # reports the status of `local`, which hides a failing command.
    section_header "AUTHENTICATION CONFIGURATION"
    local jwt_response
    if ! jwt_response=$(get_jwt_tokens); then
        error "JWT token acquisition failed - cannot generate authenticated traffic"
        _stop_observability_port_forwards
        return 1
    fi

    # Extract tokens for multiple tenants. A jq failure yields an empty token,
    # which the checks below report, instead of aborting under `set -e`.
    info "Extracting authentication tokens for multi-tenant traffic generation"
    TOKEN_A=$(printf '%s\n' "$jwt_response" | jq -r '.["tenant-a"]' 2>/dev/null) || TOKEN_A=""
    TOKEN_C=$(printf '%s\n' "$jwt_response" | jq -r '.["tenant-c"]' 2>/dev/null) || TOKEN_C=""

    if [ -z "$TOKEN_A" ] || [ "$TOKEN_A" = "null" ]; then
        error "Failed to extract tenant-a authentication token"
        _stop_observability_port_forwards
        return 1
    fi

    if [ -z "$TOKEN_C" ] || [ "$TOKEN_C" = "null" ]; then
        warn "Failed to extract tenant-c authentication token - some traffic generation will be limited"
        TOKEN_C=""
    fi

    success "Authentication tokens configured for observability traffic generation"
    echo ""

    # Traffic generation for observability
    section_header "OBSERVABILITY TRAFFIC GENERATION"
    generate_observability_traffic "$TOKEN_A" "$TOKEN_C"

    # Monitoring tool access information
    section_header "MONITORING TOOL ACCESS CONFIGURATION"
    log "Providing access information for observability tools"

    echo -e "${WHITE}Observability Tools Access:${NC}"
    echo -e "${GREEN}  Grafana Dashboard: http://localhost:3000${NC}"
    echo -e "${WHITE}    Default Credentials: admin / prom-operator${NC}"
    echo -e "${WHITE}    Purpose: Metrics visualization and alerting${NC}"
    echo ""
    echo -e "${GREEN}  Prometheus Query Interface: http://localhost:9090${NC}"
    echo -e "${WHITE}    Purpose: Metrics collection and querying${NC}"
    echo -e "${WHITE}    Query Language: PromQL${NC}"
    echo ""

    if [ -n "${PF_KIALI:-}" ]; then
        echo -e "${GREEN}  Kiali Service Mesh Console: http://localhost:20001${NC}"
        echo -e "${WHITE}    Purpose: Service mesh topology and traffic analysis${NC}"
        echo -e "${WHITE}    Features: Distributed tracing, traffic flow visualization${NC}"
        echo ""
    fi

    if kubectl get svc jaeger-query -n monitoring &>/dev/null; then
        # Nothing else in this script forwards the Jaeger UI, so start the
        # forward here; otherwise the URL printed below would not be served.
        # NOTE(review): assumes svc/jaeger-query exposes port 16686, the port
        # this script already reports for Jaeger - confirm for the deployment.
        kubectl port-forward -n monitoring svc/jaeger-query 16686:16686 >/dev/null 2>&1 &
        PF_JAEGER=$!
        echo -e "${GREEN}  Jaeger Tracing: http://localhost:16686${NC}"
        echo -e "${WHITE}    Purpose: Distributed request tracing${NC}"
        echo -e "${WHITE}    Features: Request flow analysis, latency profiling${NC}"
        echo ""
    else
        echo -e "${YELLOW}  Jaeger Tracing: Not deployed in current configuration${NC}"
        echo -e "${WHITE}    Note: Consider deploying Jaeger for comprehensive distributed tracing${NC}"
        echo ""
    fi

    # Dashboard recommendations
    section_header "RECOMMENDED MONITORING DASHBOARDS"
    log "Providing guidance for observability dashboard exploration"

    echo -e "${WHITE}Grafana Dashboard Recommendations:${NC}"
    echo -e "${GREEN}  KServe Model Performance Dashboard${NC}"
    echo -e "${WHITE}    Metrics: Request latency, throughput, error rates${NC}"
    echo -e "${WHITE}    Use Case: Model serving performance analysis${NC}"
    echo ""
    echo -e "${GREEN}  Istio Service Dashboard${NC}"
    echo -e "${WHITE}    Metrics: Service-to-service communication, traffic patterns${NC}"
    echo -e "${WHITE}    Use Case: Service mesh traffic analysis${NC}"
    echo ""
    echo -e "${GREEN}  Istio Workload Dashboard${NC}"
    echo -e "${WHITE}    Metrics: Pod-level metrics, resource utilization${NC}"
    echo -e "${WHITE}    Use Case: Workload performance monitoring${NC}"
    echo ""
    echo -e "${GREEN}  Kubernetes Cluster Overview${NC}"
    echo -e "${WHITE}    Metrics: Node health, resource usage, pod status${NC}"
    echo -e "${WHITE}    Use Case: Infrastructure monitoring${NC}"
    echo ""

    # Key metrics guidance
    echo -e "${WHITE}Key Metrics to Monitor:${NC}"
    echo -e "${CYAN}  Application Metrics:${NC}"
    echo -e "${WHITE}    - Request latency percentiles (p50, p95, p99)${NC}"
    echo -e "${WHITE}    - Request throughput (requests/second)${NC}"
    echo -e "${WHITE}    - Error rates and status code distribution${NC}"
    echo -e "${WHITE}    - Model inference latency and accuracy${NC}"
    echo ""
    echo -e "${CYAN}  Infrastructure Metrics:${NC}"
    echo -e "${WHITE}    - CPU and memory utilization${NC}"
    echo -e "${WHITE}    - Network traffic and latency${NC}"
    echo -e "${WHITE}    - Pod scaling events and resource limits${NC}"
    echo -e "${WHITE}    - Storage I/O and disk usage${NC}"
    echo ""
    echo -e "${CYAN}  Security Metrics:${NC}"
    echo -e "${WHITE}    - Authentication success/failure rates${NC}"
    echo -e "${WHITE}    - Cross-tenant access attempts${NC}"
    echo -e "${WHITE}    - Certificate validity and rotation${NC}"
    echo -e "${WHITE}    - Network policy violations${NC}"
    echo ""

    # Interactive exploration period
    info "Observability tools are now accessible for exploration"
    warn "Keep tools running for monitoring analysis - Press Ctrl+C to terminate when finished"
    echo ""

    # Wait for user interaction. The INT handler only flips a flag; the loop
    # itself decides when to stop, which avoids relying on `break` inside a
    # trap. `|| true` keeps `set -e` from aborting when Ctrl+C kills `sleep`.
    local keep_running=1
    trap 'log "Terminating observability demo..."; keep_running=0' INT
    while [ "$keep_running" -eq 1 ]; do
        sleep 10 || true
        if [ "$keep_running" -eq 1 ]; then
            log "Observability tools remain active - monitoring data continues to be collected"
        fi
    done
    trap - INT

    # Cleanup connections
    section_header "CONNECTION CLEANUP"
    log "Terminating port-forward connections"

    # Kill all port-forwards
    _stop_observability_port_forwards

    success "All monitoring connections terminated"

    # Comprehensive summary
    section_header "OBSERVABILITY VALIDATION SUMMARY"
    success "Observability demonstration completed successfully"
    echo ""
    echo -e "${GREEN}Validation Results:${NC}"
    echo -e "${WHITE}  - Monitoring stack components: VERIFIED${NC}"
    echo -e "${WHITE}  - Metrics collection: OPERATIONAL${NC}"
    echo -e "${WHITE}  - Dashboard access: ESTABLISHED${NC}"
    echo -e "${WHITE}  - Traffic generation: COMPLETED${NC}"
    echo -e "${WHITE}  - Service mesh observability: CONFIRMED${NC}"
    echo ""
    echo -e "${CYAN}Observability Features Validated:${NC}"
    echo -e "${WHITE}  - Prometheus: Metrics collection and storage${NC}"
    echo -e "${WHITE}  - Grafana: Visualization and alerting platform${NC}"
    echo -e "${WHITE}  - Kiali: Service mesh topology and traffic analysis${NC}"
    echo -e "${WHITE}  - Istio: Service mesh telemetry and tracing${NC}"
    echo ""
    echo -e "${PURPLE}Enterprise Insights:${NC}"
    echo -e "${WHITE}  - Comprehensive monitoring stack provides full observability${NC}"
    echo -e "${WHITE}  - Real-time metrics enable proactive issue detection${NC}"
    echo -e "${WHITE}  - Service mesh telemetry offers deep application insights${NC}"
    echo -e "${WHITE}  - Distributed tracing capabilities support complex troubleshooting${NC}"
    echo ""
    echo -e "${CYAN}Operational Recommendations:${NC}"
    echo -e "${WHITE}  - Configure alerting rules for critical metrics${NC}"
    echo -e "${WHITE}  - Implement log aggregation for comprehensive analysis${NC}"
    echo -e "${WHITE}  - Set up automated dashboard provisioning${NC}"
    echo -e "${WHITE}  - Establish monitoring data retention policies${NC}"
    echo ""

    success "Observability validation completed - monitoring capabilities confirmed"
}

# Entry point: run the whole demo when the script is executed. The function
# takes no arguments; because of `set -e` above, a non-zero return from it
# ends the script with that status.
demo_observability