forked from grafana/tempo
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.libsonnet
More file actions
74 lines (67 loc) · 3.32 KB
/
config.libsonnet
File metadata and controls
74 lines (67 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
{
  // Joins a list of label names into a single string, placing `sep` between entries.
  local joinLabels(sep, labels) = std.join(sep, labels),
  // Alias for the config as seen through `$` (the outermost object), so values
  // overridden by a later mixin are the ones that get joined below.
  local cfg = $._config,

  _config+:: {
    http_api_prefix: '',
    namespace: '.*',

    // Job name patterns, one per component.
    // NOTE(review): values such as 'cortex-gw(-internal)?' and 'live-store.*' look like
    // regexes matched against the `job` label -- confirm against the rules that consume them.
    jobs: {
      gateway: 'cortex-gw(-internal)?',
      query_frontend: 'query-frontend',
      querier: 'querier',
      ingester: 'ingester',
      metrics_generator: 'metrics-generator',
      distributor: 'distributor',
      compactor: 'compactor',
      block_builder: 'block-builder',
      backend_scheduler: 'backend-scheduler',
      backend_worker: 'backend-worker',
      memcached: 'memcached',
      live_store: 'live-store',
      live_store_zones: 'live-store.*',
    },

    // Alert thresholds.
    alerts: {
      compactions_per_hour_failed: 2,
      flushes_per_hour_failed: 2,
      polls_per_hour_failed: 2,
      user_configurable_overrides_polls_per_hour_failed: 5,
      max_tenant_index_age_seconds: 600,
      p99_request_threshold_seconds: 3,
      p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof',
      outstanding_blocks_warning: 100,
      outstanding_blocks_critical: 250,

      // Metrics-generator partition lag threshold, in seconds.
      partition_lag_critical_seconds: 900,  // 15 minutes

      // Block-builder partition lag thresholds, in seconds.
      block_builder_partition_lag_warning_seconds: 200,  // ~3.3 minutes
      block_builder_partition_lag_critical_seconds: 300,  // 5 minutes

      // Thresholds for the backend scheduler and backend worker alerts.
      backend_scheduler_jobs_failure_rate: 0.05,  // 5% of jobs failed
      backend_scheduler_jobs_retry_count_per_minute: 20,  // 20 jobs retried per minute
      backend_scheduler_compaction_tenant_empty_job_count_per_minute: 10,  // 10 empty jobs per minute
      backend_scheduler_bad_jobs_count_per_minute: 0,  // alert on any bad job at all
      backend_worker_call_retries_count_per_minute: 5,  // 5 retries per minute

      vulture_error_rate_threshold: 0.1,  // 10% error rate

      // Live-store partition lag thresholds, in seconds.
      live_store_partition_lag_warning_seconds: 200,  // ~3.3 minutes
      live_store_partition_lag_critical_seconds: 300,  // 5 minutes
    },

    per_cluster_label: 'cluster',
    namespace_selector_separator: '/',

    // Label sets that uniquely identify (and are used to group by) clusters, jobs and tenants.
    // Each starts with the configurable per-cluster label.
    cluster_selectors: [cfg.per_cluster_label, 'namespace'],
    job_selectors: [cfg.per_cluster_label, 'namespace', 'job'],
    tenant_selectors: [cfg.per_cluster_label, 'namespace', 'tenant'],

    // Group prefixes: the selector labels above joined with `_`.
    group_prefix_clusters: joinLabels('_', cfg.cluster_selectors),
    group_prefix_jobs: joinLabels('_', cfg.job_selectors),
    group_prefix_tenants: joinLabels('_', cfg.tenant_selectors),

    // Group-by lists: the selector labels above joined with `, `; each list
    // uniquely identifies its group.
    group_by_cluster: joinLabels(', ', cfg.cluster_selectors),
    group_by_job: joinLabels(', ', cfg.job_selectors),
    group_by_tenant: joinLabels(', ', cfg.tenant_selectors),

    // Range interval over which the histogram recording rules aggregate.
    // Must be at least twice the scrape interval, otherwise the recording rules output no data.
    // Four times the scrape interval is recommended to cover edge cases:
    // https://www.robustperception.io/what-range-should-i-use-with-rate/
    recording_rules_range_interval: '1m',
  },
}