Skip to content

Commit e69cab7

Browse files
Thomas Lee authored and meta-codesync[bot] committed
THFT_002-005 restart variants + ODS-throttle SKIP for UncleanExitHealthCheck
Reviewed By: pavanpatil92 Differential Revision: D108120231 fbshipit-source-id: 346e7f8f449033f1816c31d1040c06bc71141823
1 parent f2b7a53 commit e69cab7

4 files changed

Lines changed: 240 additions & 75 deletions

File tree

taac/health_checks/device_health_checks/unclean_exit_health_check.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,31 @@ async def _run(
5858
key_desc = ",".join(
5959
[UNCLEAN_EXIT_KEY_DESC.format(service=service) for service in services]
6060
)
61-
ods_data = await async_query_ods(
62-
entity_desc=obj.name,
63-
key_desc=key_desc,
64-
transform_desc=DAILY_TABLE_TRANSFORM_DESC,
65-
start_time=int(start_time),
66-
end_time=int(end_time),
67-
)
61+
try:
62+
ods_data = await async_query_ods(
63+
entity_desc=obj.name,
64+
key_desc=key_desc,
65+
transform_desc=DAILY_TABLE_TRANSFORM_DESC,
66+
start_time=int(start_time),
67+
end_time=int(end_time),
68+
)
69+
except Exception as e:
70+
# ODS counter-side throttling is a transient infra issue, not a
71+
# DUT-side problem. Treat as SKIP so the playbook doesn't
72+
# false-error (the next playbook retries naturally after backoff).
73+
# Mirrors the sibling fix in CpuUtilizationHealthCheck +
74+
# MemoryUtilizationHealthCheck (D107783972 family).
75+
err_msg = str(e)
76+
if "throttling your requests" in err_msg.lower():
77+
return hc_types.HealthCheckResult(
78+
status=hc_types.HealthCheckStatus.SKIP,
79+
message=(
80+
f"ODS counter throttled — skipping this iteration of "
81+
f"UncleanExitHealthCheck (will retry on next "
82+
f"playbook). Underlying error: {err_msg}"
83+
),
84+
)
85+
raise
6886

6987
if not ods_data:
7088
ods_query_url = await async_generate_ods_url(

taac/task_definitions.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,33 @@ def create_wait_for_agent_convergence_task(
241241
)
242242

243243

244+
def create_assert_thrift_rate_limit_enabled_task(
    hostname: str,
) -> Task:
    """Build the setup-gate task that checks thrift rate limiting on the DUT.

    The returned task is named `assert_thrift_rate_limit_enabled`; at setup
    time it is expected to fail fast (by raising in the underlying task's
    `run()`) when `thriftApiToRateLimitInQps` is missing or empty in the
    DUT's running agent config. That happens when the configerator-side
    rate limit map (set via D108220182 for IcePack TH6 / ICECUBE800BC)
    hasn't shipped or COOP hasn't re-applied the agent config. Gating here
    avoids burning a 4-hour THFT soak only to discover at postcheck time
    that the agent had no server-side throttling and got pegged at >1000%
    CPU.

    Args:
        hostname: DUT hostname (FBOSS-only — `getRunningConfig()` is a
            FBOSS thrift API).

    Returns:
        Task object that runs `AssertThriftRateLimitEnabledTask` at
        setup time.
    """
    # The task receives its arguments as a JSON-encoded parameter blob.
    serialized_params = json.dumps({"hostname": hostname})
    task_params = Params(json_params=serialized_params)
    return Task(
        task_name="assert_thrift_rate_limit_enabled",
        params=task_params,
    )
269+
270+
244271
def create_wait_for_bgp_convergence_task(
245272
hostnames: t.List[str] | str,
246273
num_tries: int = 120,

taac/tasks/all.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2043,3 +2043,103 @@ async def run(self, params: t.Dict[str, t.Any]) -> None:
20432043
patcher_args=patcher_args,
20442044
patcher_desc=description,
20452045
)
2046+
2047+
2048+
class AssertThriftRateLimitEnabledTask(BaseTask):
    """Setup-gate task: fail-fast if `thriftApiToRateLimitInQps` is empty in
    the DUT's running agent config.

    Background: the THFT (thrift-hardening) testconfig fires ~7,000
    concurrent read-only thrift calls per burst against `fboss_sw_agent`.
    Without server-side rate limiting (configerator
    `agent_thrift_api_to_rate_limit.mcconf`), the storm pegs CPU and can
    cascade into kernel OOMs on swap-tight hosts (see T275336067). With
    rate limiting on, the agent throttles excess calls and CPU stays
    bounded.

    D108220182 removed `ICECUBE800BC` from the EMPTY_HW exclusion list so
    RTSW/FTSW/STSW/GTSW roles on TH6 IcePack now pick up the default
    `thriftApiToRateLimitInQps` map (~140 APIs at 2-8 qps each). This task
    asserts that landed reality on the DUT before any THFT playbook runs;
    if the map is missing or empty, the test fails immediately with a
    clear pointer rather than burning a 4-hour soak before the postcheck
    catches the symptom.

    Implementation: calls `getRunningConfig()` thrift API on the agent,
    recursively searches the returned JSON for a non-empty
    `thriftApiToRateLimitInQps` map. Recursive search is robust to where
    the materialized config lands in the SwitchConfig schema (top-level
    vs nested under switchSettings / cooperConfig / etc.) so this task
    survives FBOSS-side schema changes that move the key.
    """

    NAME = "assert_thrift_rate_limit_enabled"
    THRIFT_RATE_LIMIT_KEY = "thriftApiToRateLimitInQps"
    # Maximum number of map entries echoed into the success log line.
    _LOG_SAMPLE_SIZE = 5

    @classmethod
    def _find_key_recursive(cls, node: t.Any, target_key: str) -> t.Any:
        """Depth-first search for `target_key` in a nested JSON structure.

        Args:
            node: Any value produced by `json.loads` (dict, list or scalar).
            target_key: Dictionary key to look for.

        Returns:
            The first non-null value stored under `target_key`, or None if
            no such value exists. The value is returned as-is (it is not
            guaranteed to be a dict); the caller validates its type.
        """
        if isinstance(node, dict):
            direct_hit = node.get(target_key)
            if direct_hit is not None:
                return direct_hit
            # A missing key and an explicit JSON null are treated alike:
            # keep descending so a null placeholder at one level cannot
            # hide a populated map elsewhere in the same subtree.
            children = node.values()
        elif isinstance(node, list):
            children = node
        else:
            # Scalars cannot contain the key.
            return None

        for child in children:
            found = cls._find_key_recursive(child, target_key)
            if found is not None:
                return found
        return None

    async def run(self, params: t.Dict[str, t.Any]) -> None:
        """Assert that the DUT's running agent config has a rate-limit map.

        Args:
            params: Task parameters; must contain `hostname`.

        Raises:
            KeyError: If `hostname` is absent from `params`.
            RuntimeError: If the running config is not valid JSON, or if
                `thriftApiToRateLimitInQps` is missing, is not a map, or
                is an empty map.
        """
        hostname = params["hostname"]
        driver = await async_get_device_driver(hostname)
        # pyre-ignore[16]: AbstractSwitch has no attribute `async_agent_client`
        async with driver.async_agent_client as client:
            running_config_json = await client.getRunningConfig()

        try:
            running_config = json.loads(running_config_json)
        except (ValueError, TypeError) as e:
            # json.JSONDecodeError is a ValueError subclass; TypeError covers
            # a payload that is not str/bytes at all.
            raise RuntimeError(
                f"{hostname}: getRunningConfig() returned non-JSON payload "
                f"({type(e).__name__}: {e}); cannot verify thrift rate limit "
                f"is enabled."
            ) from e

        rate_limit_map = self._find_key_recursive(
            running_config, self.THRIFT_RATE_LIMIT_KEY
        )
        if rate_limit_map is None:
            raise RuntimeError(
                f"{hostname}: `{self.THRIFT_RATE_LIMIT_KEY}` not found in "
                f"running agent config. Thrift API rate limiting is NOT "
                f"enabled — refusing to start a THFT (thrift-hardening) "
                f"run because the storm will overload `fboss_sw_agent`. "
                f"See D108220182 (enables defaults for ICECUBE800BC) and "
                f"verify the configerator change has shipped + COOP has "
                f"re-applied the agent config on this host."
            )
        if not isinstance(rate_limit_map, dict):
            # Report a wrong-typed value as such rather than as "empty", so
            # the failure points at a schema problem, not a config rollout.
            raise RuntimeError(
                f"{hostname}: `{self.THRIFT_RATE_LIMIT_KEY}` is present but "
                f"is not a map (type={type(rate_limit_map).__name__}); "
                f"cannot confirm thrift API rate limiting is enabled — "
                f"refusing to start THFT run. See D108220182."
            )
        if not rate_limit_map:
            raise RuntimeError(
                f"{hostname}: `{self.THRIFT_RATE_LIMIT_KEY}` is present but "
                f"empty (type={type(rate_limit_map).__name__}, "
                f"size={len(rate_limit_map)}). "
                f"Thrift API rate limiting is effectively disabled — "
                f"refusing to start THFT run. See D108220182."
            )

        sample_items = list(rate_limit_map.items())[: self._LOG_SAMPLE_SIZE]
        sample_text = ", ".join(f"{api}={qps}" for api, qps in sample_items)
        # Only signal truncation when entries were actually left out.
        ellipsis = " ..." if len(rate_limit_map) > len(sample_items) else ""
        self.logger.info(
            f"{hostname}: thrift API rate limiting is ENABLED "
            f"({len(rate_limit_map)} APIs in the limit map). "
            f"Sample rates: {sample_text}{ellipsis}"
        )

taac/testconfigs/npi/thrift_hardening_test_config.py

Lines changed: 88 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from ixia.ixia import types as ixia_types
2727
from taac.playbooks.playbook_definitions import (
2828
add_common_checks_to_thft_playbooks,
29-
create_thft_baseline_playbook,
29+
create_thft_playbooks,
3030
)
3131
from taac.task_definitions import (
3232
create_configure_parallel_bgp_peers_task,
@@ -100,6 +100,8 @@ def create_npi_thrift_hardening_test_config(
100100
downlink_peer_tag: str,
101101
stsw_flap_ports: list,
102102
test_duration_s: int = 600,
103+
restart_test_duration_s: int = 3600,
104+
restart_period_s: int = 300,
103105
requests_per_burst: int = 10000,
104106
burst_timeout_s: float = 60.0,
105107
direct_ixia_connections=None,
@@ -132,8 +134,12 @@ def create_npi_thrift_hardening_test_config(
132134
GTSW-adjacent uplinks for an STSW, etc.). EXCLUDE IXIA-facing
133135
ports — flapping those breaks IXIA peering and would invalidate
134136
the BGP_SESSION_ESTABLISH precheck.
135-
test_duration_s: Longevity stage duration (default 600s = 10 min smoke).
136-
Production passes 14400 (4 hr).
137+
test_duration_s: THFT_001 baseline longevity duration (default 600s =
138+
10 min smoke). Production passes 14400 (4 hr).
139+
restart_test_duration_s: Per-playbook duration for THFT_002..005
140+
(each restart-variant). Default 3600s (1 hr) so the 4 restart
141+
variants total ~4hr — matches the THFT_001 4hr soak instead of
142+
blowing the campaign wall-time up to 5×4=20hr.
137143
direct_ixia_connections: Optional explicit direct-IXIA mapping.
138144
basset_pool: Optional override pool selection. Default "dne.test".
139145
service_restart_services: Override default service-restart-check list.
@@ -167,6 +173,15 @@ def create_npi_thrift_hardening_test_config(
167173
# Setup tasks: BGP peer scaffolding (mirrors NPI cpu_queue testconfig,
168174
# MINUS the rogue interface — THFT only needs downlink+uplink for
169175
# BGP_SESSION_ESTABLISH precheck + BGP_PEER_ROUTE snapshot).
176+
#
177+
# TODO: re-enable `create_assert_thrift_rate_limit_enabled_task` as
178+
# the first setup-task once partner team confirms the config path
179+
# for `thriftApiToRateLimitInQps` in `getRunningConfig()` output —
180+
# current recursive-search implementation did not locate the key on
181+
# the DUT despite D108220182 having shipped, so the gate is shelved
182+
# to avoid false-FAIL on tonight's overnight campaign. The task
183+
# class + factory remain in place (tasks/all.py +
184+
# task_definitions.py) so re-enable is a one-line add here.
170185
setup_tasks=[
171186
create_coop_unregister_patchers_task(device_name),
172187
# Remove all existing BGP peers first.
@@ -520,15 +535,15 @@ def create_npi_thrift_hardening_test_config(
520535
# the disruption is thrift load on the agent + qsfp flaps via the
521536
# periodic task attached to each playbook.
522537
playbooks=add_common_checks_to_thft_playbooks(
523-
[
524-
create_thft_baseline_playbook(
525-
device_name=device_name,
526-
stsw_flap_ports=stsw_flap_ports,
527-
test_duration_s=test_duration_s,
528-
requests_per_burst=requests_per_burst,
529-
burst_timeout_s=burst_timeout_s,
530-
)
531-
],
538+
create_thft_playbooks(
539+
device_name=device_name,
540+
stsw_flap_ports=stsw_flap_ports,
541+
test_duration_s=test_duration_s,
542+
restart_test_duration_s=restart_test_duration_s,
543+
restart_period_s=restart_period_s,
544+
requests_per_burst=requests_per_burst,
545+
burst_timeout_s=burst_timeout_s,
546+
),
532547
service_restart_services=service_restart_services,
533548
),
534549
)
@@ -697,60 +712,65 @@ def create_npi_thrift_hardening_test_config(
697712
# `NPI_DVT_ICEPACK_GTSW__CPU_QUEUE_TEST_CONFIG` (which has been validated on
698713
# the same DUT). Flap target = 128 STSW-adjacent uplinks (NOT the IXIA-facing
699714
# `eth1/13/1`/`eth1/13/3` carrying BGP peers).
700-
NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG = (
701-
create_npi_thrift_hardening_test_config(
702-
test_config_name="NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG",
703-
device_name="gtsw001.l1001.c085.ash6",
704-
local_mac_address="02:00:00:00:0f:0c",
705-
ixia_downlink_interface="eth1/13/1",
706-
ixia_uplink_interface="eth1/13/3",
707-
peergroup_uplink_mimic_v6="PEERGROUP_GTSW_STSW_V6",
708-
peergroup_uplink_mimic_v4="PEERGROUP_GTSW_STSW_V4",
709-
peergroup_downlink_mimic_v6="PEERGROUP_GTSW_STSW_V6",
710-
peergroup_downlink_mimic_v4="PEERGROUP_GTSW_HOST_MIMIC_V4",
711-
route_map_uplink_ingress="PROPAGATE_GTSW_STSW_IN",
712-
route_map_uplink_egress="PROPAGATE_GTSW_STSW_OUT",
713-
route_map_downlink_ingress="PROPAGATE_GTSW_STSW_IN",
714-
route_map_downlink_egress="PROPAGATE_GTSW_STSW_OUT",
715-
ixia_downlink_ic_parent_network_v6="2401:db00:1ff:c108",
716-
ixia_uplink_ic_parent_network_v6="2401:db00:1ff:c109",
717-
ixia_downlink_ic_parent_network_v4="10.127.240",
718-
ixia_uplink_ic_parent_network_v4="10.127.241",
719-
unique_prefix_limit="5000",
720-
per_peer_max_route_limit="20000",
721-
downlink_peer_count=8,
722-
uplink_peer_count=8,
723-
remote_uplink_as_4byte=65272,
724-
remote_downlink_as_4byte=7001,
725-
remote_as_4_byte_step=1,
726-
is_uplink_peer_confed="False",
727-
is_downlink_peer_confed="False",
728-
ixia_downlink_prefix_count_v6=500,
729-
ixia_uplink_prefix_count_v6=500,
730-
ixia_downlink_prefix_count_v4=500,
731-
ixia_uplink_prefix_count_v4=500,
732-
ixia_downlink_communities=["65446:30", "65441:323", "65456:323"],
733-
ixia_uplink_communities=["65446:30", "65441:323", "65456:323"],
734-
downlink_peer_tag="HOST",
735-
uplink_peer_tag="STSW",
736-
stsw_flap_ports=ICEPACK_GTSW_STSW_FLAP_PORTS,
737-
test_duration_s=14400, # 4 hr prod (override to 600 = 10 min for smoke)
738-
# Scale-down: 1000 per API (vs Pavan's 10000). At 10000 the 70K-call
739-
# asyncio.gather pegged `fboss_sw_agent` CPU at 1339% within ~2.5 min
740-
# and ticked `coop.unclean_exits` (gather() hung indefinitely with no
741-
# bursts completing). 1000 keeps gather() within `burst_timeout_s` and
742-
# lets multiple bursts complete per run.
743-
requests_per_burst=1000,
744-
basset_pool="dne.test",
745-
# IcePack backend GTSW does not run openr — drop from postcheck list
746-
# to avoid false-fail on INACTIVE (same rationale as cpu_queue config).
747-
service_restart_services=[
748-
"bgpd",
749-
"fboss_hw_agent@0",
750-
"fboss_sw_agent",
751-
"fsdb",
752-
"qsfp_service",
753-
"wedge_agent",
754-
],
755-
)
715+
NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG = create_npi_thrift_hardening_test_config(
716+
test_config_name="NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG",
717+
device_name="gtsw001.l1001.c085.ash6",
718+
local_mac_address="02:00:00:00:0f:0c",
719+
ixia_downlink_interface="eth1/13/1",
720+
ixia_uplink_interface="eth1/13/3",
721+
peergroup_uplink_mimic_v6="PEERGROUP_GTSW_STSW_V6",
722+
peergroup_uplink_mimic_v4="PEERGROUP_GTSW_STSW_V4",
723+
peergroup_downlink_mimic_v6="PEERGROUP_GTSW_STSW_V6",
724+
peergroup_downlink_mimic_v4="PEERGROUP_GTSW_HOST_MIMIC_V4",
725+
route_map_uplink_ingress="PROPAGATE_GTSW_STSW_IN",
726+
route_map_uplink_egress="PROPAGATE_GTSW_STSW_OUT",
727+
route_map_downlink_ingress="PROPAGATE_GTSW_STSW_IN",
728+
route_map_downlink_egress="PROPAGATE_GTSW_STSW_OUT",
729+
ixia_downlink_ic_parent_network_v6="2401:db00:1ff:c108",
730+
ixia_uplink_ic_parent_network_v6="2401:db00:1ff:c109",
731+
ixia_downlink_ic_parent_network_v4="10.127.240",
732+
ixia_uplink_ic_parent_network_v4="10.127.241",
733+
unique_prefix_limit="5000",
734+
per_peer_max_route_limit="20000",
735+
downlink_peer_count=8,
736+
uplink_peer_count=8,
737+
remote_uplink_as_4byte=65272,
738+
remote_downlink_as_4byte=7001,
739+
remote_as_4_byte_step=1,
740+
is_uplink_peer_confed="False",
741+
is_downlink_peer_confed="False",
742+
ixia_downlink_prefix_count_v6=500,
743+
ixia_uplink_prefix_count_v6=500,
744+
ixia_downlink_prefix_count_v4=500,
745+
ixia_uplink_prefix_count_v4=500,
746+
ixia_downlink_communities=["65446:30", "65441:323", "65456:323"],
747+
ixia_uplink_communities=["65446:30", "65441:323", "65456:323"],
748+
downlink_peer_tag="HOST",
749+
uplink_peer_tag="STSW",
750+
stsw_flap_ports=ICEPACK_GTSW_STSW_FLAP_PORTS,
751+
test_duration_s=14400, # THFT_001 = 4 hr prod (override to 600 = 10 min for smoke)
752+
restart_test_duration_s=3600, # THFT_002..005 = 1 hr each → 4hr total
753+
# Restored to Pavan's original 10000 per API (= 70K concurrent calls
754+
# per burst across the 7 read-only APIs) after D108220182 enabled
755+
# server-side thrift API rate limiting on ICECUBE800BC. With
756+
# `thriftApiToRateLimitInQps` populated (~140 APIs at 2-8 qps each), the
757+
# agent throttles excess calls and CPU stays bounded even at the
758+
# original 70K burst scale. NOTE: the `assert_thrift_rate_limit_enabled`
759+
# setup-task is currently shelved (see the TODO above `setup_tasks`), so
760+
# nothing fails fast if rate limiting is OFF — verify it before a 70K run.
761+
# Prior history (pre-D108220182): 70K crashed `fboss_sw_agent` at
762+
# CPU 1339% in ~2.5 min and ticked `coop.unclean_exits` twice
763+
# (T275336067 — kernel OOM via workload.slice swap exhaustion).
764+
requests_per_burst=10000,
765+
basset_pool="dne.test",
766+
# IcePack backend GTSW does not run openr — drop from postcheck list
767+
# to avoid false-fail on INACTIVE (same rationale as cpu_queue config).
768+
service_restart_services=[
769+
"bgpd",
770+
"fboss_hw_agent@0",
771+
"fboss_sw_agent",
772+
"fsdb",
773+
"qsfp_service",
774+
"wedge_agent",
775+
],
756776
)

0 commit comments

Comments
 (0)