|
26 | 26 | from ixia.ixia import types as ixia_types |
27 | 27 | from taac.playbooks.playbook_definitions import ( |
28 | 28 | add_common_checks_to_thft_playbooks, |
29 | | - create_thft_baseline_playbook, |
| 29 | + create_thft_playbooks, |
30 | 30 | ) |
31 | 31 | from taac.task_definitions import ( |
32 | 32 | create_configure_parallel_bgp_peers_task, |
@@ -100,6 +100,8 @@ def create_npi_thrift_hardening_test_config( |
100 | 100 | downlink_peer_tag: str, |
101 | 101 | stsw_flap_ports: list, |
102 | 102 | test_duration_s: int = 600, |
| 103 | + restart_test_duration_s: int = 3600, |
| 104 | + restart_period_s: int = 300, |
103 | 105 | requests_per_burst: int = 10000, |
104 | 106 | burst_timeout_s: float = 60.0, |
105 | 107 | direct_ixia_connections=None, |
@@ -132,8 +134,12 @@ def create_npi_thrift_hardening_test_config( |
132 | 134 | GTSW-adjacent uplinks for an STSW, etc.). EXCLUDE IXIA-facing |
133 | 135 | ports — flapping those breaks IXIA peering and would invalidate |
134 | 136 | the BGP_SESSION_ESTABLISH precheck. |
135 | | - test_duration_s: Longevity stage duration (default 600s = 10 min smoke). |
136 | | - Production passes 14400 (4 hr). |
| 137 | + test_duration_s: THFT_001 baseline longevity duration (default 600s = |
| 138 | + 10 min smoke). Production passes 14400 (4 hr). |
| 139 | + restart_test_duration_s: Per-playbook duration for THFT_002..005 |
| 140 | + (each restart-variant). Default 3600s (1 hr) so the 4 restart |
| 141 | + variants total ~4hr — matches the THFT_001 4hr soak instead of |
| 142 | + blowing the campaign wall-time up to 5×4=20hr. |
137 | 143 | direct_ixia_connections: Optional explicit direct-IXIA mapping. |
138 | 144 | basset_pool: Optional override pool selection. Default "dne.test". |
139 | 145 | service_restart_services: Override default service-restart-check list. |
@@ -167,6 +173,15 @@ def create_npi_thrift_hardening_test_config( |
167 | 173 | # Setup tasks: BGP peer scaffolding (mirrors NPI cpu_queue testconfig, |
168 | 174 | # MINUS the rogue interface — THFT only needs downlink+uplink for |
169 | 175 | # BGP_SESSION_ESTABLISH precheck + BGP_PEER_ROUTE snapshot). |
| 176 | + # |
| 177 | + # TODO: re-enable `create_assert_thrift_rate_limit_enabled_task` as |
| 178 | + # the first setup-task once partner team confirms the config path |
| 179 | + # for `thriftApiToRateLimitInQps` in `getRunningConfig()` output — |
| 180 | + # current recursive-search implementation did not locate the key on |
| 181 | + # the DUT despite D108220182 having shipped, so the gate is shelved |
| 182 | + # to avoid false-FAIL on tonight's overnight campaign. The task |
| 183 | + # class + factory remain in place (tasks/all.py + |
| 184 | + # task_definitions.py) so re-enable is a one-line add here. |
170 | 185 | setup_tasks=[ |
171 | 186 | create_coop_unregister_patchers_task(device_name), |
172 | 187 | # Remove all existing BGP peers first. |
@@ -520,15 +535,15 @@ def create_npi_thrift_hardening_test_config( |
520 | 535 | # the disruption is thrift load on the agent + qsfp flaps via the |
521 | 536 | # periodic task attached to each playbook. |
522 | 537 | playbooks=add_common_checks_to_thft_playbooks( |
523 | | - [ |
524 | | - create_thft_baseline_playbook( |
525 | | - device_name=device_name, |
526 | | - stsw_flap_ports=stsw_flap_ports, |
527 | | - test_duration_s=test_duration_s, |
528 | | - requests_per_burst=requests_per_burst, |
529 | | - burst_timeout_s=burst_timeout_s, |
530 | | - ) |
531 | | - ], |
| 538 | + create_thft_playbooks( |
| 539 | + device_name=device_name, |
| 540 | + stsw_flap_ports=stsw_flap_ports, |
| 541 | + test_duration_s=test_duration_s, |
| 542 | + restart_test_duration_s=restart_test_duration_s, |
| 543 | + restart_period_s=restart_period_s, |
| 544 | + requests_per_burst=requests_per_burst, |
| 545 | + burst_timeout_s=burst_timeout_s, |
| 546 | + ), |
532 | 547 | service_restart_services=service_restart_services, |
533 | 548 | ), |
534 | 549 | ) |
@@ -697,60 +712,65 @@ def create_npi_thrift_hardening_test_config( |
697 | 712 | # `NPI_DVT_ICEPACK_GTSW__CPU_QUEUE_TEST_CONFIG` (which has been validated on |
698 | 713 | # the same DUT). Flap target = 128 STSW-adjacent uplinks (NOT the IXIA-facing |
699 | 714 | # `eth1/13/1`/`eth1/13/3` carrying BGP peers). |
700 | | -NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG = ( |
701 | | - create_npi_thrift_hardening_test_config( |
702 | | - test_config_name="NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG", |
703 | | - device_name="gtsw001.l1001.c085.ash6", |
704 | | - local_mac_address="02:00:00:00:0f:0c", |
705 | | - ixia_downlink_interface="eth1/13/1", |
706 | | - ixia_uplink_interface="eth1/13/3", |
707 | | - peergroup_uplink_mimic_v6="PEERGROUP_GTSW_STSW_V6", |
708 | | - peergroup_uplink_mimic_v4="PEERGROUP_GTSW_STSW_V4", |
709 | | - peergroup_downlink_mimic_v6="PEERGROUP_GTSW_STSW_V6", |
710 | | - peergroup_downlink_mimic_v4="PEERGROUP_GTSW_HOST_MIMIC_V4", |
711 | | - route_map_uplink_ingress="PROPAGATE_GTSW_STSW_IN", |
712 | | - route_map_uplink_egress="PROPAGATE_GTSW_STSW_OUT", |
713 | | - route_map_downlink_ingress="PROPAGATE_GTSW_STSW_IN", |
714 | | - route_map_downlink_egress="PROPAGATE_GTSW_STSW_OUT", |
715 | | - ixia_downlink_ic_parent_network_v6="2401:db00:1ff:c108", |
716 | | - ixia_uplink_ic_parent_network_v6="2401:db00:1ff:c109", |
717 | | - ixia_downlink_ic_parent_network_v4="10.127.240", |
718 | | - ixia_uplink_ic_parent_network_v4="10.127.241", |
719 | | - unique_prefix_limit="5000", |
720 | | - per_peer_max_route_limit="20000", |
721 | | - downlink_peer_count=8, |
722 | | - uplink_peer_count=8, |
723 | | - remote_uplink_as_4byte=65272, |
724 | | - remote_downlink_as_4byte=7001, |
725 | | - remote_as_4_byte_step=1, |
726 | | - is_uplink_peer_confed="False", |
727 | | - is_downlink_peer_confed="False", |
728 | | - ixia_downlink_prefix_count_v6=500, |
729 | | - ixia_uplink_prefix_count_v6=500, |
730 | | - ixia_downlink_prefix_count_v4=500, |
731 | | - ixia_uplink_prefix_count_v4=500, |
732 | | - ixia_downlink_communities=["65446:30", "65441:323", "65456:323"], |
733 | | - ixia_uplink_communities=["65446:30", "65441:323", "65456:323"], |
734 | | - downlink_peer_tag="HOST", |
735 | | - uplink_peer_tag="STSW", |
736 | | - stsw_flap_ports=ICEPACK_GTSW_STSW_FLAP_PORTS, |
737 | | - test_duration_s=14400, # 4 hr prod (override to 600 = 10 min for smoke) |
738 | | - # Scale-down: 1000 per API (vs Pavan's 10000). At 10000 the 70K-call |
739 | | - # asyncio.gather pegged `fboss_sw_agent` CPU at 1339% within ~2.5 min |
740 | | - # and ticked `coop.unclean_exits` (gather() hung indefinitely with no |
741 | | - # bursts completing). 1000 keeps gather() within `burst_timeout_s` and |
742 | | - # lets multiple bursts complete per run. |
743 | | - requests_per_burst=1000, |
744 | | - basset_pool="dne.test", |
745 | | - # IcePack backend GTSW does not run openr — drop from postcheck list |
746 | | - # to avoid false-fail on INACTIVE (same rationale as cpu_queue config). |
747 | | - service_restart_services=[ |
748 | | - "bgpd", |
749 | | - "fboss_hw_agent@0", |
750 | | - "fboss_sw_agent", |
751 | | - "fsdb", |
752 | | - "qsfp_service", |
753 | | - "wedge_agent", |
754 | | - ], |
755 | | - ) |
| 715 | +NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG = create_npi_thrift_hardening_test_config( |
| 716 | + test_config_name="NPI_DVT_ICEPACK_GTSW__THRIFT_HARDENING_TEST_CONFIG", |
| 717 | + device_name="gtsw001.l1001.c085.ash6", |
| 718 | + local_mac_address="02:00:00:00:0f:0c", |
| 719 | + ixia_downlink_interface="eth1/13/1", |
| 720 | + ixia_uplink_interface="eth1/13/3", |
| 721 | + peergroup_uplink_mimic_v6="PEERGROUP_GTSW_STSW_V6", |
| 722 | + peergroup_uplink_mimic_v4="PEERGROUP_GTSW_STSW_V4", |
| 723 | + peergroup_downlink_mimic_v6="PEERGROUP_GTSW_STSW_V6", |
| 724 | + peergroup_downlink_mimic_v4="PEERGROUP_GTSW_HOST_MIMIC_V4", |
| 725 | + route_map_uplink_ingress="PROPAGATE_GTSW_STSW_IN", |
| 726 | + route_map_uplink_egress="PROPAGATE_GTSW_STSW_OUT", |
| 727 | + route_map_downlink_ingress="PROPAGATE_GTSW_STSW_IN", |
| 728 | + route_map_downlink_egress="PROPAGATE_GTSW_STSW_OUT", |
| 729 | + ixia_downlink_ic_parent_network_v6="2401:db00:1ff:c108", |
| 730 | + ixia_uplink_ic_parent_network_v6="2401:db00:1ff:c109", |
| 731 | + ixia_downlink_ic_parent_network_v4="10.127.240", |
| 732 | + ixia_uplink_ic_parent_network_v4="10.127.241", |
| 733 | + unique_prefix_limit="5000", |
| 734 | + per_peer_max_route_limit="20000", |
| 735 | + downlink_peer_count=8, |
| 736 | + uplink_peer_count=8, |
| 737 | + remote_uplink_as_4byte=65272, |
| 738 | + remote_downlink_as_4byte=7001, |
| 739 | + remote_as_4_byte_step=1, |
| 740 | + is_uplink_peer_confed="False", |
| 741 | + is_downlink_peer_confed="False", |
| 742 | + ixia_downlink_prefix_count_v6=500, |
| 743 | + ixia_uplink_prefix_count_v6=500, |
| 744 | + ixia_downlink_prefix_count_v4=500, |
| 745 | + ixia_uplink_prefix_count_v4=500, |
| 746 | + ixia_downlink_communities=["65446:30", "65441:323", "65456:323"], |
| 747 | + ixia_uplink_communities=["65446:30", "65441:323", "65456:323"], |
| 748 | + downlink_peer_tag="HOST", |
| 749 | + uplink_peer_tag="STSW", |
| 750 | + stsw_flap_ports=ICEPACK_GTSW_STSW_FLAP_PORTS, |
| 751 | + test_duration_s=14400, # THFT_001 = 4 hr prod (override to 600 = 10 min for smoke) |
| 752 | + restart_test_duration_s=3600, # THFT_002..005 = 1 hr each → 4hr total |
| 753 | + # Scaled back to Pavan's original 10000 per API (= 70K concurrent calls |
| 754 | + # per burst across the 7 read-only APIs) after D108220182 enabled |
| 755 | + # server-side thrift API rate limiting on ICECUBE800BC. With |
| 756 | + # `thriftApiToRateLimitInQps` populated (~140 APIs at 2-8 qps each), the |
| 757 | + # agent throttles excess calls and CPU stays bounded even at the |
| 758 | + # original 70K burst scale. The new `assert_thrift_rate_limit_enabled` |
| 759 | + # setup-task (see setup_tasks above) fails fast if rate limiting is OFF, |
| 760 | + # so we won't accidentally hammer an unprotected agent at 70K again. |
| 761 | + # Prior history (pre-D108220182): 70K crashed `fboss_sw_agent` at |
| 762 | + # CPU 1339% in ~2.5 min and ticked `coop.unclean_exits` twice |
| 763 | + # (T275336067 — kernel OOM via workload.slice swap exhaustion). |
| 764 | + requests_per_burst=10000, |
| 765 | + basset_pool="dne.test", |
| 766 | + # IcePack backend GTSW does not run openr — drop from postcheck list |
| 767 | + # to avoid false-fail on INACTIVE (same rationale as cpu_queue config). |
| 768 | + service_restart_services=[ |
| 769 | + "bgpd", |
| 770 | + "fboss_hw_agent@0", |
| 771 | + "fboss_sw_agent", |
| 772 | + "fsdb", |
| 773 | + "qsfp_service", |
| 774 | + "wedge_agent", |
| 775 | + ], |
756 | 776 | ) |
0 commit comments