Commit 6daacc9

tests/multi-server: embed proto_load in kafka-producer1, add reconnect suite
Topology simplification: move the proto_load listener directly into
kafka-producer1's virtual server, so generated Access-Requests flow straight
into `recv Access-Request` -> kafka.produce without going over the wire. One
fewer container, one fewer RADIUS hop, and the test still exercises exactly
the produce path end-to-end.

Changes:

* environments/kafka.yml.j2
  - Drop the load-generator service.
  - Feed the proto_load profile (start_pps / max_pps / duration / step /
    parallel / max_backlog / num_messages) to kafka-producer1 via env vars;
    Jinja pulls them from the test's loadgen: block.
  - Re-declare TEST_PROJECT_NAME / TEST_SUBNET inline on kafka-producer1
    because YAML's <<: anchor merge doesn't union nested dicts - a
    service-level environment: replaces the one inherited from
    x-common-config.
  - New `loadgen_num_messages` knob, defaulting to `expected_messages`, so
    tests that expect loss (reconnect) can generate more than the consumer
    will count.

* configs/freeradius/kafka-producer1/radiusd.conf.j2
  - Add `listen load { handler = load; transport = step; step { ... } }`
    inside the existing kafka-producer server.

* configs/freeradius/kafka-producer1/load-generator-packets/packet.conf
  - Default Access-Request packet skeleton proto_load sends.

* tests/kafka-produce/{short.ci,heavy}.test.yml + template.yml.j2
  - Collapse to a single state that waits for kafka-consumer-summary. No
    more two-phase load-gen orchestration; proto_load fires on freeradius
    startup and finishes long before the summary arrives.

* tests/kafka-produce-reconnect/
  - New suite exercising broker disconnect / reconnect. Applies 100% packet
    loss on kafka-producer1's egress mid-stream (packet_loss action from
    the framework's NetworkEvents), holds for `outage_seconds`, then
    removes it. Queued produces inside librdkafka drain after reconnect,
    request threads that yielded waiting on their delivery reports resume,
    and the consumer eventually sees >= expected_messages on the topic.
1 parent 0846a25 commit 6daacc9

9 files changed

Lines changed: 240 additions & 133 deletions

src/tests/multi-server/configs/freeradius/kafka-producer1/load-generator-packets/packet.conf

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+User-Name = "testuser"
+User-Password = "testpass"
+Calling-Station-ID = "F1-F2-F3-F4-F5-F6"

src/tests/multi-server/configs/freeradius/kafka-producer1/radiusd.conf.j2

Lines changed: 32 additions & 0 deletions
@@ -87,6 +87,38 @@ server kafka-producer {
 
 	namespace = radius
 
+	#
+	# proto_load based synthetic Access-Request generator. Lives in
+	# the same virtual server as the rlm_kafka call path so generated
+	# packets flow straight into `recv Access-Request` without going
+	# over the wire - no separate load-generator container, no
+	# inter-container RADIUS hop.
+	#
+	# Profile parameters come from the per-test env vars set on this
+	# container by the compose file (which pulls them from the test's
+	# loadgen: {} block).
+	#
+	listen load {
+		handler = load
+		type = Access-Request
+		transport = step
+
+		step {
+			filename = ${confdir}/load-generator-packets/packet.conf
+
+			max_attributes = 64
+
+			start_pps = $ENV{TEST_LOADGEN_START_PPS}
+			max_pps = $ENV{TEST_LOADGEN_MAX_PPS}
+			duration = $ENV{TEST_LOADGEN_DURATION}
+			step = $ENV{TEST_LOADGEN_STEP}
+			max_backlog = $ENV{TEST_LOADGEN_MAX_BACKLOG}
+			parallel = $ENV{TEST_LOADGEN_PARALLEL}
+			num_messages = $ENV{TEST_LOADGEN_NUM_MESSAGES}
+			repeat = no
+		}
+	}
+
 	listen authentication {
 		type = Access-Request
 		type = Status-Server

src/tests/multi-server/environments/kafka.yml.j2

Lines changed: 60 additions & 41 deletions
@@ -1,25 +1,24 @@
 # ---------------------------------------------------------------
 # Docker Compose Test Environment:
 #
-#                      Access-Request
-#  load-generator -----------------> kafka-producer1
-#                                        |
-#                                        | kafka.produce
-#                                        v
-#                                  kafka (redpanda)
-#                                        |
-#                                        | consume
-#                                        v
-#                                   kafka-consumer
-#                                   (echoes each
-#                                    message back
-#                                    to the test
-#                                    framework)
+#                 kafka-producer1 (rlm_kafka)
+#                     ^           |
+#                     |           | kafka.produce
+#                     |           v
+#   proto_load -------+     kafka (broker)
+#   (in-process                |
+#    listener in the           | consume
+#    same virtual server)      v
+#                       kafka-consumer
+#               (one listener line per received
+#                message + a summary; the test
+#                framework reads those to verify)
 #
-# Each Access-Request triggers exactly one produce. kafka-consumer
-# reads the topic and writes one listener line per received message
-# plus a final summary line; the test framework verifies the count
-# matches the number of packets sent.
+# proto_load runs inside the kafka-producer1 virtual server and
+# starts generating Access-Requests on freeradius startup. Each
+# Access-Request triggers exactly one kafka.produce. Keeping load
+# generation in-process means no separate load-generator service
+# and no inter-container RADIUS hop.
 # ---------------------------------------------------------------
 x-common-config: &id001
   cap_add:
@@ -76,14 +75,56 @@ services:
         condition: service_healthy
     volumes:
       - ${DATA_PATH}/freeradius/kafka-producer1/radiusd.conf:/etc/raddb/radiusd.conf
+      - ${DATA_PATH}/freeradius/kafka-producer1/load-generator-packets/:/etc/raddb/load-generator-packets/
+      - ${DATA_PATH}/freeradius/env-setup.sh:/tmp/env-setup.sh
      - ${LISTENER_DIR}/:/var/run/multi-server/
    restart: unless-stopped
+    environment:
+      #
+      # YAML's `<<:` anchor merge doesn't union nested dicts, so
+      # declaring an `environment:` block on the service replaces
+      # the one inherited from x-common-config. Re-include the
+      # shared vars here.
+      #
+      TEST_PROJECT_NAME: ${COMPOSE_PROJECT_NAME}
+      TEST_SUBNET: {{ test_subnet | default('172.16.0.0/12') }}
+      #
+      # proto_load profile - read by the `listen load { step { ... } }`
+      # block in this container's radiusd.conf. The test's loadgen:
+      # dict in its .test.yml feeds these.
+      #
+      TEST_LOADGEN_START_PPS: "{{ loadgen.start_pps }}"
+      TEST_LOADGEN_MAX_PPS: "{{ loadgen.max_pps }}"
+      TEST_LOADGEN_DURATION: "{{ loadgen.duration }}"
+      TEST_LOADGEN_STEP: "{{ loadgen.step }}"
+      TEST_LOADGEN_MAX_BACKLOG: "{{ loadgen.max_backlog }}"
+      TEST_LOADGEN_PARALLEL: "{{ loadgen.parallel }}"
+      #
+      # Hard cap on packets emitted. Tests that want the emit
+      # count to equal the consume count default this to
+      # `expected_messages`; tests where some loss is expected
+      # (e.g. the reconnect test, where a brief outage can fail a
+      # small fraction of in-flight produces) override with a
+      # larger `loadgen_num_messages`.
+      #
+      TEST_LOADGEN_NUM_MESSAGES: "{{ loadgen_num_messages | default(expected_messages) }}"
    healthcheck:
      test: ["CMD-SHELL", "echo 'Message-Authenticator = 0x00' | radclient localhost:1812 status testing123"]
      interval: 2s
      timeout: 5s
      retries: 10
-      start_period: 30s
+      start_period: 45s
+    # The env-setup source installs iproute2 so tests that need to
+    # apply netem qdiscs (e.g. the reconnect test's packet_loss
+    # action) have `tc` available inside the container. Harmless
+    # no-op for tests that don't.
+    entrypoint:
+      - bash
+      - -c
+      - |
+        source /tmp/env-setup.sh && \
+        exec /docker-entrypoint.sh "$@"
+      - --
    command: ["freeradius", "-f", "-l", "stdout"]
    <<: *id001
 
@@ -114,25 +155,3 @@ services:
      - /usr/local/bin/consume.sh
    restart: "no"
    <<: *id001
-
-  load-generator:
-    image: freeradius-build:latest
-    depends_on:
-      kafka-producer1:
-        condition: service_healthy
-      kafka-consumer:
-        condition: service_started
-    volumes:
-      - ${DATA_PATH}/freeradius/load-generator/template.d/load-generator-templates:/etc/raddb/template.d/load-generator-templates
-      - ${DATA_PATH}/freeradius/load-generator/mods-config/files/authorize:/etc/raddb/mods-config/files/authorize
-      - ${DATA_PATH}/freeradius/load-generator/radiusd.conf:/etc/raddb/radiusd.conf
-      - ${DATA_PATH}/freeradius/load-generator/load-generator-packets/:/etc/raddb/load-generator-packets/
-      - ${LISTENER_DIR}/:/var/run/multi-server/
-    entrypoint:
-      - bash
-      - -lc
-      - |
-        # Keep the container alive. The test framework starts FreeRADIUS
-        # and runs commands via 'docker exec' so it can control timing.
-        sleep infinity
-    <<: *id001
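The `<<:` pitfall called out in the environment comment can be modeled with plain dicts. This is a sketch of YAML merge-key semantics in general, not of compose internals; the `common` / `service` names and their values are illustrative:

```python
# YAML's `<<:` merge key is a shallow, top-level merge: keys declared
# explicitly on the service replace anchor keys wholesale, so nested
# mappings like `environment:` are swapped out, never unioned.
common = {  # stands in for the x-common-config anchor (illustrative values)
    "cap_add": ["NET_ADMIN"],
    "environment": {"TEST_PROJECT_NAME": "proj", "TEST_SUBNET": "172.16.0.0/12"},
}
service = {  # explicit keys declared on kafka-producer1
    "environment": {"TEST_LOADGEN_START_PPS": "10"},
}

merged = {**common, **service}  # explicit keys win over `<<:`-merged ones

# The inherited environment mapping is gone entirely, which is why the
# compose file re-declares TEST_PROJECT_NAME / TEST_SUBNET inline.
assert "TEST_PROJECT_NAME" not in merged["environment"]
assert merged["cap_add"] == ["NET_ADMIN"]
```

The same shallow-replace rule is why no amount of anchor restructuring would let the service add one env var while keeping the shared ones.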
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../../environments/kafka.yml.j2
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+#
+# Broker disconnect / reconnect round-trip test.
+#
+# proto_load on kafka-producer1 generates Access-Requests steadily
+# for `loadgen.duration` seconds. Partway through that window we
+# cut the producer's network to the broker with 100% packet loss,
+# hold for `outage_seconds`, then restore. Messages produced
+# during the outage queue inside librdkafka; their request threads
+# yield waiting for delivery reports. When the link is restored
+# librdkafka reconnects, drains its queue, delivery reports fire
+# and yielded requests resume.
+#
+# Pass criterion: consumer reports PASS with
+# received == expected == loadgen total.
+#
+listener_type: file
+
+#
+# proto_load profile. max_pps has a floor of 10 internally, so
+# the effective steady-state rate is 10 pps. Duration must be
+# long enough that proto_load is still generating by the time the
+# framework finishes compose-up and reaches state_1 (CI DinD adds
+# ~40-50s of startup before the first state runs).
+#
+loadgen:
+  start_pps: 10
+  max_pps: 10
+  duration: 30
+  step: 10
+  parallel: 1
+  max_backlog: 1000
+
+#
+# proto_load overshoots slightly - the `num_messages` cap stops
+# emission, but a handful of requests already in-flight in the
+# worker pool finish after that. Set num_messages well above
+# expected_messages so the consumer always has enough to count.
+#
+# expected_messages is what the consumer (kcat -c N) stops at,
+# so the kafka-consumer-summary line declares PASS as long as at
+# least `expected_messages` made it through the disconnect /
+# reconnect cycle.
+#
+loadgen_num_messages: 250
+expected_messages: 200
+
+kafka_topic: fr-multi-server-reconnect-test
+
+# How long to hold the outage. Shorter than librdkafka's default
+# message.timeout.ms (5 min) so queued produces don't fail before
+# recovery.
+outage_seconds: 3
+
+# Timeouts sized for the self-hosted CI DinD runners. test_timeout
+# must cover compose up + state_1 outage + state_2 wait-for-summary.
+test_timeout: 240
+test_verify_timeout: 120
+consumer_timeout: 180
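The knobs above leave a deliberate gap between what is emitted and what must arrive. A back-of-the-envelope check, using the values from this profile (a sketch of the arithmetic, assuming proto_load holds its steady 10 pps for the full duration):

```python
# Values copied from the reconnect test profile above.
start_pps, duration = 10, 30
loadgen_num_messages, expected_messages = 250, 200

offered = start_pps * duration                # 300 requests over the window
emitted = min(offered, loadgen_num_messages)  # the num_messages cap stops emission first
loss_budget = emitted - expected_messages     # produces allowed to fail during the outage

assert offered > loadgen_num_messages  # the cap, not the duration, bounds emission
assert loss_budget > 0                 # PASS tolerates some outage-window failures
print(emitted, loss_budget)            # 250 50
```

So even if a handful of in-flight produces die when the link drops, the consumer still reaches its 200-message target.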
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+timeout: {{ test_timeout }}
+state_order: sequence
+states:
+
+  #
+  # proto_load on kafka-producer1 is generating Access-Requests
+  # continuously for `loadgen.duration` seconds. Drop the
+  # producer's network to the broker in the middle of that window:
+  # packets produced during the outage queue inside librdkafka and
+  # their request threads yield waiting for delivery reports.
+  #
+  # The verify timeout on this state is how long we hold the outage.
+  #
+  state_1:
+    description: >
+      Apply 100% packet loss on the producer while proto_load is
+      in flight, and hold the outage for {{ outage_seconds }}s.
+    host:
+      kafka-producer1:
+        actions:
+          - packet_loss:
+              interface: eth0
+              loss: 100
+    verify:
+      timeout: {{ outage_seconds }}
+      trigger_mode: unordered
+
+  #
+  # Remove the packet loss. librdkafka reconnects, drains its
+  # queue, delivery reports fire for the queued produces, yielded
+  # request threads resume, and the consumer eventually sees every
+  # message. Pass criterion: the consumer summary reports PASS
+  # with received == expected == full count.
+  #
+  state_2:
+    description: >
+      Remove the packet loss and wait for the consumer to report
+      PASS with received == expected == {{ expected_messages }}.
+    host:
+      kafka-producer1:
+        actions:
+          - packet_loss:
+              interface: eth0
+              loss: 0
+    verify:
+      timeout: {{ test_verify_timeout }}
+      trigger_mode: unordered
+      triggers:
+        - kafka-consumer-summary:
+            json:
+              result:
+                pattern:
+                  reg_pattern: PASS
+              expected:
+                pattern:
+                  reg_pattern: "^{{ expected_messages }}$"
+              received:
+                pattern:
+                  reg_pattern: "^{{ expected_messages }}$"
src/tests/multi-server/tests/kafka-produce/heavy.test.yml

Lines changed: 7 additions & 17 deletions

@@ -1,38 +1,28 @@
 #
-# Heavy stress variant. Ramps PPS aggressively across four 2s steps:
-#
-#    500 -> 1000 -> 1500 -> 2000 pps = 10,000 requests in ~8s
-#
-# That's enough concurrent work to exercise every worker thread and
-# drive many back-to-back delivery reports through rlm_kafka's self-pipe,
-# surfacing races and queue-pressure bugs the short sanity test can't.
+# Heavy stress variant. Ramps PPS aggressively to exercise every
+# worker thread and drive many back-to-back delivery reports
+# through rlm_kafka's shared producer + per-worker mailbox path.
 #
 # Not tagged *.ci.test.yml - developers run this locally via
 # `make test.multi-server.kafka-produce.heavy`, or via the full
-# `make test.multi-server` sweep, but it isn't in `test.multi-server.ci`.
+# `make test.multi-server` sweep.
 #
 listener_type: file
 
-load_gen_num_of_dst_servers: 1
-load_gen_dst_server_name: kafka-producer
-
 loadgen:
   start_pps: 500
   max_pps: 2000
   duration: 2
   step: 500
   parallel: 4
   max_backlog: 20000
-  repeat: "no"
 
 # 2 * (500 + 1000 + 1500 + 2000) = 10000
 expected_messages: 10000
 
 kafka_topic: fr-multi-server-test
 
-# State_1 needs ~8s load-gen + enough time for a single-broker redpanda +
-# kcat to drain 10k messages. On macOS Docker Desktop this isn't instant;
-# Linux CI should comfortably finish well inside these budgets.
+# Generous budget for the 10k burst + broker drain on CI.
 test_timeout: 360
-test_verify_timeout: 150
-consumer_timeout: 180
+test_verify_timeout: 300
+consumer_timeout: 300
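The `2 * (500 + 1000 + 1500 + 2000) = 10000` comment generalizes to "sum over each pps step of duration * pps". A small sketch of that arithmetic, assuming proto_load's step transport holds each rate for `duration` seconds while stepping from start_pps to max_pps (an assumption about the ramp shape, not taken from proto_load's source):

```python
def total_requests(start_pps: int, max_pps: int, step: int, duration: int) -> int:
    """Total packets a stepped ramp emits: `duration` seconds at each pps level."""
    total, pps = 0, start_pps
    while pps <= max_pps:
        total += pps * duration  # this step's contribution
        pps += step
    return total

# heavy profile: four 2s steps, 500 -> 1000 -> 1500 -> 2000 pps
print(total_requests(500, 2000, 500, 2))  # 10000

# short.ci profile: a single 5 pps step held for 4s
print(total_requests(5, 5, 5, 4))         # 20
```

Both results match the `expected_messages` values the tests pin, which is the invariant these profiles rely on.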
src/tests/multi-server/tests/kafka-produce/short.ci.test.yml

Lines changed: 12 additions & 14 deletions

@@ -1,31 +1,29 @@
 listener_type: file
 
-# Routing (load-generator sends to kafka-producer)
-load_gen_num_of_dst_servers: 1
-load_gen_dst_server_name: kafka-producer
-
-# Load generator profile. A modest burst that's enough to exercise
-# delivery reports without stressing CI timing budgets.
+#
+# proto_load profile for the kafka-producer1 container's built-in
+# load generator. Modest burst - enough to exercise delivery
+# reports without stressing CI timing budgets.
+#
 loadgen:
   start_pps: 5
   max_pps: 5
   duration: 4
   step: 5
   parallel: 1
   max_backlog: 1000
-  repeat: "no"
 
-# Total Access-Requests the load-generator will emit. Keep in sync with
-# loadgen above: start_pps * duration when start == max.
+#
+# Expected message count at the consumer. Must equal the total
+# proto_load emits: sum over each pps step of (duration * pps).
+#
 expected_messages: 20
 
 kafka_topic: fr-multi-server-test
 
-# Test framework timeouts. The whole test has to fit inside test_timeout;
-# each state waits `test_verify_timeout` for its triggers. Values are
-# sized for the self-hosted CI DinD runners, which are substantially
-# slower than local Docker Desktop (JVM broker startup alone eats
-# ~30s through the healthcheck).
+# Timeouts sized for the self-hosted CI DinD runners - JVM kafka
+# startup through the healthcheck + proto_load burst + consumer
+# drain all have to fit inside `test_verify_timeout`.
 test_timeout: 120
 test_verify_timeout: 60
 consumer_timeout: 90
