Make operator resilient to transient Redis errors

jmthomas · claude · jmthomas · commit 876faefd2363 · 2026-06-10T15:10:01.000-06:00
The operator (PID 1 in its container) ran its monitoring loop with no
rescue, so a single transient Redis error in update() -> hgetall (the
kind of network blip that also makes targets reconnect) would unwind
run(), exit the process, and trigger a full container restart.

- operator.rb: wrap the monitoring cycle in a rescue so a failed cycle
  is logged and the loop keeps running, recovering on the next cycle.
  Only StandardError is caught, so SIGTERM/SystemExit still shut down
  cleanly.
- store_autoload.rb / store_implementation.py: configure the Redis/Valkey
  client with equal-jitter reconnect backoff (cap=5s, base=0.625, 3
  retries) so transient blips are absorbed inside the client and many
  clients don't reconnect in lockstep. Ruby samples the jittered delay
  array per connection since redis-rb takes a fixed delay array rather
  than a per-failure backoff callable.
- Add specs covering operator loop resilience and the backoff config in
  both Ruby and Python.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/openc3/lib/openc3/operators/operator.rb b/openc3/lib/openc3/operators/operator.rb
@@ -381,11 +381,21 @@ def run
       # Monitor processes and respawn if died
       Logger.info("#{self.class} Monitoring processes every #{@cycle_time} sec...")
       loop do
-        update()
-        remove_old()
-        respawn_changed()
-        start_new()
-        respawn_dead()
+        # A single cycle must never be able to take down the operator process.
+        # update() in particular hits Redis every cycle and a transient network
+        # error (the kind that also makes targets reconnect) would otherwise
+        # unwind run() and exit the process, which in the container looks like a
+        # full operator restart. Catch, log, and keep cycling so the next cycle
+        # can recover once Redis is reachable again.
+        begin
+          update()
+          remove_old()
+          respawn_changed()
+          start_new()
+          respawn_dead()
+        rescue => e
+          Logger.error("#{self.class} cycle error, continuing: #{e.class} #{e.message}\n#{e.backtrace.join("\n")}")
+        end
         break if @shutdown
 
         sleep(@cycle_time)
diff --git a/openc3/lib/openc3/utilities/store_autoload.rb b/openc3/lib/openc3/utilities/store_autoload.rb
@@ -123,8 +123,46 @@ def initialize(pool_size = 10, db_shard: 0)
       @redis_pool = StoreConnectionPool.new(size: pool_size) { build_redis() }
     end
 
+    # cap/base for the equal-jitter reconnect backoff (seconds)
+    REDIS_BACKOFF_CAP = 5.0
+    REDIS_BACKOFF_BASE = 0.625
+
     def build_redis
-      return Redis.new(url: @redis_url, username: @redis_username, password: @redis_key)
+      # reconnect_attempts retries the connection a few times with equal-jitter
+      # backoff so a transient network blip is handled inside the client instead
+      # of immediately surfacing a connection error to callers. The jitter
+      # de-syncs many clients retrying the same blip to avoid a thundering herd
+      # on recovery.
+      #
+      # This mirrors the Python store's Retry(EqualJitterBackoff(cap=5, base=0.625),
+      # 3): per-retry backoff tops out at 5s on the final (3rd) retry
+      # (~0.6-1.25s, ~1.25-2.5s, ~2.5-5s). redis-rb takes a fixed Array of delays
+      # (no per-failure backoff callable), so we sample the jittered delays once
+      # per connection here.
+      # Connection, read, and write timeouts are left as the default: 1s
+      return Redis.new(
+        url: @redis_url,
+        username: @redis_username,
+        password: @redis_key,
+        reconnect_attempts: reconnect_backoff_delays()
+      )
+    end
+
+    # Equal-jitter backoff delays for 3 retries, matching Python's
+    # EqualJitterBackoff. Each retry's delay is randomized within the upper half
+    # of an exponentially-growing ceiling.
+    def reconnect_backoff_delays
+      (1..3).map do |failures|
+        # ceiling = exponential growth (base doubles each retry), clamped to cap.
+        # For base=0.625, cap=5: failures 1,2,3 -> 1.25, 2.5, 5.0 seconds.
+        # temp = half the ceiling: the guaranteed minimum wait for this retry.
+        temp = [REDIS_BACKOFF_CAP, REDIS_BACKOFF_BASE * (2 ** failures)].min / 2.0
+        # Final delay = fixed half (temp) + random half (rand*temp, in [0, temp)),
+        # i.e. a value uniformly in [temp, 2*temp) = [ceiling/2, ceiling).
+        # The fixed half keeps a sane floor; the random half de-syncs clients so
+        # they don't all reconnect in lockstep (thundering herd) after a blip.
+        temp + rand * temp
+      end
     end
 
     ###########################################################################
diff --git a/openc3/python/openc3/utilities/store_implementation.py b/openc3/python/openc3/utilities/store_implementation.py
@@ -13,7 +13,9 @@
 from contextlib import contextmanager
 
 import valkey
-from valkey.exceptions import TimeoutError
+from valkey.backoff import EqualJitterBackoff
+from valkey.exceptions import BusyLoadingError, ConnectionError, TimeoutError
+from valkey.retry import Retry
 
 from openc3.environment import *
 from openc3.utilities.connection_pool import ConnectionPool
@@ -165,11 +167,20 @@ def build_redis(self):
         # NOTE: We can't use decode_response because it tries to decode the binary
         # packet buffer which does not work. Thus strings come back as bytes like
         # b"target_name" and we decode them using b"target_name".decode()
+        #
+        # retry retries a command a few times with equal-jitter backoff so a
+        # transient network blip is handled inside the client instead of
+        # immediately surfacing a connection error to callers. The jitter
+        # de-syncs many clients retrying the same blip to avoid a thundering
+        # herd on recovery. With cap=5, base=0.625 the per-retry backoff tops
+        # out at 5s on the final (3rd) retry: ~0.6-1.25s, ~1.25-2.5s, ~2.5-5s.
         return valkey.Valkey(
             host=self.redis_host,
             port=self.redis_port,
             username=OPENC3_REDIS_USERNAME,
             password=OPENC3_REDIS_PASSWORD,
+            retry=Retry(EqualJitterBackoff(cap=5, base=0.625), 3),
+            retry_on_error=[BusyLoadingError, ConnectionError, TimeoutError],
         )
 
     ###########################################################################
diff --git a/openc3/python/test/utilities/test_store_implementation.py b/openc3/python/test/utilities/test_store_implementation.py
@@ -10,10 +10,43 @@
 # if purchased from OpenC3, Inc.
 
 import unittest
+from unittest.mock import patch
+
+from valkey.backoff import EqualJitterBackoff
+from valkey.exceptions import BusyLoadingError, ConnectionError, TimeoutError
+from valkey.retry import Retry
 
 from openc3.utilities.store_implementation import Store
 
 
 class TestStoreImplementation(unittest.TestCase):
     def test_help(self):
         help(Store)
+
+    def test_build_redis_configures_resilience(self):
+        # A transient network blip (the same kind that makes targets reconnect)
+        # must be retried inside the client with jittered backoff instead of
+        # immediately surfacing a connection error to callers, which would
+        # otherwise propagate up and kill the caller.
+        with patch("valkey.Valkey") as valkey_new:
+            # __new__ bypasses __init__ so we can exercise build_redis in
+            # isolation without spinning up the connection pool / singleton.
+            store = Store.__new__(Store)
+            store.redis_host = "localhost"
+            store.redis_port = 6379
+            store.build_redis()
+
+        self.assertEqual(valkey_new.call_count, 1)
+        _, kwargs = valkey_new.call_args
+        # Client retries with equal-jitter backoff on connection/timeout errors
+        retry = kwargs["retry"]
+        self.assertIsInstance(retry, Retry)
+        self.assertEqual(retry._retries, 3)
+        self.assertIsInstance(retry._backoff, EqualJitterBackoff)
+        self.assertIn(BusyLoadingError, kwargs["retry_on_error"])
+        self.assertIn(ConnectionError, kwargs["retry_on_error"])
+        self.assertIn(TimeoutError, kwargs["retry_on_error"])
+        # Per-retry backoff is bounded so a single retry can't hang forever;
+        # the final (3rd) retry tops out at the 5s cap (jittered, so 2.5-5s).
+        self.assertEqual(retry._backoff._cap, 5)
+        self.assertLessEqual(retry._backoff.compute(3), 5)
diff --git a/openc3/spec/operators/microservice_operator_spec.rb b/openc3/spec/operators/microservice_operator_spec.rb
@@ -261,5 +261,49 @@ def build_changed(started)
       #   end
       # end
     end
+
+    describe "redis error resilience" do
+      before(:each) do
+        @redis = mock_redis()
+        ENV['OPERATOR_CYCLE_TIME'] = '0.05'
+      end
+
+      after(:each) do
+        MicroserviceOperator.instance&.stop
+        # run() parks in a second loop waiting for @shutdown_complete, which is
+        # only set by shutdown() via at_exit (stubbed to a no-op in this spec).
+        # Force-kill the thread so teardown doesn't block on that park loop.
+        @thread&.kill
+        @thread&.join
+      rescue Redis::BaseError
+        # Before the fix the loop dies on the Redis error and join re-raises it
+        # here; swallow so the example's own assertion is the reported failure.
+      end
+
+      # A transient network blip (the same kind that makes targets reconnect)
+      # makes one hgetall raise. The operator must absorb it and keep cycling
+      # instead of letting the exception unwind run() and exit the process,
+      # which in the container manifests as a full operator restart.
+      it "survives a transient Redis error during update and keeps cycling" do
+        raised = false
+        allow(@redis).to receive(:hgetall).and_wrap_original do |original, key, *args|
+          if key.to_s.include?('openc3_microservices') && !raised
+            raised = true
+            raise Redis::CannotConnectError.new("Error connecting to Redis on localhost:6379")
+          end
+          original.call(key, *args)
+        end
+
+        capture_io do
+          @thread = Thread.new { MicroserviceOperator.run }
+          sleep 0.5 # Several cycles; the error fires on the first update
+        end
+
+        # The error must actually have been triggered...
+        expect(raised).to be true
+        # ...and the operator loop must have survived it and still be running.
+        expect(@thread.alive?).to be true
+      end
+    end
   end
 end
diff --git a/openc3/spec/utilities/store_spec.rb b/openc3/spec/utilities/store_spec.rb
@@ -0,0 +1,55 @@
+# encoding: ascii-8bit
+
+# Copyright 2026 OpenC3, Inc.
+# All Rights Reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE.md for more details.
+
+# This file may also be used under the terms of a commercial license
+# if purchased from OpenC3, Inc.
+
+require "spec_helper"
+require "openc3/utilities/store"
+
+module OpenC3
+  describe Store do
+    describe "build_redis" do
+      # A transient network blip (the same kind that makes targets reconnect)
+      # must be retried inside the client with jittered backoff instead of
+      # immediately surfacing a connection error to callers, which would
+      # otherwise propagate up and kill the caller (e.g. the operator).
+      it "configures equal-jitter reconnect backoff mirroring Python" do
+        captured = nil
+        allow(Redis).to receive(:new) do |**kwargs|
+          captured = kwargs
+          double("redis").as_null_object
+        end
+
+        store = Store.allocate
+        store.instance_variable_set(:@redis_url, "redis://localhost:6379")
+        store.instance_variable_set(:@redis_username, nil)
+        store.instance_variable_set(:@redis_key, nil)
+        store.send(:build_redis)
+
+        attempts = captured[:reconnect_attempts]
+        expect(attempts.length).to eq(3)
+        # Equal-jitter ranges per retry with cap=5, base=0.625:
+        # t = min(cap, base*2**f); delay in [t/2, t)
+        expect(attempts[0]).to be_between(0.625, 1.25)
+        expect(attempts[1]).to be_between(1.25, 2.5)
+        expect(attempts[2]).to be_between(2.5, 5.0) # final retry caps at 5s
+      end
+
+      it "samples fresh jittered delays each call to de-sync clients" do
+        store = Store.allocate
+        a = store.send(:reconnect_backoff_delays)
+        b = store.send(:reconnect_backoff_delays)
+        # Astronomically unlikely to be identical unless jitter is missing
+        expect(a).not_to eq(b)
+      end
+    end
+  end
+end