Skip to content

Commit 5c73d3f

Browse files
committed
Add failing tests proving premature LeaseAcquired bug (#3402)
Tests reproduce the split-brain scenario: when a CAS conflict occurs during granting and the blob/configmap has no owner, LeaseActor sends LeaseAcquired BEFORE the retry write completes. If the retry fails (another node takes the lease), the caller believes it holds the lease but it doesn't. ShouldNotSendPrematureLeaseAcquiredWhenConflictRetryIsStolen: - FAILS: receives LeaseAcquired instead of expected LeaseTaken - Proves the bug exists in both Azure and Kubernetes implementations ShouldGrantLeaseOnlyAfterConflictRetrySucceeds: - PASSES: verifies happy-path still works correctly
1 parent 53a1eb0 commit 5c73d3f

2 files changed

Lines changed: 184 additions & 0 deletions

File tree

src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,98 @@ public void ReturnLeaseTakenIfConflictWhenUpdatingLease()
410410
});
411411
}
412412

413+
/// <summary>
414+
/// Reproduces a split-brain bug: when a CAS conflict occurs during granting and the blob
415+
/// has no owner, the LeaseActor sends LeaseAcquired to the caller BEFORE the retry write completes.
416+
/// If the retry then fails (another node steals the lease), the caller already believes it holds the
417+
/// lease — but it doesn't. localGranted is never set to true, heartbeat never starts.
418+
///
419+
/// This test asserts the CORRECT behavior: the caller should receive LeaseTaken (not LeaseAcquired)
420+
/// when the retry write is stolen by another node.
421+
/// </summary>
422+
[Fact(DisplayName = "Bug #3402: should NOT send premature LeaseAcquired when conflict retry is stolen by another node")]
423+
public async Task ShouldNotSendPrematureLeaseAcquiredWhenConflictRetryIsStolen()
424+
{
425+
await RunTestAsync(async () =>
426+
{
427+
// Step 1: Send Acquire, complete the read (lease unowned), enter Granting state
428+
UnderTest.Tell(new LeaseActor.Acquire(), Sender);
429+
await LeaseProbe.ExpectMsgAsync(LeaseName);
430+
LeaseProbe.Reply(new LeaseResource("", CurrentVersion, CurrentTime));
431+
432+
// Step 2: First write attempt — expect the CAS update
433+
await UpdateProbe.ExpectMsgAsync((OwnerName, CurrentVersion));
434+
435+
// Step 3: Reply with CAS conflict (412), but blob has NO owner (version moved on)
436+
// This triggers the null-owner retry path at LeaseActor.cs line 356-366
437+
var conflictVersion = new ETag((CurrentVersionCount + 5).ToString());
438+
UpdateProbe.Reply(
439+
new Left<LeaseResource, LeaseResource>(
440+
new LeaseResource(null, conflictVersion, CurrentTime)));
441+
442+
// Step 4: Retry write is dispatched — expect it with the new version
443+
await UpdateProbe.ExpectMsgAsync((OwnerName, conflictVersion));
444+
445+
// Step 5: Retry FAILS — another node stole the lease between conflict and retry
446+
var stolenVersion = new ETag((CurrentVersionCount + 10).ToString());
447+
UpdateProbe.Reply(
448+
new Left<LeaseResource, LeaseResource>(
449+
new LeaseResource("another-node-stole-it", stolenVersion, CurrentTime)));
450+
451+
// Step 6: Assert CORRECT behavior — caller should get LeaseTaken, NOT LeaseAcquired
452+
// BUG: Currently the caller receives LeaseAcquired (premature, from line 362)
453+
// followed by LeaseTaken (which goes to dead letters since Ask already completed)
454+
await SenderProbe.ExpectMsgAsync<LeaseActor.LeaseTaken>();
455+
456+
// Step 7: localGranted should be false — the lease was never actually acquired
457+
Granted.Value.Should().BeFalse();
458+
});
459+
}
460+
461+
/// <summary>
462+
/// Verifies that when a CAS conflict occurs with no owner and the retry SUCCEEDS,
463+
/// the lease is properly granted with localGranted=true and heartbeat started.
464+
/// This is the happy-path counterpart to the split-brain test above.
465+
/// </summary>
466+
[Fact(DisplayName = "Bug #3402: should grant lease only after conflict retry succeeds")]
467+
public async Task ShouldGrantLeaseOnlyAfterConflictRetrySucceeds()
468+
{
469+
await RunTestAsync(async () =>
470+
{
471+
// Step 1: Send Acquire, complete the read (lease unowned), enter Granting state
472+
UnderTest.Tell(new LeaseActor.Acquire(), Sender);
473+
await LeaseProbe.ExpectMsgAsync(LeaseName);
474+
LeaseProbe.Reply(new LeaseResource("", CurrentVersion, CurrentTime));
475+
476+
// Step 2: First write attempt
477+
await UpdateProbe.ExpectMsgAsync((OwnerName, CurrentVersion));
478+
479+
// Step 3: CAS conflict, no owner
480+
var conflictVersion = new ETag((CurrentVersionCount + 5).ToString());
481+
UpdateProbe.Reply(
482+
new Left<LeaseResource, LeaseResource>(
483+
new LeaseResource(null, conflictVersion, CurrentTime)));
484+
485+
// Step 4: Retry dispatched
486+
await UpdateProbe.ExpectMsgAsync((OwnerName, conflictVersion));
487+
488+
// Step 5: Retry SUCCEEDS
489+
var grantedVersion = new ETag((CurrentVersionCount + 11).ToString());
490+
UpdateProbe.Reply(
491+
new Right<LeaseResource, LeaseResource>(
492+
new LeaseResource(OwnerName, grantedVersion, CurrentTime)));
493+
494+
// Step 6: Now we should get LeaseAcquired
495+
await SenderProbe.ExpectMsgAsync<LeaseActor.LeaseAcquired>();
496+
497+
// Step 7: localGranted should be TRUE — the lease was properly acquired
498+
await AwaitAssertAsync(() =>
499+
{
500+
Granted.Value.Should().BeTrue();
501+
});
502+
});
503+
}
504+
413505
[Fact(DisplayName = "LeaseActor should be able to get lease after failing previous grant update")]
414506
public void AbleToGetLeaseAfterFailingPreviousGrantUpdate()
415507
{

src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,98 @@ public void ReturnLeaseTakenIfConflictWhenUpdatingLease()
411411
});
412412
}
413413

414+
/// <summary>
415+
/// Reproduces a split-brain bug: when a CAS conflict occurs during granting and the configmap
416+
/// has no owner, the LeaseActor sends LeaseAcquired to the caller BEFORE the retry write completes.
417+
/// If the retry then fails (another node steals the lease), the caller already believes it holds the
418+
/// lease — but it doesn't. localGranted is never set to true, heartbeat never starts.
419+
///
420+
/// This test asserts the CORRECT behavior: the caller should receive LeaseTaken (not LeaseAcquired)
421+
/// when the retry write is stolen by another node.
422+
/// </summary>
423+
[Fact(DisplayName = "Bug #3402: should NOT send premature LeaseAcquired when conflict retry is stolen by another node")]
424+
public async Task ShouldNotSendPrematureLeaseAcquiredWhenConflictRetryIsStolen()
425+
{
426+
await RunTestAsync(async () =>
427+
{
428+
// Step 1: Send Acquire, complete the read (lease unowned), enter Granting state
429+
UnderTest.Tell(new LeaseActor.Acquire(), Sender);
430+
await LeaseProbe.ExpectMsgAsync(LeaseName);
431+
LeaseProbe.Reply(new LeaseResource(null, CurrentVersion, CurrentTime));
432+
433+
// Step 2: First write attempt — expect the CAS update
434+
await UpdateProbe.ExpectMsgAsync((OwnerName, CurrentVersion));
435+
436+
// Step 3: Reply with CAS conflict (412), but configmap has NO owner (version moved on)
437+
// This triggers the null-owner retry path at LeaseActor.cs line 359-368
438+
var conflictVersion = (CurrentVersionCount + 5).ToString();
439+
UpdateProbe.Reply(
440+
new Left<LeaseResource, LeaseResource>(
441+
new LeaseResource(null, conflictVersion, CurrentTime)));
442+
443+
// Step 4: Retry write is dispatched — expect it with the new version
444+
await UpdateProbe.ExpectMsgAsync((OwnerName, conflictVersion));
445+
446+
// Step 5: Retry FAILS — another node stole the lease between conflict and retry
447+
var stolenVersion = (CurrentVersionCount + 10).ToString();
448+
UpdateProbe.Reply(
449+
new Left<LeaseResource, LeaseResource>(
450+
new LeaseResource("another-node-stole-it", stolenVersion, CurrentTime)));
451+
452+
// Step 6: Assert CORRECT behavior — caller should get LeaseTaken, NOT LeaseAcquired
453+
// BUG: Currently the caller receives LeaseAcquired (premature, from line 365)
454+
// followed by LeaseTaken (which goes to dead letters since Ask already completed)
455+
await SenderProbe.ExpectMsgAsync<LeaseActor.LeaseTaken>();
456+
457+
// Step 7: localGranted should be false — the lease was never actually acquired
458+
Granted.Value.Should().BeFalse();
459+
});
460+
}
461+
462+
/// <summary>
463+
/// Verifies that when a CAS conflict occurs with no owner and the retry SUCCEEDS,
464+
/// the lease is properly granted with localGranted=true and heartbeat started.
465+
/// This is the happy-path counterpart to the split-brain test above.
466+
/// </summary>
467+
[Fact(DisplayName = "Bug #3402: should grant lease only after conflict retry succeeds")]
468+
public async Task ShouldGrantLeaseOnlyAfterConflictRetrySucceeds()
469+
{
470+
await RunTestAsync(async () =>
471+
{
472+
// Step 1: Send Acquire, complete the read (lease unowned), enter Granting state
473+
UnderTest.Tell(new LeaseActor.Acquire(), Sender);
474+
await LeaseProbe.ExpectMsgAsync(LeaseName);
475+
LeaseProbe.Reply(new LeaseResource(null, CurrentVersion, CurrentTime));
476+
477+
// Step 2: First write attempt
478+
await UpdateProbe.ExpectMsgAsync((OwnerName, CurrentVersion));
479+
480+
// Step 3: CAS conflict, no owner
481+
var conflictVersion = (CurrentVersionCount + 5).ToString();
482+
UpdateProbe.Reply(
483+
new Left<LeaseResource, LeaseResource>(
484+
new LeaseResource(null, conflictVersion, CurrentTime)));
485+
486+
// Step 4: Retry dispatched
487+
await UpdateProbe.ExpectMsgAsync((OwnerName, conflictVersion));
488+
489+
// Step 5: Retry SUCCEEDS
490+
var grantedVersion = (CurrentVersionCount + 11).ToString();
491+
UpdateProbe.Reply(
492+
new Right<LeaseResource, LeaseResource>(
493+
new LeaseResource(OwnerName, grantedVersion, CurrentTime)));
494+
495+
// Step 6: Now we should get LeaseAcquired
496+
await SenderProbe.ExpectMsgAsync<LeaseActor.LeaseAcquired>();
497+
498+
// Step 7: localGranted should be TRUE — the lease was properly acquired
499+
await AwaitAssertAsync(() =>
500+
{
501+
Granted.Value.Should().BeTrue();
502+
});
503+
});
504+
}
505+
414506
[Fact(DisplayName = "LeaseActor should be able to get lease after failing previous grant update")]
415507
public void AbleToGetLeaseAfterFailingPreviousGrantUpdate()
416508
{

0 commit comments

Comments
 (0)