Skip to content

Commit 4ecb123

Browse files
fix: make payload size limit execution error
* fix: exceed payload size limit as execution error

  Throw CheckpointError with error_category as INVOCATION for 4xx InvalidParameterValueException errors related to payload size limit exceeded, as these errors are not retryable.

* fix: correct CheckpointError retriable classification

  - Fix inverted is_retriable(): INVOCATION errors are retriable, EXECUTION errors are permanent failures (logic was backwards)
  - Classify payload size exceeded as EXECUTION (permanent), not INVOCATION — exceeding a size limit is not a transient failure
  - Simplify CheckpointError.from_exception() conditional logic
  - Update error-handling docs to reflect correct retry behavior
  - Update tests to match corrected classification semantics

* style: fix indentation in CheckpointError conditional
1 parent b042739 commit 4ecb123

4 files changed

Lines changed: 74 additions & 67 deletions

File tree

docs/advanced/error-handling.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ The SDK provides several exception types for different failure scenarios.
9898
| `InvocationError` | Yes (by Lambda) | Lambda retries invocation | Transient infrastructure issues |
9999
| `CallbackError` | No | Returns FAILED status | Callback handling failures |
100100
| `StepInterruptedError` | Yes (automatic) | Retries on next invocation | Step interrupted before checkpoint |
101-
| `CheckpointError` | Depends | Retries if 4xx (except invalid token) | Failed to save execution state |
101+
| `CheckpointError` | Depends | Permanent on 4xx non-429 (except invalid checkpoint token); retries otherwise | Failed to save execution state |
102102
| `SerDesError` | No | Returns FAILED status | Serialization failures |
103103

104104
### Base exceptions

src/aws_durable_execution_sdk_python/exceptions.py

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -155,32 +155,25 @@ def from_exception(cls, exception: Exception) -> CheckpointError:
155155
error: AwsErrorObj | None = base.error
156156
error_category: CheckpointErrorCategory = CheckpointErrorCategory.INVOCATION
157157

158-
# InvalidParameterValueException and error message starts with "Invalid Checkpoint Token" is an InvocationError
159-
# all other 4xx errors are Execution Errors and should be retried
160-
# all 5xx errors are Invocation Errors
158+
# 4xx errors (except 429) are permanent failures (EXECUTION), unless it's an
159+
# InvalidParameterValueException with "Invalid Checkpoint Token" which is retriable (INVOCATION).
160+
# 5xx, 429, and network errors are retriable (INVOCATION).
161161
status_code: int | None = (metadata and metadata.get("HTTPStatusCode")) or None
162162
if (
163163
status_code
164-
# if we are in 4xx range (except 429) and is not an InvalidParameterValueException with Invalid Checkpoint Token
165-
# then it's an execution error
166-
and status_code < SERVICE_ERROR
167-
and status_code >= BAD_REQUEST_ERROR
164+
and BAD_REQUEST_ERROR <= status_code < SERVICE_ERROR
168165
and status_code != TOO_MANY_REQUESTS_ERROR
169166
and error
170-
and (
171-
# is not InvalidParam => Execution
172-
(error.get("Code", "") or "") != "InvalidParameterValueException"
173-
# is not Invalid Token => Execution
174-
or not (error.get("Message") or "").startswith(
175-
"Invalid Checkpoint Token"
176-
)
167+
and not (
168+
(error.get("Code") or "") == "InvalidParameterValueException"
169+
and (error.get("Message") or "").startswith("Invalid Checkpoint Token")
177170
)
178171
):
179172
error_category = CheckpointErrorCategory.EXECUTION
180173
return CheckpointError(str(exception), error_category, error, metadata)
181174

182175
def is_retriable(self):
183-
return self.error_category == CheckpointErrorCategory.EXECUTION
176+
return self.error_category == CheckpointErrorCategory.INVOCATION
184177

185178

186179
class ValidationError(DurableExecutionsError):

tests/exceptions_test.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,23 @@ def test_checkpoint_error_classification_invalid_token_invocation():
6767
result = CheckpointError.from_exception(client_error)
6868

6969
assert result.error_category == CheckpointErrorCategory.INVOCATION
70+
assert result.is_retriable()
71+
72+
73+
def test_checkpoint_error_classification_payload_size_exceeded_execution():
74+
"""Test 4xx InvalidParameterValueException with STEP output payload size limit exceeded is execution error."""
75+
error_response = {
76+
"Error": {
77+
"Code": "InvalidParameterValueException",
78+
"Message": "STEP output payload size must be less than or equal to 262144 bytes.",
79+
},
80+
"ResponseMetadata": {"HTTPStatusCode": 400},
81+
}
82+
client_error = ClientError(error_response, "Checkpoint")
83+
84+
result = CheckpointError.from_exception(client_error)
85+
86+
assert result.error_category == CheckpointErrorCategory.EXECUTION
7087
assert not result.is_retriable()
7188

7289

@@ -81,7 +98,7 @@ def test_checkpoint_error_classification_other_4xx_execution():
8198
result = CheckpointError.from_exception(client_error)
8299

83100
assert result.error_category == CheckpointErrorCategory.EXECUTION
84-
assert result.is_retriable()
101+
assert not result.is_retriable()
85102

86103

87104
def test_checkpoint_error_classification_429_invocation():
@@ -95,7 +112,7 @@ def test_checkpoint_error_classification_429_invocation():
95112
result = CheckpointError.from_exception(client_error)
96113

97114
assert result.error_category == CheckpointErrorCategory.INVOCATION
98-
assert not result.is_retriable()
115+
assert result.is_retriable()
99116

100117

101118
def test_checkpoint_error_classification_invalid_param_without_token_execution():
@@ -112,7 +129,7 @@ def test_checkpoint_error_classification_invalid_param_without_token_execution()
112129
result = CheckpointError.from_exception(client_error)
113130

114131
assert result.error_category == CheckpointErrorCategory.EXECUTION
115-
assert result.is_retriable()
132+
assert not result.is_retriable()
116133

117134

118135
def test_checkpoint_error_classification_5xx_invocation():
@@ -126,7 +143,7 @@ def test_checkpoint_error_classification_5xx_invocation():
126143
result = CheckpointError.from_exception(client_error)
127144

128145
assert result.error_category == CheckpointErrorCategory.INVOCATION
129-
assert not result.is_retriable()
146+
assert result.is_retriable()
130147

131148

132149
def test_checkpoint_error_classification_unknown_invocation():
@@ -136,7 +153,7 @@ def test_checkpoint_error_classification_unknown_invocation():
136153
result = CheckpointError.from_exception(unknown_error)
137154

138155
assert result.error_category == CheckpointErrorCategory.INVOCATION
139-
assert not result.is_retriable()
156+
assert result.is_retriable()
140157

141158

142159
def test_validation_error():

tests/execution_test.py

Lines changed: 43 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,8 +1066,9 @@ def test_handler(event: Any, context: DurableContext) -> dict:
10661066
# Make the service client checkpoint call fail with CheckpointError
10671067
mock_client.checkpoint.side_effect = failing_checkpoint
10681068

1069-
with pytest.raises(CheckpointError, match="Background checkpoint failed"):
1070-
test_handler(invocation_input, lambda_context)
1069+
response = test_handler(invocation_input, lambda_context)
1070+
assert response["Status"] == InvocationStatus.FAILED.value
1071+
assert response["Error"]["ErrorType"] == "CheckpointError"
10711072

10721073

10731074
# endregion durable_execution
@@ -1120,16 +1121,13 @@ def slow_background():
11201121
"aws_durable_execution_sdk_python.state.ExecutionState.checkpoint_batches_forever",
11211122
side_effect=slow_background,
11221123
):
1123-
with pytest.raises(CheckpointError, match="Checkpoint system failed"):
1124-
test_handler(invocation_input, lambda_context)
1125-
1124+
response = test_handler(invocation_input, lambda_context)
1125+
assert response["Status"] == InvocationStatus.FAILED.value
1126+
assert response["Error"]["ErrorType"] == "CheckpointError"
11261127

1127-
def test_durable_execution_checkpoint_invocation_error_stops_background():
1128-
"""Test that CheckpointError handler stops background checkpointing.
11291128

1130-
When user code raises CheckpointError, the handler should stop the background
1131-
thread before re-raising to terminate the Lambda.
1132-
"""
1129+
def test_durable_execution_checkpoint_invocation_error_retries():
1130+
"""Test that CheckpointError with INVOCATION category re-raises to trigger Lambda retry."""
11331131
mock_client = Mock(spec=DurableServiceClient)
11341132

11351133
@durable_execution
@@ -1171,13 +1169,12 @@ def slow_background():
11711169
"aws_durable_execution_sdk_python.state.ExecutionState.checkpoint_batches_forever",
11721170
side_effect=slow_background,
11731171
):
1174-
response = test_handler(invocation_input, lambda_context)
1175-
assert response["Status"] == InvocationStatus.FAILED.value
1176-
assert response["Error"]["ErrorType"] == "CheckpointError"
1172+
with pytest.raises(CheckpointError, match="Checkpoint system failed"):
1173+
test_handler(invocation_input, lambda_context)
11771174

11781175

1179-
def test_durable_execution_background_thread_execution_error_retries():
1180-
"""Test that background thread Execution errors are retried (re-raised)."""
1176+
def test_durable_execution_background_thread_execution_error_returns_failed():
1177+
"""Test that background thread Execution errors return FAILED (permanent, no retry)."""
11811178
mock_client = Mock(spec=DurableServiceClient)
11821179

11831180
def failing_checkpoint(*args, **kwargs):
@@ -1215,12 +1212,13 @@ def test_handler(event: Any, context: DurableContext) -> dict:
12151212

12161213
mock_client.checkpoint.side_effect = failing_checkpoint
12171214

1218-
with pytest.raises(CheckpointError, match="Background checkpoint failed"):
1219-
test_handler(invocation_input, lambda_context)
1215+
response = test_handler(invocation_input, lambda_context)
1216+
assert response["Status"] == InvocationStatus.FAILED.value
1217+
assert response["Error"]["ErrorType"] == "CheckpointError"
12201218

12211219

1222-
def test_durable_execution_background_thread_invocation_error_returns_failed():
1223-
"""Test that background thread Invocation errors return FAILED status."""
1220+
def test_durable_execution_background_thread_invocation_error_retries():
1221+
"""Test that background thread Invocation errors re-raise to trigger Lambda retry."""
12241222
mock_client = Mock(spec=DurableServiceClient)
12251223

12261224
def failing_checkpoint(*args, **kwargs):
@@ -1258,13 +1256,12 @@ def test_handler(event: Any, context: DurableContext) -> dict:
12581256

12591257
mock_client.checkpoint.side_effect = failing_checkpoint
12601258

1261-
response = test_handler(invocation_input, lambda_context)
1262-
assert response["Status"] == InvocationStatus.FAILED.value
1263-
assert response["Error"]["ErrorType"] == "CheckpointError"
1259+
with pytest.raises(CheckpointError, match="Background checkpoint failed"):
1260+
test_handler(invocation_input, lambda_context)
12641261

12651262

1266-
def test_durable_execution_final_success_checkpoint_execution_error_retries():
1267-
"""Test that execution errors on final success checkpoint trigger retry."""
1263+
def test_durable_execution_final_success_checkpoint_execution_error_returns_failed():
1264+
"""Test that execution errors on final success checkpoint return FAILED (permanent, no retry)."""
12681265
mock_client = Mock(spec=DurableServiceClient)
12691266

12701267
def failing_final_checkpoint(*args, **kwargs):
@@ -1303,12 +1300,13 @@ def test_handler(event: Any, context: DurableContext) -> dict:
13031300

13041301
mock_client.checkpoint.side_effect = failing_final_checkpoint
13051302

1306-
with pytest.raises(CheckpointError, match="Final checkpoint failed"):
1307-
test_handler(invocation_input, lambda_context)
1303+
response = test_handler(invocation_input, lambda_context)
1304+
assert response["Status"] == InvocationStatus.FAILED.value
1305+
assert response["Error"]["ErrorType"] == "CheckpointError"
13081306

13091307

1310-
def test_durable_execution_final_success_checkpoint_invocation_error_returns_failed():
1311-
"""Test that invocation errors on final success checkpoint return FAILED."""
1308+
def test_durable_execution_final_success_checkpoint_invocation_error_retries():
1309+
"""Test that invocation errors on final success checkpoint re-raise to trigger Lambda retry."""
13121310
mock_client = Mock(spec=DurableServiceClient)
13131311

13141312
def failing_final_checkpoint(*args, **kwargs):
@@ -1348,14 +1346,12 @@ def test_handler(event: Any, context: DurableContext) -> dict:
13481346

13491347
mock_client.checkpoint.side_effect = failing_final_checkpoint
13501348

1351-
response = test_handler(invocation_input, lambda_context)
1352-
assert response["Status"] == InvocationStatus.FAILED.value
1353-
assert response["Error"]["ErrorType"] == "CheckpointError"
1354-
assert response["Error"]["ErrorMessage"] == "Final checkpoint failed"
1349+
with pytest.raises(CheckpointError, match="Final checkpoint failed"):
1350+
test_handler(invocation_input, lambda_context)
13551351

13561352

1357-
def test_durable_execution_final_failure_checkpoint_execution_error_retries():
1358-
"""Test that execution errors on final failure checkpoint trigger retry."""
1353+
def test_durable_execution_final_failure_checkpoint_execution_error_returns_failed():
1354+
"""Test that execution errors on final failure checkpoint return FAILED (permanent, no retry)."""
13591355
mock_client = Mock(spec=DurableServiceClient)
13601356

13611357
def failing_final_checkpoint(*args, **kwargs):
@@ -1396,12 +1392,13 @@ def test_handler(event: Any, context: DurableContext) -> dict:
13961392

13971393
mock_client.checkpoint.side_effect = failing_final_checkpoint
13981394

1399-
with pytest.raises(CheckpointError, match="Final checkpoint failed"):
1400-
test_handler(invocation_input, lambda_context)
1395+
response = test_handler(invocation_input, lambda_context)
1396+
assert response["Status"] == InvocationStatus.FAILED.value
1397+
assert response["Error"]["ErrorType"] == "CheckpointError"
14011398

14021399

1403-
def test_durable_execution_final_failure_checkpoint_invocation_error_returns_failed():
1404-
"""Test that invocation errors on final failure checkpoint return FAILED."""
1400+
def test_durable_execution_final_failure_checkpoint_invocation_error_retries():
1401+
"""Test that invocation errors on final failure checkpoint re-raise to trigger Lambda retry."""
14051402
mock_client = Mock(spec=DurableServiceClient)
14061403

14071404
def failing_final_checkpoint(*args, **kwargs):
@@ -1442,10 +1439,8 @@ def test_handler(event: Any, context: DurableContext) -> dict:
14421439

14431440
mock_client.checkpoint.side_effect = failing_final_checkpoint
14441441

1445-
response = test_handler(invocation_input, lambda_context)
1446-
assert response["Status"] == InvocationStatus.FAILED.value
1447-
assert response["Error"]["ErrorType"] == "CheckpointError"
1448-
assert response["Error"]["ErrorMessage"] == "Final checkpoint failed"
1442+
with pytest.raises(CheckpointError, match="Final checkpoint failed"):
1443+
test_handler(invocation_input, lambda_context)
14491444

14501445

14511446
def test_durable_handler_background_thread_failure_on_succeed_checkpoint():
@@ -1809,8 +1804,9 @@ def test_handler(event: Any, context: DurableContext) -> dict:
18091804
mock_client.checkpoint.side_effect = failing_checkpoint
18101805

18111806
with patch("aws_durable_execution_sdk_python.execution.logger", mock_logger):
1812-
with pytest.raises(CheckpointError):
1813-
test_handler(invocation_input, lambda_context)
1807+
response = test_handler(invocation_input, lambda_context)
1808+
assert response["Status"] == InvocationStatus.FAILED.value
1809+
assert response["Error"]["ErrorType"] == "CheckpointError"
18141810

18151811
mock_logger.exception.assert_called_once()
18161812
call_args = mock_logger.exception.call_args
@@ -1922,8 +1918,9 @@ def test_handler(event: Any, context: DurableContext) -> dict:
19221918
lambda_context.tenant_id = None
19231919

19241920
with patch("aws_durable_execution_sdk_python.execution.logger", mock_logger):
1925-
with pytest.raises(CheckpointError):
1926-
test_handler(invocation_input, lambda_context)
1921+
response = test_handler(invocation_input, lambda_context)
1922+
assert response["Status"] == InvocationStatus.FAILED.value
1923+
assert response["Error"]["ErrorType"] == "CheckpointError"
19271924

19281925
mock_logger.exception.assert_called_once()
19291926
call_args = mock_logger.exception.call_args

0 commit comments

Comments
 (0)