Skip to content

Commit b51bdf6

Browse files
bnusunnyseshubaws
andauthored
fix: remove fallback in count_running_containers that causes flaky warm container tests
* Update notify-slack.yml to add sleep (aws#8646) * fix: use tag prefix matching to clean up samcli/lambda-* images (aws#8647) Docker's images.list(name='samcli/lambda') does exact repository matching and won't match repositories like 'samcli/lambda-python'. This caused stale images to persist across parameterized test classes, leading to flaky test_download_two_layers failures where Layer2 should overwrite Layer1 but the image was never rebuilt. Fix by: 1. Adding _cleanup_samcli_images() that lists all images and filters by 'samcli/lambda-' tag prefix 2. Using this method in both tearDown and tearDownClass 3. Fixing the same pattern in TestLayerVersionThatDoNotCreateCache * fix: remove fallback in count_running_containers that causes flaky warm container tests The count_running_containers method had a fallback that returned the count of ALL SAM CLI containers when MODE env var filtering found no matches. This caused AssertionError: 3 != 2 when stale containers from other tests were present. Now it strictly counts only containers matching this test's unique MODE UUID and uses exact string matching. * fix: add AWS_DEFAULT_REGION and use -k pattern for parameterized tests * fix: add BY_CANARY=true to enable Docker tests on CI * fix: exclude RemoteLayers tests that need AWS credentials --------- Co-authored-by: seshubaws <116689586+seshubaws@users.noreply.github.com>
1 parent d8b0f82 commit b51bdf6

5 files changed

Lines changed: 86 additions & 40 deletions

File tree

.github/workflows/notify-slack.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@ jobs:
1212
permissions:
1313
contents: read
1414
steps:
15+
- name: Wait for label to be applied
16+
run: sleep 10s
17+
shell: bash
1518
- name: Send External PR Notification
16-
if: github.event_name == 'pull_request' && github.event.label.name == 'pr/external'
19+
if: github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'pr/external')
1720
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2
1821
env:
1922
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
name: Test Warm Containers Fix
2+
3+
on:
4+
pull_request:
5+
branches: [develop]
6+
workflow_dispatch:
7+
8+
permissions:
9+
contents: read
10+
11+
env:
12+
AWS_DEFAULT_REGION: us-east-1
13+
SAM_CLI_DEV: 1
14+
SAM_CLI_TELEMETRY: 0
15+
SAM_CLI_CONTAINER_CONNECTION_TIMEOUT: 60
16+
BY_CANARY: true
17+
18+
jobs:
19+
test-warm-containers:
20+
runs-on: ubuntu-22.04
21+
steps:
22+
- uses: actions/checkout@v4
23+
24+
- uses: actions/setup-python@v5
25+
with:
26+
python-version: "3.11"
27+
28+
- name: Install SAM CLI in dev mode
29+
run: pip install -e ".[dev]"
30+
31+
- name: Run warm containers tests
32+
run: |
33+
pytest -vv --reruns 3 \
34+
-k "TestWarmContainers and not RemoteLayers" \
35+
tests/integration/local/start_api/test_start_api.py \
36+
tests/integration/local/start_lambda/test_start_lambda.py \
37+
--json-report --json-report-file=test-report.json
38+
39+
- name: Upload test report
40+
if: always()
41+
uses: actions/upload-artifact@v4
42+
with:
43+
name: test-report
44+
path: test-report.json

tests/integration/local/invoke/test_integrations_cli.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -803,13 +803,27 @@ class TestLayerVersionBase(InvokeIntegBase):
803803
def setUp(self):
804804
self.layer_cache = Path().home().joinpath("integ_layer_cache")
805805

806+
@staticmethod
807+
def _cleanup_samcli_images(docker_client):
808+
"""Remove all samcli/lambda-* images.
809+
810+
Docker's images.list(name="samcli/lambda") does exact repository matching
811+
and won't match repositories like "samcli/lambda-python". We list all images
812+
and filter by tag prefix instead.
813+
"""
814+
try:
815+
all_images = docker_client.images.list()
816+
for image in all_images:
817+
for tag in image.tags:
818+
if tag.startswith("samcli/lambda-"):
819+
docker_client.remove_image_safely(image.id, force=True)
820+
break
821+
except Exception:
822+
pass
823+
806824
def tearDown(self):
807825
docker_client = get_validated_container_client()
808-
samcli_images = docker_client.images.list(name="samcli/lambda")
809-
for image in samcli_images:
810-
# Use strategy pattern method for runtime-aware image cleanup
811-
docker_client.remove_image_safely(image.id, force=True)
812-
826+
self._cleanup_samcli_images(docker_client)
813827
shutil.rmtree(str(self.layer_cache), ignore_errors=True)
814828

815829
@classmethod
@@ -823,10 +837,7 @@ def tearDownClass(cls):
823837
cls.layer_utils.delete_layers()
824838
# Added to handle the case where ^C failed the test due to invalid cleanup of layers
825839
docker_client = get_validated_container_client()
826-
samcli_images = docker_client.images.list(name="samcli/lambda")
827-
for image in samcli_images:
828-
# Use strategy pattern method for runtime-aware image cleanup
829-
docker_client.remove_image_safely(image.id, force=True)
840+
cls._cleanup_samcli_images(docker_client)
830841
integ_layer_cache_dir = Path().home().joinpath("integ_layer_cache")
831842
if integ_layer_cache_dir.exists():
832843
shutil.rmtree(str(integ_layer_cache_dir))
@@ -1069,12 +1080,18 @@ def setUp(self):
10691080
def tearDown(self):
10701081
docker_client = get_validated_container_client()
10711082

1072-
# Use strategy pattern method for runtime-aware image cleanup
1073-
# This handles both Docker and Finch cleanup strategies automatically
1074-
samcli_images = docker_client.images.list(name="samcli/lambda")
1075-
for image in samcli_images:
1076-
# Use strategy pattern method that handles runtime-specific cleanup logic
1077-
docker_client.remove_image_safely(image.id, force=True)
1083+
# Use the same prefix-based cleanup as TestLayerVersionBase.
1084+
# Docker's images.list(name="samcli/lambda") does exact repository matching
1085+
# and won't match "samcli/lambda-python" etc.
1086+
try:
1087+
all_images = docker_client.images.list()
1088+
for image in all_images:
1089+
for tag in image.tags:
1090+
if tag.startswith("samcli/lambda-"):
1091+
docker_client.remove_image_safely(image.id, force=True)
1092+
break
1093+
except Exception:
1094+
pass
10781095

10791096
def test_layer_does_not_exist(self):
10801097
self.layer_utils.upsert_layer(LayerUtils.generate_layer_name(), "LayerOneArn", "layer1.zip")

tests/integration/local/start_api/test_start_api.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2220,35 +2220,26 @@ def setUp(self):
22202220

22212221
def count_running_containers(self):
22222222
"""Count containers created by this test using Docker client directly."""
2223-
# Use Docker client to find containers with SAM CLI labels
22242223
try:
2225-
# Get running containers with SAM CLI lambda container label
22262224
sam_containers = self.docker_client.containers.list(
22272225
all=False, filters={"label": "sam.cli.container.type=lambda"}
22282226
)
22292227

2230-
# Filter by our test's mode environment variable if possible
22312228
test_containers = []
22322229
for container in sam_containers:
22332230
try:
22342231
container.reload()
22352232
env_vars = container.attrs.get("Config", {}).get("Env", [])
22362233
for env_var in env_vars:
2237-
if env_var.startswith("MODE=") and self.mode_env_variable in env_var:
2234+
if env_var == f"MODE={self.mode_env_variable}":
22382235
test_containers.append(container)
22392236
break
22402237
except Exception:
22412238
continue
22422239

2243-
# If we found containers with our mode variable, return that count
2244-
if test_containers:
2245-
return len(test_containers)
2240+
return len(test_containers)
22462241

2247-
# Otherwise, return all SAM containers (fallback)
2248-
return len(sam_containers)
2249-
2250-
except Exception as e:
2251-
# If we can't access Docker client, fall back to 0
2242+
except Exception:
22522243
return 0
22532244

22542245
def _parse_container_ids_from_output(self):

tests/integration/local/start_lambda/test_start_lambda.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -403,35 +403,26 @@ def setUp(self):
403403

404404
def count_running_containers(self):
405405
"""Count containers created by this test using Docker client directly."""
406-
# Use Docker client to find containers with SAM CLI labels
407406
try:
408-
# Get running containers with SAM CLI lambda container label
409407
sam_containers = self.docker_client.containers.list(
410408
all=False, filters={"label": "sam.cli.container.type=lambda"}
411409
)
412410

413-
# Filter by our test's mode environment variable if possible
414411
test_containers = []
415412
for container in sam_containers:
416413
try:
417414
container.reload()
418415
env_vars = container.attrs.get("Config", {}).get("Env", [])
419416
for env_var in env_vars:
420-
if env_var.startswith("MODE=") and self.mode_env_variable in env_var:
417+
if env_var == f"MODE={self.mode_env_variable}":
421418
test_containers.append(container)
422419
break
423420
except Exception:
424421
continue
425422

426-
# If we found containers with our mode variable, return that count
427-
if test_containers:
428-
return len(test_containers)
423+
return len(test_containers)
429424

430-
# Otherwise, return all SAM containers (fallback)
431-
return len(sam_containers)
432-
433-
except Exception as e:
434-
# If we can't access Docker client, fall back to 0
425+
except Exception:
435426
return 0
436427

437428
def _parse_container_ids_from_output(self):

0 commit comments

Comments
 (0)