Skip to content

Commit a9582b4

Browse files
authored
Record terraform run result metadata (#1094)
### Context We want to record terraform action success/failure or timeout error for stats purpose. Add logic to scan terraform logs to capture those info. ### Change - Add a new field `result` to terraform execution metadata record two flags `"success": true/false` and `"timed_out": true/false }` - Improve the code so time elapsed can be captured even when the terraform action failed. - Update terraform azapi timeout to 60 mins, and corresponding pipeline timeout to 2hr See `result` field appended: <img width="2586" height="91" alt="image" src="https://github.com/user-attachments/assets/df3c1cc0-bbde-4f2e-90f8-cd3a24fd3872" />
1 parent 39a5f6f commit a9582b4

File tree

7 files changed

+214
-21
lines changed

7 files changed

+214
-21
lines changed

modules/python/terraform/extract_terraform_operation_metadata.py

Lines changed: 69 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,18 @@
44
import os
55
import datetime
66

7-
# Regex to extract: full module path and time
8-
PATTERN = re.compile(r"(module\.[^:]+): (?:Creation|Destruction) complete after (\d+h\d+m\d+s|\d+h\d+s|\d+m\d+s|\d+s)")
7+
# Regex to extract: full module path and time from completion lines
8+
COMPLETE_PATTERN = re.compile(r"(module\.[^:]+): (?:Creation|Destruction) complete after (\d+h\d+m\d+s|\d+h\d+s|\d+m\d+s|\d+s)")
9+
10+
# Regex to extract: full module path from initial "Creating"/"Destroying" lines
11+
# e.g. module.azapi["ccp-provisioning-H4"].azapi_resource.aks_cluster: Creating...
12+
# e.g. module.azapi["ccp-provisioning-H4"].azapi_resource.aks_cluster: Destroying
13+
START_PATTERN = re.compile(r"(module\.[^:]+): (?:Creating|Destroying)")
14+
15+
# Regex to extract: elapsed time from "Still creating/destroying..." lines
16+
# e.g. module.azapi[...].azapi_resource.aks_cluster: Still creating... [29m51s elapsed]
17+
# or: module.azapi[...].azapi_resource.aks_cluster: Still destroying... 00m40s elapsed]
18+
ELAPSED_PATTERN = re.compile(r"Still (?:creating|destroying)\.\.\. \[?(\d+h\d+m\d+s|\d+h\d+s|\d+m\d+s|\d+s) elapsed\]")
919

1020
def time_to_seconds(time_str):
1121
try:
@@ -33,7 +43,30 @@ def parse_module_path(full_path):
3343

3444
return module_name, submodule_path, resource_name
3545

46+
def build_result(full_path, time_str, run_id, _command_type, _scenario_type, _scenario_name, completed, timed_out):
47+
seconds = time_to_seconds(time_str)
48+
module, submodule, resource = parse_module_path(full_path)
49+
50+
return {
51+
"timestamp": datetime.datetime.now().isoformat(),
52+
"run_id": run_id,
53+
"scenario_type": _scenario_type,
54+
"scenario_name": _scenario_name,
55+
"module_name": module,
56+
"submodule_name": submodule,
57+
"resource_name": resource,
58+
"action": _command_type,
59+
"time_taken_seconds": seconds,
60+
"result": {"success": completed, "timed_out": timed_out}
61+
}
62+
3663
def process_terraform_logs(log_path, _command_type, _scenario_type, _scenario_name):
64+
"""Process terraform log file and collect operation metadata for each run.
65+
A single log file may contain multiple runs of the same module when failures
66+
trigger retries. Each run starts with a "Creating..."/"Destroying..." line.
67+
We collect data for all runs: successful runs use the completion time, and
68+
failed runs use the last reported elapsed time as a fallback.
69+
"""
3770
log_file = os.path.join(log_path, f"terraform_{_command_type}.log")
3871
run_id = os.getenv("RUN_ID", "")
3972
results = []
@@ -43,25 +76,44 @@ def process_terraform_logs(log_path, _command_type, _scenario_type, _scenario_na
4376
return results
4477

4578
try:
79+
current_full_path = None
80+
current_elapsed_time_str = None
81+
current_completed = False
82+
current_timed_out = False
83+
4684
with open(log_file, "r", encoding='utf-8') as f:
4785
for line in f:
48-
match = PATTERN.search(line)
86+
# Check for timeout on every line before any continue statements
87+
if "context deadline exceeded" in line:
88+
current_timed_out = True
89+
90+
start_match = START_PATTERN.search(line)
91+
if start_match:
92+
# A new run is starting; flush the previous run if it was incomplete
93+
if current_full_path and not current_completed and current_elapsed_time_str:
94+
results.append(build_result(current_full_path, current_elapsed_time_str, run_id, _command_type, _scenario_type, _scenario_name, False, current_timed_out))
95+
96+
current_full_path = start_match.group(1)
97+
current_elapsed_time_str = None
98+
current_completed = False
99+
current_timed_out = False
100+
continue
101+
102+
match = COMPLETE_PATTERN.search(line)
49103
if match:
50104
full_path, time_str = match.groups()
51-
seconds = time_to_seconds(time_str)
52-
module, submodule, resource = parse_module_path(full_path)
53-
54-
results.append({
55-
"timestamp": datetime.datetime.now().isoformat(),
56-
"run_id": run_id,
57-
"scenario_type": _scenario_type,
58-
"scenario_name": _scenario_name,
59-
"module_name": module,
60-
"submodule_name": submodule,
61-
"resource_name": resource,
62-
"action": _command_type,
63-
"time_taken_seconds": seconds
64-
})
105+
current_completed = True
106+
results.append(build_result(full_path, time_str, run_id, _command_type, _scenario_type, _scenario_name, True, False))
107+
continue
108+
109+
elapsed_match = ELAPSED_PATTERN.search(line)
110+
if elapsed_match:
111+
current_elapsed_time_str = elapsed_match.group(1)
112+
continue
113+
114+
# Flush the last run if it was incomplete
115+
if current_full_path and not current_completed and current_elapsed_time_str:
116+
results.append(build_result(current_full_path, current_elapsed_time_str, run_id, _command_type, _scenario_type, _scenario_name, False, current_timed_out))
65117
except Exception as e:
66118
print(f"[ERROR] Failed to process log file '{log_file}': {e}")
67119

modules/python/tests/test_extract_terraform_operation_metadata.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,5 +169,142 @@ def test_process_terraform_logs_with_invalid_log_line(self, mock_open_file, mock
169169
mock_open_file.assert_called_once_with('/fake/path/terraform_apply.log', 'r', encoding='utf-8')
170170
mock_isfile.assert_called_once_with("/fake/path/terraform_apply.log")
171171

172+
@patch("os.path.isfile", return_value=True)
173+
@patch("builtins.open", new_callable=mock_open, read_data=(
174+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
175+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [10m0s elapsed]\n"
176+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... 20m0s elapsed]\n"
177+
"│ Error: Failed to create/update resource\n"
178+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
179+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [15m0s elapsed]\n"
180+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... 25m30s elapsed]\n"
181+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
182+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [5m0s elapsed]\n"
183+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... 30m45s elapsed]\n"
184+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creation complete after 30m55s [id=/subscriptions/b8ceb4e5-f05b-4562-a9f5-14acb1f24219/resourceGroups/57202-578c08a2/providers/Microsoft.ContainerService/managedClusters/ccp-provisioning-H4]\n"
185+
))
186+
def test_process_terraform_logs_with_failed_run_and_retries(self, mock_open_file, mock_isfile):
187+
os.environ["RUN_ID"] = "9988776655"
188+
189+
results = process_terraform_logs(
190+
log_path="/fake/path",
191+
_command_type="apply",
192+
_scenario_type="perf-eval",
193+
_scenario_name="test_scenario_name",
194+
)
195+
196+
self.assertEqual(len(results), 3)
197+
198+
# First failed run - last elapsed was 20m0s = 1200s
199+
self.assertEqual(results[0]["module_name"], "azapi[\"ccp\"]")
200+
self.assertEqual(results[0]["resource_name"], "aks_cluster")
201+
self.assertEqual(results[0]["time_taken_seconds"], 1200)
202+
self.assertEqual(results[0]["result"], {"success": False, "timed_out": False})
203+
204+
# Second failed run (retry 1) - last elapsed was 25m30s = 1530s
205+
self.assertEqual(results[1]["module_name"], "azapi[\"ccp\"]")
206+
self.assertEqual(results[1]["resource_name"], "aks_cluster")
207+
self.assertEqual(results[1]["time_taken_seconds"], 1530)
208+
self.assertEqual(results[1]["result"], {"success": False, "timed_out": False})
209+
210+
# Third failed run (retry 2) - last elapsed was 30m45s = 1845s
211+
self.assertEqual(results[2]["module_name"], "azapi[\"ccp\"]")
212+
self.assertEqual(results[2]["resource_name"], "aks_cluster")
213+
self.assertEqual(results[2]["time_taken_seconds"], 1855)
214+
self.assertEqual(results[2]["result"], {"success": True, "timed_out": False})
215+
216+
mock_open_file.assert_called_once_with('/fake/path/terraform_apply.log', 'r', encoding='utf-8')
217+
mock_isfile.assert_called_once_with("/fake/path/terraform_apply.log")
218+
219+
@patch("os.path.isfile", return_value=True)
220+
@patch("builtins.open", new_callable=mock_open, read_data=(
221+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
222+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [5m0s elapsed]\n"
223+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [10m0s elapsed]\n"
224+
"creating/updating Resource: (ResourceId\n"
225+
"/subscriptions/b8ceb4e5/resourceGroups/58296/providers/Microsoft.ContainerService/managedClusters/ccp-H8\n"
226+
"│ Error: creating/updating Resource: context deadline exceeded\n"
227+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
228+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [5m0s elapsed]\n"
229+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creation complete after 8m30s [id=/subscriptions/b8ceb4e5/resourceGroups/58296/providers/Microsoft.ContainerService/managedClusters/ccp-H8]\n"
230+
))
231+
def test_process_terraform_logs_with_timeout_and_retry(self, mock_open_file, mock_isfile):
232+
os.environ["RUN_ID"] = "1234567890"
233+
234+
results = process_terraform_logs(
235+
log_path="/fake/path",
236+
_command_type="apply",
237+
_scenario_type="perf-eval",
238+
_scenario_name="test_scenario_name",
239+
)
240+
241+
self.assertEqual(len(results), 2)
242+
243+
# First run timed out
244+
self.assertEqual(results[0]["module_name"], "azapi[\"ccp\"]")
245+
self.assertEqual(results[0]["resource_name"], "aks_cluster")
246+
self.assertEqual(results[0]["time_taken_seconds"], 600)
247+
self.assertEqual(results[0]["result"], {"success": False, "timed_out": True})
248+
249+
# Retry succeeded
250+
self.assertEqual(results[1]["module_name"], "azapi[\"ccp\"]")
251+
self.assertEqual(results[1]["resource_name"], "aks_cluster")
252+
self.assertEqual(results[1]["time_taken_seconds"], 510)
253+
self.assertEqual(results[1]["result"], {"success": True, "timed_out": False})
254+
255+
mock_open_file.assert_called_once_with('/fake/path/terraform_apply.log', 'r', encoding='utf-8')
256+
mock_isfile.assert_called_once_with("/fake/path/terraform_apply.log")
257+
258+
@patch("os.path.isfile", return_value=True)
259+
@patch("builtins.open", new_callable=mock_open, read_data=(
260+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
261+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [5m0s elapsed]\n"
262+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [10m0s elapsed]\n"
263+
"│ Error: Failed to create/update resource\n"
264+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
265+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [5m0s elapsed]\n"
266+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [10m0s elapsed]\n"
267+
"│ Error: creating/updating Resource: context deadline exceeded\n"
268+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Creating...\n"
269+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [5m0s elapsed]\n"
270+
"module.azapi[\"ccp\"].azapi_resource.aks_cluster: Still creating... [10m0s elapsed]\n"
271+
"│ Error: creating/updating Resource: context deadline exceeded\n"
272+
))
273+
def test_process_terraform_logs_with_all_retries_failed(self, mock_open_file, mock_isfile):
274+
os.environ["RUN_ID"] = "1112223334"
275+
276+
results = process_terraform_logs(
277+
log_path="/fake/path",
278+
_command_type="apply",
279+
_scenario_type="perf-eval",
280+
_scenario_name="test_scenario_name",
281+
)
282+
283+
self.assertEqual(len(results), 3)
284+
285+
# First run failed with a non-timeout error
286+
self.assertEqual(results[0]["run_id"], "1112223334")
287+
self.assertEqual(results[0]["module_name"], "azapi[\"ccp\"]")
288+
self.assertEqual(results[0]["resource_name"], "aks_cluster")
289+
self.assertEqual(results[0]["time_taken_seconds"], 600)
290+
self.assertEqual(results[0]["result"], {"success": False, "timed_out": False})
291+
292+
# Second run timed out
293+
self.assertEqual(results[1]["run_id"], "1112223334")
294+
self.assertEqual(results[1]["module_name"], "azapi[\"ccp\"]")
295+
self.assertEqual(results[1]["resource_name"], "aks_cluster")
296+
self.assertEqual(results[1]["time_taken_seconds"], 600)
297+
self.assertEqual(results[1]["result"], {"success": False, "timed_out": True})
298+
299+
# Third run timed out
300+
self.assertEqual(results[2]["run_id"], "1112223334")
301+
self.assertEqual(results[2]["module_name"], "azapi[\"ccp\"]")
302+
self.assertEqual(results[2]["resource_name"], "aks_cluster")
303+
self.assertEqual(results[2]["time_taken_seconds"], 600)
304+
self.assertEqual(results[2]["result"], {"success": False, "timed_out": True})
305+
306+
mock_open_file.assert_called_once_with('/fake/path/terraform_apply.log', 'r', encoding='utf-8')
307+
mock_isfile.assert_called_once_with("/fake/path/terraform_apply.log")
308+
172309
if __name__ == "__main__":
173310
unittest.main()

modules/terraform/azure/azapi/main.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,8 @@ resource "azapi_resource" "aks_cluster" {
6262
response_export_values = [
6363
"properties.fqdn"
6464
]
65+
66+
timeouts {
67+
create = "60m"
68+
}
6569
}

pipelines/perf-eval/Hyperscale Cluster Benchmark/ccp-provisioning-H2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,5 @@ stages:
3232
max_parallel: 1
3333
credential_type: service_connection
3434
ssh_key_enabled: false
35-
timeout_in_minutes: 60
35+
timeout_in_minutes: 120
3636
retry_attempt_count: 0

pipelines/perf-eval/Hyperscale Cluster Benchmark/ccp-provisioning-H4.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,5 @@ stages:
3232
max_parallel: 1
3333
credential_type: service_connection
3434
ssh_key_enabled: false
35-
timeout_in_minutes: 60
35+
timeout_in_minutes: 120
3636
retry_attempt_count: 0

pipelines/perf-eval/Hyperscale Cluster Benchmark/ccp-provisioning-H8.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,5 @@ stages:
3232
max_parallel: 1
3333
credential_type: service_connection
3434
ssh_key_enabled: false
35-
timeout_in_minutes: 60
35+
timeout_in_minutes: 120
3636
retry_attempt_count: 0

steps/terraform/run-command.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ steps:
4141
fi
4242
4343
set +e
44-
terraform ${{ parameters.command }} --auto-approve ${{ parameters.arguments }} -var-file $terraform_input_file -var json_input="$terraform_input_variables" | tee terraform_${{ parameters.command }}.log
44+
terraform ${{ parameters.command }} --auto-approve ${{ parameters.arguments }} -var-file $terraform_input_file -var json_input="$terraform_input_variables" 2>&1 | tee terraform_${{ parameters.command }}.log
4545
exit_code=${PIPESTATUS[0]}
4646
if [[ $exit_code -ne 0 ]]; then
4747
if [[ ${{ parameters.command }} == "apply" && "$CLOUD" == "azure" ]]; then

0 commit comments

Comments
 (0)