-
Notifications
You must be signed in to change notification settings - Fork 208
Expand file tree
/
Copy pathsub-deploy-integration-tests-gcp.yml
More file actions
756 lines (684 loc) · 34.4 KB
/
sub-deploy-integration-tests-gcp.yml
File metadata and controls
756 lines (684 loc) · 34.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
name: Deploy Tests to GCP
on:
workflow_call:
inputs:
# Status and logging
test_id:
required: true
type: string
description: 'Unique identifier for the test'
test_description:
required: true
type: string
description: 'Explains what the test does'
height_grep_text:
required: false
type: string
description: 'Regular expression to find the tip height in test logs, and add it to newly created cached state image metadata'
# Test selection and parameters
test_variables:
required: true
type: string
description: 'Environmental variables used to select and configure the test'
network:
required: false
type: string
default: Mainnet
description: 'Zcash network to test against'
is_long_test:
required: false
type: boolean
default: false
description: 'Does this test need multiple run jobs? (Does it run longer than 6 hours?)'
# Cached state
#
zebra_state_dir:
required: false
type: string
default: '/home/zebra/.cache/zebra'
description: 'Zebra cached state directory and input image prefix to search in GCP'
lwd_state_dir:
required: false
type: string
default: '/home/zebra/.cache/lwd'
description: 'Lightwalletd cached state directory and input image prefix to search in GCP'
disk_prefix:
required: false
type: string
default: 'zebrad-cache'
description: 'Image name prefix, and `zebra_state_dir` name for newly created cached states'
disk_suffix:
required: false
type: string
default: 'tip'
description: 'Image name suffix'
needs_zebra_state:
required: true
type: boolean
description: 'Does the test use Zebra cached state?'
needs_lwd_state:
required: false
type: boolean
description: 'Does the test use Lightwalletd and Zebra cached state?'
# main branch states can be outdated and slower, but they can also be more reliable
saves_to_disk:
required: true
type: boolean
description: 'Can this test create new or updated cached state disks?'
force_save_to_disk:
required: false
type: boolean
default: false
description: 'Force this test to create a new or updated cached state disk'
app_name:
required: false
type: string
default: 'zebra'
description: 'Application name, used to work out when a job is an update job'
env:
RUST_LOG: ${{ vars.RUST_LOG }}
RUST_BACKTRACE: ${{ vars.RUST_BACKTRACE }}
RUST_LIB_BACKTRACE: ${{ vars.RUST_LIB_BACKTRACE }}
COLORBT_SHOW_HIDDEN: ${{ vars.COLORBT_SHOW_HIDDEN }}
CARGO_INCREMENTAL: ${{ vars.CARGO_INCREMENTAL }}
# How many previous log lines we show at the start of each new log job.
# Increase this number if some log lines are skipped between jobs
#
# We want to show all the logs since the last job finished,
# but we don't know how long it will be between jobs.
# 200 lines is about 6-15 minutes of sync logs, or one panic log.
EXTRA_LOG_LINES: 200
# How many blocks to wait before creating an updated cached state image.
# 1 day is approximately 1152 blocks.
CACHED_STATE_UPDATE_LIMIT: 576
jobs:
# Find a cached state disk for ${{ inputs.test_id }}, matching all of:
# - disk cached state prefix -> zebrad-cache or lwd-cache
# - state version (from the source code) - v{N}
# - network (network) - mainnet or testnet
# - disk target height kind (disk_suffix) - checkpoint or tip
#
# If the test needs a lightwalletd state (needs_lwd_state) set the input disk_prefix accordingly
# - To lwd-cache if needed
# - To zebrad-cache if not
#
# Passes the disk name to subsequent jobs using `cached_disk_name` output
# Passes the state version to subsequent jobs using `state_version` output
#
get-disk-name:
name: Get disk name
uses: ./.github/workflows/sub-find-cached-disks.yml
if: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) || (inputs.saves_to_disk || inputs.force_save_to_disk) }}
with:
network: ${{ inputs.network || vars.ZCASH_NETWORK }}
disk_prefix: ${{ inputs.needs_lwd_state && 'lwd-cache' || inputs.needs_zebra_state && 'zebrad-cache' }}
disk_suffix: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && inputs.disk_suffix || '' }}
test_id: ${{ inputs.test_id }}
# Show all the test logs, then follow the logs of the test we just launched, until it finishes.
# Then check the result of the test.
#
# If `inputs.is_long_test` is `true`, the timeout is 5 days, otherwise it's 3 hours.
test-result:
name: Run ${{ inputs.test_id }} test
runs-on: zfnd-runners
needs: [ get-disk-name ]
if: ${{ !cancelled() && !failure() && (needs.get-disk-name.result == 'success' || needs.get-disk-name.result == 'skipped') }}
timeout-minutes: ${{ inputs.is_long_test && 7200 || 180 }}
outputs:
cached_disk_name: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && needs.get-disk-name.outputs.cached_disk_name || '' }}
state_version: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && needs.get-disk-name.outputs.state_version || '' }}
container_id: ${{ steps.find-container.outputs.CONTAINER_ID }}
env:
CACHED_DISK_NAME: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && needs.get-disk-name.outputs.cached_disk_name || '' }}
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/checkout@v4.2.2
with:
persist-credentials: false
fetch-depth: '2'
- uses: r7kamura/rust-problem-matchers@v1.5.0
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v5
with:
short-length: 7
- name: Downcase network name for disks and labels
run: |
NETWORK_CAPS="${{ inputs.network }}"
echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"
# Install our SSH secret
- name: Install private SSH key
uses: shimataro/ssh-key-action@v2.7.0
with:
key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
name: google_compute_engine
known_hosts: unnecessary
- name: Generate public SSH key
run: |
sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@v2.1.10
with:
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v2.1.4
# Disk Mounting Logic Explanation:
#
# The following step creates a GCP instance using create-with-container.
# The $CONTAINER_MOUNT_DISKS variable, constructed within the run script,
# defines how the created persistent disk (specified in $DISK_PARAMS)
# is mounted into the test container using --container-mount-disk.
#
# If the test needs Lightwalletd state (inputs.needs_lwd_state is true or test_id is lwd-sync-full),
# the same persistent disk is mounted to both the Zebra state path (inputs.zebra_state_dir)
# and the Lightwalletd state path (inputs.lwd_state_dir).
#
# Using a single disk simplifies the VM and container setup.
# Mounting the same disk to multiple paths doesn't cause conflicts because Zebra and
# lightwalletd create different subdirectories for their data within the mounted volume.
# (However, Zebra, lightwalletd, and the test harness must not delete the whole cache directory root.)
#
# The container mount paths (inputs.zebra_state_dir and inputs.lwd_state_dir) must match
# the paths expected by the tests in Rust (also configured in ci-unit-tests-docker.yml).
# The application inside the container will typically use environment variables (like those set in
# $CONTAINER_ENV) or these known paths to access the state data.
- name: Create ${{ inputs.test_id }} GCP compute instance
id: create-instance
run: |
NAME="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}"
# Create disk separately if using cached image, or prepare params for --create-disk if new
if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
# Create disk from cached image separately (allows partition mounting)
echo "Creating disk ${NAME} from cached image ${{ env.CACHED_DISK_NAME }}"
gcloud compute disks create "${NAME}" \
--size=400GB \
--type=pd-balanced \
--image="${{ env.CACHED_DISK_NAME }}" \
--zone=${{ vars.GCP_ZONE }}
DISK_ATTACH_PARAMS="--disk=name=${NAME},device-name=${NAME}"
else
# Use --create-disk for new disks (no partition support)
DISK_PARAMS="size=400GB,type=pd-balanced,name=${NAME},device-name=${NAME}"
DISK_ATTACH_PARAMS="--create-disk=${DISK_PARAMS}"
fi
# Mount the disk(s) to the container
# This partition=1 logic differentiates between disk types:
# - Only Zebra tip disks (from full sync) have partitions and need partition=1
# - LWD disks never have partitions
# - Checkpoint disks don't have partitions
# TODO: Consider removing this logic once all cached disk images use consistent partitioning.
# Determine if we should use partition=1 based on specific test requirements
# Default to safe approach: no partitions unless explicitly whitelisted
USE_PARTITION="false"
if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
# All tests that use Zebra tip disks (which have partitions)
if [[ "${{ inputs.test_id }}" == "sync-update-mainnet" ]] || \
[[ "${{ inputs.test_id }}" == "sync-full-mainnet" ]] || \
[[ "${{ inputs.test_id }}" == "generate-checkpoints-mainnet" ]] || \
[[ "${{ inputs.test_id }}" == "lwd-rpc-test" ]] || \
[[ "${{ inputs.test_id }}" == "rpc-get-block-template" ]] || \
[[ "${{ inputs.test_id }}" == "rpc-submit-block" ]]; then
USE_PARTITION="true"
echo "Using Zebra tip disk with partition=1: ${{ env.CACHED_DISK_NAME }}"
# All other tests default to no partition for safety
else
USE_PARTITION="false"
echo "Using cached disk without partition (safe default): ${{ env.CACHED_DISK_NAME }}"
fi
fi
# Mount zebra state directory
if [[ "$USE_PARTITION" == "true" ]]; then
CONTAINER_MOUNT_DISKS="--container-mount-disk=mount-path=${{ inputs.zebra_state_dir }},name=${NAME},mode=rw,partition=1"
else
CONTAINER_MOUNT_DISKS="--container-mount-disk=mount-path=${{ inputs.zebra_state_dir }},name=${NAME},mode=rw"
fi
# Mount the same disk to the lwd path if needed
if [[ "${{ inputs.needs_lwd_state }}" == "true" || "${{ inputs.test_id }}" == "lwd-sync-full" ]]; then
if [[ "$USE_PARTITION" == "true" ]]; then
CONTAINER_MOUNT_DISKS+=" --container-mount-disk=mount-path=${{ inputs.lwd_state_dir }},name=${NAME},mode=rw,partition=1"
else
CONTAINER_MOUNT_DISKS+=" --container-mount-disk=mount-path=${{ inputs.lwd_state_dir }},name=${NAME},mode=rw"
fi
fi
# Environment variables for the container
CONTAINER_ENV="${{ inputs.test_variables }},RUST_LOG=${{ env.RUST_LOG }},RUST_BACKTRACE=${{ env.RUST_BACKTRACE }},RUST_LIB_BACKTRACE=${{ env.RUST_LIB_BACKTRACE }},COLORBT_SHOW_HIDDEN=${{ env.COLORBT_SHOW_HIDDEN }},CARGO_INCREMENTAL=${{ env.CARGO_INCREMENTAL }}"
# Trim whitespace from GAR_BASE as for some reason it's getting a trailing space
GAR_BASE_TRIMMED=$(echo "${{ vars.GAR_BASE }}" | xargs)
gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \
--machine-type ${{ inputs.is_long_test && vars.GCP_LARGE_MACHINE || vars.GCP_SMALL_MACHINE }} \
--boot-disk-size=50GB \
--boot-disk-type=pd-ssd \
--image-project=cos-cloud \
--image-family=cos-stable \
${DISK_ATTACH_PARAMS} \
${CONTAINER_MOUNT_DISKS} \
--container-stdin \
--container-tty \
--container-image="${GAR_BASE_TRIMMED}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }}" \
--container-env="${CONTAINER_ENV}" \
--container-restart-policy=never \
--subnet=${{ vars.GCP_SUBNETWORK }} \
--scopes cloud-platform \
--service-account=${{ vars.GCP_DEPLOYMENTS_SA }} \
--metadata=google-logging-enabled=true,google-logging-use-fluentbit=true,google-monitoring-enabled=true \
--metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \
--labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \
--tags ${{ inputs.app_name }} \
--zone ${{ vars.GCP_ZONE }}
# Find the container ID and save it for use in subsequent steps
- name: Find container ID
id: find-container
run: |
INSTANCE_NAME="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"
CONTAINER_PREFIX="klt-${INSTANCE_NAME}"
echo "Looking for container with prefix: ${CONTAINER_PREFIX}"
# Wait up to 60 seconds for container to start
for attempt in {1..30}; do
echo "Attempt ${attempt}/30: Checking for running container..."
CONTAINER_ID=$(gcloud compute ssh ${INSTANCE_NAME} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command="sudo docker ps --filter name=${CONTAINER_PREFIX} -q --no-trunc" 2>/dev/null || echo "")
if [ -n "${CONTAINER_ID}" ]; then
echo "Found running container: ${CONTAINER_ID}"
echo "CONTAINER_ID=${CONTAINER_ID}" >> $GITHUB_OUTPUT
exit 0
fi
echo "No running container found yet, waiting 2 seconds..."
sleep 2
done
echo "Container not found after 60 seconds"
exit 1
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() }}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/$DISK_NAME;
sudo dmesg;
sudo journalctl -b \
'
# Check that the container executed at least 1 Rust test harness test, and that all tests passed.
# Then wait for the container to finish, and exit with the test's exit status.
# Also shows all the test logs.
#
# If the container has already finished, `docker wait` should return its status.
# But sometimes this doesn't work, so we use `docker inspect` as a fallback.
#
# `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command
# with that status.
# (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
- name: Result of ${{ inputs.test_id }} test
run: |
CONTAINER_ID="${{ steps.find-container.outputs.CONTAINER_ID }}"
echo "Using pre-discovered container ID: ${CONTAINER_ID}"
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command="
echo 'Streaming logs from container ${CONTAINER_ID}...';
sudo docker logs --tail all --follow ${CONTAINER_ID} &
echo 'Waiting for container ${CONTAINER_ID} to exit...';
EXIT_STATUS=\$(sudo docker wait ${CONTAINER_ID});
echo 'Container exit status: '\$EXIT_STATUS;
if [ \"\$EXIT_STATUS\" -ne 0 ]; then
echo 'Test failed with exit code: '\$EXIT_STATUS;
exit 1;
else
echo 'Test successful with exit code: '\$EXIT_STATUS;
exit 0;
fi
"
# create a state image from the instance's state disk, if requested by the caller
create-state-image:
name: Create ${{ inputs.test_id }} cached state image
runs-on: ubuntu-latest
needs: [ test-result ]
# We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
# Normally, if a job is skipped, all the jobs that depend on it are also skipped.
# So we need to override the default success() check to make this job run.
#! Only create disk images when the test-result job succeeded
if: ${{ needs.test-result.result == 'success' && (inputs.saves_to_disk || inputs.force_save_to_disk) }}
env:
STATE_VERSION: ${{ needs.test-result.outputs.state_version }}
CACHED_DISK_NAME: ${{ needs.test-result.outputs.cached_disk_name }}
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/checkout@v4.2.2
with:
persist-credentials: false
fetch-depth: '2'
- uses: r7kamura/rust-problem-matchers@v1.5.0
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v5
with:
short-length: 7
# Performs formatting on disk name components.
#
# Disk images in GCP are required to be in lowercase, but the blockchain network
# uses sentence case, so we need to downcase ${{ inputs.network }}.
#
# Disk image names in GCP are limited to 63 characters, so we need to limit
# branch names to 12 characters.
#
# Passes ${{ inputs.network }} to subsequent steps using $NETWORK env variable.
# Passes ${{ env.GITHUB_REF_POINT_SLUG_URL }} to subsequent steps using $SHORT_GITHUB_REF env variable.
- name: Format network name and branch name for disks
run: |
NETWORK_CAPS="${{ inputs.network }}"
echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"
LONG_GITHUB_REF="${{ env.GITHUB_REF_POINT_SLUG_URL }}"
echo "SHORT_GITHUB_REF=${LONG_GITHUB_REF:0:12}" >> "$GITHUB_ENV"
# Install our SSH secret
- name: Install private SSH key
uses: shimataro/ssh-key-action@v2.7.0
with:
key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
name: google_compute_engine
known_hosts: unnecessary
- name: Generate public SSH key
run: |
sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@v2.1.10
with:
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v2.1.4
# Sets the $UPDATE_SUFFIX env var to "-u" if updating a previous cached state,
# and the empty string otherwise.
#
# Also sets a unique date and time suffix $TIME_SUFFIX.
- name: Set update and time suffixes
run: |
UPDATE_SUFFIX=""
if [[ "${{ inputs.needs_zebra_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "zebrad" ]]; then
UPDATE_SUFFIX="-u"
fi
# TODO: find a better logic for the lwd-sync-full case
if [[ "${{ inputs.needs_lwd_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "lightwalletd" ]] && [[ "${{ inputs.test_id }}" != 'lwd-sync-full' ]]; then
UPDATE_SUFFIX="-u"
fi
# We're going to delete old images after a few days, so we only need the time here
TIME_SUFFIX=$(date '+%H%M%S' --utc)
echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> "$GITHUB_ENV"
echo "TIME_SUFFIX=$TIME_SUFFIX" >> "$GITHUB_ENV"
# Get the full initial and running database versions from the test logs.
# These versions are used as part of the disk description and labels.
#
# If these versions are missing from the logs, the job fails.
#
# Typically, the database versions are around line 20 in the logs..
# But we check the first 1000 log lines, just in case the test harness recompiles all the
# dependencies before running the test. (This can happen if the cache is invalid.)
#
# Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
# $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
- name: Get database versions from logs
run: |
INITIAL_DISK_DB_VERSION=""
RUNNING_DB_VERSION=""
DB_VERSION_SUMMARY=""
# Get Instance Name
INSTANCE_NAME="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"
echo "Fetching first 1000 log entries via SSH for instance ${INSTANCE_NAME} to find DB versions..."
CONTAINER_ID="${{ needs.test-result.outputs.container_id }}"
DOCKER_LOGS=$( \
gcloud compute ssh ${INSTANCE_NAME} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command="sudo docker logs ${CONTAINER_ID} | head -1000" \
)
if [[ $? -ne 0 ]] || [[ -z "$DOCKER_LOGS" ]]; then
echo "Failed to retrieve logs via SSH."
exit 1
fi
# either a semantic version or "creating new database"
INITIAL_DISK_DB_VERSION=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching 'initial disk state version: [0-9a-z\.]+' | \
grep --extended-regexp --only-matching '[0-9a-z\.]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$INITIAL_DISK_DB_VERSION" ]]; then
# Check for new database creation
if echo "$DOCKER_LOGS" | grep -q "creating.new.database"; then
INITIAL_DISK_DB_VERSION="new"
else
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing initial disk database version in logs"
# Fail the tests, because Zebra didn't log the initial disk database version,
# or the regex in this step is wrong.
exit 1
fi
else
INITIAL_DISK_DB_VERSION="v${INITIAL_DISK_DB_VERSION//./-}"
fi
echo "Found initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
echo "INITIAL_DISK_DB_VERSION=$INITIAL_DISK_DB_VERSION" >> "$GITHUB_ENV"
RUNNING_DB_VERSION=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching 'running state version: [0-9\.]+' | \
grep --extended-regexp --only-matching '[0-9\.]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$RUNNING_DB_VERSION" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing running database version in logs: $RUNNING_DB_VERSION"
# Fail the tests, because Zebra didn't log the running database version,
# or the regex in this step is wrong.
exit 1
fi
RUNNING_DB_VERSION="v${RUNNING_DB_VERSION//./-}"
echo "Found running database version in logs: $RUNNING_DB_VERSION"
echo "RUNNING_DB_VERSION=$RUNNING_DB_VERSION" >> "$GITHUB_ENV"
if [[ "$INITIAL_DISK_DB_VERSION" = "$RUNNING_DB_VERSION" ]]; then
DB_VERSION_SUMMARY="$RUNNING_DB_VERSION"
elif [[ "$INITIAL_DISK_DB_VERSION" = "new" ]]; then
DB_VERSION_SUMMARY="$RUNNING_DB_VERSION in new database"
else
DB_VERSION_SUMMARY="$INITIAL_DISK_DB_VERSION changing to $RUNNING_DB_VERSION"
fi
echo "Summarised database versions from logs: $DB_VERSION_SUMMARY"
echo "DB_VERSION_SUMMARY=$DB_VERSION_SUMMARY" >> "$GITHUB_ENV"
# Get the sync height from the test logs, which is later used as part of the
# disk description and labels.
#
# The regex used to grep the sync height is provided by ${{ inputs.height_grep_text }},
# this allows to dynamically change the height as needed by different situations or
# based on the logs output from different tests.
#
# If the sync height is missing from the logs, the job fails.
#
# Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
- name: Get sync height from logs
run: |
SYNC_HEIGHT=""
# Get Instance Name
INSTANCE_NAME="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"
echo "Fetching last 200 log entries via SSH for instance ${INSTANCE_NAME} to find sync height..."
CONTAINER_ID="${{ needs.test-result.outputs.container_id }}"
DOCKER_LOGS=$( \
gcloud compute ssh ${INSTANCE_NAME} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command="sudo docker logs --tail 200 ${CONTAINER_ID}" \
)
if [[ $? -ne 0 ]] || [[ -z "$DOCKER_LOGS" ]]; then
echo "Failed to retrieve logs via SSH."
exit 1
fi
SYNC_HEIGHT=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching '${{ inputs.height_grep_text }}[0-9]+' | \
grep --extended-regexp --only-matching '[0-9]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$SYNC_HEIGHT" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing sync height in logs: $SYNC_HEIGHT"
# Fail the tests, because Zebra and lightwalletd didn't log their sync heights,
# or the CI workflow sync height regex is wrong.
false
fi
echo "Found sync height in logs: $SYNC_HEIGHT"
echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> "$GITHUB_ENV"
# Get the original cached state height from google cloud.
#
# If the height is missing from the image labels, uses zero instead.
#
# TODO: fail the job if needs_zebra_state but the height is missing
# we can make this change after all the old images have been deleted, this should happen around 15 September 2022
# we'll also need to do a manual checkpoint rebuild before opening the PR for this change
#
# Passes the original height to subsequent steps using $ORIGINAL_HEIGHT env variable.
- name: Get original cached state height from google cloud
run: |
ORIGINAL_HEIGHT="0"
ORIGINAL_DISK_NAME="${{ format('{0}', env.CACHED_DISK_NAME) }}"
if [[ -n "$ORIGINAL_DISK_NAME" ]]; then
ORIGINAL_HEIGHT=$(gcloud compute images list --filter="status=READY AND name=$ORIGINAL_DISK_NAME" --format="value(labels.height)")
ORIGINAL_HEIGHT=${ORIGINAL_HEIGHT:-0}
echo "$ORIGINAL_DISK_NAME height: $ORIGINAL_HEIGHT"
else
ORIGINAL_DISK_NAME="new-disk"
echo "newly created disk, original height set to 0"
fi
echo "ORIGINAL_HEIGHT=$ORIGINAL_HEIGHT" >> "$GITHUB_ENV"
echo "ORIGINAL_DISK_NAME=$ORIGINAL_DISK_NAME" >> "$GITHUB_ENV"
# Create an image from the state disk, which will be used for any tests that start
# after it is created. These tests can be in the same workflow, or in a different PR.
#
# Using the newest image makes future jobs faster, because it is closer to the chain tip.
#
# Skips creating updated images if the original image is less than $CACHED_STATE_UPDATE_LIMIT behind the current tip.
# Full sync images are always created.
#
# The image can contain:
# - Zebra cached state, or
# - Zebra + lightwalletd cached state.
# Which cached state is being saved to the disk is defined by ${{ inputs.disk_prefix }}.
#
# Google Cloud doesn't have an atomic image replacement operation.
# We don't want to delete and re-create the image, because that causes a ~5 minute
# window where might be no recent image. So we add an extra image with a unique name,
# which gets selected because it has a later creation time.
# This also simplifies the process of deleting old images,
# because we don't have to worry about accidentally deleting all the images.
#
# The timestamp makes images from the same commit unique,
# as long as they don't finish in the same second.
# (This is unlikely, because each image created by a workflow has a different name.)
#
# The image name must also be 63 characters or less.
# More info: https://cloud.google.com/compute/docs/naming-resources#resource-name-format
#
# Force the image creation (--force) as the disk is still attached even though is not being
# used by the container.
- name: Create image from state disk
run: |
MINIMUM_UPDATE_HEIGHT=$((ORIGINAL_HEIGHT+CACHED_STATE_UPDATE_LIMIT))
if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]] || [[ "${{ inputs.force_save_to_disk }}" == "true" ]]; then
# Use RUNNING_DB_VERSION for image naming (more reliable than STATE_VERSION)
# Extract just the major version number for the image name
IMAGE_VERSION_FOR_NAME=${RUNNING_DB_VERSION#v} # Remove v prefix
IMAGE_VERSION_FOR_NAME=${IMAGE_VERSION_FOR_NAME%%-*} # Keep only major version (before first dash)
# Validate that we have a version number
if [[ -z $IMAGE_VERSION_FOR_NAME ]] || [[ ! $IMAGE_VERSION_FOR_NAME =~ ^[0-9]+$ ]]; then
echo "ERROR: Invalid version extracted for image naming: $IMAGE_VERSION_FOR_NAME"
echo "RUNNING_DB_VERSION was: $RUNNING_DB_VERSION"
echo "STATE_VERSION was: ${{ env.STATE_VERSION }}"
exit 1
fi
echo "Using version $IMAGE_VERSION_FOR_NAME for image naming (from RUNNING_DB_VERSION: $RUNNING_DB_VERSION)"
# Create image from the test disk
gcloud compute images create \
"${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${IMAGE_VERSION_FOR_NAME}-${NETWORK}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}" \
--force \
--source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
--source-disk-zone=${{ vars.GCP_ZONE }} \
--storage-location=us \
--description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }} and database format ${{ env.DB_VERSION_SUMMARY }}" \
--labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},branch=${{ env.GITHUB_REF_SLUG_URL }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${IMAGE_VERSION_FOR_NAME},state-running-version=${RUNNING_DB_VERSION},initial-state-disk-version=${INITIAL_DISK_DB_VERSION},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},force-save=${{ inputs.force_save_to_disk }},updated-from-height=${ORIGINAL_HEIGHT},updated-from-disk=${ORIGINAL_DISK_NAME},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
else
echo "Skipped cached state update because the new sync height $SYNC_HEIGHT was less than $CACHED_STATE_UPDATE_LIMIT blocks above the original height $ORIGINAL_HEIGHT of $ORIGINAL_DISK_NAME"
fi
# delete the Google Cloud instance for this test
delete-instance:
name: Delete ${{ inputs.test_id }} instance
runs-on: ubuntu-latest
needs: [ create-state-image ]
# If a disk generation step timeouts (+6 hours) the previous job (creating the image) will be skipped.
# Even if the instance continues running, no image will be created, so it's better to delete it.
if: always()
continue-on-error: true
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/checkout@v4.2.2
with:
persist-credentials: false
fetch-depth: '2'
- uses: r7kamura/rust-problem-matchers@v1.5.0
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v5
with:
short-length: 7
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@v2.1.10
with:
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v2.1.4
# Deletes the instances that has been recently deployed in the actual commit after all
# previous jobs have run, no matter the outcome of the job.
- name: Delete test instance
continue-on-error: true
run: |
INSTANCE=$(gcloud compute instances list --filter=${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} --format='value(NAME)')
if [ -z "${INSTANCE}" ]; then
echo "No instance to delete"
else
gcloud compute instances delete "${INSTANCE}" --zone "${{ vars.GCP_ZONE }}" --delete-disks all --quiet
fi