zebra/.github/workflows/sub-deploy-integration-tests-gcp.yml at 7796f788633f537e9294f3051b9be6ca5aa4700c · ZcashFoundation/zebra · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
name: Deploy Tests to GCP

# This is a reusable workflow: it only runs when another workflow calls it
# with `uses:` and supplies the inputs below.
on:
  workflow_call:
    inputs:
      # Status and logging
      test_id:
        required: true
        type: string
        description: 'Unique identifier for the test'
      test_description:
        required: true
        type: string
        description: 'Explains what the test does'
      height_grep_text:
        required: false
        type: string
        description: 'Regular expression to find the tip height in test logs, and add it to newly created cached state image metadata'

      # Test selection and parameters
      test_variables:
        required: true
        type: string
        description: 'Environmental variables used to select and configure the test'
      network:
        required: false
        type: string
        default: Mainnet
        description: 'Zcash network to test against'
      # Selects the job timeout (7200 minutes instead of 180) and the machine
      # type (GCP_LARGE_MACHINE instead of GCP_SMALL_MACHINE) in `test-result`.
      is_long_test:
        required: false
        type: boolean
        default: false
        description: 'Does this test run for longer than 3 hours? (Long tests get a 5 day timeout and the large machine type)'

      # Cached state
      zebra_state_dir:
        required: false
        type: string
        default: '/home/zebra/.cache/zebra'
        description: 'Zebra cached state directory and input image prefix to search in GCP'
      lwd_state_dir:
        required: false
        type: string
        default: '/home/zebra/.cache/lwd'
        description: 'Lightwalletd cached state directory and input image prefix to search in GCP'
      disk_prefix:
        required: false
        type: string
        default: 'zebrad-cache'
        description: 'Image name prefix, and `zebra_state_dir` name for newly created cached states'
      disk_suffix:
        required: false
        type: string
        default: 'tip'
        description: 'Image name suffix'
      needs_zebra_state:
        required: true
        type: boolean
        description: 'Does the test use Zebra cached state?'
      # Explicit default, matching the other optional boolean inputs.
      needs_lwd_state:
        required: false
        type: boolean
        default: false
        description: 'Does the test use Lightwalletd and Zebra cached state?'
      # NOTE(review): the comment below does not obviously describe `saves_to_disk`;
      # confirm which input it belongs to, or remove it.
      # main branch states can be outdated and slower, but they can also be more reliable
      saves_to_disk:
        required: true
        type: boolean
        description: 'Can this test create new or updated cached state disks?'
      force_save_to_disk:
        required: false
        type: boolean
        default: false
        description: 'Force this test to create a new or updated cached state disk'
      # Also used as the `app` label and the network tag of the test instance.
      app_name:
        required: false
        type: string
        default: 'zebra'
        description: 'Application name, used to work out when a job is an update job'

# Workflow-wide environment. The Rust/Cargo settings come from repository
# variables and are forwarded to the test container by the `test-result` job.
env:
  RUST_LOG: '${{ vars.RUST_LOG }}'
  RUST_BACKTRACE: '${{ vars.RUST_BACKTRACE }}'
  RUST_LIB_BACKTRACE: '${{ vars.RUST_LIB_BACKTRACE }}'
  COLORBT_SHOW_HIDDEN: '${{ vars.COLORBT_SHOW_HIDDEN }}'
  CARGO_INCREMENTAL: '${{ vars.CARGO_INCREMENTAL }}'
  # Number of earlier log lines shown at the start of each new log job.
  # Raise it if log lines go missing between jobs.
  #
  # The goal is to show everything logged since the previous job finished,
  # but the gap between jobs is unknown.
  # 200 lines covers roughly 6-15 minutes of sync logs, or one panic log.
  EXTRA_LOG_LINES: 200
  # Minimum number of new blocks before an updated cached state image is created.
  # 1 day is approximately 1152 blocks, so 576 blocks is about 12 hours.
  CACHED_STATE_UPDATE_LIMIT: 576
jobs:
  # Find a cached state disk for ${{ inputs.test_id }}, matching all of:
  # - disk cached state prefix -> zebrad-cache or lwd-cache
  # - state version (from the source code) - v{N}
  # - network (network) - mainnet or testnet
  # - disk target height kind (disk_suffix) - checkpoint or tip
  #
  # The disk prefix passed to the lookup depends on the state the test needs:
  # - lwd-cache if the test needs a lightwalletd state (needs_lwd_state)
  # - zebrad-cache otherwise
  #
  # The job is skipped when the test neither uses nor saves a cached state.
  # When it runs only because the test saves a state, an empty disk_suffix is
  # passed, and `test-result` ignores the outputs of this job.
  #
  # Passes the disk name to subsequent jobs using `cached_disk_name` output
  # Passes the state version to subsequent jobs using `state_version` output
  #
  get-disk-name:
    name: Get disk name
    uses: ./.github/workflows/sub-find-cached-disks.yml
    if: ${{ inputs.needs_zebra_state || inputs.needs_lwd_state || inputs.saves_to_disk || inputs.force_save_to_disk }}
    with:
      network: ${{ inputs.network || vars.ZCASH_NETWORK }}
      # Always resolve to a real prefix: the previous expression evaluated to
      # `false` when neither needs_lwd_state nor needs_zebra_state was set.
      disk_prefix: ${{ inputs.needs_lwd_state && 'lwd-cache' || 'zebrad-cache' }}
      disk_suffix: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && inputs.disk_suffix || '' }}
      test_id: ${{ inputs.test_id }}

  # Launch the test on a new GCP instance, show all the test logs, then follow the logs
  # of the test we just launched, until it finishes.
  # Then check the result of the test.
  #
  # If `inputs.is_long_test` is `true`, the timeout is 5 days, otherwise it's 3 hours.
  test-result:
    name: Run ${{ inputs.test_id }} test
    runs-on: zfnd-runners
    needs: [ get-disk-name ]
    # Run when the disk lookup succeeded or was skipped, unless the workflow
    # was cancelled or an earlier job failed.
    if: ${{ !cancelled() && !failure() && (needs.get-disk-name.result == 'success' || needs.get-disk-name.result == 'skipped') }}
    # 7200 minutes = 5 days, 180 minutes = 3 hours
    timeout-minutes: ${{ inputs.is_long_test && 7200 || 180 }}
    # The disk name and state version are only forwarded when the test uses a
    # cached state; otherwise they are empty strings.
    outputs:
      cached_disk_name: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && needs.get-disk-name.outputs.cached_disk_name || '' }}
      state_version: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && needs.get-disk-name.outputs.state_version || '' }}
      container_id: ${{ steps.find-container.outputs.CONTAINER_ID }}
    env:
      # Empty when the test starts without a cached state.
      CACHED_DISK_NAME: ${{ (inputs.needs_zebra_state || inputs.needs_lwd_state) && needs.get-disk-name.outputs.cached_disk_name || '' }}
    # `id-token: write` lets the Google Cloud auth step request an OIDC token.
    permissions:
      contents: 'read'
      id-token: 'write'
    steps:
      # Check out the repository; the instance creation step reads the VM
      # startup script from the working tree.
      - uses: actions/checkout@v4.2.2
        with:
          persist-credentials: false
          fetch-depth: '2'
      # Presumably registers problem matchers for Rust output in the job logs.
      - uses: r7kamura/rust-problem-matchers@v1.5.0

      # Provides the GITHUB_SHA_SHORT and GITHUB_REF_SLUG_URL env variables
      # that later steps use to build instance and disk names.
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v5
        with:
          short-length: 7

      # Passes the lowercased network name to subsequent steps using the
      # $NETWORK env variable, which is used as an instance label.
      #
      # Falls back to vars.ZCASH_NETWORK when the caller passes an empty
      # network, matching the fallback used by the get-disk-name job.
      - name: Downcase network name for disks and labels
        run: |
          NETWORK_CAPS="${{ inputs.network || vars.ZCASH_NETWORK }}"
          echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"

      # Install our SSH secret as ~/.ssh/google_compute_engine, the file the
      # next step reads to derive the public key.
      - name: Install private SSH key
        uses: shimataro/ssh-key-action@v2.7.0
        with:
          key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
          name: google_compute_engine
          known_hosts: unnecessary

      # Derive the matching public key file from the private key installed above.
      - name: Generate public SSH key
        run: |
          sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
          ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub

      # Setup gcloud CLI
      # Authenticates as the deployments service account through the
      # workload identity provider configured in repository variables.
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2.1.10
        with:
          workload_identity_provider: '${{ vars.GCP_WIF }}'
          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'

      # Installs the gcloud CLI used by the following steps.
      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.4

      # How the state disk reaches the test container:
      #
      # This step launches a GCP instance with create-with-container.
      # The script builds two sets of flags:
      # - $DISK_ATTACH_PARAMS attaches the persistent state disk to the VM, and
      # - $CONTAINER_MOUNT_DISKS mounts that disk inside the test container
      #   using --container-mount-disk.
      #
      # When the test needs Lightwalletd state (inputs.needs_lwd_state is true or test_id is
      # lwd-sync-full), the one persistent disk is mounted twice: at the Zebra state path
      # (inputs.zebra_state_dir) and at the Lightwalletd state path (inputs.lwd_state_dir).
      #
      # A single disk keeps the VM and container setup simple.
      # Mounting it at two paths doesn't cause conflicts because Zebra and lightwalletd
      # create different subdirectories for their data within the mounted volume.
      # (However, Zebra, lightwalletd, and the test harness must not delete the whole cache directory root.)
      #
      # The container mount paths (inputs.zebra_state_dir and inputs.lwd_state_dir) must match
      # the paths expected by the tests in Rust (also configured in ci-unit-tests-docker.yml).
      # The application inside the container will typically use environment variables (like those set in
      # $CONTAINER_ENV) or these known paths to access the state data.
      - name: Create ${{ inputs.test_id }} GCP compute instance
        id: create-instance
        run: |
          DISK_NAME="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}"
          INSTANCE_NAME="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"
          CACHED_IMAGE="${{ env.CACHED_DISK_NAME }}"

          # Extra --container-mount-disk option, only set for partitioned disks.
          # Safe default: no partition unless the test is explicitly whitelisted below.
          PARTITION_OPT=""

          if [[ -z "${CACHED_IMAGE}" ]]; then
            # No cached image: let the instance create a blank disk (no partition support)
            DISK_ATTACH_PARAMS="--create-disk=size=400GB,type=pd-balanced,name=${DISK_NAME},device-name=${DISK_NAME}"
          else
            # Create the disk from the cached image separately (allows partition mounting),
            # then attach the existing disk to the instance.
            echo "Creating disk ${DISK_NAME} from cached image ${CACHED_IMAGE}"
            gcloud compute disks create "${DISK_NAME}" \
              --size=400GB \
              --type=pd-balanced \
              --image="${CACHED_IMAGE}" \
              --zone=${{ vars.GCP_ZONE }}
            DISK_ATTACH_PARAMS="--disk=name=${DISK_NAME},device-name=${DISK_NAME}"

            # partition=1 differentiates between disk types:
            # - Only Zebra tip disks (from full sync) have partitions and need partition=1
            # - LWD disks never have partitions
            # - Checkpoint disks don't have partitions
            # TODO: Consider removing this logic once all cached disk images use consistent partitioning.
            case "${{ inputs.test_id }}" in
              # All tests that use Zebra tip disks (which have partitions)
              sync-update-mainnet|sync-full-mainnet|generate-checkpoints-mainnet|lwd-rpc-test|rpc-get-block-template|rpc-submit-block)
                PARTITION_OPT=",partition=1"
                echo "Using Zebra tip disk with partition=1: ${CACHED_IMAGE}"
                ;;
              # All other tests default to no partition for safety
              *)
                echo "Using cached disk without partition (safe default): ${CACHED_IMAGE}"
                ;;
            esac
          fi

          # Mount the disk at the zebra state directory
          CONTAINER_MOUNT_DISKS="--container-mount-disk=mount-path=${{ inputs.zebra_state_dir }},name=${DISK_NAME},mode=rw${PARTITION_OPT}"

          # Mount the same disk at the lwd path if needed
          if [[ "${{ inputs.needs_lwd_state }}" == "true" || "${{ inputs.test_id }}" == "lwd-sync-full" ]]; then
            CONTAINER_MOUNT_DISKS+=" --container-mount-disk=mount-path=${{ inputs.lwd_state_dir }},name=${DISK_NAME},mode=rw${PARTITION_OPT}"
          fi

          # Environment variables for the container
          CONTAINER_ENV="${{ inputs.test_variables }},RUST_LOG=${{ env.RUST_LOG }},RUST_BACKTRACE=${{ env.RUST_BACKTRACE }},RUST_LIB_BACKTRACE=${{ env.RUST_LIB_BACKTRACE }},COLORBT_SHOW_HIDDEN=${{ env.COLORBT_SHOW_HIDDEN }},CARGO_INCREMENTAL=${{ env.CARGO_INCREMENTAL }}"

          # Trim whitespace from GAR_BASE as for some reason it's getting a trailing space
          GAR_BASE_TRIMMED=$(echo "${{ vars.GAR_BASE }}" | xargs)

          # $DISK_ATTACH_PARAMS and $CONTAINER_MOUNT_DISKS are deliberately unquoted:
          # $CONTAINER_MOUNT_DISKS can hold two space-separated flags.
          gcloud compute instances create-with-container "${INSTANCE_NAME}" \
            --machine-type ${{ inputs.is_long_test && vars.GCP_LARGE_MACHINE || vars.GCP_SMALL_MACHINE }} \
            --boot-disk-size=50GB \
            --boot-disk-type=pd-ssd \
            --image-project=cos-cloud \
            --image-family=cos-stable \
            ${DISK_ATTACH_PARAMS} \
            ${CONTAINER_MOUNT_DISKS} \
            --container-stdin \
            --container-tty \
            --container-image="${GAR_BASE_TRIMMED}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }}" \
            --container-env="${CONTAINER_ENV}" \
            --container-restart-policy=never \
            --subnet=${{ vars.GCP_SUBNETWORK }} \
            --scopes cloud-platform \
            --service-account=${{ vars.GCP_DEPLOYMENTS_SA }} \
            --metadata=google-logging-enabled=true,google-logging-use-fluentbit=true,google-monitoring-enabled=true \
            --metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \
            --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \
            --tags ${{ inputs.app_name }} \
            --zone ${{ vars.GCP_ZONE }}

      # Find the container ID and save it for use in subsequent steps.
      #
      # Polls the instance over SSH until `docker ps` lists a container whose name
      # starts with `klt-<instance name>`, then publishes its full ID as the
      # CONTAINER_ID step output. Fails the step if no container shows up.
      - name: Find container ID
        id: find-container
        run: |
          INSTANCE_NAME="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"
          CONTAINER_PREFIX="klt-${INSTANCE_NAME}"

          echo "Looking for container with prefix: ${CONTAINER_PREFIX}"

          # Try 30 times, sleeping 2 seconds between attempts.
          # The total wait can exceed 60 seconds, because each SSH connection
          # can itself take a while (see the ConnectTimeout/ConnectionAttempts flags).
          for attempt in {1..30}; do
            echo "Attempt ${attempt}/30: Checking for running container..."
            # SSH or docker failures are treated as "not found yet" and retried.
            CONTAINER_ID=$(gcloud compute ssh "${INSTANCE_NAME}" \
              --zone ${{ vars.GCP_ZONE }} \
              --ssh-flag="-o ServerAliveInterval=5" \
              --ssh-flag="-o ConnectionAttempts=20" \
              --ssh-flag="-o ConnectTimeout=5" \
              --command="sudo docker ps --filter name=${CONTAINER_PREFIX} -q --no-trunc" 2>/dev/null || echo "")
            if [ -n "${CONTAINER_ID}" ]; then
              echo "Found running container: ${CONTAINER_ID}"
              echo "CONTAINER_ID=${CONTAINER_ID}" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "No running container found yet, waiting 2 seconds..."
            sleep 2
          done

          echo "Container not found after 30 attempts"
          exit 1

      # Show debug logs if a previous step in this job failed:
      # block devices, processes holding the state disk open, kernel messages,
      # and the journal for the current boot.
      #
      # The state disk is attached with device-name "<test_id>-<short sha>", which GCP
      # exposes on the instance under /dev/disk/by-id/google-<device-name>.
      # The trailing glob also covers partition links (-part1).
      - name: Show debug logs if previous job failed
        if: ${{ failure() }}
        run: |
          DISK_NAME="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}"
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command="
          lsblk;
          sudo lsof /dev/disk/by-id/google-${DISK_NAME}*;
          sudo dmesg;
          sudo journalctl -b
          "

      # Stream all the test logs, wait for the container to finish, and fail this step
      # unless the container exited with status 0.
      #
      # `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command
      # with a matching status.
      # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
      #
      # If `docker wait` fails it prints nothing, so the status is empty. The check below
      # only treats the exact string "0" as success, so an empty or unexpected status
      # fails the test instead of being reported as a pass.
      - name: Result of ${{ inputs.test_id }} test
        run: |
          CONTAINER_ID="${{ steps.find-container.outputs.CONTAINER_ID }}"
          echo "Using pre-discovered container ID: ${CONTAINER_ID}"
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command="
          echo 'Streaming logs from container ${CONTAINER_ID}...';
          sudo docker logs --tail all --follow ${CONTAINER_ID} &

          echo 'Waiting for container ${CONTAINER_ID} to exit...';
          EXIT_STATUS=\$(sudo docker wait ${CONTAINER_ID});
          echo 'Container exit status: '\$EXIT_STATUS;

          if [ \"\$EXIT_STATUS\" = \"0\" ]; then
            echo 'Test successful with exit code: '\$EXIT_STATUS;
            exit 0;
          else
            echo 'Test failed with exit code: '\$EXIT_STATUS;
            exit 1;
          fi
          "

  # create a state image from the instance's state disk, if requested by the caller
  create-state-image:
    name: Create ${{ inputs.test_id }} cached state image
    runs-on: ubuntu-latest
    needs: [ test-result ]
    # Only create disk images when the test-result job succeeded, and the caller
    # either allows saving (saves_to_disk) or forces it (force_save_to_disk).
    #
    # NOTE(review): this condition used to be explained in terms of
    # `without-cached-state` and `with-cached-state` jobs, one of which was always skipped.
    # This job only depends on `test-result`; confirm those jobs no longer exist.
    if: ${{ needs.test-result.result == 'success' && (inputs.saves_to_disk || inputs.force_save_to_disk) }}
    # Both values are empty strings when the test did not use a cached state.
    env:
      STATE_VERSION: ${{ needs.test-result.outputs.state_version }}
      CACHED_DISK_NAME: ${{ needs.test-result.outputs.cached_disk_name }}
    # `id-token: write` lets the Google Cloud auth step request an OIDC token.
    permissions:
      contents: 'read'
      id-token: 'write'
    steps:
      # Check out the repository without keeping the token in the git config.
      - uses: actions/checkout@v4.2.2
        with:
          persist-credentials: false
          fetch-depth: '2'
      # Presumably registers problem matchers for Rust output in the job logs.
      - uses: r7kamura/rust-problem-matchers@v1.5.0

      # Provides the GITHUB_SHA_SHORT, GITHUB_REF_SLUG_URL and GITHUB_REF_POINT_SLUG_URL
      # env variables used by later steps.
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v5
        with:
          short-length: 7

      # Performs formatting on disk name components.
      #
      # Disk images in GCP are required to be in lowercase, but the blockchain network
      # uses sentence case, so we need to downcase ${{ inputs.network }}.
      # If the caller passes an empty network, vars.ZCASH_NETWORK is used instead,
      # matching the fallback used by the get-disk-name job.
      #
      # Disk image names in GCP are limited to 63 characters, so we need to limit
      # branch names to 12 characters.
      #
      # Passes the network name to subsequent steps using $NETWORK env variable.
      # Passes ${{ env.GITHUB_REF_POINT_SLUG_URL }} to subsequent steps using $SHORT_GITHUB_REF env variable.
      - name: Format network name and branch name for disks
        run: |
          NETWORK_CAPS="${{ inputs.network || vars.ZCASH_NETWORK }}"
          echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"
          LONG_GITHUB_REF="${{ env.GITHUB_REF_POINT_SLUG_URL }}"
          echo "SHORT_GITHUB_REF=${LONG_GITHUB_REF:0:12}" >> "$GITHUB_ENV"

      # Install our SSH secret as ~/.ssh/google_compute_engine, the file the
      # next step reads to derive the public key.
      - name: Install private SSH key
        uses: shimataro/ssh-key-action@v2.7.0
        with:
          key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
          name: google_compute_engine
          known_hosts: unnecessary

      # Derive the matching public key file from the private key installed above.
      - name: Generate public SSH key
        run: |
          sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
          ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub

      # Setup gcloud CLI
      # Authenticates as the deployments service account through the
      # workload identity provider configured in repository variables.
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2.1.10
        with:
          workload_identity_provider: '${{ vars.GCP_WIF }}'
          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'

      # Installs the gcloud CLI used by the following steps.
      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.4

      # Exports two env variables for the following steps:
      # - $UPDATE_SUFFIX: "-u" when this job updates a previous cached state,
      #   and the empty string otherwise.
      # - $TIME_SUFFIX: a unique suffix built from the current UTC time.
      - name: Set update and time suffixes
        run: |
          NEEDS_ZEBRA="${{ inputs.needs_zebra_state }}"
          NEEDS_LWD="${{ inputs.needs_lwd_state }}"
          APP="${{ inputs.app_name }}"
          TEST="${{ inputs.test_id }}"

          UPDATE_SUFFIX=""
          if [[ "$NEEDS_ZEBRA" == "true" && "$APP" == "zebrad" ]]; then
              # zebrad test that started from an existing Zebra state
              UPDATE_SUFFIX="-u"
          elif [[ "$NEEDS_LWD" == "true" && "$APP" == "lightwalletd" && "$TEST" != 'lwd-sync-full' ]]; then
              # lightwalletd test that started from an existing lwd state
              # TODO: find a better logic for the lwd-sync-full case
              UPDATE_SUFFIX="-u"
          fi

          # We're going to delete old images after a few days, so we only need the time here
          TIME_SUFFIX=$(date --utc '+%H%M%S')

          {
            echo "UPDATE_SUFFIX=$UPDATE_SUFFIX"
            echo "TIME_SUFFIX=$TIME_SUFFIX"
          } >> "$GITHUB_ENV"

      # Get the full initial and running database versions from the test logs.
      # These versions are used as part of the disk description and labels.
      #
      # If these versions are missing from the logs, the job fails.
      #
      # Typically, the database versions are around line 20 in the logs.
      # But we check the first 1000 log lines, just in case the test harness recompiles all the
      # dependencies before running the test. (This can happen if the cache is invalid.)
      #
      # Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
      # $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
      - name: Get database versions from logs
        run: |
          INITIAL_DISK_DB_VERSION=""
          RUNNING_DB_VERSION=""
          DB_VERSION_SUMMARY=""

          # Get Instance Name
          INSTANCE_NAME="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"

          echo "Fetching first 1000 log entries via SSH for instance ${INSTANCE_NAME} to find DB versions..."
          CONTAINER_ID="${{ needs.test-result.outputs.container_id }}"

          # Capture the SSH status with `||`: the step shell runs with `-e`, so a bare
          # failing assignment would abort the script before the error message below.
          SSH_STATUS=0
          DOCKER_LOGS=$(gcloud compute ssh "${INSTANCE_NAME}" \
            --zone ${{ vars.GCP_ZONE }} \
            --ssh-flag="-o ServerAliveInterval=5" \
            --ssh-flag="-o ConnectionAttempts=20" \
            --ssh-flag="-o ConnectTimeout=5" \
            --command="sudo docker logs ${CONTAINER_ID} | head -1000" \
          ) || SSH_STATUS=$?

          if [[ "$SSH_STATUS" -ne 0 ]] || [[ -z "$DOCKER_LOGS" ]]; then
              echo "Failed to retrieve logs via SSH."
              exit 1
          fi

          # either a semantic version or "creating new database"
          # (the second grep splits the match into words, and `tail -1` keeps the version)
          INITIAL_DISK_DB_VERSION=$( \
          echo "$DOCKER_LOGS" | \
          grep --extended-regexp --only-matching 'initial disk state version: [0-9a-z\.]+' | \
          grep --extended-regexp --only-matching '[0-9a-z\.]+' | \
          tail -1 || \
          [[ $? == 1 ]] \
          )

          if [[ -n "$INITIAL_DISK_DB_VERSION" ]]; then
              # Image labels can't contain dots, so 1.2.3 becomes v1-2-3
              INITIAL_DISK_DB_VERSION="v${INITIAL_DISK_DB_VERSION//./-}"
          elif echo "$DOCKER_LOGS" | grep -q "creating.new.database"; then
              # Check for new database creation
              INITIAL_DISK_DB_VERSION="new"
          else
              echo "Checked logs:"
              echo ""
              echo "$DOCKER_LOGS"
              echo ""
              echo "Missing initial disk database version in logs"
              # Fail the tests, because Zebra didn't log the initial disk database version,
              # or the regex in this step is wrong.
              exit 1
          fi

          echo "Found initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
          echo "INITIAL_DISK_DB_VERSION=$INITIAL_DISK_DB_VERSION" >> "$GITHUB_ENV"

          RUNNING_DB_VERSION=$( \
          echo "$DOCKER_LOGS" | \
          grep --extended-regexp --only-matching 'running state version: [0-9\.]+' | \
          grep --extended-regexp --only-matching '[0-9\.]+' | \
          tail -1 || \
          [[ $? == 1 ]] \
          )

          if [[ -z "$RUNNING_DB_VERSION" ]]; then
              echo "Checked logs:"
              echo ""
              echo "$DOCKER_LOGS"
              echo ""
              echo "Missing running database version in logs: $RUNNING_DB_VERSION"
              # Fail the tests, because Zebra didn't log the running database version,
              # or the regex in this step is wrong.
              exit 1
          fi

          RUNNING_DB_VERSION="v${RUNNING_DB_VERSION//./-}"
          echo "Found running database version in logs: $RUNNING_DB_VERSION"
          echo "RUNNING_DB_VERSION=$RUNNING_DB_VERSION" >> "$GITHUB_ENV"

          # Summarise both versions in a single human-readable string
          if [[ "$INITIAL_DISK_DB_VERSION" = "$RUNNING_DB_VERSION" ]]; then
              DB_VERSION_SUMMARY="$RUNNING_DB_VERSION"
          elif [[ "$INITIAL_DISK_DB_VERSION" = "new" ]]; then
              DB_VERSION_SUMMARY="$RUNNING_DB_VERSION in new database"
          else
              DB_VERSION_SUMMARY="$INITIAL_DISK_DB_VERSION changing to $RUNNING_DB_VERSION"
          fi

          echo "Summarised database versions from logs: $DB_VERSION_SUMMARY"
          echo "DB_VERSION_SUMMARY=$DB_VERSION_SUMMARY" >> "$GITHUB_ENV"

      # Get the sync height from the test logs, which is later used as part of the
      # disk description and labels.
      #
      # The regex used to grep the sync height is provided by ${{ inputs.height_grep_text }}.
      # This lets each caller match the height in the log format of its own test.
      # The regex must match the text immediately before the height digits.
      #
      # If the sync height is missing from the logs, the job fails.
      #
      # Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
      - name: Get sync height from logs
        run: |
          SYNC_HEIGHT=""

          # Get Instance Name
          INSTANCE_NAME="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"

          echo "Fetching last 200 log entries via SSH for instance ${INSTANCE_NAME} to find sync height..."
          CONTAINER_ID="${{ needs.test-result.outputs.container_id }}"

          # Capture the SSH status with `||`: the step shell runs with `-e`, so a bare
          # failing assignment would abort the script before the error message below.
          SSH_STATUS=0
          DOCKER_LOGS=$(gcloud compute ssh "${INSTANCE_NAME}" \
            --zone ${{ vars.GCP_ZONE }} \
            --ssh-flag="-o ServerAliveInterval=5" \
            --ssh-flag="-o ConnectionAttempts=20" \
            --ssh-flag="-o ConnectTimeout=5" \
            --command="sudo docker logs --tail 200 ${CONTAINER_ID}" \
          ) || SSH_STATUS=$?

          if [[ "$SSH_STATUS" -ne 0 ]] || [[ -z "$DOCKER_LOGS" ]]; then
              echo "Failed to retrieve logs via SSH."
              exit 1
          fi

          # Keep the digits that follow the caller's regex, from the last matching log line
          SYNC_HEIGHT=$( \
          echo "$DOCKER_LOGS" | \
          grep --extended-regexp --only-matching '${{ inputs.height_grep_text }}[0-9]+' | \
          grep --extended-regexp --only-matching '[0-9]+' | \
          tail -1 || \
          [[ $? == 1 ]] \
          )

          if [[ -z "$SYNC_HEIGHT" ]]; then
              echo "Checked logs:"
              echo ""
              echo "$DOCKER_LOGS"
              echo ""
              echo "Missing sync height in logs: $SYNC_HEIGHT"
              # Fail the tests, because Zebra and lightwalletd didn't log their sync heights,
              # or the CI workflow sync height regex is wrong.
              exit 1
          fi

          echo "Found sync height in logs: $SYNC_HEIGHT"
          echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> "$GITHUB_ENV"

      # Look up the height of the cached state image this test started from.
      #
      # The height is read from the `height` label of the READY image named $CACHED_DISK_NAME.
      # A missing label counts as height zero, and so does a test that started without
      # a cached image (reported under the placeholder name "new-disk").
      #
      # TODO: fail the job if needs_zebra_state but the height is missing
      #       we can make this change after all the old images have been deleted, this should happen around 15 September 2022
      #       we'll also need to do a manual checkpoint rebuild before opening the PR for this change
      #
      # Passes the original height to subsequent steps using $ORIGINAL_HEIGHT env variable,
      # and the image name (or placeholder) using $ORIGINAL_DISK_NAME.
      - name: Get original cached state height from google cloud
        run: |
          ORIGINAL_DISK_NAME="${{ format('{0}', env.CACHED_DISK_NAME) }}"

          if [[ -z "$ORIGINAL_DISK_NAME" ]]; then
              ORIGINAL_HEIGHT="0"
              ORIGINAL_DISK_NAME="new-disk"
              echo "newly created disk, original height set to 0"
          else
              ORIGINAL_HEIGHT=$(gcloud compute images list --filter="status=READY AND name=$ORIGINAL_DISK_NAME" --format="value(labels.height)")
              # Images without a height label report an empty value
              ORIGINAL_HEIGHT=${ORIGINAL_HEIGHT:-0}
              echo "$ORIGINAL_DISK_NAME height: $ORIGINAL_HEIGHT"
          fi

          {
            echo "ORIGINAL_HEIGHT=$ORIGINAL_HEIGHT"
            echo "ORIGINAL_DISK_NAME=$ORIGINAL_DISK_NAME"
          } >> "$GITHUB_ENV"

      # Create an image from the state disk, which will be used for any tests that start
      # after it is created. These tests can be in the same workflow, or in a different PR.
      #
      # Using the newest image makes future jobs faster, because it is closer to the chain tip.
      #
      # Skips creating updated images if the original image is less than $CACHED_STATE_UPDATE_LIMIT blocks behind the newly synced height.
      # Full sync images are always created, and so is any image when the `force_save_to_disk` input is true.
      #
      # The image can contain:
      # - Zebra cached state, or
      # - Zebra + lightwalletd cached state.
      # Which cached state is being saved to the disk is defined by ${{ inputs.disk_prefix }}.
      #
      # Google Cloud doesn't have an atomic image replacement operation.
      # We don't want to delete and re-create the image, because that causes a ~5 minute
      # window where there might be no recent image. So we add an extra image with a unique name,
      # which gets selected because it has a later creation time.
      # This also simplifies the process of deleting old images,
      # because we don't have to worry about accidentally deleting all the images.
      #
      # The timestamp makes images from the same commit unique,
      # as long as they don't finish in the same second.
      # (This is unlikely, because each image created by a workflow has a different name.)
      #
      # The image name must also be 63 characters or less.
      # More info: https://cloud.google.com/compute/docs/naming-resources#resource-name-format
      #
      # Force the image creation (--force), because the disk is still attached even though it is
      # not being used by the container.
      - name: Create image from state disk
        run: |
          # Save a new image when any of these holds:
          # - UPDATE_SUFFIX is empty,
          # - the new sync height is more than CACHED_STATE_UPDATE_LIMIT blocks above the original height, or
          # - the caller set the force_save_to_disk input.
          MINIMUM_UPDATE_HEIGHT=$((ORIGINAL_HEIGHT+CACHED_STATE_UPDATE_LIMIT))
          if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]] || [[ "${{ inputs.force_save_to_disk }}" == "true" ]]; then

              # Use RUNNING_DB_VERSION for image naming (more reliable than STATE_VERSION)
              # Extract just the major version number for the image name
              IMAGE_VERSION_FOR_NAME=${RUNNING_DB_VERSION#v}  # Remove v prefix
              IMAGE_VERSION_FOR_NAME=${IMAGE_VERSION_FOR_NAME%%-*}  # Keep only major version (before first dash)

              # Validate that we have a version number
              if [[ -z $IMAGE_VERSION_FOR_NAME ]] || [[ ! $IMAGE_VERSION_FOR_NAME =~ ^[0-9]+$ ]]; then
                  echo "ERROR: Invalid version extracted for image naming: $IMAGE_VERSION_FOR_NAME"
                  echo "RUNNING_DB_VERSION was: $RUNNING_DB_VERSION"
                  echo "STATE_VERSION was: ${{ env.STATE_VERSION }}"
                  exit 1
              fi

              echo "Using version $IMAGE_VERSION_FOR_NAME for image naming (from RUNNING_DB_VERSION: $RUNNING_DB_VERSION)"

              # Build the image name once, so it can be checked before calling gcloud.
              IMAGE_NAME="${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${IMAGE_VERSION_FOR_NAME}-${NETWORK}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}"

              # Google Cloud resource names must be 63 characters or less.
              # Fail here with a clear message instead of relying on the gcloud error.
              if [[ ${#IMAGE_NAME} -gt 63 ]]; then
                  echo "ERROR: Image name is ${#IMAGE_NAME} characters long, but Google Cloud allows at most 63: $IMAGE_NAME"
                  exit 1
              fi

              # Create image from the test disk
              gcloud compute images create \
                  "$IMAGE_NAME" \
                  --force \
                  --source-disk="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}" \
                  --source-disk-zone="${{ vars.GCP_ZONE }}" \
                  --storage-location=us \
                  --description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }} and database format ${{ env.DB_VERSION_SUMMARY }}" \
                  --labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},branch=${{ env.GITHUB_REF_SLUG_URL }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${IMAGE_VERSION_FOR_NAME},state-running-version=${RUNNING_DB_VERSION},initial-state-disk-version=${INITIAL_DISK_DB_VERSION},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},force-save=${{ inputs.force_save_to_disk }},updated-from-height=${ORIGINAL_HEIGHT},updated-from-disk=${ORIGINAL_DISK_NAME},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
          else
              echo "Skipped cached state update because the new sync height $SYNC_HEIGHT was less than $CACHED_STATE_UPDATE_LIMIT blocks above the original height $ORIGINAL_HEIGHT of $ORIGINAL_DISK_NAME"
          fi

  # delete the Google Cloud instance for this test
  delete-instance:
    name: Delete ${{ inputs.test_id }} instance
    runs-on: ubuntu-latest
    needs: [ create-state-image ]
    # If a disk generation step times out (+6 hours), the previous job (creating the image) will be skipped.
    # Even if the instance continues running, no image will be created, so it's better to delete it.
    # `always()` makes this cleanup job run regardless of how the jobs it needs ended.
    if: always()
    # Cleanup is best-effort: a failure in this job must not fail the workflow run.
    continue-on-error: true
    permissions:
      contents: 'read'
      # Allows the job to request an OIDC token, used by the Google Cloud authentication step below.
      id-token: 'write'
    steps:
      - uses: actions/checkout@v4.2.2
        with:
          persist-credentials: false
          fetch-depth: '2'
      - uses: r7kamura/rust-problem-matchers@v1.5.0

      # Provides the GITHUB_REF_SLUG_URL and GITHUB_SHA_SHORT variables used to find the instance.
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v5
        with:
          short-length: 7

      # Setup gcloud CLI
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2.1.10
        with:
          workload_identity_provider: '${{ vars.GCP_WIF }}'
          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.4

      # Deletes the instance that was deployed for the current commit, after all
      # previous jobs have run, no matter the outcome of those jobs.
      - name: Delete test instance
        continue-on-error: true
        run: |
          # Quote the filter so the rendered value always reaches gcloud as a single argument.
          INSTANCE=$(gcloud compute instances list --filter="${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" --format='value(NAME)')
          if [ -z "${INSTANCE}" ]; then
            echo "No instance to delete"
          else
            # Also removes the disks attached to the instance; --quiet skips the confirmation prompt.
            gcloud compute instances delete "${INSTANCE}" --zone "${{ vars.GCP_ZONE }}" --delete-disks all --quiet
          fi