fix(ci): handle disk mounting and logs reading edge-cases (#7690)

* fix: use `exit-nopipe` with consistent `shell` usage

Temporarily disabled the `set -e` option around the docker logs command to handle the broken pipe error gracefully.

Handle more complex scenarios in our `Result of ${{ inputs.test_id }} test` job

* fix: Use single quotes for the outer command

* fix: use same approach for CD

* test: check launch failure logs

* fix: revert CD changes

* fix: do not try to increase the disk size and wait mounting

* fix: increase GB a bit more

* fix: do not fail on pipe failure

* fix: use plain `tee /dev/stderr`

If this does not work try `(tee … || true)`

* fix: `tee` not stopping on cd config tests

* fix: match logic with GCP tests

* fix(cd): handle pipe and other errors correctly

* try `tee --output-error=exit-nopipe`

* fix: TRAP without pipefail

* test: pipefail with exit and trap

* fix: use a subshell

* fix(ci): wait for mounting and show system logs if fail

* fix(ci): GCP is not always mounting disks in the same order

* fix: use `grep` instead of `awk`

* fix: typo

* fix: use simpler `grep` command

* fix: do not sleep if not required

* chore: reduce diff
This commit is contained in:
Gustavo Valverde 2023-10-09 18:59:59 +01:00 committed by GitHub
parent a2b7859e8e
commit 8d0a17ee1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 162 additions and 81 deletions

View File

@ -29,7 +29,7 @@ on:
type: boolean type: boolean
default: false default: false
# Temporarily disabled to reduce network load, see #6894. # TODO: Temporarily disabled to reduce network load, see #6894.
#push: #push:
# branches: # branches:
# - main # - main
@ -132,29 +132,37 @@ jobs:
# Make sure Zebra can sync at least one full checkpoint on mainnet # Make sure Zebra can sync at least one full checkpoint on mainnet
- name: Run tests using the default config - name: Run tests using the default config
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
set -ex
docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
docker run --detach --name default-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker run --detach --name default-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
# show the logs, even if the job times out
docker logs --tail all --follow default-conf-tests | \ # Use a subshell to handle the broken pipe error gracefully
tee --output-error=exit /dev/stderr | \ (
grep --max-count=1 --extended-regexp --color=always \ trap "" PIPE;
'net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter' docker logs \
--tail all \
--follow \
default-conf-tests | \
tee --output-error=exit /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
-e "net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter"
) || true
LOGS_EXIT_STATUS=$?
docker stop default-conf-tests docker stop default-conf-tests
# get the exit status from docker
EXIT_STATUS=$( \ EXIT_STATUS=$(docker wait default-conf-tests || echo "Error retrieving exit status");
docker wait default-conf-tests || \ echo "docker exit status: $EXIT_STATUS";
docker inspect --format "{{.State.ExitCode}}" default-conf-tests || \
echo "missing container, or missing exit status for container" \ # If grep found the pattern, exit with the Docker container exit status
) if [ $LOGS_EXIT_STATUS -eq 0 ]; then
docker logs default-conf-tests exit $EXIT_STATUS;
echo "docker exit status: $EXIT_STATUS"
if [[ "$EXIT_STATUS" = "137" ]]; then
echo "ignoring expected signal status"
exit 0
fi fi
exit "$EXIT_STATUS"
# Handle other potential errors here
echo "An error occurred while processing the logs.";
exit 1;
# Test reconfiguring the docker image for testnet. # Test reconfiguring the docker image for testnet.
test-configuration-file-testnet: test-configuration-file-testnet:
@ -172,30 +180,37 @@ jobs:
# Make sure Zebra can sync the genesis block on testnet # Make sure Zebra can sync the genesis block on testnet
- name: Run tests using a testnet config - name: Run tests using a testnet config
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
set -ex
docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
docker run --env "NETWORK=Testnet" --detach --name testnet-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker run --env "NETWORK=Testnet" --detach --name testnet-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
# show the logs, even if the job times out # Use a subshell to handle the broken pipe error gracefully
docker logs --tail all --follow testnet-conf-tests | \ (
tee --output-error=exit /dev/stderr | \ trap "" PIPE;
grep --max-count=1 --extended-regexp --color=always \ docker logs \
-e 'net.*=.*Test.*estimated progress to chain tip.*Genesis' \ --tail all \
-e 'net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter' --follow \
testnet-conf-tests | \
tee --output-error=exit /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
-e "net.*=.*Test.*estimated progress to chain tip.*Genesis" \
-e "net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter";
) || true
LOGS_EXIT_STATUS=$?
docker stop testnet-conf-tests docker stop testnet-conf-tests
# get the exit status from docker
EXIT_STATUS=$( \ EXIT_STATUS=$(docker wait testnet-conf-tests || echo "Error retrieving exit status");
docker wait testnet-conf-tests || \ echo "docker exit status: $EXIT_STATUS";
docker inspect --format "{{.State.ExitCode}}" testnet-conf-tests || \
echo "missing container, or missing exit status for container" \ # If grep found the pattern, exit with the Docker container exit status
) if [ $LOGS_EXIT_STATUS -eq 0 ]; then
docker logs testnet-conf-tests exit $EXIT_STATUS;
echo "docker exit status: $EXIT_STATUS"
if [[ "$EXIT_STATUS" = "137" ]]; then
echo "ignoring expected signal status"
exit 0
fi fi
exit "$EXIT_STATUS"
# Handle other potential errors here
echo "An error occurred while processing the logs.";
exit 1;
# Deploy Managed Instance Groups (MiGs) for Mainnet and Testnet, # Deploy Managed Instance Groups (MiGs) for Mainnet and Testnet,
# with one node in the configured GCP region. # with one node in the configured GCP region.

View File

@ -183,39 +183,56 @@ jobs:
# Format the mounted disk if the test doesn't use a cached state. # Format the mounted disk if the test doesn't use a cached state.
- name: Format ${{ inputs.test_id }} volume - name: Format ${{ inputs.test_id }} volume
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \ --zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command \ --command=' \
"\ set -ex;
while sudo lsof /dev/sdb; do \ # Extract the correct disk name based on the device-name
echo 'Waiting for /dev/sdb to be free...'; \ export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \
sleep 10; \ sudo mkfs.ext4 -v /dev/$DISK_NAME \
done; \ '
sudo mkfs.ext4 -v /dev/sdb \
"
# Launch the test without any cached state # Launch the test without any cached state
- name: Launch ${{ inputs.test_id }} test - name: Launch ${{ inputs.test_id }} test
id: launch-test
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \ --zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command \ --command=' \
"\
sudo docker run \ sudo docker run \
--name ${{ inputs.test_id }} \ --name ${{ inputs.test_id }} \
--tty \ --tty \
--detach \ --detach \
${{ inputs.test_variables }} \ ${{ inputs.test_variables }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
" '
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/sdb;
sudo dmesg;
sudo journalctl -b \
'
# set up and launch the test, if it uses cached state # set up and launch the test, if it uses cached state
# each test runs one of the *-with/without-cached-state job series, and skips the other # each test runs one of the *-with/without-cached-state job series, and skips the other
@ -381,7 +398,6 @@ jobs:
--labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \ --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \
--tags ${{ inputs.app_name }} \ --tags ${{ inputs.app_name }} \
--zone ${{ vars.GCP_ZONE }} --zone ${{ vars.GCP_ZONE }}
sleep 60
# Launch the test with the previously created Zebra-only cached state. # Launch the test with the previously created Zebra-only cached state.
# Each test runs one of the "Launch test" steps, and skips the other. # Each test runs one of the "Launch test" steps, and skips the other.
@ -405,22 +421,43 @@ jobs:
# lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
# TODO: we should find a better logic for this use cases # TODO: we should find a better logic for this use cases
if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \ --zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command \ --command=' \
"\ set -ex;
# Extract the correct disk name based on the device-name
export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \
sudo docker run \ sudo docker run \
--name ${{ inputs.test_id }} \ --name ${{ inputs.test_id }} \
--tty \ --tty \
--detach \ --detach \
${{ inputs.test_variables }} \ ${{ inputs.test_variables }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
" '
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() && (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/$DISK_NAME;
sudo dmesg;
sudo journalctl -b \
'
# Launch the test with the previously created Lightwalletd and Zebra cached state. # Launch the test with the previously created Lightwalletd and Zebra cached state.
# Each test runs one of the "Launch test" steps, and skips the other. # Each test runs one of the "Launch test" steps, and skips the other.
@ -455,23 +492,44 @@ jobs:
# lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
# TODO: we should find a better logic for this use cases # TODO: we should find a better logic for this use cases
if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \ --zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command \ --command=' \
"\ set -ex;
# Extract the correct disk name based on the device-name
export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \
sudo docker run \ sudo docker run \
--name ${{ inputs.test_id }} \ --name ${{ inputs.test_id }} \
--tty \ --tty \
--detach \ --detach \
${{ inputs.test_variables }} \ ${{ inputs.test_variables }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \
${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
" '
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() && (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/$DISK_NAME;
sudo dmesg;
sudo journalctl -b \
'
# Show all the test logs, then follow the logs of the test we just launched, until it finishes. # Show all the test logs, then follow the logs of the test we just launched, until it finishes.
# Then check the result of the test. # Then check the result of the test.
@ -538,23 +596,23 @@ jobs:
# #
# Errors in the tests are caught by the final test status job. # Errors in the tests are caught by the final test status job.
- name: Check startup logs for ${{ inputs.test_id }} - name: Check startup logs for ${{ inputs.test_id }}
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \ --zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command \ --command=' \
"\
sudo docker logs \ sudo docker logs \
--tail all \ --tail all \
--follow \ --follow \
${{ inputs.test_id }} | \ ${{ inputs.test_id }} | \
head -700 | \ head -700 | \
tee --output-error=exit /dev/stderr | \ tee --output-error=exit-nopipe /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \ grep --max-count=1 --extended-regexp --color=always \
-e 'Zcash network: ${{ inputs.network }}' \ -e "Zcash network: ${{ inputs.network }}" \
" '
# Check that the container executed at least 1 Rust test harness test, and that all tests passed. # Check that the container executed at least 1 Rust test harness test, and that all tests passed.
# Then wait for the container to finish, and exit with the test's exit status. # Then wait for the container to finish, and exit with the test's exit status.
@ -567,6 +625,7 @@ jobs:
# with that status. # with that status.
# (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
- name: Result of ${{ inputs.test_id }} test - name: Result of ${{ inputs.test_id }} test
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \ --zone ${{ vars.GCP_ZONE }} \
@ -574,26 +633,31 @@ jobs:
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command=' \ --command=' \
set -e; trap "" PIPE;
set -o pipefail;
trap '' PIPE;
# Temporarily disable "set -e" to handle the broken pipe error gracefully
set +e;
sudo docker logs \ sudo docker logs \
--tail all \ --tail all \
--follow \ --follow \
${{ inputs.test_id }} | \ ${{ inputs.test_id }} | \
tee --output-error=exit /dev/stderr | \ tee --output-error=exit-nopipe /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \ grep --max-count=1 --extended-regexp --color=always \
"test result: .*ok.* [1-9][0-9]* passed.*finished in"; \ "test result: .*ok.* [1-9][0-9]* passed.*finished in";
LOGS_EXIT_STATUS=$?;
set -e;
EXIT_STATUS=$( \ EXIT_STATUS=$(sudo docker wait ${{ inputs.test_id }} || echo "Error retrieving exit status");
sudo docker wait ${{ inputs.test_id }} || \ echo "sudo docker exit status: $EXIT_STATUS";
sudo docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \
echo "missing container, or missing exit status for container" \
); \
echo "sudo docker exit status: $EXIT_STATUS"; \ # If grep found the pattern, exit with the Docker container"s exit status
exit "$EXIT_STATUS" \ if [ $LOGS_EXIT_STATUS -eq 0 ]; then
exit $EXIT_STATUS;
fi
# Handle other potential errors here
echo "An error occurred while processing the logs.";
exit 1; \
' '
# create a state image from the instance's state disk, if requested by the caller # create a state image from the instance's state disk, if requested by the caller
@ -707,6 +771,7 @@ jobs:
# Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION, # Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
# $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables. # $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
- name: Get database versions from logs - name: Get database versions from logs
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
INITIAL_DISK_DB_VERSION="" INITIAL_DISK_DB_VERSION=""
RUNNING_DB_VERSION="" RUNNING_DB_VERSION=""
@ -718,9 +783,9 @@ jobs:
--ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command=" \ --command=' \
sudo docker logs ${{ inputs.test_id }} | head -1000 \ sudo docker logs ${{ inputs.test_id }} | head -1000 \
") ')
# either a semantic version or "creating new database" # either a semantic version or "creating new database"
INITIAL_DISK_DB_VERSION=$( \ INITIAL_DISK_DB_VERSION=$( \
@ -796,6 +861,7 @@ jobs:
# #
# Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable. # Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
- name: Get sync height from logs - name: Get sync height from logs
shell: /usr/bin/bash -exo pipefail {0}
run: | run: |
SYNC_HEIGHT="" SYNC_HEIGHT=""
@ -805,9 +871,9 @@ jobs:
--ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \ --ssh-flag="-o ConnectTimeout=5" \
--command=" \ --command=' \
sudo docker logs ${{ inputs.test_id }} --tail 200 \ sudo docker logs ${{ inputs.test_id }} --tail 200 \
") ')
SYNC_HEIGHT=$( \ SYNC_HEIGHT=$( \
echo "$DOCKER_LOGS" | \ echo "$DOCKER_LOGS" | \