From 374fb7b34f1d7c72548256252d240b8de397b527 Mon Sep 17 00:00:00 2001 From: Gustavo Valverde Date: Thu, 26 May 2022 02:12:45 -0400 Subject: [PATCH] refactor(ci): allow more time for tests to end gracefully (#4469) * refactor(ci): keep tests jobs under the 6 hour timeout When running a full sync or any other test which takes almost 5 hours, having those jobs running with other actions that might take several minutes, also reduces the overall time from the job_id. We use a separate job for image creation and deletion to handle this cases. * fix(ci): instance deletion can't run on non finished tests * fix(ci): tests without a cached state might save to disk * fix(ci): ignore failures when deleting an instance * fix(ci): remove delete step `needs` redundancy Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- .github/workflows/deploy-gcp-tests.yml | 144 ++++++++++++++++--------- 1 file changed, 94 insertions(+), 50 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 2c709f0a..3371dfcb 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -132,53 +132,6 @@ jobs: --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }}" - - name: Get state version from constants.rs - run: | - LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" $GITHUB_WORKSPACE/zebra-state/src/constants.rs | grep -oE "[0-9]+" | tail -n1) - echo "STATE_VERSION: $LOCAL_STATE_VERSION" - - echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> $GITHUB_ENV - - - name: Get sync height from logs - run: | - SYNC_HEIGHT="" - - DOCKER_LOGS=$(\ - gcloud compute ssh \ - ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ - --zone ${{ env.ZONE }} \ - --quiet \ - --ssh-flag="-o 
ServerAliveInterval=5" \ - --command="docker logs ${{ inputs.test_id }} --tail 20") - - SYNC_HEIGHT=$(echo $DOCKER_LOGS | grep -oE '${{ inputs.height_grep_text }}\([0-9]+\)' | grep -oE '[0-9]+' | tail -1 || [[ $? == 1 ]]) - echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> $GITHUB_ENV - - # Create image from disk that will be used for following tests - # Force the image creation as the disk is still attached even though is not being used by the container - - name: Create image from state disk - if: ${{ inputs.saves_to_disk }} - run: | - gcloud compute images create ${{ inputs.disk_prefix }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }} \ - --force \ - --source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \ - --source-disk-zone=${{ env.ZONE }} \ - --storage-location=us \ - --description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }}" - - - name: Delete test instance - # If the disk generation step timeouts (+6 hours) the previous step (creating the image) will be skipped. - # Even if the instance continues running, no image will be created, so it's better to delete it. 
- if: always() - continue-on-error: true - run: | - INSTANCE=$(gcloud compute instances list --filter=${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} --format='value(NAME)') - if [ -z "${INSTANCE}" ]; then - echo "No instance to delete" - else - gcloud compute instances delete "${INSTANCE}" --zone "${{ env.ZONE }}" --delete-disks all --quiet - fi - test-with-cached-state: name: Run ${{ inputs.test_id }} test if: ${{ inputs.needs_zebra_state }} @@ -363,6 +316,72 @@ jobs: --mount type=volume,src=${{ inputs.disk_prefix }}-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ ${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }}" + create-state-image: + name: Create ${{ inputs.test_id }} cached state image + runs-on: ubuntu-latest + needs: [ test-without-cached-state, test-with-cached-state ] + if: ${{ inputs.saves_to_disk }} + permissions: + contents: 'read' + id-token: 'write' + steps: + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + with: + short-length: 7 + + # Disk images in GCP are required to be in lowercase, but the blockchain network + # uses sentence case, so we need to downcase ${{ inputs.network }} + # + # Passes ${{ inputs.network }} to subsequent steps using $NETWORK env variable + - name: Downcase network name for disks + run: | + NETWORK_CAPS=${{ inputs.network }} + echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV + + # Setup gcloud CLI + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v0.7.3 + with: + workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc' + service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' + token_format: 'access_token' + + # Get the state version from the local constants.rs file to be used in the image creation, + # as the state version is part of the disk 
image name. + # + # Passes the state version to subsequent steps using $STATE_VERSION env variable + - name: Get state version from constants.rs + run: | + LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" $GITHUB_WORKSPACE/zebra-state/src/constants.rs | grep -oE "[0-9]+" | tail -n1) + echo "STATE_VERSION: $LOCAL_STATE_VERSION" + + echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> $GITHUB_ENV + + # Get the sync height from the test logs, which is later used as part of the + # disk description. + # + # The regex used to grep the sync height is provided by ${{ inputs.height_grep_text }}, + # this allows dynamically changing the height as needed by different situations or + # based on the logs output from different tests + # + # Passes the sync height to subsequent steps using $SYNC_HEIGHT env variable + - name: Get sync height from logs + run: | + SYNC_HEIGHT="" + + DOCKER_LOGS=$(\ + gcloud compute ssh \ + ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ env.ZONE }} \ + --quiet \ + --ssh-flag="-o ServerAliveInterval=5" \ + --command="docker logs ${{ inputs.test_id }} --tail 20") + + SYNC_HEIGHT=$(echo $DOCKER_LOGS | grep -oE '${{ inputs.height_grep_text }}\([0-9]+\)' | grep -oE '[0-9]+' | tail -1 || [[ $? 
== 1 ]]) + echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> $GITHUB_ENV + # Create an image from disk that will be used for following/other tests # This image can contain: # - Zebra cached state @@ -372,7 +391,6 @@ jobs: # Force the image creation (--force) as the disk is still attached even though is not being # used by the container - name: Create image from state disk - if: ${{ inputs.saves_to_disk }} run: | gcloud compute images create ${{ inputs.disk_prefix }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }} \ --force \ @@ -381,9 +399,35 @@ jobs: --storage-location=us \ --description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }}" + delete-instance: + name: Delete ${{ inputs.test_id }} instance + runs-on: ubuntu-latest + needs: [ create-state-image ] + # If a disk generation step times out (+6 hours) the previous job (creating the image) will be skipped. + # Even if the instance continues running, no image will be created, so it's better to delete it. + if: always() + continue-on-error: true + permissions: + contents: 'read' + id-token: 'write' + steps: + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + with: + short-length: 7 + + # Setup gcloud CLI + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v0.7.3 + with: + workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc' + service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' + token_format: 'access_token' + + # Deletes the instance that was recently deployed for the current commit after all + # previous jobs have run, no matter the outcome of the job. 
- name: Delete test instance - # We don't want to leave a failed instance in GCP using resources - if: always() continue-on-error: true run: | INSTANCE=$(gcloud compute instances list --filter=${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} --format='value(NAME)')