From 844ebf0dbd9d820511390d15670f229fdcfd3572 Mon Sep 17 00:00:00 2001 From: Gustavo Valverde Date: Wed, 16 Nov 2022 10:27:09 -0400 Subject: [PATCH] feat(ssh): enable OS Login for GCP test instances (#5602) * feat(ssh): enable OS Login for GCP test instances * fix(ssh): force service account impersonation for OS Login * debug: show actual user trying to impersonate SA * fix(glcloud): configure gcloud before running commands * fix(ssh): add VM zone to ssh command * fix(auth): bringing changes from #5614 * fix(auth): impersonation is working as expected now * fix(gcloud): setup the GCP CLI after authenticating (#5606) Previous behavior: `gcloud` commands have been running without appropriate authentication: the `auth` action was successfully executed, but the actual gcloud CLI being used in further jobs was not using the correct configuration or credentials. Expected behavior: All `gcloud` commands should be properly configured and authenticated. Solution: Add the `google-github-actions/setup-gcloud` action after each `google-github-actions/auth` invocation, and before running any `gcloud` command. 
Remove the need of an OAuth Access token when not required by following steps * fix(auth): revert to latest version * fix: wrong replace * fix(ci): use a specific debian image for VM containers * fix(ssh): delete generated SSH keys by CI after 30 seconds * debug: remove debug commands * fix(compute): use a lightweight container image * fix(ci): add missing sudo to docker command * Update .github/workflows/deploy-gcp-tests.yml Co-authored-by: Deirdre Connolly * fix(ssh): delete ssh-keys for the specific GHA service account Co-authored-by: Deirdre Connolly --- .github/workflows/deploy-gcp-tests.yml | 102 ++++++++++++------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 61344b93..ecd9f10f 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -145,11 +145,11 @@ jobs: --boot-disk-size 300GB \ --boot-disk-type pd-ssd \ --create-disk name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image debian-11 \ + --container-image gcr.io/google-containers/busybox \ --container-restart-policy=never \ --machine-type ${{ env.MACHINE_TYPE }} \ --scopes cloud-platform \ - --metadata=google-monitoring-enabled=true,google-logging-enabled=true \ + --metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE,enable-oslogin=TRUE \ --metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \ --tags ${{ inputs.app_name }} \ --zone ${{ env.ZONE }} @@ -160,10 +160,9 @@ jobs: # SSH into the just created VM, and create a docker volume with the newly created disk. 
- name: Create ${{ inputs.test_id }} Docker volume run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -211,10 +210,9 @@ jobs: # Launch the test without any cached state - name: Launch ${{ inputs.test_id }} test run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -366,11 +364,11 @@ jobs: --boot-disk-size 300GB \ --boot-disk-type pd-ssd \ --create-disk image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image debian-11 \ + --container-image gcr.io/google-containers/busybox \ --container-restart-policy=never \ --machine-type ${{ env.MACHINE_TYPE }} \ --scopes cloud-platform \ - --metadata=google-monitoring-enabled=true,google-logging-enabled=true \ + --metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE,enable-oslogin=TRUE \ --metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \ --tags ${{ inputs.app_name }} \ --zone ${{ env.ZONE }} @@ -383,10 +381,9 @@ jobs: # but the cached state can be smaller if we just increased the disk size.) 
- name: Create ${{ inputs.test_id }} Docker volume run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -452,10 +449,9 @@ jobs: # TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -502,10 +498,9 @@ jobs: # TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -568,10 +563,9 @@ jobs: # Errors in the tests are caught by the final test status job. 
- name: Show logs for ${{ inputs.test_id }} test (sprout) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -627,10 +621,9 @@ jobs: # Show recent logs, following until Canopy activation (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (heartwood) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -683,10 +676,9 @@ jobs: # Show recent logs, following until NU5 activation (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (canopy) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -741,10 +733,9 @@ jobs: # Show recent logs, following until block 1,740,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1740k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id 
}}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -801,10 +792,9 @@ jobs: # Show recent logs, following until block 1,760,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1760k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -861,10 +851,9 @@ jobs: # Show recent logs, following until block 1,780,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1780k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -922,10 +911,9 @@ jobs: # Show recent logs, following until block 1,800,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1800k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ 
@@ -982,10 +970,9 @@ jobs: # Show recent logs, following until block 1,820,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1820k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -1041,10 +1028,9 @@ jobs: # TODO: when doing obtain/extend tips, log the verifier in use, and check for full verification here - name: Show logs for ${{ inputs.test_id }} test (checkpoint) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -1111,10 +1097,9 @@ jobs: # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) 
- name: Result of ${{ inputs.test_id }} test run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -1237,15 +1222,14 @@ jobs: SYNC_HEIGHT="" DOCKER_LOGS=$( \ - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ --command=" \ - docker logs ${{ inputs.test_id }} --tail 200 \ + sudo docker logs ${{ inputs.test_id }} --tail 200 \ ") SYNC_HEIGHT=$( \ @@ -1376,3 +1360,15 @@ jobs: else gcloud compute instances delete "${INSTANCE}" --zone "${{ env.ZONE }}" --delete-disks all --quiet fi + + # Deletes SSH keys generated during this workflow run, as GCP has a limit of SSH keys + # that can exist at the same time in the OS Login metadata. Not deleting these keys + # could cause the following error: + # `Login profile size exceeds 32 KiB. Delete profile values to make additional space` + - name: Delete temporal SSH keys + continue-on-error: true + run: | + for i in $(gcloud compute os-login ssh-keys list --format="table[no-heading](value.fingerprint)" --impersonate-service-account=github-service-account@zealous-zebra.iam.gserviceaccount.com); do + echo "$i"; + gcloud compute os-login ssh-keys remove --key "$i" --impersonate-service-account=github-service-account@zealous-zebra.iam.gserviceaccount.com || true; + done