From fec012a006d9841d095b10ff8649c637959994f9 Mon Sep 17 00:00:00 2001 From: teor Date: Tue, 6 Sep 2022 12:51:46 +1000 Subject: [PATCH] fix(ci): delete GCP resources, but keep some recent cached state images (#5082) * Fix delete GCP resources commands * Don't create a GCP credentials file * Keep the latest 2 images * Explain time * Show the names of disks that are being deleted * Actually run the image delete steps * Only delete commit-based instance templates * Document automated deletion --- .github/workflows/delete-gcp-resources.yml | 106 +++++++++++++++++---- book/src/dev/continuous-integration.md | 20 ++++ 2 files changed, 105 insertions(+), 21 deletions(-) diff --git a/.github/workflows/delete-gcp-resources.yml b/.github/workflows/delete-gcp-resources.yml index 704e7e03..2693bac1 100644 --- a/.github/workflows/delete-gcp-resources.yml +++ b/.github/workflows/delete-gcp-resources.yml @@ -1,10 +1,19 @@ name: Delete GCP resources on: + # Run right before Teor's week starts (Sunday 1900 UTC is Monday 0500 in UTC+10) schedule: - - cron: "0 0 1 * *" + - cron: "0 19 * * 0" workflow_dispatch: +env: + # Delete all resources created before $DELETE_AGE_DAYS days ago. + DELETE_AGE_DAYS: 7 + # But keep the latest $KEEP_LATEST_IMAGE_COUNT images of each type. + # + # TODO: reduce this to 1 or 2 after "The resource is not ready" errors get fixed? + KEEP_LATEST_IMAGE_COUNT: 3 + jobs: delete-resources: name: Delete old GCP resources @@ -13,6 +22,10 @@ jobs: contents: 'read' id-token: 'write' steps: + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + # Setup gcloud CLI - name: Authenticate to Google Cloud id: auth @@ -23,40 +36,91 @@ jobs: service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' token_format: 'access_token' - # Deletes all the instances template older than 30 days + # Deletes the commit-based instance templates (names ending in a commit hash) older than $DELETE_AGE_DAYS days. 
- name: Delete old instance templates run: | - TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)') + DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d') + TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)') for TEMPLATE in $TEMPLATES do - gcloud compute instance-templates delete ${TEMPLATE} --quiet || continue + gcloud compute instance-templates delete ${TEMPLATE} || continue done - # Deletes cached images older than 90 days + # Deletes the commit-based disks and the "zebrad-" disks older than $DELETE_AGE_DAYS days. # - # A search is done is done for each of this images: - # - Images created on Pull Requests older than 30 days - # - Images created on the `main` branch older than 60 days - # - Any other remaining image older than 90 days - # TODO: we should improve this approach and filter by disk type, and just keep the 2 latest images of each type (zebra checkpoint, zebra tip, lwd tip) + # Disks that are attached to an instance template can't be deleted, so it is safe to try to delete every matching disk here. 
+ - name: Delete old disks + run: | + DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d') + + # Disks created by PR jobs, and other jobs that use a commit hash + COMMIT_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)') + + for DISK in $COMMIT_DISKS + do + gcloud compute disks delete --verbosity=info ${DISK} || continue + done + + # Disks created by managed instance groups, and other jobs that start with "zebrad-" + ZEBRAD_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~^zebrad- AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)') + + for DISK in $ZEBRAD_DISKS + do + gcloud compute disks delete --verbosity=info ${DISK} || continue + done + + # Deletes cache images older than $DELETE_AGE_DAYS days. + # + # Keeps the latest $KEEP_LATEST_IMAGE_COUNT images of each type: + # - zebrad checkpoint cache + # - zebrad tip cache + # - lightwalletd + zebrad tip cache + # + # TODO: when we add testnet to the workflows, keep the latest $KEEP_LATEST_IMAGE_COUNT testnet images, + # and the latest $KEEP_LATEST_IMAGE_COUNT mainnet images. 
- name: Delete old cache disks run: | - PR_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-.+[0-9a-f]+-merge AND creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)') - for DISK in $PR_OLD_CACHE_DISKS + DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d') + + ZEBRAD_CHECKPOINT_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-checkpoint AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)') + KEPT_IMAGES=0 + for IMAGE in $ZEBRAD_CHECKPOINT_IMAGES do - gcloud compute image delete ${DISK} --quiet || continue + if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]]; + then + KEPT_IMAGES=$((KEPT_IMAGES+1)) + echo "Keeping image $KEPT_IMAGES named $IMAGE" + continue + fi + + gcloud compute images delete ${IMAGE} || continue done - MAIN_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-main AND creationTimestamp < $(date --date='60 days ago' '+%Y%m%d')" --format='value(NAME)') - for DISK in $MAIN_OLD_CACHE_DISKS + ZEBRAD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)') + KEPT_IMAGES=0 + for IMAGE in $ZEBRAD_TIP_IMAGES do - gcloud compute image delete ${DISK} --quiet || continue + if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]]; + then + KEPT_IMAGES=$((KEPT_IMAGES+1)) + echo "Keeping image $KEPT_IMAGES named $IMAGE" + continue + fi + + gcloud compute images delete ${IMAGE} || continue done - - - ALL_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache- AND creationTimestamp < $(date --date='90 days ago' '+%Y%m%d')" --format='value(NAME)') - for DISK in $ALL_OLD_CACHE_DISKS + + LWD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^lwd-cache-.*net-tip AND 
creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)') + KEPT_IMAGES=0 + for IMAGE in $LWD_TIP_IMAGES do - gcloud compute image delete ${DISK} --quiet || continue + if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]]; + then + KEPT_IMAGES=$((KEPT_IMAGES+1)) + echo "Keeping image $KEPT_IMAGES named $IMAGE" + continue + fi + + gcloud compute images delete ${IMAGE} || continue done diff --git a/book/src/dev/continuous-integration.md b/book/src/dev/continuous-integration.md index 87576ca0..c0fbf284 100644 --- a/book/src/dev/continuous-integration.md +++ b/book/src/dev/continuous-integration.md @@ -20,6 +20,26 @@ any branch and commit, as long as the state version is the same. Zebra also does [a smaller set of tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-os.yml) on tier 2 platforms using GitHub actions runners. +## Manually Using Google Cloud + +Some Zebra developers have access to the Zcash Foundation's Google Cloud project, which also runs our automatic CI. + +Please shut down large instances when they are not being used. + +### Automated Deletion + +The [Delete GCP Resources](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/delete-gcp-resources.yml) +workflow automatically deletes instance templates, disks, and images older than 1 week, but it keeps a few of the most recent cached state images of each type. + +Running instances and their disks are protected from deletion. + +If you want to keep instance templates, disks, or images in Google Cloud, name them so they don't match the automated names: +- deleted instance templates and disks end in a commit hash, and deleted disks can also start with `zebrad-`, so use a name that ends in `-[^0-9a-f]+` and doesn't start with `zebrad-` +- deleted images start with `zebrad-cache` or `lwd-cache`, so use a name starting with anything else + +Our other Google Cloud projects don't have automated deletion, so you can also use them for experiments or production deployments. + + ## Troubleshooting To improve CI performance, some Docker tests are stateful.