fix(ci): delete GCP resources, but keep some recent cached state images (#5082)
* Fix delete GCP resources commands
* Don't create a GCP credentials file
* Keep the latest 2 images
* Explain time
* Show the names of disks that are being deleted
* Actually run the image delete steps
* Only delete commit-based instance templates
* Document automated deletion
parent c081fd9873
commit fec012a006
@@ -1,10 +1,19 @@
 name: Delete GCP resources
 
 on:
+  # Run right before Teor's week starts (0500 in UTC+10)
   schedule:
-    - cron: "0 0 1 * *"
+    - cron: "0 19 * * 0"
   workflow_dispatch:
 
+env:
+  # Delete all resources created before $DELETE_AGE_DAYS days ago.
+  DELETE_AGE_DAYS: 7
+  # But keep the latest $KEEP_LATEST_IMAGE_COUNT images of each type.
+  #
+  # TODO: reduce this to 1 or 2 after "The resource is not ready" errors get fixed?
+  KEEP_LATEST_IMAGE_COUNT: 3
+
 jobs:
   delete-resources:
     name: Delete old GCP resources
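The two new environment variables drive every deletion step below: each step turns `$DELETE_AGE_DAYS` into a `YYYYMMDD` cutoff with GNU `date`, and gcloud's `--filter` compares that cutoff against each resource's `creationTimestamp`. A minimal sketch of that pattern, runnable outside the workflow (the printed date is just an example, and an authenticated gcloud CLI is assumed):

    #!/usr/bin/env bash
    # Sketch only: how the workflow builds its age cutoff and feeds it to gcloud.
    DELETE_AGE_DAYS=7

    # Prints e.g. "20220915" if today is 2022-09-22.
    DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
    echo "Deleting resources created before $DELETE_BEFORE_DATE"

    # List commit-named instance templates older than the cutoff,
    # using the same filter as the workflow step below.
    gcloud compute instance-templates list \
        --sort-by=creationTimestamp \
        --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" \
        --format='value(NAME)'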
@@ -13,6 +22,10 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+
       # Setup gcloud CLI
       - name: Authenticate to Google Cloud
         id: auth
@@ -23,40 +36,91 @@ jobs:
           service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
           token_format: 'access_token'
 
-      # Deletes all the instances template older than 30 days
+      # Deletes all the instance templates older than $DELETE_AGE_DAYS days.
       - name: Delete old instance templates
         run: |
-          TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)')
+          DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
+          TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
 
           for TEMPLATE in $TEMPLATES
           do
-              gcloud compute instance-templates delete ${TEMPLATE} --quiet || continue
+              gcloud compute instance-templates delete ${TEMPLATE} || continue
           done
 
-      # Deletes cached images older than 90 days
+      # Deletes all the disks older than $DELETE_AGE_DAYS days.
       #
-      # A search is done is done for each of this images:
-      # - Images created on Pull Requests older than 30 days
-      # - Images created on the `main` branch older than 60 days
-      # - Any other remaining image older than 90 days
-      # TODO: we should improve this approach and filter by disk type, and just keep the 2 latest images of each type (zebra checkpoint, zebra tip, lwd tip)
+      # Disks that are attached to an instance template can't be deleted, so it is safe to delete all disks here.
+      - name: Delete old disks
+        run: |
+          DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
+
+          # Disks created by PR jobs, and other jobs that use a commit hash
+          COMMIT_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
+
+          for DISK in $COMMIT_DISKS
+          do
+              gcloud compute disks delete --verbosity=info ${DISK} || continue
+          done
+
+          # Disks created by managed instance groups, and other jobs that start with "zebrad-"
+          ZEBRAD_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~^zebrad- AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
+
+          for DISK in $ZEBRAD_DISKS
+          do
+              gcloud compute disks delete --verbosity=info ${DISK} || continue
+          done
+
+      # Deletes cache images older than $DELETE_AGE_DAYS days.
+      #
+      # Keeps the latest $KEEP_LATEST_IMAGE_COUNT images of each type:
+      # - zebrad checkpoint cache
+      # - zebrad tip cache
+      # - lightwalletd + zebrad tip cache
+      #
+      # TODO: when we add testnet to the workflows, keep the latest $KEEP_LATEST_IMAGE_COUNT testnet images,
+      # and the latest $KEEP_LATEST_IMAGE_COUNT mainnet images.
       - name: Delete old cache disks
         run: |
-          PR_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-.+[0-9a-f]+-merge AND creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)')
-          for DISK in $PR_OLD_CACHE_DISKS
+          DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
+
+          ZEBRAD_CHECKPOINT_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-checkpoint AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
+          KEPT_IMAGES=0
+          for IMAGE in $ZEBRAD_CHECKPOINT_IMAGES
           do
-              gcloud compute image delete ${DISK} --quiet || continue
+              if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
+              then
+                  KEPT_IMAGES=$((KEPT_IMAGES+1))
+                  echo "Keeping image $KEPT_IMAGES named $IMAGE"
+                  continue
+              fi
+
+              gcloud compute images delete ${IMAGE} || continue
           done
 
-          MAIN_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-main AND creationTimestamp < $(date --date='60 days ago' '+%Y%m%d')" --format='value(NAME)')
-          for DISK in $MAIN_OLD_CACHE_DISKS
+          ZEBRAD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
+          KEPT_IMAGES=0
+          for IMAGE in $ZEBRAD_TIP_IMAGES
           do
-              gcloud compute image delete ${DISK} --quiet || continue
+              if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
+              then
+                  KEPT_IMAGES=$((KEPT_IMAGES+1))
+                  echo "Keeping image $KEPT_IMAGES named $IMAGE"
+                  continue
+              fi
+
+              gcloud compute images delete ${IMAGE} || continue
           done
 
-          ALL_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache- AND creationTimestamp < $(date --date='90 days ago' '+%Y%m%d')" --format='value(NAME)')
-          for DISK in $ALL_OLD_CACHE_DISKS
+          LWD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^lwd-cache-.*net-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
+          KEPT_IMAGES=0
+          for IMAGE in $LWD_TIP_IMAGES
           do
-              gcloud compute image delete ${DISK} --quiet || continue
+              if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
+              then
+                  KEPT_IMAGES=$((KEPT_IMAGES+1))
+                  echo "Keeping image $KEPT_IMAGES named $IMAGE"
+                  continue
+              fi
+
+              gcloud compute images delete ${IMAGE} || continue
           done
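The three image-cleanup loops above share one keep-latest pattern: list images newest-first (the `~` in `--sort-by=~creationTimestamp` reverses the sort), keep the first `$KEEP_LATEST_IMAGE_COUNT` names, and delete the rest. A sketch that factors the repeated loop into a single helper; the function name is hypothetical and not part of the workflow, but the filters and counters mirror the steps above:

    #!/usr/bin/env bash
    # Sketch only: the keep-latest-N pattern used for each cached image type.
    DELETE_AGE_DAYS=7
    KEEP_LATEST_IMAGE_COUNT=3
    DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')

    # Hypothetical helper: delete images matching a name filter, keeping the newest few.
    delete_images_keeping_latest() {
        local name_filter="$1"

        # Newest images first, so the kept images are the most recent ones.
        local images
        images=$(gcloud compute images list \
            --sort-by=~creationTimestamp \
            --filter="name~${name_filter} AND creationTimestamp < $DELETE_BEFORE_DATE" \
            --format='value(NAME)')

        local kept=0
        for image in $images
        do
            if [[ "$kept" -lt "$KEEP_LATEST_IMAGE_COUNT" ]]; then
                kept=$((kept+1))
                echo "Keeping image $kept named $image"
                continue
            fi
            gcloud compute images delete "$image" || continue
        done
    }

    # Same three image types as the workflow steps above.
    delete_images_keeping_latest '^zebrad-cache-.*net-checkpoint'
    delete_images_keeping_latest '^zebrad-cache-.*net-tip'
    delete_images_keeping_latest '^lwd-cache-.*net-tip'

The workflow keeps the loops inline rather than in a function, but the behaviour is the same: only images older than the cutoff are considered at all, and the newest `$KEEP_LATEST_IMAGE_COUNT` of those are kept.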
@@ -20,6 +20,26 @@ any branch and commit, as long as the state version is the same.
 Zebra also does [a smaller set of tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-os.yml) on tier 2 platforms using GitHub actions runners.
 
+
+## Manually Using Google Cloud
+
+Some Zebra developers have access to the Zcash Foundation's Google Cloud instance, which also runs our automatic CI.
+
+Please shut down large instances when they are not being used.
+
+### Automated Deletion
+
+The [Delete GCP Resources](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/delete-gcp-resources.yml)
+workflow automatically deletes instance templates, disks, and images older than 1 week.
+
+Running instances and their disks are protected from deletion.
+
+If you want to keep instance templates, disks, or images in Google Cloud, name them so they don't match the automated names:
+- deleted instance templates and disks end in a commit hash, so use a name ending in `-` or `-[^0-9a-f]+`
+- deleted images start with `zebrad-cache` or `lwd-cache`, so use a name starting with anything else
+
+Our other Google Cloud projects don't have automated deletion, so you can also use them for experiments or production deployments.
+
+
 ## Troubleshooting
 
 To improve CI performance, some Docker tests are stateful.
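To check whether a resource name will survive the automated deletion documented above, you can test it against the same patterns the workflow uses. A small sketch covering the two documented naming rules; the example names are hypothetical:

    #!/usr/bin/env bash
    # Sketch only: test hypothetical resource names against the deletion patterns above.
    # Note: the workflow also deletes disks whose names start with "zebrad-".

    is_deleted_name() {
        local name="$1"
        # Instance templates and disks: deleted if the name ends in a commit hash.
        if [[ "$name" =~ -[0-9a-f]+$ ]]; then
            echo "$name: ends in a commit hash, would be deleted"
            return
        fi
        # Images: cleaned up if the name starts with a cache prefix
        # (the latest $KEEP_LATEST_IMAGE_COUNT of each type are kept).
        if [[ "$name" =~ ^(zebrad|lwd)-cache ]]; then
            echo "$name: matches a cache image prefix, subject to keep-latest cleanup"
            return
        fi
        echo "$name: does not match the automated patterns, would be kept"
    }

    is_deleted_name "zebrad-prs-main-abcdef0"        # hypothetical: commit-hash suffix
    is_deleted_name "zebrad-cache-mainnet-tip"       # hypothetical: cache image prefix
    is_deleted_name "my-longrunning-test-template"   # hypothetical: safe name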