From 84b525f83ea3d42f37a49b4d78f5df26d1fc5167 Mon Sep 17 00:00:00 2001
From: bcj
Date: Mon, 8 Mar 2021 23:16:34 -0600
Subject: [PATCH] Add a script for pruning old backup files

Adds a pruning script which is installed but not set to run by default.
Also adds tests for that script that can be run in a container that
replicates the db container's conditions.
---
 README.md                                   |   5 +
 postgres-docker/Dockerfile                  |   1 +
 postgres-docker/cronfile                    |   5 +-
 postgres-docker/tests/Dockerfile            |   8 +
 postgres-docker/tests/docker-compose.yaml   |   9 +
 postgres-docker/tests/testing-entrypoint.sh | 426 ++++++++++++++++++++
 postgres-docker/weed.sh                     | 185 +++++++++
 7 files changed, 638 insertions(+), 1 deletion(-)
 create mode 100644 postgres-docker/tests/Dockerfile
 create mode 100644 postgres-docker/tests/docker-compose.yaml
 create mode 100644 postgres-docker/tests/testing-entrypoint.sh
 create mode 100755 postgres-docker/weed.sh

diff --git a/README.md b/README.md
index 19eb6538d..98ce9c766 100644
--- a/README.md
+++ b/README.md
@@ -209,6 +209,11 @@ Whenever a user interacts with a book, they are interacting with a specific edit
 Bookwyrm's db service dumps a backup copy of its database to its `/backups` directory daily at midnight UTC.
 Backups are named `backup__%Y-%m-%d.sql`.
 
+The db service has an optional script for periodically pruning the backups directory so that all recent daily backups are kept, while older backups are thinned to weekly and then monthly copies.
+To enable this script:
+- Uncomment the final line in `postgres-docker/cronfile`
+- Rebuild your instance: `docker-compose up --build`
+
 You can copy backups from the backups volume to your host machine with `docker cp`:
 - Run `docker-compose ps` to confirm the db service's full name (it's probably `bookwyrm_db_1`.
 - Run `docker cp :/backups
diff --git a/postgres-docker/Dockerfile b/postgres-docker/Dockerfile
index 23ec1af43..fd4ad409a 100644
--- a/postgres-docker/Dockerfile
+++ b/postgres-docker/Dockerfile
@@ -3,6 +3,7 @@ FROM postgres:latest
 # crontab
 RUN mkdir /backups
 COPY ./backup.sh /backups
+COPY ./weed.sh /backups
 COPY ./cronfile /etc/cron.d/cronfile
 RUN apt-get update && apt-get -y install cron
 RUN chmod 0644 /etc/cron.d/cronfile
diff --git a/postgres-docker/cronfile b/postgres-docker/cronfile
index d20708cd1..0ba34e46b 100644
--- a/postgres-docker/cronfile
+++ b/postgres-docker/cronfile
@@ -1,2 +1,5 @@
 0 0 * * * /backups/backup.sh
-
+# If uncommented, the job below will weed the backups directory daily at 1:00 UTC.
+# It will keep the 14 most recent daily backups, then one backup per week for the
+# next four weeks, then one backup per month after that.
+# 0 1 * * * /backups/weed.sh -d 14 -w 4 -m -1 /backups
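+#
+# (Illustrative example only, using the flags documented in weed.sh.) The thresholds
+# are adjustable; an instance that wanted a longer history could keep 30 daily,
+# 8 weekly, and 12 monthly backups before dropping to one backup per year:
+# 0 1 * * * /backups/weed.sh -d 30 -w 8 -m 12 /backups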
diff --git a/postgres-docker/tests/Dockerfile b/postgres-docker/tests/Dockerfile
new file mode 100644
index 000000000..a6f51e2c1
--- /dev/null
+++ b/postgres-docker/tests/Dockerfile
@@ -0,0 +1,8 @@
+FROM postgres:latest
+
+RUN apt update && apt install -y shellcheck
+
+COPY ./tests/testing-entrypoint.sh /testing-entrypoint.sh
+RUN chmod u+rx,go=r /testing-entrypoint.sh
+COPY ./weed.sh /weed.sh
+RUN chmod u+rx,go=r /weed.sh
\ No newline at end of file
diff --git a/postgres-docker/tests/docker-compose.yaml b/postgres-docker/tests/docker-compose.yaml
new file mode 100644
index 000000000..33377a7bb
--- /dev/null
+++ b/postgres-docker/tests/docker-compose.yaml
@@ -0,0 +1,9 @@
+version: "3"
+
+services:
+  weeding:
+    build:
+      # We need to build from the parent directory so we can access weed.sh
+      context: ..
+      dockerfile: ./tests/Dockerfile
+    entrypoint: /testing-entrypoint.sh
\ No newline at end of file
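(Illustrative note, not part of the patch: with the two files above in place, the
suite is meant to be run from the postgres-docker/tests directory, matching the
entrypoint script's own comment below. A minimal invocation would be:

    cd postgres-docker/tests
    docker-compose up --build

The container prints a per-test pass/fail line and a summary, then exits.)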
diff --git a/postgres-docker/tests/testing-entrypoint.sh b/postgres-docker/tests/testing-entrypoint.sh
new file mode 100644
index 000000000..480a5973a
--- /dev/null
+++ b/postgres-docker/tests/testing-entrypoint.sh
@@ -0,0 +1,426 @@
+#!/usr/bin/env bash
+# These tests are written to run in their own container, using the same image as the
+# actual postgres service. To run: `docker-compose up --build`
+set -euo pipefail

+source /weed.sh
+
+ERROR_COUNT=0
+FAILURE_COUNT=0
+
+# compare two sorted files
+function compare_files {
+    local expected="$1"
+    local actual="$2"
+
+    declare -a missing
+    local missing_index=0
+    declare -a extra
+    local extra_index=0
+
+    old_ifs="$IFS"
+    IFS=$'\n'
+    for line in $(diff --suppress-common-lines "$expected" "$actual"); do
+        if [[ $line =~ ^\< ]]; then
+            missing[missing_index]=${line:1}
+            missing_index=$((missing_index + 1))
+        elif [[ $line =~ ^\> ]]; then
+            extra[extra_index]=${line:1}
+            extra_index=$((extra_index + 1))
+        fi
+    done
+    IFS="$old_ifs"
+
+    if [[ $((missing_index + extra_index)) -gt 0 ]]; then
+        echo 'fail'
+
+        if [[ missing_index -gt 0 ]]; then
+            echo -e "\\t$missing_index missing files:"
+
+            for index in $(seq 0 $((missing_index - 1))); do
+                echo -e "\\t\\t${missing[index]}"
+            done
+        fi
+
+        if [[ extra_index -gt 0 ]]; then
+            echo -e "\\t$extra_index extra files:"
+
+            for index in $(seq 0 $((extra_index - 1))); do
+                echo -e "\\t\\t${extra[index]}"
+            done
+        fi
+
+        FAILURE_COUNT=$((FAILURE_COUNT + 1))
+
+        return 1
+    fi
+}
+
+# This is a wrapper function that handles creating a directory with test files in it,
+# running weed_directory (as the function, as a dry run, then finally actually deleting
+# files), marking the test as failed/errored as necessary, then cleaning up after
+# itself. The first three arguments are the thresholds to pass into weed_directory.
+# The remaining arguments are names of files to create for the test. Bash isn't great
+# at passing arrays so, instead of separately passing in a list of expected results,
+# flag the files you expect to be deleted by prepending "DELETE:" to the path.
+function perform_test {
+    echo "${FUNCNAME[1]}" | sed 's/^test_\(.*\)$/\1/' | tr '_\n' ' :'
+    echo -en '\t'
+
+    local daily_threshold="$1"
+    shift
+    local weekly_threshold="$1"
+    shift
+    local monthly_threshold="$1"
+    shift
+
+    # We might as well name the files we're using for running tests in as inflammatory
+    # a way as possible to increase the chances that bad filtering by weed_directory
+    # results in tests failing.
+    local expected="/testing/expected/backup__2020-02-02.sql"
+    local actual="/testing/backup__2020-02-02.sql.actual"
+    local remaining="/testing/remainbackup__2020-02-02.sql"
+    local temp="/testing/backup__2020-TE-MP.sql"
+
+    # create test files
+    mkdir -p /testing/expected
+    if [[ -e "$expected" ]]; then
+        rm "$expected"
+    fi
+    touch "$expected"
+    echo -e "$expected\\n$actual\\n$remaining\\n$temp" > "$remaining"
+    while [[ "$#" -gt 0 ]]; do
+        if [[ "$1" =~ ^DELETE: ]]; then
+            path="/testing/${1:7}"
+            echo "$path" >> "$expected"
+        else
+            path="/testing/$1"
+            echo "$path" >> "$remaining"
+        fi
+
+        directory=$(dirname "$path")
+        mkdir -p "$directory"
+        touch "$path"
+
+        shift
+    done
+    # We don't make any promise about the order files will be listed in by
+    # weed_directory (it is currently reverse-chronological). We should sort the
+    # output and the expected file instead of forcing tests to list files in that
+    # order (or causing tests to fail if weed_directory's order changes).
+    sort "$expected" > "$temp"
+    mv "$temp" "$expected"
+    sort "$remaining" > "$temp"
+    mv "$temp" "$remaining"
+
+    # Part one: call the function directly
+    set +e
+    (
+        weed_directory \
+            "/testing" \
+            "$daily_threshold" \
+            "$weekly_threshold" \
+            "$monthly_threshold" \
+            2> "$temp" \
+            | sort > "$actual"
+    )
+    local result="$?"
+    set -e
+
+    if [[ "$result" -ne 0 ]]; then
+        echo 'error'
+        ERROR_COUNT=$((ERROR_COUNT + 1))
+        if [[ -s "$temp" ]]; then
+            echo 'stderr:'
+            cat "$temp"
+        fi
+    else
+        set +e
+        compare_files "$expected" "$actual"
+        result="$?"
+        set -e
+
+        if [[ "$result" -eq 0 ]]; then
+            # Part two: as a script with the dry-run flag (-l)
+            set +e
+            (
+                "/weed.sh" \
+                    "-d" "$daily_threshold" \
+                    "-w" "$weekly_threshold" \
+                    "-m" "$monthly_threshold" \
+                    "-l" \
+                    "/testing" \
+                    2> "$temp" \
+                    | sort > "$actual"
+            )
+            local result="$?"
+            set -e
+
+            if [[ "$result" -ne 0 ]]; then
+                echo 'error'
+                ERROR_COUNT=$((ERROR_COUNT + 1))
+                if [[ -s "$temp" ]]; then
+                    echo 'stderr:'
+                    cat "$temp"
+                fi
+            else
+                set +e
+                compare_files "$expected" "$actual"
+                result="$?"
+                set -e
+
+                if [[ "$result" -eq 0 ]]; then
+                    # Part three: let's try actually deleting files
+                    set +e
+                    (
+                        "/weed.sh" \
+                            "-d" "$daily_threshold" \
+                            "-w" "$weekly_threshold" \
+                            "-m" "$monthly_threshold" \
+                            "/testing" \
+                            2> "$temp"
+                    )
+                    local result="$?"
+                    set -e
+
+                    if [[ "$result" -ne 0 ]]; then
+                        echo 'error'
+                        ERROR_COUNT=$((ERROR_COUNT + 1))
+                        if [[ -s "$temp" ]]; then
+                            echo 'stderr:'
+                            cat "$temp"
+                        fi
+                    else
+                        find /testing -type f | sort > "$actual"
+
+                        set +e
+                        compare_files "$remaining" "$actual"
+                        result="$?"
+                        set -e
+
+                        if [[ "$result" -eq 0 ]]; then
+                            echo 'pass'
+                        elif [[ -s "$temp" ]]; then
+                            echo 'stderr:'
+                            cat "$temp"
+                        fi
+                    fi
+                elif [[ -s "$temp" ]]; then
+                    echo 'stderr:'
+                    cat "$temp"
+                fi
+            fi
+        elif [[ -s "$temp" ]]; then
+            echo 'stderr:'
+            cat "$temp"
+        fi
+    fi
+    rm -rf /testing
+}
+
+# actual tests
+function test_shellcheck {
+    echo -en 'running shellcheck on scripts:\t'
+    shellcheck /weed.sh
+    # Test the tests too! Writing bash is hard
+    shellcheck -x /testing-entrypoint.sh
+    echo 'pass'
+}
+
+function test_empty_directory {
+    perform_test 1 2 3
+}
+
+function test_single_file {
+    perform_test 1 2 3 "backup__2021-02-02.sql"
+}
+
+function test_keep_everything {
+    perform_test -1 0 0 "backup__2021-02-02.sql" "backup__2021-02-01.sql" "backup__2021-01-31.sql"
+}
+
+function test_keep_one {
+    perform_test 1 0 0 "backup__2021-02-02.sql" "DELETE:backup__2021-02-01.sql" "DELETE:backup__2021-01-31.sql"
+}
+
+function test_weekly {
+    # weed.sh groups weekly backups with date's %W format, so Monday is treated as
+    # the first day of the week (as in ISO 8601).
+    # backup__2021-03-08.sql: Monday (keep)
+    # backup__2021-03-07.sql: Sunday (keep)
+    # backup__2021-02-28.sql: Sunday (keep)
+    # backup__2021-02-22.sql: Monday (delete)
+    # backup__2021-02-20.sql: Saturday (keep)
+    # backup__2021-02-16.sql: Tuesday (delete)
+    # backup__2021-02-15.sql: Monday (delete)
+    # backup__2021-02-14.sql: Sunday (keep)
+    # backup__2020-02-14.sql: Friday (same week of year, different year) (keep)
+    perform_test 0 -1 0 \
+        "backup__2021-03-08.sql" \
+        "backup__2021-03-07.sql" \
+        "backup__2021-02-28.sql" \
+        "DELETE:backup__2021-02-22.sql" \
+        "backup__2021-02-20.sql" \
+        "DELETE:backup__2021-02-16.sql" \
+        "DELETE:backup__2021-02-15.sql" \
+        "backup__2021-02-14.sql" \
+        "backup__2020-02-14.sql"
+}
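+
+# Illustrative aside (not an active test): the weekly grouping key above can be
+# inspected directly with date(1). 2021-02-22 (Monday) and 2021-02-28 (Sunday) map to
+# the same "%Y %W" bucket, which is why only the newer of the two survives:
+#     date --date=2021-02-22 +"%Y %W"    # -> 2021 08
+#     date --date=2021-02-28 +"%Y %W"    # -> 2021 08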
"DELETE:backup__2021-01-28.sql" \ + "DELETE:backup__2021-01-27.sql" \ + "DELETE:backup__2021-01-26.sql" \ + "DELETE:backup__2021-01-25.sql" \ + "DELETE:backup__2021-01-24.sql" \ + "DELETE:backup__2021-01-23.sql" \ + "DELETE:backup__2021-01-22.sql" \ + "DELETE:backup__2021-01-21.sql" \ + "DELETE:backup__2021-01-20.sql" \ + "DELETE:backup__2021-01-19.sql" \ + "DELETE:backup__2021-01-18.sql" \ + "DELETE:backup__2021-01-17.sql" \ + "DELETE:backup__2021-01-16.sql" \ + "DELETE:backup__2021-01-15.sql" \ + "DELETE:backup__2021-01-14.sql" \ + "DELETE:backup__2021-01-13.sql" \ + "DELETE:backup__2021-01-12.sql" \ + "DELETE:backup__2021-01-11.sql" \ + "DELETE:backup__2021-01-10.sql" \ + "DELETE:backup__2021-01-09.sql" \ + "DELETE:backup__2021-01-08.sql" \ + "DELETE:backup__2021-01-07.sql" \ + "DELETE:backup__2021-01-06.sql" \ + "DELETE:backup__2021-01-05.sql" \ + "DELETE:backup__2021-01-04.sql" \ + "DELETE:backup__2021-01-03.sql" \ + "DELETE:backup__2021-01-02.sql" \ + "DELETE:backup__2021-01-01.sql" \ + "backup__2020-12-31.sql" +} + +function tests { + # Run all functions named test_... in this file in definition order + count=0 + while read -r test; do + eval "$test" + count=$((count + 1)) + done < <(awk '$1 == "function" && $2 ~ "^test_" {print $2}' "${BASH_SOURCE[0]}") + + echo "------------------" + echo "$((count - ERROR_COUNT - FAILURE_COUNT))/$count tests passed" + if [[ $((FAILURE_COUNT + ERROR_COUNT)) -gt 0 ]]; then + if [[ "$ERROR_COUNT" -gt 0 ]]; then + echo "$ERROR_COUNT tests errored" + fi + + if [[ "$FAILURE_COUNT" -gt 0 ]]; then + echo "$FAILURE_COUNT tests failed" + fi + echo 'failure' + else + echo 'success' + fi +} + +if [ "${BASH_SOURCE[0]}" -ef "$0" ]; then + trap 'echo -e "\\terror (in ${FUNCNAME[1]} ${BASH_SOURCE[1]}:${BASH_LINENO[1]})\naborting"' EXIT + tests + trap - EXIT + + if [[ $((FAILURE_COUNT + ERROR_COUNT)) -gt 0 ]]; then + exit 1 + fi +fi \ No newline at end of file diff --git a/postgres-docker/weed.sh b/postgres-docker/weed.sh new file mode 100755 index 000000000..00163625e --- /dev/null +++ b/postgres-docker/weed.sh @@ -0,0 +1,185 @@ +#!/usr/bin/env bash +# Weed old backups. See HELP for details. +# Tests for this script can be found in: +# bookwyrm/postgres-docker/tests/testing-entrypoint.sh +set -euo pipefail + +DAILY_THRESHOLD=14 +WEEKLY_THRESHOLD=4 +MONTHLY_THRESHOLD=-1 + +HELP="\ +NAME + +weed -- remove old backups from the backups directory + +SYNOPSIS + +weed.sh [-d threshold] [-w threshold] [-m threshold] [-l] backup_directory + +DESCRIPTION + +Reduce the number of backups by only keeping a certain number of daily backups before \ +reducing the frequency to weekly, monthly, and then finaly annually. + +For each threshold, setting it to 0 will skip that frequency (e.g., setting weekly to \ +0 will mean backups go directly from daily to monthly), and setting it to -1 will \ +never reduce backups to a lower frequency (e.g., setting weekly to -1 will mean \ +backups never are reduced to monthly backups). + +-d threshold: Store this many daily backups before switching to weekly \ +(default $DAILY_THRESHOLD) + +-w threshold: Store this many weekly backups before switching to monthly \ +(default $WEEKLY_THRESHOLD) + +-m threshold: Store this many monthly backups before switching to annual \ +(default $MONTHLY_THRESHOLD) + +-l: Dry run. List the files that would be deleted. 
+" + +# fail +# Write a message to stderr then exit +function fail { + echo -e "weed: $1" >&2 + exit 1 +} + +# parse_threshold +# Thresholds should be a non-negative number (or -1 for no threshold) +function parse_threshold { + if [[ ! $1 =~ ^-?[0-9]+$ || $1 -lt -1 ]]; then + fail "Invalid threshold: $1" + fi + + echo "$1" +} + +# weed_directory +# List files to be deleted +function weed_directory { + local directory=$1 + local daily_threshold=$2 + local weekly_threshold=$3 + local monthly_threshold=$4 + + local count=0 + local thresholds=("$daily_threshold" "$weekly_threshold" "$monthly_threshold" -1) + local date_formats=("%Y %m %d" "%Y %W" "%Y %m" "%Y") + local index=0 + local last_date="" + local last_format="" + local date="" + + # We would like to loop through all the backup files in the backup directory in + # reverse-chronological order. Bookwyrm backup files are named such that + # chronological and lexical order match. So we should be safe to find all backup + # files and reverse sort them. We should be terrified of deleting a backup an + # instance maintainer wants to keep, so we will be extra cautious. We're ignoring + # any subdirectories in case someone moves an important backup into a meaningfully + # named folder. We are also prepending the date to the path before sorting so that + # the ordering would be correct even if we were allowed to find backup files in + # subdirectories where chronological and lexical order don't match. + for date_file in $( + find "$directory" \ + -maxdepth 1 \ + -name 'backup__[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\.sql' \ + | sed 's/\(^.*backup__\([0-9-]*\)\.sql$\)/\2\1/' \ + | sort --reverse + ); do + date="${date_file:0:10}" + file="${date_file:10}" + date="${date_file:0:10}" + file="${date_file:10}" + + # We can't fall off the end because we set annual backups to unlimited. It seems + # unlikely that instance maintainers would have enough concern about the space + # one backup/year takes to warrant supporting a cutoff. + while [[ ${thresholds[index]} -ne -1 && $count -ge ${thresholds[index]} ]]; do + index=$((index + 1)) + last_format="" + count=0 + done + + if [[ -z "$last_date" ]]; then + count=$((count + 1)) + last_date=$date + last_format="" + else + if [[ -z "$last_format" ]]; then + last_format=$(date --date="$last_date" +"${date_formats[index]}") + fi + + format=$(date --date="$date" +"${date_formats[index]}") + + if [[ "$format" == "$last_format" ]]; then + echo "$file" + else + count=$((count + 1)) + last_date="$date" + last_format="$format" + fi + fi + done +} + +function main(){ + local daily_threshold=$DAILY_THRESHOLD + local weekly_threshold=$WEEKLY_THRESHOLD + local monthly_threshold=$MONTHLY_THRESHOLD + local dry_run="" + + while getopts "hd:w:m:l" OPTION; do + case "$OPTION" in + h) + echo "$HELP"; + exit + ;; + d) + daily_threshold=$(parse_threshold "$OPTARG") + ;; + w) + weekly_threshold=$(parse_threshold "$OPTARG") + ;; + m) + monthly_threshold=$(parse_threshold "$OPTARG") + ;; + l) + dry_run="true" + ;; + :) + fail "Missing argument for '$OPTARG'. To see help run: weed.sh -h" + ;; + ?) + fail "Unknown option '$OPTION'. 
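+
+# Example (illustrative, not executed): preview what the default thresholds would
+# remove from the live backups directory without deleting anything, using the
+# dry-run flag documented above:
+#     /backups/weed.sh -d 14 -w 4 -m -1 -l /backups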
+
+function main() {
+    local daily_threshold=$DAILY_THRESHOLD
+    local weekly_threshold=$WEEKLY_THRESHOLD
+    local monthly_threshold=$MONTHLY_THRESHOLD
+    local dry_run=""
+
+    while getopts "hd:w:m:l" OPTION; do
+        case "$OPTION" in
+            h)
+                echo "$HELP"
+                exit
+                ;;
+            d)
+                daily_threshold=$(parse_threshold "$OPTARG")
+                ;;
+            w)
+                weekly_threshold=$(parse_threshold "$OPTARG")
+                ;;
+            m)
+                monthly_threshold=$(parse_threshold "$OPTARG")
+                ;;
+            l)
+                dry_run="true"
+                ;;
+            :)
+                fail "Missing argument for '$OPTARG'. To see help run: weed.sh -h"
+                ;;
+            ?)
+                fail "Unknown option '$OPTION'. To see help run: weed.sh -h"
+        esac
+    done
+    shift "$((OPTIND - 1))"
+
+    if [[ $# -ne 1 ]]; then
+        fail "expected a single argument, directory"
+    fi
+
+    local count=0
+    for file in $(weed_directory "$1" "$daily_threshold" "$weekly_threshold" "$monthly_threshold"); do
+        count=$((count + 1))
+        if [[ -n "$dry_run" ]]; then
+            echo "$file"
+        else
+            echo "deleting $file" >&2
+            rm "$file"
+        fi
+    done
+
+    if [[ -n "$dry_run" ]]; then
+        optional_words="would be "
+    else
+        optional_words=""
+    fi
+    echo -e "$count files ${optional_words}deleted" >&2
+}
+
+if [ "${BASH_SOURCE[0]}" -ef "$0" ]; then
+    main "$@"
+fi
\ No newline at end of file
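(Illustrative note, not part of the patch: because of the BASH_SOURCE guard at the end
of weed.sh, the script can also be sourced so that weed_directory is callable on its
own, which is how the tests above exercise it. Assuming the db container's /backups
layout, a manual spot-check would look like:

    source /backups/weed.sh
    weed_directory /backups 14 4 -1    # prints the files those thresholds would remove
)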