refactor progress
This commit is contained in:
282
.github/workflows/test-cassandra-backup.yaml
vendored
282
.github/workflows/test-cassandra-backup.yaml
vendored
@@ -15,7 +15,7 @@ permissions:
|
||||
jobs:
|
||||
test-backup:
|
||||
name: Test latest Cassandra backup
|
||||
runs-on: blacksmith-2vcpu-ubuntu-2404
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404
|
||||
timeout-minutes: 45
|
||||
|
||||
env:
|
||||
@@ -32,275 +32,59 @@ jobs:
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Set temp paths
|
||||
run: |
|
||||
# Publish the scratch directory used by this job by appending a WORKDIR entry
# to the file named by GITHUB_ENV.
set -euo pipefail

# Stop with an explicit message when the runner did not provide RUNNER_TEMP.
: "${RUNNER_TEMP:?RUNNER_TEMP is not set}"

workdir_path="${RUNNER_TEMP}/cassandra-restore-test"
printf 'WORKDIR=%s\n' "${workdir_path}" >> "${GITHUB_ENV}"
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step set_temp_paths
|
||||
|
||||
- name: Pre-clean
|
||||
run: |
|
||||
# Remove containers, volumes and the scratch directory before the test starts.
# Every removal is best-effort: a failing command (for example because the
# object does not exist) is not treated as an error.
set -euo pipefail

# Run the given command with stderr discarded and its exit status ignored.
best_effort() {
  "$@" 2>/dev/null || true
}

best_effort docker rm -f "${CASS_CONTAINER}" "${UTIL_CONTAINER}"
best_effort docker volume rm "${CASS_VOLUME}"
best_effort docker volume rm "${BACKUP_VOLUME}"
best_effort rm -rf "${WORKDIR}"
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step pre_clean
|
||||
|
||||
- name: Install tools
|
||||
run: |
|
||||
# Install the command-line tools used by the following steps.
set -euo pipefail

# Packages requested from apt; recommended extras are skipped.
packages=(rclone age ca-certificates)

sudo apt-get update -y
sudo apt-get install -y --no-install-recommends "${packages[@]}"
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step install_tools
|
||||
|
||||
- name: Find latest backup, validate freshness, download, decrypt, extract into Docker volume
|
||||
env:
|
||||
B2_KEY_ID: ${{ secrets.B2_KEY_ID }}
|
||||
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
|
||||
AGE_PRIVATE_KEY: ${{ secrets.CASSANDRA_AGE_PRIVATE_KEY }}
|
||||
run: |
|
||||
# Locate the newest Cassandra backup in the bucket, check that it is recent,
# download and decrypt it, and unpack it into the ${BACKUP_VOLUME} Docker volume.
#
# Reads:  WORKDIR, BACKUP_VOLUME, CASSANDRA_IMAGE, B2_KEY_ID,
#         B2_APPLICATION_KEY, AGE_PRIVATE_KEY, GITHUB_ENV
# Writes: a LATEST_BACKUP entry appended to the GITHUB_ENV file,
#         ${WORKDIR}/backup.tar.age, and the contents of ${BACKUP_VOLUME}
# Exits non-zero when no backup is found, the name carries no valid timestamp,
# the backup is 3 hours old or older, or any download/decrypt/extract step fails.
set -euo pipefail

rm -rf "$WORKDIR"
mkdir -p "$WORKDIR"

# Define the "B2S3" rclone remote purely through environment variables.
export RCLONE_CONFIG_B2S3_TYPE=s3
export RCLONE_CONFIG_B2S3_PROVIDER=Other
export RCLONE_CONFIG_B2S3_ACCESS_KEY_ID="${B2_KEY_ID}"
export RCLONE_CONFIG_B2S3_SECRET_ACCESS_KEY="${B2_APPLICATION_KEY}"
export RCLONE_CONFIG_B2S3_ENDPOINT="https://s3.eu-central-003.backblazeb2.com"
export RCLONE_CONFIG_B2S3_REGION="eu-central-003"
export RCLONE_CONFIG_B2S3_FORCE_PATH_STYLE=true

# grep exits 1 when nothing matches; with pipefail + errexit that would abort
# the script here and the explicit "No backup found" check below could never
# run, so exit status 1 (and only 1) is tolerated.
# An ascending sort followed by `tail -n 1` selects the same line as
# `sort -r | head -n 1`, but no stage closes the pipe early, so `sort` cannot
# be killed by SIGPIPE on a long listing.
LATEST_BACKUP="$(
  rclone lsf "B2S3:fluxer" --recursive --files-only --fast-list \
    | { grep -E '(^|/)cassandra-backup-[0-9]{8}-[0-9]{6}\.tar\.age$' || [ "$?" -eq 1 ]; } \
    | sort \
    | tail -n 1
)"

if [ -z "${LATEST_BACKUP}" ]; then
  echo "Error: No backup found in bucket"
  exit 1
fi

echo "LATEST_BACKUP=${LATEST_BACKUP}" >> "$GITHUB_ENV"

# Reduce ".../cassandra-backup-YYYYMMDD-HHMMSS.tar.age" to "YYYYMMDD-HHMMSS".
base="$(basename "${LATEST_BACKUP}")"
ts="${base#cassandra-backup-}"
ts="${ts%.tar.age}"

if ! [[ "$ts" =~ ^[0-9]{8}-[0-9]{6}$ ]]; then
  echo "Error: Could not extract timestamp from backup filename: ${base}"
  exit 1
fi

# The timestamp is interpreted as UTC; the age is truncated to whole hours.
BACKUP_EPOCH="$(date -u -d "${ts:0:8} ${ts:9:2}:${ts:11:2}:${ts:13:2}" +%s)"
CURRENT_EPOCH="$(date -u +%s)"
AGE_HOURS=$(( (CURRENT_EPOCH - BACKUP_EPOCH) / 3600 ))

echo "Backup age: ${AGE_HOURS} hours"
if [ "${AGE_HOURS}" -ge 3 ]; then
  echo "Error: Latest backup is ${AGE_HOURS} hours old (threshold: 3 hours)"
  exit 1
fi

rclone copyto "B2S3:fluxer/${LATEST_BACKUP}" "${WORKDIR}/backup.tar.age" --fast-list

# Files created from here on are readable by the current user only, and the
# private key is removed again when this script exits (success or failure).
umask 077
trap 'rm -f -- "${WORKDIR}/age.key"' EXIT
printf '%s' "${AGE_PRIVATE_KEY}" > "${WORKDIR}/age.key"

docker volume create "${BACKUP_VOLUME}"

# Stream the decrypted tar straight into a container that unpacks it into the
# volume. The archive may either contain a top-level cassandra-backup-*
# directory or have schema.cql at its root; both layouts are flattened so that
# schema.cql ends up directly under /backup.
age -d -i "${WORKDIR}/age.key" "${WORKDIR}/backup.tar.age" \
  | docker run --rm -i \
      -v "${BACKUP_VOLUME}:/backup" \
      --entrypoint bash \
      "${CASSANDRA_IMAGE}" -lc '
        set -euo pipefail
        rm -rf /backup/*
        mkdir -p /backup/_tmp
        tar -C /backup/_tmp -xf -

        top="$(find /backup/_tmp -maxdepth 1 -mindepth 1 -type d -name "cassandra-backup-*" | head -n 1 || true)"

        if [ -n "$top" ] && [ -f "$top/schema.cql" ]; then
          cp -a "$top"/. /backup/
        elif [ -f /backup/_tmp/schema.cql ]; then
          cp -a /backup/_tmp/. /backup/
        else
          echo "Error: schema.cql not found after extraction"
          find /backup/_tmp -maxdepth 3 -type f -print | sed -n "1,80p" || true
          exit 1
        fi

        rm -rf /backup/_tmp
      '

# Sanity-check the extracted volume (mounted read-only) and print a short
# overview of its layout for the job log.
docker run --rm \
  -v "${BACKUP_VOLUME}:/backup:ro" \
  --entrypoint bash \
  "${CASSANDRA_IMAGE}" -lc '
    set -euo pipefail
    test -f /backup/schema.cql
    echo "Extracted backup layout (top 3 levels):"
    find /backup -maxdepth 3 -type d -print | sed -n "1,200p" || true
    echo "Sample SSTables (*Data.db):"
    find /backup -type f -name "*Data.db" | sed -n "1,30p" || true
  '
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step fetch_backup
|
||||
|
||||
- name: Create data volume
|
||||
run: |
|
||||
# Create the Docker volume named by CASS_VOLUME.
set -o errexit -o nounset -o pipefail

docker volume create "${CASS_VOLUME}"
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step create_data_volume
|
||||
|
||||
- name: Restore keyspaces into volume and promote snapshot SSTables
|
||||
run: |
|
||||
# Copy the backed-up keyspaces into the Cassandra data volume, then promote the
# newest snapshot of every table by copying its files into the table directory.
#
# Reads: UTIL_CONTAINER, CASS_VOLUME, BACKUP_VOLUME, CASSANDRA_IMAGE
# The work happens inside a throwaway container with the data volume mounted at
# /var/lib/cassandra and the backup volume mounted read-only at /backup.
# Exits non-zero when no keyspace was restored or no *Data.db file was promoted.
set -euo pipefail

docker run --rm \
  --name "${UTIL_CONTAINER}" \
  -v "${CASS_VOLUME}:/var/lib/cassandra" \
  -v "${BACKUP_VOLUME}:/backup:ro" \
  --entrypoint bash \
  "${CASSANDRA_IMAGE}" -lc '
    set -euo pipefail
    # Unmatched globs expand to nothing, so empty directories give empty
    # loops and empty arrays instead of literal patterns.
    shopt -s nullglob

    BASE=/var/lib/cassandra
    DATA_DIR="$BASE/data"
    mkdir -p "$DATA_DIR" "$BASE/commitlog" "$BASE/hints" "$BASE/saved_caches"

    # Descend into cassandra_data/ and data/ when the backup has those levels.
    ROOT=/backup
    if [ -d "$ROOT/cassandra_data" ]; then ROOT="$ROOT/cassandra_data"; fi
    if [ -d "$ROOT/data" ]; then ROOT="$ROOT/data"; fi

    echo "Using backup ROOT=$ROOT"
    echo "Restoring into DATA_DIR=$DATA_DIR"

    # Restore system_schema plus every keyspace whose name does not start
    # with "system".
    restored=0
    for keyspace_dir in "$ROOT"/*/; do
      [ -d "$keyspace_dir" ] || continue
      ks="$(basename "$keyspace_dir")"

      if [ "$ks" = "system_schema" ] || ! [[ "$ks" =~ ^system ]]; then
        echo "Restoring keyspace: $ks"
        rm -rf "$DATA_DIR/$ks"
        cp -a "$keyspace_dir" "$DATA_DIR/"
        restored=$((restored + 1))
      fi
    done

    if [ "$restored" -le 0 ]; then
      echo "Error: No keyspaces restored from backup root: $ROOT"
      echo "Debug: listing $ROOT:"
      ls -la "$ROOT" || true
      find "$ROOT" -maxdepth 2 -type d -print | sed -n "1,100p" || true
      exit 1
    fi

    promoted=0
    for ks_dir in "$DATA_DIR"/*/; do
      [ -d "$ks_dir" ] || continue
      ks="$(basename "$ks_dir")"

      if [ "$ks" != "system_schema" ] && [[ "$ks" =~ ^system ]]; then
        continue
      fi

      for table_dir in "$ks_dir"*/; do
        [ -d "$table_dir" ] || continue

        snap_root="${table_dir}snapshots"
        [ -d "$snap_root" ] || continue

        # Collect snapshot directories with a glob instead of parsing ls.
        # Under nullglob an unmatched pattern leaves ls with no operand, so it
        # would print "." and the current directory would then be treated as
        # the snapshot to copy from.
        snapshots=( "$snap_root"/*/ )
        [ "${#snapshots[@]}" -gt 0 ] || continue
        # Glob results are sorted ascending; the last entry sorts highest.
        latest_snap="${snapshots[${#snapshots[@]} - 1]}"

        files=( "$latest_snap"* )
        if [ "${#files[@]}" -gt 0 ]; then
          cp -av "${files[@]}" "$table_dir"
          # Count promoted SSTables from the glob itself; an unmatched pattern
          # contributes zero instead of a listing of the current directory.
          data_files=( "$latest_snap"*Data.db )
          promoted=$((promoted + ${#data_files[@]}))
        fi
      done
    done

    chown -R cassandra:cassandra "$BASE"

    echo "Promoted Data.db files: $promoted"
    if [ "$promoted" -le 0 ]; then
      echo "Error: No *Data.db files were promoted out of snapshots"
      echo "Debug: first snapshot dirs found:"
      find "$DATA_DIR" -type d -path "*/snapshots/*" | sed -n "1,50p" || true
      exit 1
    fi
  '
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step restore_keyspaces
|
||||
|
||||
- name: Start Cassandra
|
||||
run: |
|
||||
# Start Cassandra on the restored data volume and wait until it answers a CQL
# query.
#
# Reads: CASS_CONTAINER, CASS_VOLUME, MAX_HEAP_SIZE, HEAP_NEWSIZE, CASSANDRA_IMAGE
# Exits non-zero when the container stops running or never answers the probe;
# in both cases the tail of the container log is printed first.
set -euo pipefail

docker run -d \
  --name "${CASS_CONTAINER}" \
  -v "${CASS_VOLUME}:/var/lib/cassandra" \
  -e MAX_HEAP_SIZE="${MAX_HEAP_SIZE}" \
  -e HEAP_NEWSIZE="${HEAP_NEWSIZE}" \
  -e JVM_OPTS="-Dcassandra.disable_mlock=true" \
  "${CASSANDRA_IMAGE}"

# Succeeds once cqlsh inside the container can run a trivial query.
cassandra_accepts_cql() {
  docker exec "${CASS_CONTAINER}" cqlsh -e "SELECT now() FROM system.local;" >/dev/null 2>&1
}

# Up to 150 probes, 2 seconds apart.
for ((attempt = 1; attempt <= 150; attempt++)); do
  status="$(docker inspect -f '{{.State.Status}}' "${CASS_CONTAINER}" 2>/dev/null || true)"
  if [ "${status}" != "running" ]; then
    # The container died: show why before failing the step.
    docker inspect "${CASS_CONTAINER}" --format 'ExitCode={{.State.ExitCode}} OOMKilled={{.State.OOMKilled}} Error={{.State.Error}}' || true
    docker logs --tail 300 "${CASS_CONTAINER}" || true
    exit 1
  fi
  if cassandra_accepts_cql; then
    break
  fi
  sleep 2
done

# Final confirmation. When the loop above ran out of attempts this is where the
# step fails, so report the timeout and the container log instead of exiting
# without any output.
if ! cassandra_accepts_cql; then
  echo "Error: Cassandra did not answer CQL queries after 150 attempts"
  docker logs --tail 300 "${CASS_CONTAINER}" || true
  exit 1
fi
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step start_cassandra
|
||||
|
||||
- name: Verify data
|
||||
run: |
|
||||
# Check that the restored database holds data: fluxer.users must report a row
# count greater than zero. The query is retried up to 20 times, 2 seconds apart.
set -euo pipefail

# Print the first cqlsh output line that consists only of digits (surrounding
# whitespace allowed); print nothing when the query fails or no such line exists.
query_user_count() {
  docker exec "${CASS_CONTAINER}" cqlsh -e "SELECT COUNT(*) FROM fluxer.users;" 2>/dev/null \
    | awk '/^[[:space:]]*[0-9]+[[:space:]]*$/ {print $1; exit}' || true
}

USER_COUNT=""
attempt=1
while [ "${attempt}" -le 20 ]; do
  USER_COUNT="$(query_user_count)"
  [ -z "${USER_COUNT}" ] || break
  sleep 2
  attempt=$((attempt + 1))
done

# The count must be present and numerically greater than zero.
verified=0
if [ -n "${USER_COUNT}" ]; then
  if [ "${USER_COUNT}" -gt 0 ] 2>/dev/null; then
    verified=1
  fi
fi

if [ "${verified}" -ne 1 ]; then
  echo "Backup restore verification failed"
  docker logs --tail 300 "${CASS_CONTAINER}" || true
  exit 1
fi

echo "Backup restore verification passed"
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step verify_data
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
run: |
|
||||
# Remove the Cassandra container, both Docker volumes and the scratch directory.
# Every removal is best-effort: failures are silenced and ignored.
set -euo pipefail

docker rm -f "${CASS_CONTAINER}" 2>/dev/null || true
docker volume rm "${CASS_VOLUME}" 2>/dev/null || true
docker volume rm "${BACKUP_VOLUME}" 2>/dev/null || true

# WORKDIR only exists once the "Set temp paths" step has appended it to
# GITHUB_ENV. This step runs even after earlier failures, so tolerate an unset
# value instead of aborting under `set -u`.
if [ -n "${WORKDIR:-}" ]; then
  rm -rf -- "${WORKDIR}" 2>/dev/null || true
fi
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step cleanup
|
||||
|
||||
- name: Report status
|
||||
if: always()
|
||||
run: |
|
||||
# Print a one-line verdict for the tested backup. The job status expression is
# substituted into the script text before it runs; LATEST_BACKUP falls back to
# "unknown" when it is unset or empty.
set -euo pipefail

backup_label="${LATEST_BACKUP:-unknown}"

case "${{ job.status }}" in
  success)
    echo "Backup ${backup_label} is valid and restorable"
    ;;
  *)
    echo "Backup ${backup_label} test failed"
    ;;
esac
|
||||
env:
|
||||
JOB_STATUS: ${{ job.status }}
|
||||
run: >-
|
||||
python3 scripts/ci/workflows/test_cassandra_backup.py
|
||||
--step report_status
|
||||
|
||||
Reference in New Issue
Block a user