From: Alan J. Pippin Date: Wed, 14 Jan 2009 07:17:39 +0000 (-0700) Subject: Added code to fallback to lists of snapshots to find common point X-Git-Url: http://git.pippins.net/images/%27%20%20%20%20.%20%24GLOBALS%5B%27phpgw%27%5D-%3Elink%28%27inc/jquery/.%24link.?a=commitdiff_plain;h=64e85d923d59ee7b95cb123f12bfa3750f42dc0f;p=zfs-nexenta%2F.git Added code to fallback to lists of snapshots to find common point if the backup markers can't be found. Fixed various bugs. --- diff --git a/zfs-autosnap b/zfs-autosnap index cfe80aa..da90906 100755 --- a/zfs-autosnap +++ b/zfs-autosnap @@ -50,7 +50,7 @@ filesystem=$1 mountpoint=${2-/$1} numsnapshots=${3-12} maxagedays=${4-0} -lockdir="/tmp/${filesystem}.lock" +lockdir="/tmp/zfs-admin-lock" pool=`echo "$filesystem" | awk -F '/' '{ print $1 }'` if [ -z "$filesystem" ] || [ -z "$mountpoint" ] || [ -z "$numsnapshots" ] || [ -z "$maxagedays" ]; then @@ -111,8 +111,8 @@ snapshot "${filesystem}@${datetime}" minutes=$(echo $datetime | datetime_to_minutes) -lockdir="/tmp/zfs-admin-lock" if ! mkdir "$lockdir" >/dev/null 2>&1; then + echo "-W- The zfs filesystem has been locked down. Skipping snapshot cleanup." exit 0 fi cleanup() { rm -rf "$lockdir"; } diff --git a/zfs-replicate b/zfs-replicate index 128d051..57680fc 100755 --- a/zfs-replicate +++ b/zfs-replicate @@ -4,13 +4,17 @@ remote=$1 remote_fs=$2 remote_pool=${2%%/*} - -# Set this variable to '1' to use the legacy, non-marker, snapshot diff, replicate script logic -use_legacy_replicate=0 +remote_lockdir="/tmp/zfs-admin-lock" # Set the name of the local pool used to store the backup of the remote local_pool=backups +# Set the email address to send notification to +mailto=alan@pippins.net + +# The ssh connection doesn't find zfs without this. 
+zfs=/usr/sbin/zfs + # Make sure we have valid arguments if [[ -z "$remote" ]] || [[ -z "$remote_fs" ]]; then echo "Usage: $0 " @@ -18,20 +22,26 @@ if [[ -z "$remote" ]] || [[ -z "$remote_fs" ]]; then fi # Make sure the local pool and local receiving filesystem exist, or print some errors -if ! zpool list -H "$local_pool" >/dev/null 2>&1; then +zpool list -H "$local_pool" >/dev/null 2>&1 +if [ $? != 0 ]; then echo >&2 "-E- The local pool, '$local_pool' doesn't seem to exist." exit 1 fi -if ! zfs list "$local_pool/$remote_pool" >/dev/null 2>&1; then - echo >&2 "-E- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist." - echo >&2 " You will need to create this filesystem before this script can replicate your data." - echo >&2 " You can create this filsystem by executing this command: 'zfs create $local_pool/$remote_pool'" - exit 1 +zfs list "$local_pool/$remote_pool" >/dev/null 2>&1 +if [ $? != 0 ]; then + echo >&2 "-I- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist." + echo >&2 " Creating the local filesystem to receive the remote pool into: $local_pool/$remote_pool" + $zfs create $local_pool/$remote_pool + if [ $? != 0 ]; then + echo "-E- remote $zfs create command failed" + exit 1 + fi fi # Obtain the zpool guid for the local pool local_pool_guid=`zpool get guid $local_pool 2>&1 | grep $local_pool | awk '{ print $3 }'` -if ! zpool get guid $local_pool > /dev/null 2>&1; then +zpool get guid $local_pool > /dev/null 2>&1 +if [ $? != 0 ]; then echo >&2 "-E- Unable to extract the guid for the local pool: $local_pool" exit 1 fi @@ -39,20 +49,46 @@ fi # Turn on shell verbosity set -x +# Create the remote lockdir before continuing with the replicate +# Spinlock on creating the lock +maxsleeptime=60 +maxattempts=100 +attempts=0 +while true; do + ssh $remote mkdir "$remote_lockdir" >/dev/null 2>&1 + if [ $? != 0 ]; then + # Another zfs admin tool is running. 
+ # Wait a random amount of time and try again + ransleep=$(($RANDOM % $maxsleeptime)) + sleep $ransleep + ((attempts=attempts+1)) + else + # No other zfs admin tool is running, we can proceed now. + break + fi + if [[ $attempts -gt $maxattempts ]]; then + # We've exceeded our maximum while loop count + echo "-W- The zfs filesystem has been locked down. Skipping replicate operation." + ssh $remote ls -ld $remote_lockdir | /usr/bin/mailx -s "zfs-replicate-all unable to obtain zfs admin lock" $mailto + exit 1 + fi +done + +# Declare a cleanup() method to remove the remote lockdir +cleanup() { ssh $remote rm -rf "$remote_lockdir"; } +trap cleanup EXIT + # Setup our backup marker names current_backup_marker=${remote_fs}@current-backup-${local_pool_guid} previous_backup_marker=${remote_fs}@previous-backup-${local_pool_guid} -# The ssh connection doesn't find zfs without this. -zfs=/usr/sbin/zfs - # List the snapshots on the remote machine. remote_list=$(mktemp /tmp/replicate.XXXXXX) ssh $remote \ $zfs list -H -t snapshot | grep ^${remote_fs}@ | awk '{print$1}' > $remote_list -if [[ $? != 0 ]]; then +if [ $? != 0 ]; then echo "-E- remote $zfs list command failed" exit 1 fi @@ -61,71 +97,61 @@ fi local_list=$(mktemp /tmp/replicate.XXXXXX) $zfs list -H -t snapshot | grep ^${local_pool}/${remote_fs}@ | - awk '{gsub(/^${local_pool}./,"",$1); print$1}' > $local_list -if [[ $? != 0 ]]; then + awk "{gsub(/^$local_pool./,\"\",\$1); print\$1}" > $local_list +if [ $? != 0 ]; then echo "-E- local $zfs list command failed" exit 1 fi -if [ $use_legacy_replicate == 0 ]; then - # Destroy the current backup marker snapshot on the remote system if it exists - grep -q ${current_backup_marker} $remote_list - if [ $? == 0 ]; then - ssh $remote $zfs destroy ${current_backup_marker} - if [[ $? 
!= 0 ]]; then - echo "-E- remote $zfs destroy command failed" - exit 1 - fi - fi - # Create the current backup marker snapshot on the remote system - ssh $remote $zfs snapshot ${current_backup_marker} - if [[ $? != 0 ]]; then - echo "-E- remote $zfs snapshot command failed" +# Destroy the current backup marker snapshot on the remote system if it exists +grep -q ${current_backup_marker} $remote_list +if [ $? == 0 ]; then + ssh $remote $zfs destroy ${current_backup_marker} + if [ $? != 0 ]; then + echo "-E- remote $zfs destroy command failed" exit 1 fi +fi - # Check to see if the previous backup marker exists in the remote snapshot list. - # Check to see if the previous backup marker exists in the local snapshot list. - # If the previous backup markers exists, perform an incremental replicate. - # Otherwise, perform a full replicate. - grep -q ${previous_backup_marker} $remote_list - full=$? - grep -q ${previous_backup_marker} $local_list - full=$(($full || $?)) - - if [[ $full == 0 ]]; then - ssh $remote $zfs send -R -I${previous_backup_marker} ${current_backup_marker} | - $zfs receive -vF -d ${local_pool}/${remote_fs%/*} - if [[ $? != 0 ]]; then - echo "-E- remote incremental $zfs send command failed" - exit 1 - fi - else - ssh $remote $zfs send -R ${current_backup_marker} | - $zfs receive -vF -d ${local_pool}/${remote_fs%/*} - if [[ $? != 0 ]]; then - echo "-E- remote full $zfs send command failed" - exit 1 - fi - fi - - # destroy the previous backup markers now that we've replicated past them - $zfs destroy ${local_pool}/${previous_backup_marker} > /dev/null 2>&1 - ssh $remote $zfs destroy ${previous_backup_marker} > /dev/null 2>&1 - - # Rename the current backup marker to be the previous backup marker - $zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker} - if [[ $? 
!= 0 ]; then - echo "-E- local $zfs rename command failed" +# Create the current backup marker snapshot on the remote system +ssh $remote $zfs snapshot ${current_backup_marker} +if [ $? != 0 ]; then + echo "-E- remote $zfs snapshot command failed" + exit 1 +fi + +# Check to see if the previous backup marker exists in the remote snapshot list. +# Check to see if the previous backup marker exists in the local snapshot list. +# If the previous backup markers exist, perform an incremental replicate. Else: +# 1) check to see if a common snapshot exists, and perform an incremental replicate. +# 2) if no common snapshot exists, destroy the local filesystem, and perform a full replicate. +grep -q ${previous_backup_marker} $remote_list +no_markers=$? +grep -q ${previous_backup_marker} $local_list +no_markers=$(($no_markers || $?)) + +if [ $no_markers == 0 ]; then + # We found backup markers, incrementally send the new snaps + + # First, roll back the local pool to the previous backup marker in case the previous + # backup was interrupted for some reason. If we don't do this, the zfs send -R command + # below may complain about snaps already existing as it tries to resend from the + # previous backup marker again from a previously interrupted replicate. + $zfs rollback -r ${local_pool}/${previous_backup_marker} + if [ $? != 0 ]; then + echo "-E- remote incremental $zfs rollback command failed" exit 1 fi - ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker} - if [[ $? != 0 ]]; then - echo "-E- remote $zfs rename command failed" + # Now it should be safe to send the snaps + ssh $remote $zfs send -R -I${previous_backup_marker} ${current_backup_marker} | + $zfs receive -vF -d ${local_pool}/${remote_pool} + if [ $? != 0 ]; then + echo "-E- remote incremental $zfs send command failed" exit 1 fi - else + # We didn't find any backup markers, next check to see if we have a common snapshot. + # See what the most recent snapshot on the remote end is. 
latest=$(tail -n 1 $remote_list) @@ -136,17 +162,53 @@ else echo bogus.local >> $local_list common=$(diff -u $remote_list $local_list | grep '^ ' | tail -n 1) - if [ -n "$common" ]; then - # We found a common snapshot - ssh $remote $zfs send -R -I${common/*@/@} $latest | - $zfs receive -vF -d ${local_pool}/${remote_fs%/*} + if [[ -n "$common" ]]; then + # We found a common snapshot, incrementally send the new snaps + ssh $remote $zfs send -R -I${common/*@/@} ${current_backup_marker} | + $zfs receive -vF -d ${local_pool}/${remote_pool} + if [ $? != 0 ]; then + echo "-E- remote incremental $zfs send command failed" + exit 1 + fi else - # We did not find a common snapshot, so send the entire filesystem - ssh $remote $zfs send -R $latest | - $zfs receive -vF -d ${local_pool}/${remote_fs%/*} + # We did not find any markers or a common snapshot + # At this point, we'll have to send the entire filesystem + # Destroy the local filesystem if it exists before receiving the full replicate + zfs list ${local_pool}/${remote_fs} > /dev/null 2>&1 + if [ $? == 0 ]; then + zfs destroy -r ${local_pool}/${remote_fs} + if [ $? != 0 ]; then + echo "-E- remote full $zfs destroy command failed" + exit 1 + fi + fi + # Send the full filesystem + ssh $remote $zfs send -R ${current_backup_marker} | + $zfs receive -vF -d ${local_pool}/${remote_pool} + if [ $? != 0 ]; then + echo "-E- remote full $zfs send command failed" + exit 1 + fi fi fi - + +# destroy the previous backup markers now that we've replicated past them +# don't check the return codes here because these may not exist, and that is ok +$zfs destroy ${local_pool}/${previous_backup_marker} > /dev/null 2>&1 +ssh $remote $zfs destroy ${previous_backup_marker} > /dev/null 2>&1 + +# Rename the current backup marker to be the previous backup marker +$zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker} +if [ $? 
!= 0 ]; then + echo "-E- local $zfs rename command failed" + exit 1 +fi +ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker} +if [ $? != 0 ]; then + echo "-E- remote $zfs rename command failed" + exit 1 +fi + # Remove tmp files -#rm -f $local_list $remote_list +rm -f $local_list $remote_list