From 64e85d923d59ee7b95cb123f12bfa3750f42dc0f Mon Sep 17 00:00:00 2001
From: "Alan J. Pippin" <ajp@pippins.net>
Date: Wed, 14 Jan 2009 00:17:39 -0700
Subject: [PATCH] Added code to fall back to lists of snapshots to find common
 point if the backup markers can't be found. Fixed various bugs.

---
 zfs-autosnap  |   4 +-
 zfs-replicate | 214 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 140 insertions(+), 78 deletions(-)

diff --git a/zfs-autosnap b/zfs-autosnap
index cfe80aa..da90906 100755
--- a/zfs-autosnap
+++ b/zfs-autosnap
@@ -50,7 +50,7 @@ filesystem=$1
 mountpoint=${2-/$1}
 numsnapshots=${3-12}
 maxagedays=${4-0}
-lockdir="/tmp/${filesystem}.lock"
+lockdir="/tmp/zfs-admin-lock"
 pool=`echo "$filesystem" | awk -F '/' '{ print $1 }'`
 
 if [ -z "$filesystem" ] || [ -z "$mountpoint" ] || [ -z "$numsnapshots" ] || [ -z "$maxagedays" ]; then
@@ -111,8 +111,8 @@ snapshot "${filesystem}@${datetime}"
 
 minutes=$(echo $datetime | datetime_to_minutes)
 
-lockdir="/tmp/zfs-admin-lock"
 if ! mkdir "$lockdir" >/dev/null 2>&1; then
+  echo "-W- The zfs filesystem has been locked down. Skipping snapshot cleanup."
   exit 0
 fi
 cleanup() { rm -rf "$lockdir"; }
diff --git a/zfs-replicate b/zfs-replicate
index 128d051..57680fc 100755
--- a/zfs-replicate
+++ b/zfs-replicate
@@ -4,13 +4,17 @@
 remote=$1
 remote_fs=$2
 remote_pool=${2%%/*}
-
-# Set this variable to '1' to use the legacy, non-marker, snapshot diff, replicate script logic
-use_legacy_replicate=0
+remote_lockdir="/tmp/zfs-admin-lock"
 
 # Set the name of the local pool used to store the backup of the remote
 local_pool=backups
 
+# Set the email address to send notification to
+mailto=alan@pippins.net
+
+# The ssh connection doesn't find zfs without this.
+zfs=/usr/sbin/zfs
+
 # Make sure we have valid arguments
 if [[ -z "$remote" ]] || [[ -z "$remote_fs" ]]; then
   echo "Usage: $0 <hostname> <zfs filesystem>"
@@ -18,20 +22,26 @@ if [[ -z "$remote" ]] || [[ -z "$remote_fs" ]]; then
 fi
 
 # Make sure the local pool and local receiving filesystem exist, or print some errors
-if ! zpool list -H "$local_pool" >/dev/null 2>&1; then
+zpool list -H "$local_pool" >/dev/null 2>&1
+if [ $? != 0 ]; then
   echo >&2 "-E- The local pool, '$local_pool' doesn't seem to exist."
   exit 1
 fi
-if ! zfs list "$local_pool/$remote_pool" >/dev/null 2>&1; then
-  echo >&2 "-E- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist."
-  echo >&2 "    You will need to create this filesystem before this script can replicate your data."
-  echo >&2 "    You can create this filsystem by executing this command: 'zfs create $local_pool/$remote_pool'"
-  exit 1
+zfs list "$local_pool/$remote_pool" >/dev/null 2>&1
+if [ $? != 0 ]; then
+  echo >&2 "-I- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist."
+  echo >&2 "    Creating the local filesystem to receive the remote pool into: $local_pool/$remote_pool"
+  $zfs create $local_pool/$remote_pool
+  if [ $? != 0 ]; then
+    echo "-E- remote $zfs create command failed"
+    exit 1
+  fi
 fi
 
 # Obtain the zpool guid for the local pool
 local_pool_guid=`zpool get guid $local_pool 2>&1 | grep $local_pool | awk '{ print $3 }'`
-if ! zpool get guid $local_pool > /dev/null 2>&1; then
+zpool get guid $local_pool > /dev/null 2>&1
+if [ $? != 0 ]; then
   echo >&2 "-E- Unable to extract the guid for the local pool: $local_pool"
   exit 1
 fi
@@ -39,20 +49,46 @@ fi
 # Turn on shell verbosity
 set -x
 
+# Create the remote lockdir before continuing with the replicate
+# Retry creating the lock, sleeping a random interval between attempts
+maxsleeptime=60
+maxattempts=100
+attempts=0
+while true; do
+  ssh $remote mkdir "$remote_lockdir" >/dev/null 2>&1
+  if [ $? != 0 ]; then
+    # Another zfs admin tool is running.
+    # Wait a random amount of time and try again
+    ransleep=$(($RANDOM % $maxsleeptime))
+    sleep $ransleep
+    ((attempts=attempts+1))
+  else 
+    # No other zfs admin tool is running, so we can proceed.
+    break
+  fi
+  if [[ $attempts -gt $maxattempts ]]; then
+    # We've exceeded our maximum while loop count
+    echo "-W- The zfs filesystem has been locked down. Skipping replicate operation."
+    ssh $remote ls -ld $remote_lockdir | /usr/bin/mailx -s "zfs-replicate-all unable to obtain zfs admin lock" $mailto
+    exit 1
+  fi
+done
+
+# Declare a cleanup() method to remove the remote lockdir
+cleanup() { ssh $remote rm -rf "$remote_lockdir"; }
+trap cleanup EXIT
+
 # Setup our backup marker names
 current_backup_marker=${remote_fs}@current-backup-${local_pool_guid}
 previous_backup_marker=${remote_fs}@previous-backup-${local_pool_guid}
 
-# The ssh connection doesn't find zfs without this.
-zfs=/usr/sbin/zfs
-
 # List the snapshots on the remote machine.
 remote_list=$(mktemp /tmp/replicate.XXXXXX)
 ssh $remote \
     $zfs list -H -t snapshot |
     grep ^${remote_fs}@ |
     awk '{print$1}' > $remote_list
-if [[ $? != 0 ]]; then
+if [ $? != 0 ]; then
   echo "-E- remote $zfs list command failed"
   exit 1
 fi
@@ -61,71 +97,61 @@ fi
 local_list=$(mktemp /tmp/replicate.XXXXXX)
 $zfs list -H -t snapshot |
     grep ^${local_pool}/${remote_fs}@ |
-    awk '{gsub(/^${local_pool}./,"",$1); print$1}' > $local_list
-if [[ $? != 0 ]]; then
+    awk "{gsub(/^$local_pool./,\"\",\$1); print\$1}" > $local_list
+if [ $? != 0 ]; then
   echo "-E- local $zfs list command failed"
   exit 1
 fi
 
-if [ $use_legacy_replicate == 0 ]; then
-  # Destroy the current backup marker snapshot on the remote system if it exists
-  grep -q ${current_backup_marker} $remote_list
-  if [ $? == 0 ]; then
-    ssh $remote $zfs destroy ${current_backup_marker} 
-    if [[ $? != 0 ]]; then
-      echo "-E- remote $zfs destroy command failed"
-      exit 1
-    fi
-  fi
-  # Create the current backup marker snapshot on the remote system
-  ssh $remote $zfs snapshot ${current_backup_marker}
-  if [[ $? != 0 ]]; then
-    echo "-E- remote $zfs snapshot command failed"
+# Destroy the current backup marker snapshot on the remote system if it exists
+grep -q ${current_backup_marker} $remote_list
+if [ $? == 0 ]; then
+  ssh $remote $zfs destroy ${current_backup_marker} 
+  if [ $? != 0 ]; then
+    echo "-E- remote $zfs destroy command failed"
     exit 1
   fi
+fi
 
-  # Check to see if the previous backup marker exists in the remote snapshot list.
-  # Check to see if the previous backup marker exists in the local snapshot list.
-  # If the previous backup markers exists, perform an incremental replicate.
-  # Otherwise, perform a full replicate.
-  grep -q ${previous_backup_marker} $remote_list
-  full=$?
-  grep -q ${previous_backup_marker} $local_list
-  full=$(($full || $?))
-
-  if [[ $full == 0 ]]; then
-    ssh $remote $zfs send -R -I${previous_backup_marker} ${current_backup_marker} | 
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
-    if [[ $? != 0 ]]; then
-      echo "-E- remote incremental $zfs send command failed"
-      exit 1
-    fi
-  else
-    ssh $remote $zfs send -R ${current_backup_marker} |
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
-    if [[ $? != 0 ]]; then
-      echo "-E- remote full $zfs send command failed"
-      exit 1
-    fi
-  fi
- 
-  # destroy the previous backup markers now that we've replicated past them
-  $zfs destroy ${local_pool}/${previous_backup_marker} > /dev/null 2>&1
-  ssh $remote $zfs destroy ${previous_backup_marker} > /dev/null 2>&1
-
-  # Rename the current backup marker to be the previous backup marker
-  $zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker}
-  if [[ $? != 0 ]]; then
-    echo "-E- local $zfs rename command failed"
+# Create the current backup marker snapshot on the remote system
+ssh $remote $zfs snapshot ${current_backup_marker}
+if [ $? != 0 ]; then
+  echo "-E- remote $zfs snapshot command failed"
+  exit 1
+fi
+
+# Check to see if the previous backup marker exists in the remote snapshot list.
+# Check to see if the previous backup marker exists in the local snapshot list.
+# If the previous backup markers exist, perform an incremental replicate. Else:
+# 1) check to see if a common snapshot exists, and perform an incremental replicate.
+# 2) if no common snapshot exists, destroy the local filesystem, and perform a full replicate.
+grep -q ${previous_backup_marker} $remote_list
+no_markers=$?
+grep -q ${previous_backup_marker} $local_list
+no_markers=$(($no_markers || $?))
+
+if [ $no_markers == 0 ]; then
+  # We found backup markers, incrementally send the new snaps
+
+  # First, rollback the local pool to the previous backup marker in case the previous
+  # backup was interrupted for some reason. If we don't do this, the zfs send -R command
+  # below may complain about snaps already existing as it tries to resend from the 
+  # previous backup marker again from a previously interrupted replicate.
+  $zfs rollback -r ${local_pool}/${previous_backup_marker} 
+  if [ $? != 0 ]; then
+    echo "-E- remote incremental $zfs rollback command failed"
     exit 1
   fi
-  ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker}
-  if [[ $? != 0 ]]; then
-    echo "-E- remote $zfs rename command failed"
+  # Now it should be safe to send the snaps
+  ssh $remote $zfs send -R -I${previous_backup_marker} ${current_backup_marker} | 
+      $zfs receive -vF -d ${local_pool}/${remote_pool}
+  if [ $? != 0 ]; then
+    echo "-E- remote incremental $zfs send command failed"
     exit 1
   fi
-   
 else
+  # We didn't find any backup markers, next check to see if we have a common snapshot.
+
   # See what the most recent snapshot on the remote end is.
   latest=$(tail -n 1 $remote_list)
 
@@ -136,17 +162,53 @@ else
   echo bogus.local  >> $local_list
   common=$(diff -u $remote_list $local_list | grep '^ ' | tail -n 1)
 
-  if [ -n "$common" ]; then 
-    # We found a common snapshot
-    ssh $remote $zfs send -R -I${common/*@/@} $latest |
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
+  if [[ -n "$common" ]]; then
+    # We found a common snapshot, incrementally send the new snaps
+    ssh $remote $zfs send -R -I${common/*@/@} ${current_backup_marker} |
+        $zfs receive -vF -d ${local_pool}/${remote_pool}
+    if [ $? != 0 ]; then
+      echo "-E- remote incremental $zfs send command failed"
+      exit 1
+    fi
   else
-    # We did not find a common snapshot, so send the entire filesystem
-    ssh $remote $zfs send -R $latest |
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
+    # We did not find any markers or a common snapshot
+    # At this point, we'll have to send the entire filesystem
+    # Destroy the local filesystem if it exists before receiving the full replicate
+    zfs list ${local_pool}/${remote_fs} > /dev/null 2>&1
+    if [ $? == 0 ]; then
+      zfs destroy -r ${local_pool}/${remote_fs}
+      if [ $? != 0 ]; then
+        echo "-E- remote full $zfs destroy command failed"
+        exit 1
+      fi
+    fi
+    # Send the full filesystem
+    ssh $remote $zfs send -R ${current_backup_marker} |
+        $zfs receive -vF -d ${local_pool}/${remote_pool}
+    if [ $? != 0 ]; then
+      echo "-E- remote full $zfs send command failed"
+      exit 1
+    fi
   fi
 fi
-
+ 
+# destroy the previous backup markers now that we've replicated past them
+# don't check the return codes here because these may not exist, and that is ok
+$zfs destroy ${local_pool}/${previous_backup_marker} > /dev/null 2>&1
+ssh $remote $zfs destroy ${previous_backup_marker} > /dev/null 2>&1
+
+# Rename the current backup marker to be the previous backup marker
+$zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker}
+if [ $? != 0 ]; then
+  echo "-E- local $zfs rename command failed"
+  exit 1
+fi
+ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker}
+if [ $? != 0 ]; then
+  echo "-E- remote $zfs rename command failed"
+  exit 1
+fi
+   
 # Remove tmp files
-#rm -f $local_list $remote_list
+rm -f $local_list $remote_list
 
-- 
2.34.1