From: Alan J. Pippin <>
Date: Wed, 14 Jan 2009 07:17:39 +0000 (-0700)
Subject: Added code to fallback to lists of snapshots to find common point

Added code to fall back to lists of snapshots to find a common point
if the backup markers can't be found. Fixed various bugs.

diff --git a/zfs-autosnap b/zfs-autosnap
index cfe80aa..da90906 100755
--- a/zfs-autosnap
+++ b/zfs-autosnap
@@ -50,7 +50,7 @@ filesystem=$1
 pool=`echo "$filesystem" | awk -F '/' '{ print $1 }'`
 if [ -z "$filesystem" ] || [ -z "$mountpoint" ] || [ -z "$numsnapshots" ] || [ -z "$maxagedays" ]; then
@@ -111,8 +111,8 @@ snapshot "${filesystem}@${datetime}"
 minutes=$(echo $datetime | datetime_to_minutes)
 if ! mkdir "$lockdir" >/dev/null 2>&1; then
+  echo "-W- The zfs filesystem has been locked down. Skipping snapshot cleanup."
   exit 0
 cleanup() { rm -rf "$lockdir"; }
diff --git a/zfs-replicate b/zfs-replicate
index 128d051..57680fc 100755
--- a/zfs-replicate
+++ b/zfs-replicate
@@ -4,13 +4,17 @@
-# Set this variable to '1' to use the legacy, non-marker, snapshot diff, replicate script logic
 # Set the name of the local pool used to store the backup of the remote
+# Set the email address to send notification to
+# The ssh connection doesn't find zfs without this.
 # Make sure we have valid arguments
 if [[ -z "$remote" ]] || [[ -z "$remote_fs" ]]; then
   echo "Usage: $0 <hostname> <zfs filesystem>"
@@ -18,20 +22,26 @@ if [[ -z "$remote" ]] || [[ -z "$remote_fs" ]]; then
 # Make sure the local pool and local receiving filesystem exist, or print some errors
-if ! zpool list -H "$local_pool" >/dev/null 2>&1; then
+zpool list -H "$local_pool" >/dev/null 2>&1
+if [ $? != 0 ]; then
   echo >&2 "-E- The local pool, '$local_pool' doesn't seem to exist."
   exit 1
-if ! zfs list "$local_pool/$remote_pool" >/dev/null 2>&1; then
-  echo >&2 "-E- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist."
-  echo >&2 "    You will need to create this filesystem before this script can replicate your data."
-  echo >&2 "    You can create this filsystem by executing this command: 'zfs create $local_pool/$remote_pool'"
-  exit 1
+zfs list "$local_pool/$remote_pool" >/dev/null 2>&1
+if [ $? != 0 ]; then
+  echo >&2 "-I- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist."
+  echo >&2 "    Creating the local filesystem to receive the remote pool into: $local_pool/$remote_pool"
+  $zfs create $local_pool/$remote_pool
+  if [ $? != 0 ]; then
+    echo "-E- remote $zfs create command failed"
+    exit 1
+  fi
 # Obtain the zpool guid for the local pool
 local_pool_guid=`zpool get guid $local_pool 2>&1 | grep $local_pool | awk '{ print $3 }'`
-if ! zpool get guid $local_pool > /dev/null 2>&1; then
+zpool get guid $local_pool > /dev/null 2>&1
+if [ $? != 0 ]; then
   echo >&2 "-E- Unable to extract the guid for the local pool: $local_pool"
   exit 1
@@ -39,20 +49,46 @@ fi
 # Turn on shell verbosity
 set -x
+# Create the remote lockdir before continuing with the replicate
+# Spinlock on creating the lock
+while true; do
+  ssh $remote mkdir "$remote_lockdir" >/dev/null 2>&1
+  if [ $? != 0 ]; then
+    # Another zfs admin tool is running.
+    # Wait a random amount of time and try again
+    ransleep=$(($RANDOM % $maxsleeptime))
+    sleep $ransleep
+    ((attempts=attempts+1))
+  else 
+    # No other zfs admin tool is running, so we can proceed now.
+    break
+  fi
+  if [[ $attempts -gt $maxattempts ]]; then
+    # We've exceeded our maximum while loop count
+    echo "-W- The zfs filesystem has been locked down. Skipping replicate operation."
+    ssh $remote ls -ld $remote_lockdir | /usr/bin/mailx -s "zfs-replicate-all unable to obtain zfs admin lock" $mailto
+    exit 1
+  fi
+# Declare a cleanup() method to remove the remote lockdir
+cleanup() { ssh $remote rm -rf "$remote_lockdir"; }
+trap cleanup EXIT
 # Setup our backup marker names
-# The ssh connection doesn't find zfs without this.
 # List the snapshots on the remote machine.
 remote_list=$(mktemp /tmp/replicate.XXXXXX)
 ssh $remote \
     $zfs list -H -t snapshot |
     grep ^${remote_fs}@ |
     awk '{print$1}' > $remote_list
-if [[ $? != 0 ]]; then
+if [ $? != 0 ]; then
   echo "-E- remote $zfs list command failed"
   exit 1
@@ -61,71 +97,61 @@ fi
 local_list=$(mktemp /tmp/replicate.XXXXXX)
 $zfs list -H -t snapshot |
     grep ^${local_pool}/${remote_fs}@ |
-    awk '{gsub(/^${local_pool}./,"",$1); print$1}' > $local_list
-if [[ $? != 0 ]]; then
+    awk "{gsub(/^$local_pool./,\"\",\$1); print\$1}" > $local_list
+if [ $? != 0 ]; then
   echo "-E- local $zfs list command failed"
   exit 1
-if [ $use_legacy_replicate == 0 ]; then
-  # Destroy the current backup marker snapshot on the remote system if it exists
-  grep -q ${current_backup_marker} $remote_list
-  if [ $? == 0 ]; then
-    ssh $remote $zfs destroy ${current_backup_marker} 
-    if [[ $? != 0 ]]; then
-      echo "-E- remote $zfs destroy command failed"
-      exit 1
-    fi
-  fi
-  # Create the current backup marker snapshot on the remote system
-  ssh $remote $zfs snapshot ${current_backup_marker}
-  if [[ $? != 0 ]]; then
-    echo "-E- remote $zfs snapshot command failed"
+# Destroy the current backup marker snapshot on the remote system if it exists
+grep -q ${current_backup_marker} $remote_list
+if [ $? == 0 ]; then
+  ssh $remote $zfs destroy ${current_backup_marker} 
+  if [ $? != 0 ]; then
+    echo "-E- remote $zfs destroy command failed"
     exit 1
-  # Check to see if the previous backup marker exists in the remote snapshot list.
-  # Check to see if the previous backup marker exists in the local snapshot list.
-  # If the previous backup markers exists, perform an incremental replicate.
-  # Otherwise, perform a full replicate.
-  grep -q ${previous_backup_marker} $remote_list
-  full=$?
-  grep -q ${previous_backup_marker} $local_list
-  full=$(($full || $?))
-  if [[ $full == 0 ]]; then
-    ssh $remote $zfs send -R -I${previous_backup_marker} ${current_backup_marker} | 
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
-    if [[ $? != 0 ]]; then
-      echo "-E- remote incremental $zfs send command failed"
-      exit 1
-    fi
-  else
-    ssh $remote $zfs send -R ${current_backup_marker} |
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
-    if [[ $? != 0 ]]; then
-      echo "-E- remote full $zfs send command failed"
-      exit 1
-    fi
-  fi
-  # destroy the previous backup markers now that we've replicated past them
-  $zfs destroy ${local_pool}/${previous_backup_marker} > /dev/null 2>&1
-  ssh $remote $zfs destroy ${previous_backup_marker} > /dev/null 2>&1
-  # Rename the current backup marker to be the previous backup marker
-  $zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker}
-  if [[ $? != 0 ]]; then
-    echo "-E- local $zfs rename command failed"
+# Create the current backup marker snapshot on the remote system
+ssh $remote $zfs snapshot ${current_backup_marker}
+if [ $? != 0 ]; then
+  echo "-E- remote $zfs snapshot command failed"
+  exit 1
+# Check to see if the previous backup marker exists in the remote snapshot list.
+# Check to see if the previous backup marker exists in the local snapshot list.
+# If the previous backup markers exist, perform an incremental replicate. Else:
+# 1) check to see if a common snapshot exists, and perform an incremental replicate.
+# 2) if no common snapshot exists, destroy the local filesystem, and perform a full replicate.
+grep -q ${previous_backup_marker} $remote_list
+grep -q ${previous_backup_marker} $local_list
+no_markers=$(($no_markers || $?))
+if [ $no_markers == 0 ]; then
+  # We found backup markers, incrementally send the new snaps
+  # First, rollback the local pool to the previous backup marker in case the previous
+  # backup was interrupted for some reason. If we don't do this, the zfs send -R command
+  # below may complain about snaps already existing as it tries to resend from the 
+  # previous backup marker again from a previously interrupted replicate.
+  $zfs rollback -r ${local_pool}/${previous_backup_marker} 
+  if [ $? != 0 ]; then
+    echo "-E- remote incremental $zfs rollback command failed"
     exit 1
-  ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker}
-  if [[ $? != 0 ]]; then
-    echo "-E- remote $zfs rename command failed"
+  # Now it should be safe to send the snaps
+  ssh $remote $zfs send -R -I${previous_backup_marker} ${current_backup_marker} | 
+      $zfs receive -vF -d ${local_pool}/${remote_pool}
+  if [ $? != 0 ]; then
+    echo "-E- remote incremental $zfs send command failed"
     exit 1
+  # We didn't find any backup markers, next check to see if we have a common snapshot.
   # See what the most recent snapshot on the remote end is.
   latest=$(tail -n 1 $remote_list)
@@ -136,17 +162,53 @@ else
   echo bogus.local  >> $local_list
   common=$(diff -u $remote_list $local_list | grep '^ ' | tail -n 1)
-  if [ -n "$common" ]; then 
-    # We found a common snapshot
-    ssh $remote $zfs send -R -I${common/*@/@} $latest |
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
+  if [[ -n "$common" ]]; then
+    # We found a common snapshot, incrementally send the new snaps
+    ssh $remote $zfs send -R -I${common/*@/@} ${current_backup_marker} |
+        $zfs receive -vF -d ${local_pool}/${remote_pool}
+    if [ $? != 0 ]; then
+      echo "-E- remote incremental $zfs send command failed"
+      exit 1
+    fi
-    # We did not find a common snapshot, so send the entire filesystem
-    ssh $remote $zfs send -R $latest |
-        $zfs receive -vF -d ${local_pool}/${remote_fs%/*}
+    # We did not find any markers or a common snapshot
+    # At this point, we'll have to send the entire filesystem
+    # Destroy the local filesystem if it exists before receiving the full replicate
+    zfs list ${local_pool}/${remote_fs} > /dev/null 2>&1
+    if [ $? == 0 ]; then
+      zfs destroy -r ${local_pool}/${remote_fs}
+      if [ $? != 0 ]; then
+        echo "-E- remote full $zfs destroy command failed"
+        exit 1
+      fi
+    fi
+    # Send the full filesystem
+    ssh $remote $zfs send -R ${current_backup_marker} |
+        $zfs receive -vF -d ${local_pool}/${remote_pool}
+    if [ $? != 0 ]; then
+      echo "-E- remote full $zfs send command failed"
+      exit 1
+    fi
+# destroy the previous backup markers now that we've replicated past them
+# don't check the return codes here because these may not exist, and that is ok
+$zfs destroy ${local_pool}/${previous_backup_marker} > /dev/null 2>&1
+ssh $remote $zfs destroy ${previous_backup_marker} > /dev/null 2>&1
+# Rename the current backup marker to be the previous backup marker
+$zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker}
+if [ $? != 0 ]; then
+  echo "-E- local $zfs rename command failed"
+  exit 1
+ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker}
+if [ $? != 0 ]; then
+  echo "-E- remote $zfs rename command failed"
+  exit 1
 # Remove tmp files
-#rm -f $local_list $remote_list
+rm -f $local_list $remote_list