zfs-replicate

   1 #/bin/bash
   2
   3 # Author: Carl Baldwin & Alan Pippin
   4 # Description: This script replicates a remote zfs filesystem to a local zfs pool.
   5 #              This script will keep all snapshots in sync, removing the ones
   6 #              that have been deleted since the last replicate was performed.
   7 #              This script will only send the new, or missing, snapshots since
   8 #              the last replicate was performed.
   9 # Usage: replicate <hostname> <zfs filesystem>
  10
  11 # source our configuration
  12 config="${0%/*}/zfs-scripts.conf"
  13 [ -e "${config}.dist" ] && . ${config}.dist
  14 [ -e "${config}" ] && . ${config}
  15
  16 # command line arg parsing
  17 remote=$1
  18 remote_fs=$2
  19 remote_pool=${2%%/*}
  20 hostname=`hostname`
  21
  22 # Setup our cleanup and exit trap
  23 cleanup() {
  24   if [[ -e "$local_list" ]]; then
  25     rm -f $local_list
  26   fi
  27   if [[ -e "$remote_list" ]]; then
  28     rm -f $remote_list
  29   fi
  30   if [[ -n "$remote" ]]; then
  31     ssh $remote ls -d "$lockdir" > /dev/null 2>&1
  32     if [[ $? == 0 ]]; then
  33       ssh $remote rm -rf "$lockdir"
  34     fi
  35   fi
  36 }
  37 fatal_and_exit() {
  38   echo -e 2>&1 "$1"
  39   if [[ -n "$2" ]]; then
  40     echo -e "$1" | $mailx -s "zfs replicate on $hostname failed" "$2"
  41   fi
  42   exit 1
  43 }
  44 trap fatal_and_exit INT
  45 trap cleanup EXIT
  46
  47 # Make sure we have valid arguments
  48 if [[ -z "$remote" ]] || [[ -z "$remote_fs" ]]; then
  49   fatal_and_exit "Usage: $0 <hostname> <zfs filesystem>"
  50 fi
  51
  52 # Make sure the local pool and local receiving filesystem exist, or print some errors
  53 zpool list -H "$local_pool" >/dev/null 2>&1
  54 if [ $? != 0 ]; then
  55   fatal_and_exit "-E- The local pool, '$local_pool' doesn't seem to exist." $mailto
  56 fi
  57 zfs list "$local_pool/$remote_pool" >/dev/null 2>&1
  58 if [ $? != 0 ]; then
  59   echo >&2 "-I- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist."
  60   echo >&2 "    Creating the local filesystem to receive the remote pool into: $local_pool/$remote_pool"
  61   $zfs create $local_pool/$remote_pool
  62   if [ $? != 0 ]; then
  63     fatal_and_exit "-E- remote $zfs create command failed" $mailto
  64   fi
  65 fi
  66
  67 # Obtain the zpool guid for the local pool
  68 local_pool_guid=`zpool get guid $local_pool 2>&1 | grep $local_pool | awk '{ print $3 }'`
  69 zpool get guid $local_pool > /dev/null 2>&1
  70 if [ $? != 0 ]; then
  71   fatal_and_exit "-E- Unable to extract the guid for the local pool: $local_pool" $mailto
  72 fi
  73
  74 # Turn on shell verbosity
  75 set -x
  76
  77 # Create the remote lockdir before continuing with the replicate
  78 # Spinlock on creating the lock
  79 maxsleeptime=60
  80 maxattempts=100
  81 attempts=0
  82 while true; do
  83   ssh $remote mkdir "$lockdir" >/dev/null 2>&1
  84   if [ $? != 0 ]; then
  85     # Another zfs admin tool is running.
  86     # Wait a random amount of time and try again
  87     ransleep=$(($RANDOM % $maxsleeptime))
  88     sleep $ransleep
  89     ((attempts=attempts+1))
  90   else
  91     # No other zfs admin tool is running, we can now.
  92     break
  93   fi
  94   if [[ $attempts -gt $maxattempts ]]; then
  95     # We've exceeded our maximum while loop count
  96     echo "-E- The zfs filesystem has been locked down. Skipping replicate operation."
  97     fail_msg=`ssh $remote ls -ld $lockdir 2>&1`
  98     fatal_and_exit "zfs-replicate-all unable to obtain zfs admin lock:\n$fail_msg" $mailto
  99   fi
 100 done
 101
 102 # Setup our backup marker names
 103 current_backup_marker=${remote_fs}@current-backup-${local_pool_guid}
 104 previous_backup_marker=${remote_fs}@previous-backup-${local_pool_guid}
 105
 106 # List the snapshots on the remote machine.
 107 remote_list=$(mktemp /tmp/replicate.XXXXXX)
 108 ssh $remote \
 109     $zfs list -H -t snapshot |
 110     grep ^${remote_fs}@ |
 111     awk '{print$1}' > $remote_list
 112 if [ $? != 0 ]; then
 113   fatal_and_exit "-E- remote $zfs list command failed" $mailto
 114 fi
 115
 116 # List the snapshots on the local machine.
 117 # Don't list the current backup marker if it exists on the local side.
 118 # If you do, it can mess up the common finding algorithm below.
 119 local_list=$(mktemp /tmp/replicate.XXXXXX)
 120 $zfs list -H -t snapshot |
 121     grep ^${local_pool}/${remote_fs}@ |
 122     grep -v ^${local_pool}/${current_backup_marker} |
 123     awk "{gsub(/^$local_pool./,\"\",\$1); print\$1}" > $local_list
 124 if [ $? != 0 ]; then
 125   fatal_and_exit "-E- local $zfs list command failed" $mailto
 126 fi
 127
 128 # Destroy the current backup marker snapshot on the remote system if it exists
 129 grep -q ${current_backup_marker} $remote_list
 130 if [ $? == 0 ]; then
 131   ssh $remote $zfs destroy ${current_backup_marker}
 132   if [ $? != 0 ]; then
 133     fatal_and_exit "-E- remote $zfs destroy command failed" $mailto
 134   fi
 135 fi
 136
 137 # Create the current backup marker snapshot on the remote system
 138 ssh $remote $zfs snapshot ${current_backup_marker}
 139 if [ $? != 0 ]; then
 140   fatal_and_exit "-E- remote $zfs snapshot command failed" $mailto
 141 fi
 142
 143 # Check to see if the previous backup marker exists in the remote snapshot list.
 144 # Check to see if the previous backup marker exists in the local snapshot list.
 145 # If the previous backup markers exists, perform an incremental replicate. Else:
 146 # 1) check to see if a common snapshot exists, and perform an incremental replicate.
 147 # 2) if no common snapshot exists, destroy the local filesystem, and perform a full replicate.
 148 grep -q ${previous_backup_marker} $remote_list
 149 no_markers=$?
 150 grep -q ${previous_backup_marker} $local_list
 151 no_markers=$(($no_markers || $?))
 152
 153 if [ $no_markers == 0 ]; then
 154   # We found backup markers, incrementally send the new snaps
 155
 156   # First, rollback the local pool to the previous backup marker in case the previous
 157   # backup was interrupted for some reason. If we don't do this, the zfs send -R command
 158   # below may complain about snaps already existing as it tries to resend from the
 159   # previous backup marker again from a previously interrupted replicate.
 160   $zfs rollback -r ${local_pool}/${previous_backup_marker}
 161   if [ $? != 0 ]; then
 162     fatal_and_exit "-E- remote incremental $zfs rollback command failed" $mailto
 163   fi
 164   # Now it should be safe to send the snaps
 165   ssh $remote $zfs send -R -I${previous_backup_marker} ${current_backup_marker} |
 166       $zfs receive -vF -d ${local_pool}/${remote_pool}
 167   if [ $? != 0 ]; then
 168     fatal_and_exit "-E- remote incremental $zfs send command failed" $mailto
 169   fi
 170 else
 171   # We didn't find any backup markers, next check to see if we have a common snapshot.
 172
 173   # See what the most recent snapshot on the remote end is.
 174   latest=$(tail -n 1 $remote_list)
 175
 176   # I did this to make sure that diff would always display the most recent common
 177   # Since we're keying off the context of the diff, we need to ensure we will get context
 178   # by injecting a known difference in case no others exist in the lists.
 179   echo bogus.remote >> $remote_list
 180   echo bogus.local  >> $local_list
 181   common=$(diff -u $remote_list $local_list | grep '^ ' | tail -n 1)
 182
 183   if [[ -n "$common" ]]; then
 184     # We found a common snapshot, incrementally send the new snaps
 185     ssh $remote $zfs send -R -I${common/*@/@} ${current_backup_marker} |
 186         $zfs receive -vF -d ${local_pool}/${remote_pool}
 187     if [ $? != 0 ]; then
 188       fatal_and_exit "-E- remote incremental $zfs send command failed" $mailto
 189     fi
 190   else
 191     # We did not find any markers or a common snapshot
 192     # At this point, we'll have to send the entire filesystem
 193     # Destroy the local filesystem if it exists before receving the full replicate
 194     zfs list ${local_pool}/${remote_fs} > /dev/null 2>&1
 195     if [ $? == 0 ]; then
 196       if [[ $destroy_local_filesystem_on_full_replicate == 1 ]]; then
 197         $zfs destroy -r ${local_pool}/${remote_fs}
 198         if [ $? != 0 ]; then
 199           fatal_and_exit "-E- remote full $zfs destroy command failed" $mailto
 200         fi
 201       else
 202         echo "-W- We need to destroy a local filesystem before receiving a full stream."
 203         echo "    However, since the option is set to prevent this, skipping replicate operation."
 204         fatal_and_exit "unable to destroy local filesystem:\n$zfs destroy -r ${local_pool}/${remote_fs} not able to run" $mailto
 205       fi
 206     fi
 207     # Send the full filesystem
 208     ssh $remote $zfs send -R ${current_backup_marker} |
 209         $zfs receive -vF -d ${local_pool}/${remote_pool}
 210     if [ $? != 0 ]; then
 211       fatal_and_exit "-E- remote full $zfs send command failed" $mailto
 212     fi
 213   fi
 214 fi
 215
 216 # destroy the previous backup markers now that we've replicated past them
 217 # don't check the return codes here because these may not exist, and that is ok
 218 $zfs destroy ${local_pool}/${previous_backup_marker} > /dev/null 2>&1
 219 ssh $remote $zfs destroy ${previous_backup_marker} > /dev/null 2>&1
 220
 221 # Rename the current backup marker to be the previous backup marker
 222 $zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker}
 223 if [ $? != 0 ]; then
 224   fatal_and_exit "-E- local $zfs rename command failed" $mailto
 225 fi
 226 ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker}
 227 if [ $? != 0 ]; then
 228   fatal_and_exit "-E- remote $zfs rename command failed" $mailto
 229 fi
 230