#!/bin/bash
#
# zfs-replicate
#
# Usage: replicate <hostname> <zfs filesystem>
#
# Positional arguments:
#   $1 - remote host, reached over ssh
#   $2 - zfs filesystem on the remote host; everything before its first '/'
#        is taken as the remote pool name
remote=$1
remote_fs=$2
remote_pool=${2%%/*}

# Directory created on the remote host (via mkdir) to act as a lock.
remote_lockdir="/tmp/zfs-admin-lock"

# Name of this host, used in the subject line of failure e-mails.
hostname=$(hostname)

# Set the name of the local pool used to store the backup of the remote
local_pool=backups

# Set the email address to send notifications to
mailto=root@pippins.net
mailx=/usr/bin/mailx

# When this variable is set to 1, an existing local filesystem will be
# destroyed before receiving a full stream into it from the remote source.
destroy_local_filesystem_on_full_replicate=0

# The ssh connection doesn't find zfs without this.
zfs=/usr/sbin/zfs

  #######################################
  # Exit-trap handler: removes the temporary snapshot list files and the
  # lock directory on the remote host.
  # Globals:   local_list, remote_list, remote, remote_lockdir (read)
  # Arguments: none
  # NOTE(review): the remote lock directory is removed whenever it exists;
  # this function has no way to tell whether this process created it.
  #######################################
  cleanup() {
    # Quoted and "--"-protected so odd file names cannot be word-split,
    # globbed or mistaken for options.
    if [[ -e "$local_list" ]]; then
      rm -f -- "$local_list"
    fi
    if [[ -e "$remote_list" ]]; then
      rm -f -- "$remote_list"
    fi
    if [[ -n "$remote" ]]; then
      # Only issue the remote rm when the lock directory is actually there.
      if ssh "$remote" ls -d "$remote_lockdir" >/dev/null 2>&1; then
        ssh "$remote" rm -rf "$remote_lockdir"
      fi
    fi
  }
  #######################################
  # Report a fatal error and terminate the script with status 1.
  # Globals:   mailx, hostname (read)
  # Arguments: $1 - message; backslash escapes such as \n are interpreted
  #            $2 - optional e-mail address; when non-empty the message is
  #                 also mailed to it
  # Outputs:   the message on stderr
  #######################################
  fatal_and_exit() {
    # ">&2" sends the message to stderr; the former "2>&1" pointed stderr at
    # stdout instead, so the diagnostic ended up on stdout.
    echo -e "$1" >&2
    if [[ -n "$2" ]]; then
      echo -e "$1" | "$mailx" -s "zfs replicate on $hostname failed" "$2"
    fi
    exit 1
  }
  # Install the handlers: cleanup runs on every exit path, and an interrupt
  # is turned into a fatal exit (called without a message or address).
  # NOTE(review): from here on cleanup fires on any exit, including exits
  # that happen before the remote lock has been taken.
  trap cleanup EXIT
  trap fatal_and_exit INT

  # Both positional arguments are mandatory.
  if [[ -z "$remote" || -z "$remote_fs" ]]; then
    fatal_and_exit "Usage: $0 <hostname> <zfs filesystem>"
  fi

  # Make sure the local pool exists; without it there is nowhere to receive to.
  if ! zpool list -H "$local_pool" >/dev/null 2>&1; then
    fatal_and_exit "-E- The local pool, '$local_pool' doesn't seem to exist." "$mailto"
  fi

  # Make sure the local filesystem that mirrors the remote pool exists,
  # creating it when it is missing. The configured $zfs path is used for the
  # existence probe as well, so probe and create run the same binary.
  if ! "$zfs" list "$local_pool/$remote_pool" >/dev/null 2>&1; then
    echo >&2 "-I- The local filesystem for the remote pool, '$local_pool/$remote_pool' doesn't seem to exist."
    echo >&2 "    Creating the local filesystem to receive the remote pool into: $local_pool/$remote_pool"
    if ! "$zfs" create "$local_pool/$remote_pool"; then
      # This create runs on the local host, so the message says "local".
      fatal_and_exit "-E- local $zfs create command failed" "$mailto"
    fi
  fi

  # Obtain the zpool guid for the local pool. It is embedded in the backup
  # marker snapshot names built from it later in the script.
  # zpool is run once; its exit status and its output are both checked.
  if ! pool_guid_output=$(zpool get guid "$local_pool" 2>&1); then
    fatal_and_exit "-E- Unable to extract the guid for the local pool: $local_pool" "$mailto"
  fi
  # Pick the VALUE column of the row whose NAME is exactly the pool and whose
  # PROPERTY is "guid", rather than any line that merely contains the name.
  local_pool_guid=$(awk -v pool="$local_pool" \
    '$1 == pool && $2 == "guid" { print $3 }' <<<"$pool_guid_output")
  if [[ -z "$local_pool_guid" ]]; then
    fatal_and_exit "-E- Unable to extract the guid for the local pool: $local_pool" "$mailto"
  fi

  # Turn on shell verbosity
  set -x

  # Create the remote lockdir before continuing with the replicate.
  # mkdir fails when the directory already exists, so a successful mkdir
  # means the lock is ours. Spin on it with random back-off.
  maxsleeptime=60
  maxattempts=100
  attempts=0

  # While we do not hold the lock, the EXIT handler must not run: cleanup
  # removes the remote lock directory whenever it exists, which would delete
  # a lock owned by another tool if we gave up or were interrupted here.
  # No temporary files exist at this point, so nothing else is skipped.
  trap - EXIT
  while true; do
    if ssh "$remote" mkdir "$remote_lockdir" >/dev/null 2>&1; then
      # No other zfs admin tool is running; the lock is ours now, so re-arm
      # cleanup to release it on exit.
      trap cleanup EXIT
      break
    fi
    # Another zfs admin tool is running (or the ssh command failed).
    # Wait a random amount of time and try again.
    ransleep=$((RANDOM % maxsleeptime))
    sleep "$ransleep"
    ((attempts = attempts + 1))
    if [[ $attempts -gt $maxattempts ]]; then
      # We've exceeded our maximum while loop count
      echo "-E- The zfs filesystem has been locked down. Skipping replicate operation."
      fail_msg=$(ssh "$remote" ls -ld "$remote_lockdir" 2>&1)
      fatal_and_exit "zfs-replicate-all unable to obtain zfs admin lock:\n$fail_msg" "$mailto"
    fi
  done

 # Setup our backup marker names. The local pool guid is part of each name.
 current_backup_marker=${remote_fs}@current-backup-${local_pool_guid}
 previous_backup_marker=${remote_fs}@previous-backup-${local_pool_guid}

 # List the snapshots of the remote filesystem, one name per line.
 # The listing is captured first so that a failing ssh/zfs is detected;
 # checking $? after a pipeline would only report the last stage's status.
 if ! remote_list=$(mktemp /tmp/replicate.XXXXXX); then
   fatal_and_exit "-E- unable to create a temporary file for the remote snapshot list" "$mailto"
 fi
 if ! remote_snapshots=$(ssh "$remote" "$zfs" list -H -t snapshot); then
   fatal_and_exit "-E- remote $zfs list command failed" "$mailto"
 fi
 # index() == 1 is a literal prefix test, so characters such as '.' in the
 # filesystem name are not treated as regular-expression wildcards.
 awk -v prefix="${remote_fs}@" \
   'index($1, prefix) == 1 { print $1 }' \
   <<<"$remote_snapshots" >"$remote_list"

 # List the snapshots on the local machine, with the leading "<local_pool>/"
 # stripped so the names line up with the remote list.
 # Don't list the current backup marker if it exists on the local side.
 # If you do, it can mess up the common finding algorithm below.
 if ! local_list=$(mktemp /tmp/replicate.XXXXXX); then
   fatal_and_exit "-E- unable to create a temporary file for the local snapshot list" "$mailto"
 fi
 if ! local_snapshots=$("$zfs" list -H -t snapshot); then
   fatal_and_exit "-E- local $zfs list command failed" "$mailto"
 fi
 awk -v pool_prefix="${local_pool}/" \
   -v fs_prefix="${local_pool}/${remote_fs}@" \
   -v skip_prefix="${local_pool}/${current_backup_marker}" \
   'index($1, fs_prefix) == 1 && index($1, skip_prefix) != 1 {
      print substr($1, length(pool_prefix) + 1)
    }' <<<"$local_snapshots" >"$local_list"

 # Destroy the current backup marker snapshot on the remote system if it
 # exists. -x/-F match the marker as a whole literal line, so a snapshot
 # whose name merely contains the marker text (or matches it as a regular
 # expression) is not mistaken for it.
 if grep -qxF -- "${current_backup_marker}" "$remote_list"; then
   if ! ssh "$remote" "$zfs" destroy "${current_backup_marker}"; then
     fatal_and_exit "-E- remote $zfs destroy command failed" "$mailto"
   fi
 fi

 # Create the current backup marker snapshot on the remote system
 if ! ssh "$remote" "$zfs" snapshot "${current_backup_marker}"; then
   fatal_and_exit "-E- remote $zfs snapshot command failed" "$mailto"
 fi

 #######################################
 # Stream a remote "zfs send" into the local "zfs receive" and abort the
 # script when either side of the pipeline fails.
 # Globals:   remote, zfs, local_pool, remote_pool, mailto (read)
 # Arguments: $1  - replicate type used in the error message
 #                  ("incremental" or "full")
 #            $2+ - arguments handed to the remote "zfs send"
 #######################################
 send_and_receive() {
   local kind=$1
   shift
   local -a stage_status
   ssh "$remote" "$zfs" send "$@" |
     "$zfs" receive -vF -d "${local_pool}/${remote_pool}"
   # PIPESTATUS must be read immediately; [0] is the ssh/send side and
   # [1] the local receive side.
   stage_status=("${PIPESTATUS[@]}")
   if ((stage_status[0] != 0 || stage_status[1] != 0)); then
     fatal_and_exit "-E- remote $kind $zfs send command failed" "$mailto"
   fi
 }

 # Choose the replicate strategy:
 # - If the previous backup marker exists in BOTH the remote and the local
 #   snapshot list, perform an incremental replicate from it. Else:
 # 1) if a common snapshot exists, perform an incremental replicate from it.
 # 2) if no common snapshot exists, destroy the local filesystem (only when
 #    the option allows it), and perform a full replicate.
 # The marker is matched as a whole literal line (-x -F).
 if grep -qxF -- "${previous_backup_marker}" "$remote_list" &&
   grep -qxF -- "${previous_backup_marker}" "$local_list"; then
   # We found backup markers, incrementally send the new snaps

   # First, rollback the local pool to the previous backup marker in case the
   # previous backup was interrupted for some reason. If we don't do this, the
   # zfs send -R command below may complain about snaps already existing as it
   # tries to resend from the previous backup marker again.
   if ! "$zfs" rollback -r "${local_pool}/${previous_backup_marker}"; then
     fatal_and_exit "-E- remote incremental $zfs rollback command failed" "$mailto"
   fi
   # Now it should be safe to send the snaps
   send_and_receive incremental -R "-I${previous_backup_marker}" "${current_backup_marker}"
 else
   # We didn't find any backup markers, next check to see if we have a common
   # snapshot.

   # In a unified diff, lines starting with a space are context lines, i.e.
   # lines present in both lists; the last one is taken as the common
   # snapshot. Context is only printed around a difference, so a known
   # difference is appended to each list in case no others exist.
   # NOTE(review): "last" means most recent only if the lists are ordered
   # oldest to newest - confirm against the zfs list ordering in use.
   echo bogus.remote >>"$remote_list"
   echo bogus.local >>"$local_list"
   common=$(diff -u "$remote_list" "$local_list" | grep '^ ' | tail -n 1)

   if [[ -n "$common" ]]; then
     # We found a common snapshot, incrementally send the new snaps.
     # ${common/*@/@} reduces the diff line to just "@<snapshot name>".
     send_and_receive incremental -R "-I${common/*@/@}" "${current_backup_marker}"
   else
     # We did not find any markers or a common snapshot.
     # At this point, we'll have to send the entire filesystem.
     # Destroy the local filesystem if it exists before receiving the full
     # replicate.
     if "$zfs" list "${local_pool}/${remote_fs}" >/dev/null 2>&1; then
       if [[ $destroy_local_filesystem_on_full_replicate == 1 ]]; then
         if ! "$zfs" destroy -r "${local_pool}/${remote_fs}"; then
           fatal_and_exit "-E- remote full $zfs destroy command failed" "$mailto"
         fi
       else
         echo "-W- We need to destroy a local filesystem before receiving a full stream."
         echo "    However, since the option is set to prevent this, skipping replicate operation."
         fatal_and_exit "unable to destroy local filesystem:\n$zfs destroy -r ${local_pool}/${remote_fs} not able to run" "$mailto"
       fi
     fi
     # Send the full filesystem
     send_and_receive full -R "${current_backup_marker}"
   fi
 fi

 # The replicate has moved past the previous backup markers, so drop them on
 # both sides. Failures are deliberately ignored: the markers may not exist.
 $zfs destroy ${local_pool}/${previous_backup_marker} >/dev/null 2>&1
 ssh $remote $zfs destroy ${previous_backup_marker} >/dev/null 2>&1

 # Promote the current backup marker to be the previous backup marker,
 # locally first and then on the remote host; either failure is fatal.
 $zfs rename ${local_pool}/${current_backup_marker} ${local_pool}/${previous_backup_marker} ||
   fatal_and_exit "-E- local $zfs rename command failed" $mailto
 ssh $remote $zfs rename ${current_backup_marker} ${previous_backup_marker} ||
   fatal_and_exit "-E- remote $zfs rename command failed" $mailto
