]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rbd-recover-tool: implement the function framework
authorMin Chen <minchen@ubuntukylin.com>
Wed, 4 Feb 2015 08:09:13 +0000 (16:09 +0800)
committerMin Chen <minchen@ubuntukylin.com>
Wed, 4 Feb 2015 08:09:13 +0000 (16:09 +0800)
add control files & config files

rbd-recover-tool is an offline tool for a dead ceph cluster
to recover rbd images in replicated pools from all osds.
It is a simple but useful policy to keep data secure on ceph.

Signed-off-by: Min Chen <minchen@ubuntukylin.com>
src/rbd_recover_tool/config/mds_host [new file with mode: 0644]
src/rbd_recover_tool/config/mon_host [new file with mode: 0644]
src/rbd_recover_tool/config/osd_host_path [new file with mode: 0644]
src/rbd_recover_tool/osd_job [new file with mode: 0755]
src/rbd_recover_tool/rbd-recover-tool [new file with mode: 0755]

diff --git a/src/rbd_recover_tool/config/mds_host b/src/rbd_recover_tool/config/mds_host
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/rbd_recover_tool/config/mon_host b/src/rbd_recover_tool/config/mon_host
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/rbd_recover_tool/config/osd_host_path b/src/rbd_recover_tool/config/osd_host_path
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/rbd_recover_tool/osd_job b/src/rbd_recover_tool/osd_job
new file mode 100755 (executable)
index 0000000..f3e2ff3
--- /dev/null
@@ -0,0 +1,170 @@
+#!/bin/bash
+# file: osd_job
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+
+function check_ceph_osd()
+{
+  local func="check_ceph_osd"
+  local host=`hostname`
+  # if ceph-osd service is still running, except flush-journal
+  if [ "`ps aux|grep ceph-osd|grep -v flush-journal|grep -v grep`"x != ""x ];then
+    echo "[$host]: $func: ceph-osd is running..., stop it"
+    exit 
+  fi
+}
+
+function cat_pg_epoch()
+{
+  local func="cat_pg_epoch" 
+  init_env_osd $1
+  if [ -e $node_pg_epoch ];then
+    cat $node_pg_epoch
+  fi
+} 
+
+function cat_image_v1()
+{
+  local func="cat_image_v1" 
+  init_env_osd $1
+  if [ -e $image_v1 ];then
+    cat $image_v1
+  fi
+} 
+
+function cat_image_v2()
+{
+  local func="cat_image_v2" 
+  init_env_osd $1
+  if [ -e $image_v2 ];then
+    cat $image_v2
+  fi
+} 
+
+function flush_osd_journal()
+{
+  local func="flush_osd_journal"
+  init_env_osd $1
+  local osd_data_path=$osd_data
+  local osd_journal_path=$osd_data/journal 
+  local whoami_path=$osd_data/whoami
+  local host=`hostname`
+  if [ ! -e $whoami_path ];then
+    echo "[$host]: $func: $whoami_path not exists"
+    exit
+  fi
+  local whoami=`cat $whoami_path`
+  echo "[$host]: $func ..."
+  ceph-osd -i $whoami --osd-data $osd_data_path --osd-journal $osd_journal_path --flush-journal >/dev/null
+  if [ $? -ne 0 ];then
+    echo "[$host]: $func: flush osd journal failed"
+    exit
+  fi
+}
+
+function do_omap_list()
+{
+  local func="do_omap_list"
+  init_env_osd $1
+  local host=`hostname`
+  echo "[$host]: $func ..."
+  get_omap_list
+}
+
+# get all pgs epoch 
+function do_pg_epoch()
+{
+  local func="do_pg_epoch"
+  init_env_osd $1
+  local node=`hostname`
+  get_pgid_list
+  >$node_pg_epoch
+  local pgid=
+  local data_path=
+  local host=`hostname`
+  echo "[$host]: $func ..."
+  while read line
+  do
+  {
+    pgid=`echo $line|awk '{print $1}'`
+    data_path=`echo $line|awk '{print $2}'`
+    get_pg_epoch $pgid
+    echo -e "$node $pgid $pg_epoch $data_path" >>$node_pg_epoch
+  } 
+  done < $pgid_list
+}
+
+# get an list of image in this osd node, pg epoch maybe not the latest, the admin node will do distinguish
+function do_image_list()
+{
+  local func="do_image_list"
+  init_env_osd $1
+  get_image_list   
+  local node=`hostname`
+  >$image_v1
+  >$image_v2
+  local host=`hostname`
+  echo "[$host]: $func ..."
+  for line in `cat $image_list_v1`
+  do
+    pgid=`get_pgid $line`
+    get_pg_epoch $pgid
+    echo "$node $line $pg_epoch" >> $image_v1
+  done
+  for line in `cat $image_list_v2`
+  do
+    pgid=`get_pgid $line`
+    get_pg_epoch $pgid
+    echo "$node $line $pg_epoch" >> $image_v2
+  done
+}
+
+# Thin wrapper: set up the osd environment, then call get_image_id.
+#   $1 - forwarded to init_env_osd
+#   $2 - forwarded to get_image_id
+function do_image_id()
+{
+  local func="do_image_id"
+  init_env_osd $1
+  get_image_id $2
+}
+
+# Thin wrapper around get_image_metadata_v1.
+#   $1 - forwarded to init_env_osd
+#   $2 - image header hobject
+#   $3 - snapshot name (optional; dropped from the call when empty because
+#        the expansion is unquoted)
+function do_image_metadata_v1()
+{
+  local func="do_image_metadata_v1"
+  init_env_osd $1
+  local image_header_hobject=$2
+  local snap_name=$3
+  get_image_metadata_v1 $image_header_hobject $snap_name
+}
+
+# Fetch format-2 image metadata in three chained helper calls.
+#   $1 - forwarded to init_env_osd
+#   $2 - image id
+#   $3 - image header hobject
+#   $4 - snapshot name
+function do_image_metadata_v2()
+{
+  local func="do_image_metadata_v2"
+  init_env_osd $1
+  local image_id=$2
+  # NOTE(review): not referenced again in this function; it may still be read
+  # by a helper through bash's dynamic scoping -- confirm before removing.
+  local image_header_hobject=$3
+  local snap_name=$4
+  get_map_header $image_id 
+  # $map_header_prefix / $map_header_key are presumably set by get_map_header,
+  # and $meta_header_seq by get_meta_header_seq -- TODO confirm in metadata_h.
+  get_meta_header_seq $map_header_prefix $map_header_key
+  get_image_metadata_v2 $meta_header_seq $snap_name
+}
+
+# Refuse to do anything while a ceph-osd daemon is still running.
+check_ceph_osd
+# Dispatch: the first argument names the function to run, the remaining ones
+# are its arguments. "$@" hands every argument over intact, whereas the
+# unquoted $* would re-split them on whitespace and glob-expand them.
+"$@"
diff --git a/src/rbd_recover_tool/rbd-recover-tool b/src/rbd_recover_tool/rbd-recover-tool
new file mode 100755 (executable)
index 0000000..b24992d
--- /dev/null
@@ -0,0 +1,327 @@
+#!/bin/bash
+# file: rbd-recover-tool
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+# rbd-recover-tool is an offline recover tool for rbd image in replicated pool
+# when ceph cluster is stopped.
+# it is a simple disater recovery policy, just for urgent condition
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+. $my_dir/database_h
+
+#scp files from admin node to osd node
+file1=common_h
+file2=metadata_h
+file3=epoch_h
+file4=osd_job
+
+#------------ admin node's action -------------
+
+function scp_file()
+{
+  local func="scp_file"
+  file=$1
+  if [ "$1"x = ""x ];then
+    echo "$func: not file input"
+    exit
+  fi
+  for host in `cat $osd_host`
+  do
+  {
+    echo "$func: $host"
+    scp $ssh_option $file $host:$job_path  1>/dev/null
+  } &
+  done
+}
+
+function scp_files()
+{
+  local func="scp_files"
+  for host in `cat $osd_host`
+  do
+  {
+    echo "$func: $host"
+    scp $ssh_option $file1 $host:$job_path
+    scp $ssh_option $file2 $host:$job_path
+    scp $ssh_option $file3 $host:$job_path
+    scp $ssh_option $file4 $host:$job_path
+  } &
+  done
+  wait
+  echo "$func: finish"
+}
+
+function scatter_node_jobs()
+{
+  local func="scatter_node_jobs"
+  local host=
+  local data_path=
+  echo "$func: flush osd journal & generate infos: omap, pg, image metadata ..."
+
+  trap 'echo $func failed; exit' INT HUP
+  while read line
+  do
+  {
+    host=`echo $line|awk '{print $1}'`
+    data_path=`echo $line|awk '{print $2}'`
+    check_osd_process $host
+
+    cmd="mkdir -p $job_path"
+    ssh $ssh_option $host $cmd
+    scp $ssh_option $file1 $host:$job_path  >/dev/null
+    scp $ssh_option $file2 $host:$job_path  >/dev/null
+    scp $ssh_option $file3 $host:$job_path  >/dev/null
+    scp $ssh_option $file4 $host:$job_path  >/dev/null
+
+    cmd="bash $job_path/osd_job flush_osd_journal $data_path;"
+    cmd="$cmd $job_path/osd_job do_omap_list $data_path;"
+    cmd="$cmd bash $job_path/osd_job do_pg_epoch $data_path;"
+    cmd="$cmd bash $job_path/osd_job do_image_list $data_path;"
+
+    ssh $ssh_option $host $cmd </dev/null
+  } &
+  done < $osd_host_path
+  wait
+  echo "$func: finish"
+}
+
+function gather_node_infos()
+{
+  local func="gather_node_infos"
+  echo "$func ..."
+  >$pg_coll
+  >$image_coll_v1
+  >$image_coll_v2
+  trap 'echo $func failed; exit' INT HUP
+  while read line
+  do
+  {
+    host=`echo $line|awk '{print $1}'`
+    data_path=`echo $line|awk '{print $2}'`
+    echo "$func: $host"
+    check_osd_process $host
+
+    #pg epoch
+    cmd1="bash $job_path/osd_job cat_pg_epoch $data_path"
+    ssh $ssh_option $host $cmd1 >> $pg_coll
+    #image v1
+    cmd2="bash $job_path/osd_job cat_image_v1 $data_path"
+    ssh $ssh_option $host $cmd2 >> $image_coll_v1
+    #image v2
+    cmd3="bash $job_path/osd_job cat_image_v2 $data_path"
+    ssh $ssh_option $host $cmd3 >> $image_coll_v2
+  } &
+  done < $osd_host_path
+  wait
+  echo "$func: finish"
+}
+
+function scatter_gather()
+{
+  local func="scatter_gather"
+  if [ ! -s $osd_host ];then
+    echo "$func: no osd_host input"
+    exit
+  fi
+  if [ ! -s $mon_host ];then
+    echo "$func: no mon_host input"
+    exit
+  fi
+  scatter_node_jobs
+  gather_node_infos
+}
+
+
+#------------- operations --------------
+
+# "database" operation: run scatter_gather to collect the data from the osd
+# nodes, then call gen_database.
+function database()
+{
+  scatter_gather
+  gen_database
+}
+
+# "list" operation: delegates to list_images.
+function list()
+{
+  list_images
+}
+
+# "lookup" operation: delegates to lookup_image.
+#   $1 - pool id
+#   $2 - image name
+#   $3 - snapshot name (may be empty; an empty value is dropped from the call
+#        because the expansion is unquoted)
+function lookup()
+{
+  lookup_image $1 $2 $3
+}
+
+# "recover" operation: delegates to recover_image.
+#   $1 - pool id
+#   $2 - image name
+#   $3 - snapshot selector as prepared by admin_cmd ("@", "@@" or a snap name)
+#   $4 - directory to store the image in (optional)
+function recover()
+{
+  recover_image $1 $2 $3 $4
+}
+
+#------------- helper -------------
+
+# Print the help text for all user-facing sub-commands (database, list,
+# lookup, recover) to stdout. Takes no arguments.
+function usage()
+{
+  local cmd_name="rbd-recover-tool"
+  echo 
+  echo "$cmd_name is used to recover rbd image of replicated pool, 
+       when all ceph services are stopped"
+  echo "Usage:"
+  echo "$cmd_name database
+                       gather pg info, object info, image metadata, 
+                       and epoch info from all osd nodes,
+                       this will cosume a long time, just be patient, 
+                       especially when scale up to 1000+ osds"
+  echo "$cmd_name list
+                       list all rbd images of all replicated pools, 
+                       before to lookup & recover"
+  echo "$cmd_name lookup  <pool_id>/<image_name>[@[<snap_name>]]
+                       show image metadata: image format, rbd id, size, order, snapseq
+                       In addtion, for image with snapshots, 
+                       this will list all snapshot infomations"
+  echo "$cmd_name recover <pool_id>/<image_name>[@[<snap_name>]] [</path/to/store/image>]
+                       all snapshots share one image head, to economize disk space
+                       so there is only one snapshot at any time,
+                       image is saved at </path/to/store/image>/pool_<pool_id>/image_name/image_name
+                       cat <path/to/store/image>/pool_<pool_id>/image_name/@CURRENT,
+                       will show snapid
+                       recover to raw image/nosnap/head: <image_name>
+                       rollback to image head:           <image_name>@
+                       rollback to image snap:           <image_name>@<snap_name>
+                       recover steps:
+                       1. recover image nosnap (only one time)
+                       2. rollback to image snap"
+}
+
+function get_path()
+{
+  local func="get_path"
+  if [ $# -lt 1 ];then
+    return
+  fi
+  if [[ $1 =~ // ]];then
+    return # "/path//to" is invalid
+  fi
+  local parent=`dirname $1`
+  local name=`basename $1`
+  if [ "$parent"x = "/"x ];then
+    echo "$parent$name"
+  else
+    echo -n "$parent/$name"
+  fi
+}
+
+function admin_cmd()
+{
+  local func="admin_cmd"
+  if [ $# -lt 1 ];then
+    usage
+    exit
+  fi
+  if [ "$1"x = "-h"x ] || [ "$1"x = "--help"x ];then
+    usage
+    exit
+  fi
+  
+  if [ "$1"x = "database"x ];then
+    if [ $# -gt 1 ];then
+      usage
+      exit
+    fi
+    # remove osd_host to refresh osd_host and osd_host_mapping
+    rm -f $osd_host
+    init_env_admin
+    database
+  elif [ "$1"x = "list"x ];then
+    if [ $# -gt 1 ];then
+      usage
+      exit
+    fi
+    init_env_admin
+    list
+  elif [ "$1"x = "lookup"x ];then
+    if [ $# -gt 2 ];then
+      usage
+      exit
+    fi
+    local pool_id=-1
+    local image_name=
+    local snap_name=
+    if [[ $2 =~  ^([^@/]+)/([^@/]+)$ ]];then
+      pool_id="${BASH_REMATCH[1]}"
+      image_name="${BASH_REMATCH[2]}"
+    elif [[ $2 =~  ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then
+      pool_id="${BASH_REMATCH[1]}"
+      image_name="${BASH_REMATCH[2]}"
+      snap_name="${BASH_REMATCH[3]}"
+    else
+      echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]"
+      exit
+    fi
+    init_env_admin
+    lookup $pool_id $image_name $snap_name
+  elif [ "$1"x = "recover"x ];then
+    if [ $# -lt 2 ] || [ $# -gt 3 ];then
+      usage
+      exit
+    fi
+    local pool_id=-1
+    local image_name=
+    local snap_name=@
+    local image_dir=
+    if [[ $2 =~  ^([^@/]+)/([^@/]+)$ ]];then
+      pool_id="${BASH_REMATCH[1]}"
+      image_name="${BASH_REMATCH[2]}"
+    elif [[ $2 =~  ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then
+      pool_id="${BASH_REMATCH[1]}"
+      image_name="${BASH_REMATCH[2]}"
+      snap_name="${BASH_REMATCH[3]}"
+      if [ "$snap_name"x = ""x ];then
+        snap_name=@@
+      fi
+    else
+      echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]"
+      exit
+    fi
+    if [ $# = 3 ];then
+      image_dir=`get_path $3`
+      if [ "image_dir"x = ""x ];then
+        echo "$3 invalid"
+        exit
+      fi
+    fi
+    init_env_admin
+    recover $pool_id $image_name $snap_name $image_dir
+  elif [ "$1"x = "scp_files"x ];then
+    if [ $# -gt 1 ];then
+      exit
+    fi
+    admin_parse_osd
+    scp_files
+  elif [ "$1"x = "scp_file"x ];then
+    if [ $# -gt 2 ];then
+      exit
+    fi
+    admin_parse_osd
+    scp_file $2
+  else
+    echo "$func: $1: command not found"
+  fi
+}
+
+# Entry point. "$@" forwards every command-line argument intact, whereas the
+# unquoted $* would re-split them on whitespace and glob-expand them.
+admin_cmd "$@"