#!/bin/bash
# ceph_collect.sh - collect relevant data from a Ceph cluster for
# troubleshooting and assessment.
# (C) Christian Huebner chuebner@mirantis.com 2015
#
# Run with: ./ceph_collect.sh [-vol] [-nb] [-m] <customername> <clustername>
# (the shebang must stay on the very first line of the file to take effect)
echo "Collecting Ceph cluster data."
# Print the usage summary to stdout.
# The option list mirrors the flags handled by the argument parser of this
# script (-h, -vol, -nb, -m).
# Note: defining this function shadows bash's builtin `help` inside the script.
help() {
  printf '%s\n' \
    "Data collector for Ceph analytics" \
    "Usage: ceph_collect.sh [-h] [-vol] [-nb] [-m] <customername> <clustername>" \
    "  -h,   --help           show this help and exit" \
    "  -vol, --volumes        collect volumes and CoW clones per rbd pool" \
    "  -nb,  --nobench        skip the Ceph benchmark" \
    "  -m,   --healthmetrics  collect Ceph health metrics per device" \
    "-m only works with Nautilus and up"
}
# ---------------------------------------------------------------------------
# Defaults, command-line parsing and working-directory setup.
#
# Recognised options:
#   -h,   --help           print usage and exit
#   -vol, --volumes        enable the per-pool volume collection
#   -nb,  --nobench        disable the benchmark
#   -m,   --healthmetrics  enable the per-device health metrics collection
# Everything else is stored as a positional argument; the first two
# positionals are <customername> and <clustername>.
# ---------------------------------------------------------------------------
POSITIONAL=()
BENCH="true"
VOLUMES="false"
HEALTHMETRICS="false" # explicit default; only -m switches it on
OSDS=()               # filled in further down, once ceph is known to exist

# No arguments at all: show the usage text and stop with an error status.
if [[ $# -eq 0 ]]; then
  help
  exit 1
fi

while [[ $# -gt 0 ]]; do
  key="$1"
  case "$key" in
    -h | --help)
      help
      exit
      ;;
    -vol | --volumes)
      VOLUMES="true"
      shift # past argument
      ;;
    -nb | --nobench)
      BENCH="false"
      shift # past argument
      ;;
    -m | --healthmetrics)
      HEALTHMETRICS="true"
      shift # past argument
      ;;
    *)
      # NOTE(review): unrecognised "-x" style options also end up here and
      # are kept as positional arguments.
      POSITIONAL+=("$1") # save it in an array for later
      shift              # past argument
      ;;
  esac
done

if [[ ${#POSITIONAL[@]} -lt 2 ]]; then
  echo "Usage: ./ceph_collect.sh <CUSTOMER> <CLUSTERNAME>" >&2
  exit 1
fi
export CUSTOMER="${POSITIONAL[0]}"
export CLUSTERNAME="${POSITIONAL[1]}"

# Check for the ceph CLI before anything tries to call it.
if ! command -v ceph >/dev/null; then
  echo "ERROR: This script must be run on a ceph monitor or admin node" >&2
  exit 1
fi

# OSD ids as reported by the cluster. Word splitting is intentional: the
# output of `ceph osd ls` is split on whitespace into array elements.
# shellcheck disable=SC2207
OSDS=($(ceph osd ls))

DATE=$(date "+%Y-%m-%d")
DIRNAME="CephCollectData.$CUSTOMER.$CLUSTERNAME.$DATE"
ARCHNAME="${DIRNAME}.tar.gz"

# All collected files are written relative to the current directory, so the
# script must not continue if the output directory cannot be entered.
mkdir -p -- "$DIRNAME" || {
  echo "ERROR: cannot create directory $DIRNAME" >&2
  exit 1
}
cd -- "$DIRNAME" || {
  echo "ERROR: cannot change into directory $DIRNAME" >&2
  exit 1
}
# ---------------------------------------------------------------------------
# Cluster-wide state dumps. Each step announces itself on stdout and writes
# one file into the current directory.
# ---------------------------------------------------------------------------

# CRUSH map: fetch the binary map, render it as text and as JSON, then
# discard the binary.
printf '%s\n' "Collecting CRUSH map"
ceph osd getcrushmap -o crush.bin
crushtool -d crush.bin -o crushmap.txt
crushtool -i crush.bin --dump > crushmap.json
rm crush.bin

printf '%s\n' "Collecting ceph osd crush dump"
ceph osd crush dump > crushdump.json

# Status, health and monitor map.
printf '%s\n' "Collecting cluster status"
ceph -s -f json -o ceph_s.json

printf '%s\n' "Collecting health detail"
ceph -f json health detail -o ceph_health_detail.json

printf '%s\n' "Collecting monmap"
ceph mon dump -f json -o monmap.json

# Capacity and OSD layout.
printf '%s\n' "Collecting ceph df"
ceph df -f json -o ceph_df.json

printf '%s\n' "Collecting ceph osd df"
ceph osd df -f json -o ceph_osd_df.json

printf '%s\n' "Collecting ceph osd dump"
ceph osd dump -f json -o ceph_osd_dump.json

printf '%s\n' "Collecting rados df"
rados df -f json > rados_df.json

printf '%s\n' "Collecting ceph report"
ceph report -o ceph_report.json

# Auth entries; sed replaces every "AQ…==" token with the literal KEY so
# that the key material is not written to the output file.
printf '%s\n' "Collecting auth data anonymized"
ceph auth list -f json | sed 's/AQ[^=]*==/KEY/g' > ceph_auth_ls.json

# Placement groups and autoscaler.
printf '%s\n' "Collecting ceph pg dump"
ceph pg dump -f json -o ceph_pg_dump.json

printf '%s\n' "Collecting ceph pg autoscale"
ceph osd pool autoscale-status -f json -o ceph_pg_autoscale_status.json

printf '%s\n' "Collecting ceph running configuration"
ceph config dump -f json > ceph_config_dump.json
# Erasure code profiles: the list of profile names as JSON (collected once),
# followed by one JSON file per profile with that profile's settings.
echo "Collecting ceph erasure code profiles"
ceph -f json osd erasure-code-profile ls > ceph_osd_erasure-code-profiles.json
for prof in $(ceph osd erasure-code-profile ls); do
  ceph -f json -o "ceph_osd_erasure-code-profile_${prof}.json" osd erasure-code-profile get "$prof"
done

echo "Collecting rbd ls -l"
# NOTE(review): sed replaces the final character of the last output line
# with '}'. The listing is requested without a JSON format option, so the
# purpose of that substitution is unclear - confirm before relying on this
# file being JSON.
rbd ls -l | sed '$ s/.$/}/' > rbd_ls_l.json

echo "Collecting block DB/WAL stats"
# One line per OSD of the form: "osd.<id>": <bluefs stats output>.
# The whole loop is redirected once with '>' so that running the script again
# in the same output directory replaces the file instead of appending
# duplicate entries to it.
for osd_id in "${OSDS[@]}"; do
  # The command substitution is deliberately unquoted: word splitting joins
  # multi-line output onto a single line separated by single spaces.
  # shellcheck disable=SC2046
  echo "\"osd.${osd_id}\":" $(ceph -f json tell "osd.${osd_id}" bluefs stats)
done > ceph_db_wal_stats.json
# Optional (-vol): count volumes, snapshots and clones for each rbd pool.
if [[ $VOLUMES = "true" ]]; then
  echo "Collecting ceph volumes and CoW clones per rbd pool"
  # Pool names: third field of every `ceph osd pool ls detail` line that
  # contains "rbd", with single quotes stripped.
  #
  # NOTE(review): the emitted records are not strict JSON (the pool name is
  # not quoted and a comma precedes the closing brace). The layout is kept
  # as-is so that existing consumers of volumes_per_pool.json keep working.
  #
  # The loop output replaces the file ('>') so that a second run in the same
  # output directory does not append duplicate records.
  for pool in $(ceph osd pool ls detail | grep rbd | awk '{print $3}' | sed s/\'//g); do
    # Query the (potentially slow) image listing once per pool and derive
    # all three counters from the same snapshot of it.
    images=$(rbd ls -l "$pool")
    # Command substitution strips the trailing newline; put it back so the
    # line counts below see exactly what rbd printed.
    if [[ -n $images ]]; then
      images+=$'\n'
    fi
    # Lines without '@' (an empty result of the trailing grep drops the value).
    vol_count=$(printf '%s' "$images" | grep -v '@' | wc -l | grep -v ' 0$')
    # Lines whose first column contains '@'.
    snap_count=$(printf '%s' "$images" | awk '{print $1}' | grep '@' | wc -l)
    # Lines whose fourth column contains '@'.
    clone_count=$(printf '%s' "$images" | awk '{print $4}' | grep '@' | wc -l)
    # Counters are left unquoted so that padding around the numbers is
    # dropped and an empty counter contributes no word at all.
    # shellcheck disable=SC2086
    echo '{' \
      '"name" : ' "$pool" ',' \
      '"volumes" : ' $vol_count ',' \
      '"snapshots" : ' $snap_count ',' \
      '"clones" : ' $clone_count ',' \
      '}'
  done > volumes_per_pool.json
else
  echo "Volume collection disabled."
fi
echo "Collecting ceph osd perf"
# Ten samples (0..9), each written to its own file, with a four second pause
# after every sample. The sample number is echoed as a progress indicator.
for sample in {0..9}; do
  echo "$sample"
  ceph osd perf -f json -o "ceph_osd_perf_${sample}.json"
  sleep 4
done

# `ceph device ls` succeeding is used as the probe for device health support.
# The command is tested directly; wrapping it in $(...) would only work by
# accident of its output being discarded.
if ceph device ls &>/dev/null; then
  echo "Collecting device health information"
  # First column of every line not containing "DEVICE" is passed, one value
  # per invocation, to `ceph device get-health-metrics`; all results are
  # written to a single file.
  ceph device ls | grep -v DEVICE | awk '{print $1}' \
    | xargs --max-args=1 ceph device get-health-metrics > ceph_device_get_health_metrics.json
else
  echo "Device health check not supported"
fi
# Optional (-m): one health-metrics file per device, named after the third
# column of that device's `ceph device ls` line.
if [[ $HEALTHMETRICS = "true" ]]; then
  echo "Collecting Ceph Health Metrics (-m option)"
  # Read the listing line by line into an array instead of changing the
  # global IFS, so word splitting in the rest of the script is unaffected.
  mapfile -t device_rows < <(ceph device ls | grep -v DEVICE)
  for device_row in "${device_rows[@]}"; do
    # Column 1 is the device id, column 3 is used for the file name.
    IFS=$' \t\n' read -r dev _ osd _ <<<"$device_row"
    # Skip blank lines rather than querying an empty device id.
    [[ -n $dev ]] || continue
    ceph device get-health-metrics "$dev" > "ceph_health_${osd}.json"
  done
fi
# Benchmark (on unless -nb was given): run `ceph tell osd.<id> bench` for
# every OSD and collect the results in one brace-delimited file with one
# "osd.<id>": <result> entry per line.
if [[ $BENCH = "true" ]]; then
  echo "Collecting Ceph Benchmark"
  {
    echo "{ "
    for osd_id in ${OSDS[@]}; do
      # Every entry except the one for the last OSD id is followed by a comma.
      if [ $osd_id -ne ${OSDS[-1]} ]; then
        entry_sep=','
      else
        entry_sep=''
      fi
      # Unquoted on purpose: the bench output is word-split onto one line and
      # an empty separator adds no trailing word.
      # shellcheck disable=SC2046,SC2086
      echo "\"osd.${osd_id}\":" $(ceph tell osd.$osd_id bench -f json 12000000 4096) $entry_sep
    done
    echo "}"
  } > ceph_tell_bench.json
fi
# Grab Ceph logs
echo "Collecting Ceph Logs"
# Best effort: a failure to fetch either log must not stop the archive
# from being created.
ceph log last 10000 cluster > ceph_cluster_log.json || true
ceph log last 10000 audit > ceph_audit_log.json || true

# Pack every collected file into ../<ARCHNAME>. The archive path is quoted so
# that names containing whitespace produce a single, correctly named file.
if ! tar czf "../${ARCHNAME}" *; then
  echo "ERROR: failed to create archive ../${ARCHNAME}" >&2
  cd ..
  exit 1
fi
cd ..