blob: 1285aa92e7c45b4a76d280abb1db8a3157bc3398 [file] [log] [blame]
#!/bin/bash

# Collect relevant data from a Ceph cluster for troubleshooting and assessment
# (C) Christian Huebner chuebner@mirantis.com 2015
# run with ./ceph_collect.sh <customername> <clustername>
# Optional flags: -nb/--nobench, -vol/--volumes, -m/--healthmetrics, -h/--help
# Result: CephCollectData.<customer>.<cluster>.<date>.tar.gz in the current dir.

echo "Collecting Ceph cluster data."
8
# Print usage information for this collector.
# Fix: the old text advertised a nonexistent "-b" flag; the real options
# are -nb/--nobench, -vol/--volumes and -m/--healthmetrics.
help () {
    echo "Data collector for Ceph analytics"
    echo "Usage: ceph_collect.sh [-nb] [-vol] [-m] <customername> <clustername>"
    echo "-nb | --nobench        skip the OSD bench run"
    echo "-vol | --volumes       collect per-pool volume/snapshot/clone counts"
    echo "-m | --healthmetrics   collect device health metrics"
    echo "-m only works with Nautilus and up"
}
14
# Defaults for the command-line options parsed below.
POSITIONAL=()
BENCH="true"
VOLUMES="false"
# Enumerate OSD ids up front (used by the bluefs-stats and bench loops).
# Guard with command -v so running "-h" on a non-ceph host does not spray
# "command not found"; the hard requirement is enforced after parsing.
if command -v ceph >/dev/null; then
    OSDS=($(ceph osd ls))
else
    OSDS=()
fi

if [[ $# -eq 0 ]]; then
    help
    exit
fi
24
# Walk the argument list: recognized flags set their switch; anything
# else is kept, in order, as a positional argument (customer / cluster).
while (( $# )); do
    arg="$1"
    shift
    case "$arg" in
        -h|--help)            help; exit ;;
        -vol|--volumes)       VOLUMES="true" ;;
        -nb|--nobench)        BENCH="false" ;;
        -m|--healthmetrics)   HEALTHMETRICS="true" ;;
        *)                    POSITIONAL+=("$arg") ;;   # not a flag: keep for later
    esac
done
51
# Require exactly two positional arguments: customer and cluster name.
# Fix: a usage error now goes to stderr and exits non-zero (was exit 0).
if [ ${#POSITIONAL[*]} -lt 2 ]; then
    echo "Usage: ./ceph_collect.sh <CUSTOMER> <CLUSTERNAME>" >&2
    exit 1
fi
export CUSTOMER=${POSITIONAL[0]}
export CLUSTERNAME=${POSITIONAL[1]}
Alexdcb792f2021-10-04 14:24:21 -050056
# This script must run where the ceph CLI (and cluster keyring) exist.
# Fix: 'command -v' instead of 'which', and exit non-zero on error
# (the old 'exit' returned the status of echo, i.e. 0).
if ! command -v ceph >/dev/null; then
    echo "ERROR: This script must be run on a ceph monitor or admin node" >&2
    exit 1
fi

# Everything below writes into a dated per-cluster collection directory.
DATE=$(date "+%Y-%m-%d")
DIRNAME="CephCollectData.$CUSTOMER.$CLUSTERNAME.$DATE"
ARCHNAME="$DIRNAME.tar.gz"
mkdir "$DIRNAME"
# Guard the cd: collecting into the wrong directory would be worse than aborting.
cd "$DIRNAME" || { echo "ERROR: cannot cd into $DIRNAME" >&2; exit 1; }
64
65echo "Collecting CRUSH map"
66ceph osd getcrushmap -o crush.bin
67crushtool -d crush.bin -o crushmap.txt
68crushtool -i crush.bin --dump > crushmap.json
69rm crush.bin
70
71echo "Collecting ceph osd crush dump"
72ceph osd crush dump >crushdump.json
73
74echo "Collecting cluster status"
75ceph -s -f json -o ceph_s.json
76echo "Collecting health detail"
Ievgeniia Zadorozhna40d074b2025-07-16 19:06:29 +020077ceph -f json health detail -o ceph_health_detail.json
Alexdcb792f2021-10-04 14:24:21 -050078echo "Collecting monmap"
79ceph mon dump -f json -o monmap.json
80echo "Collecting ceph df"
81ceph df -f json -o ceph_df.json
82echo "Collecting ceph osd df"
83ceph osd df -f json -o ceph_osd_df.json
84echo "Collecting ceph osd dump"
85ceph osd dump -f json -o ceph_osd_dump.json
86echo "Collecting rados df"
87rados df -f json >rados_df.json
88echo "Collecting ceph report"
89ceph report -o ceph_report.json
90echo "Collecting auth data anonymized"
91ceph auth list -f json |sed 's/AQ[^=]*==/KEY/g' > ceph_auth_ls.json
92echo "Collecting ceph pg dump"
93ceph pg dump -f json -o ceph_pg_dump.json
Ievgeniia Zadorozhna40d074b2025-07-16 19:06:29 +020094echo "Collecting ceph pg autoscale"
95ceph osd pool autoscale-status -f json -o ceph_pg_autoscale_status.json
echo "Collecting ceph running configuration"
ceph config dump -f json >ceph_config_dump.json
# Fix: this erasure-code-profile listing was collected twice, verbatim,
# into the same file; the duplicate invocation has been removed.
echo "Collecting ceph erasure code profiles"
ceph -f json osd erasure-code-profile ls >ceph_osd_erasure-code-profiles.json

echo "Collecting rbd ls -l"
# NOTE(review): 'rbd ls -l' emits plain text; the sed only rewrites the last
# character of the last line to '}', so the resulting file is not valid JSON.
# Confirm what the downstream consumer actually expects here.
rbd ls -l | sed '$ s/.$/}/' >rbd_ls_l.json
107
echo "Collecting block DB/WAL stats"
# One '"osd.N": {...}' line per OSD with its bluefs (DB/WAL) usage.
for i in "${OSDS[@]}"; do
    echo \"osd.$i\"\: $(ceph -f json tell osd.$i bluefs stats) >>ceph_db_wal_stats.json
done

# Dump every erasure-code profile into its own file.
for prof in $(ceph osd erasure-code-profile ls); do
    ceph -f json -o "ceph_osd_erasure-code-profile_$prof.json" osd erasure-code-profile get "$prof"
done
116
if [[ $VOLUMES = "true" ]]; then
    echo "Collecting ceph volumes and CoW clones per rbd pool"
    # For each pool whose 'osd pool ls detail' line mentions rbd, emit one
    # record counting images (no '@'), snapshots (name contains '@') and
    # clones (parent column contains '@').
    # NOTE(review): the output is not strict JSON — the pool name is unquoted
    # and each record ends with a trailing comma before '}'; confirm the
    # downstream parser tolerates this before tightening it.
    # NOTE(review): the trailing "grep -v ' 0$'" on the volumes count only
    # matters if wc pads its output with spaces — presumably meant to blank
    # out zero counts; verify against the consumer.
    for pool in `ceph osd pool ls detail |grep rbd |awk '{print $3}'|sed s/\'//g`; do \
        echo '{' \
        '"name" : ' $pool ',' \
        '"volumes" : ' `rbd ls -l $pool | grep -v '@'| wc -l |grep -v ' 0$'` ',' \
        '"snapshots" : ' `rbd ls -l $pool | awk '{print $1}' |grep '@' | wc -l` ',' \
        '"clones" : ' `rbd ls -l $pool | awk '{print $4}' |grep '@' | wc -l` ',' \
        '}' ; \
    done >> volumes_per_pool.json
else
    echo "Volume collection disabled."
fi
130
echo "Collecting ceph osd perf"
# Take ten samples of OSD commit/apply latency, 4 seconds apart (~40 s
# total), so short latency spikes are visible in the data set.
for ((sample = 0; sample < 10; sample++)); do
    echo "$sample"
    ceph osd perf -f json -o "ceph_osd_perf_$sample.json"
    sleep 4
done
133
# 'ceph device ls' only exists on Nautilus and newer; probe for it first.
# Fix: the old 'if $(cmd &>/dev/null)' only worked via bash's obscure
# empty-expansion exit-status rule; call the command directly instead.
if ceph device ls &>/dev/null; then
    echo "Collecting device health information"
    # First column is the device id; skip the header row.
    ceph device ls|grep -v DEVICE|awk '{print $1}'|xargs --max-args=1 ceph device get-health-metrics > ceph_device_get_health_metrics.json
else
    echo "Device health check not supported"
fi
140
if [[ $HEALTHMETRICS = "true" ]]; then
    echo "Collecting Ceph Health Metrics (-m option)"
    # One health-metrics file per device, named after the owning OSD.
    # Fix: the original set IFS=$'\n' globally and never restored it;
    # a 'while read' line loop keeps IFS untouched for the rest of the run.
    ceph device ls | grep -v DEVICE | while IFS= read -r device; do
        osd=$(echo "$device" | awk '{print $3}')     # DAEMONS column
        dev=$(echo "$device" | awk '{print $1}')     # DEVICE id column
        ceph device get-health-metrics "$dev" >"ceph_health_$osd.json"
    done
fi
150
if [[ $BENCH = "true" ]]; then
    echo "Collecting Ceph Benchmark"
    # Build one JSON object {"osd.N": <bench result>, ...}; every entry
    # except the last needs a trailing comma.
    echo "{ " > ceph_tell_bench.json
    for i in "${OSDS[@]}"; do
        # 12 MB in 4 KiB writes per OSD; hoisted so the tell runs once
        # instead of being duplicated in both branches.
        result=$(ceph tell osd.$i bench -f json 12000000 4096)
        if [[ $i -ne ${OSDS[-1]} ]]; then
            echo \"osd.$i\"\: $result ',' >>ceph_tell_bench.json
        else
            echo \"osd.$i\"\: $result >>ceph_tell_bench.json
        fi
    done
    echo "}" >> ceph_tell_bench.json
fi
163
# Grab Ceph logs; 'ceph log last' can fail on older releases, hence the
# deliberate '|| true' so the archive step still runs.
echo "Collecting Ceph Logs"
ceph log last 10000 cluster >ceph_cluster_log.json || true
ceph log last 10000 audit >ceph_audit_log.json || true

# Bundle everything collected into ../<ARCHNAME> and return to the
# starting directory. Fix: quote the archive path so customer/cluster
# names containing spaces do not word-split the tar argument.
tar czf "../$ARCHNAME" *
cd ..
echo "Done. Created $ARCHNAME"
Alexdcb792f2021-10-04 14:24:21 -0500171