#!/bin/bash

# Collect relevant data from a Ceph cluster for troubleshooting and assessment
# (C) Christian Huebner chuebner@mirantis.com 2015
# run with ./ceph_collect.sh [options] <customername> <clustername>

echo "Collecting Ceph cluster data."

help () {
    echo "Data collector for Ceph analytics"
    echo "Usage: ceph_collect.sh [-nb|--nobench] [-vol|--volumes] [-m|--healthmetrics] <customername> <clustername>"
    echo "  -nb   skip the OSD benchmark"
    echo "  -vol  collect per-pool volume, snapshot and clone counts"
    echo "  -m    collect device health metrics (only works with Nautilus and up)"
}

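# Defaults: run the OSD benchmark, skip per-pool volume counts and device health metrics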
POSITIONAL=()
BENCH="true"
VOLUMES="false"
HEALTHMETRICS="false"

if [[ $# -eq 0 ]]; then
    help
    exit
fi

while [[ $# -gt 0 ]]; do
    key="$1"

    case $key in
        -h|--help)
            help
            exit
            ;;
        -vol|--volumes)
            VOLUMES="true"
            shift
            ;;
        -nb|--nobench)
            BENCH="false"
            shift # past argument
            ;;
        -m|--healthmetrics)
            HEALTHMETRICS="true"
            shift # past argument
            ;;
        *) # unknown option
            POSITIONAL+=("$1") # save it in an array for later
            shift # past argument
            ;;
    esac
done

if [ ${#POSITIONAL[@]} -lt 2 ]; then echo "Usage: ./ceph_collect.sh <CUSTOMER> <CLUSTERNAME>"; exit; fi
export CUSTOMER=${POSITIONAL[0]}
export CLUSTERNAME=${POSITIONAL[1]}

if ! which ceph >/dev/null; then echo "ERROR: This script must be run on a ceph monitor or admin node"; exit; fi

OSDS=($(ceph osd ls))

DATE=$(date "+%Y-%m-%d")
DIRNAME="CephCollectData.$CUSTOMER.$CLUSTERNAME.$DATE"
ARCHNAME="$DIRNAME.tar.gz"
mkdir "$DIRNAME"
cd "$DIRNAME" || exit

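# Decompile the binary CRUSH map into both text and JSON form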
echo "Collecting CRUSH map"
ceph osd getcrushmap -o crush.bin
crushtool -d crush.bin -o crushmap.txt
crushtool -i crush.bin --dump > crushmap.json
rm crush.bin

echo "Collecting ceph osd crush dump"
ceph osd crush dump >crushdump.json

echo "Collecting cluster status"
ceph -s -f json -o ceph_s.json
echo "Collecting health detail"
ceph -f json health detail -o ceph_health_detail.json
echo "Collecting monmap"
ceph mon dump -f json -o monmap.json
echo "Collecting ceph df"
ceph df -f json -o ceph_df.json
echo "Collecting ceph osd df"
ceph osd df -f json -o ceph_osd_df.json
echo "Collecting ceph osd dump"
ceph osd dump -f json -o ceph_osd_dump.json
echo "Collecting rados df"
rados df -f json >rados_df.json
echo "Collecting ceph report"
ceph report -o ceph_report.json
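# Auth keys (base64 secrets of the form AQ...==) are masked with the literal string KEY before saving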
echo "Collecting anonymized auth data"
ceph auth list -f json |sed 's/AQ[^=]*==/KEY/g' > ceph_auth_ls.json
echo "Collecting ceph pg dump"
ceph pg dump -f json -o ceph_pg_dump.json
echo "Collecting ceph pg autoscale"
ceph osd pool autoscale-status -f json -o ceph_pg_autoscale_status.json
echo "Collecting ceph running configuration"
ceph config dump -f json >ceph_config_dump.json
echo "Collecting ceph erasure code profiles"
ceph -f json osd erasure-code-profile ls >ceph_osd_erasure-code-profiles.json

echo "Collecting rbd ls -l"
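# Note: the sed expression rewrites the last character of the final output line to '}'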
rbd ls -l | sed '$ s/.$/}/' >rbd_ls_l.json

echo "Collecting block DB/WAL stats"
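# Appends one "osd.N": <bluefs stats output> line per OSD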
for i in ${OSDS[@]}; do
    echo \"osd.$i\"\: $(ceph -f json tell osd.$i bluefs stats) >>ceph_db_wal_stats.json
done

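# Dump each erasure code profile into its own JSON file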
for prof in $(ceph osd erasure-code-profile ls); do
    ceph -f json -o ceph_osd_erasure-code-profile_$prof.json osd erasure-code-profile get $prof
done

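# For each rbd pool, emit one JSON object with image, snapshot and CoW clone counts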
if [[ $VOLUMES = "true" ]]; then
    echo "Collecting ceph volumes and CoW clones per rbd pool"
    for pool in $(ceph osd pool ls detail |grep rbd |awk '{print $3}'|sed s/\'//g); do
        echo '{' \
             '"name" : "'$pool'",' \
             '"volumes" : ' $(rbd ls -l $pool | grep -v '@'| wc -l |grep -v ' 0$') ',' \
             '"snapshots" : ' $(rbd ls -l $pool | awk '{print $1}' |grep '@' | wc -l) ',' \
             '"clones" : ' $(rbd ls -l $pool | awk '{print $4}' |grep '@' | wc -l) \
             '}'
    done >> volumes_per_pool.json
else
    echo "Volume collection disabled."
fi

echo "Collecting ceph osd perf"
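# Take 10 samples of the OSD latency counters, 4 seconds apart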
for i in {0..9}; do echo $i; ceph osd perf -f json -o ceph_osd_perf_$i.json; sleep 4; done

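# 'ceph device ls' only exists on newer releases (Nautilus and up), so probe for it first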
if ceph device ls &>/dev/null; then
    echo "Collecting device health information"
    ceph device ls|grep -v DEVICE|awk '{print $1}'|xargs --max-args=1 ceph device get-health-metrics > ceph_device_get_health_metrics.json
else
    echo "Device health check not supported"
fi

if [[ $HEALTHMETRICS = "true" ]]; then
    echo "Collecting Ceph Health Metrics (-m option)"
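    # 'ceph device ls' prints the device id in column 1 and the owning daemon (osd.N) in column 3; one output file per device, named after the OSD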
    IFS=$'\n'
    for device in $(ceph device ls|grep -v DEVICE); do
        osd=$(echo $device|awk '{print $3}')
        dev=$(echo $device|awk '{print $1}')
        ceph device get-health-metrics $dev >ceph_health_$osd.json
    done
    unset IFS
fi

if [[ $BENCH = "true" ]]; then
    echo "Collecting Ceph Benchmark"
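    # Each OSD writes 12000000 bytes in 4096-byte blocks; results are assembled into one JSON object keyed by osd.N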
    echo "{ " > ceph_tell_bench.json
    for i in ${OSDS[@]}; do
        if [ $i -ne ${OSDS[-1]} ]; then
            echo \"osd.$i\"\: $(ceph tell osd.$i bench -f json 12000000 4096) ',' >>ceph_tell_bench.json
        else
            echo \"osd.$i\"\: $(ceph tell osd.$i bench -f json 12000000 4096) >>ceph_tell_bench.json
        fi
    done
    echo "}" >> ceph_tell_bench.json
fi

# Grab Ceph logs
echo "Collecting Ceph Logs"
ceph log last 10000 cluster >ceph_cluster_log.json || true
ceph log last 10000 audit >ceph_audit_log.json || true

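# Bundle everything that was collected into a single tarball one level above the collection directory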
tar czf "../$ARCHNAME" *
cd ..
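echo "Collection complete: $ARCHNAME"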