#!/usr/bin/python import os,sys,getopt,pprint SCRIPT='fix_pgs.py' def help(): print '%s -m <hostname>'%SCRIPT print '%s -h'%SCRIPT def get_pg_failed(): cmd = os.popen("ceph health detail | grep 'pg ' | grep 'failed_repair' | awk '{print $2\" \"$6}'").read() pgs = cmd.split("\n") pgs = pgs[0:-1] pgs_dict = {} for p in pgs: e = p.split(' ') osd_list = e[1][1:-1].split(",") pgs_dict[e[0]] = osd_list return pgs_dict def get_host_osds(): cmd = os.popen("ceph osd tree | grep host | awk '{print $4}'").read() hosts = cmd.split("\n") hosts = hosts[0:-1] hosts_dict = {} for h in hosts: cmd = os.popen("ssh %s systemctl list-units --type=service --state=active | grep ceph-osd | awk '{print $1}'"%h).read() services = cmd.split("\n") services = services[0:-1] hosts_dict[h] = [] for i in services: hosts_dict[h].append((i.split(".")[0]).split("@")[-1]) return hosts_dict def get_osd_hosts(): cmd = os.popen("ceph osd tree | grep host | awk '{print $4}'").read() hosts = cmd.split("\n") hosts = hosts[0:-1] osds_dict = {} for h in hosts: cmd = os.popen("ssh %s systemctl list-units --type=service --state=active | grep ceph-osd | awk '{print $1}'"%h).read() services = cmd.split("\n") services = services[0:-1] for i in services: osd = (i.split(".")[0]).split("@")[-1] osds_dict[osd] = h return osds_dict def build_commands(pgs,osds): commands = [] for p in pgs: print p for o in pgs[p]: commands.append("ssh %s systemctl stop ceph-osd@%s.service"%(osds[o],o)) commands.append("ssh %s ceph-osd -i %s --flush-journal"%(osds[o],o)) commands.append("ssh %s systemctl start ceph-osd@%s.service"%(osds[o],o)) commands.append("ceph pg repair %s"%p) return commands # stop the OSD that has the wrong object responsible for that PG # flush the journal (ceph-osd -i <id> --flush-journal) # move the bad object to another location # start the OSD again # call ceph pg repair 17.1c1 def fix_pgs(): pgs = get_pg_failed() osds_dict = get_osd_hosts() build_commands(pgs,osds_dict) commands = build_commands(pgs,osds_dict) for c in commands: os.popen(c) def main(argv): try: opts, args = getopt.getopt(argv,"hmpf",["machine=pg_failed"]) except getopt.GetoptError: help() sys.exit(2) for opt, arg in opts: if opt == '-h': help() sys.exit() elif opt in ("-m", "--machine"): osds = get_host_osds() pprint.pprint(osds) elif opt in ("-p", "--pg_failed"): pg = get_pg_failed() pprint.pprint(pg) elif opt in ("-f", "--fix_pg"): fix_pgs() else: help() sys.exit() if __name__ == "__main__": main(sys.argv[1:])