Skip to content
Snippets Groups Projects
fix_pgs.py 2.75 KiB
Newer Older
#!/usr/bin/python
import os,sys,getopt,pprint



# Script name interpolated into the usage lines printed by help().
SCRIPT = "fix_pgs.py"


def help():
  """Print a usage summary of the supported options to stdout.

  The text lists the short options that main() accepts; none of them takes
  a value.

  NOTE: this name shadows the ``help`` builtin inside the module. It is kept
  because main() calls the function under this name.
  """
  # print(...) with a single pre-formatted string produces the same output on
  # Python 2 and Python 3.
  print('%s -m    show the OSD ids of the active ceph-osd services on each host' % SCRIPT)
  print('%s -p    show the PGs in failed_repair state and their OSDs' % SCRIPT)
  print('%s -f    stop, flush and restart the OSDs of each failed PG, then run "ceph pg repair"' % SCRIPT)
  print('%s -h    show this help' % SCRIPT)

def get_pg_failed(health_output=None):
  """Map each PG reported as failed_repair to the OSD ids listed for it.

  health_output -- optional, already captured output of the shell pipeline
                   used below: one "PGID [OSD,OSD,...]" pair per line.
                   When None (the default) the pipeline is run locally.

  Returns a dict {pgid: [osd_id, ...]}. All values are strings, exactly as
  they appear in the command output. Lines that do not carry both a PG id
  and at least one OSD id are skipped, so callers never see an empty OSD id.

  NOTE(review): the OSD list is taken from field 6 of the health line,
  presumably the acting set -- confirm against the ceph version in use.
  """
  if health_output is None:
    # awk keeps only field 2 (the PG id) and field 6 (the bracketed OSD list).
    with os.popen("ceph health detail | grep 'pg ' | grep 'failed_repair' | awk '{print $2\" \"$6}'") as pipe:
      health_output = pipe.read()

  pgs_dict = {}
  # splitlines() also keeps a final line that has no trailing newline.
  for line in health_output.splitlines():
    fields = line.split()
    if len(fields) != 2:
      # PG id without an OSD list (or a blank line): nothing usable here.
      continue
    pg_id, osd_field = fields
    osd_list = [osd for osd in osd_field.strip("[]").split(",") if osd]
    if osd_list:
      pgs_dict[pg_id] = osd_list
  return pgs_dict

def get_host_osds():
  """Return a dict {host: [osd_id, ...]} of the active ceph-osd services.

  Host names are column 4 of the "ceph osd tree" lines containing "host".
  Each host is then queried over ssh for its active systemd services, and
  the OSD id is cut out of every unit name matching "ceph-osd": the text
  before the first "." and after the last "@".
  """
  tree_output = os.popen("ceph osd tree | grep host | awk '{print $4}'").read()
  osds_by_host = {}
  # [:-1] discards the piece that follows the final newline of the output.
  for host in tree_output.split("\n")[:-1]:
    unit_listing = os.popen("ssh %s systemctl list-units --type=service --state=active | grep ceph-osd | awk '{print $1}'" % host).read()
    unit_names = unit_listing.split("\n")[:-1]
    osds_by_host[host] = [unit.split(".")[0].split("@")[-1] for unit in unit_names]
  return osds_by_host

def get_osd_hosts():
  """Return a dict {osd_id: host} for the active ceph-osd services.

  The hosts come from column 4 of the "ceph osd tree" lines containing
  "host". Every host is asked over ssh for its active systemd services; the
  OSD id is the part of each "ceph-osd" unit name before the first "." and
  after the last "@". If an id shows up on several hosts, the host
  processed last wins.
  """
  host_listing = os.popen("ceph osd tree | grep host | awk '{print $4}'").read()
  host_names = host_listing.split("\n")
  # The element after the final newline is not a host name.
  del host_names[-1]

  osd_to_host = {}
  for host in host_names:
    unit_listing = os.popen("ssh %s systemctl list-units --type=service --state=active | grep ceph-osd | awk '{print $1}'" % host).read()
    unit_names = unit_listing.split("\n")[:-1]
    osd_to_host.update(
      (unit.split(".")[0].split("@")[-1], host) for unit in unit_names
    )
  return osd_to_host

def build_commands(pgs,osds):
  """Build the ordered list of shell commands used to repair the given PGs.

  pgs  -- dict {pgid: [osd_id, ...]}, the shape produced by get_pg_failed().
  osds -- dict {osd_id: host}, the shape produced by get_osd_hosts().

  For every OSD of every PG four commands are queued, in this order: stop
  the ceph-osd service on its host (over ssh), flush the OSD journal, start
  the service again, and run "ceph pg repair" for the PG. Each PG id is
  printed to stdout as it is processed. Nothing is executed here.

  Returns the list of command strings.
  Raises KeyError when an OSD id of a PG has no entry in osds.
  """
  commands = []
  for pg_id in pgs:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(pg_id)
    for osd_id in pgs[pg_id]:
      host = osds[osd_id]
      commands.append("ssh %s systemctl stop ceph-osd@%s.service" % (host, osd_id))
      commands.append("ssh %s ceph-osd -i %s --flush-journal" % (host, osd_id))
      commands.append("ssh %s systemctl start ceph-osd@%s.service" % (host, osd_id))
      # NOTE(review): the repair is queued once per OSD, not once per PG;
      # kept as is -- confirm this is the intended sequence.
      commands.append("ceph pg repair %s" % pg_id)
  return commands
    


# stop the OSD that has the wrong object responsible for that PG
# flush the journal (ceph-osd -i <id> --flush-journal)
# move the bad object to another location
# start the OSD again
# call ceph pg repair 17.1c1
def fix_pgs():
  """Run the repair command sequence for every PG in failed_repair state.

  Collects the failed PGs with get_pg_failed(), resolves the host of each
  OSD with get_osd_hosts(), lets build_commands() produce the command list
  and then runs the commands strictly one after another.

  A command that exits with a non-zero status is reported on stderr; the
  remaining commands are still run. The output of the commands themselves
  is read and discarded.

  NOTE: the step "move the bad object to another location" from the
  procedure is not performed by this function.
  """
  pgs = get_pg_failed()
  osds_dict = get_osd_hosts()
  # Build the list once: a second call only repeats the work and prints
  # every PG id twice.
  commands = build_commands(pgs,osds_dict)
  for c in commands:
    pipe = os.popen(c)
    # Drain the output and close the pipe explicitly: close() waits for the
    # command, so stop / flush / start never overlap, whatever the
    # interpreter's garbage collection does.
    pipe.read()
    status = pipe.close()
    if status:
      sys.stderr.write("command failed (status %s): %s\n" % (status, c))
  
   
  

def main(argv):
  """Parse the command-line options and run the requested action(s).

  argv -- the argument list without the program name.

  Options (none of them takes a value):
    -h, --help       print the usage text and exit
    -m, --machine    pretty-print the result of get_host_osds()
    -p, --pg_failed  pretty-print the result of get_pg_failed()
    -f, --fix_pg     run fix_pgs()

  An unknown option prints the usage text and exits with status 2. When no
  option is given the usage text is printed and the function returns.
  Options are handled in the order they were given.
  """
  try:
    # One entry per long option. No ":" or "=" suffix, because no option
    # takes a value.
    opts, args = getopt.getopt(argv, "hmpf", ["help", "machine", "pg_failed", "fix_pg"])
  except getopt.GetoptError:
    help()
    sys.exit(2)

  if not opts:
    # Nothing requested: show what is available instead of exiting silently.
    help()

  for opt, arg in opts:
    if opt in ("-h", "--help"):
      help()
      sys.exit()
    elif opt in ("-m", "--machine"):
      osds = get_host_osds()
      pprint.pprint(osds)
    elif opt in ("-p", "--pg_failed"):
      pg = get_pg_failed()
      pprint.pprint(pg)
    elif opt in ("-f", "--fix_pg"):
      fix_pgs()
    else:
      # Defensive: getopt only returns the options declared above.
      help()
      sys.exit()

# Start the command-line interface only when the file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
  cli_args = sys.argv[1:]  # everything after the program name
  main(cli_args)