#!/usr/bin/python
#
# This is a script that attempts to kill qlogin sessions that have been
# zombied.
# It does this by looking at who is running a qlogin session from an unknown
# tty by checking ps and then looks them up in the qstat list.  If the user
# for a particular machine has the same number of unknown ttys as qlogin
# sessions then they are all invalid and will all be removed.  If those
# numbers differ then at least one of them is indeterminate so none will be
# killed.
#
# The code is written so that it runs unchanged on Python 2 and Python 3
# (single-argument print calls, "in" instead of dict.has_key, raw regexes).
#

import datetime
import os
import re
import sys

DEBUG = False

# Log file location
logFile = "/var/log/qlogin_cleanup"

# Shell pipelines used to list candidate processes and scheduler sessions.
PS_COMMAND = "/bin/ps auxwww | /bin/grep qlogin.sh"
QSTAT_COMMAND = "/opt/gridengine/bin/lx26-amd64/qstat | /bin/grep QLOGIN"
QDEL_COMMAND = "/opt/gridengine/bin/lx26-amd64/qdel"

# Environment exported before the grid engine tools are run.
SGE_ENVIRONMENT = {
    "SGE_ROOT": "/opt/gridengine",
    "SGE_CELL": "default",
    "SGE_ARCH": "lx26-amd64",
    "SGE_EXECD_PORT": "537",
    "SGE_QMASTER_PORT": "536",
}

# Eleven whitespace separated columns of "ps auxwww"; column 1 is the user,
# column 7 the tty and column 11 the full command line.
PS_LINE_RE = re.compile(
    r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)"
    r"\s+(\S+)\s+(.*)"
)

# The compute-x-y.local host name inside a qlogin.sh command line.
COMPUTE_HOST_RE = re.compile(r"(\S+)\s+(compute-.*local)")

# First eight columns of a qstat line; column 1 is the job ID, column 4 the
# user and column 8 the queue instance.  The pattern is anchored with
# optional leading whitespace: a job ID wide enough to fill its column has no
# leading blank, and an unanchored search would then shift every field by one.
QSTAT_LINE_RE = re.compile(
    r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)"
)

# Queue instance of the form <queue>.q@<host>.<rest>; group 2 is the host.
QUEUE_HOST_RE = re.compile(r"(\S+).q@(\S+)\.")


def read_command(command):
    """Run *command* through the shell and return (output lines, status).

    The output is read completely before the pipe is closed so the child is
    never left writing into a closed pipe.  *status* is whatever
    ``os.popen(...).close()`` returns: None on success, otherwise the
    non-zero exit status.
    """
    child = os.popen(command, "r")
    lines = child.readlines()
    status = child.close()
    return lines, status


def report_failure(command, status):
    """Print an error message when *status* signals that *command* failed."""
    if status:
        print("Error executing command: " + command + " error: " + str(status))


def count_orphaned_qlogins(ps_lines):
    """Count qlogin.sh processes without a controlling tty.

    *ps_lines* are lines of "ps auxwww" output.  Returns a dict mapping
    "<user>-<compute host>" to the number of processes whose tty column is
    "?".  Lines that do not have the expected columns are reported on stdout
    and skipped.
    """
    counts = {}
    for line in ps_lines:
        fields = PS_LINE_RE.search(line)
        if not fields:
            print("invalid line: " + line)
            continue
        # Only processes with an unknown tty are candidates.
        if fields.group(7) != "?":
            continue
        host = COMPUTE_HOST_RE.search(fields.group(11))
        if host:
            # Merge the user name and the compute host into one key.
            key = fields.group(1) + "-" + host.group(2)
            counts[key] = counts.get(key, 0) + 1
    return counts


def collect_qlogin_jobs(qstat_lines):
    """Group the job IDs of qlogin sessions by user and compute host.

    *qstat_lines* are lines of qstat output.  Returns a dict mapping
    "<user>-<host>.local" (the same key format as count_orphaned_qlogins)
    to the list of integer job IDs, in input order.  Lines that cannot be
    parsed are reported on stdout and skipped.
    """
    jobs = {}
    for line in qstat_lines:
        fields = QSTAT_LINE_RE.search(line)
        if not fields:
            print("invalid line: " + line)
            continue
        machine = fields.group(8)
        host = QUEUE_HOST_RE.search(machine)
        if not host:
            print("invalid qlogin session found: " + machine + " in " + line)
            continue
        try:
            job_id = int(fields.group(1))
        except ValueError:
            # A non-numeric job ID means the columns are not what we expect;
            # skip the line instead of aborting the whole run.
            print("invalid line: " + line)
            continue
        key = fields.group(4) + "-" + host.group(2) + ".local"
        jobs.setdefault(key, []).append(job_id)
    return jobs


def plan_removals(orphan_counts, session_jobs):
    """Decide, per user and host, whether the qlogin sessions may be killed.

    Returns a list of ``(key, dead_count, job_ids, removable)`` tuples, one
    for every key present in both mappings, ordered by key.  *removable* is
    True only when the number of tty-less processes equals the number of
    qlogin sessions; otherwise at least one session may still be alive and
    it is impossible to tell which one.
    """
    plan = []
    for key in sorted(orphan_counts):
        job_ids = session_jobs.get(key)
        if job_ids is None:
            continue
        dead_count = orphan_counts[key]
        plan.append((key, dead_count, list(job_ids), dead_count == len(job_ids)))
    return plan


def open_log(path):
    """Open *path* for appending, falling back to temp.log in the cwd."""
    try:
        return open(path, "a")
    except IOError:
        print("Could not open log file for writing. " + path)
        return open("temp.log", "a")


def main():
    """Find zombied qlogin sessions and delete them with qdel."""
    # Read in the qlogin.sh processes that are running.
    ps_lines, status = read_command(PS_COMMAND)
    orphan_counts = count_orphaned_qlogins(ps_lines)
    report_failure(PS_COMMAND, status)

    if DEBUG:
        print("Found the following to check: ")
        for key in orphan_counts:
            print("\t" + key + ": " + str(orphan_counts[key]))

    # Set up the SGE environment and read in the qlogin sessions.
    os.environ.update(SGE_ENVIRONMENT)
    qstat_lines, status = read_command(QSTAT_COMMAND)
    session_jobs = collect_qlogin_jobs(qstat_lines)
    report_failure(QSTAT_COMMAND, status)

    if DEBUG:
        print("Found the following real instances: ")
        for key in session_jobs:
            print("\t" + key + ":" + str(len(session_jobs[key])))

    log = open_log(logFile)
    try:
        log.write("qlogin_cleanup: " + str(datetime.datetime.now()) + "\n")

        for key, dead_count, job_ids, removable in plan_removals(
                orphan_counts, session_jobs):
            if not removable:
                message = ("Not removing " + key
                           + " because the number of dead connections "
                           + str(dead_count)
                           + " does not equal the number of qlogin instances "
                           + str(len(job_ids)) + ".")
                log.write(message + "\n")
                if DEBUG:
                    print(message)
                continue

            if DEBUG:
                print(">> Found user to kill: " + key + " instances: "
                      + str(dead_count))
            for job_id in job_ids:
                command = QDEL_COMMAND + " " + str(job_id)
                _, status = read_command(command)
                if status:
                    # Record the failure instead of claiming the job is gone.
                    message = ("Failed to remove job: " + key + " ("
                               + str(job_id) + ") qdel exit status "
                               + str(status))
                    report_failure(command, status)
                else:
                    message = "Removing job: " + key + " (" + str(job_id) + ")"
                log.write(message + "\n")
                if DEBUG:
                    print(message)
    finally:
        log.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())