#! /usr/bin/python
#
# This script kills processes that are running by users who do not have
# this machine reserved in the queue.  It also nices users who are not
# in high-priority groups.
#
# If it is run with the "warning" option emails are sent instead of having
# the processes killed.
#
# The killGroups lists all the groups of users whose processes should
# be considered.
# The priorityGroups lists groups whose processes should not be niced.
#
# 15-march-06 - davidbbs
# 17-march-06 - changed to use pwd, grp, and socket and to look for
#               partial user matches because qstat truncates them.
#
import sys
import re
import os
import datetime
import pwd
import grp
import socket

DEBUG = False

# Groups to kill
killGroups = ["students", "staff", "faculty", "regression"]
# Groups to not renice
priorityGroups = ["cva"]
# Commands to not send warnings about (regular expressions, searched
# anywhere in the command line reported by ps)
warningCommandsToIgnore = ["gconfd"]

# Set up the SGE environment so that qstat can be run:
os.environ["SGE_ROOT"] = "/opt/gridengine"
os.environ["SGE_CELL"] = "default"
os.environ["SGE_ARCH"] = "lx26-amd64"
os.environ["SGE_EXECD_PORT"] = "537"
os.environ["SGE_QMASTER_PORT"] = "536"

# Command to get all the qlogin sessions
qstatCommand = "/opt/gridengine/bin/lx26-amd64/qstat"
# Grep command
grepCommand = "/bin/grep"
# ps command
psCommand = "/bin/ps auxwww"
# kill command
killCommand = "/bin/kill"
# nice command
niceCommand = "/usr/bin/renice 15"
# hostname command
hostnameCommand = "/bin/hostname"
# sendmail command
sendmailCommand = "/usr/sbin/sendmail -t"

# Leading "compute-X-Y" part of this machine's host name.
hostPattern = re.compile(r"(compute-[0-9]-[0-9]+).*")

# One job line of qstat output; group 4 is the user, group 8 the queue and
# group 9 the machine.  The leading whitespace is optional so that a job
# line that starts directly with the job ID is still recognised.
qstatLinePattern = re.compile(
    r"\s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+"
    r"(\S+\.q)@(compute-[0-9]-[0-9]+)")

# One process line of "ps auxwww"; group 1 is the user, group 2 the PID and
# group 11 the command line.  The header line does not match because its
# second column is not numeric.
psLinePattern = re.compile(
    r"(\S+)\s+([0-9]+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+"
    r"(\S+)\s+(\S+)\s+(.*)")


def parseComputeHost(fqdn):
    """Return the "compute-X-Y" prefix of fqdn, or None if it has none."""
    match = hostPattern.match(fqdn)
    if match:
        return match.group(1)
    return None


def parseQstatUser(line):
    """Return the user name from one qstat job line, or None if no match."""
    match = qstatLinePattern.match(line)
    if match:
        return match.group(4)
    return None


def parsePsLine(line):
    """Return (user, pid, command) for one ps line, or None if no match.

    The user is returned exactly as ps printed it, so it may be a numeric
    UID; pid is an int.
    """
    match = psLinePattern.match(line)
    if match:
        return (match.group(1), int(match.group(2)), match.group(11))
    return None


def resolveUser(user):
    """Translate a numeric UID printed by ps into a user name.

    If the user name is too long ps prints the numeric UID instead.  Only
    an all-digit value is treated as a UID; anything else, and any UID that
    is not in the password database, is returned unchanged.
    """
    if not user.isdigit():
        return user
    try:
        return pwd.getpwuid(int(user))[0]
    except (KeyError, ValueError, OverflowError):
        return user


def findReservation(user, scheduledUsers):
    """Return the scheduled name that user matches, or None.

    User names may be truncated in the qstat output, so a scheduled name
    matches when it is a prefix of the real user name.  The comparison is
    literal: the scheduled name is not interpreted as a regular expression.
    """
    for check in scheduledUsers:
        if user.startswith(check):
            return check
    return None


def isIgnoredCommand(usercommand, ignorePatterns):
    """Return True if usercommand matches any of the ignore patterns."""
    for pattern in ignorePatterns:
        if re.search(pattern, usercommand):
            return True
    return False


def groupMembers(groups):
    """Return the member lists of the named groups, concatenated.

    Raises KeyError if a group does not exist.
    NOTE(review): only the group database's member list is used; users who
    have one of these groups solely as their primary group are presumably
    not included -- confirm against the site's account setup.
    """
    members = []
    for group in groups:
        members.extend(grp.getgrnam(group)[3])
    return members


def readCommandLines(command):
    """Run command through the shell and return its output lines."""
    child = os.popen(command, "r")
    try:
        return child.readlines()
    finally:
        child.close()


def buildWarningMessage(user, name, host, processes, generated):
    """Return the complete warning e-mail (headers and body) for one user.

    processes is the preformatted list of offending processes, one per
    line; generated is the report timestamp as a string.
    """
    return "".join([
        "To: " + name + " <" + user + "@localhost>\n",
        "From: Bagels Cluster \n",
        "Subject: Processes to be killed on cluster (" + user + ")\n",
        "\n",
        "You are using the node " + host + " without having it reserved through the cluster queue system (qlogin/qsub).\n",
        "\n",
        "The following processes will be killed:\n",
        processes,
        "\n\n",
        "If you don't care about these processes then you don't have to do anything.\n",
        "\n",
        "If you have any questions please consult the cluster documentation at http://cva.stanford.edu/systems/cva_systems.html#KillNonQueuedProcesses.\n",
        "\n",
        "Report generated: " + generated + "\n",
        "\n",
    ])


def sendWarning(message):
    """Pipe one complete message into sendmail."""
    p = os.popen(sendmailCommand, "w")
    try:
        p.write(message)
    finally:
        p.close()


def main():
    """Kill (or, with the "warning" argument, report) unreserved processes."""
    priorityUsers = groupMembers(priorityGroups)
    if DEBUG:
        print("priority Users: " + str(priorityUsers))

    killUsers = groupMembers(killGroups)
    if DEBUG:
        print("kill Users: " + str(killUsers))

    warningOnly = len(sys.argv) > 1 and sys.argv[1] == "warning"

    # Get the compute name so we can get only qstat information for
    # this machine.
    fqdn = socket.gethostbyaddr(socket.gethostname())[0]
    host = parseComputeHost(fqdn)
    if host is None:
        sys.exit("ERROR: host name '" + fqdn + "' is not a compute node name.")
    if DEBUG:
        print("Host is: " + host)

    # Get the users that have a job on this node.
    validUsers = []
    command = qstatCommand + " | " + grepCommand + " \"" + host + "\\.\""
    for line in readCommandLines(command):
        user = parseQstatUser(line)
        if user is None:
            print("ERROR: qlogin line failed to match.")
        else:
            validUsers.append(user)

    # Look at every process running on this machine.
    warnings = {}
    for line in readCommandLines(psCommand):
        parsed = parsePsLine(line)
        if parsed is None:
            continue
        user, pid, usercommand = parsed
        user = resolveUser(user)
        if DEBUG:
            print("active user: " + user + " " + str(pid) + " " + usercommand)

        # NOTE(review): kill is never cleared, so the renice branch below
        # can never run.  The logic is kept as it was; decide whether
        # scheduled or non-priority users were meant to be reniced.
        kill = True
        nice = True

        # Only users in the kill groups are considered at all.
        valid = user in killUsers
        if valid and DEBUG:
            print("user can be killed")

        # Check if the user is a high-priority user.
        if user in priorityUsers:
            nice = False
            if DEBUG:
                print("do not nice")

        # Check if the user is scheduled on this node.
        reservation = findReservation(user, validUsers)
        if reservation is not None:
            valid = False
            if DEBUG:
                print("do not kill, matches: " + reservation)

        if not valid:
            continue

        if warningOnly:
            # Collect the process for the warning mail instead of killing it.
            if kill and not isIgnoredCommand(usercommand,
                                             warningCommandsToIgnore):
                entry = "\t" + "(PID: " + str(pid) + ")\t" + usercommand + "\n"
                warnings[user] = warnings.get(user, "") + entry
            continue

        if nice and not kill:
            command = niceCommand + " " + str(pid)
            print("nice: " + user + " " + usercommand + " (" + command + ")")
            if not DEBUG:
                os.system(command)
        if kill:
            command = killCommand + " " + str(pid)
            print("kill: " + user + " " + usercommand + " (" + command + ")")
            if not DEBUG:
                os.system(command)

    nowStr = str(datetime.datetime.now())

    if warningOnly:
        for user in warnings:
            try:
                # Use only the first comma-separated GECOS sub-field: a
                # comma in the display name of the To: header would be read
                # as an address separator.
                name = pwd.getpwnam(user)[4].split(",")[0]
            except KeyError:
                name = user
            message = buildWarningMessage(user, name, host, warnings[user],
                                          nowStr)
            if DEBUG:
                print(message)
            else:
                print("warning sent to " + user)
                sendWarning(message)


if __name__ == "__main__":
    main()