# Source code for schrodinger.test.stu.joberrors

import os
import pathlib
import zipfile
import sys

from schrodinger.infra.mmjob import mmjob_is_job_server_job
from schrodinger.job import jobcontrol
from schrodinger.job import queue
from schrodinger.job import remote_command
from schrodinger.utils import mmutil
from schrodinger.utils import subprocess

from . import common

# Module-wide logger, shared with the sibling ``common`` module.
logger = common.logger

# Prepended to the extraction directory by unzip_postmortem() when running on
# win32.
# NOTE(review): this is the Win32 extended-length path prefix, presumably used
# so that deeply nested postmortem contents do not hit the MAX_PATH limit --
# confirm.
WINDOWS_LONG_PATH_PREFIX = "\\\\?\\"


def run_postmortem(job, product_name):
    """
    Run the postmortem tool for a job and unpack whatever archive it reports.

    `job` is any queue.BaseJob. When the job reports a job id, postmortem is
    run for that particular job. When it does not, postmortem is run for the
    job database only (``-jobdbonly``), unless the JOB_SERVER feature flag is
    enabled, in which case nothing is run. The tool's output is written to
    ``postmortem.log`` inside the job's command directory. Finally, queue
    statistics are collected for the job.

    Nothing happens (beyond the status message) if the job has no command
    directory.

    :param job: job to investigate
    :param product_name: not used by this function
    """
    job.infoStatus('running postmortem')

    _, jobid, _ = job.getStatusStrings()
    # The status strings use the literal "[none]" for "no job id".
    if jobid and jobid.strip() == "[none]":
        jobid = None

    workdir = job.getCommandDir()
    if not workdir:
        return

    if jobid:
        extra_args = [jobid]
    else:
        if mmutil.feature_flag_is_enabled(mmutil.JOB_SERVER):
            return
        extra_args = ["-jobdbonly"]

    log_path = pathlib.Path(workdir) / 'postmortem.log'
    with open(log_path, 'w') as log_handle:
        if mmjob_is_job_server_job(jobid):
            command = [
                "jsc",
                "postmortem",
                "--with-subjobs",
                "--without-redaction",
                *extra_args,
            ]
        else:
            command = ["postmortem", *extra_args]

        # stdout and stderr of the tool both land in the log file.
        proc = subprocess.run(command,
                              cwd=workdir,
                              stderr=subprocess.STDOUT,
                              stdout=log_handle)
        if proc.returncode:
            failure = f'postmortem failed with return code: {proc.returncode}'
            log_handle.write(failure + '\n')
            logger.warning(failure + f' while investigating {job}')

    # An archive is only looked for when a specific job was investigated.
    if os.path.isfile(log_path) and jobid:
        unzip_postmortem(log_path)

    collect_queue_stats(job)


def unzip_postmortem(postmortem_log_path: "str | os.PathLike[str]"):
    """
    Find postmortem file name from the log file, then extract it.

    The archive is extracted into the directory that contains it. Afterwards
    the archive itself and its sibling ``.files`` manifest (when present) are
    deleted. If no line of the log mentions a ``.zip`` file, a warning is
    logged and nothing else is done.

    :param postmortem_log_path: output of postmortem command. May be a plain
        string or a path object. An archive path in the log that is not
        absolute is taken relative to the directory of this log file.
    """
    # Accept both strings and path objects: `.parent` is needed below, which a
    # plain string does not provide.
    log_path = pathlib.Path(postmortem_log_path)

    # Absolute path to postmortem zip should be at the end of the log. Take
    # the last line from the log containing a .zip file
    postmortem_zip_path = None
    with open(log_path) as f:
        for line in f:
            if '.zip' in line:
                postmortem_zip_path = pathlib.Path(
                    line.strip().strip('"').split(': ')[-1])

    if postmortem_zip_path is None:
        logger.warning(
            f'Postmortem file not found. See log at {postmortem_log_path}')
        return

    if not postmortem_zip_path.is_absolute():
        postmortem_zip_path = log_path.parent.joinpath(postmortem_zip_path)

    postmortem_extraction_dir = postmortem_zip_path.parent
    if sys.platform == "win32":
        postmortem_extraction_dir = WINDOWS_LONG_PATH_PREFIX + str(
            postmortem_extraction_dir)

    with zipfile.ZipFile(postmortem_zip_path) as zf:
        zf.extractall(postmortem_extraction_dir)

    # Remove zip archive and file manifest
    os.remove(postmortem_zip_path)
    postmortem_files = postmortem_zip_path.with_suffix('.files')
    if postmortem_files.exists():
        os.remove(postmortem_files)


def collect_queue_stats(job):
    """
    Run remote queue commands and save their output, to help debug a killed
    queue job.

    Does nothing unless is_killed_queue_job() accepts the job. Otherwise two
    commands are run through the remote shell command built for the job's
    host entry, each wrapped in a login bash shell, and their standard output
    is written to ``qstat.log`` and ``clusutil.log`` in the job's command
    directory. A warning is logged for each command that exits non-zero.

    :type job: schrodinger.job.queue.JobControlJob
    :param job: Instance of schrodinger.job.queue.JobControlJob whose queue
        statistics should be collected.
    """
    if not is_killed_queue_job(job):
        return

    job_record = job.getJob()
    output_dir = job.getCommandDir()
    shell_template = "bash --login -c '{}'"

    host = jobcontrol.get_host(job_record.HostEntry).host
    user = jobcontrol.get_host(job_record.HostEntry).user
    remote_prefix = remote_command._rsh_cmd(host, remoteuser=user)

    clusutil_cmd = 'perl /nfs/working/sysmgr/sysmgr-repo/scripts/clustutil.pl -u -a'
    # (log file name, shell snippet) pairs, executed in this order.
    probes = (
        ('qstat.log', 'date && echo Running Cmd: qstat && qstat'),
        ('clusutil.log',
         'date && echo Running Cmd: {} && {}'.format(clusutil_cmd,
                                                     clusutil_cmd)),
    )

    for log_name, snippet in probes:
        with open(os.path.join(output_dir, log_name), 'w') as sink:
            bash_command = shell_template.format(snippet)
            outcome = subprocess.run(remote_prefix + [bash_command],
                                     stdout=sink)
            if outcome.returncode != 0:
                logger.warning(f"{bash_command} exited abnormally")


def is_killed_queue_job(job):
    """
    Tell whether the job was killed on the queueing system.

    The result is False unless `job` is a queue.JobControlJob that has a job
    record and that record reports isQueued(). For such a job the value of
    ``job.canceled_by_timeout`` is returned.

    :type job: schrodinger.job.queue.JobControlJob
    :param job: job to inspect
    """
    # Anything else (e.g. subprocess jobs) doesn't submit to the queue.
    if isinstance(job, queue.JobControlJob):
        # No job record means the job failed to launch.
        job_record = job.getJob()
        # Only jobs that went through a queue qualify.
        if job_record and job_record.isQueued():
            return job.canceled_by_timeout
    return False