Source code for sherpa.schedulers

"""
SHERPA is a Python library for hyperparameter tuning of machine learning models.
Copyright (C) 2018  Lars Hertel, Peter Sadowski, and Julian Collado.

This file is part of SHERPA.

SHERPA is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

SHERPA is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SHERPA.  If not, see <http://www.gnu.org/licenses/>.
"""
import subprocess
import re
import sys
import os
import logging


logger = logging.getLogger(__name__)


class _JobStatus(object):
    """
    Job status used internally to classify jobs into categories.
    """
    finished = 1
    running = 2
    failed = 3
    queued = 4
    killed = 5
    other = 6


class Scheduler(object):
    """
    The job scheduler gives an API to submit jobs, retrieve the statuses of
    specific jobs, and kill a job.
    """
    def __init__(self):
        pass

    def submit_job(self, command, env=None, job_name=''):
        """
        Submits a job to the scheduler.

        Args:
            command (list[str]): components of the command to be run by the
                scheduler, e.g. ``["python", "train.py"]``.
            env (dict): environment variables to pass to the job.
            job_name (str): a name for the job and its output directory.

        Returns:
            str: a job ID, used for getting the status of or killing the job.
        """
        pass

    def get_status(self, job_id):
        """
        Obtains the current status of the job.

        Args:
            job_id (str): identifier returned when submitting the job.

        Returns:
            sherpa.schedulers._JobStatus: the job status.
        """
        pass

    def kill_job(self, job_id):
        """
        Kills a given job.

        Args:
            job_id (str): identifier returned when submitting the job.
        """
        pass
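

# Illustrative sketch (not part of the original SHERPA source): how a runner
# loop is expected to drive any Scheduler implementation. The command, the
# SHERPA_TRIAL_ID value, and the five-second poll interval are assumptions
# made for the example.
def _example_scheduler_contract(scheduler):
    import time
    job_id = scheduler.submit_job(['python', 'train.py'],
                                  env={'SHERPA_TRIAL_ID': '0'},
                                  job_name='trial_0')
    # Poll until the job leaves the queued/running states.
    while scheduler.get_status(job_id) in (_JobStatus.queued,
                                           _JobStatus.running):
        time.sleep(5)
    return scheduler.get_status(job_id)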


class LocalScheduler(Scheduler):
    """
    Runs jobs locally as a subprocess.

    Args:
        submit_options (str): options prepended to the command.
        output_dir (str): path to the directory in which job output files
            are written.
        resources (list[str]): list of resources that will be passed as the
            SHERPA_RESOURCE environment variable. If no resource is
            available, '' will be passed.
    """
    def __init__(self, submit_options='', output_dir='', resources=None):
        self.output_dir = output_dir
        self.jobs = {}
        self.resources = resources
        self.resource_by_job = {}
        self.output_files = {}
        self.submit_options = submit_options
        # Exit code 0 means a clean exit; -15 means the process received
        # SIGTERM, i.e. it was killed via kill_job.
        self.decode_status = {0: _JobStatus.finished,
                              -15: _JobStatus.killed}

    def submit_job(self, command, env=None, job_name=''):
        outdir = os.path.join(self.output_dir, 'jobs')
        if not os.path.isdir(outdir):
            os.mkdir(outdir)

        # Build the job environment without mutating the caller's dict;
        # values from os.environ take precedence.
        job_env = dict(env) if env else {}
        job_env.update(os.environ.copy())
        if self.resources is not None:
            # Claim one resource (e.g. a GPU ID) for this job.
            job_env['SHERPA_RESOURCE'] = str(self.resources.pop())
        else:
            job_env['SHERPA_RESOURCE'] = ''

        # Redirect the job's stdout and stderr to a file in the jobs dir.
        f = open(os.path.join(outdir, '{}.out'.format(job_name)), 'w')
        optns = self.submit_options.split(' ') if self.submit_options else []
        process = subprocess.Popen(optns + command, env=job_env,
                                   stderr=f, stdout=f)
        self.jobs[process.pid] = process
        self.output_files[process.pid] = f
        if self.resources is not None:
            self.resource_by_job[process.pid] = job_env['SHERPA_RESOURCE']
        return process.pid

    def get_status(self, job_id):
        process = self.jobs.get(job_id)
        if not process:
            raise ValueError("Job not found.")
        status = process.poll()
        if status is None:
            return _JobStatus.running
        # The process has exited: release its resource and close its output
        # file before decoding the exit code.
        if job_id in self.resource_by_job:
            resource = self.resource_by_job.pop(job_id)
            self.resources.append(resource)
        if job_id in self.output_files:
            f = self.output_files.pop(job_id)
            f.close()
        return self.decode_status.get(status, _JobStatus.other)

    def kill_job(self, job_id):
        process = self.jobs.get(job_id)
        if not process:
            raise ValueError("Job not found.")
        # Send SIGTERM; get_status decodes exit code -15 as killed.
        process.terminate()
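

# Illustrative usage sketch (not part of the original SHERPA source): runs a
# hypothetical "train.py" locally and waits for it to finish. The resource
# list ['0', '1'] standing in for two GPU IDs and the one-second poll
# interval are assumptions made for the example.
def _example_local_scheduler_usage():
    import time
    sched = LocalScheduler(output_dir='.', resources=['0', '1'])
    job_id = sched.submit_job(['python', 'train.py'], job_name='trial_0')
    status = sched.get_status(job_id)
    while status == _JobStatus.running:
        time.sleep(1)
        status = sched.get_status(job_id)
    return status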


class SGEScheduler(Scheduler):
    """
    Submits jobs to SGE, can check on their status, and can kill jobs.

    Uses the ``drmaa`` Python library. Due to the way SGE works, it cannot
    distinguish between a failed and a completed job.

    Args:
        submit_options (str): command line options such as the queue
            (``-q``) or the project (``-P``), all in one string.
        environment (str): path to a file containing environment variables;
            it will be sourced before the job is run.
        output_dir (str): path to the directory in which ``stdout`` and
            ``stderr`` will be written. If not specified, the directory
            defined for the study is used.
    """
    def __init__(self, submit_options, environment, output_dir=''):
        self.count = 0
        self.submit_options = submit_options
        self.environment = environment
        self.output_dir = output_dir
        self.killed_jobs = set()
        # Import lazily so that drmaa is only required when this scheduler
        # is actually used.
        self.drmaa = __import__('drmaa')
        self.decode_status = {
            self.drmaa.JobState.UNDETERMINED: _JobStatus.other,
            self.drmaa.JobState.QUEUED_ACTIVE: _JobStatus.queued,
            self.drmaa.JobState.SYSTEM_ON_HOLD: _JobStatus.other,
            self.drmaa.JobState.USER_ON_HOLD: _JobStatus.other,
            self.drmaa.JobState.USER_SYSTEM_ON_HOLD: _JobStatus.other,
            self.drmaa.JobState.RUNNING: _JobStatus.running,
            self.drmaa.JobState.SYSTEM_SUSPENDED: _JobStatus.other,
            self.drmaa.JobState.USER_SUSPENDED: _JobStatus.other,
            self.drmaa.JobState.DONE: _JobStatus.finished,
            self.drmaa.JobState.FAILED: _JobStatus.failed}

    def submit_job(self, command, env=None, job_name=''):
        # Create the output directory for job logs.
        outdir = os.path.join(self.output_dir, 'jobs')
        if not os.path.isdir(outdir):
            os.mkdir(outdir)

        job_name = job_name or str(self.count)
        sgeoutfile = os.path.join(outdir, '{}.out'.format(job_name))
        try:
            os.remove(sgeoutfile)
        except OSError:
            pass

        # Create a bash script that sources the environment and runs the
        # training script.
        job_script = '#$ -S /bin/bash\n'
        if self.environment:
            job_script += 'source %s\n' % self.environment
        job_script += 'echo "Running from" ${HOSTNAME}\n'
        for var_name, var_value in (env or {}).items():
            job_script += 'export {}={}\n'.format(var_name, var_value)
        job_script += " ".join(command)  # e.g. 'python file.py args...'

        # Submit the command to SGE.
        # Note: submitting the job via drmaa did not work because we were
        # not able to specify options.
        submit_command = 'qsub -S /bin/bash -wd {} -j y -o {} -e {} {}'.format(
            os.getcwd(), sgeoutfile, sgeoutfile, self.submit_options)
        # -wd sets the working directory; -cwd must not appear as well.
        assert ' -cwd' not in submit_command

        # Submit via subprocess so that we can capture the SGE job ID.
        job_id = self._submit_job(submit_command, job_script)

        logger.info('\t{}: job submitted'.format(job_id))
        self.count += 1

        return job_id
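
    # For reference, an illustrative job script as assembled by submit_job
    # above (the environment file path, the exported variable, and the
    # final command are assumptions made for the example):
    #
    #   #$ -S /bin/bash
    #   source /home/user/.profile
    #   echo "Running from" ${HOSTNAME}
    #   export SHERPA_TRIAL_ID=7
    #   python train.py --lr 0.01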

    @staticmethod
    def _submit_job(submit_command, run_command):
        """
        Args:
            submit_command (str): e.g. "qsub -N myProject ..."
            run_command (str): e.g. "python nn.py"

        Returns:
            str: the SGE job ID, or None if it could not be parsed.
        """
        process = subprocess.Popen(submit_command,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   shell=True,
                                   universal_newlines=True)
        # The job script is piped to qsub via stdin.
        output, std_err = process.communicate(input=run_command)
        process.stdin.close()
        # qsub prints e.g. 'Your job 12345 ("name") has been submitted';
        # parse the job ID out of that line.
        output_regexp = r'Your job (\d+)'
        match = re.search(output_regexp, output)
        if match:
            return match.group(1)
        else:
            sys.stderr.write(output)
            return None

    def get_status(self, job_id):
        """
        Args:
            job_id (str): SGE process ID.

        Returns:
            sherpa.schedulers._JobStatus: the job status.
        """
        with self.drmaa.Session() as s:
            try:
                status = s.jobStatus(str(job_id))
            except self.drmaa.errors.InvalidJobException:
                # SGE no longer knows the job, so treat it as finished.
                return _JobStatus.finished
        job_status = self.decode_status.get(status)
        if job_status == _JobStatus.finished and job_id in self.killed_jobs:
            job_status = _JobStatus.killed
        return job_status

    def kill_job(self, job_id):
        """
        Kills a job submitted to SGE.

        Args:
            job_id (str): the SGE process ID of the job.
        """
        logger.info("Killing job {}".format(job_id))
        with self.drmaa.Session() as s:
            s.control(job_id, self.drmaa.JobControlAction.TERMINATE)
            # TODO: if the job does not exist, it should not be added here.
            self.killed_jobs.add(job_id)
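

# Illustrative construction sketch (not part of the original SHERPA source):
# the job name, queue name, project name, and environment file path below
# are site-specific assumptions made for the example.
def _example_sge_scheduler_usage():
    sched = SGEScheduler(submit_options='-N sherpaTrial -q my.q -P myproject',
                         environment='/home/user/.profile',
                         output_dir='./output')
    job_id = sched.submit_job(['python', 'train.py'], job_name='trial_0')
    return sched.get_status(job_id)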


class SLURMScheduler(Scheduler):
    """
    Submits jobs to SLURM, can check on their status, and can kill jobs.

    Uses the ``drmaa`` Python library.

    Args:
        submit_options (str): command line options such as the queue
            (``-q``), all in one string.
        environment (str): path to a file containing environment variables;
            it will be sourced before the job is run.
        output_dir (str): path to the directory in which ``stdout`` and
            ``stderr`` will be written. If not specified, the directory
            defined for the study is used.
    """
    def __init__(self, submit_options, environment, output_dir=''):
        self.count = 0
        self.submit_options = submit_options
        self.environment = environment
        self.output_dir = output_dir
        self.killed_jobs = set()
        # Import lazily so that drmaa is only required when this scheduler
        # is actually used.
        self.drmaa = __import__('drmaa')
        self.decode_status = {
            self.drmaa.JobState.UNDETERMINED: _JobStatus.other,
            self.drmaa.JobState.QUEUED_ACTIVE: _JobStatus.queued,
            self.drmaa.JobState.SYSTEM_ON_HOLD: _JobStatus.other,
            self.drmaa.JobState.USER_ON_HOLD: _JobStatus.other,
            self.drmaa.JobState.USER_SYSTEM_ON_HOLD: _JobStatus.other,
            self.drmaa.JobState.RUNNING: _JobStatus.running,
            self.drmaa.JobState.SYSTEM_SUSPENDED: _JobStatus.other,
            self.drmaa.JobState.USER_SUSPENDED: _JobStatus.other,
            self.drmaa.JobState.DONE: _JobStatus.finished,
            self.drmaa.JobState.FAILED: _JobStatus.failed}

    def submit_job(self, command, env=None, job_name=''):
        # Create the output directory for job logs.
        outdir = os.path.join(self.output_dir, 'jobs')
        if not os.path.isdir(outdir):
            os.mkdir(outdir)

        job_name = job_name or str(self.count)
        slurmoutfile = os.path.join(outdir, '{}.out'.format(job_name))
        try:
            os.remove(slurmoutfile)
        except OSError:
            pass

        # Create a bash script that sources the environment and runs the
        # training script.
        job_script = '#!/bin/bash\n'
        if self.environment:
            job_script += 'source %s\n' % self.environment
        job_script += 'echo "Running from" ${HOSTNAME}\n'
        for var_name, var_value in (env or {}).items():
            job_script += 'export {}={}\n'.format(var_name, var_value)
        job_script += " ".join(command)  # e.g. 'python file.py args...'

        # Submit the command to SLURM.
        # Note: submitting the job via drmaa did not work because we were
        # not able to specify options.
        submit_command = 'sbatch --chdir={} --output={} --error={} {}'.format(
            os.getcwd(), slurmoutfile, slurmoutfile, self.submit_options)

        # Submit via subprocess so that we can capture the SLURM job ID.
        job_id = self._submit_job(submit_command, job_script)

        logger.info('\t{}: job submitted'.format(job_id))
        self.count += 1

        return job_id

    @staticmethod
    def _submit_job(submit_command, run_command):
        """
        Args:
            submit_command (str): e.g. "sbatch --partition=myPartition ..."
            run_command (str): e.g. "python nn.py"

        Returns:
            str: the SLURM job ID, or None if it could not be parsed.
        """
        process = subprocess.Popen(submit_command,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   shell=True,
                                   universal_newlines=True)
        # The job script is piped to sbatch via stdin.
        output, std_err = process.communicate(input=run_command)
        process.stdin.close()
        # sbatch prints e.g. 'Submitted batch job 12345'; parse the job ID
        # out of that line.
        output_regexp = r'Submitted batch job (\d+)'
        match = re.search(output_regexp, output)
        if match:
            return match.group(1)
        else:
            sys.stderr.write(output)
            return None

    def get_status(self, job_id):
        """
        Args:
            job_id (str): SLURM process ID.

        Returns:
            sherpa.schedulers._JobStatus: the job status.
        """
        with self.drmaa.Session() as s:
            try:
                status = s.jobStatus(str(job_id))
            except self.drmaa.errors.InvalidJobException:
                # SLURM no longer knows the job, so treat it as finished.
                return _JobStatus.finished
        job_status = self.decode_status.get(status)
        if job_status == _JobStatus.finished and job_id in self.killed_jobs:
            job_status = _JobStatus.killed
        return job_status

    def kill_job(self, job_id):
        """
        Kills a job submitted to SLURM.

        Args:
            job_id (str): the SLURM process ID of the job.
        """
        logger.info("Killing job {}".format(job_id))
        with self.drmaa.Session() as s:
            s.control(job_id, self.drmaa.JobControlAction.TERMINATE)
            # TODO: if the job does not exist, it should not be added here.
            self.killed_jobs.add(job_id)
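

# Illustrative construction sketch (not part of the original SHERPA source):
# the partition name, GPU request, and environment file path below are
# site-specific assumptions made for the example.
def _example_slurm_scheduler_usage():
    sched = SLURMScheduler(submit_options='--partition=gpu --gres=gpu:1',
                           environment='/home/user/.profile',
                           output_dir='./output')
    job_id = sched.submit_job(['python', 'train.py'], job_name='trial_0')
    return sched.get_status(job_id)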