Jan Kundrát 834733bbe5 Let scripts return some useful message about failure
This patch exports a new env var to the launched script, $TH_RESULT_FILE. If
the script writes something into the file named in that env var and the
content of that file doesn't appear to indicate a successful build, the
content of the file is used as the summary message sent back to Zuul.

Originally, this patch attempted to do a similar thing by reading back the
shell's output log and reporting the last line of that output, but the two
major disadvantages were the inclusion of timestamps in the log output and
Python's enrichment of each log line with timestamps. I pondered writing a
custom Python logging.Handler which simply remembers the last message, but
the required changes in utils.execute_to_log appeared rather messy to my
untrained eye.

The main driver behind this change is the need to differentiate hard build
failures (where we don't get any build artifacts because the build simply
failed) from "mere" test failures. Yes, our (KDE) tests are still sloppy,
and we have a fair number of broken tests still around, which is why this
difference matters to us. The result field in Zuul is more or less free-form,
so this patch does not impose any restrictions on its format, except that it
cannot start with 'SUCCESS', because that value is indeed very, very special.
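
As an illustration, a launched script (sketched here in Python; the ctest
invocation and the failure wording are made up, only the $TH_RESULT_FILE
contract comes from this patch) could report its summary like this:

    import os
    import subprocess
    import sys

    # Run the actual test suite; a non-zero exit code marks the build failed
    returncode = subprocess.call(['ctest', '--output-on-failure'])
    summary = 'SUCCESS' if returncode == 0 else 'Unit tests failed'

    # turbo-hipster exports TH_RESULT_FILE; anything written there which does
    # not start with 'SUCCESS' becomes the result message shown by Zuul
    result_file = os.environ.get('TH_RESULT_FILE')
    if result_file:
        with open(result_file, 'w') as fp:
            fp.write(summary + '\n')

    sys.exit(returncode)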

It can be seen in action at http://ci-logs.kde.flaska.net/matrix.html .

Change-Id: I48c29d2566da12b02dcf27a551a058ecc4a303d4
2015-06-12 01:50:40 +02:00

# Copyright 2013 Rackspace Australia
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import copy
import json
import logging
import os
import pkg_resources
import socket
import tempfile
import uuid

from turbo_hipster.lib import common
from turbo_hipster.lib import utils


class Task(object):
""" A base object for running a job (aka Task) """
log = logging.getLogger("task")
def __init__(self, worker_server, job_name, job_config):
# TODO(jhesketh): remove the need for worker_server here
self.worker_server = worker_server
# NOTE(jhesketh): job_config may be in the old format where name
# refers to the plugin and function is the job name. Thus these should
# never be used in a job, instead use the provided job_name.
self.job_config = job_config
self.job_name = job_name
self._reset()
# Define the number of steps we will do to determine our progress.
self.total_steps = 0
def _cleanup(self):
if self.log_handler:
self.log.removeHandler(self.log_handler)
self.log_handler.flush()
self.log_handler.close()
if ('shutdown-th' in self.job_config and
self.job_config['shutdown-th']):
self.worker_server.shutdown_gracefully()
def _reset(self):
self.job = None
self.job_arguments = None
self.work_data = None
self.cancelled = False
self.success = True
self.messages = []
self.current_step = 0
self.log_handler = None
self.th_uuid = str(uuid.uuid4())[-12:]
def _prep_working_dir(self):
# Use the th_uuid so that if the same job is somehow taken twice from
# zuul we won't re-use zuul's uuid. This shouldn't happen but if it
# does it prevents overwriting previous results
self.job_working_dir = os.path.join(
self.worker_server.config['jobs_working_dir'],
self.th_uuid,
self.job_arguments['LOG_PATH']
)
self.job_results_dir = os.path.join(
self.job_working_dir,
'results'
)
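        # Everything placed under the results dir (including the task log
        # below) gets uploaded at the end of the job by _upload_results()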
self.task_output_log = os.path.join(
self.job_results_dir,
'task_output.log'
)
if not os.path.isdir(os.path.dirname(self.task_output_log)):
os.makedirs(os.path.dirname(self.task_output_log))
def _setup_task_logging(self):
self.log_handler = logging.FileHandler(self.task_output_log)
log_formatter = logging.Formatter('%(asctime)s %(message)s')
self.log_handler.setFormatter(log_formatter)
self.log.addHandler(self.log_handler)
self.log.setLevel(logging.DEBUG)
def start_job(self, job):
self._reset()
self.job = job
if self.job is not None:
try:
self.job_arguments = \
json.loads(self.job.arguments.decode('utf-8'))
self.log.debug("Got job from ZUUL %s" % self.job_arguments)
                # Send initial WORK_DATA and WORK_STATUS packets
self._send_work_data()
# Prep working dirs
self._prep_working_dir()
# Now we have working dirs we can log the job details to a file
self._setup_task_logging()
except Exception as e:
                # If something failed during this section we may not have file
                # logging set up yet. As such, raise an exception back to gearman
self.log.exception("Failure during setup")
self.log.exception(e)
if not self.cancelled:
self.success = False
self.messages.append('FAILURE during the job setup')
self.messages.append('Exception: %s' % e)
self._send_work_data()
self.job.sendWorkException(str(e).encode('utf-8'))
                # No point trying the job, let's return here
self._send_final_results()
return
# From here we can log exceptions
try:
# Execute the job_steps
self.do_job_steps()
except Exception as e:
# Log the problem
if not self.cancelled:
self.success = False
self.log.exception('Something failed running the job!')
self.messages.append('FAILURE running the job')
self.messages.append('Exception: %s' % e)
# Don't return from here as we can continue uploading the
# logs
try:
self._cleanup()
self._upload_results()
except Exception as e:
                # If something failed during this section we have been unable
                # to upload the logs. As such, raise an exception back to gearman
self.log.exception("Failure during cleanup and upload")
self.log.exception(e)
if not self.cancelled:
self.success = False
self.messages.append('FAILURE during cleanup and log '
'upload')
self.messages.append('Exception: %s' % e)
self._send_work_data()
self.job.sendWorkException(str(e).encode('utf-8'))
finally:
# Finally, send updated work data and completed packets
self._send_final_results()
def stop_working(self, number=None):
        # Check whether the number is for this job instance (None cancels all).
        # This makes it possible to run multiple workers with this task on the
        # same server.
if number is None or number == self.job.unique:
self.log.debug("We've been asked to stop by our gearman manager")
self.cancelled = True
# TODO: Work out how to kill current step
def _get_work_data(self):
if self.work_data is None:
hostname = os.uname()[1]
fqdn = socket.getfqdn()
self.work_data = dict(
name=self.job_name,
number=self.job.unique,
manager='turbo-hipster-manager-%s' % hostname,
url='http://localhost',
worker_hostname=hostname,
worker_fqdn=fqdn,
worker_program='turbo-hipster',
)
try:
self.work_data['worker_version'] = (
pkg_resources.get_distribution('turbo_hipster').version
)
except pkg_resources.DistributionNotFound:
                # Package isn't installed; manually extracting the version in
                # some ad-hoc manner wouldn't be worth it, so just ignore this.
pass
return self.work_data
def _send_work_data(self):
""" Send the WORK DATA in json format for job """
self.log.debug("Send the work data response: %s" %
json.dumps(self._get_work_data()))
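        # Zuul treats the 'result' field as more or less free-form text; on
        # failure report the accumulated messages (which may include the
        # summary the test script wrote into its TH_RESULT_FILE)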
if self.success:
self.work_data['result'] = 'SUCCESS'
else:
self.work_data['result'] = '\n'.join(self.messages)
self.job.sendWorkData(json.dumps(self._get_work_data()))
def _send_final_results(self):
self._send_work_data()
if self.success:
self.job.sendWorkComplete(
json.dumps(self._get_work_data()))
else:
self.job.sendWorkFail()
def _do_next_step(self):
""" Send a WORK_STATUS command to the gearman server.
This can provide a progress bar. """
# Each opportunity we should check if we need to stop
if self.cancelled:
self.work_data['result'] = "Failed: Job cancelled"
self.job.sendWorkStatus(self.current_step, self.total_steps)
self.job.sendWorkFail()
raise Exception('Job cancelled')
self.current_step += 1
self.job.sendWorkStatus(self.current_step, self.total_steps)
def _upload_results(self):
"""Upload the contents of the working dir either using the instructions
provided by zuul and/or our configuration"""
self.log.debug("Process the resulting files (upload/push)")
dir_list = os.listdir(self.job_results_dir)
path_list = [os.path.join(self.job_results_dir, i) for i in dir_list]
if 'publish_logs' in self.worker_server.config:
index_url = utils.push_files(
self.job_arguments['LOG_PATH'], path_list,
self.worker_server.config['publish_logs'])
self.log.debug("Index URL found at %s" % index_url)
self.work_data['url'] = index_url
if 'ZUUL_EXTRA_SWIFT_URL' in self.job_arguments:
# Upload to zuul's url as instructed
utils.zuul_swift_upload(self.job_working_dir, self.job_arguments)
self.work_data['url'] = self.job_arguments['LOG_PATH']
class ShellTask(Task):
log = logging.getLogger("task.shell_task")
def __init__(self, worker_server, job_name, job_config):
super(ShellTask, self).__init__(worker_server, job_name, job_config)
# Define the number of steps we will do to determine our progress.
self.total_steps = 5
def _reset(self):
super(ShellTask, self)._reset()
self.git_path = None
self.job_working_dir = None
self.shell_output_log = None
self.git_prep_log = None
self.output_summary = None
def do_job_steps(self):
self.log.info('Step 1: Setup environment')
self._setup_environment()
self.log.info('Step 2: Checkout updates from git')
self._grab_patchset(self.job_arguments)
self.log.info('Step 3: Run shell script')
self._execute_script()
self.log.info('Step 4: Analyse logs for errors')
self._parse_and_check_results()
self.log.info('Step 5: Handle the results')
self._handle_results()
self.log.info('Step 6: Handle extra actions such as shutting down')
self._handle_cleanup()
@common.task_step
def _setup_environment(self):
self.git_prep_log = os.path.join(
self.job_results_dir,
'git_prep.log'
)
self.shell_output_log = os.path.join(
self.job_results_dir,
'shell_output.log'
)
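        # mkstemp() returns an (fd, path) tuple; the path is exported to the
        # launched script as TH_RESULT_FILE and read back afterwards in
        # _parse_and_check_results()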
self.output_summary = tempfile.mkstemp()
self.log.info('Working on node %s' % (os.uname()[1]))
@common.task_step
def _grab_patchset(self, job_args):
""" Checkout the reference into config['git_working_dir'] """
# TODO(jhesketh): Use the zuul cloner stuff instead :-)
self.log.debug("Grab the patchset we want to test against")
local_path = os.path.join(self.worker_server.config['git_working_dir'],
self.th_uuid, job_args['ZUUL_PROJECT'])
if not os.path.exists(local_path):
os.makedirs(local_path)
env = os.environ
git_args = copy.deepcopy(job_args)
env.update(git_args)
        cmd = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'gerrit-git-prep.sh')
cmd += ' ' + self.worker_server.config['zuul_server']['gerrit_site']
cmd += ' ' + self.worker_server.config['zuul_server']['git_origin']
        # NOTE(jhesketh): The most common problem is the git remote timing out.
        # Retry cloning multiple times before raising a failure.
tries = 0
return_code = 1
while return_code != 0:
tries += 1
env.update({'GIT_CURL_VERBOSE': '1', 'GIT_TRACE': '1'})
return_code = utils.execute_to_log(cmd, self.git_prep_log,
env=env, cwd=local_path)
if tries == 2:
# Try upping the post buffer. See:
# http://stackoverflow.com/questions/6842687/
# the-remote-end-hung-up-unexpectedly-while-git-cloning
utils.execute_to_log(
"git config --global http.postBuffer 1048576000",
self.git_prep_log, env=env, cwd=local_path)
if tries >= 4:
break
if return_code != 0:
cmd = 'ifconfig'
utils.execute_to_log(cmd, self.git_prep_log)
raise Exception("Failed to fetch patchset")
self.git_path = local_path
return local_path
@common.task_step
def _execute_script(self):
# Run script
cmd = self.job_config['shell_script']
cmd += (
(' %(git_path)s %(job_working_dir)s %(unique_id)s')
% {
'git_path': self.git_path,
'job_working_dir': self.job_working_dir,
'unique_id': self.job.unique
}
)
env_args = copy.deepcopy(os.environ)
env_args.update(self.job_arguments)
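        # Zuul registers its gearman jobs as 'build:<job name>'; strip the
        # prefix so the script only sees the plain job name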
if self.job.name.startswith('build:'):
env_args['TH_JOB_NAME'] = self.job.name[len('build:'):]
else:
env_args['TH_JOB_NAME'] = self.job.name
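        # Tell the script where it can leave a one-line summary for us to
        # report back to Zuul (see _parse_and_check_results)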
env_args['TH_RESULT_FILE'] = self.output_summary[1]
self.script_return_code = utils.execute_to_log(
cmd,
self.shell_output_log,
env=env_args
)
@common.task_step
def _parse_and_check_results(self):
if self.script_return_code > 0:
self.success = False
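            # The script may have written a one-line summary into the file
            # exported as TH_RESULT_FILE; anything that doesn't look like a
            # success gets reported back to Zuul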
with os.fdopen(self.output_summary[0]) as fp:
line = fp.readline().strip()
if len(line) and not line.startswith('SUCCESS'):
self.messages.append(line)
self.messages.append('Return code from test script was non-zero '
'(%d)' % self.script_return_code)
@common.task_step
def _handle_cleanup(self):
"""Handle and cleanup functions. Shutdown if requested to so that no
further jobs are ran if the environment is dirty."""
try:
os.remove(self.output_summary[1])
except OSError:
pass
@common.task_step
def _handle_results(self):
pass