Diffstat (limited to 'testing/mozharness/scripts/release/antivirus.py')
-rw-r--r-- | testing/mozharness/scripts/release/antivirus.py | 193
1 files changed, 193 insertions, 0 deletions
diff --git a/testing/mozharness/scripts/release/antivirus.py b/testing/mozharness/scripts/release/antivirus.py
new file mode 100644
index 0000000000..b40dc5cc01
--- /dev/null
+++ b/testing/mozharness/scripts/release/antivirus.py
@@ -0,0 +1,193 @@
+from multiprocessing.pool import ThreadPool
+import os
+import re
+import sys
+import shutil
+
+sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))
+
+from mozharness.base.python import VirtualenvMixin, virtualenv_config_options
+from mozharness.base.script import BaseScript
+
+
+class AntivirusScan(BaseScript, VirtualenvMixin):
+    config_options = [
+        [["--product"], {
+            "dest": "product",
+            "help": "Product being released, eg: firefox, thunderbird",
+        }],
+        [["--version"], {
+            "dest": "version",
+            "help": "Version of release, eg: 39.0b5",
+        }],
+        [["--build-number"], {
+            "dest": "build_number",
+            "help": "Build number of release, eg: 2",
+        }],
+        [["--bucket-name"], {
+            "dest": "bucket_name",
+            "help": "S3 Bucket to retrieve files from",
+        }],
+        [["--exclude"], {
+            "dest": "excludes",
+            "action": "append",
+            "help": "List of filename patterns to exclude. See script source for default",
+        }],
+        [["-d", "--download-parallelization"], {
+            "dest": "download_parallelization",
+            "default": 6,
+            "type": "int",
+            "help": "Number of concurrent file downloads",
+        }],
+        [["-s", "--scan-parallelization"], {
+            "dest": "scan_parallelization",
+            "default": 4,
+            "type": "int",
+            "help": "Number of concurrent file scans",
+        }],
+        [["--tools-repo"], {
+            "dest": "tools_repo",
+            "default": "https://hg.mozilla.org/build/tools",
+        }],
+        [["--tools-revision"], {
+            "dest": "tools_revision",
+            "help": "Revision of tools repo to use when downloading extract_and_run_command.py",
+        }],
+    ] + virtualenv_config_options
+
+    DEFAULT_EXCLUDES = [
+        r"^.*tests.*$",
+        r"^.*crashreporter.*$",
+        r"^.*\.zip(\.asc)?$",
+        r"^.*\.log$",
+        r"^.*\.txt$",
+        r"^.*\.asc$",
+        r"^.*/partner-repacks.*$",
+        r"^.*.checksums(\.asc)?$",
+        r"^.*/logs/.*$",
+        r"^.*/jsshell.*$",
+        r"^.*json$",
+        r"^.*/host.*$",
+        r"^.*/mar-tools/.*$",
+        r"^.*robocop.apk$",
+        r"^.*contrib.*"
+    ]
+    CACHE_DIR = 'cache'
+
+    def __init__(self):
+        BaseScript.__init__(self,
+                            config_options=self.config_options,
+                            require_config_file=False,
+                            config={
+                                "virtualenv_modules": [
+                                    "boto",
+                                    "redo",
+                                    "mar",
+                                ],
+                                "virtualenv_path": "venv",
+                            },
+                            all_actions=[
+                                "create-virtualenv",
+                                "activate-virtualenv",
+                                "get-extract-script",
+                                "get-files",
+                                "scan-files",
+                                "cleanup-cache",
+                            ],
+                            default_actions=[
+                                "create-virtualenv",
+                                "activate-virtualenv",
+                                "get-extract-script",
+                                "get-files",
+                                "scan-files",
+                                "cleanup-cache",
+                            ],
+                            )
+        self.excludes = self.config.get('excludes', self.DEFAULT_EXCLUDES)
+        self.dest_dir = self.CACHE_DIR
+
+    def _get_candidates_prefix(self):
+        return "pub/{}/candidates/{}-candidates/build{}/".format(
+            self.config['product'],
+            self.config["version"],
+            self.config["build_number"]
+        )
+
+    def _matches_exclude(self, keyname):
+        for exclude in self.excludes:
+            if re.search(exclude, keyname):
+                return True
+        return False
+
+    def get_extract_script(self):
+        """Gets a copy of extract_and_run_command.py from tools, and the supporting mar.py,
+        so that we can unpack various files for clam to scan them."""
+        remote_file = "{}/raw-file/{}/stage/extract_and_run_command.py".format(self.config["tools_repo"],
+                                                                               self.config["tools_revision"])
+        self.download_file(remote_file, file_name="extract_and_run_command.py")
+
+    def get_files(self):
+        """Pull the candidate files down from S3 for scanning, using parallel requests"""
+        from boto.s3.connection import S3Connection
+        from boto.exception import S3CopyError, S3ResponseError
+        from redo import retry
+        from httplib import HTTPException
+
+        # suppress boto debug logging, it's too verbose with --loglevel=debug
+        import logging
+        logging.getLogger('boto').setLevel(logging.INFO)
+
+        self.info("Connecting to S3")
+        conn = S3Connection(anon=True)
+        self.info("Getting bucket {}".format(self.config["bucket_name"]))
+        bucket = conn.get_bucket(self.config["bucket_name"])
+
+        if os.path.exists(self.dest_dir):
+            self.info('Emptying {}'.format(self.dest_dir))
+            shutil.rmtree(self.dest_dir)
+        os.makedirs(self.dest_dir)
+
+        def worker(item):
+            source, destination = item
+
+            self.info("Downloading {} to {}".format(source, destination))
+            key = bucket.get_key(source)
+            return retry(key.get_contents_to_filename,
+                         args=(destination, ),
+                         sleeptime=30, max_sleeptime=150,
+                         retry_exceptions=(S3CopyError, S3ResponseError,
+                                           IOError, HTTPException))
+
+        def find_release_files():
+            candidates_prefix = self._get_candidates_prefix()
+            self.info("Getting key names from candidates")
+            for key in bucket.list(prefix=candidates_prefix):
+                keyname = key.name
+                if self._matches_exclude(keyname):
+                    self.debug("Excluding {}".format(keyname))
+                else:
+                    destination = os.path.join(self.dest_dir, keyname.replace(candidates_prefix, ''))
+                    dest_dir = os.path.dirname(destination)
+                    if not os.path.isdir(dest_dir):
+                        os.makedirs(dest_dir)
+                    yield (keyname, destination)
+
+        pool = ThreadPool(self.config["download_parallelization"])
+        pool.map(worker, find_release_files())
+
+    def scan_files(self):
+        """Scan the files we've collected. We do the download and scan concurrently to make
+        it easier to have a coherent log afterwards. Uses the venv python."""
+        self.run_command([self.query_python_path(), 'extract_and_run_command.py',
+                          '-j{}'.format(self.config['scan_parallelization']),
+                          'clamdscan', '-m', '--no-summary', '--', self.dest_dir])
+
+    def cleanup_cache(self):
+        """If we have simultaneous releases in flight an av slave may end up doing another
+        av job before being recycled, and we need to make sure the full disk is available."""
+        shutil.rmtree(self.dest_dir)
+
+
+if __name__ == "__main__":
+    myScript = AntivirusScan()
+    myScript.run_and_exit()
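As a rough illustration of how the exclusion logic above behaves, the following standalone sketch reuses a handful of the DEFAULT_EXCLUDES patterns against made-up key names under the candidates prefix the script would compute for product=firefox, version=39.0b5, build_number=2 (the example values from the option help text). The key names and the subset of patterns are chosen for illustration only.

import re

# A few of the DEFAULT_EXCLUDES patterns from the script above.
EXCLUDES = [
    r"^.*tests.*$",
    r"^.*\.zip(\.asc)?$",
    r"^.*/mar-tools/.*$",
    r"^.*json$",
]

def matches_exclude(keyname, excludes=EXCLUDES):
    # Same logic as AntivirusScan._matches_exclude: any matching pattern excludes the key.
    return any(re.search(exclude, keyname) for exclude in excludes)

# Hypothetical key names, shaped like what _get_candidates_prefix() would walk.
prefix = "pub/firefox/candidates/39.0b5-candidates/build2/"
keys = [
    prefix + "linux-x86_64/en-US/firefox-39.0b5.tar.bz2",  # kept, gets downloaded and scanned
    prefix + "linux-x86_64/en-US/firefox-39.0b5.json",     # dropped by ^.*json$
    prefix + "mar-tools/linux64/signmar",                   # dropped by ^.*/mar-tools/.*$
]

for key in keys:
    action = "exclude" if matches_exclude(key) else "download"
    print("{}: {}".format(action, key.replace(prefix, "")))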
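The download path in get_files() combines three pieces: a generator yielding (source, destination) pairs, a ThreadPool sized by --download-parallelization, and redo.retry wrapped around each S3 fetch. The sketch below shows the same shape using only the standard library; fake_fetch, the simplified retry helper, and the file names are stand-ins for the boto and redo calls in the real script, not its actual API.

import random
import time
from multiprocessing.pool import ThreadPool

def fake_fetch(destination):
    # Stand-in for key.get_contents_to_filename(); fails randomly to exercise the retry.
    if random.random() < 0.3:
        raise IOError("transient failure for {}".format(destination))
    return destination

def retry(func, args=(), attempts=5, sleeptime=0.1, retry_exceptions=(IOError,)):
    # Much-simplified version of redo.retry: call func until it succeeds or attempts run out.
    for attempt in range(attempts):
        try:
            return func(*args)
        except retry_exceptions:
            if attempt == attempts - 1:
                raise
            time.sleep(sleeptime)

def worker(item):
    # Mirrors the worker closure in get_files(): retry one download.
    source, destination = item
    return retry(fake_fetch, args=(destination,))

def find_release_files():
    # Mirrors the generator in get_files(): yield (source key, local destination) pairs.
    for i in range(10):
        yield ("remote/file-{}".format(i), "cache/file-{}".format(i))

pool = ThreadPool(6)  # 6 matches the --download-parallelization default
print(pool.map(worker, find_release_files()))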