diff options
author | Guilherme Gallo <guilherme.gallo@collabora.com> | 2023-04-18 00:18:12 -0300 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2023-04-19 14:36:37 +0000 |
commit | 11a97b644cbc2f8906f6493b65135374858331a4 (patch) | |
tree | 79207c673bb63bed9f2d9b2a8b0730951cd1546a /.gitlab-ci/lava | |
parent | 710b568dcdab41963f953599acc2497578a3b82e (diff) | |
download | mesa-11a97b644cbc2f8906f6493b65135374858331a4.tar.gz mesa-11a97b644cbc2f8906f6493b65135374858331a4.tar.bz2 mesa-11a97b644cbc2f8906f6493b65135374858331a4.zip |
ci/lava: Refactor LAVAJobSubmitter and add tests
Some refactoring was needed to make LAVAJobSubmitter class testable via
pytest.
Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22500>
Diffstat (limited to '.gitlab-ci/lava')
-rw-r--r-- | .gitlab-ci/lava/exceptions.py | 3 | ||||
-rwxr-xr-x | .gitlab-ci/lava/lava_job_submitter.py | 127 | ||||
-rw-r--r-- | .gitlab-ci/lava/utils/lava_job.py | 21 |
3 files changed, 109 insertions, 42 deletions
diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py index 3c9a63eb3c6..f877b024510 100644 --- a/.gitlab-ci/lava/exceptions.py +++ b/.gitlab-ci/lava/exceptions.py @@ -12,9 +12,10 @@ class MesaCITimeoutError(MesaCIException): class MesaCIRetryError(MesaCIException): - def __init__(self, *args, retry_count: int) -> None: + def __init__(self, *args, retry_count: int, last_job: None) -> None: super().__init__(*args) self.retry_count = retry_count + self.last_job = last_job class MesaCIParseException(MesaCIException): diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index 39d07deac40..064219de782 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -18,7 +18,7 @@ from collections import defaultdict from dataclasses import dataclass, fields from datetime import datetime, timedelta from io import StringIO -from os import getenv +from os import environ, getenv, path from typing import Any, Optional import fire @@ -282,6 +282,7 @@ def print_job_final_status(job): def execute_job_with_retries( proxy, job_definition, retry_count, jobs_log ) -> Optional[LAVAJob]: + last_failed_job = None for attempt_no in range(1, retry_count + 2): # Need to get the logger value from its object to enable autosave # features, if AutoSaveDict is enabled from StructuredLogging module @@ -290,41 +291,52 @@ def execute_job_with_retries( job = LAVAJob(proxy, job_definition, job_log) STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no try: + job_log["submitter_start_time"] = datetime.now().isoformat() submit_job(job) wait_for_job_get_started(job) log_follower: LogFollower = bootstrap_log_follower() follow_job_execution(job, log_follower) return job + except (MesaCIException, KeyboardInterrupt) as exception: job.handle_exception(exception) + + finally: + print_job_final_status(job) + # If LAVA takes too long to post process the job, the submitter + # gives up and proceeds. + job_log["submitter_end_time"] = datetime.now().isoformat() + last_failed_job = job print_log( f"{CONSOLE_LOG['BOLD']}" f"Finished executing LAVA job in the attempt #{attempt_no}" f"{CONSOLE_LOG['RESET']}" ) - finally: - job_log["finished_time"] = datetime.now().isoformat() - print_job_final_status(job) + + return last_failed_job def retriable_follow_job(proxy, job_definition) -> LAVAJob: number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION - if finished_job := execute_job_with_retries( + last_attempted_job = execute_job_with_retries( proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"] - ): - return finished_job - - # Job failed in all attempts - raise MesaCIRetryError( - f"{CONSOLE_LOG['BOLD']}" - f"{CONSOLE_LOG['FG_RED']}" - "Job failed after it exceeded the number of " - f"{number_of_retries} retries." - f"{CONSOLE_LOG['RESET']}", - retry_count=number_of_retries, ) + if last_attempted_job.exception is not None: + # Infra failed in all attempts + raise MesaCIRetryError( + f"{CONSOLE_LOG['BOLD']}" + f"{CONSOLE_LOG['FG_RED']}" + "Job failed after it exceeded the number of " + f"{number_of_retries} retries." + f"{CONSOLE_LOG['RESET']}", + retry_count=number_of_retries, + last_job=last_attempted_job, + ) + + return last_attempted_job + @dataclass class PathResolver: @@ -371,20 +383,9 @@ class LAVAJobSubmitter(PathResolver): return self.__structured_log_context = StructuredLoggerWrapper(self).logger_context() + self.proxy = setup_lava_proxy() - def dump(self, job_definition): - if self.dump_yaml: - with GitlabSection( - "yaml_dump", - "LAVA job definition (YAML)", - type=LogSectionType.LAVA_BOOT, - start_collapsed=True, - ): - print(hide_sensitive_data(job_definition)) - - def submit(self): - proxy = setup_lava_proxy() - + def __prepare_submission(self) -> str: # Overwrite the timeout for the testcases with the value offered by the # user. The testcase running time should be at least 4 times greater than # the other sections (boot and setup), so we can safely ignore them. @@ -398,27 +399,81 @@ class LAVAJobSubmitter(PathResolver): lava_yaml.dump(generate_lava_yaml_payload(self), job_definition_stream) job_definition = job_definition_stream.getvalue() - self.dump(job_definition) + if self.dump_yaml: + self.dump_job_definition(job_definition) - job = LAVAJob(proxy, job_definition) - if errors := job.validate(): + validation_job = LAVAJob(self.proxy, job_definition) + if errors := validation_job.validate(): fatal_err(f"Error in LAVA job definition: {errors}") print_log("LAVA job definition validated successfully") + return job_definition + + @classmethod + def is_under_ci(cls): + ci_envvar: str = getenv("CI", "false") + return ci_envvar.lower() == "true" + + def dump_job_definition(self, job_definition) -> None: + with GitlabSection( + "yaml_dump", + "LAVA job definition (YAML)", + type=LogSectionType.LAVA_BOOT, + start_collapsed=True, + ): + print(hide_sensitive_data(job_definition)) + + def submit(self) -> None: + """ + Prepares and submits the LAVA job. + If `validate_only` is True, it validates the job without submitting it. + If the job finishes with a non-pass status or encounters an exception, + the program exits with a non-zero return code. + """ + job_definition: str = self.__prepare_submission() + if self.validate_only: return with self.__structured_log_context: + last_attempt_job = None try: - finished_job = retriable_follow_job(proxy, job_definition) - exit_code = 0 if finished_job.status == "pass" else 1 - STRUCTURAL_LOG["job_combined_status"] = job.status - sys.exit(exit_code) + last_attempt_job = retriable_follow_job(self.proxy, job_definition) + + except MesaCIRetryError as retry_exception: + last_attempt_job = retry_exception.last_job except Exception as exception: STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception) raise exception + finally: + self.finish_script(last_attempt_job) + + def print_log_artifact_url(self): + base_url = "https://$CI_PROJECT_ROOT_NAMESPACE.pages.freedesktop.org/" + artifacts_path = "-/$CI_PROJECT_NAME/-/jobs/$CI_JOB_ID/artifacts/" + relative_log_path = self.structured_log_file.relative_to(pathlib.Path.cwd()) + full_path = f"{base_url}{artifacts_path}{relative_log_path}" + artifact_url = path.expandvars(full_path) + + print_log(f"Structural Logging data available at: {artifact_url}") + + def finish_script(self, last_attempt_job): + if self.is_under_ci() and self.structured_log_file: + self.print_log_artifact_url() + + if not last_attempt_job: + # No job was run, something bad happened + STRUCTURAL_LOG["job_combined_status"] = "script_crash" + current_exception = str(sys.exc_info()[0]) + STRUCTURAL_LOG["job_combined_fail_reason"] = current_exception + raise SystemExit(1) + + STRUCTURAL_LOG["job_combined_status"] = last_attempt_job.status + + if last_attempt_job.status != "pass": + raise SystemExit(1) class StructuredLoggerWrapper: def __init__(self, submitter: LAVAJobSubmitter) -> None: diff --git a/.gitlab-ci/lava/utils/lava_job.py b/.gitlab-ci/lava/utils/lava_job.py index 04d5300e8f2..b69f8b9fbb7 100644 --- a/.gitlab-ci/lava/utils/lava_job.py +++ b/.gitlab-ci/lava/utils/lava_job.py @@ -34,6 +34,7 @@ class LAVAJob: self._is_finished = False self.log: dict[str, Any] = log self.status = "not_submitted" + self.__exception: Optional[str] = None def heartbeat(self) -> None: self.last_log_time: datetime = datetime.now() @@ -61,6 +62,15 @@ class LAVAJob: def is_finished(self) -> bool: return self._is_finished + @property + def exception(self) -> str: + return self.__exception + + @exception.setter + def exception(self, exception: Exception) -> None: + self.__exception = repr(exception) + self.log["dut_job_fail_reason"] = self.__exception + def validate(self) -> Optional[dict]: """Returns a dict with errors, if the validation fails. @@ -158,6 +168,10 @@ class LAVAJob: def handle_exception(self, exception: Exception): print_log(exception) + self.cancel() + self.exception = exception + + # Give more accurate status depending on exception if isinstance(exception, MesaCIKnownIssueException): self.status = "canceled" elif isinstance(exception, MesaCITimeoutError): @@ -165,11 +179,8 @@ class LAVAJob: elif isinstance(exception, MesaCIException): self.status = "failed" elif isinstance(exception, KeyboardInterrupt): - self.status = "canceled_by_user" + self.status = "interrupted" print_log("LAVA job submitter was interrupted. Cancelling the job.") - raise exception + raise else: self.status = "job_submitter_error" - - self.cancel() - self.log["dut_job_fail_reason"] = str(exception) |