summaryrefslogtreecommitdiff
path: root/.gitlab-ci/lava
diff options
context:
space:
mode:
authorGuilherme Gallo <guilherme.gallo@collabora.com>2023-04-18 00:18:12 -0300
committerMarge Bot <emma+marge@anholt.net>2023-04-19 14:36:37 +0000
commit11a97b644cbc2f8906f6493b65135374858331a4 (patch)
tree79207c673bb63bed9f2d9b2a8b0730951cd1546a /.gitlab-ci/lava
parent710b568dcdab41963f953599acc2497578a3b82e (diff)
downloadmesa-11a97b644cbc2f8906f6493b65135374858331a4.tar.gz
mesa-11a97b644cbc2f8906f6493b65135374858331a4.tar.bz2
mesa-11a97b644cbc2f8906f6493b65135374858331a4.zip
ci/lava: Refactor LAVAJobSubmitter and add tests
Some refactoring was needed to make LAVAJobSubmitter class testable via pytest. Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22500>
Diffstat (limited to '.gitlab-ci/lava')
-rw-r--r--.gitlab-ci/lava/exceptions.py3
-rwxr-xr-x.gitlab-ci/lava/lava_job_submitter.py127
-rw-r--r--.gitlab-ci/lava/utils/lava_job.py21
3 files changed, 109 insertions, 42 deletions
diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py
index 3c9a63eb3c6..f877b024510 100644
--- a/.gitlab-ci/lava/exceptions.py
+++ b/.gitlab-ci/lava/exceptions.py
@@ -12,9 +12,10 @@ class MesaCITimeoutError(MesaCIException):
class MesaCIRetryError(MesaCIException):
- def __init__(self, *args, retry_count: int) -> None:
+ def __init__(self, *args, retry_count: int, last_job: None) -> None:
super().__init__(*args)
self.retry_count = retry_count
+ self.last_job = last_job
class MesaCIParseException(MesaCIException):
diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index 39d07deac40..064219de782 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -18,7 +18,7 @@ from collections import defaultdict
from dataclasses import dataclass, fields
from datetime import datetime, timedelta
from io import StringIO
-from os import getenv
+from os import environ, getenv, path
from typing import Any, Optional
import fire
@@ -282,6 +282,7 @@ def print_job_final_status(job):
def execute_job_with_retries(
proxy, job_definition, retry_count, jobs_log
) -> Optional[LAVAJob]:
+ last_failed_job = None
for attempt_no in range(1, retry_count + 2):
# Need to get the logger value from its object to enable autosave
# features, if AutoSaveDict is enabled from StructuredLogging module
@@ -290,41 +291,52 @@ def execute_job_with_retries(
job = LAVAJob(proxy, job_definition, job_log)
STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no
try:
+ job_log["submitter_start_time"] = datetime.now().isoformat()
submit_job(job)
wait_for_job_get_started(job)
log_follower: LogFollower = bootstrap_log_follower()
follow_job_execution(job, log_follower)
return job
+
except (MesaCIException, KeyboardInterrupt) as exception:
job.handle_exception(exception)
+
+ finally:
+ print_job_final_status(job)
+ # If LAVA takes too long to post process the job, the submitter
+ # gives up and proceeds.
+ job_log["submitter_end_time"] = datetime.now().isoformat()
+ last_failed_job = job
print_log(
f"{CONSOLE_LOG['BOLD']}"
f"Finished executing LAVA job in the attempt #{attempt_no}"
f"{CONSOLE_LOG['RESET']}"
)
- finally:
- job_log["finished_time"] = datetime.now().isoformat()
- print_job_final_status(job)
+
+ return last_failed_job
def retriable_follow_job(proxy, job_definition) -> LAVAJob:
number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
- if finished_job := execute_job_with_retries(
+ last_attempted_job = execute_job_with_retries(
proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"]
- ):
- return finished_job
-
- # Job failed in all attempts
- raise MesaCIRetryError(
- f"{CONSOLE_LOG['BOLD']}"
- f"{CONSOLE_LOG['FG_RED']}"
- "Job failed after it exceeded the number of "
- f"{number_of_retries} retries."
- f"{CONSOLE_LOG['RESET']}",
- retry_count=number_of_retries,
)
+ if last_attempted_job.exception is not None:
+ # Infra failed in all attempts
+ raise MesaCIRetryError(
+ f"{CONSOLE_LOG['BOLD']}"
+ f"{CONSOLE_LOG['FG_RED']}"
+ "Job failed after it exceeded the number of "
+ f"{number_of_retries} retries."
+ f"{CONSOLE_LOG['RESET']}",
+ retry_count=number_of_retries,
+ last_job=last_attempted_job,
+ )
+
+ return last_attempted_job
+
@dataclass
class PathResolver:
@@ -371,20 +383,9 @@ class LAVAJobSubmitter(PathResolver):
return
self.__structured_log_context = StructuredLoggerWrapper(self).logger_context()
+ self.proxy = setup_lava_proxy()
- def dump(self, job_definition):
- if self.dump_yaml:
- with GitlabSection(
- "yaml_dump",
- "LAVA job definition (YAML)",
- type=LogSectionType.LAVA_BOOT,
- start_collapsed=True,
- ):
- print(hide_sensitive_data(job_definition))
-
- def submit(self):
- proxy = setup_lava_proxy()
-
+ def __prepare_submission(self) -> str:
# Overwrite the timeout for the testcases with the value offered by the
# user. The testcase running time should be at least 4 times greater than
# the other sections (boot and setup), so we can safely ignore them.
@@ -398,27 +399,81 @@ class LAVAJobSubmitter(PathResolver):
lava_yaml.dump(generate_lava_yaml_payload(self), job_definition_stream)
job_definition = job_definition_stream.getvalue()
- self.dump(job_definition)
+ if self.dump_yaml:
+ self.dump_job_definition(job_definition)
- job = LAVAJob(proxy, job_definition)
- if errors := job.validate():
+ validation_job = LAVAJob(self.proxy, job_definition)
+ if errors := validation_job.validate():
fatal_err(f"Error in LAVA job definition: {errors}")
print_log("LAVA job definition validated successfully")
+ return job_definition
+
+ @classmethod
+ def is_under_ci(cls):
+ ci_envvar: str = getenv("CI", "false")
+ return ci_envvar.lower() == "true"
+
+ def dump_job_definition(self, job_definition) -> None:
+ with GitlabSection(
+ "yaml_dump",
+ "LAVA job definition (YAML)",
+ type=LogSectionType.LAVA_BOOT,
+ start_collapsed=True,
+ ):
+ print(hide_sensitive_data(job_definition))
+
+ def submit(self) -> None:
+ """
+ Prepares and submits the LAVA job.
+ If `validate_only` is True, it validates the job without submitting it.
+ If the job finishes with a non-pass status or encounters an exception,
+ the program exits with a non-zero return code.
+ """
+ job_definition: str = self.__prepare_submission()
+
if self.validate_only:
return
with self.__structured_log_context:
+ last_attempt_job = None
try:
- finished_job = retriable_follow_job(proxy, job_definition)
- exit_code = 0 if finished_job.status == "pass" else 1
- STRUCTURAL_LOG["job_combined_status"] = job.status
- sys.exit(exit_code)
+ last_attempt_job = retriable_follow_job(self.proxy, job_definition)
+
+ except MesaCIRetryError as retry_exception:
+ last_attempt_job = retry_exception.last_job
except Exception as exception:
STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception)
raise exception
+ finally:
+ self.finish_script(last_attempt_job)
+
+ def print_log_artifact_url(self):
+ base_url = "https://$CI_PROJECT_ROOT_NAMESPACE.pages.freedesktop.org/"
+ artifacts_path = "-/$CI_PROJECT_NAME/-/jobs/$CI_JOB_ID/artifacts/"
+ relative_log_path = self.structured_log_file.relative_to(pathlib.Path.cwd())
+ full_path = f"{base_url}{artifacts_path}{relative_log_path}"
+ artifact_url = path.expandvars(full_path)
+
+ print_log(f"Structural Logging data available at: {artifact_url}")
+
+ def finish_script(self, last_attempt_job):
+ if self.is_under_ci() and self.structured_log_file:
+ self.print_log_artifact_url()
+
+ if not last_attempt_job:
+ # No job was run, something bad happened
+ STRUCTURAL_LOG["job_combined_status"] = "script_crash"
+ current_exception = str(sys.exc_info()[0])
+ STRUCTURAL_LOG["job_combined_fail_reason"] = current_exception
+ raise SystemExit(1)
+
+ STRUCTURAL_LOG["job_combined_status"] = last_attempt_job.status
+
+ if last_attempt_job.status != "pass":
+ raise SystemExit(1)
class StructuredLoggerWrapper:
def __init__(self, submitter: LAVAJobSubmitter) -> None:
diff --git a/.gitlab-ci/lava/utils/lava_job.py b/.gitlab-ci/lava/utils/lava_job.py
index 04d5300e8f2..b69f8b9fbb7 100644
--- a/.gitlab-ci/lava/utils/lava_job.py
+++ b/.gitlab-ci/lava/utils/lava_job.py
@@ -34,6 +34,7 @@ class LAVAJob:
self._is_finished = False
self.log: dict[str, Any] = log
self.status = "not_submitted"
+ self.__exception: Optional[str] = None
def heartbeat(self) -> None:
self.last_log_time: datetime = datetime.now()
@@ -61,6 +62,15 @@ class LAVAJob:
def is_finished(self) -> bool:
return self._is_finished
+ @property
+ def exception(self) -> str:
+ return self.__exception
+
+ @exception.setter
+ def exception(self, exception: Exception) -> None:
+ self.__exception = repr(exception)
+ self.log["dut_job_fail_reason"] = self.__exception
+
def validate(self) -> Optional[dict]:
"""Returns a dict with errors, if the validation fails.
@@ -158,6 +168,10 @@ class LAVAJob:
def handle_exception(self, exception: Exception):
print_log(exception)
+ self.cancel()
+ self.exception = exception
+
+ # Give more accurate status depending on exception
if isinstance(exception, MesaCIKnownIssueException):
self.status = "canceled"
elif isinstance(exception, MesaCITimeoutError):
@@ -165,11 +179,8 @@ class LAVAJob:
elif isinstance(exception, MesaCIException):
self.status = "failed"
elif isinstance(exception, KeyboardInterrupt):
- self.status = "canceled_by_user"
+ self.status = "interrupted"
print_log("LAVA job submitter was interrupted. Cancelling the job.")
- raise exception
+ raise
else:
self.status = "job_submitter_error"
-
- self.cancel()
- self.log["dut_job_fail_reason"] = str(exception)