summaryrefslogtreecommitdiff
path: root/lib/mesa/.gitlab-ci/lava
diff options
context:
space:
mode:
authorJonathan Gray <jsg@cvs.openbsd.org>2024-04-02 09:30:07 +0000
committerJonathan Gray <jsg@cvs.openbsd.org>2024-04-02 09:30:07 +0000
commitf54e142455cb3c9d1662dae7e096a32a47e5409b (patch)
tree440ecd46269f0eac25e349e1ed58f246490c5e26 /lib/mesa/.gitlab-ci/lava
parent36d8503c27530f68d655d3ef77a6eaa4dfd8ad65 (diff)
Import Mesa 23.3.6
Diffstat (limited to 'lib/mesa/.gitlab-ci/lava')
-rw-r--r--lib/mesa/.gitlab-ci/lava/exceptions.py3
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml77
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava-pytest.sh26
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava-submit.sh54
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava_job_submitter.py723
-rw-r--r--lib/mesa/.gitlab-ci/lava/requirements.txt1
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/__init__.py3
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py19
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/lava_job.py186
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py150
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py44
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/log_follower.py66
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/log_section.py19
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py208
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py171
15 files changed, 1256 insertions, 494 deletions
diff --git a/lib/mesa/.gitlab-ci/lava/exceptions.py b/lib/mesa/.gitlab-ci/lava/exceptions.py
index 3c9a63eb3..f877b0245 100644
--- a/lib/mesa/.gitlab-ci/lava/exceptions.py
+++ b/lib/mesa/.gitlab-ci/lava/exceptions.py
@@ -12,9 +12,10 @@ class MesaCITimeoutError(MesaCIException):
class MesaCIRetryError(MesaCIException):
- def __init__(self, *args, retry_count: int) -> None:
+ def __init__(self, *args, retry_count: int, last_job: None) -> None:
super().__init__(*args)
self.retry_count = retry_count
+ self.last_job = last_job
class MesaCIParseException(MesaCIException):
diff --git a/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml b/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml
index 61039de87..de589595a 100755
--- a/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml
+++ b/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml
@@ -1,3 +1,7 @@
+variables:
+ LAVA_SSH_CLIENT_IMAGE: "${CI_REGISTRY_IMAGE}/alpine/x86_64_lava_ssh_client:${ALPINE_X86_64_LAVA_SSH_TAG}--${MESA_TEMPLATES_COMMIT}"
+
+
.lava-test:
# Cancel job if a newer commit is pushed to the same branch
interruptible: true
@@ -7,14 +11,14 @@
# proxy used to cache data locally
FDO_HTTP_CACHE_URI: "http://caching-proxy/cache/?uri="
# base system generated by the container build job, shared between many pipelines
- BASE_SYSTEM_HOST_PREFIX: "${MINIO_HOST}/mesa-lava"
- BASE_SYSTEM_MAINLINE_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${FDO_UPSTREAM_REPO}/${DISTRIBUTION_TAG}/${ARCH}"
- BASE_SYSTEM_FORK_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${CI_PROJECT_PATH}/${DISTRIBUTION_TAG}/${ARCH}"
+ BASE_SYSTEM_HOST_PREFIX: "${S3_HOST}/mesa-lava"
+ BASE_SYSTEM_MAINLINE_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${FDO_UPSTREAM_REPO}/${DISTRIBUTION_TAG}/${DEBIAN_ARCH}"
+ BASE_SYSTEM_FORK_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${CI_PROJECT_PATH}/${DISTRIBUTION_TAG}/${DEBIAN_ARCH}"
# per-job build artifacts
- BUILD_PATH: "${PIPELINE_ARTIFACTS_BASE}/${CI_PROJECT_NAME}-${ARCH}.tar.zst"
JOB_ROOTFS_OVERLAY_PATH: "${JOB_ARTIFACTS_BASE}/job-rootfs-overlay.tar.gz"
JOB_RESULTS_PATH: "${JOB_ARTIFACTS_BASE}/results.tar.zst"
- MINIO_RESULTS_UPLOAD: "${JOB_ARTIFACTS_BASE}"
+ S3_ARTIFACT_NAME: "mesa-${ARCH}-default-debugoptimized"
+ S3_RESULTS_UPLOAD: "${JOB_ARTIFACTS_BASE}"
PIGLIT_NO_WINDOW: 1
VISIBILITY_GROUP: "Collabora+fdo"
script:
@@ -32,45 +36,52 @@
- $RUNNER_TAG
after_script:
- curl -L --retry 4 -f --retry-all-errors --retry-delay 60 -s "https://${JOB_RESULTS_PATH}" | tar --zstd -x
+ needs:
+ - alpine/x86_64_lava_ssh_client
+ - !reference [.required-for-hardware-jobs, needs]
-.lava-test:armhf:
+.lava-test:arm32:
variables:
- ARCH: armhf
+ ARCH: arm32
+ DEBIAN_ARCH: armhf
KERNEL_IMAGE_NAME: zImage
KERNEL_IMAGE_TYPE: "zimage"
BOOT_METHOD: u-boot
extends:
- - .use-debian/arm_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_armhf
- - .use-debian/x86_build
+ - .use-debian/arm64_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm32
+ - .use-debian/x86_64_build
- .lava-test
- .use-kernel+rootfs-arm
needs:
- - kernel+rootfs_armhf
- - debian/x86_build
- - debian-armhf
+ - !reference [.lava-test, needs]
+ - kernel+rootfs_arm32
+ - debian/x86_64_build
+ - debian-arm32
-.lava-test-deqp:armhf:
+.lava-test-deqp:arm32:
extends:
- - .lava-test:armhf
+ - .lava-test:arm32
variables:
HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
.lava-test:arm64:
variables:
ARCH: arm64
+ DEBIAN_ARCH: arm64
KERNEL_IMAGE_NAME: Image
KERNEL_IMAGE_TYPE: "image"
BOOT_METHOD: u-boot
extends:
- - .use-debian/arm_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm64
- - .use-debian/x86_build
+ - .use-debian/arm64_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm64
+ - .use-debian/x86_64_build
- .lava-test
- .use-kernel+rootfs-arm
dependencies:
- debian-arm64
needs:
+ - !reference [.lava-test, needs]
- kernel+rootfs_arm64
- - debian/x86_build
+ - debian/x86_64_build
- debian-arm64
.lava-test-deqp:arm64:
@@ -79,30 +90,34 @@
extends:
- .lava-test:arm64
-.lava-test:amd64:
+.lava-test:x86_64:
variables:
- ARCH: amd64
+ ARCH: x86_64
+ DEBIAN_ARCH: amd64
KERNEL_IMAGE_NAME: bzImage
KERNEL_IMAGE_TYPE: "zimage"
BOOT_METHOD: u-boot
extends:
- - .use-debian/x86_build-base # for same $MESA_ARTIFACTS_BASE_TAG as in kernel+rootfs_amd64
- - .use-debian/x86_build
+ - .use-debian/x86_64_build-base # for same $MESA_ARTIFACTS_BASE_TAG as in kernel+rootfs_x86_64
+ - .use-debian/x86_64_build
- .lava-test
- - .use-kernel+rootfs-amd64
+ - .use-kernel+rootfs-x86_64
needs:
- - kernel+rootfs_amd64
+ - !reference [.lava-test, needs]
+ - kernel+rootfs_x86_64
- debian-testing
-.lava-test-deqp:amd64:
+.lava-test-deqp:x86_64:
variables:
HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
extends:
- - .lava-test:amd64
+ - .lava-test:x86_64
.lava-traces-base:
variables:
HWCI_TEST_SCRIPT: "/install/piglit/piglit-traces.sh"
+ # until we overcome Infrastructure issues, give traces extra 5 min before timeout
+ DEVICE_HANGING_TIMEOUT_SEC: 600
artifacts:
reports:
junit: results/junit.xml
@@ -113,15 +128,15 @@
PIGLIT_RESULTS: "${GPU_VERSION}-${PIGLIT_PROFILES}"
HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh"
-.lava-piglit-traces:amd64:
+.lava-piglit-traces:x86_64:
extends:
- - .lava-test:amd64
+ - .lava-test:x86_64
- .lava-piglit
- .lava-traces-base
-.lava-piglit-traces:armhf:
+.lava-piglit-traces:arm32:
extends:
- - .lava-test:armhf
+ - .lava-test:arm32
- .lava-piglit
- .lava-traces-base
@@ -131,9 +146,9 @@
- .lava-piglit
- .lava-traces-base
-.lava-piglit:amd64:
+.lava-piglit:x86_64:
extends:
- - .lava-test:amd64
+ - .lava-test:x86_64
- .lava-piglit
.lava-piglit:arm64:
diff --git a/lib/mesa/.gitlab-ci/lava/lava-pytest.sh b/lib/mesa/.gitlab-ci/lava/lava-pytest.sh
index 9ace8a05f..786a669b9 100755
--- a/lib/mesa/.gitlab-ci/lava/lava-pytest.sh
+++ b/lib/mesa/.gitlab-ci/lava/lava-pytest.sh
@@ -1,35 +1,17 @@
#!/usr/bin/env bash
-#
-# Copyright (C) 2022 Collabora Limited
+# SPDX-License-Identifier: MIT
+# © Collabora Limited
# Author: Guilherme Gallo <guilherme.gallo@collabora.com>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
# This script runs unit/integration tests related with LAVA CI tools
+# shellcheck disable=SC1091 # The relative paths in this file only become valid at runtime.
set -ex
# Use this script in a python virtualenv for isolation
python3 -m venv .venv
. .venv/bin/activate
-python3 -m pip install -r ${CI_PROJECT_DIR}/.gitlab-ci/lava/requirements-test.txt
+python3 -m pip install --break-system-packages -r "${CI_PROJECT_DIR}/.gitlab-ci/lava/requirements-test.txt"
TEST_DIR=${CI_PROJECT_DIR}/.gitlab-ci/tests
diff --git a/lib/mesa/.gitlab-ci/lava/lava-submit.sh b/lib/mesa/.gitlab-ci/lava/lava-submit.sh
index 29d045a68..e02bcb24c 100755
--- a/lib/mesa/.gitlab-ci/lava/lava-submit.sh
+++ b/lib/mesa/.gitlab-ci/lava/lava-submit.sh
@@ -1,15 +1,18 @@
-#!/bin/bash
+#!/usr/bin/env bash
+# shellcheck disable=SC2086 # we want word splitting
-set -e
-set -x
+set -ex
-# Try to use the kernel and rootfs built in mainline first, so we're more
-# likely to hit cache
-if curl -s -X HEAD -L --retry 4 -f --retry-all-errors --retry-delay 60 \
- "https://${BASE_SYSTEM_MAINLINE_HOST_PATH}/done"; then
- BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_MAINLINE_HOST_PATH}"
-else
+# If we run in the fork (not from mesa or Marge-bot), reuse the mainline kernel and rootfs, if they exist.
+BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_MAINLINE_HOST_PATH}"
+if [ "$CI_PROJECT_PATH" != "$FDO_UPSTREAM_REPO" ]; then
+ if ! curl -s -X HEAD -L --retry 4 -f --retry-delay 60 \
+ "https://${BASE_SYSTEM_MAINLINE_HOST_PATH}/done"; then
+ echo "Using kernel and rootfs from the fork, cached from mainline is unavailable."
BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_FORK_HOST_PATH}"
+ else
+ echo "Using the cached mainline kernel and rootfs."
+ fi
fi
rm -rf results
@@ -18,46 +21,41 @@ mkdir -p results/job-rootfs-overlay/
cp artifacts/ci-common/capture-devcoredump.sh results/job-rootfs-overlay/
cp artifacts/ci-common/init-*.sh results/job-rootfs-overlay/
cp artifacts/ci-common/intel-gpu-freq.sh results/job-rootfs-overlay/
+cp artifacts/ci-common/kdl.sh results/job-rootfs-overlay/
cp "$SCRIPTS_DIR"/setup-test-env.sh results/job-rootfs-overlay/
# Prepare env vars for upload.
-KERNEL_IMAGE_BASE_URL="https://${BASE_SYSTEM_HOST_PATH}" \
- artifacts/ci-common/generate-env.sh > results/job-rootfs-overlay/set-job-env-vars.sh
section_start variables "Variables passed through:"
-cat results/job-rootfs-overlay/set-job-env-vars.sh
+artifacts/ci-common/generate-env.sh | tee results/job-rootfs-overlay/set-job-env-vars.sh
section_end variables
tar zcf job-rootfs-overlay.tar.gz -C results/job-rootfs-overlay/ .
ci-fairy s3cp --token-file "${CI_JOB_JWT_FILE}" job-rootfs-overlay.tar.gz "https://${JOB_ROOTFS_OVERLAY_PATH}"
-ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${BUILD_PATH}"
-# Make it take the mesa build from MINIO_ARTIFACT_NAME, if it is specified in
-# the environment. This will make the LAVA behavior consistent with the
-# baremetal jobs.
-if [ -n "${MINIO_ARTIFACT_NAME}" ]
-then
- ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${PIPELINE_ARTIFACTS_BASE}/${MINIO_ARTIFACT_NAME}.tar.zst"
-fi
+ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${PIPELINE_ARTIFACTS_BASE}/${S3_ARTIFACT_NAME:?}.tar.zst"
touch results/lava.log
tail -f results/lava.log &
PYTHONPATH=artifacts/ artifacts/lava/lava_job_submitter.py \
+ submit \
--dump-yaml \
--pipeline-info "$CI_JOB_NAME: $CI_PIPELINE_URL on $CI_COMMIT_REF_NAME ${CI_NODE_INDEX}/${CI_NODE_TOTAL}" \
--rootfs-url-prefix "https://${BASE_SYSTEM_HOST_PATH}" \
- --kernel-url-prefix "https://${BASE_SYSTEM_HOST_PATH}" \
+ --kernel-url-prefix "${KERNEL_IMAGE_BASE}/${DEBIAN_ARCH}" \
--build-url "${ARTIFACT_URL}" \
--job-rootfs-overlay-url "${FDO_HTTP_CACHE_URI:-}https://${JOB_ROOTFS_OVERLAY_PATH}" \
- --job-timeout ${JOB_TIMEOUT:-30} \
+ --job-timeout-min ${JOB_TIMEOUT:-30} \
--first-stage-init artifacts/ci-common/init-stage1.sh \
- --ci-project-dir ${CI_PROJECT_DIR} \
- --device-type ${DEVICE_TYPE} \
- --dtb ${DTB} \
+ --ci-project-dir "${CI_PROJECT_DIR}" \
+ --device-type "${DEVICE_TYPE}" \
+ --dtb-filename "${DTB}" \
--jwt-file "${CI_JOB_JWT_FILE}" \
- --kernel-image-name ${KERNEL_IMAGE_NAME} \
+ --kernel-image-name "${KERNEL_IMAGE_NAME}" \
--kernel-image-type "${KERNEL_IMAGE_TYPE}" \
- --boot-method ${BOOT_METHOD} \
- --visibility-group ${VISIBILITY_GROUP} \
+ --boot-method "${BOOT_METHOD}" \
+ --visibility-group "${VISIBILITY_GROUP}" \
--lava-tags "${LAVA_TAGS}" \
--mesa-job-name "$CI_JOB_NAME" \
+ --structured-log-file "results/lava_job_detail.json" \
+ --ssh-client-image "${LAVA_SSH_CLIENT_IMAGE}" \
>> results/lava.log
diff --git a/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py b/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py
index 5feb4688c..b2d8e5306 100755
--- a/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py
+++ b/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py
@@ -9,25 +9,21 @@
"""Send a job to LAVA, track it and collect log back"""
-
-import argparse
import contextlib
+import json
import pathlib
-import re
import sys
import time
-import traceback
-import urllib.parse
-import xmlrpc.client
+from collections import defaultdict
+from dataclasses import dataclass, fields
from datetime import datetime, timedelta
from io import StringIO
-from os import getenv
+from os import environ, getenv, path
from typing import Any, Optional
-import lavacli
+import fire
from lava.exceptions import (
MesaCIException,
- MesaCIKnownIssueException,
MesaCIParseException,
MesaCIRetryError,
MesaCITimeoutError,
@@ -36,303 +32,61 @@ from lava.utils import CONSOLE_LOG
from lava.utils import DEFAULT_GITLAB_SECTION_TIMEOUTS as GL_SECTION_TIMEOUTS
from lava.utils import (
GitlabSection,
+ LAVAJob,
LogFollower,
LogSectionType,
+ call_proxy,
fatal_err,
+ generate_lava_job_definition,
hide_sensitive_data,
print_log,
+ setup_lava_proxy,
)
from lavacli.utils import flow_yaml as lava_yaml
+# Initialize structural logging with a defaultdict; it can be changed for more
+# sophisticated dict-like data abstractions.
+STRUCTURAL_LOG = defaultdict(list)
+
+try:
+ from ci.structured_logger import StructuredLogger
+except ImportError as e:
+ print_log(
+ f"Could not import StructuredLogger library: {e}. "
+ "Falling back to defaultdict based structured logger."
+ )
+
# Timeout in seconds to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
-DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))
+DEVICE_HANGING_TIMEOUT_SEC = int(getenv("DEVICE_HANGING_TIMEOUT_SEC", 5*60))
# How many seconds the script should wait before try a new polling iteration to
# check if the dispatched LAVA job is running or waiting in the job queue.
-WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))
+WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(
+ getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 1)
+)
+
+# How many seconds the script will wait to let LAVA finalize the job and give
+# the final details.
+WAIT_FOR_LAVA_POST_PROCESSING_SEC = int(getenv("LAVA_WAIT_LAVA_POST_PROCESSING_SEC", 5))
+WAIT_FOR_LAVA_POST_PROCESSING_RETRIES = int(
+ getenv("LAVA_WAIT_LAVA_POST_PROCESSING_RETRIES", 6)
+)
# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))
# How many retries should be made when a timeout happen.
-NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
-
-# How many attempts should be made when a timeout happen during LAVA device boot.
-NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3))
-
-# Supports any integers in [0, 100].
-# The scheduler considers the job priority when ordering the queue
-# to consider which job should run next.
-JOB_PRIORITY = int(getenv("LAVA_JOB_PRIORITY", 75))
-
-
-def generate_lava_yaml_payload(args) -> dict[str, Any]:
- # General metadata and permissions, plus also inexplicably kernel arguments
- values = {
- 'job_name': 'mesa: {}'.format(args.pipeline_info),
- 'device_type': args.device_type,
- 'visibility': { 'group': [ args.visibility_group ] },
- 'priority': JOB_PRIORITY,
- 'context': {
- 'extra_nfsroot_args': ' init=/init rootwait usbcore.quirks=0bda:8153:k'
- },
- "timeouts": {
- "job": {"minutes": args.job_timeout},
- "actions": {
- "depthcharge-retry": {
- # Could take between 1 and 1.5 min in slower boots
- "minutes": 2
- },
- "depthcharge-start": {
- # Should take less than 1 min.
- "minutes": 1,
- },
- "depthcharge-action": {
- # This timeout englobes the entire depthcharge timing,
- # including retries
- "minutes": 2 * NUMBER_OF_ATTEMPTS_LAVA_BOOT,
- },
- }
- },
- }
-
- if args.lava_tags:
- values['tags'] = args.lava_tags.split(',')
-
- # URLs to our kernel rootfs to boot from, both generated by the base
- # container build
- deploy = {
- 'timeout': { 'minutes': 10 },
- 'to': 'tftp',
- 'os': 'oe',
- 'kernel': {
- 'url': '{}/{}'.format(args.kernel_url_prefix, args.kernel_image_name),
- },
- 'nfsrootfs': {
- 'url': '{}/lava-rootfs.tar.zst'.format(args.rootfs_url_prefix),
- 'compression': 'zstd',
- }
- }
- if args.kernel_image_type:
- deploy['kernel']['type'] = args.kernel_image_type
- if args.dtb:
- deploy['dtb'] = {
- 'url': '{}/{}.dtb'.format(args.kernel_url_prefix, args.dtb)
- }
-
- # always boot over NFS
- boot = {
- "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
- "method": args.boot_method,
- "commands": "nfs",
- "prompts": ["lava-shell:"],
- }
-
- # skeleton test definition: only declaring each job as a single 'test'
- # since LAVA's test parsing is not useful to us
- run_steps = []
- test = {
- 'timeout': { 'minutes': args.job_timeout },
- 'failure_retry': 1,
- 'definitions': [ {
- 'name': 'mesa',
- 'from': 'inline',
- 'lava-signal': 'kmsg',
- 'path': 'inline/mesa.yaml',
- 'repository': {
- 'metadata': {
- 'name': 'mesa',
- 'description': 'Mesa test plan',
- 'os': [ 'oe' ],
- 'scope': [ 'functional' ],
- 'format': 'Lava-Test Test Definition 1.0',
- },
- 'run': {
- "steps": run_steps
- },
- },
- } ],
- }
-
- # job execution script:
- # - inline .gitlab-ci/common/init-stage1.sh
- # - fetch and unpack per-pipeline build artifacts from build job
- # - fetch and unpack per-job environment from lava-submit.sh
- # - exec .gitlab-ci/common/init-stage2.sh
-
- with open(args.first_stage_init, "r") as init_sh:
- run_steps += [
- x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip()
- ]
- run_steps.append(
- f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.job_rootfs_overlay_url} | tar -xz -C /",
- )
-
- if args.jwt_file:
- with open(args.jwt_file) as jwt_file:
- run_steps += [
- "set +x",
- f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME',
- "set -x",
- f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
- ]
- else:
- run_steps += [
- "echo Could not find jwt file, disabling MINIO requests...",
- "sed -i '/MINIO_RESULTS_UPLOAD/d' /set-job-env-vars.sh",
- ]
-
- run_steps += [
- 'mkdir -p {}'.format(args.ci_project_dir),
- 'curl {} | tar --zstd -x -C {}'.format(args.build_url, args.ci_project_dir),
-
- # Sleep a bit to give time for bash to dump shell xtrace messages into
- # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some
- # devices like a618.
- 'sleep 1',
-
- # Putting CI_JOB name as the testcase name, it may help LAVA farm
- # maintainers with monitoring
- f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh",
- ]
-
- values['actions'] = [
- { 'deploy': deploy },
- { 'boot': boot },
- { 'test': test },
- ]
-
- return values
-
-
-def setup_lava_proxy():
- config = lavacli.load_config("default")
- uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
- uri_obj = urllib.parse.urlparse(uri)
- uri_str = "{}://{}:{}@{}{}".format(uri_obj.scheme, usr, tok, uri_obj.netloc, uri_obj.path)
- transport = lavacli.RequestsTransport(
- uri_obj.scheme,
- config.get("proxy"),
- config.get("timeout", 120.0),
- config.get("verify_ssl_cert", True),
- )
- proxy = xmlrpc.client.ServerProxy(
- uri_str, allow_none=True, transport=transport)
-
- print_log("Proxy for {} created.".format(config['uri']))
-
- return proxy
-
-
-def _call_proxy(fn, *args):
- retries = 60
- for n in range(1, retries + 1):
- try:
- return fn(*args)
- except xmlrpc.client.ProtocolError as err:
- if n == retries:
- traceback.print_exc()
- fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
- else:
- time.sleep(15)
- except xmlrpc.client.Fault as err:
- traceback.print_exc()
- fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
-
-
-class LAVAJob:
- COLOR_STATUS_MAP = {
- "pass": CONSOLE_LOG["FG_GREEN"],
- "hung": CONSOLE_LOG["FG_YELLOW"],
- "fail": CONSOLE_LOG["FG_RED"],
- "canceled": CONSOLE_LOG["FG_MAGENTA"],
- }
-
- def __init__(self, proxy, definition):
- self.job_id = None
- self.proxy = proxy
- self.definition = definition
- self.last_log_line = 0
- self.last_log_time = None
- self.is_finished = False
- self.status = "created"
-
- def heartbeat(self):
- self.last_log_time = datetime.now()
- self.status = "running"
-
- def validate(self) -> Optional[dict]:
- """Returns a dict with errors, if the validation fails.
-
- Returns:
- Optional[dict]: a dict with the validation errors, if any
- """
- return _call_proxy(self.proxy.scheduler.jobs.validate, self.definition, True)
-
- def submit(self):
- try:
- self.job_id = _call_proxy(self.proxy.scheduler.jobs.submit, self.definition)
- except MesaCIException:
- return False
- return True
-
- def cancel(self):
- if self.job_id:
- self.proxy.scheduler.jobs.cancel(self.job_id)
-
- def is_started(self) -> bool:
- waiting_states = ["Submitted", "Scheduling", "Scheduled"]
- job_state: dict[str, str] = _call_proxy(
- self.proxy.scheduler.job_state, self.job_id
- )
- return job_state["job_state"] not in waiting_states
-
- def _load_log_from_data(self, data) -> list[str]:
- lines = []
- if isinstance(data, xmlrpc.client.Binary):
- # We are dealing with xmlrpc.client.Binary
- # Let's extract the data
- data = data.data
- # When there is no new log data, the YAML is empty
- if loaded_lines := lava_yaml.load(data):
- lines = loaded_lines
- self.last_log_line += len(lines)
- return lines
-
- def get_logs(self) -> list[str]:
- try:
- (finished, data) = _call_proxy(
- self.proxy.scheduler.jobs.logs, self.job_id, self.last_log_line
- )
- self.is_finished = finished
- return self._load_log_from_data(data)
-
- except Exception as mesa_ci_err:
- raise MesaCIParseException(
- f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
- ) from mesa_ci_err
-
- def parse_job_result_from_log(
- self, lava_lines: list[dict[str, str]]
- ) -> list[dict[str, str]]:
- """Use the console log to catch if the job has completed successfully or
- not. Returns the list of log lines until the result line."""
-
- last_line = None # Print all lines. lines[:None] == lines[:]
-
- for idx, line in enumerate(lava_lines):
- if result := re.search(r"hwci: mesa: (pass|fail)", line):
- self.is_finished = True
- self.status = result.group(1)
-
- last_line = idx + 1
- # We reached the log end here. hwci script has finished.
- break
- return lava_lines[:last_line]
+NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
+ getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
+)
-def find_exception_from_metadata(metadata, job_id):
+def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
+ """
+ Investigate infrastructure errors from the job metadata.
+ If it finds an error, raise it as MesaCIException.
+ """
if "result" not in metadata or metadata["result"] != "fail":
return
if "error_type" in metadata:
@@ -354,23 +108,22 @@ def find_exception_from_metadata(metadata, job_id):
raise MesaCIException(
f"LAVA job {job_id} failed validation (possible download error). Retry."
)
- return metadata
-def find_lava_error(job) -> None:
- # Look for infrastructure errors and retry if we see them.
- results_yaml = _call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id)
+def raise_lava_error(job) -> None:
+ # Look for infrastructure errors, raise them, and retry if we see them.
+ results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id)
results = lava_yaml.load(results_yaml)
for res in results:
metadata = res["metadata"]
- find_exception_from_metadata(metadata, job.job_id)
+ raise_exception_from_metadata(metadata, job.job_id)
# If we reach this far, it means that the job ended without hwci script
# result and no LAVA infrastructure problem was found
job.status = "fail"
-def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
+def show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
with GitlabSection(
"job_data",
"LAVA job info",
@@ -378,12 +131,41 @@ def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}")
start_collapsed=True,
colour=colour,
):
- show = _call_proxy(job.proxy.scheduler.jobs.show, job.job_id)
- for field, value in show.items():
+ wait_post_processing_retries: int = WAIT_FOR_LAVA_POST_PROCESSING_RETRIES
+ while not job.is_post_processed() and wait_post_processing_retries > 0:
+ # Wait a little until LAVA finishes processing metadata
+ time.sleep(WAIT_FOR_LAVA_POST_PROCESSING_SEC)
+ wait_post_processing_retries -= 1
+
+ if not job.is_post_processed():
+ waited_for_sec: int = (
+ WAIT_FOR_LAVA_POST_PROCESSING_RETRIES
+ * WAIT_FOR_LAVA_POST_PROCESSING_SEC
+ )
+ print_log(
+ f"Waited for {waited_for_sec} seconds "
+ "for LAVA to post-process the job, it haven't finished yet. "
+ "Dumping it's info anyway"
+ )
+
+ details: dict[str, str] = job.show()
+ for field, value in details.items():
print(f"{field:<15}: {value}")
+ job.refresh_log()
def fetch_logs(job, max_idle_time, log_follower) -> None:
+ is_job_hanging(job, max_idle_time)
+
+ time.sleep(LOG_POLLING_TIME_SEC)
+ new_log_lines = fetch_new_log_lines(job)
+ parsed_lines = parse_log_lines(job, log_follower, new_log_lines)
+
+ for line in parsed_lines:
+ print_log(line)
+
+
+def is_job_hanging(job, max_idle_time):
# Poll to check for new logs, assuming that a prolonged period of
# silence means that the device has died and we should try it again
if datetime.now() - job.last_log_time > max_idle_time:
@@ -398,16 +180,8 @@ def fetch_logs(job, max_idle_time, log_follower) -> None:
timeout_duration=max_idle_time,
)
- time.sleep(LOG_POLLING_TIME_SEC)
- # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
- # Retry the log fetching several times before exposing the error.
- for _ in range(5):
- with contextlib.suppress(MesaCIParseException):
- new_log_lines = job.get_logs()
- break
- else:
- raise MesaCIParseException
+def parse_log_lines(job, log_follower, new_log_lines):
if log_follower.feed(new_log_lines):
# If we had non-empty log data, we can assure that the device is alive.
@@ -422,12 +196,23 @@ def fetch_logs(job, max_idle_time, log_follower) -> None:
LogSectionType.LAVA_POST_PROCESSING,
):
parsed_lines = job.parse_job_result_from_log(parsed_lines)
+ return parsed_lines
- for line in parsed_lines:
- print_log(line)
+
+def fetch_new_log_lines(job):
+
+ # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
+ # Retry the log fetching several times before exposing the error.
+ for _ in range(5):
+ with contextlib.suppress(MesaCIParseException):
+ new_log_lines = job.get_logs()
+ break
+ else:
+ raise MesaCIParseException
+ return new_log_lines
-def follow_job_execution(job):
+def submit_job(job):
try:
job.submit()
except Exception as mesa_ci_err:
@@ -435,11 +220,16 @@ def follow_job_execution(job):
f"Could not submit LAVA job. Reason: {mesa_ci_err}"
) from mesa_ci_err
+
+def wait_for_job_get_started(job):
print_log(f"Waiting for job {job.job_id} to start.")
while not job.is_started():
time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
+ job.refresh_log()
print_log(f"Job {job.job_id} started.")
+
+def bootstrap_log_follower() -> LogFollower:
gl = GitlabSection(
id="lava_boot",
header="LAVA boot",
@@ -447,20 +237,37 @@ def follow_job_execution(job):
start_collapsed=True,
)
print(gl.start())
- max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
- with LogFollower(current_section=gl) as lf:
+ return LogFollower(starting_section=gl)
+
+def follow_job_execution(job, log_follower):
+ with log_follower:
max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
# Start to check job's health
job.heartbeat()
while not job.is_finished:
- fetch_logs(job, max_idle_time, lf)
+ fetch_logs(job, max_idle_time, log_follower)
+ structural_log_phases(job, log_follower)
# Mesa Developers expect to have a simple pass/fail job result.
# If this does not happen, it probably means a LAVA infrastructure error
# happened.
if job.status not in ["pass", "fail"]:
- find_lava_error(job)
+ raise_lava_error(job)
+
+ # LogFollower does some cleanup after the early exit (triggered by
+ # `hwci: pass|fail` regex), let's update the phases after the cleanup.
+ structural_log_phases(job, log_follower)
+
+
+def structural_log_phases(job, log_follower):
+ phases: dict[str, Any] = {
+ s.header.split(" - ")[0]: {
+ k: str(getattr(s, k)) for k in ("start_time", "end_time")
+ }
+ for s in log_follower.section_history
+ }
+ job.log["dut_job_phases"] = phases
def print_job_final_status(job):
@@ -474,64 +281,145 @@ def print_job_final_status(job):
f"{CONSOLE_LOG['RESET']}"
)
- show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
+ job.refresh_log()
+ show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
-def retriable_follow_job(proxy, job_definition) -> LAVAJob:
- retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
+def execute_job_with_retries(
+ proxy, job_definition, retry_count, jobs_log
+) -> Optional[LAVAJob]:
+ last_failed_job = None
for attempt_no in range(1, retry_count + 2):
- job = LAVAJob(proxy, job_definition)
+ # Need to get the logger value from its object to enable autosave
+ # features, if AutoSaveDict is enabled from StructuredLogging module
+ jobs_log.append({})
+ job_log = jobs_log[-1]
+ job = LAVAJob(proxy, job_definition, job_log)
+ STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no
try:
- follow_job_execution(job)
+ job_log["submitter_start_time"] = datetime.now().isoformat()
+ submit_job(job)
+ wait_for_job_get_started(job)
+ log_follower: LogFollower = bootstrap_log_follower()
+ follow_job_execution(job, log_follower)
return job
- except MesaCIKnownIssueException as found_issue:
- print_log(found_issue)
- job.status = "canceled"
- except MesaCIException as mesa_exception:
- print_log(mesa_exception)
- job.cancel()
- except KeyboardInterrupt as e:
- print_log("LAVA job submitter was interrupted. Cancelling the job.")
- job.cancel()
- raise e
+
+ except (MesaCIException, KeyboardInterrupt) as exception:
+ job.handle_exception(exception)
+
finally:
+ print_job_final_status(job)
+ # If LAVA takes too long to post process the job, the submitter
+ # gives up and proceeds.
+ job_log["submitter_end_time"] = datetime.now().isoformat()
+ last_failed_job = job
print_log(
f"{CONSOLE_LOG['BOLD']}"
f"Finished executing LAVA job in the attempt #{attempt_no}"
f"{CONSOLE_LOG['RESET']}"
)
- print_job_final_status(job)
- raise MesaCIRetryError(
- f"{CONSOLE_LOG['BOLD']}"
- f"{CONSOLE_LOG['FG_RED']}"
- "Job failed after it exceeded the number of "
- f"{retry_count} retries."
- f"{CONSOLE_LOG['RESET']}",
- retry_count=retry_count,
+ return last_failed_job
+
+
+def retriable_follow_job(proxy, job_definition) -> LAVAJob:
+ number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
+
+ last_attempted_job = execute_job_with_retries(
+ proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"]
)
+ if last_attempted_job.exception is not None:
+ # Infra failed in all attempts
+ raise MesaCIRetryError(
+ f"{CONSOLE_LOG['BOLD']}"
+ f"{CONSOLE_LOG['FG_RED']}"
+ "Job failed after it exceeded the number of "
+ f"{number_of_retries} retries."
+ f"{CONSOLE_LOG['RESET']}",
+ retry_count=number_of_retries,
+ last_job=last_attempted_job,
+ )
+
+ return last_attempted_job
+
+
+@dataclass
+class PathResolver:
+ def __post_init__(self):
+ for field in fields(self):
+ value = getattr(self, field.name)
+ if not value:
+ continue
+ if field.type == pathlib.Path:
+ value = pathlib.Path(value)
+ setattr(self, field.name, value.resolve())
+
+
+@dataclass
+class LAVAJobSubmitter(PathResolver):
+ boot_method: str
+ ci_project_dir: str
+ device_type: str
+ job_timeout_min: int # The job timeout in minutes
+ build_url: str = None
+ dtb_filename: str = None
+ dump_yaml: bool = False # Whether to dump the YAML payload to stdout
+ first_stage_init: str = None
+ jwt_file: pathlib.Path = None
+ kernel_image_name: str = None
+ kernel_image_type: str = ""
+ kernel_url_prefix: str = None
+ lava_tags: str = "" # Comma-separated LAVA tags for the job
+ mesa_job_name: str = "mesa_ci_job"
+ pipeline_info: str = ""
+ rootfs_url_prefix: str = None
+ validate_only: bool = False # Whether to only validate the job, not execute it
+ visibility_group: str = None # Only affects LAVA farm maintainers
+ job_rootfs_overlay_url: str = None
+ structured_log_file: pathlib.Path = None # Log file path with structured LAVA log
+ ssh_client_image: str = None # x86_64 SSH client image to follow the job's output
+ __structured_log_context = contextlib.nullcontext() # Structured Logger context
+
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ # Remove mesa job names with spaces, which breaks the lava-test-case command
+ self.mesa_job_name = self.mesa_job_name.split(" ")[0]
+
+ if not self.structured_log_file:
+ return
+
+ self.__structured_log_context = StructuredLoggerWrapper(self).logger_context()
+ self.proxy = setup_lava_proxy()
+
+ def __prepare_submission(self) -> str:
+ # Overwrite the timeout for the testcases with the value offered by the
+ # user. The testcase running time should be at least 4 times greater than
+ # the other sections (boot and setup), so we can safely ignore them.
+ # If LAVA fails to stop the job at this stage, it will fall back to the
+ # script section timeout with a reasonable delay.
+ GL_SECTION_TIMEOUTS[LogSectionType.TEST_CASE] = timedelta(
+ minutes=self.job_timeout_min
+ )
-def treat_mesa_job_name(args):
- # Remove mesa job names with spaces, which breaks the lava-test-case command
- args.mesa_job_name = args.mesa_job_name.split(" ")[0]
+ job_definition = generate_lava_job_definition(self)
+ if self.dump_yaml:
+ self.dump_job_definition(job_definition)
-def main(args):
- proxy = setup_lava_proxy()
+ validation_job = LAVAJob(self.proxy, job_definition)
+ if errors := validation_job.validate():
+ fatal_err(f"Error in LAVA job definition: {errors}")
+ print_log("LAVA job definition validated successfully")
- # Overwrite the timeout for the testcases with the value offered by the
- # user. The testcase running time should be at least 4 times greater than
- # the other sections (boot and setup), so we can safely ignore them.
- # If LAVA fails to stop the job at this stage, it will fall back to the
- # script section timeout with a reasonable delay.
- GL_SECTION_TIMEOUTS[LogSectionType.TEST_CASE] = timedelta(minutes=args.job_timeout)
+ return job_definition
- job_definition_stream = StringIO()
- lava_yaml.dump(generate_lava_yaml_payload(args), job_definition_stream)
- job_definition = job_definition_stream.getvalue()
+ @classmethod
+ def is_under_ci(cls):
+ ci_envvar: str = getenv("CI", "false")
+ return ci_envvar.lower() == "true"
- if args.dump_yaml:
+ def dump_job_definition(self, job_definition) -> None:
with GitlabSection(
"yaml_dump",
"LAVA job definition (YAML)",
@@ -539,44 +427,98 @@ def main(args):
start_collapsed=True,
):
print(hide_sensitive_data(job_definition))
- job = LAVAJob(proxy, job_definition)
- if errors := job.validate():
- fatal_err(f"Error in LAVA job definition: {errors}")
- print_log("LAVA job definition validated successfully")
+ def submit(self) -> None:
+ """
+ Prepares and submits the LAVA job.
+ If `validate_only` is True, it validates the job without submitting it.
+ If the job finishes with a non-pass status or encounters an exception,
+ the program exits with a non-zero return code.
+ """
+ job_definition: str = self.__prepare_submission()
- if args.validate_only:
- return
+ if self.validate_only:
+ return
+
+ with self.__structured_log_context:
+ last_attempt_job = None
+ try:
+ last_attempt_job = retriable_follow_job(self.proxy, job_definition)
+
+ except MesaCIRetryError as retry_exception:
+ last_attempt_job = retry_exception.last_job
+
+ except Exception as exception:
+ STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception)
+ raise exception
+
+ finally:
+ self.finish_script(last_attempt_job)
+
+ def print_log_artifact_url(self):
+ base_url = "https://$CI_PROJECT_ROOT_NAMESPACE.pages.freedesktop.org/"
+ artifacts_path = "-/$CI_PROJECT_NAME/-/jobs/$CI_JOB_ID/artifacts/"
+ relative_log_path = self.structured_log_file.relative_to(pathlib.Path.cwd())
+ full_path = f"{base_url}{artifacts_path}{relative_log_path}"
+ artifact_url = path.expandvars(full_path)
+
+ print_log(f"Structural Logging data available at: {artifact_url}")
+
+ def finish_script(self, last_attempt_job):
+ if self.is_under_ci() and self.structured_log_file:
+ self.print_log_artifact_url()
- finished_job = retriable_follow_job(proxy, job_definition)
- exit_code = 0 if finished_job.status == "pass" else 1
- sys.exit(exit_code)
-
-
-def create_parser():
- parser = argparse.ArgumentParser("LAVA job submitter")
-
- parser.add_argument("--pipeline-info")
- parser.add_argument("--rootfs-url-prefix")
- parser.add_argument("--kernel-url-prefix")
- parser.add_argument("--build-url")
- parser.add_argument("--job-rootfs-overlay-url")
- parser.add_argument("--job-timeout", type=int)
- parser.add_argument("--first-stage-init")
- parser.add_argument("--ci-project-dir")
- parser.add_argument("--device-type")
- parser.add_argument("--dtb", nargs='?', default="")
- parser.add_argument("--kernel-image-name")
- parser.add_argument("--kernel-image-type", nargs='?', default="")
- parser.add_argument("--boot-method")
- parser.add_argument("--lava-tags", nargs='?', default="")
- parser.add_argument("--jwt-file", type=pathlib.Path)
- parser.add_argument("--validate-only", action='store_true')
- parser.add_argument("--dump-yaml", action='store_true')
- parser.add_argument("--visibility-group")
- parser.add_argument("--mesa-job-name")
-
- return parser
+ if not last_attempt_job:
+ # No job was run, something bad happened
+ STRUCTURAL_LOG["job_combined_status"] = "script_crash"
+ current_exception = str(sys.exc_info()[0])
+ STRUCTURAL_LOG["job_combined_fail_reason"] = current_exception
+ raise SystemExit(1)
+
+ STRUCTURAL_LOG["job_combined_status"] = last_attempt_job.status
+
+ if last_attempt_job.status != "pass":
+ raise SystemExit(1)
+
+
+class StructuredLoggerWrapper:
+ def __init__(self, submitter: LAVAJobSubmitter) -> None:
+ self.__submitter: LAVAJobSubmitter = submitter
+
+ def _init_logger(self):
+ STRUCTURAL_LOG["fixed_tags"] = self.__submitter.lava_tags
+ STRUCTURAL_LOG["dut_job_type"] = self.__submitter.device_type
+ STRUCTURAL_LOG["job_combined_fail_reason"] = None
+ STRUCTURAL_LOG["job_combined_status"] = "not_submitted"
+ STRUCTURAL_LOG["dut_attempt_counter"] = 0
+
+ # Initialize dut_jobs list to enable appends
+ STRUCTURAL_LOG["dut_jobs"] = []
+
+ @contextlib.contextmanager
+ def _simple_logger_context(self):
+ log_file = pathlib.Path(self.__submitter.structured_log_file)
+ log_file.parent.mkdir(parents=True, exist_ok=True)
+ try:
+ # Truncate the file
+ log_file.write_text("")
+ yield
+ finally:
+ log_file.write_text(json.dumps(STRUCTURAL_LOG, indent=2))
+
+ def logger_context(self):
+ context = contextlib.nullcontext()
+ try:
+
+ global STRUCTURAL_LOG
+ STRUCTURAL_LOG = StructuredLogger(
+ self.__submitter.structured_log_file, truncate=True
+ ).data
+ except NameError:
+ context = self._simple_logger_context()
+
+ self._init_logger()
+ return context
if __name__ == "__main__":
@@ -585,10 +527,11 @@ if __name__ == "__main__":
# more buffering
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
-
- parser = create_parser()
-
- parser.set_defaults(func=main)
- args = parser.parse_args()
- treat_mesa_job_name(args)
- args.func(args)
+ # LAVA farm is giving datetime in UTC timezone, let's set it locally for the
+ # script run.
+ # Setting environ here will not affect the system time, as the os.environ
+ # lifetime follows the script one.
+ environ["TZ"] = "UTC"
+ time.tzset()
+
+ fire.Fire(LAVAJobSubmitter)
diff --git a/lib/mesa/.gitlab-ci/lava/requirements.txt b/lib/mesa/.gitlab-ci/lava/requirements.txt
index 7186eceb9..e89021f3f 100644
--- a/lib/mesa/.gitlab-ci/lava/requirements.txt
+++ b/lib/mesa/.gitlab-ci/lava/requirements.txt
@@ -1 +1,2 @@
lavacli==1.5.2
+fire==0.5.0
diff --git a/lib/mesa/.gitlab-ci/lava/utils/__init__.py b/lib/mesa/.gitlab-ci/lava/utils/__init__.py
index 18bb459c1..349d2b325 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/__init__.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/__init__.py
@@ -1,5 +1,8 @@
from .console_format import CONSOLE_LOG
from .gitlab_section import GitlabSection
+from .lava_job import LAVAJob
+from .lava_job_definition import generate_lava_job_definition
+from .lava_proxy import call_proxy, setup_lava_proxy
from .log_follower import (
LogFollower,
fatal_err,
diff --git a/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py b/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py
index 7e2398d7a..034afb4eb 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
from lava.utils.log_section import LogSectionType
+# TODO: Add section final status to assist with monitoring
@dataclass
class GitlabSection:
id: str
@@ -37,6 +38,14 @@ class GitlabSection:
def has_finished(self) -> bool:
return self.__end_time is not None
+ @property
+ def start_time(self) -> datetime:
+ return self.__start_time
+
+ @property
+ def end_time(self) -> Optional[datetime]:
+ return self.__end_time
+
def get_timestamp(self, time: datetime) -> str:
unix_ts = datetime.timestamp(time)
return str(int(unix_ts))
@@ -54,6 +63,16 @@ class GitlabSection:
return f"{before_header}{header_wrapper}"
+ def __str__(self) -> str:
+ status = "NS" if not self.has_started else "F" if self.has_finished else "IP"
+ delta = self.delta_time()
+ elapsed_time = "N/A" if delta is None else str(delta)
+ return (
+ f"GitlabSection({self.id}, {self.header}, {self.type}, "
+ f"SC={self.start_collapsed}, S={status}, ST={self.start_time}, "
+ f"ET={self.end_time}, ET={elapsed_time})"
+ )
+
def __enter__(self):
print(self.start())
return self
diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_job.py b/lib/mesa/.gitlab-ci/lava/utils/lava_job.py
new file mode 100644
index 000000000..b69f8b9fb
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/lava_job.py
@@ -0,0 +1,186 @@
+import re
+import xmlrpc
+from collections import defaultdict
+from datetime import datetime
+from typing import Any, Optional
+
+from lava.exceptions import (
+ MesaCIException,
+ MesaCIKnownIssueException,
+ MesaCIParseException,
+ MesaCITimeoutError,
+)
+from lava.utils import CONSOLE_LOG
+from lava.utils.log_follower import print_log
+from lavacli.utils import flow_yaml as lava_yaml
+
+from .lava_proxy import call_proxy
+
+
+class LAVAJob:
+ COLOR_STATUS_MAP: dict[str, str] = {
+ "pass": CONSOLE_LOG["FG_GREEN"],
+ "hung": CONSOLE_LOG["FG_YELLOW"],
+ "fail": CONSOLE_LOG["FG_RED"],
+ "canceled": CONSOLE_LOG["FG_MAGENTA"],
+ }
+
+ def __init__(self, proxy, definition, log=defaultdict(str)) -> None:
+ self._job_id = None
+ self.proxy = proxy
+ self.definition = definition
+ self.last_log_line = 0
+ self.last_log_time = None
+ self._is_finished = False
+ self.log: dict[str, Any] = log
+ self.status = "not_submitted"
+ self.__exception: Optional[str] = None
+
+ def heartbeat(self) -> None:
+ self.last_log_time: datetime = datetime.now()
+ self.status = "running"
+
+ @property
+ def status(self) -> str:
+ return self._status
+
+ @status.setter
+ def status(self, new_status: str) -> None:
+ self._status = new_status
+ self.log["status"] = self._status
+
+ @property
+ def job_id(self) -> int:
+ return self._job_id
+
+ @job_id.setter
+ def job_id(self, new_id: int) -> None:
+ self._job_id = new_id
+ self.log["lava_job_id"] = self._job_id
+
+ @property
+ def is_finished(self) -> bool:
+ return self._is_finished
+
+ @property
+ def exception(self) -> str:
+ return self.__exception
+
+ @exception.setter
+ def exception(self, exception: Exception) -> None:
+ self.__exception = repr(exception)
+ self.log["dut_job_fail_reason"] = self.__exception
+
+ def validate(self) -> Optional[dict]:
+ """Returns a dict with errors, if the validation fails.
+
+ Returns:
+ Optional[dict]: a dict with the validation errors, if any
+ """
+ return call_proxy(self.proxy.scheduler.jobs.validate, self.definition, True)
+
+ def show(self) -> dict[str, str]:
+ return call_proxy(self.proxy.scheduler.jobs.show, self._job_id)
+
+ def get_lava_time(self, key, data) -> Optional[str]:
+ return data[key].value if data[key] else None
+
+ def refresh_log(self) -> None:
+ details = self.show()
+ self.log["dut_start_time"] = self.get_lava_time("start_time", details)
+ self.log["dut_submit_time"] = self.get_lava_time("submit_time", details)
+ self.log["dut_end_time"] = self.get_lava_time("end_time", details)
+ self.log["dut_name"] = details.get("device")
+ self.log["dut_state"] = details.get("state")
+
+ def submit(self) -> bool:
+ try:
+ self.job_id = call_proxy(self.proxy.scheduler.jobs.submit, self.definition)
+ self.status = "submitted"
+ self.refresh_log()
+ except MesaCIException:
+ return False
+ return True
+
+ def lava_state(self) -> str:
+ job_state: dict[str, str] = call_proxy(
+ self.proxy.scheduler.job_state, self._job_id
+ )
+ return job_state["job_state"]
+
+ def cancel(self):
+ if self._job_id:
+ self.proxy.scheduler.jobs.cancel(self._job_id)
+ # If we don't have yet set another job's status, let's update it
+ # with canceled one
+ if self.status == "running":
+ self.status = "canceled"
+
+ def is_started(self) -> bool:
+ waiting_states = ("Submitted", "Scheduling", "Scheduled")
+ return self.lava_state() not in waiting_states
+
+ def is_post_processed(self) -> bool:
+ return self.lava_state() != "Running"
+
+ def _load_log_from_data(self, data) -> list[str]:
+ lines = []
+ if isinstance(data, xmlrpc.client.Binary):
+ # We are dealing with xmlrpc.client.Binary
+ # Let's extract the data
+ data = data.data
+ # When there is no new log data, the YAML is empty
+ if loaded_lines := lava_yaml.load(data):
+ lines: list[str] = loaded_lines
+ self.last_log_line += len(lines)
+ return lines
+
+ def get_logs(self) -> list[str]:
+ try:
+ (finished, data) = call_proxy(
+ self.proxy.scheduler.jobs.logs, self._job_id, self.last_log_line
+ )
+ self._is_finished = finished
+ return self._load_log_from_data(data)
+
+ except Exception as mesa_ci_err:
+ raise MesaCIParseException(
+ f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
+ ) from mesa_ci_err
+
+ def parse_job_result_from_log(
+ self, lava_lines: list[dict[str, str]]
+ ) -> list[dict[str, str]]:
+ """Use the console log to catch if the job has completed successfully or
+ not. Returns the list of log lines until the result line."""
+
+ last_line = None # Print all lines. lines[:None] == lines[:]
+
+ for idx, line in enumerate(lava_lines):
+ if result := re.search(r"hwci: mesa: (pass|fail)", line):
+ self._is_finished = True
+ self.status = result[1]
+
+ last_line = idx + 1
+ # We reached the log end here. hwci script has finished.
+ break
+ return lava_lines[:last_line]
+
+ def handle_exception(self, exception: Exception):
+ print_log(exception)
+ self.cancel()
+ self.exception = exception
+
+ # Give more accurate status depending on exception
+ if isinstance(exception, MesaCIKnownIssueException):
+ self.status = "canceled"
+ elif isinstance(exception, MesaCITimeoutError):
+ self.status = "hung"
+ elif isinstance(exception, MesaCIException):
+ self.status = "failed"
+ elif isinstance(exception, KeyboardInterrupt):
+ self.status = "interrupted"
+ print_log("LAVA job submitter was interrupted. Cancelling the job.")
+ raise
+ else:
+ self.status = "job_submitter_error"
diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py
new file mode 100644
index 000000000..c7b43658c
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py
@@ -0,0 +1,150 @@
+from io import StringIO
+from typing import TYPE_CHECKING, Any
+
+import re
+from lava.utils.lava_farm import LavaFarm, get_lava_farm
+from ruamel.yaml.scalarstring import LiteralScalarString
+from ruamel.yaml import YAML
+from os import getenv
+
+if TYPE_CHECKING:
+ from lava.lava_job_submitter import LAVAJobSubmitter
+
+# How many attempts should be made when a timeout happen during LAVA device boot.
+NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3))
+
+# Supports any integers in [0, 100].
+# The scheduler considers the job priority when ordering the queue
+# to consider which job should run next.
+JOB_PRIORITY = int(getenv("JOB_PRIORITY", 75))
+
+
+def has_ssh_support(job_submitter: "LAVAJobSubmitter") -> bool:
+ force_uart = bool(getenv("LAVA_FORCE_UART", False))
+
+ if force_uart:
+ return False
+
+ # Only Collabora's farm supports to run docker container as a LAVA actions,
+ # which is required to follow the job in a SSH section
+ current_farm = get_lava_farm()
+
+ # SSH job definition still needs to add support for fastboot.
+ job_uses_fastboot: bool = job_submitter.boot_method == "fastboot"
+
+ return current_farm == LavaFarm.COLLABORA and not job_uses_fastboot
+
+
+def generate_lava_yaml_payload(job_submitter: "LAVAJobSubmitter") -> dict[str, Any]:
+ """
+ Bridge function to use the supported job definition depending on some Mesa
+ CI job characteristics.
+
+ The strategy here, is to use LAVA with a containerized SSH session to follow
+ the job output, escaping from dumping data to the UART, which proves to be
+ error prone in some devices.
+ """
+ from lava.utils.ssh_job_definition import (
+ generate_lava_yaml_payload as ssh_lava_yaml,
+ )
+ from lava.utils.uart_job_definition import (
+ generate_lava_yaml_payload as uart_lava_yaml,
+ )
+
+ if has_ssh_support(job_submitter):
+ return ssh_lava_yaml(job_submitter)
+
+ return uart_lava_yaml(job_submitter)
+
+
+def generate_lava_job_definition(job_submitter: "LAVAJobSubmitter") -> str:
+ job_stream = StringIO()
+ yaml = YAML()
+ yaml.width = 4096
+ yaml.dump(generate_lava_yaml_payload(job_submitter), job_stream)
+ return job_stream.getvalue()
+
+
+def to_yaml_block(steps_array: list[str], escape_vars=[]) -> LiteralScalarString:
+ def escape_envvar(match):
+ return "\\" + match.group(0)
+
+ filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")]
+ final_str = "\n".join(filtered_array)
+
+ for escape_var in escape_vars:
+ # Find env vars and add '\\' before them
+ final_str = re.sub(rf"\${escape_var}*", escape_envvar, final_str)
+ return LiteralScalarString(final_str)
+
+
+def generate_metadata(args) -> dict[str, Any]:
+ # General metadata and permissions
+ values = {
+ "job_name": f"mesa: {args.pipeline_info}",
+ "device_type": args.device_type,
+ "visibility": {"group": [args.visibility_group]},
+ "priority": JOB_PRIORITY,
+ "context": {
+ "extra_nfsroot_args": " init=/init rootwait usbcore.quirks=0bda:8153:k"
+ },
+ "timeouts": {
+ "job": {"minutes": args.job_timeout_min},
+ "actions": {
+ "depthcharge-retry": {
+ # Could take between 1 and 1.5 min in slower boots
+ "minutes": 4
+ },
+ "depthcharge-start": {
+ # Should take less than 1 min.
+ "minutes": 1,
+ },
+ "depthcharge-action": {
+ # This timeout englobes the entire depthcharge timing,
+ # including retries
+ "minutes": 5
+ * NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ },
+ },
+ },
+ }
+
+ if args.lava_tags:
+ values["tags"] = args.lava_tags.split(",")
+
+ return values
+
+
+def artifact_download_steps(args):
+ """
+ This function is responsible for setting up the SSH server in the DUT and to
+ export the first boot environment to a file.
+ """
+ # Putting JWT pre-processing and mesa download, within init-stage1.sh file,
+ # as we do with non-SSH version.
+ download_steps = [
+ "set -ex",
+ "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 "
+ f"{args.job_rootfs_overlay_url} | tar -xz -C /",
+ f"mkdir -p {args.ci_project_dir}",
+ f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.build_url} | "
+ f"tar --zstd -x -C {args.ci_project_dir}",
+ ]
+
+ # If the JWT file is provided, we will use it to authenticate with the cloud
+ # storage provider and will hide it from the job output in Gitlab.
+ if args.jwt_file:
+ with open(args.jwt_file) as jwt_file:
+ download_steps += [
+ "set +x # HIDE_START",
+ f'echo -n "{jwt_file.read()}" > "{args.jwt_file}"',
+ "set -x # HIDE_END",
+ f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
+ ]
+ else:
+ download_steps += [
+ "echo Could not find jwt file, disabling S3 requests...",
+ "sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh",
+ ]
+
+ return download_steps
diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py b/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py
new file mode 100644
index 000000000..581ec4603
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py
@@ -0,0 +1,44 @@
+import time
+import traceback
+import urllib
+import urllib.parse
+import xmlrpc
+import xmlrpc.client
+
+import lavacli
+
+from .log_follower import fatal_err, print_log
+
+
+def setup_lava_proxy():
+ config = lavacli.load_config("default")
+ uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
+ uri_obj = urllib.parse.urlparse(uri)
+ uri_str = f"{uri_obj.scheme}://{usr}:{tok}@{uri_obj.netloc}{uri_obj.path}"
+ transport = lavacli.RequestsTransport(
+ uri_obj.scheme,
+ config.get("proxy"),
+ config.get("timeout", 120.0),
+ config.get("verify_ssl_cert", True),
+ )
+ proxy = xmlrpc.client.ServerProxy(uri_str, allow_none=True, transport=transport)
+
+ print_log(f'Proxy for {config["uri"]} created.')
+
+ return proxy
+
+
+def call_proxy(fn, *args):
+ retries = 60
+ for n in range(1, retries + 1):
+ try:
+ return fn(*args)
+ except xmlrpc.client.ProtocolError as err:
+ if n == retries:
+ traceback.print_exc()
+ fatal_err(f"A protocol error occurred (Err {err.errcode} {err.errmsg})")
+ else:
+ time.sleep(15)
+ except xmlrpc.client.Fault as err:
+ traceback.print_exc()
+ fatal_err(f"FATAL: Fault: {err.faultString} (code: {err.faultCode})", err)
diff --git a/lib/mesa/.gitlab-ci/lava/utils/log_follower.py b/lib/mesa/.gitlab-ci/lava/utils/log_follower.py
index b2bfcf36c..1fdf490bc 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/log_follower.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/log_follower.py
@@ -32,7 +32,9 @@ from lava.utils.log_section import (
@dataclass
class LogFollower:
- current_section: Optional[GitlabSection] = None
+ starting_section: Optional[GitlabSection] = None
+ _current_section: Optional[GitlabSection] = None
+ section_history: list[GitlabSection] = field(default_factory=list, init=False)
timeout_durations: dict[LogSectionType, timedelta] = field(
default_factory=lambda: DEFAULT_GITLAB_SECTION_TIMEOUTS,
)
@@ -43,9 +45,11 @@ class LogFollower:
_merge_next_line: str = field(default_factory=str, init=False)
def __post_init__(self):
- section_is_created = bool(self.current_section)
+ # Make it trigger current_section setter to populate section history
+ self.current_section = self.starting_section
+ section_is_created = bool(self._current_section)
section_has_started = bool(
- self.current_section and self.current_section.has_started
+ self._current_section and self._current_section.has_started
)
self.log_hints = LAVALogHints(self)
assert (
@@ -57,10 +61,20 @@ class LogFollower:
next(self.gl_section_fix_gen)
@property
+ def current_section(self):
+ return self._current_section
+
+ @current_section.setter
+ def current_section(self, new_section: GitlabSection) -> None:
+ if old_section := self._current_section:
+ self.section_history.append(old_section)
+ self._current_section = new_section
+
+ @property
def phase(self) -> LogSectionType:
return (
- self.current_section.type
- if self.current_section
+ self._current_section.type
+ if self._current_section
else LogSectionType.UNKNOWN
)
@@ -75,22 +89,22 @@ class LogFollower:
print(line)
def watchdog(self):
- if not self.current_section:
+ if not self._current_section:
return
timeout_duration = self.timeout_durations.get(
- self.current_section.type, self.fallback_timeout
+ self._current_section.type, self.fallback_timeout
)
- if self.current_section.delta_time() > timeout_duration:
+ if self._current_section.delta_time() > timeout_duration:
raise MesaCITimeoutError(
- f"Gitlab Section {self.current_section} has timed out",
+ f"Gitlab Section {self._current_section} has timed out",
timeout_duration=timeout_duration,
)
def clear_current_section(self):
- if self.current_section and not self.current_section.has_finished:
- self._buffer.append(self.current_section.end())
+ if self._current_section and not self._current_section.has_finished:
+ self._buffer.append(self._current_section.end())
self.current_section = None
def update_section(self, new_section: GitlabSection):
@@ -110,6 +124,7 @@ class LogFollower:
for log_section in LOG_SECTIONS:
if new_section := log_section.from_log_line_to_section(line):
self.update_section(new_section)
+ break
def detect_kernel_dump_line(self, line: dict[str, Union[str, list]]) -> bool:
# line["msg"] can be a list[str] when there is a kernel dump
@@ -265,18 +280,31 @@ def fix_lava_gitlab_section_log():
-def print_log(msg: str) -> None:
+def print_log(msg: str, *args) -> None:
# Reset color from timestamp, since `msg` can tint the terminal color
- print(f"{CONSOLE_LOG['RESET']}{datetime.now()}: {msg}")
+ print(f"{CONSOLE_LOG['RESET']}{datetime.now()}: {msg}", *args)
-def fatal_err(msg):
+def fatal_err(msg, exception=None):
colored_msg = f"{CONSOLE_LOG['FG_RED']}"
- f"{msg}"
- f"{CONSOLE_LOG['RESET']}"
- print_log(colored_msg)
+ print_log(colored_msg, f"{msg}", f"{CONSOLE_LOG['RESET']}")
+ if exception:
+ raise exception
sys.exit(1)
-def hide_sensitive_data(yaml_data: str, hide_tag: str ="HIDEME"):
- return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)
+def hide_sensitive_data(yaml_data: str, start_hide: str = "HIDE_START", end_hide: str = "HIDE_END") -> str:
+ skip_line = False
+ dump_data: list[str] = []
+ for line in yaml_data.splitlines(True):
+ if start_hide in line:
+ skip_line = True
+ elif end_hide in line:
+ skip_line = False
+
+ if skip_line:
+ continue
+
+ dump_data.append(line)
+
+ return "".join(dump_data)
diff --git a/lib/mesa/.gitlab-ci/lava/utils/log_section.py b/lib/mesa/.gitlab-ci/lava/utils/log_section.py
index b4072667e..25620a615 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/log_section.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/log_section.py
@@ -11,6 +11,7 @@ from lava.utils.gitlab_section import GitlabSection
class LogSectionType(Enum):
UNKNOWN = auto()
LAVA_BOOT = auto()
+ TEST_DUT_SUITE = auto()
TEST_SUITE = auto()
TEST_CASE = auto()
LAVA_POST_PROCESSING = auto()
@@ -24,7 +25,11 @@ class LogSectionType(Enum):
# the enqueue delay.
LAVA_BOOT_TIMEOUT = int(getenv("LAVA_BOOT_TIMEOUT", 9))
-# Test suite phase is where the initialization happens.
+# Test DUT suite phase is where the initialization happens in DUT, not on docker.
+# The device will be listening to SSH session until the end of the job.
+LAVA_TEST_DUT_SUITE_TIMEOUT = int(getenv("JOB_TIMEOUT", 60))
+
+# Test suite phase is where the initialization happens on docker.
LAVA_TEST_SUITE_TIMEOUT = int(getenv("LAVA_TEST_SUITE_TIMEOUT", 5))
# Test cases may take a long time, this script has no right to interrupt
@@ -39,6 +44,7 @@ LAVA_POST_PROCESSING_TIMEOUT = int(getenv("LAVA_POST_PROCESSING_TIMEOUT", 5))
FALLBACK_GITLAB_SECTION_TIMEOUT = timedelta(minutes=10)
DEFAULT_GITLAB_SECTION_TIMEOUTS = {
LogSectionType.LAVA_BOOT: timedelta(minutes=LAVA_BOOT_TIMEOUT),
+ LogSectionType.TEST_DUT_SUITE: timedelta(minutes=LAVA_TEST_DUT_SUITE_TIMEOUT),
LogSectionType.TEST_SUITE: timedelta(minutes=LAVA_TEST_SUITE_TIMEOUT),
LogSectionType.TEST_CASE: timedelta(minutes=LAVA_TEST_CASE_TIMEOUT),
LogSectionType.LAVA_POST_PROCESSING: timedelta(
@@ -83,10 +89,17 @@ LOG_SECTIONS = (
section_type=LogSectionType.TEST_CASE,
),
LogSection(
+ regex=re.compile(r"<?STARTRUN>? ([^>]*ssh.*server.*)"),
+ levels=("debug"),
+ section_id="{}",
+ section_header="[dut] test_suite {}",
+ section_type=LogSectionType.TEST_DUT_SUITE,
+ ),
+ LogSection(
regex=re.compile(r"<?STARTRUN>? ([^>]*)"),
- levels=("target", "debug"),
+ levels=("debug"),
section_id="{}",
- section_header="test_suite {}",
+ section_header="[docker] test_suite {}",
section_type=LogSectionType.TEST_SUITE,
),
LogSection(
diff --git a/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py
new file mode 100644
index 000000000..1308e5ca9
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py
@@ -0,0 +1,208 @@
+"""
+In a few words: some devices in Mesa CI have problematic serial connections; they
+may hang (become silent) intermittently. Every time one hangs for minutes, the
+job is retried, causing delays in the overall pipeline execution, ultimately
+blocking legitimate MRs from merging.
+
+To reduce reliance on UART, we explored LAVA features, such as running docker
+containers as a test alongside the DUT one, to be able to create an SSH server
+in the DUT as early as possible and an SSH client in a docker container, to
+establish an SSH session between both, allowing the console output to be passed
+via an SSH pseudo terminal, instead of relying on the error-prone UART.
+
+In more detail, we aim to use "export -p" to share the initial boot environment
+with SSH LAVA test-cases.
+The "init-stage1.sh" script handles tasks such as system mounting and network
+setup, which are necessary for allocating a pseudo-terminal under "/dev/pts".
+Although these chores are not required for establishing an SSH session, they are
+essential for the proper functioning of the target script given by the HWCI_SCRIPT
+environment variable.
+
+Therefore, we have divided the job definition into four parts:
+
+1. [DUT] Logging in to DUT and run the SSH server with root access.
+2. [DUT] Running the "init-stage1.sh" script for the first SSH test case.
+3. [DUT] Export the first boot environment to `/dut-env-vars.sh` file.
+4. [SSH] Enabling the pseudo-terminal for colors and running the "init-stage2.sh"
+script after sourcing "dut-env-vars.sh" again for the second SSH test case.
+"""
+
+
+from pathlib import Path
+from typing import Any
+
+from .lava_job_definition import (
+ NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ artifact_download_steps,
+ generate_metadata,
+ to_yaml_block,
+)
+
+# Very early SSH server setup. Uses /dut_ready file to flag it is done.
+SSH_SERVER_COMMANDS = {
+ "auto_login": {
+ "login_commands": [
+ "dropbear -R -B",
+ "touch /dut_ready",
+ ],
+ "login_prompt": "ogin:",
+ # To login as root, the username should be empty
+ "username": "",
+ }
+}
+
+# TODO: Extract this inline script to a shell file, like we do with
+# init-stage[12].sh
+# The current way is difficult to maintain because one has to deal with escaping
+# characters for both Python and the resulting job definition YAML.
+# Plus, it is always good to lint bash scripts with shellcheck.
+DOCKER_COMMANDS = [
+ """set -ex
+timeout 1m bash << EOF
+while [ -z "$(lava-target-ip)" ]; do
+ echo Waiting for DUT to join LAN;
+ sleep 1;
+done
+EOF
+
+ping -c 5 -w 60 $(lava-target-ip)
+
+lava_ssh_test_case() {
+ set -x
+ local test_case="${1}"
+ shift
+ lava-test-case \"${test_case}\" --shell \\
+ ssh ${SSH_PTY_ARGS:--T} \\
+ -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \\
+ root@$(lava-target-ip) \"${@}\"
+}""",
+]
+
+
+def generate_dut_test(args):
+ # Commands executed on DUT.
+ # Trying to execute the minimal number of commands, because the console data is
+ # retrieved via UART, which is hang-prone in some devices.
+
+ first_stage_steps: list[str] = Path(args.first_stage_init).read_text().splitlines()
+ return {
+ "namespace": "dut",
+ "definitions": [
+ {
+ "from": "inline",
+ "name": "setup-ssh-server",
+ "path": "inline-setup-ssh-server",
+ "repository": {
+ "metadata": {
+ "format": "Lava-Test Test Definition 1.0",
+ "name": "dut-env-export",
+ },
+ "run": {
+ "steps": [
+ to_yaml_block(first_stage_steps),
+ "export -p > /dut-env-vars.sh", # Exporting the first boot environment
+ ],
+ },
+ },
+ }
+ ],
+ }
+
+
+def generate_docker_test(args):
+ # This is a growing list of commands that will be executed by the docker
+ # guest, which will be the SSH client.
+ docker_commands = []
+
+ # LAVA test wrapping Mesa CI job in a SSH session.
+ init_stages_test = {
+ "namespace": "container",
+ "timeout": {"minutes": args.job_timeout_min},
+ "failure_retry": 3,
+ "definitions": [
+ {
+ "name": "docker_ssh_client",
+ "from": "inline",
+ "path": "inline/docker_ssh_client.yaml",
+ "repository": {
+ "metadata": {
+ "name": "mesa",
+ "description": "Mesa test plan",
+ "format": "Lava-Test Test Definition 1.0",
+ },
+ "run": {"steps": docker_commands},
+ },
+ }
+ ],
+ "docker": {
+ "image": args.ssh_client_image,
+ },
+ }
+
+ docker_commands += [
+ to_yaml_block(DOCKER_COMMANDS, escape_vars=["LAVA_TARGET_IP"]),
+ "lava_ssh_test_case 'wait_for_dut_login' << EOF",
+ "while [ ! -e /dut_ready ]; do sleep 1; done;",
+ "EOF",
+ to_yaml_block(
+ (
+ "lava_ssh_test_case 'artifact_download' 'bash --' << EOF",
+ "source /dut-env-vars.sh",
+ *artifact_download_steps(args),
+ "EOF",
+ )
+ ),
+ "export SSH_PTY_ARGS=-tt",
+ # Putting CI_JOB name as the testcase name, it may help LAVA farm
+ # maintainers with monitoring
+ f"lava_ssh_test_case 'mesa-ci_{args.mesa_job_name}' "
+ # Changing directory to /, as the HWCI_SCRIPT expects that
+ "'\"cd / && /init-stage2.sh\"'",
+ ]
+
+ return init_stages_test
+
+
+def generate_lava_yaml_payload(args) -> dict[str, Any]:
+ values = generate_metadata(args)
+
+ # URLs to our kernel rootfs to boot from, both generated by the base
+ # container build
+ deploy = {
+ "namespace": "dut",
+ "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ "timeout": {"minutes": 10},
+ "timeouts": {"http-download": {"minutes": 2}},
+ "to": "tftp",
+ "os": "oe",
+ "kernel": {"url": f"{args.kernel_url_prefix}/{args.kernel_image_name}"},
+ "nfsrootfs": {
+ "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst",
+ "compression": "zstd",
+ },
+ }
+ if args.kernel_image_type:
+ deploy["kernel"]["type"] = args.kernel_image_type
+ if args.dtb_filename:
+ deploy["dtb"] = {"url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"}
+
+ # always boot over NFS
+ boot = {
+ "namespace": "dut",
+ "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ "method": args.boot_method,
+ "commands": "nfs",
+ "prompts": ["lava-shell:"],
+ **SSH_SERVER_COMMANDS,
+ }
+
+ # only declaring each job as a single 'test' since LAVA's test parsing is
+ # not useful to us
+ values["actions"] = [
+ {"deploy": deploy},
+ {"boot": boot},
+ {"test": generate_dut_test(args)},
+ {"test": generate_docker_test(args)},
+ ]
+
+ return values
diff --git a/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py
new file mode 100644
index 000000000..cd239c321
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py
@@ -0,0 +1,171 @@
+from typing import Any
+from .lava_job_definition import (
+ generate_metadata,
+ NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ artifact_download_steps,
+)
+
+
+def generate_lava_yaml_payload(args) -> dict[str, Any]:
+ values = generate_metadata(args)
+
+ # URLs to our kernel rootfs to boot from, both generated by the base
+ # container build
+
+ nfsrootfs = {
+ "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst",
+ "compression": "zstd",
+ }
+
+ fastboot_deploy_nfs = {
+ "timeout": {"minutes": 10},
+ "to": "nfs",
+ "nfsrootfs": nfsrootfs,
+ }
+
+ fastboot_deploy_prepare = {
+ "timeout": {"minutes": 5},
+ "to": "downloads",
+ "os": "oe",
+ "images": {
+ "kernel": {
+ "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}",
+ },
+ },
+ "postprocess": {
+ "docker": {
+ "image": "registry.gitlab.collabora.com/lava/health-check-docker",
+ "steps": [
+ f"cat Image.gz {args.dtb_filename}.dtb > Image.gz+dtb",
+ "mkbootimg --kernel Image.gz+dtb"
+ + ' --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard rootwait ip=dhcp init=/init"'
+ + " --pagesize 4096 --base 0x80000000 -o boot.img",
+ ],
+ }
+ },
+ }
+ if args.kernel_image_type:
+ fastboot_deploy_prepare["images"]["kernel"]["type"] = args.kernel_image_type
+ if args.dtb_filename:
+ fastboot_deploy_prepare["images"]["dtb"] = {
+ "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"
+ }
+
+ tftp_deploy = {
+ "timeout": {"minutes": 5},
+ "to": "tftp",
+ "os": "oe",
+ "kernel": {
+ "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}",
+ },
+ "nfsrootfs": nfsrootfs,
+ }
+ if args.kernel_image_type:
+ tftp_deploy["kernel"]["type"] = args.kernel_image_type
+ if args.dtb_filename:
+ tftp_deploy["dtb"] = {
+ "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"
+ }
+
+ fastboot_deploy = {
+ "timeout": {"minutes": 2},
+ "to": "fastboot",
+ "docker": {
+ "image": "registry.gitlab.collabora.com/lava/health-check-docker",
+ },
+ "images": {
+ "boot": {"url": "downloads://boot.img"},
+ },
+ }
+
+ fastboot_boot = {
+ "timeout": {"minutes": 2},
+ "docker": {"image": "registry.gitlab.collabora.com/lava/health-check-docker"},
+ "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ "method": args.boot_method,
+ "prompts": ["lava-shell:"],
+ "commands": ["set_active a"],
+ }
+
+ tftp_boot = {
+ "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ "method": args.boot_method,
+ "prompts": ["lava-shell:"],
+ "commands": "nfs",
+ }
+
+ # skeleton test definition: only declaring each job as a single 'test'
+ # since LAVA's test parsing is not useful to us
+ run_steps = []
+ test = {
+ "timeout": {"minutes": args.job_timeout_min},
+ "failure_retry": 1,
+ "definitions": [
+ {
+ "name": "mesa",
+ "from": "inline",
+ "lava-signal": "kmsg",
+ "path": "inline/mesa.yaml",
+ "repository": {
+ "metadata": {
+ "name": "mesa",
+ "description": "Mesa test plan",
+ "os": ["oe"],
+ "scope": ["functional"],
+ "format": "Lava-Test Test Definition 1.0",
+ },
+ "run": {"steps": run_steps},
+ },
+ }
+ ],
+ }
+
+ # job execution script:
+ # - inline .gitlab-ci/common/init-stage1.sh
+ # - fetch and unpack per-pipeline build artifacts from build job
+ # - fetch and unpack per-job environment from lava-submit.sh
+ # - exec .gitlab-ci/common/init-stage2.sh
+
+ with open(args.first_stage_init, "r") as init_sh:
+ run_steps += [
+ x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip()
+ ]
+ # We cannot distribute the Adreno 660 shader firmware inside rootfs,
+ # since the license isn't bundled inside the repository
+ if args.device_type == "sm8350-hdk":
+ run_steps.append(
+ "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 "
+ + "https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn "
+ + '-o "/lib/firmware/qcom/sm8350/a660_zap.mbn"'
+ )
+
+ run_steps += artifact_download_steps(args)
+
+ run_steps += [
+ f"mkdir -p {args.ci_project_dir}",
+ f"curl {args.build_url} | tar --zstd -x -C {args.ci_project_dir}",
+ # Sleep a bit to give time for bash to dump shell xtrace messages into
+ # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some
+ # devices like a618.
+ "sleep 1",
+ # Putting CI_JOB name as the testcase name, it may help LAVA farm
+ # maintainers with monitoring
+ f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh",
+ ]
+
+ if args.boot_method == "fastboot":
+ values["actions"] = [
+ {"deploy": fastboot_deploy_nfs},
+ {"deploy": fastboot_deploy_prepare},
+ {"deploy": fastboot_deploy},
+ {"boot": fastboot_boot},
+ {"test": test},
+ ]
+ else: # tftp
+ values["actions"] = [
+ {"deploy": tftp_deploy},
+ {"boot": tftp_boot},
+ {"test": test},
+ ]
+
+ return values