summaryrefslogtreecommitdiff
path: root/lib/mesa/.gitlab-ci/lava
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mesa/.gitlab-ci/lava')
-rw-r--r--lib/mesa/.gitlab-ci/lava/exceptions.py3
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml77
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava-pytest.sh26
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava-submit.sh54
-rwxr-xr-xlib/mesa/.gitlab-ci/lava/lava_job_submitter.py723
-rw-r--r--lib/mesa/.gitlab-ci/lava/requirements.txt1
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/__init__.py3
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py19
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/lava_job.py186
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py150
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py44
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/log_follower.py66
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/log_section.py19
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py208
-rw-r--r--lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py171
15 files changed, 1256 insertions, 494 deletions
diff --git a/lib/mesa/.gitlab-ci/lava/exceptions.py b/lib/mesa/.gitlab-ci/lava/exceptions.py
index 3c9a63eb3..f877b0245 100644
--- a/lib/mesa/.gitlab-ci/lava/exceptions.py
+++ b/lib/mesa/.gitlab-ci/lava/exceptions.py
@@ -12,9 +12,10 @@ class MesaCITimeoutError(MesaCIException):
class MesaCIRetryError(MesaCIException):
- def __init__(self, *args, retry_count: int) -> None:
+ def __init__(self, *args, retry_count: int, last_job: None) -> None:
super().__init__(*args)
self.retry_count = retry_count
+ self.last_job = last_job
class MesaCIParseException(MesaCIException):
diff --git a/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml b/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml
index 61039de87..de589595a 100755
--- a/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml
+++ b/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml
@@ -1,3 +1,7 @@
+variables:
+ LAVA_SSH_CLIENT_IMAGE: "${CI_REGISTRY_IMAGE}/alpine/x86_64_lava_ssh_client:${ALPINE_X86_64_LAVA_SSH_TAG}--${MESA_TEMPLATES_COMMIT}"
+
+
.lava-test:
# Cancel job if a newer commit is pushed to the same branch
interruptible: true
@@ -7,14 +11,14 @@
# proxy used to cache data locally
FDO_HTTP_CACHE_URI: "http://caching-proxy/cache/?uri="
# base system generated by the container build job, shared between many pipelines
- BASE_SYSTEM_HOST_PREFIX: "${MINIO_HOST}/mesa-lava"
- BASE_SYSTEM_MAINLINE_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${FDO_UPSTREAM_REPO}/${DISTRIBUTION_TAG}/${ARCH}"
- BASE_SYSTEM_FORK_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${CI_PROJECT_PATH}/${DISTRIBUTION_TAG}/${ARCH}"
+ BASE_SYSTEM_HOST_PREFIX: "${S3_HOST}/mesa-lava"
+ BASE_SYSTEM_MAINLINE_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${FDO_UPSTREAM_REPO}/${DISTRIBUTION_TAG}/${DEBIAN_ARCH}"
+ BASE_SYSTEM_FORK_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${CI_PROJECT_PATH}/${DISTRIBUTION_TAG}/${DEBIAN_ARCH}"
# per-job build artifacts
- BUILD_PATH: "${PIPELINE_ARTIFACTS_BASE}/${CI_PROJECT_NAME}-${ARCH}.tar.zst"
JOB_ROOTFS_OVERLAY_PATH: "${JOB_ARTIFACTS_BASE}/job-rootfs-overlay.tar.gz"
JOB_RESULTS_PATH: "${JOB_ARTIFACTS_BASE}/results.tar.zst"
- MINIO_RESULTS_UPLOAD: "${JOB_ARTIFACTS_BASE}"
+ S3_ARTIFACT_NAME: "mesa-${ARCH}-default-debugoptimized"
+ S3_RESULTS_UPLOAD: "${JOB_ARTIFACTS_BASE}"
PIGLIT_NO_WINDOW: 1
VISIBILITY_GROUP: "Collabora+fdo"
script:
@@ -32,45 +36,52 @@
- $RUNNER_TAG
after_script:
- curl -L --retry 4 -f --retry-all-errors --retry-delay 60 -s "https://${JOB_RESULTS_PATH}" | tar --zstd -x
+ needs:
+ - alpine/x86_64_lava_ssh_client
+ - !reference [.required-for-hardware-jobs, needs]
-.lava-test:armhf:
+.lava-test:arm32:
variables:
- ARCH: armhf
+ ARCH: arm32
+ DEBIAN_ARCH: armhf
KERNEL_IMAGE_NAME: zImage
KERNEL_IMAGE_TYPE: "zimage"
BOOT_METHOD: u-boot
extends:
- - .use-debian/arm_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_armhf
- - .use-debian/x86_build
+ - .use-debian/arm64_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm32
+ - .use-debian/x86_64_build
- .lava-test
- .use-kernel+rootfs-arm
needs:
- - kernel+rootfs_armhf
- - debian/x86_build
- - debian-armhf
+ - !reference [.lava-test, needs]
+ - kernel+rootfs_arm32
+ - debian/x86_64_build
+ - debian-arm32
-.lava-test-deqp:armhf:
+.lava-test-deqp:arm32:
extends:
- - .lava-test:armhf
+ - .lava-test:arm32
variables:
HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
.lava-test:arm64:
variables:
ARCH: arm64
+ DEBIAN_ARCH: arm64
KERNEL_IMAGE_NAME: Image
KERNEL_IMAGE_TYPE: "image"
BOOT_METHOD: u-boot
extends:
- - .use-debian/arm_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm64
- - .use-debian/x86_build
+ - .use-debian/arm64_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm64
+ - .use-debian/x86_64_build
- .lava-test
- .use-kernel+rootfs-arm
dependencies:
- debian-arm64
needs:
+ - !reference [.lava-test, needs]
- kernel+rootfs_arm64
- - debian/x86_build
+ - debian/x86_64_build
- debian-arm64
.lava-test-deqp:arm64:
@@ -79,30 +90,34 @@
extends:
- .lava-test:arm64
-.lava-test:amd64:
+.lava-test:x86_64:
variables:
- ARCH: amd64
+ ARCH: x86_64
+ DEBIAN_ARCH: amd64
KERNEL_IMAGE_NAME: bzImage
KERNEL_IMAGE_TYPE: "zimage"
BOOT_METHOD: u-boot
extends:
- - .use-debian/x86_build-base # for same $MESA_ARTIFACTS_BASE_TAG as in kernel+rootfs_amd64
- - .use-debian/x86_build
+ - .use-debian/x86_64_build-base # for same $MESA_ARTIFACTS_BASE_TAG as in kernel+rootfs_x86_64
+ - .use-debian/x86_64_build
- .lava-test
- - .use-kernel+rootfs-amd64
+ - .use-kernel+rootfs-x86_64
needs:
- - kernel+rootfs_amd64
+ - !reference [.lava-test, needs]
+ - kernel+rootfs_x86_64
- debian-testing
-.lava-test-deqp:amd64:
+.lava-test-deqp:x86_64:
variables:
HWCI_TEST_SCRIPT: "/install/deqp-runner.sh"
extends:
- - .lava-test:amd64
+ - .lava-test:x86_64
.lava-traces-base:
variables:
HWCI_TEST_SCRIPT: "/install/piglit/piglit-traces.sh"
+ # Until we overcome infrastructure issues, give traces an extra 5 min before timeout
+ DEVICE_HANGING_TIMEOUT_SEC: 600
artifacts:
reports:
junit: results/junit.xml
@@ -113,15 +128,15 @@
PIGLIT_RESULTS: "${GPU_VERSION}-${PIGLIT_PROFILES}"
HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh"
-.lava-piglit-traces:amd64:
+.lava-piglit-traces:x86_64:
extends:
- - .lava-test:amd64
+ - .lava-test:x86_64
- .lava-piglit
- .lava-traces-base
-.lava-piglit-traces:armhf:
+.lava-piglit-traces:arm32:
extends:
- - .lava-test:armhf
+ - .lava-test:arm32
- .lava-piglit
- .lava-traces-base
@@ -131,9 +146,9 @@
- .lava-piglit
- .lava-traces-base
-.lava-piglit:amd64:
+.lava-piglit:x86_64:
extends:
- - .lava-test:amd64
+ - .lava-test:x86_64
- .lava-piglit
.lava-piglit:arm64:
diff --git a/lib/mesa/.gitlab-ci/lava/lava-pytest.sh b/lib/mesa/.gitlab-ci/lava/lava-pytest.sh
index 9ace8a05f..786a669b9 100755
--- a/lib/mesa/.gitlab-ci/lava/lava-pytest.sh
+++ b/lib/mesa/.gitlab-ci/lava/lava-pytest.sh
@@ -1,35 +1,17 @@
#!/usr/bin/env bash
-#
-# Copyright (C) 2022 Collabora Limited
+# SPDX-License-Identifier: MIT
+# © Collabora Limited
# Author: Guilherme Gallo <guilherme.gallo@collabora.com>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
# This script runs unit/integration tests related with LAVA CI tools
+# shellcheck disable=SC1091 # The relative paths in this file only become valid at runtime.
set -ex
# Use this script in a python virtualenv for isolation
python3 -m venv .venv
. .venv/bin/activate
-python3 -m pip install -r ${CI_PROJECT_DIR}/.gitlab-ci/lava/requirements-test.txt
+python3 -m pip install --break-system-packages -r "${CI_PROJECT_DIR}/.gitlab-ci/lava/requirements-test.txt"
TEST_DIR=${CI_PROJECT_DIR}/.gitlab-ci/tests
diff --git a/lib/mesa/.gitlab-ci/lava/lava-submit.sh b/lib/mesa/.gitlab-ci/lava/lava-submit.sh
index 29d045a68..e02bcb24c 100755
--- a/lib/mesa/.gitlab-ci/lava/lava-submit.sh
+++ b/lib/mesa/.gitlab-ci/lava/lava-submit.sh
@@ -1,15 +1,18 @@
-#!/bin/bash
+#!/usr/bin/env bash
+# shellcheck disable=SC2086 # we want word splitting
-set -e
-set -x
+set -ex
-# Try to use the kernel and rootfs built in mainline first, so we're more
-# likely to hit cache
-if curl -s -X HEAD -L --retry 4 -f --retry-all-errors --retry-delay 60 \
- "https://${BASE_SYSTEM_MAINLINE_HOST_PATH}/done"; then
- BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_MAINLINE_HOST_PATH}"
-else
+# If we run in a fork (not from mesa or Marge-bot), reuse the mainline kernel and rootfs, if they exist.
+BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_MAINLINE_HOST_PATH}"
+if [ "$CI_PROJECT_PATH" != "$FDO_UPSTREAM_REPO" ]; then
+ if ! curl -s -X HEAD -L --retry 4 -f --retry-delay 60 \
+ "https://${BASE_SYSTEM_MAINLINE_HOST_PATH}/done"; then
+ echo "Using kernel and rootfs from the fork, cached from mainline is unavailable."
BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_FORK_HOST_PATH}"
+ else
+ echo "Using the cached mainline kernel and rootfs."
+ fi
fi
rm -rf results
@@ -18,46 +21,41 @@ mkdir -p results/job-rootfs-overlay/
cp artifacts/ci-common/capture-devcoredump.sh results/job-rootfs-overlay/
cp artifacts/ci-common/init-*.sh results/job-rootfs-overlay/
cp artifacts/ci-common/intel-gpu-freq.sh results/job-rootfs-overlay/
+cp artifacts/ci-common/kdl.sh results/job-rootfs-overlay/
cp "$SCRIPTS_DIR"/setup-test-env.sh results/job-rootfs-overlay/
# Prepare env vars for upload.
-KERNEL_IMAGE_BASE_URL="https://${BASE_SYSTEM_HOST_PATH}" \
- artifacts/ci-common/generate-env.sh > results/job-rootfs-overlay/set-job-env-vars.sh
section_start variables "Variables passed through:"
-cat results/job-rootfs-overlay/set-job-env-vars.sh
+artifacts/ci-common/generate-env.sh | tee results/job-rootfs-overlay/set-job-env-vars.sh
section_end variables
tar zcf job-rootfs-overlay.tar.gz -C results/job-rootfs-overlay/ .
ci-fairy s3cp --token-file "${CI_JOB_JWT_FILE}" job-rootfs-overlay.tar.gz "https://${JOB_ROOTFS_OVERLAY_PATH}"
-ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${BUILD_PATH}"
-# Make it take the mesa build from MINIO_ARTIFACT_NAME, if it is specified in
-# the environment. This will make the LAVA behavior consistent with the
-# baremetal jobs.
-if [ -n "${MINIO_ARTIFACT_NAME}" ]
-then
- ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${PIPELINE_ARTIFACTS_BASE}/${MINIO_ARTIFACT_NAME}.tar.zst"
-fi
+ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${PIPELINE_ARTIFACTS_BASE}/${S3_ARTIFACT_NAME:?}.tar.zst"
touch results/lava.log
tail -f results/lava.log &
PYTHONPATH=artifacts/ artifacts/lava/lava_job_submitter.py \
+ submit \
--dump-yaml \
--pipeline-info "$CI_JOB_NAME: $CI_PIPELINE_URL on $CI_COMMIT_REF_NAME ${CI_NODE_INDEX}/${CI_NODE_TOTAL}" \
--rootfs-url-prefix "https://${BASE_SYSTEM_HOST_PATH}" \
- --kernel-url-prefix "https://${BASE_SYSTEM_HOST_PATH}" \
+ --kernel-url-prefix "${KERNEL_IMAGE_BASE}/${DEBIAN_ARCH}" \
--build-url "${ARTIFACT_URL}" \
--job-rootfs-overlay-url "${FDO_HTTP_CACHE_URI:-}https://${JOB_ROOTFS_OVERLAY_PATH}" \
- --job-timeout ${JOB_TIMEOUT:-30} \
+ --job-timeout-min ${JOB_TIMEOUT:-30} \
--first-stage-init artifacts/ci-common/init-stage1.sh \
- --ci-project-dir ${CI_PROJECT_DIR} \
- --device-type ${DEVICE_TYPE} \
- --dtb ${DTB} \
+ --ci-project-dir "${CI_PROJECT_DIR}" \
+ --device-type "${DEVICE_TYPE}" \
+ --dtb-filename "${DTB}" \
--jwt-file "${CI_JOB_JWT_FILE}" \
- --kernel-image-name ${KERNEL_IMAGE_NAME} \
+ --kernel-image-name "${KERNEL_IMAGE_NAME}" \
--kernel-image-type "${KERNEL_IMAGE_TYPE}" \
- --boot-method ${BOOT_METHOD} \
- --visibility-group ${VISIBILITY_GROUP} \
+ --boot-method "${BOOT_METHOD}" \
+ --visibility-group "${VISIBILITY_GROUP}" \
--lava-tags "${LAVA_TAGS}" \
--mesa-job-name "$CI_JOB_NAME" \
+ --structured-log-file "results/lava_job_detail.json" \
+ --ssh-client-image "${LAVA_SSH_CLIENT_IMAGE}" \
>> results/lava.log
diff --git a/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py b/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py
index 5feb4688c..b2d8e5306 100755
--- a/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py
+++ b/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py
@@ -9,25 +9,21 @@
"""Send a job to LAVA, track it and collect log back"""
-
-import argparse
import contextlib
+import json
import pathlib
-import re
import sys
import time
-import traceback
-import urllib.parse
-import xmlrpc.client
+from collections import defaultdict
+from dataclasses import dataclass, fields
from datetime import datetime, timedelta
from io import StringIO
-from os import getenv
+from os import environ, getenv, path
from typing import Any, Optional
-import lavacli
+import fire
from lava.exceptions import (
MesaCIException,
- MesaCIKnownIssueException,
MesaCIParseException,
MesaCIRetryError,
MesaCITimeoutError,
@@ -36,303 +32,61 @@ from lava.utils import CONSOLE_LOG
from lava.utils import DEFAULT_GITLAB_SECTION_TIMEOUTS as GL_SECTION_TIMEOUTS
from lava.utils import (
GitlabSection,
+ LAVAJob,
LogFollower,
LogSectionType,
+ call_proxy,
fatal_err,
+ generate_lava_job_definition,
hide_sensitive_data,
print_log,
+ setup_lava_proxy,
)
from lavacli.utils import flow_yaml as lava_yaml
+# Initialize structural logging with a defaultdict, it can be changed for more
+# sophisticated dict-like data abstractions.
+STRUCTURAL_LOG = defaultdict(list)
+
+try:
+ from ci.structured_logger import StructuredLogger
+except ImportError as e:
+ print_log(
+ f"Could not import StructuredLogger library: {e}. "
+ "Falling back to defaultdict based structured logger."
+ )
+
# Timeout in seconds to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
-DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))
+DEVICE_HANGING_TIMEOUT_SEC = int(getenv("DEVICE_HANGING_TIMEOUT_SEC", 5*60))
# How many seconds the script should wait before try a new polling iteration to
# check if the dispatched LAVA job is running or waiting in the job queue.
-WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))
+WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(
+ getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 1)
+)
+
+# How many seconds the script will wait to let LAVA finalize the job and give
+# the final details.
+WAIT_FOR_LAVA_POST_PROCESSING_SEC = int(getenv("LAVA_WAIT_LAVA_POST_PROCESSING_SEC", 5))
+WAIT_FOR_LAVA_POST_PROCESSING_RETRIES = int(
+ getenv("LAVA_WAIT_LAVA_POST_PROCESSING_RETRIES", 6)
+)
# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))
# How many retries should be made when a timeout happen.
-NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
-
-# How many attempts should be made when a timeout happen during LAVA device boot.
-NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3))
-
-# Supports any integers in [0, 100].
-# The scheduler considers the job priority when ordering the queue
-# to consider which job should run next.
-JOB_PRIORITY = int(getenv("LAVA_JOB_PRIORITY", 75))
-
-
-def generate_lava_yaml_payload(args) -> dict[str, Any]:
- # General metadata and permissions, plus also inexplicably kernel arguments
- values = {
- 'job_name': 'mesa: {}'.format(args.pipeline_info),
- 'device_type': args.device_type,
- 'visibility': { 'group': [ args.visibility_group ] },
- 'priority': JOB_PRIORITY,
- 'context': {
- 'extra_nfsroot_args': ' init=/init rootwait usbcore.quirks=0bda:8153:k'
- },
- "timeouts": {
- "job": {"minutes": args.job_timeout},
- "actions": {
- "depthcharge-retry": {
- # Could take between 1 and 1.5 min in slower boots
- "minutes": 2
- },
- "depthcharge-start": {
- # Should take less than 1 min.
- "minutes": 1,
- },
- "depthcharge-action": {
- # This timeout englobes the entire depthcharge timing,
- # including retries
- "minutes": 2 * NUMBER_OF_ATTEMPTS_LAVA_BOOT,
- },
- }
- },
- }
-
- if args.lava_tags:
- values['tags'] = args.lava_tags.split(',')
-
- # URLs to our kernel rootfs to boot from, both generated by the base
- # container build
- deploy = {
- 'timeout': { 'minutes': 10 },
- 'to': 'tftp',
- 'os': 'oe',
- 'kernel': {
- 'url': '{}/{}'.format(args.kernel_url_prefix, args.kernel_image_name),
- },
- 'nfsrootfs': {
- 'url': '{}/lava-rootfs.tar.zst'.format(args.rootfs_url_prefix),
- 'compression': 'zstd',
- }
- }
- if args.kernel_image_type:
- deploy['kernel']['type'] = args.kernel_image_type
- if args.dtb:
- deploy['dtb'] = {
- 'url': '{}/{}.dtb'.format(args.kernel_url_prefix, args.dtb)
- }
-
- # always boot over NFS
- boot = {
- "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
- "method": args.boot_method,
- "commands": "nfs",
- "prompts": ["lava-shell:"],
- }
-
- # skeleton test definition: only declaring each job as a single 'test'
- # since LAVA's test parsing is not useful to us
- run_steps = []
- test = {
- 'timeout': { 'minutes': args.job_timeout },
- 'failure_retry': 1,
- 'definitions': [ {
- 'name': 'mesa',
- 'from': 'inline',
- 'lava-signal': 'kmsg',
- 'path': 'inline/mesa.yaml',
- 'repository': {
- 'metadata': {
- 'name': 'mesa',
- 'description': 'Mesa test plan',
- 'os': [ 'oe' ],
- 'scope': [ 'functional' ],
- 'format': 'Lava-Test Test Definition 1.0',
- },
- 'run': {
- "steps": run_steps
- },
- },
- } ],
- }
-
- # job execution script:
- # - inline .gitlab-ci/common/init-stage1.sh
- # - fetch and unpack per-pipeline build artifacts from build job
- # - fetch and unpack per-job environment from lava-submit.sh
- # - exec .gitlab-ci/common/init-stage2.sh
-
- with open(args.first_stage_init, "r") as init_sh:
- run_steps += [
- x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip()
- ]
- run_steps.append(
- f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.job_rootfs_overlay_url} | tar -xz -C /",
- )
-
- if args.jwt_file:
- with open(args.jwt_file) as jwt_file:
- run_steps += [
- "set +x",
- f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME',
- "set -x",
- f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
- ]
- else:
- run_steps += [
- "echo Could not find jwt file, disabling MINIO requests...",
- "sed -i '/MINIO_RESULTS_UPLOAD/d' /set-job-env-vars.sh",
- ]
-
- run_steps += [
- 'mkdir -p {}'.format(args.ci_project_dir),
- 'curl {} | tar --zstd -x -C {}'.format(args.build_url, args.ci_project_dir),
-
- # Sleep a bit to give time for bash to dump shell xtrace messages into
- # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some
- # devices like a618.
- 'sleep 1',
-
- # Putting CI_JOB name as the testcase name, it may help LAVA farm
- # maintainers with monitoring
- f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh",
- ]
-
- values['actions'] = [
- { 'deploy': deploy },
- { 'boot': boot },
- { 'test': test },
- ]
-
- return values
-
-
-def setup_lava_proxy():
- config = lavacli.load_config("default")
- uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
- uri_obj = urllib.parse.urlparse(uri)
- uri_str = "{}://{}:{}@{}{}".format(uri_obj.scheme, usr, tok, uri_obj.netloc, uri_obj.path)
- transport = lavacli.RequestsTransport(
- uri_obj.scheme,
- config.get("proxy"),
- config.get("timeout", 120.0),
- config.get("verify_ssl_cert", True),
- )
- proxy = xmlrpc.client.ServerProxy(
- uri_str, allow_none=True, transport=transport)
-
- print_log("Proxy for {} created.".format(config['uri']))
-
- return proxy
-
-
-def _call_proxy(fn, *args):
- retries = 60
- for n in range(1, retries + 1):
- try:
- return fn(*args)
- except xmlrpc.client.ProtocolError as err:
- if n == retries:
- traceback.print_exc()
- fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
- else:
- time.sleep(15)
- except xmlrpc.client.Fault as err:
- traceback.print_exc()
- fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
-
-
-class LAVAJob:
- COLOR_STATUS_MAP = {
- "pass": CONSOLE_LOG["FG_GREEN"],
- "hung": CONSOLE_LOG["FG_YELLOW"],
- "fail": CONSOLE_LOG["FG_RED"],
- "canceled": CONSOLE_LOG["FG_MAGENTA"],
- }
-
- def __init__(self, proxy, definition):
- self.job_id = None
- self.proxy = proxy
- self.definition = definition
- self.last_log_line = 0
- self.last_log_time = None
- self.is_finished = False
- self.status = "created"
-
- def heartbeat(self):
- self.last_log_time = datetime.now()
- self.status = "running"
-
- def validate(self) -> Optional[dict]:
- """Returns a dict with errors, if the validation fails.
-
- Returns:
- Optional[dict]: a dict with the validation errors, if any
- """
- return _call_proxy(self.proxy.scheduler.jobs.validate, self.definition, True)
-
- def submit(self):
- try:
- self.job_id = _call_proxy(self.proxy.scheduler.jobs.submit, self.definition)
- except MesaCIException:
- return False
- return True
-
- def cancel(self):
- if self.job_id:
- self.proxy.scheduler.jobs.cancel(self.job_id)
-
- def is_started(self) -> bool:
- waiting_states = ["Submitted", "Scheduling", "Scheduled"]
- job_state: dict[str, str] = _call_proxy(
- self.proxy.scheduler.job_state, self.job_id
- )
- return job_state["job_state"] not in waiting_states
-
- def _load_log_from_data(self, data) -> list[str]:
- lines = []
- if isinstance(data, xmlrpc.client.Binary):
- # We are dealing with xmlrpc.client.Binary
- # Let's extract the data
- data = data.data
- # When there is no new log data, the YAML is empty
- if loaded_lines := lava_yaml.load(data):
- lines = loaded_lines
- self.last_log_line += len(lines)
- return lines
-
- def get_logs(self) -> list[str]:
- try:
- (finished, data) = _call_proxy(
- self.proxy.scheduler.jobs.logs, self.job_id, self.last_log_line
- )
- self.is_finished = finished
- return self._load_log_from_data(data)
-
- except Exception as mesa_ci_err:
- raise MesaCIParseException(
- f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
- ) from mesa_ci_err
-
- def parse_job_result_from_log(
- self, lava_lines: list[dict[str, str]]
- ) -> list[dict[str, str]]:
- """Use the console log to catch if the job has completed successfully or
- not. Returns the list of log lines until the result line."""
-
- last_line = None # Print all lines. lines[:None] == lines[:]
-
- for idx, line in enumerate(lava_lines):
- if result := re.search(r"hwci: mesa: (pass|fail)", line):
- self.is_finished = True
- self.status = result.group(1)
-
- last_line = idx + 1
- # We reached the log end here. hwci script has finished.
- break
- return lava_lines[:last_line]
+NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
+ getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
+)
-def find_exception_from_metadata(metadata, job_id):
+def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
+ """
+ Investigate infrastructure errors from the job metadata.
+ If it finds an error, raise it as MesaCIException.
+ """
if "result" not in metadata or metadata["result"] != "fail":
return
if "error_type" in metadata:
@@ -354,23 +108,22 @@ def find_exception_from_metadata(metadata, job_id):
raise MesaCIException(
f"LAVA job {job_id} failed validation (possible download error). Retry."
)
- return metadata
-def find_lava_error(job) -> None:
- # Look for infrastructure errors and retry if we see them.
- results_yaml = _call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id)
+def raise_lava_error(job) -> None:
+ # Look for infrastructure errors, raise them, and retry if we see them.
+ results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id)
results = lava_yaml.load(results_yaml)
for res in results:
metadata = res["metadata"]
- find_exception_from_metadata(metadata, job.job_id)
+ raise_exception_from_metadata(metadata, job.job_id)
# If we reach this far, it means that the job ended without hwci script
# result and no LAVA infrastructure problem was found
job.status = "fail"
-def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
+def show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
with GitlabSection(
"job_data",
"LAVA job info",
@@ -378,12 +131,41 @@ def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}")
start_collapsed=True,
colour=colour,
):
- show = _call_proxy(job.proxy.scheduler.jobs.show, job.job_id)
- for field, value in show.items():
+ wait_post_processing_retries: int = WAIT_FOR_LAVA_POST_PROCESSING_RETRIES
+ while not job.is_post_processed() and wait_post_processing_retries > 0:
+ # Wait a little until LAVA finishes processing metadata
+ time.sleep(WAIT_FOR_LAVA_POST_PROCESSING_SEC)
+ wait_post_processing_retries -= 1
+
+ if not job.is_post_processed():
+ waited_for_sec: int = (
+ WAIT_FOR_LAVA_POST_PROCESSING_RETRIES
+ * WAIT_FOR_LAVA_POST_PROCESSING_SEC
+ )
+ print_log(
+ f"Waited for {waited_for_sec} seconds "
+ "for LAVA to post-process the job, it haven't finished yet. "
+ "Dumping it's info anyway"
+ )
+
+ details: dict[str, str] = job.show()
+ for field, value in details.items():
print(f"{field:<15}: {value}")
+ job.refresh_log()
def fetch_logs(job, max_idle_time, log_follower) -> None:
+ is_job_hanging(job, max_idle_time)
+
+ time.sleep(LOG_POLLING_TIME_SEC)
+ new_log_lines = fetch_new_log_lines(job)
+ parsed_lines = parse_log_lines(job, log_follower, new_log_lines)
+
+ for line in parsed_lines:
+ print_log(line)
+
+
+def is_job_hanging(job, max_idle_time):
# Poll to check for new logs, assuming that a prolonged period of
# silence means that the device has died and we should try it again
if datetime.now() - job.last_log_time > max_idle_time:
@@ -398,16 +180,8 @@ def fetch_logs(job, max_idle_time, log_follower) -> None:
timeout_duration=max_idle_time,
)
- time.sleep(LOG_POLLING_TIME_SEC)
- # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
- # Retry the log fetching several times before exposing the error.
- for _ in range(5):
- with contextlib.suppress(MesaCIParseException):
- new_log_lines = job.get_logs()
- break
- else:
- raise MesaCIParseException
+def parse_log_lines(job, log_follower, new_log_lines):
if log_follower.feed(new_log_lines):
# If we had non-empty log data, we can assure that the device is alive.
@@ -422,12 +196,23 @@ def fetch_logs(job, max_idle_time, log_follower) -> None:
LogSectionType.LAVA_POST_PROCESSING,
):
parsed_lines = job.parse_job_result_from_log(parsed_lines)
+ return parsed_lines
- for line in parsed_lines:
- print_log(line)
+
+def fetch_new_log_lines(job):
+
+ # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
+ # Retry the log fetching several times before exposing the error.
+ for _ in range(5):
+ with contextlib.suppress(MesaCIParseException):
+ new_log_lines = job.get_logs()
+ break
+ else:
+ raise MesaCIParseException
+ return new_log_lines
-def follow_job_execution(job):
+def submit_job(job):
try:
job.submit()
except Exception as mesa_ci_err:
@@ -435,11 +220,16 @@ def follow_job_execution(job):
f"Could not submit LAVA job. Reason: {mesa_ci_err}"
) from mesa_ci_err
+
+def wait_for_job_get_started(job):
print_log(f"Waiting for job {job.job_id} to start.")
while not job.is_started():
time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
+ job.refresh_log()
print_log(f"Job {job.job_id} started.")
+
+def bootstrap_log_follower() -> LogFollower:
gl = GitlabSection(
id="lava_boot",
header="LAVA boot",
@@ -447,20 +237,37 @@ def follow_job_execution(job):
start_collapsed=True,
)
print(gl.start())
- max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
- with LogFollower(current_section=gl) as lf:
+ return LogFollower(starting_section=gl)
+
+def follow_job_execution(job, log_follower):
+ with log_follower:
max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
# Start to check job's health
job.heartbeat()
while not job.is_finished:
- fetch_logs(job, max_idle_time, lf)
+ fetch_logs(job, max_idle_time, log_follower)
+ structural_log_phases(job, log_follower)
# Mesa Developers expect to have a simple pass/fail job result.
# If this does not happen, it probably means a LAVA infrastructure error
# happened.
if job.status not in ["pass", "fail"]:
- find_lava_error(job)
+ raise_lava_error(job)
+
+ # LogFollower does some cleanup after the early exit (triggered by
+ # `hwci: pass|fail` regex), let's update the phases after the cleanup.
+ structural_log_phases(job, log_follower)
+
+
+def structural_log_phases(job, log_follower):
+ phases: dict[str, Any] = {
+ s.header.split(" - ")[0]: {
+ k: str(getattr(s, k)) for k in ("start_time", "end_time")
+ }
+ for s in log_follower.section_history
+ }
+ job.log["dut_job_phases"] = phases
def print_job_final_status(job):
@@ -474,64 +281,145 @@ def print_job_final_status(job):
f"{CONSOLE_LOG['RESET']}"
)
- show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
+ job.refresh_log()
+ show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
-def retriable_follow_job(proxy, job_definition) -> LAVAJob:
- retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
+def execute_job_with_retries(
+ proxy, job_definition, retry_count, jobs_log
+) -> Optional[LAVAJob]:
+ last_failed_job = None
for attempt_no in range(1, retry_count + 2):
- job = LAVAJob(proxy, job_definition)
+ # Need to get the logger value from its object to enable autosave
+ # features, if AutoSaveDict is enabled from StructuredLogging module
+ jobs_log.append({})
+ job_log = jobs_log[-1]
+ job = LAVAJob(proxy, job_definition, job_log)
+ STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no
try:
- follow_job_execution(job)
+ job_log["submitter_start_time"] = datetime.now().isoformat()
+ submit_job(job)
+ wait_for_job_get_started(job)
+ log_follower: LogFollower = bootstrap_log_follower()
+ follow_job_execution(job, log_follower)
return job
- except MesaCIKnownIssueException as found_issue:
- print_log(found_issue)
- job.status = "canceled"
- except MesaCIException as mesa_exception:
- print_log(mesa_exception)
- job.cancel()
- except KeyboardInterrupt as e:
- print_log("LAVA job submitter was interrupted. Cancelling the job.")
- job.cancel()
- raise e
+
+ except (MesaCIException, KeyboardInterrupt) as exception:
+ job.handle_exception(exception)
+
finally:
+ print_job_final_status(job)
+ # If LAVA takes too long to post process the job, the submitter
+ # gives up and proceeds.
+ job_log["submitter_end_time"] = datetime.now().isoformat()
+ last_failed_job = job
print_log(
f"{CONSOLE_LOG['BOLD']}"
f"Finished executing LAVA job in the attempt #{attempt_no}"
f"{CONSOLE_LOG['RESET']}"
)
- print_job_final_status(job)
- raise MesaCIRetryError(
- f"{CONSOLE_LOG['BOLD']}"
- f"{CONSOLE_LOG['FG_RED']}"
- "Job failed after it exceeded the number of "
- f"{retry_count} retries."
- f"{CONSOLE_LOG['RESET']}",
- retry_count=retry_count,
+ return last_failed_job
+
+
+def retriable_follow_job(proxy, job_definition) -> LAVAJob:
+ """Run the job via execute_job_with_retries and return the last attempted job.
+
+ The retry budget is NUMBER_OF_RETRIES_TIMEOUT_DETECTION and the per-attempt
+ logs are collected in STRUCTURAL_LOG["dut_jobs"].
+
+ Raises:
+ MesaCIRetryError: when the last attempted job has an exception recorded;
+ the error carries that job in `last_job`.
+ """
+ number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
+
+ last_attempted_job = execute_job_with_retries(
+ proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"]
)
+ if last_attempted_job.exception is not None:
+ # Infra failed in all attempts
+ raise MesaCIRetryError(
+ f"{CONSOLE_LOG['BOLD']}"
+ f"{CONSOLE_LOG['FG_RED']}"
+ "Job failed after it exceeded the number of "
+ f"{number_of_retries} retries."
+ f"{CONSOLE_LOG['RESET']}",
+ retry_count=number_of_retries,
+ last_job=last_attempted_job,
+ )
+
+ return last_attempted_job
+
+
+@dataclass
+class PathResolver:
+ # Dataclass base whose post-init hook normalises path fields: every field
+ # annotated as pathlib.Path that holds a truthy value is coerced to
+ # pathlib.Path and replaced by its resolved form.
+ def __post_init__(self):
+ for field in fields(self):
+ value = getattr(self, field.name)
+ # Unset (None / empty) values are left untouched.
+ if not value:
+ continue
+ # The value may arrive as a plain string, hence the explicit coercion
+ # before resolving.
+ if field.type == pathlib.Path:
+ value = pathlib.Path(value)
+ setattr(self, field.name, value.resolve())
+
+
+@dataclass
+class LAVAJobSubmitter(PathResolver):
+ boot_method: str
+ ci_project_dir: str
+ device_type: str
+ job_timeout_min: int # The job timeout in minutes
+ build_url: str = None
+ dtb_filename: str = None
+ dump_yaml: bool = False # Whether to dump the YAML payload to stdout
+ first_stage_init: str = None
+ jwt_file: pathlib.Path = None
+ kernel_image_name: str = None
+ kernel_image_type: str = ""
+ kernel_url_prefix: str = None
+ lava_tags: str = "" # Comma-separated LAVA tags for the job
+ mesa_job_name: str = "mesa_ci_job"
+ pipeline_info: str = ""
+ rootfs_url_prefix: str = None
+ validate_only: bool = False # Whether to only validate the job, not execute it
+ visibility_group: str = None # Only affects LAVA farm maintainers
+ job_rootfs_overlay_url: str = None
+ structured_log_file: pathlib.Path = None # Log file path with structured LAVA log
+ ssh_client_image: str = None # x86_64 SSH client image to follow the job's output
+ __structured_log_context = contextlib.nullcontext() # Structured Logger context
+
+ def __post_init__(self) -> None:
+     """Normalise the fields, then set up structured logging and the LAVA proxy.
+
+     Path fields are resolved by the parent hook, the Mesa job name is cut at
+     its first space, the structured logger context is created only when a
+     structured log file was given, and the LAVA proxy is always created.
+     """
+     super().__post_init__()
+     # Remove mesa job names with spaces, which breaks the lava-test-case command
+     self.mesa_job_name = self.mesa_job_name.split(" ")[0]
+
+     # Structured logging is optional: without a log file the class-level
+     # null context stays in place.
+     if self.structured_log_file:
+         self.__structured_log_context = StructuredLoggerWrapper(
+             self
+         ).logger_context()
+
+     # The proxy is used by validation and submission alike, so it must exist
+     # whether or not structured logging is enabled.  Returning early before
+     # this line left `self.proxy` undefined when no log file was given.
+     self.proxy = setup_lava_proxy()
+
+ def __prepare_submission(self) -> str:
+ # Overwrite the timeout for the testcases with the value offered by the
+ # user. The testcase running time should be at least 4 times greater than
+ # the other sections (boot and setup), so we can safely ignore them.
+ # If LAVA fails to stop the job at this stage, it will fall back to the
+ # script section timeout with a reasonable delay.
+ GL_SECTION_TIMEOUTS[LogSectionType.TEST_CASE] = timedelta(
+ minutes=self.job_timeout_min
+ )
-def treat_mesa_job_name(args):
- # Remove mesa job names with spaces, which breaks the lava-test-case command
- args.mesa_job_name = args.mesa_job_name.split(" ")[0]
+ job_definition = generate_lava_job_definition(self)
+ if self.dump_yaml:
+ self.dump_job_definition(job_definition)
-def main(args):
- proxy = setup_lava_proxy()
+ validation_job = LAVAJob(self.proxy, job_definition)
+ if errors := validation_job.validate():
+ fatal_err(f"Error in LAVA job definition: {errors}")
+ print_log("LAVA job definition validated successfully")
- # Overwrite the timeout for the testcases with the value offered by the
- # user. The testcase running time should be at least 4 times greater than
- # the other sections (boot and setup), so we can safely ignore them.
- # If LAVA fails to stop the job at this stage, it will fall back to the
- # script section timeout with a reasonable delay.
- GL_SECTION_TIMEOUTS[LogSectionType.TEST_CASE] = timedelta(minutes=args.job_timeout)
+ return job_definition
- job_definition_stream = StringIO()
- lava_yaml.dump(generate_lava_yaml_payload(args), job_definition_stream)
- job_definition = job_definition_stream.getvalue()
+ @classmethod
+ def is_under_ci(cls):
+     """Return True when the `CI` environment variable equals "true" (any case)."""
+     return getenv("CI", "false").lower() == "true"
- if args.dump_yaml:
+ def dump_job_definition(self, job_definition) -> None:
with GitlabSection(
"yaml_dump",
"LAVA job definition (YAML)",
@@ -539,44 +427,98 @@ def main(args):
start_collapsed=True,
):
print(hide_sensitive_data(job_definition))
- job = LAVAJob(proxy, job_definition)
- if errors := job.validate():
- fatal_err(f"Error in LAVA job definition: {errors}")
- print_log("LAVA job definition validated successfully")
+ def submit(self) -> None:
+ """
+ Prepares and submits the LAVA job.
+ If `validate_only` is True, it validates the job without submitting it.
+ If the job finishes with a non-pass status or encounters an exception,
+ the program exits with a non-zero return code.
+ """
+ job_definition: str = self.__prepare_submission()
- if args.validate_only:
- return
+ if self.validate_only:
+ return
+
+ with self.__structured_log_context:
+ last_attempt_job = None
+ try:
+ last_attempt_job = retriable_follow_job(self.proxy, job_definition)
+
+ except MesaCIRetryError as retry_exception:
+ last_attempt_job = retry_exception.last_job
+
+ except Exception as exception:
+ STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception)
+ raise exception
+
+ finally:
+ self.finish_script(last_attempt_job)
+
+ def print_log_artifact_url(self):
+ """Print the URL under which the structured log file is expected to be published.
+
+ The URL is assembled from a fixed pages.freedesktop.org template whose
+ $CI_* placeholders are expanded from the environment, followed by the
+ log file path relative to the current working directory.
+ """
+ base_url = "https://$CI_PROJECT_ROOT_NAMESPACE.pages.freedesktop.org/"
+ artifacts_path = "-/$CI_PROJECT_NAME/-/jobs/$CI_JOB_ID/artifacts/"
+ # relative_to() raises ValueError if the log file is not located under the
+ # current working directory.
+ relative_log_path = self.structured_log_file.relative_to(pathlib.Path.cwd())
+ full_path = f"{base_url}{artifacts_path}{relative_log_path}"
+ # Placeholders whose variable is not set are left unexpanded by expandvars.
+ artifact_url = path.expandvars(full_path)
+
+ print_log(f"Structural Logging data available at: {artifact_url}")
+
+ def finish_script(self, last_attempt_job):
+ if self.is_under_ci() and self.structured_log_file:
+ self.print_log_artifact_url()
- finished_job = retriable_follow_job(proxy, job_definition)
- exit_code = 0 if finished_job.status == "pass" else 1
- sys.exit(exit_code)
-
-
-def create_parser():
- parser = argparse.ArgumentParser("LAVA job submitter")
-
- parser.add_argument("--pipeline-info")
- parser.add_argument("--rootfs-url-prefix")
- parser.add_argument("--kernel-url-prefix")
- parser.add_argument("--build-url")
- parser.add_argument("--job-rootfs-overlay-url")
- parser.add_argument("--job-timeout", type=int)
- parser.add_argument("--first-stage-init")
- parser.add_argument("--ci-project-dir")
- parser.add_argument("--device-type")
- parser.add_argument("--dtb", nargs='?', default="")
- parser.add_argument("--kernel-image-name")
- parser.add_argument("--kernel-image-type", nargs='?', default="")
- parser.add_argument("--boot-method")
- parser.add_argument("--lava-tags", nargs='?', default="")
- parser.add_argument("--jwt-file", type=pathlib.Path)
- parser.add_argument("--validate-only", action='store_true')
- parser.add_argument("--dump-yaml", action='store_true')
- parser.add_argument("--visibility-group")
- parser.add_argument("--mesa-job-name")
-
- return parser
+ if not last_attempt_job:
+ # No job was run, something bad happened
+ STRUCTURAL_LOG["job_combined_status"] = "script_crash"
+ current_exception = str(sys.exc_info()[0])
+ STRUCTURAL_LOG["job_combined_fail_reason"] = current_exception
+ raise SystemExit(1)
+
+ STRUCTURAL_LOG["job_combined_status"] = last_attempt_job.status
+
+ if last_attempt_job.status != "pass":
+ raise SystemExit(1)
+
+
+class StructuredLoggerWrapper:
+ # Prepares the module-level STRUCTURAL_LOG for a submitter and provides the
+ # context manager under which the submission should run.
+ def __init__(self, submitter: LAVAJobSubmitter) -> None:
+ self.__submitter: LAVAJobSubmitter = submitter
+
+ def _init_logger(self):
+ # Seed the structured log with the submitter's static data and the
+ # initial values of the per-run fields.
+ STRUCTURAL_LOG["fixed_tags"] = self.__submitter.lava_tags
+ STRUCTURAL_LOG["dut_job_type"] = self.__submitter.device_type
+ STRUCTURAL_LOG["job_combined_fail_reason"] = None
+ STRUCTURAL_LOG["job_combined_status"] = "not_submitted"
+ STRUCTURAL_LOG["dut_attempt_counter"] = 0
+
+ # Initialize dut_jobs list to enable appends
+ STRUCTURAL_LOG["dut_jobs"] = []
+
+ @contextlib.contextmanager
+ def _simple_logger_context(self):
+ # Fallback context: creates the log file's parent directory, empties the
+ # file on entry and writes STRUCTURAL_LOG as indented JSON on exit, even
+ # when the body raised.
+ log_file = pathlib.Path(self.__submitter.structured_log_file)
+ log_file.parent.mkdir(parents=True, exist_ok=True)
+ try:
+ # Truncate the file
+ log_file.write_text("")
+ yield
+ finally:
+ log_file.write_text(json.dumps(STRUCTURAL_LOG, indent=2))
+
+ def logger_context(self):
+ """Return the context manager to run the submission under.
+
+ Rebinds the module-level STRUCTURAL_LOG to the data of a StructuredLogger
+ and returns a null context; if that raises NameError, the plain JSON
+ fallback context is returned instead. In both cases the log is seeded
+ through _init_logger() before returning.
+ """
+ context = contextlib.nullcontext()
+ try:
+
+ global STRUCTURAL_LOG
+ STRUCTURAL_LOG = StructuredLogger(
+ self.__submitter.structured_log_file, truncate=True
+ ).data
+ # NOTE(review): presumably NameError means the StructuredLogger name was
+ # never bound (optional import not available) - confirm at the import site.
+ except NameError:
+ context = self._simple_logger_context()
+
+ self._init_logger()
+ return context
if __name__ == "__main__":
@@ -585,10 +527,11 @@ if __name__ == "__main__":
# more buffering
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
-
- parser = create_parser()
-
- parser.set_defaults(func=main)
- args = parser.parse_args()
- treat_mesa_job_name(args)
- args.func(args)
+ # LAVA farm is giving datetime in UTC timezone, let's set it locally for the
+ # script run.
+ # Setting environ here will not affect the system time, as the os.environ
+ # lifetime follows the script one.
+ environ["TZ"] = "UTC"
+ time.tzset()
+
+ fire.Fire(LAVAJobSubmitter)
diff --git a/lib/mesa/.gitlab-ci/lava/requirements.txt b/lib/mesa/.gitlab-ci/lava/requirements.txt
index 7186eceb9..e89021f3f 100644
--- a/lib/mesa/.gitlab-ci/lava/requirements.txt
+++ b/lib/mesa/.gitlab-ci/lava/requirements.txt
@@ -1 +1,2 @@
lavacli==1.5.2
+fire==0.5.0
diff --git a/lib/mesa/.gitlab-ci/lava/utils/__init__.py b/lib/mesa/.gitlab-ci/lava/utils/__init__.py
index 18bb459c1..349d2b325 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/__init__.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/__init__.py
@@ -1,5 +1,8 @@
from .console_format import CONSOLE_LOG
from .gitlab_section import GitlabSection
+from .lava_job import LAVAJob
+from .lava_job_definition import generate_lava_job_definition
+from .lava_proxy import call_proxy, setup_lava_proxy
from .log_follower import (
LogFollower,
fatal_err,
diff --git a/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py b/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py
index 7e2398d7a..034afb4eb 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
from lava.utils.log_section import LogSectionType
+# TODO: Add section final status to assist with monitoring
@dataclass
class GitlabSection:
id: str
@@ -37,6 +38,14 @@ class GitlabSection:
def has_finished(self) -> bool:
return self.__end_time is not None
+ @property
+ def start_time(self) -> datetime:
+ return self.__start_time
+
+ @property
+ def end_time(self) -> Optional[datetime]:
+ return self.__end_time
+
def get_timestamp(self, time: datetime) -> str:
unix_ts = datetime.timestamp(time)
return str(int(unix_ts))
@@ -54,6 +63,16 @@ class GitlabSection:
return f"{before_header}{header_wrapper}"
+ def __str__(self) -> str:
+     """Return a compact one-line description of the section.
+
+     Abbreviations: SC = start collapsed, S = status (NS not started,
+     IP in progress, F finished), ST = start time, ET = end time,
+     DT = elapsed time ("N/A" when delta_time() returns None).
+     """
+     status = "NS" if not self.has_started else "F" if self.has_finished else "IP"
+     delta = self.delta_time()
+     elapsed_time = "N/A" if delta is None else str(delta)
+     # The elapsed time uses its own label; it previously reused "ET=", which
+     # made it indistinguishable from the end time in the output.
+     return (
+         f"GitlabSection({self.id}, {self.header}, {self.type}, "
+         f"SC={self.start_collapsed}, S={status}, ST={self.start_time}, "
+         f"ET={self.end_time}, DT={elapsed_time})"
+     )
+
+
def __enter__(self):
print(self.start())
return self
diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_job.py b/lib/mesa/.gitlab-ci/lava/utils/lava_job.py
new file mode 100644
index 000000000..b69f8b9fb
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/lava_job.py
@@ -0,0 +1,186 @@
+import re
+import xmlrpc
+from collections import defaultdict
+from datetime import datetime
+from typing import Any, Optional
+
+from lava.exceptions import (
+ MesaCIException,
+ MesaCIKnownIssueException,
+ MesaCIParseException,
+ MesaCITimeoutError,
+)
+from lava.utils import CONSOLE_LOG
+from lava.utils.log_follower import print_log
+from lavacli.utils import flow_yaml as lava_yaml
+
+from .lava_proxy import call_proxy
+
+
+class LAVAJob:
+    """Handle for one LAVA job attempt driven through an XML-RPC proxy.
+
+    The instance keeps the submitter-side view of the job (status, job id,
+    failure reason, log cursor) and mirrors most of it into ``self.log`` so the
+    caller can persist it.
+    """
+
+    COLOR_STATUS_MAP: dict[str, str] = {
+        "pass": CONSOLE_LOG["FG_GREEN"],
+        "hung": CONSOLE_LOG["FG_YELLOW"],
+        "fail": CONSOLE_LOG["FG_RED"],
+        "canceled": CONSOLE_LOG["FG_MAGENTA"],
+    }
+
+    def __init__(self, proxy, definition, log=None) -> None:
+        """Create a job that has not been submitted yet.
+
+        Args:
+            proxy: server proxy used for every LAVA scheduler call.
+            definition: job definition handed to the scheduler on
+                validation and submission.
+            log: mapping that receives the job's structured data.  When
+                omitted, a fresh ``defaultdict(str)`` is created for this
+                instance.
+        """
+        self._job_id = None
+        self.proxy = proxy
+        self.definition = definition
+        self.last_log_line = 0
+        self.last_log_time = None
+        self._is_finished = False
+        # A default built in the signature would be a single object shared by
+        # every job created without an explicit log.  Compare against None: an
+        # empty dict handed in by the caller must be kept, not replaced.
+        self.log: dict[str, Any] = defaultdict(str) if log is None else log
+        # The status setter writes into self.log, so it has to come after it.
+        self.status = "not_submitted"
+        self.__exception: Optional[str] = None
+
+    def heartbeat(self) -> None:
+        """Record that the job showed activity now and mark it as running."""
+        self.last_log_time: datetime = datetime.now()
+        self.status = "running"
+
+    @property
+    def status(self) -> str:
+        """Current status string of the job, as tracked by the submitter."""
+        return self._status
+
+    @status.setter
+    def status(self, new_status: str) -> None:
+        self._status = new_status
+        # Keep the structured log in sync with the in-memory status.
+        self.log["status"] = self._status
+
+    @property
+    def job_id(self) -> int:
+        """Identifier returned by the scheduler on submission (None before)."""
+        return self._job_id
+
+    @job_id.setter
+    def job_id(self, new_id: int) -> None:
+        self._job_id = new_id
+        self.log["lava_job_id"] = self._job_id
+
+    @property
+    def is_finished(self) -> bool:
+        """True once the scheduler or the console log reported the job's end."""
+        return self._is_finished
+
+    @property
+    def exception(self) -> Optional[str]:
+        """``repr()`` of the exception recorded for this job, or None."""
+        return self.__exception
+
+    @exception.setter
+    def exception(self, exception: Exception) -> None:
+        # Only the textual form is stored, which keeps the log serialisable.
+        self.__exception = repr(exception)
+        self.log["dut_job_fail_reason"] = self.__exception
+
+    def validate(self) -> Optional[dict]:
+        """Returns a dict with errors, if the validation fails.
+
+        Returns:
+            Optional[dict]: a dict with the validation errors, if any
+        """
+        return call_proxy(self.proxy.scheduler.jobs.validate, self.definition, True)
+
+    def show(self) -> dict[str, str]:
+        """Return the scheduler's details for this job."""
+        return call_proxy(self.proxy.scheduler.jobs.show, self._job_id)
+
+    def get_lava_time(self, key, data) -> Optional[str]:
+        """Return ``data[key].value``, or None when ``data[key]`` is falsy."""
+        return data[key].value if data[key] else None
+
+    def refresh_log(self) -> None:
+        """Copy the DUT timestamps, device name and state from LAVA into the log."""
+        details = self.show()
+        self.log["dut_start_time"] = self.get_lava_time("start_time", details)
+        self.log["dut_submit_time"] = self.get_lava_time("submit_time", details)
+        self.log["dut_end_time"] = self.get_lava_time("end_time", details)
+        self.log["dut_name"] = details.get("device")
+        self.log["dut_state"] = details.get("state")
+
+    def submit(self) -> bool:
+        """Submit the definition to the scheduler.
+
+        Returns:
+            bool: False when a MesaCIException was raised while submitting or
+            refreshing the log, True otherwise.
+        """
+        try:
+            self.job_id = call_proxy(self.proxy.scheduler.jobs.submit, self.definition)
+            self.status = "submitted"
+            self.refresh_log()
+        except MesaCIException:
+            return False
+        return True
+
+    def lava_state(self) -> str:
+        """Return the ``job_state`` field reported by the scheduler."""
+        job_state: dict[str, str] = call_proxy(
+            self.proxy.scheduler.job_state, self._job_id
+        )
+        return job_state["job_state"]
+
+    def cancel(self):
+        """Ask the scheduler to cancel the job, if it was ever submitted."""
+        if self._job_id:
+            self.proxy.scheduler.jobs.cancel(self._job_id)
+            # If we don't have yet set another job's status, let's update it
+            # with canceled one
+            if self.status == "running":
+                self.status = "canceled"
+
+    def is_started(self) -> bool:
+        """True when the job has left the scheduler's waiting states."""
+        waiting_states = ("Submitted", "Scheduling", "Scheduled")
+        return self.lava_state() not in waiting_states
+
+    def is_post_processed(self) -> bool:
+        """True when the scheduler no longer reports the job as "Running"."""
+        return self.lava_state() != "Running"
+
+    def _load_log_from_data(self, data) -> list[str]:
+        """Parse a log payload and advance the log cursor by what was read."""
+        lines = []
+        if isinstance(data, xmlrpc.client.Binary):
+            # We are dealing with xmlrpc.client.Binary
+            # Let's extract the data
+            data = data.data
+        # When there is no new log data, the YAML is empty
+        if loaded_lines := lava_yaml.load(data):
+            lines: list[str] = loaded_lines
+            self.last_log_line += len(lines)
+        return lines
+
+    def get_logs(self) -> list[str]:
+        """Fetch the log lines produced since the last call.
+
+        Also updates the finished flag from the scheduler's answer.
+
+        Raises:
+            MesaCIParseException: on any failure while fetching or parsing.
+        """
+        try:
+            (finished, data) = call_proxy(
+                self.proxy.scheduler.jobs.logs, self._job_id, self.last_log_line
+            )
+            self._is_finished = finished
+            return self._load_log_from_data(data)
+
+        except Exception as mesa_ci_err:
+            raise MesaCIParseException(
+                f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
+            ) from mesa_ci_err
+
+    def parse_job_result_from_log(self, lava_lines: list[str]) -> list[str]:
+        """Use the console log to catch if the job has completed successfully or
+        not. Returns the list of log lines until the result line."""
+        # The lines are fed to re.search, so they are strings.
+
+        last_line = None  # Print all lines. lines[:None] == lines[:]
+
+        for idx, line in enumerate(lava_lines):
+            if result := re.search(r"hwci: mesa: (pass|fail)", line):
+                self._is_finished = True
+                self.status = result[1]
+
+                last_line = idx + 1
+                # We reached the log end here. hwci script has finished.
+                break
+        return lava_lines[:last_line]
+
+    def handle_exception(self, exception: Exception):
+        """Cancel the job and record why the attempt failed.
+
+        Must be called while `exception` is being handled: the
+        KeyboardInterrupt branch re-raises it with a bare ``raise``.
+        """
+        print_log(exception)
+        self.cancel()
+        self.exception = exception
+
+        # Give more accurate status depending on exception
+        if isinstance(exception, MesaCIKnownIssueException):
+            self.status = "canceled"
+        elif isinstance(exception, MesaCITimeoutError):
+            self.status = "hung"
+        elif isinstance(exception, MesaCIException):
+            self.status = "failed"
+        elif isinstance(exception, KeyboardInterrupt):
+            self.status = "interrupted"
+            print_log("LAVA job submitter was interrupted. Cancelling the job.")
+            raise
+        else:
+            self.status = "job_submitter_error"
diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py
new file mode 100644
index 000000000..c7b43658c
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py
@@ -0,0 +1,150 @@
+from io import StringIO
+from typing import TYPE_CHECKING, Any
+
+import re
+from lava.utils.lava_farm import LavaFarm, get_lava_farm
+from ruamel.yaml.scalarstring import LiteralScalarString
+from ruamel.yaml import YAML
+from os import getenv
+
+if TYPE_CHECKING:
+ from lava.lava_job_submitter import LAVAJobSubmitter
+
+# How many attempts should be made when a timeout happens during LAVA device boot.
+NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3))
+
+# Supports any integers in [0, 100].
+# The scheduler considers the job priority when ordering the queue
+# to consider which job should run next.
+JOB_PRIORITY = int(getenv("JOB_PRIORITY", 75))
+
+
+def has_ssh_support(job_submitter: "LAVAJobSubmitter") -> bool:
+ """Tell whether the job should follow its output through SSH.
+
+ Returns False when LAVA_FORCE_UART is set, when the current farm is not
+ Collabora's, or when the job boots with fastboot.
+ """
+ # getenv() returns a string, so any non-empty value (even "0" or "false")
+ # forces UART here.
+ force_uart = bool(getenv("LAVA_FORCE_UART", False))
+
+ if force_uart:
+ return False
+
+ # Only Collabora's farm supports running docker containers as LAVA actions,
+ # which is required to follow the job in an SSH session.
+ current_farm = get_lava_farm()
+
+ # SSH job definition still needs to add support for fastboot.
+ job_uses_fastboot: bool = job_submitter.boot_method == "fastboot"
+
+ return current_farm == LavaFarm.COLLABORA and not job_uses_fastboot
+
+
+def generate_lava_yaml_payload(job_submitter: "LAVAJobSubmitter") -> dict[str, Any]:
+ """
+ Bridge function to use the supported job definition depending on some Mesa
+ CI job characteristics.
+
+ The strategy here, is to use LAVA with a containerized SSH session to follow
+ the job output, escaping from dumping data to the UART, which proves to be
+ error prone in some devices.
+
+ Returns the SSH payload when has_ssh_support() allows it, otherwise the
+ UART payload.
+ """
+ # Imported here rather than at module level: the SSH module imports names
+ # from this one, so a top-level import would be circular.
+ # NOTE(review): presumably the same holds for the UART module - confirm.
+ from lava.utils.ssh_job_definition import (
+ generate_lava_yaml_payload as ssh_lava_yaml,
+ )
+ from lava.utils.uart_job_definition import (
+ generate_lava_yaml_payload as uart_lava_yaml,
+ )
+
+ if has_ssh_support(job_submitter):
+ return ssh_lava_yaml(job_submitter)
+
+ return uart_lava_yaml(job_submitter)
+
+
+def generate_lava_job_definition(job_submitter: "LAVAJobSubmitter") -> str:
+    """Return the job payload for `job_submitter` serialised as YAML text."""
+    dumper = YAML()
+    # NOTE(review): presumably this large width keeps long lines from being
+    # wrapped by the dumper - confirm against the ruamel.yaml documentation.
+    dumper.width = 4096
+    payload = generate_lava_yaml_payload(job_submitter)
+    with StringIO() as buffer:
+        dumper.dump(payload, buffer)
+        return buffer.getvalue()
+
+
+def to_yaml_block(steps_array: list[str], escape_vars=()) -> LiteralScalarString:
+    """Join shell steps into one YAML literal block, escaping selected variables.
+
+    Args:
+        steps_array: shell lines; blank lines and lines starting with "#" are
+            dropped.
+        escape_vars: names of environment variables whose `$NAME` references
+            get a backslash prepended.  Defaults to an immutable empty tuple
+            rather than a shared mutable list.
+
+    Returns:
+        The remaining lines joined by newlines, as a LiteralScalarString.
+    """
+
+    def escape_envvar(match):
+        return "\\" + match.group(0)
+
+    filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")]
+    final_str = "\n".join(filtered_array)
+
+    for escape_var in escape_vars:
+        # Find env vars and add '\\' before them
+        # NOTE(review): the trailing `*` quantifies only the last character of
+        # the name, so the pattern also matches the name minus its final
+        # character - confirm whether that is intended.
+        final_str = re.sub(rf"\${escape_var}*", escape_envvar, final_str)
+    return LiteralScalarString(final_str)
+
+
+def generate_metadata(args) -> dict[str, Any]:
+ """Build the top-level LAVA job fields shared by the job definitions.
+
+ Reads pipeline_info, device_type, visibility_group, job_timeout_min and
+ lava_tags from `args`; a "tags" entry is added only when lava_tags is
+ non-empty (it is split on commas).
+ """
+ # General metadata and permissions
+ values = {
+ "job_name": f"mesa: {args.pipeline_info}",
+ "device_type": args.device_type,
+ "visibility": {"group": [args.visibility_group]},
+ "priority": JOB_PRIORITY,
+ "context": {
+ "extra_nfsroot_args": " init=/init rootwait usbcore.quirks=0bda:8153:k"
+ },
+ "timeouts": {
+ "job": {"minutes": args.job_timeout_min},
+ "actions": {
+ "depthcharge-retry": {
+ # Could take between 1 and 1.5 min in slower boots
+ "minutes": 4
+ },
+ "depthcharge-start": {
+ # Should take less than 1 min.
+ "minutes": 1,
+ },
+ "depthcharge-action": {
+ # This timeout covers the entire depthcharge timing,
+ # including retries
+ "minutes": 5
+ * NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ },
+ },
+ },
+ }
+
+ if args.lava_tags:
+ values["tags"] = args.lava_tags.split(",")
+
+ return values
+
+
+def artifact_download_steps(args):
+ """
+ Build the list of shell commands that fetch the job artifacts.
+
+ The steps download and unpack the rootfs overlay into "/" and the build
+ archive into `args.ci_project_dir`. When `args.jwt_file` is set, its content
+ is read here and re-created under the same path by an `echo` step placed
+ between HIDE_START/HIDE_END markers, and the path is exported as
+ CI_JOB_JWT_FILE in /set-job-env-vars.sh. Otherwise the S3_RESULTS_UPLOAD
+ lines are removed from /set-job-env-vars.sh.
+
+ Returns the commands as a list of strings.
+ """
+ # Putting JWT pre-processing and mesa download, within init-stage1.sh file,
+ # as we do with non-SSH version.
+ download_steps = [
+ "set -ex",
+ "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 "
+ f"{args.job_rootfs_overlay_url} | tar -xz -C /",
+ f"mkdir -p {args.ci_project_dir}",
+ f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.build_url} | "
+ f"tar --zstd -x -C {args.ci_project_dir}",
+ ]
+
+ # If the JWT file is provided, we will use it to authenticate with the cloud
+ # storage provider and will hide it from the job output in Gitlab.
+ if args.jwt_file:
+ with open(args.jwt_file) as jwt_file:
+ download_steps += [
+ "set +x # HIDE_START",
+ f'echo -n "{jwt_file.read()}" > "{args.jwt_file}"',
+ "set -x # HIDE_END",
+ f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
+ ]
+ else:
+ download_steps += [
+ "echo Could not find jwt file, disabling S3 requests...",
+ "sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh",
+ ]
+
+ return download_steps
diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py b/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py
new file mode 100644
index 000000000..581ec4603
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py
@@ -0,0 +1,44 @@
+import time
+import traceback
+import urllib
+import urllib.parse
+import xmlrpc
+import xmlrpc.client
+
+import lavacli
+
+from .log_follower import fatal_err, print_log
+
+
+def setup_lava_proxy():
+ """Create the XML-RPC proxy from the "default" lavacli configuration.
+
+ The URI, username and token come from the configuration; the credentials
+ are embedded in the URL handed to ServerProxy.
+ """
+ config = lavacli.load_config("default")
+ uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
+ uri_obj = urllib.parse.urlparse(uri)
+ uri_str = f"{uri_obj.scheme}://{usr}:{tok}@{uri_obj.netloc}{uri_obj.path}"
+ transport = lavacli.RequestsTransport(
+ uri_obj.scheme,
+ config.get("proxy"),
+ config.get("timeout", 120.0),
+ config.get("verify_ssl_cert", True),
+ )
+ proxy = xmlrpc.client.ServerProxy(uri_str, allow_none=True, transport=transport)
+
+ # The configured URI is printed, not uri_str, which carries the token.
+ print_log(f'Proxy for {config["uri"]} created.')
+
+ return proxy
+
+
+def call_proxy(fn, *args):
+ """Call `fn(*args)` and return its result, retrying on protocol errors.
+
+ A ProtocolError is retried up to 60 times with a 15 second pause between
+ attempts; after the last attempt the traceback is printed and fatal_err()
+ is called. A Fault is never retried: its traceback is printed and
+ fatal_err() is called with the fault itself.
+ """
+ retries = 60
+ for n in range(1, retries + 1):
+ try:
+ return fn(*args)
+ except xmlrpc.client.ProtocolError as err:
+ if n == retries:
+ traceback.print_exc()
+ fatal_err(f"A protocol error occurred (Err {err.errcode} {err.errmsg})")
+ else:
+ time.sleep(15)
+ except xmlrpc.client.Fault as err:
+ traceback.print_exc()
+ fatal_err(f"FATAL: Fault: {err.faultString} (code: {err.faultCode})", err)
diff --git a/lib/mesa/.gitlab-ci/lava/utils/log_follower.py b/lib/mesa/.gitlab-ci/lava/utils/log_follower.py
index b2bfcf36c..1fdf490bc 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/log_follower.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/log_follower.py
@@ -32,7 +32,9 @@ from lava.utils.log_section import (
@dataclass
class LogFollower:
- current_section: Optional[GitlabSection] = None
+ starting_section: Optional[GitlabSection] = None
+ _current_section: Optional[GitlabSection] = None
+ section_history: list[GitlabSection] = field(default_factory=list, init=False)
timeout_durations: dict[LogSectionType, timedelta] = field(
default_factory=lambda: DEFAULT_GITLAB_SECTION_TIMEOUTS,
)
@@ -43,9 +45,11 @@ class LogFollower:
_merge_next_line: str = field(default_factory=str, init=False)
def __post_init__(self):
- section_is_created = bool(self.current_section)
+ # Make it trigger current_section setter to populate section history
+ self.current_section = self.starting_section
+ section_is_created = bool(self._current_section)
section_has_started = bool(
- self.current_section and self.current_section.has_started
+ self._current_section and self._current_section.has_started
)
self.log_hints = LAVALogHints(self)
assert (
@@ -57,10 +61,20 @@ class LogFollower:
next(self.gl_section_fix_gen)
@property
+ def current_section(self):
+ return self._current_section
+
+ @current_section.setter
+ def current_section(self, new_section: GitlabSection) -> None:
+ if old_section := self._current_section:
+ self.section_history.append(old_section)
+ self._current_section = new_section
+
+ @property
def phase(self) -> LogSectionType:
return (
- self.current_section.type
- if self.current_section
+ self._current_section.type
+ if self._current_section
else LogSectionType.UNKNOWN
)
@@ -75,22 +89,22 @@ class LogFollower:
print(line)
def watchdog(self):
- if not self.current_section:
+ if not self._current_section:
return
timeout_duration = self.timeout_durations.get(
- self.current_section.type, self.fallback_timeout
+ self._current_section.type, self.fallback_timeout
)
- if self.current_section.delta_time() > timeout_duration:
+ if self._current_section.delta_time() > timeout_duration:
raise MesaCITimeoutError(
- f"Gitlab Section {self.current_section} has timed out",
+ f"Gitlab Section {self._current_section} has timed out",
timeout_duration=timeout_duration,
)
def clear_current_section(self):
- if self.current_section and not self.current_section.has_finished:
- self._buffer.append(self.current_section.end())
+ if self._current_section and not self._current_section.has_finished:
+ self._buffer.append(self._current_section.end())
self.current_section = None
def update_section(self, new_section: GitlabSection):
@@ -110,6 +124,7 @@ class LogFollower:
for log_section in LOG_SECTIONS:
if new_section := log_section.from_log_line_to_section(line):
self.update_section(new_section)
+ break
def detect_kernel_dump_line(self, line: dict[str, Union[str, list]]) -> bool:
# line["msg"] can be a list[str] when there is a kernel dump
@@ -265,18 +280,31 @@ def fix_lava_gitlab_section_log():
-def print_log(msg: str) -> None:
+def print_log(msg: str, *args) -> None:
# Reset color from timestamp, since `msg` can tint the terminal color
- print(f"{CONSOLE_LOG['RESET']}{datetime.now()}: {msg}")
+ print(f"{CONSOLE_LOG['RESET']}{datetime.now()}: {msg}", *args)
-def fatal_err(msg):
+def fatal_err(msg, exception=None):
colored_msg = f"{CONSOLE_LOG['FG_RED']}"
- f"{msg}"
- f"{CONSOLE_LOG['RESET']}"
- print_log(colored_msg)
+ print_log(colored_msg, f"{msg}", f"{CONSOLE_LOG['RESET']}")
+ if exception:
+ raise exception
sys.exit(1)
-def hide_sensitive_data(yaml_data: str, hide_tag: str ="HIDEME"):
- return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)
+def hide_sensitive_data(
+    yaml_data: str, start_hide: str = "HIDE_START", end_hide: str = "HIDE_END"
+) -> str:
+    """Return `yaml_data` without the lines between the hide markers.
+
+    Dropping starts at a line containing `start_hide` (that line included) and
+    stops at the next line containing `end_hide`, which is kept.  A line that
+    contains both markers counts as a start marker.
+    """
+    hiding = False
+    visible_lines: list[str] = []
+    for line in yaml_data.splitlines(keepends=True):
+        if start_hide in line:
+            hiding = True
+        elif end_hide in line:
+            hiding = False
+
+        if not hiding:
+            visible_lines.append(line)
+
+    return "".join(visible_lines)
diff --git a/lib/mesa/.gitlab-ci/lava/utils/log_section.py b/lib/mesa/.gitlab-ci/lava/utils/log_section.py
index b4072667e..25620a615 100644
--- a/lib/mesa/.gitlab-ci/lava/utils/log_section.py
+++ b/lib/mesa/.gitlab-ci/lava/utils/log_section.py
@@ -11,6 +11,7 @@ from lava.utils.gitlab_section import GitlabSection
class LogSectionType(Enum):
UNKNOWN = auto()
LAVA_BOOT = auto()
+ TEST_DUT_SUITE = auto()
TEST_SUITE = auto()
TEST_CASE = auto()
LAVA_POST_PROCESSING = auto()
@@ -24,7 +25,11 @@ class LogSectionType(Enum):
# the enqueue delay.
LAVA_BOOT_TIMEOUT = int(getenv("LAVA_BOOT_TIMEOUT", 9))
-# Test suite phase is where the initialization happens.
+# Test DUT suite phase is where the initialization happens in DUT, not on docker.
+# The device will be listening to SSH session until the end of the job.
+LAVA_TEST_DUT_SUITE_TIMEOUT = int(getenv("JOB_TIMEOUT", 60))
+
+# Test suite phase is where the initialization happens on docker.
LAVA_TEST_SUITE_TIMEOUT = int(getenv("LAVA_TEST_SUITE_TIMEOUT", 5))
# Test cases may take a long time, this script has no right to interrupt
@@ -39,6 +44,7 @@ LAVA_POST_PROCESSING_TIMEOUT = int(getenv("LAVA_POST_PROCESSING_TIMEOUT", 5))
FALLBACK_GITLAB_SECTION_TIMEOUT = timedelta(minutes=10)
DEFAULT_GITLAB_SECTION_TIMEOUTS = {
LogSectionType.LAVA_BOOT: timedelta(minutes=LAVA_BOOT_TIMEOUT),
+ LogSectionType.TEST_DUT_SUITE: timedelta(minutes=LAVA_TEST_DUT_SUITE_TIMEOUT),
LogSectionType.TEST_SUITE: timedelta(minutes=LAVA_TEST_SUITE_TIMEOUT),
LogSectionType.TEST_CASE: timedelta(minutes=LAVA_TEST_CASE_TIMEOUT),
LogSectionType.LAVA_POST_PROCESSING: timedelta(
@@ -83,10 +89,17 @@ LOG_SECTIONS = (
section_type=LogSectionType.TEST_CASE,
),
LogSection(
+ regex=re.compile(r"<?STARTRUN>? ([^>]*ssh.*server.*)"),
+ levels=("debug"),
+ section_id="{}",
+ section_header="[dut] test_suite {}",
+ section_type=LogSectionType.TEST_DUT_SUITE,
+ ),
+ LogSection(
regex=re.compile(r"<?STARTRUN>? ([^>]*)"),
- levels=("target", "debug"),
+ levels=("debug"),
section_id="{}",
- section_header="test_suite {}",
+ section_header="[docker] test_suite {}",
section_type=LogSectionType.TEST_SUITE,
),
LogSection(
diff --git a/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py
new file mode 100644
index 000000000..1308e5ca9
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py
@@ -0,0 +1,208 @@
+"""
+In a few words: some devices in Mesa CI have problematic serial connections; they
+may hang (become silent) intermittently. Every time one hangs for minutes, the
+job is retried, causing delays in the overall pipeline execution and ultimately
+blocking legitimate MRs from merging.
+
+To reduce reliance on UART, we explored LAVA features, such as running docker
+containers as a test alongside the DUT one, to be able to create an SSH server
+in the DUT as early as possible and an SSH client in a docker container, to
+establish an SSH session between both, allowing the console output to be passed
+via an SSH pseudo-terminal, instead of relying on the error-prone UART.
+
+In more detail, we aim to use "export -p" to share the initial boot environment
+with SSH LAVA test-cases.
+The "init-stage1.sh" script handles tasks such as system mounting and network
+setup, which are necessary for allocating a pseudo-terminal under "/dev/pts".
+Although these chores are not required for establishing an SSH session, they are
+essential for the proper functioning of the target script given by the
+HWCI_SCRIPT environment variable.
+
+Therefore, we have divided the job definition into four parts:
+
+1. [DUT] Logging in to the DUT and running the SSH server with root access.
+2. [DUT] Running the "init-stage1.sh" script for the first SSH test case.
+3. [DUT] Exporting the first boot environment to the `/dut-env-vars.sh` file.
+4. [SSH] Enabling the pseudo-terminal for colors and running the "init-stage2.sh"
+script after sourcing "dut-env-vars.sh" again for the second SSH test case.
+"""
+
+
+from pathlib import Path
+from typing import Any
+
+from .lava_job_definition import (
+ NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ artifact_download_steps,
+ generate_metadata,
+ to_yaml_block,
+)
+
+# Very early SSH server setup. Uses /dut_ready file to flag it is done.
+# These keys are merged into the "boot" action built by
+# generate_lava_yaml_payload() in this module.
+SSH_SERVER_COMMANDS = {
+ "auto_login": {
+ "login_commands": [
+ # NOTE(review): per dropbear(8), -R creates host keys on demand and -B
+ # allows blank-password logins -- confirm for the packaged dropbear.
+ "dropbear -R -B",
+ # Flag file: the docker-side SSH client polls for it before going on.
+ "touch /dut_ready",
+ ],
+ # Partial match, presumably to accept both "Login:" and "login:" -- confirm.
+ "login_prompt": "ogin:",
+ # To login as root, the username should be empty
+ "username": "",
+ }
+}
+
+# TODO: Extract this inline script to a shell file, like we do with
+# init-stage[12].sh
+# The current way is difficult to maintain because one has to deal with escaping
+# characters for both Python and the resulting job definition YAML.
+# Plus, it is always good to lint bash scripts with shellcheck.
+#
+# Shell prelude run by the docker (SSH client) test. It:
+# - waits, bounded by `timeout 1m`, for `lava-target-ip` to print something;
+# - pings the address printed by `lava-target-ip`;
+# - defines lava_ssh_test_case(), which wraps an ssh command to root@<target ip>
+#   in `lava-test-case <name> --shell`. SSH_PTY_ARGS selects the ssh tty flags
+#   and falls back to -T when unset or empty.
+# NOTE(review): the here-document delimiter (EOF) is unquoted, so
+# "$(lava-target-ip)" looks like it is expanded once by the outer shell rather
+# than on every loop iteration -- confirm this is what is intended.
+DOCKER_COMMANDS = [
+ """set -ex
+timeout 1m bash << EOF
+while [ -z "$(lava-target-ip)" ]; do
+ echo Waiting for DUT to join LAN;
+ sleep 1;
+done
+EOF
+
+ping -c 5 -w 60 $(lava-target-ip)
+
+lava_ssh_test_case() {
+ set -x
+ local test_case="${1}"
+ shift
+ lava-test-case \"${test_case}\" --shell \\
+ ssh ${SSH_PTY_ARGS:--T} \\
+ -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \\
+ root@$(lava-target-ip) \"${@}\"
+}""",
+]
+
+
+def generate_dut_test(args):
+    """Build the LAVA test action that runs in the "dut" namespace.
+
+    The action holds one inline test definition with two run steps: the
+    content of the script at ``args.first_stage_init``, split into lines and
+    passed through to_yaml_block(), and an ``export -p`` that dumps the
+    resulting shell environment into ``/dut-env-vars.sh``.
+
+    The number of commands is kept minimal on purpose: the console data of
+    this test is retrieved via UART, which is hang-prone in some devices.
+    """
+    init_script_lines: list[str] = Path(args.first_stage_init).read_text().splitlines()
+
+    run_steps = [
+        to_yaml_block(init_script_lines),
+        # Exporting the first boot environment
+        "export -p > /dut-env-vars.sh",
+    ]
+    test_metadata = {
+        "format": "Lava-Test Test Definition 1.0",
+        "name": "dut-env-export",
+    }
+    ssh_server_definition = {
+        "from": "inline",
+        "name": "setup-ssh-server",
+        "path": "inline-setup-ssh-server",
+        "repository": {
+            "metadata": test_metadata,
+            "run": {"steps": run_steps},
+        },
+    }
+
+    return {"namespace": "dut", "definitions": [ssh_server_definition]}
+
+
+def generate_docker_test(args):
+    """Build the LAVA test action for the docker container acting as SSH client.
+
+    The returned action lives in the "container" namespace, uses
+    ``args.ssh_client_image`` as docker image and ``args.job_timeout_min`` as
+    timeout. Its run steps are, in order:
+
+    1. the DOCKER_COMMANDS shell prelude;
+    2. a ``wait_for_dut_login`` test case that polls for ``/dut_ready``;
+    3. an ``artifact_download`` test case that sources ``/dut-env-vars.sh``
+       and runs the steps from artifact_download_steps();
+    4. the ``mesa-ci_<job name>`` test case that runs ``/init-stage2.sh``
+       after ``SSH_PTY_ARGS=-tt`` has been exported.
+    """
+    # Steps executed by the docker guest, which will be the SSH client. The
+    # definition below keeps a reference to this very list, which is only
+    # populated once the skeleton exists.
+    ssh_client_steps = []
+
+    client_definition = {
+        "name": "docker_ssh_client",
+        "from": "inline",
+        "path": "inline/docker_ssh_client.yaml",
+        "repository": {
+            "metadata": {
+                "name": "mesa",
+                "description": "Mesa test plan",
+                "format": "Lava-Test Test Definition 1.0",
+            },
+            "run": {"steps": ssh_client_steps},
+        },
+    }
+
+    # LAVA test wrapping Mesa CI job in a SSH session.
+    container_test = {
+        "namespace": "container",
+        "timeout": {"minutes": args.job_timeout_min},
+        "failure_retry": 3,
+        "definitions": [client_definition],
+        "docker": {"image": args.ssh_client_image},
+    }
+
+    ssh_client_steps.append(
+        to_yaml_block(DOCKER_COMMANDS, escape_vars=["LAVA_TARGET_IP"])
+    )
+    ssh_client_steps.extend(
+        (
+            "lava_ssh_test_case 'wait_for_dut_login' << EOF",
+            "while [ ! -e /dut_ready ]; do sleep 1; done;",
+            "EOF",
+        )
+    )
+
+    download_script = (
+        "lava_ssh_test_case 'artifact_download' 'bash --' << EOF",
+        "source /dut-env-vars.sh",
+        *artifact_download_steps(args),
+        "EOF",
+    )
+    ssh_client_steps.append(to_yaml_block(download_script))
+    ssh_client_steps.append("export SSH_PTY_ARGS=-tt")
+
+    # Putting CI_JOB name as the testcase name, it may help LAVA farm
+    # maintainers with monitoring
+    mesa_test_case = f"lava_ssh_test_case 'mesa-ci_{args.mesa_job_name}' "
+    # Changing directory to /, as the HWCI_SCRIPT expects that
+    mesa_test_case += "'\"cd / && /init-stage2.sh\"'"
+    ssh_client_steps.append(mesa_test_case)
+
+    return container_test
+
+
+def generate_lava_yaml_payload(args) -> dict[str, Any]:
+    """Assemble the LAVA job definition for the SSH-based flow.
+
+    The dict returned by generate_metadata() gets an "actions" list with four
+    entries, in this order: a TFTP deploy, a boot that also carries
+    SSH_SERVER_COMMANDS, the DUT test from generate_dut_test() and the docker
+    test from generate_docker_test().
+    """
+    values = generate_metadata(args)
+
+    # URLs to our kernel rootfs to boot from, both generated by the base
+    # container build
+    kernel = {"url": f"{args.kernel_url_prefix}/{args.kernel_image_name}"}
+    nfsrootfs = {
+        "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst",
+        "compression": "zstd",
+    }
+    if args.kernel_image_type:
+        kernel["type"] = args.kernel_image_type
+
+    deploy = {
+        "namespace": "dut",
+        "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+        "timeout": {"minutes": 10},
+        "timeouts": {"http-download": {"minutes": 2}},
+        "to": "tftp",
+        "os": "oe",
+        "kernel": kernel,
+        "nfsrootfs": nfsrootfs,
+    }
+    if args.dtb_filename:
+        deploy["dtb"] = {"url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"}
+
+    # always boot over NFS
+    boot = {
+        "namespace": "dut",
+        "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+        "method": args.boot_method,
+        "commands": "nfs",
+        "prompts": ["lava-shell:"],
+    }
+    # Appends the auto-login keys after the ones above.
+    boot.update(SSH_SERVER_COMMANDS)
+
+    # only declaring each job as a single 'test' since LAVA's test parsing is
+    # not useful to us
+    stages = (
+        ("deploy", deploy),
+        ("boot", boot),
+        ("test", generate_dut_test(args)),
+        ("test", generate_docker_test(args)),
+    )
+    values["actions"] = [{kind: payload} for kind, payload in stages]
+
+    return values
diff --git a/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py
new file mode 100644
index 000000000..cd239c321
--- /dev/null
+++ b/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py
@@ -0,0 +1,171 @@
+from typing import Any
+from .lava_job_definition import (
+ generate_metadata,
+ NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ artifact_download_steps,
+)
+
+
+def generate_lava_yaml_payload(args) -> dict[str, Any]:
+ """Build the LAVA job definition for a job that runs Mesa CI on the device.
+
+ Starts from generate_metadata(args) and fills in values["actions"]:
+
+ - args.boot_method == "fastboot": three deploy actions (NFS rootfs, kernel
+ download post-processed into boot.img, fastboot deploy of boot.img),
+ then boot and test.
+ - any other boot method: a single TFTP deploy, then boot and test.
+
+ The test action is one inline definition whose steps are, in order: the
+ non-blank lines of args.first_stage_init that do not start with "#", an
+ extra firmware download for the "sm8350-hdk" device type, the steps from
+ artifact_download_steps(args), fetching and unpacking args.build_url into
+ args.ci_project_dir, and finally /init-stage2.sh wrapped in lava-test-case.
+
+ Returns the dict obtained from generate_metadata() with "actions" set.
+ """
+ values = generate_metadata(args)
+
+ # URLs to our kernel rootfs to boot from, both generated by the base
+ # container build
+
+ nfsrootfs = {
+ "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst",
+ "compression": "zstd",
+ }
+
+ fastboot_deploy_nfs = {
+ "timeout": {"minutes": 10},
+ "to": "nfs",
+ "nfsrootfs": nfsrootfs,
+ }
+
+ fastboot_deploy_prepare = {
+ "timeout": {"minutes": 5},
+ "to": "downloads",
+ "os": "oe",
+ "images": {
+ "kernel": {
+ "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}",
+ },
+ },
+ "postprocess": {
+ "docker": {
+ "image": "registry.gitlab.collabora.com/lava/health-check-docker",
+ "steps": [
+ # NOTE(review): dtb_filename is interpolated here even when it is
+ # unset; presumably fastboot devices always provide one -- confirm.
+ f"cat Image.gz {args.dtb_filename}.dtb > Image.gz+dtb",
+ "mkbootimg --kernel Image.gz+dtb"
+ + ' --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard rootwait ip=dhcp init=/init"'
+ + " --pagesize 4096 --base 0x80000000 -o boot.img",
+ ],
+ }
+ },
+ }
+ if args.kernel_image_type:
+ fastboot_deploy_prepare["images"]["kernel"]["type"] = args.kernel_image_type
+ if args.dtb_filename:
+ fastboot_deploy_prepare["images"]["dtb"] = {
+ "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"
+ }
+
+ tftp_deploy = {
+ "timeout": {"minutes": 5},
+ "to": "tftp",
+ "os": "oe",
+ "kernel": {
+ "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}",
+ },
+ "nfsrootfs": nfsrootfs,
+ }
+ if args.kernel_image_type:
+ tftp_deploy["kernel"]["type"] = args.kernel_image_type
+ if args.dtb_filename:
+ tftp_deploy["dtb"] = {
+ "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"
+ }
+
+ fastboot_deploy = {
+ "timeout": {"minutes": 2},
+ "to": "fastboot",
+ "docker": {
+ "image": "registry.gitlab.collabora.com/lava/health-check-docker",
+ },
+ "images": {
+ "boot": {"url": "downloads://boot.img"},
+ },
+ }
+
+ fastboot_boot = {
+ "timeout": {"minutes": 2},
+ "docker": {"image": "registry.gitlab.collabora.com/lava/health-check-docker"},
+ "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ "method": args.boot_method,
+ "prompts": ["lava-shell:"],
+ "commands": ["set_active a"],
+ }
+
+ tftp_boot = {
+ "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
+ "method": args.boot_method,
+ "prompts": ["lava-shell:"],
+ "commands": "nfs",
+ }
+
+ # skeleton test definition: only declaring each job as a single 'test'
+ # since LAVA's test parsing is not useful to us
+ # run_steps is filled in further down; the test definition holds a reference
+ # to this same list object.
+ run_steps = []
+ test = {
+ "timeout": {"minutes": args.job_timeout_min},
+ "failure_retry": 1,
+ "definitions": [
+ {
+ "name": "mesa",
+ "from": "inline",
+ "lava-signal": "kmsg",
+ "path": "inline/mesa.yaml",
+ "repository": {
+ "metadata": {
+ "name": "mesa",
+ "description": "Mesa test plan",
+ "os": ["oe"],
+ "scope": ["functional"],
+ "format": "Lava-Test Test Definition 1.0",
+ },
+ "run": {"steps": run_steps},
+ },
+ }
+ ],
+ }
+
+ # job execution script:
+ # - inline .gitlab-ci/common/init-stage1.sh
+ # - fetch and unpack per-pipeline build artifacts from build job
+ # - fetch and unpack per-job environment from lava-submit.sh
+ # - exec .gitlab-ci/common/init-stage2.sh
+
+ with open(args.first_stage_init, "r") as init_sh:
+ run_steps += [
+ x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip()
+ ]
+ # We cannot distribute the Adreno 660 shader firmware inside rootfs,
+ # since the license isn't bundled inside the repository
+ if args.device_type == "sm8350-hdk":
+ run_steps.append(
+ "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 "
+ + "https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn "
+ + '-o "/lib/firmware/qcom/sm8350/a660_zap.mbn"'
+ )
+
+ run_steps += artifact_download_steps(args)
+
+ run_steps += [
+ f"mkdir -p {args.ci_project_dir}",
+ f"curl {args.build_url} | tar --zstd -x -C {args.ci_project_dir}",
+ # Sleep a bit to give time for bash to dump shell xtrace messages into
+ # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some
+ # devices like a618.
+ "sleep 1",
+ # Putting CI_JOB name as the testcase name, it may help LAVA farm
+ # maintainers with monitoring
+ f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh",
+ ]
+
+ if args.boot_method == "fastboot":
+ values["actions"] = [
+ {"deploy": fastboot_deploy_nfs},
+ {"deploy": fastboot_deploy_prepare},
+ {"deploy": fastboot_deploy},
+ {"boot": fastboot_boot},
+ {"test": test},
+ ]
+ else: # tftp
+ values["actions"] = [
+ {"deploy": tftp_deploy},
+ {"boot": tftp_boot},
+ {"test": test},
+ ]
+
+ return values