diff options
Diffstat (limited to 'lib/mesa/.gitlab-ci/lava')
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/exceptions.py | 3 | ||||
-rwxr-xr-x | lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml | 77 | ||||
-rwxr-xr-x | lib/mesa/.gitlab-ci/lava/lava-pytest.sh | 26 | ||||
-rwxr-xr-x | lib/mesa/.gitlab-ci/lava/lava-submit.sh | 54 | ||||
-rwxr-xr-x | lib/mesa/.gitlab-ci/lava/lava_job_submitter.py | 723 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/requirements.txt | 1 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/__init__.py | 3 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py | 19 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/lava_job.py | 186 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py | 150 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py | 44 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/log_follower.py | 66 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/log_section.py | 19 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py | 208 | ||||
-rw-r--r-- | lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py | 171 |
15 files changed, 1256 insertions, 494 deletions
diff --git a/lib/mesa/.gitlab-ci/lava/exceptions.py b/lib/mesa/.gitlab-ci/lava/exceptions.py index 3c9a63eb3..f877b0245 100644 --- a/lib/mesa/.gitlab-ci/lava/exceptions.py +++ b/lib/mesa/.gitlab-ci/lava/exceptions.py @@ -12,9 +12,10 @@ class MesaCITimeoutError(MesaCIException): class MesaCIRetryError(MesaCIException): - def __init__(self, *args, retry_count: int) -> None: + def __init__(self, *args, retry_count: int, last_job: None) -> None: super().__init__(*args) self.retry_count = retry_count + self.last_job = last_job class MesaCIParseException(MesaCIException): diff --git a/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml b/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml index 61039de87..de589595a 100755 --- a/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml +++ b/lib/mesa/.gitlab-ci/lava/lava-gitlab-ci.yml @@ -1,3 +1,7 @@ +variables: + LAVA_SSH_CLIENT_IMAGE: "${CI_REGISTRY_IMAGE}/alpine/x86_64_lava_ssh_client:${ALPINE_X86_64_LAVA_SSH_TAG}--${MESA_TEMPLATES_COMMIT}" + + .lava-test: # Cancel job if a newer commit is pushed to the same branch interruptible: true @@ -7,14 +11,14 @@ # proxy used to cache data locally FDO_HTTP_CACHE_URI: "http://caching-proxy/cache/?uri=" # base system generated by the container build job, shared between many pipelines - BASE_SYSTEM_HOST_PREFIX: "${MINIO_HOST}/mesa-lava" - BASE_SYSTEM_MAINLINE_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${FDO_UPSTREAM_REPO}/${DISTRIBUTION_TAG}/${ARCH}" - BASE_SYSTEM_FORK_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${CI_PROJECT_PATH}/${DISTRIBUTION_TAG}/${ARCH}" + BASE_SYSTEM_HOST_PREFIX: "${S3_HOST}/mesa-lava" + BASE_SYSTEM_MAINLINE_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${FDO_UPSTREAM_REPO}/${DISTRIBUTION_TAG}/${DEBIAN_ARCH}" + BASE_SYSTEM_FORK_HOST_PATH: "${BASE_SYSTEM_HOST_PREFIX}/${CI_PROJECT_PATH}/${DISTRIBUTION_TAG}/${DEBIAN_ARCH}" # per-job build artifacts - BUILD_PATH: "${PIPELINE_ARTIFACTS_BASE}/${CI_PROJECT_NAME}-${ARCH}.tar.zst" JOB_ROOTFS_OVERLAY_PATH: "${JOB_ARTIFACTS_BASE}/job-rootfs-overlay.tar.gz" 
JOB_RESULTS_PATH: "${JOB_ARTIFACTS_BASE}/results.tar.zst" - MINIO_RESULTS_UPLOAD: "${JOB_ARTIFACTS_BASE}" + S3_ARTIFACT_NAME: "mesa-${ARCH}-default-debugoptimized" + S3_RESULTS_UPLOAD: "${JOB_ARTIFACTS_BASE}" PIGLIT_NO_WINDOW: 1 VISIBILITY_GROUP: "Collabora+fdo" script: @@ -32,45 +36,52 @@ - $RUNNER_TAG after_script: - curl -L --retry 4 -f --retry-all-errors --retry-delay 60 -s "https://${JOB_RESULTS_PATH}" | tar --zstd -x + needs: + - alpine/x86_64_lava_ssh_client + - !reference [.required-for-hardware-jobs, needs] -.lava-test:armhf: +.lava-test:arm32: variables: - ARCH: armhf + ARCH: arm32 + DEBIAN_ARCH: armhf KERNEL_IMAGE_NAME: zImage KERNEL_IMAGE_TYPE: "zimage" BOOT_METHOD: u-boot extends: - - .use-debian/arm_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_armhf - - .use-debian/x86_build + - .use-debian/arm64_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm32 + - .use-debian/x86_64_build - .lava-test - .use-kernel+rootfs-arm needs: - - kernel+rootfs_armhf - - debian/x86_build - - debian-armhf + - !reference [.lava-test, needs] + - kernel+rootfs_arm32 + - debian/x86_64_build + - debian-arm32 -.lava-test-deqp:armhf: +.lava-test-deqp:arm32: extends: - - .lava-test:armhf + - .lava-test:arm32 variables: HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" .lava-test:arm64: variables: ARCH: arm64 + DEBIAN_ARCH: arm64 KERNEL_IMAGE_NAME: Image KERNEL_IMAGE_TYPE: "image" BOOT_METHOD: u-boot extends: - - .use-debian/arm_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm64 - - .use-debian/x86_build + - .use-debian/arm64_build # for same $MESA_ARTIFACTS_TAG as in kernel+rootfs_arm64 + - .use-debian/x86_64_build - .lava-test - .use-kernel+rootfs-arm dependencies: - debian-arm64 needs: + - !reference [.lava-test, needs] - kernel+rootfs_arm64 - - debian/x86_build + - debian/x86_64_build - debian-arm64 .lava-test-deqp:arm64: @@ -79,30 +90,34 @@ extends: - .lava-test:arm64 -.lava-test:amd64: +.lava-test:x86_64: variables: - ARCH: amd64 + ARCH: x86_64 + 
DEBIAN_ARCH: amd64 KERNEL_IMAGE_NAME: bzImage KERNEL_IMAGE_TYPE: "zimage" BOOT_METHOD: u-boot extends: - - .use-debian/x86_build-base # for same $MESA_ARTIFACTS_BASE_TAG as in kernel+rootfs_amd64 - - .use-debian/x86_build + - .use-debian/x86_64_build-base # for same $MESA_ARTIFACTS_BASE_TAG as in kernel+rootfs_x86_64 + - .use-debian/x86_64_build - .lava-test - - .use-kernel+rootfs-amd64 + - .use-kernel+rootfs-x86_64 needs: - - kernel+rootfs_amd64 + - !reference [.lava-test, needs] + - kernel+rootfs_x86_64 - debian-testing -.lava-test-deqp:amd64: +.lava-test-deqp:x86_64: variables: HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" extends: - - .lava-test:amd64 + - .lava-test:x86_64 .lava-traces-base: variables: HWCI_TEST_SCRIPT: "/install/piglit/piglit-traces.sh" + # until we overcome Infrastructure issues, give traces extra 5 min before timeout + DEVICE_HANGING_TIMEOUT_SEC: 600 artifacts: reports: junit: results/junit.xml @@ -113,15 +128,15 @@ PIGLIT_RESULTS: "${GPU_VERSION}-${PIGLIT_PROFILES}" HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" -.lava-piglit-traces:amd64: +.lava-piglit-traces:x86_64: extends: - - .lava-test:amd64 + - .lava-test:x86_64 - .lava-piglit - .lava-traces-base -.lava-piglit-traces:armhf: +.lava-piglit-traces:arm32: extends: - - .lava-test:armhf + - .lava-test:arm32 - .lava-piglit - .lava-traces-base @@ -131,9 +146,9 @@ - .lava-piglit - .lava-traces-base -.lava-piglit:amd64: +.lava-piglit:x86_64: extends: - - .lava-test:amd64 + - .lava-test:x86_64 - .lava-piglit .lava-piglit:arm64: diff --git a/lib/mesa/.gitlab-ci/lava/lava-pytest.sh b/lib/mesa/.gitlab-ci/lava/lava-pytest.sh index 9ace8a05f..786a669b9 100755 --- a/lib/mesa/.gitlab-ci/lava/lava-pytest.sh +++ b/lib/mesa/.gitlab-ci/lava/lava-pytest.sh @@ -1,35 +1,17 @@ #!/usr/bin/env bash -# -# Copyright (C) 2022 Collabora Limited +# SPDX-License-Identifier: MIT +# © Collabora Limited # Author: Guilherme Gallo <guilherme.gallo@collabora.com> -# -# Permission is hereby granted, free of charge, to 
any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. # This script runs unit/integration tests related with LAVA CI tools +# shellcheck disable=SC1091 # The relative paths in this file only become valid at runtime. set -ex # Use this script in a python virtualenv for isolation python3 -m venv .venv . 
.venv/bin/activate -python3 -m pip install -r ${CI_PROJECT_DIR}/.gitlab-ci/lava/requirements-test.txt +python3 -m pip install --break-system-packages -r "${CI_PROJECT_DIR}/.gitlab-ci/lava/requirements-test.txt" TEST_DIR=${CI_PROJECT_DIR}/.gitlab-ci/tests diff --git a/lib/mesa/.gitlab-ci/lava/lava-submit.sh b/lib/mesa/.gitlab-ci/lava/lava-submit.sh index 29d045a68..e02bcb24c 100755 --- a/lib/mesa/.gitlab-ci/lava/lava-submit.sh +++ b/lib/mesa/.gitlab-ci/lava/lava-submit.sh @@ -1,15 +1,18 @@ -#!/bin/bash +#!/usr/bin/env bash +# shellcheck disable=SC2086 # we want word splitting -set -e -set -x +set -ex -# Try to use the kernel and rootfs built in mainline first, so we're more -# likely to hit cache -if curl -s -X HEAD -L --retry 4 -f --retry-all-errors --retry-delay 60 \ - "https://${BASE_SYSTEM_MAINLINE_HOST_PATH}/done"; then - BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_MAINLINE_HOST_PATH}" -else +# If we run in the fork (not from mesa or Marge-bot), reuse mainline kernel and rootfs, if exist. +BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_MAINLINE_HOST_PATH}" +if [ "$CI_PROJECT_PATH" != "$FDO_UPSTREAM_REPO" ]; then + if ! curl -s -X HEAD -L --retry 4 -f --retry-delay 60 \ + "https://${BASE_SYSTEM_MAINLINE_HOST_PATH}/done"; then + echo "Using kernel and rootfs from the fork, cached from mainline is unavailable." BASE_SYSTEM_HOST_PATH="${BASE_SYSTEM_FORK_HOST_PATH}" + else + echo "Using the cached mainline kernel and rootfs." + fi fi rm -rf results @@ -18,46 +21,41 @@ mkdir -p results/job-rootfs-overlay/ cp artifacts/ci-common/capture-devcoredump.sh results/job-rootfs-overlay/ cp artifacts/ci-common/init-*.sh results/job-rootfs-overlay/ cp artifacts/ci-common/intel-gpu-freq.sh results/job-rootfs-overlay/ +cp artifacts/ci-common/kdl.sh results/job-rootfs-overlay/ cp "$SCRIPTS_DIR"/setup-test-env.sh results/job-rootfs-overlay/ # Prepare env vars for upload. 
-KERNEL_IMAGE_BASE_URL="https://${BASE_SYSTEM_HOST_PATH}" \ - artifacts/ci-common/generate-env.sh > results/job-rootfs-overlay/set-job-env-vars.sh section_start variables "Variables passed through:" -cat results/job-rootfs-overlay/set-job-env-vars.sh +artifacts/ci-common/generate-env.sh | tee results/job-rootfs-overlay/set-job-env-vars.sh section_end variables tar zcf job-rootfs-overlay.tar.gz -C results/job-rootfs-overlay/ . ci-fairy s3cp --token-file "${CI_JOB_JWT_FILE}" job-rootfs-overlay.tar.gz "https://${JOB_ROOTFS_OVERLAY_PATH}" -ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${BUILD_PATH}" -# Make it take the mesa build from MINIO_ARTIFACT_NAME, if it is specified in -# the environment. This will make the LAVA behavior consistent with the -# baremetal jobs. -if [ -n "${MINIO_ARTIFACT_NAME}" ] -then - ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${PIPELINE_ARTIFACTS_BASE}/${MINIO_ARTIFACT_NAME}.tar.zst" -fi +ARTIFACT_URL="${FDO_HTTP_CACHE_URI:-}https://${PIPELINE_ARTIFACTS_BASE}/${S3_ARTIFACT_NAME:?}.tar.zst" touch results/lava.log tail -f results/lava.log & PYTHONPATH=artifacts/ artifacts/lava/lava_job_submitter.py \ + submit \ --dump-yaml \ --pipeline-info "$CI_JOB_NAME: $CI_PIPELINE_URL on $CI_COMMIT_REF_NAME ${CI_NODE_INDEX}/${CI_NODE_TOTAL}" \ --rootfs-url-prefix "https://${BASE_SYSTEM_HOST_PATH}" \ - --kernel-url-prefix "https://${BASE_SYSTEM_HOST_PATH}" \ + --kernel-url-prefix "${KERNEL_IMAGE_BASE}/${DEBIAN_ARCH}" \ --build-url "${ARTIFACT_URL}" \ --job-rootfs-overlay-url "${FDO_HTTP_CACHE_URI:-}https://${JOB_ROOTFS_OVERLAY_PATH}" \ - --job-timeout ${JOB_TIMEOUT:-30} \ + --job-timeout-min ${JOB_TIMEOUT:-30} \ --first-stage-init artifacts/ci-common/init-stage1.sh \ - --ci-project-dir ${CI_PROJECT_DIR} \ - --device-type ${DEVICE_TYPE} \ - --dtb ${DTB} \ + --ci-project-dir "${CI_PROJECT_DIR}" \ + --device-type "${DEVICE_TYPE}" \ + --dtb-filename "${DTB}" \ --jwt-file "${CI_JOB_JWT_FILE}" \ - --kernel-image-name ${KERNEL_IMAGE_NAME} \ + --kernel-image-name 
"${KERNEL_IMAGE_NAME}" \ --kernel-image-type "${KERNEL_IMAGE_TYPE}" \ - --boot-method ${BOOT_METHOD} \ - --visibility-group ${VISIBILITY_GROUP} \ + --boot-method "${BOOT_METHOD}" \ + --visibility-group "${VISIBILITY_GROUP}" \ --lava-tags "${LAVA_TAGS}" \ --mesa-job-name "$CI_JOB_NAME" \ + --structured-log-file "results/lava_job_detail.json" \ + --ssh-client-image "${LAVA_SSH_CLIENT_IMAGE}" \ >> results/lava.log diff --git a/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py b/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py index 5feb4688c..b2d8e5306 100755 --- a/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py +++ b/lib/mesa/.gitlab-ci/lava/lava_job_submitter.py @@ -9,25 +9,21 @@ """Send a job to LAVA, track it and collect log back""" - -import argparse import contextlib +import json import pathlib -import re import sys import time -import traceback -import urllib.parse -import xmlrpc.client +from collections import defaultdict +from dataclasses import dataclass, fields from datetime import datetime, timedelta from io import StringIO -from os import getenv +from os import environ, getenv, path from typing import Any, Optional -import lavacli +import fire from lava.exceptions import ( MesaCIException, - MesaCIKnownIssueException, MesaCIParseException, MesaCIRetryError, MesaCITimeoutError, @@ -36,303 +32,61 @@ from lava.utils import CONSOLE_LOG from lava.utils import DEFAULT_GITLAB_SECTION_TIMEOUTS as GL_SECTION_TIMEOUTS from lava.utils import ( GitlabSection, + LAVAJob, LogFollower, LogSectionType, + call_proxy, fatal_err, + generate_lava_job_definition, hide_sensitive_data, print_log, + setup_lava_proxy, ) from lavacli.utils import flow_yaml as lava_yaml +# Initialize structural logging with a defaultdict, it can be changed for more +# sophisticated dict-like data abstractions. +STRUCTURAL_LOG = defaultdict(list) + +try: + from ci.structured_logger import StructuredLogger +except ImportError as e: + print_log( + f"Could not import StructuredLogger library: {e}. 
" + "Falling back to defaultdict based structured logger." + ) + # Timeout in seconds to decide if the device from the dispatched LAVA job has # hung or not due to the lack of new log output. -DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60)) +DEVICE_HANGING_TIMEOUT_SEC = int(getenv("DEVICE_HANGING_TIMEOUT_SEC", 5*60)) # How many seconds the script should wait before try a new polling iteration to # check if the dispatched LAVA job is running or waiting in the job queue. -WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10)) +WAIT_FOR_DEVICE_POLLING_TIME_SEC = int( + getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 1) +) + +# How many seconds the script will wait to let LAVA finalize the job and give +# the final details. +WAIT_FOR_LAVA_POST_PROCESSING_SEC = int(getenv("LAVA_WAIT_LAVA_POST_PROCESSING_SEC", 5)) +WAIT_FOR_LAVA_POST_PROCESSING_RETRIES = int( + getenv("LAVA_WAIT_LAVA_POST_PROCESSING_RETRIES", 6) +) # How many seconds to wait between log output LAVA RPC calls. LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5)) # How many retries should be made when a timeout happen. -NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)) - -# How many attempts should be made when a timeout happen during LAVA device boot. -NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3)) - -# Supports any integers in [0, 100]. -# The scheduler considers the job priority when ordering the queue -# to consider which job should run next. 
-JOB_PRIORITY = int(getenv("LAVA_JOB_PRIORITY", 75)) - - -def generate_lava_yaml_payload(args) -> dict[str, Any]: - # General metadata and permissions, plus also inexplicably kernel arguments - values = { - 'job_name': 'mesa: {}'.format(args.pipeline_info), - 'device_type': args.device_type, - 'visibility': { 'group': [ args.visibility_group ] }, - 'priority': JOB_PRIORITY, - 'context': { - 'extra_nfsroot_args': ' init=/init rootwait usbcore.quirks=0bda:8153:k' - }, - "timeouts": { - "job": {"minutes": args.job_timeout}, - "actions": { - "depthcharge-retry": { - # Could take between 1 and 1.5 min in slower boots - "minutes": 2 - }, - "depthcharge-start": { - # Should take less than 1 min. - "minutes": 1, - }, - "depthcharge-action": { - # This timeout englobes the entire depthcharge timing, - # including retries - "minutes": 2 * NUMBER_OF_ATTEMPTS_LAVA_BOOT, - }, - } - }, - } - - if args.lava_tags: - values['tags'] = args.lava_tags.split(',') - - # URLs to our kernel rootfs to boot from, both generated by the base - # container build - deploy = { - 'timeout': { 'minutes': 10 }, - 'to': 'tftp', - 'os': 'oe', - 'kernel': { - 'url': '{}/{}'.format(args.kernel_url_prefix, args.kernel_image_name), - }, - 'nfsrootfs': { - 'url': '{}/lava-rootfs.tar.zst'.format(args.rootfs_url_prefix), - 'compression': 'zstd', - } - } - if args.kernel_image_type: - deploy['kernel']['type'] = args.kernel_image_type - if args.dtb: - deploy['dtb'] = { - 'url': '{}/{}.dtb'.format(args.kernel_url_prefix, args.dtb) - } - - # always boot over NFS - boot = { - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - "method": args.boot_method, - "commands": "nfs", - "prompts": ["lava-shell:"], - } - - # skeleton test definition: only declaring each job as a single 'test' - # since LAVA's test parsing is not useful to us - run_steps = [] - test = { - 'timeout': { 'minutes': args.job_timeout }, - 'failure_retry': 1, - 'definitions': [ { - 'name': 'mesa', - 'from': 'inline', - 'lava-signal': 'kmsg', - 
'path': 'inline/mesa.yaml', - 'repository': { - 'metadata': { - 'name': 'mesa', - 'description': 'Mesa test plan', - 'os': [ 'oe' ], - 'scope': [ 'functional' ], - 'format': 'Lava-Test Test Definition 1.0', - }, - 'run': { - "steps": run_steps - }, - }, - } ], - } - - # job execution script: - # - inline .gitlab-ci/common/init-stage1.sh - # - fetch and unpack per-pipeline build artifacts from build job - # - fetch and unpack per-job environment from lava-submit.sh - # - exec .gitlab-ci/common/init-stage2.sh - - with open(args.first_stage_init, "r") as init_sh: - run_steps += [ - x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip() - ] - run_steps.append( - f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.job_rootfs_overlay_url} | tar -xz -C /", - ) - - if args.jwt_file: - with open(args.jwt_file) as jwt_file: - run_steps += [ - "set +x", - f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME', - "set -x", - f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh', - ] - else: - run_steps += [ - "echo Could not find jwt file, disabling MINIO requests...", - "sed -i '/MINIO_RESULTS_UPLOAD/d' /set-job-env-vars.sh", - ] - - run_steps += [ - 'mkdir -p {}'.format(args.ci_project_dir), - 'curl {} | tar --zstd -x -C {}'.format(args.build_url, args.ci_project_dir), - - # Sleep a bit to give time for bash to dump shell xtrace messages into - # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some - # devices like a618. 
- 'sleep 1', - - # Putting CI_JOB name as the testcase name, it may help LAVA farm - # maintainers with monitoring - f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh", - ] - - values['actions'] = [ - { 'deploy': deploy }, - { 'boot': boot }, - { 'test': test }, - ] - - return values - - -def setup_lava_proxy(): - config = lavacli.load_config("default") - uri, usr, tok = (config.get(key) for key in ("uri", "username", "token")) - uri_obj = urllib.parse.urlparse(uri) - uri_str = "{}://{}:{}@{}{}".format(uri_obj.scheme, usr, tok, uri_obj.netloc, uri_obj.path) - transport = lavacli.RequestsTransport( - uri_obj.scheme, - config.get("proxy"), - config.get("timeout", 120.0), - config.get("verify_ssl_cert", True), - ) - proxy = xmlrpc.client.ServerProxy( - uri_str, allow_none=True, transport=transport) - - print_log("Proxy for {} created.".format(config['uri'])) - - return proxy - - -def _call_proxy(fn, *args): - retries = 60 - for n in range(1, retries + 1): - try: - return fn(*args) - except xmlrpc.client.ProtocolError as err: - if n == retries: - traceback.print_exc() - fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg)) - else: - time.sleep(15) - except xmlrpc.client.Fault as err: - traceback.print_exc() - fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode)) - - -class LAVAJob: - COLOR_STATUS_MAP = { - "pass": CONSOLE_LOG["FG_GREEN"], - "hung": CONSOLE_LOG["FG_YELLOW"], - "fail": CONSOLE_LOG["FG_RED"], - "canceled": CONSOLE_LOG["FG_MAGENTA"], - } - - def __init__(self, proxy, definition): - self.job_id = None - self.proxy = proxy - self.definition = definition - self.last_log_line = 0 - self.last_log_time = None - self.is_finished = False - self.status = "created" - - def heartbeat(self): - self.last_log_time = datetime.now() - self.status = "running" - - def validate(self) -> Optional[dict]: - """Returns a dict with errors, if the validation fails. 
- - Returns: - Optional[dict]: a dict with the validation errors, if any - """ - return _call_proxy(self.proxy.scheduler.jobs.validate, self.definition, True) - - def submit(self): - try: - self.job_id = _call_proxy(self.proxy.scheduler.jobs.submit, self.definition) - except MesaCIException: - return False - return True - - def cancel(self): - if self.job_id: - self.proxy.scheduler.jobs.cancel(self.job_id) - - def is_started(self) -> bool: - waiting_states = ["Submitted", "Scheduling", "Scheduled"] - job_state: dict[str, str] = _call_proxy( - self.proxy.scheduler.job_state, self.job_id - ) - return job_state["job_state"] not in waiting_states - - def _load_log_from_data(self, data) -> list[str]: - lines = [] - if isinstance(data, xmlrpc.client.Binary): - # We are dealing with xmlrpc.client.Binary - # Let's extract the data - data = data.data - # When there is no new log data, the YAML is empty - if loaded_lines := lava_yaml.load(data): - lines = loaded_lines - self.last_log_line += len(lines) - return lines - - def get_logs(self) -> list[str]: - try: - (finished, data) = _call_proxy( - self.proxy.scheduler.jobs.logs, self.job_id, self.last_log_line - ) - self.is_finished = finished - return self._load_log_from_data(data) - - except Exception as mesa_ci_err: - raise MesaCIParseException( - f"Could not get LAVA job logs. Reason: {mesa_ci_err}" - ) from mesa_ci_err - - def parse_job_result_from_log( - self, lava_lines: list[dict[str, str]] - ) -> list[dict[str, str]]: - """Use the console log to catch if the job has completed successfully or - not. Returns the list of log lines until the result line.""" - - last_line = None # Print all lines. lines[:None] == lines[:] - - for idx, line in enumerate(lava_lines): - if result := re.search(r"hwci: mesa: (pass|fail)", line): - self.is_finished = True - self.status = result.group(1) - - last_line = idx + 1 - # We reached the log end here. hwci script has finished. 
- break - return lava_lines[:last_line] +NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int( + getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2) +) -def find_exception_from_metadata(metadata, job_id): +def raise_exception_from_metadata(metadata: dict, job_id: int) -> None: + """ + Investigate infrastructure errors from the job metadata. + If it finds an error, raise it as MesaCIException. + """ if "result" not in metadata or metadata["result"] != "fail": return if "error_type" in metadata: @@ -354,23 +108,22 @@ def find_exception_from_metadata(metadata, job_id): raise MesaCIException( f"LAVA job {job_id} failed validation (possible download error). Retry." ) - return metadata -def find_lava_error(job) -> None: - # Look for infrastructure errors and retry if we see them. - results_yaml = _call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id) +def raise_lava_error(job) -> None: + # Look for infrastructure errors, raise them, and retry if we see them. + results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id) results = lava_yaml.load(results_yaml) for res in results: metadata = res["metadata"] - find_exception_from_metadata(metadata, job.job_id) + raise_exception_from_metadata(metadata, job.job_id) # If we reach this far, it means that the job ended without hwci script # result and no LAVA infrastructure problem was found job.status = "fail" -def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"): +def show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"): with GitlabSection( "job_data", "LAVA job info", @@ -378,12 +131,41 @@ def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}") start_collapsed=True, colour=colour, ): - show = _call_proxy(job.proxy.scheduler.jobs.show, job.job_id) - for field, value in show.items(): + wait_post_processing_retries: int = WAIT_FOR_LAVA_POST_PROCESSING_RETRIES + while not job.is_post_processed() and 
wait_post_processing_retries > 0: + # Wait a little until LAVA finishes processing metadata + time.sleep(WAIT_FOR_LAVA_POST_PROCESSING_SEC) + wait_post_processing_retries -= 1 + + if not job.is_post_processed(): + waited_for_sec: int = ( + WAIT_FOR_LAVA_POST_PROCESSING_RETRIES + * WAIT_FOR_LAVA_POST_PROCESSING_SEC + ) + print_log( + f"Waited for {waited_for_sec} seconds " + "for LAVA to post-process the job, it haven't finished yet. " + "Dumping it's info anyway" + ) + + details: dict[str, str] = job.show() + for field, value in details.items(): print(f"{field:<15}: {value}") + job.refresh_log() def fetch_logs(job, max_idle_time, log_follower) -> None: + is_job_hanging(job, max_idle_time) + + time.sleep(LOG_POLLING_TIME_SEC) + new_log_lines = fetch_new_log_lines(job) + parsed_lines = parse_log_lines(job, log_follower, new_log_lines) + + for line in parsed_lines: + print_log(line) + + +def is_job_hanging(job, max_idle_time): # Poll to check for new logs, assuming that a prolonged period of # silence means that the device has died and we should try it again if datetime.now() - job.last_log_time > max_idle_time: @@ -398,16 +180,8 @@ def fetch_logs(job, max_idle_time, log_follower) -> None: timeout_duration=max_idle_time, ) - time.sleep(LOG_POLLING_TIME_SEC) - # The XMLRPC binary packet may be corrupted, causing a YAML scanner error. - # Retry the log fetching several times before exposing the error. - for _ in range(5): - with contextlib.suppress(MesaCIParseException): - new_log_lines = job.get_logs() - break - else: - raise MesaCIParseException +def parse_log_lines(job, log_follower, new_log_lines): if log_follower.feed(new_log_lines): # If we had non-empty log data, we can assure that the device is alive. 
@@ -422,12 +196,23 @@ def fetch_logs(job, max_idle_time, log_follower) -> None: LogSectionType.LAVA_POST_PROCESSING, ): parsed_lines = job.parse_job_result_from_log(parsed_lines) + return parsed_lines - for line in parsed_lines: - print_log(line) + +def fetch_new_log_lines(job): + + # The XMLRPC binary packet may be corrupted, causing a YAML scanner error. + # Retry the log fetching several times before exposing the error. + for _ in range(5): + with contextlib.suppress(MesaCIParseException): + new_log_lines = job.get_logs() + break + else: + raise MesaCIParseException + return new_log_lines -def follow_job_execution(job): +def submit_job(job): try: job.submit() except Exception as mesa_ci_err: @@ -435,11 +220,16 @@ def follow_job_execution(job): f"Could not submit LAVA job. Reason: {mesa_ci_err}" ) from mesa_ci_err + +def wait_for_job_get_started(job): print_log(f"Waiting for job {job.job_id} to start.") while not job.is_started(): time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC) + job.refresh_log() print_log(f"Job {job.job_id} started.") + +def bootstrap_log_follower() -> LogFollower: gl = GitlabSection( id="lava_boot", header="LAVA boot", @@ -447,20 +237,37 @@ def follow_job_execution(job): start_collapsed=True, ) print(gl.start()) - max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC) - with LogFollower(current_section=gl) as lf: + return LogFollower(starting_section=gl) + +def follow_job_execution(job, log_follower): + with log_follower: max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC) # Start to check job's health job.heartbeat() while not job.is_finished: - fetch_logs(job, max_idle_time, lf) + fetch_logs(job, max_idle_time, log_follower) + structural_log_phases(job, log_follower) # Mesa Developers expect to have a simple pass/fail job result. # If this does not happen, it probably means a LAVA infrastructure error # happened. 
if job.status not in ["pass", "fail"]: - find_lava_error(job) + raise_lava_error(job) + + # LogFollower does some cleanup after the early exit (trigger by + # `hwci: pass|fail` regex), let's update the phases after the cleanup. + structural_log_phases(job, log_follower) + + +def structural_log_phases(job, log_follower): + phases: dict[str, Any] = { + s.header.split(" - ")[0]: { + k: str(getattr(s, k)) for k in ("start_time", "end_time") + } + for s in log_follower.section_history + } + job.log["dut_job_phases"] = phases def print_job_final_status(job): @@ -474,64 +281,145 @@ def print_job_final_status(job): f"{CONSOLE_LOG['RESET']}" ) - show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}") + job.refresh_log() + show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}") -def retriable_follow_job(proxy, job_definition) -> LAVAJob: - retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION +def execute_job_with_retries( + proxy, job_definition, retry_count, jobs_log +) -> Optional[LAVAJob]: + last_failed_job = None for attempt_no in range(1, retry_count + 2): - job = LAVAJob(proxy, job_definition) + # Need to get the logger value from its object to enable autosave + # features, if AutoSaveDict is enabled from StructuredLogging module + jobs_log.append({}) + job_log = jobs_log[-1] + job = LAVAJob(proxy, job_definition, job_log) + STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no try: - follow_job_execution(job) + job_log["submitter_start_time"] = datetime.now().isoformat() + submit_job(job) + wait_for_job_get_started(job) + log_follower: LogFollower = bootstrap_log_follower() + follow_job_execution(job, log_follower) return job - except MesaCIKnownIssueException as found_issue: - print_log(found_issue) - job.status = "canceled" - except MesaCIException as mesa_exception: - print_log(mesa_exception) - job.cancel() - except KeyboardInterrupt as e: - print_log("LAVA job submitter was interrupted. 
Cancelling the job.") - job.cancel() - raise e + + except (MesaCIException, KeyboardInterrupt) as exception: + job.handle_exception(exception) + finally: + print_job_final_status(job) + # If LAVA takes too long to post process the job, the submitter + # gives up and proceeds. + job_log["submitter_end_time"] = datetime.now().isoformat() + last_failed_job = job print_log( f"{CONSOLE_LOG['BOLD']}" f"Finished executing LAVA job in the attempt #{attempt_no}" f"{CONSOLE_LOG['RESET']}" ) - print_job_final_status(job) - raise MesaCIRetryError( - f"{CONSOLE_LOG['BOLD']}" - f"{CONSOLE_LOG['FG_RED']}" - "Job failed after it exceeded the number of " - f"{retry_count} retries." - f"{CONSOLE_LOG['RESET']}", - retry_count=retry_count, + return last_failed_job + + +def retriable_follow_job(proxy, job_definition) -> LAVAJob: + number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION + + last_attempted_job = execute_job_with_retries( + proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"] ) + if last_attempted_job.exception is not None: + # Infra failed in all attempts + raise MesaCIRetryError( + f"{CONSOLE_LOG['BOLD']}" + f"{CONSOLE_LOG['FG_RED']}" + "Job failed after it exceeded the number of " + f"{number_of_retries} retries." 
+ f"{CONSOLE_LOG['RESET']}", + retry_count=number_of_retries, + last_job=last_attempted_job, + ) + + return last_attempted_job + + +@dataclass +class PathResolver: + def __post_init__(self): + for field in fields(self): + value = getattr(self, field.name) + if not value: + continue + if field.type == pathlib.Path: + value = pathlib.Path(value) + setattr(self, field.name, value.resolve()) + + +@dataclass +class LAVAJobSubmitter(PathResolver): + boot_method: str + ci_project_dir: str + device_type: str + job_timeout_min: int # The job timeout in minutes + build_url: str = None + dtb_filename: str = None + dump_yaml: bool = False # Whether to dump the YAML payload to stdout + first_stage_init: str = None + jwt_file: pathlib.Path = None + kernel_image_name: str = None + kernel_image_type: str = "" + kernel_url_prefix: str = None + lava_tags: str = "" # Comma-separated LAVA tags for the job + mesa_job_name: str = "mesa_ci_job" + pipeline_info: str = "" + rootfs_url_prefix: str = None + validate_only: bool = False # Whether to only validate the job, not execute it + visibility_group: str = None # Only affects LAVA farm maintainers + job_rootfs_overlay_url: str = None + structured_log_file: pathlib.Path = None # Log file path with structured LAVA log + ssh_client_image: str = None # x86_64 SSH client image to follow the job's output + __structured_log_context = contextlib.nullcontext() # Structured Logger context + + def __post_init__(self) -> None: + super().__post_init__() + # Remove mesa job names with spaces, which breaks the lava-test-case command + self.mesa_job_name = self.mesa_job_name.split(" ")[0] + + if not self.structured_log_file: + return + + self.__structured_log_context = StructuredLoggerWrapper(self).logger_context() + self.proxy = setup_lava_proxy() + + def __prepare_submission(self) -> str: + # Overwrite the timeout for the testcases with the value offered by the + # user. 
The testcase running time should be at least 4 times greater than + # the other sections (boot and setup), so we can safely ignore them. + # If LAVA fails to stop the job at this stage, it will fall back to the + # script section timeout with a reasonable delay. + GL_SECTION_TIMEOUTS[LogSectionType.TEST_CASE] = timedelta( + minutes=self.job_timeout_min + ) -def treat_mesa_job_name(args): - # Remove mesa job names with spaces, which breaks the lava-test-case command - args.mesa_job_name = args.mesa_job_name.split(" ")[0] + job_definition = generate_lava_job_definition(self) + if self.dump_yaml: + self.dump_job_definition(job_definition) -def main(args): - proxy = setup_lava_proxy() + validation_job = LAVAJob(self.proxy, job_definition) + if errors := validation_job.validate(): + fatal_err(f"Error in LAVA job definition: {errors}") + print_log("LAVA job definition validated successfully") - # Overwrite the timeout for the testcases with the value offered by the - # user. The testcase running time should be at least 4 times greater than - # the other sections (boot and setup), so we can safely ignore them. - # If LAVA fails to stop the job at this stage, it will fall back to the - # script section timeout with a reasonable delay. 
- GL_SECTION_TIMEOUTS[LogSectionType.TEST_CASE] = timedelta(minutes=args.job_timeout) + return job_definition - job_definition_stream = StringIO() - lava_yaml.dump(generate_lava_yaml_payload(args), job_definition_stream) - job_definition = job_definition_stream.getvalue() + @classmethod + def is_under_ci(cls): + ci_envvar: str = getenv("CI", "false") + return ci_envvar.lower() == "true" - if args.dump_yaml: + def dump_job_definition(self, job_definition) -> None: with GitlabSection( "yaml_dump", "LAVA job definition (YAML)", @@ -539,44 +427,98 @@ def main(args): start_collapsed=True, ): print(hide_sensitive_data(job_definition)) - job = LAVAJob(proxy, job_definition) - if errors := job.validate(): - fatal_err(f"Error in LAVA job definition: {errors}") - print_log("LAVA job definition validated successfully") + def submit(self) -> None: + """ + Prepares and submits the LAVA job. + If `validate_only` is True, it validates the job without submitting it. + If the job finishes with a non-pass status or encounters an exception, + the program exits with a non-zero return code. 
+ """ + job_definition: str = self.__prepare_submission() - if args.validate_only: - return + if self.validate_only: + return + + with self.__structured_log_context: + last_attempt_job = None + try: + last_attempt_job = retriable_follow_job(self.proxy, job_definition) + + except MesaCIRetryError as retry_exception: + last_attempt_job = retry_exception.last_job + + except Exception as exception: + STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception) + raise exception + + finally: + self.finish_script(last_attempt_job) + + def print_log_artifact_url(self): + base_url = "https://$CI_PROJECT_ROOT_NAMESPACE.pages.freedesktop.org/" + artifacts_path = "-/$CI_PROJECT_NAME/-/jobs/$CI_JOB_ID/artifacts/" + relative_log_path = self.structured_log_file.relative_to(pathlib.Path.cwd()) + full_path = f"{base_url}{artifacts_path}{relative_log_path}" + artifact_url = path.expandvars(full_path) + + print_log(f"Structural Logging data available at: {artifact_url}") + + def finish_script(self, last_attempt_job): + if self.is_under_ci() and self.structured_log_file: + self.print_log_artifact_url() - finished_job = retriable_follow_job(proxy, job_definition) - exit_code = 0 if finished_job.status == "pass" else 1 - sys.exit(exit_code) - - -def create_parser(): - parser = argparse.ArgumentParser("LAVA job submitter") - - parser.add_argument("--pipeline-info") - parser.add_argument("--rootfs-url-prefix") - parser.add_argument("--kernel-url-prefix") - parser.add_argument("--build-url") - parser.add_argument("--job-rootfs-overlay-url") - parser.add_argument("--job-timeout", type=int) - parser.add_argument("--first-stage-init") - parser.add_argument("--ci-project-dir") - parser.add_argument("--device-type") - parser.add_argument("--dtb", nargs='?', default="") - parser.add_argument("--kernel-image-name") - parser.add_argument("--kernel-image-type", nargs='?', default="") - parser.add_argument("--boot-method") - parser.add_argument("--lava-tags", nargs='?', default="") - 
parser.add_argument("--jwt-file", type=pathlib.Path) - parser.add_argument("--validate-only", action='store_true') - parser.add_argument("--dump-yaml", action='store_true') - parser.add_argument("--visibility-group") - parser.add_argument("--mesa-job-name") - - return parser + if not last_attempt_job: + # No job was run, something bad happened + STRUCTURAL_LOG["job_combined_status"] = "script_crash" + current_exception = str(sys.exc_info()[0]) + STRUCTURAL_LOG["job_combined_fail_reason"] = current_exception + raise SystemExit(1) + + STRUCTURAL_LOG["job_combined_status"] = last_attempt_job.status + + if last_attempt_job.status != "pass": + raise SystemExit(1) + + +class StructuredLoggerWrapper: + def __init__(self, submitter: LAVAJobSubmitter) -> None: + self.__submitter: LAVAJobSubmitter = submitter + + def _init_logger(self): + STRUCTURAL_LOG["fixed_tags"] = self.__submitter.lava_tags + STRUCTURAL_LOG["dut_job_type"] = self.__submitter.device_type + STRUCTURAL_LOG["job_combined_fail_reason"] = None + STRUCTURAL_LOG["job_combined_status"] = "not_submitted" + STRUCTURAL_LOG["dut_attempt_counter"] = 0 + + # Initialize dut_jobs list to enable appends + STRUCTURAL_LOG["dut_jobs"] = [] + + @contextlib.contextmanager + def _simple_logger_context(self): + log_file = pathlib.Path(self.__submitter.structured_log_file) + log_file.parent.mkdir(parents=True, exist_ok=True) + try: + # Truncate the file + log_file.write_text("") + yield + finally: + log_file.write_text(json.dumps(STRUCTURAL_LOG, indent=2)) + + def logger_context(self): + context = contextlib.nullcontext() + try: + + global STRUCTURAL_LOG + STRUCTURAL_LOG = StructuredLogger( + self.__submitter.structured_log_file, truncate=True + ).data + except NameError: + context = self._simple_logger_context() + + self._init_logger() + return context if __name__ == "__main__": @@ -585,10 +527,11 @@ if __name__ == "__main__": # more buffering sys.stdout.reconfigure(line_buffering=True) 
sys.stderr.reconfigure(line_buffering=True) - - parser = create_parser() - - parser.set_defaults(func=main) - args = parser.parse_args() - treat_mesa_job_name(args) - args.func(args) + # LAVA farm is giving datetime in UTC timezone, let's set it locally for the + # script run. + # Setting environ here will not affect the system time, as the os.environ + # lifetime follows the script one. + environ["TZ"] = "UTC" + time.tzset() + + fire.Fire(LAVAJobSubmitter) diff --git a/lib/mesa/.gitlab-ci/lava/requirements.txt b/lib/mesa/.gitlab-ci/lava/requirements.txt index 7186eceb9..e89021f3f 100644 --- a/lib/mesa/.gitlab-ci/lava/requirements.txt +++ b/lib/mesa/.gitlab-ci/lava/requirements.txt @@ -1 +1,2 @@ lavacli==1.5.2 +fire==0.5.0 diff --git a/lib/mesa/.gitlab-ci/lava/utils/__init__.py b/lib/mesa/.gitlab-ci/lava/utils/__init__.py index 18bb459c1..349d2b325 100644 --- a/lib/mesa/.gitlab-ci/lava/utils/__init__.py +++ b/lib/mesa/.gitlab-ci/lava/utils/__init__.py @@ -1,5 +1,8 @@ from .console_format import CONSOLE_LOG from .gitlab_section import GitlabSection +from .lava_job import LAVAJob +from .lava_job_definition import generate_lava_job_definition +from .lava_proxy import call_proxy, setup_lava_proxy from .log_follower import ( LogFollower, fatal_err, diff --git a/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py b/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py index 7e2398d7a..034afb4eb 100644 --- a/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py +++ b/lib/mesa/.gitlab-ci/lava/utils/gitlab_section.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from lava.utils.log_section import LogSectionType +# TODO: Add section final status to assist with monitoring @dataclass class GitlabSection: id: str @@ -37,6 +38,14 @@ class GitlabSection: def has_finished(self) -> bool: return self.__end_time is not None + @property + def start_time(self) -> datetime: + return self.__start_time + + @property + def end_time(self) -> Optional[datetime]: + return self.__end_time + def get_timestamp(self, 
time: datetime) -> str: unix_ts = datetime.timestamp(time) return str(int(unix_ts)) @@ -54,6 +63,16 @@ class GitlabSection: return f"{before_header}{header_wrapper}" + def __str__(self) -> str: + status = "NS" if not self.has_started else "F" if self.has_finished else "IP" + delta = self.delta_time() + elapsed_time = "N/A" if delta is None else str(delta) + return ( + f"GitlabSection({self.id}, {self.header}, {self.type}, " + f"SC={self.start_collapsed}, S={status}, ST={self.start_time}, " + f"ET={self.end_time}, ET={elapsed_time})" + ) + def __enter__(self): print(self.start()) return self diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_job.py b/lib/mesa/.gitlab-ci/lava/utils/lava_job.py new file mode 100644 index 000000000..b69f8b9fb --- /dev/null +++ b/lib/mesa/.gitlab-ci/lava/utils/lava_job.py @@ -0,0 +1,186 @@ +import re +import xmlrpc +from collections import defaultdict +from datetime import datetime +from typing import Any, Optional + +from lava.exceptions import ( + MesaCIException, + MesaCIKnownIssueException, + MesaCIParseException, + MesaCITimeoutError, +) +from lava.utils import CONSOLE_LOG +from lava.utils.log_follower import print_log +from lavacli.utils import flow_yaml as lava_yaml + +from .lava_proxy import call_proxy + + +class LAVAJob: + COLOR_STATUS_MAP: dict[str, str] = { + "pass": CONSOLE_LOG["FG_GREEN"], + "hung": CONSOLE_LOG["FG_YELLOW"], + "fail": CONSOLE_LOG["FG_RED"], + "canceled": CONSOLE_LOG["FG_MAGENTA"], + } + + def __init__(self, proxy, definition, log=defaultdict(str)) -> None: + self._job_id = None + self.proxy = proxy + self.definition = definition + self.last_log_line = 0 + self.last_log_time = None + self._is_finished = False + self.log: dict[str, Any] = log + self.status = "not_submitted" + self.__exception: Optional[str] = None + + def heartbeat(self) -> None: + self.last_log_time: datetime = datetime.now() + self.status = "running" + + @property + def status(self) -> str: + return self._status + + @status.setter + def 
status(self, new_status: str) -> None: + self._status = new_status + self.log["status"] = self._status + + @property + def job_id(self) -> int: + return self._job_id + + @job_id.setter + def job_id(self, new_id: int) -> None: + self._job_id = new_id + self.log["lava_job_id"] = self._job_id + + @property + def is_finished(self) -> bool: + return self._is_finished + + @property + def exception(self) -> str: + return self.__exception + + @exception.setter + def exception(self, exception: Exception) -> None: + self.__exception = repr(exception) + self.log["dut_job_fail_reason"] = self.__exception + + def validate(self) -> Optional[dict]: + """Returns a dict with errors, if the validation fails. + + Returns: + Optional[dict]: a dict with the validation errors, if any + """ + return call_proxy(self.proxy.scheduler.jobs.validate, self.definition, True) + + def show(self) -> dict[str, str]: + return call_proxy(self.proxy.scheduler.jobs.show, self._job_id) + + def get_lava_time(self, key, data) -> Optional[str]: + return data[key].value if data[key] else None + + def refresh_log(self) -> None: + details = self.show() + self.log["dut_start_time"] = self.get_lava_time("start_time", details) + self.log["dut_submit_time"] = self.get_lava_time("submit_time", details) + self.log["dut_end_time"] = self.get_lava_time("end_time", details) + self.log["dut_name"] = details.get("device") + self.log["dut_state"] = details.get("state") + + def submit(self) -> bool: + try: + self.job_id = call_proxy(self.proxy.scheduler.jobs.submit, self.definition) + self.status = "submitted" + self.refresh_log() + except MesaCIException: + return False + return True + + def lava_state(self) -> str: + job_state: dict[str, str] = call_proxy( + self.proxy.scheduler.job_state, self._job_id + ) + return job_state["job_state"] + + def cancel(self): + if self._job_id: + self.proxy.scheduler.jobs.cancel(self._job_id) + # If we don't have yet set another job's status, let's update it + # with canceled one + if 
self.status == "running": + self.status = "canceled" + + def is_started(self) -> bool: + waiting_states = ("Submitted", "Scheduling", "Scheduled") + return self.lava_state() not in waiting_states + + def is_post_processed(self) -> bool: + return self.lava_state() != "Running" + + def _load_log_from_data(self, data) -> list[str]: + lines = [] + if isinstance(data, xmlrpc.client.Binary): + # We are dealing with xmlrpc.client.Binary + # Let's extract the data + data = data.data + # When there is no new log data, the YAML is empty + if loaded_lines := lava_yaml.load(data): + lines: list[str] = loaded_lines + self.last_log_line += len(lines) + return lines + + def get_logs(self) -> list[str]: + try: + (finished, data) = call_proxy( + self.proxy.scheduler.jobs.logs, self._job_id, self.last_log_line + ) + self._is_finished = finished + return self._load_log_from_data(data) + + except Exception as mesa_ci_err: + raise MesaCIParseException( + f"Could not get LAVA job logs. Reason: {mesa_ci_err}" + ) from mesa_ci_err + + def parse_job_result_from_log( + self, lava_lines: list[dict[str, str]] + ) -> list[dict[str, str]]: + """Use the console log to catch if the job has completed successfully or + not. Returns the list of log lines until the result line.""" + + last_line = None # Print all lines. lines[:None] == lines[:] + + for idx, line in enumerate(lava_lines): + if result := re.search(r"hwci: mesa: (pass|fail)", line): + self._is_finished = True + self.status = result[1] + + last_line = idx + 1 + # We reached the log end here. hwci script has finished. 
+ break + return lava_lines[:last_line] + + def handle_exception(self, exception: Exception): + print_log(exception) + self.cancel() + self.exception = exception + + # Give more accurate status depending on exception + if isinstance(exception, MesaCIKnownIssueException): + self.status = "canceled" + elif isinstance(exception, MesaCITimeoutError): + self.status = "hung" + elif isinstance(exception, MesaCIException): + self.status = "failed" + elif isinstance(exception, KeyboardInterrupt): + self.status = "interrupted" + print_log("LAVA job submitter was interrupted. Cancelling the job.") + raise + else: + self.status = "job_submitter_error" diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py new file mode 100644 index 000000000..c7b43658c --- /dev/null +++ b/lib/mesa/.gitlab-ci/lava/utils/lava_job_definition.py @@ -0,0 +1,150 @@ +from io import StringIO +from typing import TYPE_CHECKING, Any + +import re +from lava.utils.lava_farm import LavaFarm, get_lava_farm +from ruamel.yaml.scalarstring import LiteralScalarString +from ruamel.yaml import YAML +from os import getenv + +if TYPE_CHECKING: + from lava.lava_job_submitter import LAVAJobSubmitter + +# How many attempts should be made when a timeout happen during LAVA device boot. +NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3)) + +# Supports any integers in [0, 100]. +# The scheduler considers the job priority when ordering the queue +# to consider which job should run next. +JOB_PRIORITY = int(getenv("JOB_PRIORITY", 75)) + + +def has_ssh_support(job_submitter: "LAVAJobSubmitter") -> bool: + force_uart = bool(getenv("LAVA_FORCE_UART", False)) + + if force_uart: + return False + + # Only Collabora's farm supports to run docker container as a LAVA actions, + # which is required to follow the job in a SSH section + current_farm = get_lava_farm() + + # SSH job definition still needs to add support for fastboot. 
+ job_uses_fastboot: bool = job_submitter.boot_method == "fastboot" + + return current_farm == LavaFarm.COLLABORA and not job_uses_fastboot + + +def generate_lava_yaml_payload(job_submitter: "LAVAJobSubmitter") -> dict[str, Any]: + """ + Bridge function to use the supported job definition depending on some Mesa + CI job characteristics. + + The strategy here, is to use LAVA with a containerized SSH session to follow + the job output, escaping from dumping data to the UART, which proves to be + error prone in some devices. + """ + from lava.utils.ssh_job_definition import ( + generate_lava_yaml_payload as ssh_lava_yaml, + ) + from lava.utils.uart_job_definition import ( + generate_lava_yaml_payload as uart_lava_yaml, + ) + + if has_ssh_support(job_submitter): + return ssh_lava_yaml(job_submitter) + + return uart_lava_yaml(job_submitter) + + +def generate_lava_job_definition(job_submitter: "LAVAJobSubmitter") -> str: + job_stream = StringIO() + yaml = YAML() + yaml.width = 4096 + yaml.dump(generate_lava_yaml_payload(job_submitter), job_stream) + return job_stream.getvalue() + + +def to_yaml_block(steps_array: list[str], escape_vars=[]) -> LiteralScalarString: + def escape_envvar(match): + return "\\" + match.group(0) + + filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")] + final_str = "\n".join(filtered_array) + + for escape_var in escape_vars: + # Find env vars and add '\\' before them + final_str = re.sub(rf"\${escape_var}*", escape_envvar, final_str) + return LiteralScalarString(final_str) + + +def generate_metadata(args) -> dict[str, Any]: + # General metadata and permissions + values = { + "job_name": f"mesa: {args.pipeline_info}", + "device_type": args.device_type, + "visibility": {"group": [args.visibility_group]}, + "priority": JOB_PRIORITY, + "context": { + "extra_nfsroot_args": " init=/init rootwait usbcore.quirks=0bda:8153:k" + }, + "timeouts": { + "job": {"minutes": args.job_timeout_min}, + "actions": { + "depthcharge-retry": 
{ + # Could take between 1 and 1.5 min in slower boots + "minutes": 4 + }, + "depthcharge-start": { + # Should take less than 1 min. + "minutes": 1, + }, + "depthcharge-action": { + # This timeout englobes the entire depthcharge timing, + # including retries + "minutes": 5 + * NUMBER_OF_ATTEMPTS_LAVA_BOOT, + }, + }, + }, + } + + if args.lava_tags: + values["tags"] = args.lava_tags.split(",") + + return values + + +def artifact_download_steps(args): + """ + This function is responsible for setting up the SSH server in the DUT and to + export the first boot environment to a file. + """ + # Putting JWT pre-processing and mesa download, within init-stage1.sh file, + # as we do with non-SSH version. + download_steps = [ + "set -ex", + "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " + f"{args.job_rootfs_overlay_url} | tar -xz -C /", + f"mkdir -p {args.ci_project_dir}", + f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.build_url} | " + f"tar --zstd -x -C {args.ci_project_dir}", + ] + + # If the JWT file is provided, we will use it to authenticate with the cloud + # storage provider and will hide it from the job output in Gitlab. 
+ if args.jwt_file: + with open(args.jwt_file) as jwt_file: + download_steps += [ + "set +x # HIDE_START", + f'echo -n "{jwt_file.read()}" > "{args.jwt_file}"', + "set -x # HIDE_END", + f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh', + ] + else: + download_steps += [ + "echo Could not find jwt file, disabling S3 requests...", + "sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh", + ] + + return download_steps diff --git a/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py b/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py new file mode 100644 index 000000000..581ec4603 --- /dev/null +++ b/lib/mesa/.gitlab-ci/lava/utils/lava_proxy.py @@ -0,0 +1,44 @@ +import time +import traceback +import urllib +import urllib.parse +import xmlrpc +import xmlrpc.client + +import lavacli + +from .log_follower import fatal_err, print_log + + +def setup_lava_proxy(): + config = lavacli.load_config("default") + uri, usr, tok = (config.get(key) for key in ("uri", "username", "token")) + uri_obj = urllib.parse.urlparse(uri) + uri_str = f"{uri_obj.scheme}://{usr}:{tok}@{uri_obj.netloc}{uri_obj.path}" + transport = lavacli.RequestsTransport( + uri_obj.scheme, + config.get("proxy"), + config.get("timeout", 120.0), + config.get("verify_ssl_cert", True), + ) + proxy = xmlrpc.client.ServerProxy(uri_str, allow_none=True, transport=transport) + + print_log(f'Proxy for {config["uri"]} created.') + + return proxy + + +def call_proxy(fn, *args): + retries = 60 + for n in range(1, retries + 1): + try: + return fn(*args) + except xmlrpc.client.ProtocolError as err: + if n == retries: + traceback.print_exc() + fatal_err(f"A protocol error occurred (Err {err.errcode} {err.errmsg})") + else: + time.sleep(15) + except xmlrpc.client.Fault as err: + traceback.print_exc() + fatal_err(f"FATAL: Fault: {err.faultString} (code: {err.faultCode})", err) diff --git a/lib/mesa/.gitlab-ci/lava/utils/log_follower.py b/lib/mesa/.gitlab-ci/lava/utils/log_follower.py index b2bfcf36c..1fdf490bc 100644 --- 
a/lib/mesa/.gitlab-ci/lava/utils/log_follower.py +++ b/lib/mesa/.gitlab-ci/lava/utils/log_follower.py @@ -32,7 +32,9 @@ from lava.utils.log_section import ( @dataclass class LogFollower: - current_section: Optional[GitlabSection] = None + starting_section: Optional[GitlabSection] = None + _current_section: Optional[GitlabSection] = None + section_history: list[GitlabSection] = field(default_factory=list, init=False) timeout_durations: dict[LogSectionType, timedelta] = field( default_factory=lambda: DEFAULT_GITLAB_SECTION_TIMEOUTS, ) @@ -43,9 +45,11 @@ class LogFollower: _merge_next_line: str = field(default_factory=str, init=False) def __post_init__(self): - section_is_created = bool(self.current_section) + # Make it trigger current_section setter to populate section history + self.current_section = self.starting_section + section_is_created = bool(self._current_section) section_has_started = bool( - self.current_section and self.current_section.has_started + self._current_section and self._current_section.has_started ) self.log_hints = LAVALogHints(self) assert ( @@ -57,10 +61,20 @@ class LogFollower: next(self.gl_section_fix_gen) @property + def current_section(self): + return self._current_section + + @current_section.setter + def current_section(self, new_section: GitlabSection) -> None: + if old_section := self._current_section: + self.section_history.append(old_section) + self._current_section = new_section + + @property def phase(self) -> LogSectionType: return ( - self.current_section.type - if self.current_section + self._current_section.type + if self._current_section else LogSectionType.UNKNOWN ) @@ -75,22 +89,22 @@ class LogFollower: print(line) def watchdog(self): - if not self.current_section: + if not self._current_section: return timeout_duration = self.timeout_durations.get( - self.current_section.type, self.fallback_timeout + self._current_section.type, self.fallback_timeout ) - if self.current_section.delta_time() > timeout_duration: + if 
self._current_section.delta_time() > timeout_duration: raise MesaCITimeoutError( - f"Gitlab Section {self.current_section} has timed out", + f"Gitlab Section {self._current_section} has timed out", timeout_duration=timeout_duration, ) def clear_current_section(self): - if self.current_section and not self.current_section.has_finished: - self._buffer.append(self.current_section.end()) + if self._current_section and not self._current_section.has_finished: + self._buffer.append(self._current_section.end()) self.current_section = None def update_section(self, new_section: GitlabSection): @@ -110,6 +124,7 @@ class LogFollower: for log_section in LOG_SECTIONS: if new_section := log_section.from_log_line_to_section(line): self.update_section(new_section) + break def detect_kernel_dump_line(self, line: dict[str, Union[str, list]]) -> bool: # line["msg"] can be a list[str] when there is a kernel dump @@ -265,18 +280,31 @@ def fix_lava_gitlab_section_log(): -def print_log(msg: str) -> None: +def print_log(msg: str, *args) -> None: # Reset color from timestamp, since `msg` can tint the terminal color - print(f"{CONSOLE_LOG['RESET']}{datetime.now()}: {msg}") + print(f"{CONSOLE_LOG['RESET']}{datetime.now()}: {msg}", *args) -def fatal_err(msg): +def fatal_err(msg, exception=None): colored_msg = f"{CONSOLE_LOG['FG_RED']}" - f"{msg}" - f"{CONSOLE_LOG['RESET']}" - print_log(colored_msg) + print_log(colored_msg, f"{msg}", f"{CONSOLE_LOG['RESET']}") + if exception: + raise exception sys.exit(1) -def hide_sensitive_data(yaml_data: str, hide_tag: str ="HIDEME"): - return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line) +def hide_sensitive_data(yaml_data: str, start_hide: str = "HIDE_START", end_hide: str = "HIDE_END") -> str: + skip_line = False + dump_data: list[str] = [] + for line in yaml_data.splitlines(True): + if start_hide in line: + skip_line = True + elif end_hide in line: + skip_line = False + + if skip_line: + continue + + dump_data.append(line) + 
+ return "".join(dump_data) diff --git a/lib/mesa/.gitlab-ci/lava/utils/log_section.py b/lib/mesa/.gitlab-ci/lava/utils/log_section.py index b4072667e..25620a615 100644 --- a/lib/mesa/.gitlab-ci/lava/utils/log_section.py +++ b/lib/mesa/.gitlab-ci/lava/utils/log_section.py @@ -11,6 +11,7 @@ from lava.utils.gitlab_section import GitlabSection class LogSectionType(Enum): UNKNOWN = auto() LAVA_BOOT = auto() + TEST_DUT_SUITE = auto() TEST_SUITE = auto() TEST_CASE = auto() LAVA_POST_PROCESSING = auto() @@ -24,7 +25,11 @@ class LogSectionType(Enum): # the enqueue delay. LAVA_BOOT_TIMEOUT = int(getenv("LAVA_BOOT_TIMEOUT", 9)) -# Test suite phase is where the initialization happens. +# Test DUT suite phase is where the initialization happens in DUT, not on docker. +# The device will be listening to SSH session until the end of the job. +LAVA_TEST_DUT_SUITE_TIMEOUT = int(getenv("JOB_TIMEOUT", 60)) + +# Test suite phase is where the initialization happens on docker. LAVA_TEST_SUITE_TIMEOUT = int(getenv("LAVA_TEST_SUITE_TIMEOUT", 5)) # Test cases may take a long time, this script has no right to interrupt @@ -39,6 +44,7 @@ LAVA_POST_PROCESSING_TIMEOUT = int(getenv("LAVA_POST_PROCESSING_TIMEOUT", 5)) FALLBACK_GITLAB_SECTION_TIMEOUT = timedelta(minutes=10) DEFAULT_GITLAB_SECTION_TIMEOUTS = { LogSectionType.LAVA_BOOT: timedelta(minutes=LAVA_BOOT_TIMEOUT), + LogSectionType.TEST_DUT_SUITE: timedelta(minutes=LAVA_TEST_DUT_SUITE_TIMEOUT), LogSectionType.TEST_SUITE: timedelta(minutes=LAVA_TEST_SUITE_TIMEOUT), LogSectionType.TEST_CASE: timedelta(minutes=LAVA_TEST_CASE_TIMEOUT), LogSectionType.LAVA_POST_PROCESSING: timedelta( @@ -83,10 +89,17 @@ LOG_SECTIONS = ( section_type=LogSectionType.TEST_CASE, ), LogSection( + regex=re.compile(r"<?STARTRUN>? ([^>]*ssh.*server.*)"), + levels=("debug"), + section_id="{}", + section_header="[dut] test_suite {}", + section_type=LogSectionType.TEST_DUT_SUITE, + ), + LogSection( regex=re.compile(r"<?STARTRUN>? 
([^>]*)"), - levels=("target", "debug"), + levels=("debug"), section_id="{}", - section_header="test_suite {}", + section_header="[docker] test_suite {}", section_type=LogSectionType.TEST_SUITE, ), LogSection( diff --git a/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py new file mode 100644 index 000000000..1308e5ca9 --- /dev/null +++ b/lib/mesa/.gitlab-ci/lava/utils/ssh_job_definition.py @@ -0,0 +1,208 @@ +""" +In a few words: some devices in Mesa CI have problematic serial connections; they +may hang (become silent) intermittently. Every time one hangs for minutes, the +job is retried, causing delays in the overall pipeline execution, ultimately +blocking legitimate MRs from merging. + +To reduce reliance on UART, we explored LAVA features, such as running docker +containers as a test alongside the DUT one, to be able to create an SSH server +in the DUT as early as possible and an SSH client in a docker container, to +establish an SSH session between both, allowing the console output to be passed +via an SSH pseudo-terminal, instead of relying on the error-prone UART. + +In more detail, we aim to use "export -p" to share the initial boot environment +with SSH LAVA test-cases. +The "init-stage1.sh" script handles tasks such as system mounting and network +setup, which are necessary for allocating a pseudo-terminal under "/dev/pts". +Although these chores are not required for establishing an SSH session, they are +essential for the proper functioning of the target script given by the HWCI_SCRIPT +environment variable. + +Therefore, we have divided the job definition into four parts: + +1. [DUT] Logging in to the DUT and running the SSH server with root access. +2. [DUT] Running the "init-stage1.sh" script for the first SSH test case. +3. [DUT] Exporting the first boot environment to the `/dut-env-vars.sh` file. +4. 
[SSH] Enabling the pseudo-terminal for colors and running the "init-stage2.sh" +script after sourcing "dut-env-vars.sh" again for the second SSH test case. +""" + + +from pathlib import Path +from typing import Any + +from .lava_job_definition import ( + NUMBER_OF_ATTEMPTS_LAVA_BOOT, + artifact_download_steps, + generate_metadata, + to_yaml_block, +) + +# Very early SSH server setup. Uses /dut_ready file to flag it is done. +SSH_SERVER_COMMANDS = { + "auto_login": { + "login_commands": [ + "dropbear -R -B", + "touch /dut_ready", + ], + "login_prompt": "ogin:", + # To log in as root, the username should be empty + "username": "", + } +} + +# TODO: Extract this inline script to a shell file, like we do with +# init-stage[12].sh +# The current way is difficult to maintain because one has to deal with escaping +# characters for both Python and the resulting job definition YAML. +# Plus, it is always good to lint bash scripts with shellcheck. +DOCKER_COMMANDS = [ + """set -ex +timeout 1m bash << EOF +while [ -z "$(lava-target-ip)" ]; do + echo Waiting for DUT to join LAN; + sleep 1; +done +EOF + +ping -c 5 -w 60 $(lava-target-ip) + +lava_ssh_test_case() { + set -x + local test_case="${1}" + shift + lava-test-case \"${test_case}\" --shell \\ + ssh ${SSH_PTY_ARGS:--T} \\ + -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \\ + root@$(lava-target-ip) \"${@}\" +}""", +] + + +def generate_dut_test(args): + # Commands executed on DUT. + # Trying to execute the minimal number of commands, because the console data is + # retrieved via UART, which is hang-prone in some devices. 
+ + first_stage_steps: list[str] = Path(args.first_stage_init).read_text().splitlines() + return { + "namespace": "dut", + "definitions": [ + { + "from": "inline", + "name": "setup-ssh-server", + "path": "inline-setup-ssh-server", + "repository": { + "metadata": { + "format": "Lava-Test Test Definition 1.0", + "name": "dut-env-export", + }, + "run": { + "steps": [ + to_yaml_block(first_stage_steps), + "export -p > /dut-env-vars.sh", # Exporting the first boot environment + ], + }, + }, + } + ], + } + + +def generate_docker_test(args): + # This is a growing list of commands that will be executed by the docker + # guest, which will be the SSH client. + docker_commands = [] + + # LAVA test wrapping Mesa CI job in a SSH session. + init_stages_test = { + "namespace": "container", + "timeout": {"minutes": args.job_timeout_min}, + "failure_retry": 3, + "definitions": [ + { + "name": "docker_ssh_client", + "from": "inline", + "path": "inline/docker_ssh_client.yaml", + "repository": { + "metadata": { + "name": "mesa", + "description": "Mesa test plan", + "format": "Lava-Test Test Definition 1.0", + }, + "run": {"steps": docker_commands}, + }, + } + ], + "docker": { + "image": args.ssh_client_image, + }, + } + + docker_commands += [ + to_yaml_block(DOCKER_COMMANDS, escape_vars=["LAVA_TARGET_IP"]), + "lava_ssh_test_case 'wait_for_dut_login' << EOF", + "while [ ! 
-e /dut_ready ]; do sleep 1; done;", + "EOF", + to_yaml_block( + ( + "lava_ssh_test_case 'artifact_download' 'bash --' << EOF", + "source /dut-env-vars.sh", + *artifact_download_steps(args), + "EOF", + ) + ), + "export SSH_PTY_ARGS=-tt", + # Putting CI_JOB name as the testcase name, it may help LAVA farm + # maintainers with monitoring + f"lava_ssh_test_case 'mesa-ci_{args.mesa_job_name}' " + # Changing directory to /, as the HWCI_SCRIPT expects that + "'\"cd / && /init-stage2.sh\"'", + ] + + return init_stages_test + + +def generate_lava_yaml_payload(args) -> dict[str, Any]: + values = generate_metadata(args) + + # URLs to our kernel rootfs to boot from, both generated by the base + # container build + deploy = { + "namespace": "dut", + "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, + "timeout": {"minutes": 10}, + "timeouts": {"http-download": {"minutes": 2}}, + "to": "tftp", + "os": "oe", + "kernel": {"url": f"{args.kernel_url_prefix}/{args.kernel_image_name}"}, + "nfsrootfs": { + "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst", + "compression": "zstd", + }, + } + if args.kernel_image_type: + deploy["kernel"]["type"] = args.kernel_image_type + if args.dtb_filename: + deploy["dtb"] = {"url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"} + + # always boot over NFS + boot = { + "namespace": "dut", + "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, + "method": args.boot_method, + "commands": "nfs", + "prompts": ["lava-shell:"], + **SSH_SERVER_COMMANDS, + } + + # only declaring each job as a single 'test' since LAVA's test parsing is + # not useful to us + values["actions"] = [ + {"deploy": deploy}, + {"boot": boot}, + {"test": generate_dut_test(args)}, + {"test": generate_docker_test(args)}, + ] + + return values diff --git a/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py b/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py new file mode 100644 index 000000000..cd239c321 --- /dev/null +++ b/lib/mesa/.gitlab-ci/lava/utils/uart_job_definition.py 
@@ -0,0 +1,171 @@ +from typing import Any +from .lava_job_definition import ( + generate_metadata, + NUMBER_OF_ATTEMPTS_LAVA_BOOT, + artifact_download_steps, +) + + +def generate_lava_yaml_payload(args) -> dict[str, Any]: + values = generate_metadata(args) + + # URLs to our kernel rootfs to boot from, both generated by the base + # container build + + nfsrootfs = { + "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst", + "compression": "zstd", + } + + fastboot_deploy_nfs = { + "timeout": {"minutes": 10}, + "to": "nfs", + "nfsrootfs": nfsrootfs, + } + + fastboot_deploy_prepare = { + "timeout": {"minutes": 5}, + "to": "downloads", + "os": "oe", + "images": { + "kernel": { + "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", + }, + }, + "postprocess": { + "docker": { + "image": "registry.gitlab.collabora.com/lava/health-check-docker", + "steps": [ + f"cat Image.gz {args.dtb_filename}.dtb > Image.gz+dtb", + "mkbootimg --kernel Image.gz+dtb" + + ' --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard rootwait ip=dhcp init=/init"' + + " --pagesize 4096 --base 0x80000000 -o boot.img", + ], + } + }, + } + if args.kernel_image_type: + fastboot_deploy_prepare["images"]["kernel"]["type"] = args.kernel_image_type + if args.dtb_filename: + fastboot_deploy_prepare["images"]["dtb"] = { + "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb" + } + + tftp_deploy = { + "timeout": {"minutes": 5}, + "to": "tftp", + "os": "oe", + "kernel": { + "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", + }, + "nfsrootfs": nfsrootfs, + } + if args.kernel_image_type: + tftp_deploy["kernel"]["type"] = args.kernel_image_type + if args.dtb_filename: + tftp_deploy["dtb"] = { + "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb" + } + + fastboot_deploy = { + "timeout": {"minutes": 2}, + "to": "fastboot", + "docker": { + "image": "registry.gitlab.collabora.com/lava/health-check-docker", + }, + "images": { + "boot": {"url": "downloads://boot.img"}, + 
}, + } + + fastboot_boot = { + "timeout": {"minutes": 2}, + "docker": {"image": "registry.gitlab.collabora.com/lava/health-check-docker"}, + "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, + "method": args.boot_method, + "prompts": ["lava-shell:"], + "commands": ["set_active a"], + } + + tftp_boot = { + "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, + "method": args.boot_method, + "prompts": ["lava-shell:"], + "commands": "nfs", + } + + # skeleton test definition: only declaring each job as a single 'test' + # since LAVA's test parsing is not useful to us + run_steps = [] + test = { + "timeout": {"minutes": args.job_timeout_min}, + "failure_retry": 1, + "definitions": [ + { + "name": "mesa", + "from": "inline", + "lava-signal": "kmsg", + "path": "inline/mesa.yaml", + "repository": { + "metadata": { + "name": "mesa", + "description": "Mesa test plan", + "os": ["oe"], + "scope": ["functional"], + "format": "Lava-Test Test Definition 1.0", + }, + "run": {"steps": run_steps}, + }, + } + ], + } + + # job execution script: + # - inline .gitlab-ci/common/init-stage1.sh + # - fetch and unpack per-pipeline build artifacts from build job + # - fetch and unpack per-job environment from lava-submit.sh + # - exec .gitlab-ci/common/init-stage2.sh + + with open(args.first_stage_init, "r") as init_sh: + run_steps += [ + x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip() + ] + # We cannot distribute the Adreno 660 shader firmware inside rootfs, + # since the license isn't bundled inside the repository + if args.device_type == "sm8350-hdk": + run_steps.append( + "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " + + "https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn " + + '-o "/lib/firmware/qcom/sm8350/a660_zap.mbn"' + ) + + run_steps += artifact_download_steps(args) + + run_steps += [ + f"mkdir -p {args.ci_project_dir}", + f"curl {args.build_url} | tar --zstd -x -C {args.ci_project_dir}", + # Sleep a bit to give time for bash to dump 
shell xtrace messages into + # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some + # devices like a618. + "sleep 1", + # Putting CI_JOB name as the testcase name, it may help LAVA farm + # maintainers with monitoring + f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh", + ] + + if args.boot_method == "fastboot": + values["actions"] = [ + {"deploy": fastboot_deploy_nfs}, + {"deploy": fastboot_deploy_prepare}, + {"deploy": fastboot_deploy}, + {"boot": fastboot_boot}, + {"test": test}, + ] + else: # tftp + values["actions"] = [ + {"deploy": tftp_deploy}, + {"boot": tftp_boot}, + {"test": test}, + ] + + return values |