diff --git a/clanModules/single-disk/roles/default.nix b/clanModules/single-disk/roles/default.nix index 64edc68d6..0a6faf2bb 100644 --- a/clanModules/single-disk/roles/default.nix +++ b/clanModules/single-disk/roles/default.nix @@ -36,6 +36,7 @@ type = "filesystem"; format = "vfat"; mountpoint = "/boot"; + mountOptions = [ "umask=0077" ]; }; }; root = { diff --git a/machines/test-backup/facts/borgbackup.ssh.pub b/machines/test-backup/facts/borgbackup.ssh.pub index c305404cd..3b4baf37e 100644 --- a/machines/test-backup/facts/borgbackup.ssh.pub +++ b/machines/test-backup/facts/borgbackup.ssh.pub @@ -1 +1 @@ -ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBIbwIVnLy+uoDZ6uK/OCc1QK46SIGeC3mVc85dqLYQw lass@ignavia +ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEOdlvNTJPxTpMjuuNytGEUAO8NUuvL2nm9dpWZULCR6 nixbld@turingmachine diff --git a/machines/test-backup/facts/ssh.id_ed25519.pub b/machines/test-backup/facts/ssh.id_ed25519.pub index c305404cd..a23c24019 100644 --- a/machines/test-backup/facts/ssh.id_ed25519.pub +++ b/machines/test-backup/facts/ssh.id_ed25519.pub @@ -1 +1 @@ -ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBIbwIVnLy+uoDZ6uK/OCc1QK46SIGeC3mVc85dqLYQw lass@ignavia +ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILns3iEVA7MaN+K8qVRFywVOjBZsGyfRuBl26nGL/tXe nixbld@turingmachine diff --git a/nixosModules/clanCore/vm.nix b/nixosModules/clanCore/vm.nix index e666f7159..05951f6be 100644 --- a/nixosModules/clanCore/vm.nix +++ b/nixosModules/clanCore/vm.nix @@ -47,8 +47,8 @@ let ]; boot.initrd.systemd.emergencyAccess = true; - # sysusers would be faster because it doesn't need perl, but it cannot create normal users - systemd.sysusers.enable = true; + # sysusers would be faster because it doesn't need perl, but it cannot create normal users, so use userborn instead + services.userborn.enable = true; users.mutableUsers = false; users.allowNoPasswordLogin = true; diff --git a/nixosModules/clanCore/waypipe.nix b/nixosModules/clanCore/waypipe.nix index 6b590cd9f..20d830e96 100644 --- a/nixosModules/clanCore/waypipe.nix +++ 
b/nixosModules/clanCore/waypipe.nix @@ -45,9 +45,7 @@ security.sudo.wheelNeedsPassword = false; users.users.user = lib.mkIf (config.clan.services.waypipe.user == "user") { - # workaround sysusers - isSystemUser = true; - uid = 998; + uid = 1000; group = "users"; initialPassword = ""; extraGroups = [ diff --git a/pkgs/clan-cli/clan_cli/__init__.py b/pkgs/clan-cli/clan_cli/__init__.py index ef0bab226..b676f7a48 100644 --- a/pkgs/clan-cli/clan_cli/__init__.py +++ b/pkgs/clan-cli/clan_cli/__init__.py @@ -24,7 +24,7 @@ from . import ( from .clan_uri import FlakeId from .custom_logger import setup_logging from .dirs import get_clan_flake_toplevel_or_env -from .errors import ClanCmdError, ClanError +from .errors import ClanError from .facts import cli as facts from .flash import cli as flash_cli from .hyperlink import help_hyperlink @@ -401,7 +401,7 @@ def main() -> None: if len(sys.argv) == 1: parser.print_help() - if getattr(args, "debug", False): + if debug := getattr(args, "debug", False): setup_logging(logging.DEBUG, root_log_name=__name__.split(".")[0]) log.debug("Debug log activated") else: @@ -413,21 +413,11 @@ def main() -> None: try: args.func(args) except ClanError as e: - if isinstance(e, ClanCmdError): - msg = "" - if e.cmd.msg: - msg += f"{e.cmd.msg}: " - msg += f"command exited with code {e.cmd.returncode}: {e.cmd.command}" - log.error(msg) # noqa: TRY400 + if debug: + log.exception("Exited with error") + else: + log.error("%s", e) # noqa: TRY400 sys.exit(1) - - if not e.msg: # should not be empty, print stack trace - raise - msg = e.msg - if e.description: - msg += f": {e.description}" - log.error(msg) # noqa: TRY400 - sys.exit(1) except KeyboardInterrupt: log.warning("Interrupted by user") sys.exit(1) diff --git a/pkgs/clan-cli/clan_cli/cmd.py b/pkgs/clan-cli/clan_cli/cmd.py index b896b6e06..d95e7db0e 100644 --- a/pkgs/clan-cli/clan_cli/cmd.py +++ b/pkgs/clan-cli/clan_cli/cmd.py @@ -131,8 +131,6 @@ def run( if input: process.communicate(input) - else: - 
process.wait() tend = datetime.datetime.now(tz=datetime.UTC) global TIME_TABLE diff --git a/pkgs/clan-cli/clan_cli/dirs.py b/pkgs/clan-cli/clan_cli/dirs.py index 8dc2b16ff..0786376d9 100644 --- a/pkgs/clan-cli/clan_cli/dirs.py +++ b/pkgs/clan-cli/clan_cli/dirs.py @@ -4,6 +4,8 @@ import sys import urllib from pathlib import Path +from clan_cli.clan_uri import FlakeId + from .errors import ClanError log = logging.getLogger(__name__) @@ -100,8 +102,8 @@ def user_history_file() -> Path: return user_config_dir() / "clan" / "history" -def vm_state_dir(flake_url: str, vm_name: str) -> Path: - clan_key = clan_key_safe(flake_url) +def vm_state_dir(flake_url: FlakeId, vm_name: str) -> Path: + clan_key = clan_key_safe(str(flake_url)) return user_data_dir() / "clan" / "vmstate" / clan_key / vm_name diff --git a/pkgs/clan-cli/clan_cli/facts/generate.py b/pkgs/clan-cli/clan_cli/facts/generate.py index 7e5200a9c..d16d2df10 100644 --- a/pkgs/clan-cli/clan_cli/facts/generate.py +++ b/pkgs/clan-cli/clan_cli/facts/generate.py @@ -218,6 +218,9 @@ def generate_facts( def generate_command(args: argparse.Namespace) -> None: + if args.flake is None: + msg = "Could not find clan flake toplevel directory" + raise ClanError(msg) if len(args.machines) == 0: machines = get_all_machines(args.flake, args.option) else: diff --git a/pkgs/clan-cli/clan_cli/facts/public_modules/vm.py b/pkgs/clan-cli/clan_cli/facts/public_modules/vm.py index 631b4d6fb..adfe75370 100644 --- a/pkgs/clan-cli/clan_cli/facts/public_modules/vm.py +++ b/pkgs/clan-cli/clan_cli/facts/public_modules/vm.py @@ -14,7 +14,7 @@ class FactStore(FactStoreBase): def __init__(self, machine: Machine) -> None: self.machine = machine self.works_remotely = False - self.dir = vm_state_dir(str(machine.flake), machine.name) / "facts" + self.dir = vm_state_dir(machine.flake, machine.name) / "facts" log.debug(f"FactStore initialized with dir {self.dir}") def exists(self, service: str, name: str) -> bool: diff --git 
a/pkgs/clan-cli/clan_cli/facts/secret_modules/vm.py b/pkgs/clan-cli/clan_cli/facts/secret_modules/vm.py index 0acf09c34..befcdf766 100644 --- a/pkgs/clan-cli/clan_cli/facts/secret_modules/vm.py +++ b/pkgs/clan-cli/clan_cli/facts/secret_modules/vm.py @@ -10,7 +10,7 @@ from . import SecretStoreBase class SecretStore(SecretStoreBase): def __init__(self, machine: Machine) -> None: self.machine = machine - self.dir = vm_state_dir(str(machine.flake), machine.name) / "secrets" + self.dir = vm_state_dir(machine.flake, machine.name) / "secrets" self.dir.mkdir(parents=True, exist_ok=True) def set( diff --git a/pkgs/clan-cli/clan_cli/qemu/qga.py b/pkgs/clan-cli/clan_cli/qemu/qga.py index 0cae576a1..76f3905c6 100644 --- a/pkgs/clan-cli/clan_cli/qemu/qga.py +++ b/pkgs/clan-cli/clan_cli/qemu/qga.py @@ -1,9 +1,17 @@ import base64 -import json -import socket -from time import sleep +import time +import types +from dataclasses import dataclass from clan_cli.errors import ClanError +from clan_cli.qemu.qmp import QEMUMonitorProtocol + + +@dataclass +class VmCommandResult: + returncode: int + stdout: str | None + stderr: str | None # qga is almost like qmp, but not quite, because: @@ -11,64 +19,64 @@ from clan_cli.errors import ClanError # - no need to initialize by asking for capabilities # - results need to be base64 decoded class QgaSession: - def __init__(self, sock: socket.socket) -> None: - self.sock = sock + def __init__(self, address: str) -> None: + self.client = QEMUMonitorProtocol(address) + self.client.connect(negotiate=False) - def get_response(self) -> dict: - result = self.sock.recv(9999999) - return json.loads(result) + def __enter__(self) -> "QgaSession": + # Implement context manager enter function. 
+ return self - # only execute, don't wait for response - def exec_cmd(self, cmd: str) -> None: - self.sock.send( - json.dumps( - { - "execute": "guest-exec", - "arguments": { - "path": "/bin/sh", - "arg": ["-l", "-c", cmd], - "capture-output": True, - }, - } - ).encode("utf-8") + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: types.TracebackType | None, + ) -> None: + # Implement context manager exit function. + self.client.close() + + def run_nonblocking(self, cmd: list[str]) -> int: + result_pid = self.client.cmd( + "guest-exec", {"path": cmd[0], "arg": cmd[1:], "capture-output": True} ) + if result_pid is None: + msg = "Could not get PID from QGA" + raise ClanError(msg) + try: + return result_pid["return"]["pid"] + except KeyError as e: + if "error" in result_pid: + msg = f"Could not run command: {result_pid['error']['desc']}" + raise ClanError(msg) from e + msg = f"PID could not be found: {result_pid}" + raise ClanError(msg) from e # run, wait for result, return exitcode and output - def run(self, cmd: str, check: bool = False) -> tuple[int, str, str]: - self.exec_cmd(cmd) - result_pid = self.get_response() - pid = result_pid["return"]["pid"] + def run(self, cmd: list[str], check: bool = True) -> VmCommandResult: + pid = self.run_nonblocking(cmd) # loop until exited=true - status_payload = json.dumps( - { - "execute": "guest-exec-status", - "arguments": { - "pid": pid, - }, - } - ).encode("utf-8") while True: - self.sock.send(status_payload) - result = self.get_response() + result = self.client.cmd("guest-exec-status", {"pid": pid}) + if result is None: + msg = "Could not get status from QGA" + raise ClanError(msg) if "error" in result and result["error"]["desc"].startswith("PID"): msg = "PID could not be found" raise ClanError(msg) if result["return"]["exited"]: break - sleep(0.1) + time.sleep(0.1) exitcode = result["return"]["exitcode"] - stdout = ( - "" - if "out-data" not in result["return"] - 
else base64.b64decode(result["return"]["out-data"]).decode("utf-8") - ) - stderr = ( - "" - if "err-data" not in result["return"] - else base64.b64decode(result["return"]["err-data"]).decode("utf-8") - ) + err_data = result["return"].get("err-data") + stdout = None + stderr = None + if out_data := result["return"].get("out-data"): + stdout = base64.b64decode(out_data).decode("utf-8") + if err_data is not None: + stderr = base64.b64decode(err_data).decode("utf-8") if check and exitcode != 0: msg = f"Command on guest failed\nCommand: {cmd}\nExitcode {exitcode}\nStdout: {stdout}\nStderr: {stderr}" raise ClanError(msg) - return exitcode, stdout, stderr + return VmCommandResult(exitcode, stdout, stderr) diff --git a/pkgs/clan-cli/clan_cli/vars/generate.py b/pkgs/clan-cli/clan_cli/vars/generate.py index 144a0dd0e..c1454dc6d 100644 --- a/pkgs/clan-cli/clan_cli/vars/generate.py +++ b/pkgs/clan-cli/clan_cli/vars/generate.py @@ -367,6 +367,9 @@ def generate_vars( def generate_command(args: argparse.Namespace) -> None: + if args.flake is None: + msg = "Could not find clan flake toplevel directory" + raise ClanError(msg) if len(args.machines) == 0: machines = get_all_machines(args.flake, args.option) else: diff --git a/pkgs/clan-cli/clan_cli/vars/public_modules/vm.py b/pkgs/clan-cli/clan_cli/vars/public_modules/vm.py index 65860f212..e54bb1633 100644 --- a/pkgs/clan-cli/clan_cli/vars/public_modules/vm.py +++ b/pkgs/clan-cli/clan_cli/vars/public_modules/vm.py @@ -14,7 +14,7 @@ class FactStore(FactStoreBase): def __init__(self, machine: Machine) -> None: self.machine = machine self.works_remotely = False - self.dir = vm_state_dir(str(machine.flake), machine.name) / "facts" + self.dir = vm_state_dir(machine.flake, machine.name) / "facts" log.debug(f"FactStore initialized with dir {self.dir}") @property diff --git a/pkgs/clan-cli/clan_cli/vars/secret_modules/vm.py b/pkgs/clan-cli/clan_cli/vars/secret_modules/vm.py index e8e52f777..2aef6770b 100644 --- 
a/pkgs/clan-cli/clan_cli/vars/secret_modules/vm.py +++ b/pkgs/clan-cli/clan_cli/vars/secret_modules/vm.py @@ -10,7 +10,7 @@ from . import SecretStoreBase class SecretStore(SecretStoreBase): def __init__(self, machine: Machine) -> None: self.machine = machine - self.dir = vm_state_dir(str(machine.flake), machine.name) / "secrets" + self.dir = vm_state_dir(machine.flake, machine.name) / "secrets" self.dir.mkdir(parents=True, exist_ok=True) @property diff --git a/pkgs/clan-cli/clan_cli/vms/inspect.py b/pkgs/clan-cli/clan_cli/vms/inspect.py index ecbfd6ba4..ada68be7c 100644 --- a/pkgs/clan-cli/clan_cli/vms/inspect.py +++ b/pkgs/clan-cli/clan_cli/vms/inspect.py @@ -17,14 +17,17 @@ class WaypipeConfig: @dataclass class VmConfig: machine_name: str - machine_icon: Path - machine_description: str flake_url: FlakeId - clan_name: str cores: int memory_size: int graphics: bool + + # FIXME: I don't think this belongs here. + clan_name: str + machine_description: str | None + machine_icon: Path | None + waypipe: bool = False def __post_init__(self) -> None: diff --git a/pkgs/clan-cli/clan_cli/vms/qemu.py b/pkgs/clan-cli/clan_cli/vms/qemu.py index 9cdbb55d9..5a2315f11 100644 --- a/pkgs/clan-cli/clan_cli/vms/qemu.py +++ b/pkgs/clan-cli/clan_cli/vms/qemu.py @@ -92,10 +92,11 @@ def qemu_command( virtiofsd_socket: Path, qmp_socket_file: Path, qga_socket_file: Path, - portmap: list[tuple[int, int]] | None = None, + portmap: dict[int, int] | None = None, + interactive: bool = False, ) -> QemuCommand: if portmap is None: - portmap = [] + portmap = {} kernel_cmdline = [ (Path(nixos_config["toplevel"]) / "kernel-params").read_text(), f'init={nixos_config["toplevel"]}/init', @@ -104,7 +105,7 @@ def qemu_command( ] if not vm.waypipe: kernel_cmdline.append("console=tty0") - hostfwd = ",".join(f"hostfwd=tcp::{h}-:{g}" for h, g in portmap) + hostfwd = ",".join(f"hostfwd=tcp::{h}-:{g}" for h, g in portmap.items()) # fmt: off command = [ "qemu-kvm", @@ -137,12 +138,20 @@ def qemu_command( 
"-chardev", f"socket,path={qga_socket_file},server=on,wait=off,id=qga0", "-device", "virtio-serial", "-device", "virtserialport,chardev=qga0,name=org.qemu.guest_agent.0", - - "-serial", "null", - "-chardev", "stdio,mux=on,id=char0,signal=off", - "-mon", "chardev=char0,mode=readline", - "-device", "virtconsole,chardev=char0,nr=0", ] # fmt: on + if interactive: + command.extend([ + "-serial", "null", + "-chardev", "stdio,mux=on,id=char0,signal=off", + "-mon", "chardev=char0,mode=readline", + "-device", "virtconsole,chardev=char0,nr=0", + ]) + else: + command.extend([ + "-serial", "null", + "-chardev", "file,id=char0,path=/dev/stdout", + "-device", "virtconsole,chardev=char0,nr=0", + ]) vsock_cid = None if vm.graphics: diff --git a/pkgs/clan-cli/clan_cli/vms/run.py b/pkgs/clan-cli/clan_cli/vms/run.py index 73dd2ecf2..dda9a4d14 100644 --- a/pkgs/clan-cli/clan_cli/vms/run.py +++ b/pkgs/clan-cli/clan_cli/vms/run.py @@ -3,17 +3,24 @@ import importlib import json import logging import os -from contextlib import ExitStack +import subprocess +import time +from collections.abc import Iterator +from concurrent.futures import ThreadPoolExecutor +from contextlib import ExitStack, contextmanager +from dataclasses import dataclass from pathlib import Path from tempfile import TemporaryDirectory -from clan_cli.cmd import Log, run +from clan_cli.cmd import CmdOut, Log, handle_output, run from clan_cli.completions import add_dynamic_completer, complete_machines from clan_cli.dirs import module_root, user_cache_dir, vm_state_dir -from clan_cli.errors import ClanError +from clan_cli.errors import ClanCmdError, ClanError from clan_cli.facts.generate import generate_facts from clan_cli.machines.machines import Machine from clan_cli.nix import nix_shell +from clan_cli.qemu.qga import QgaSession +from clan_cli.qemu.qmp import QEMUMonitorProtocol from .inspect import VmConfig, inspect_vm from .qemu import qemu_command @@ -25,13 +32,14 @@ log = logging.getLogger(__name__) def 
facts_to_nixos_config(facts: dict[str, dict[str, bytes]]) -> dict: nixos_config: dict = {} - nixos_config["clanCore"] = {} - nixos_config["clanCore"]["secrets"] = {} + nixos_config["clan"] = {} + nixos_config["clan"]["core"] = {} + nixos_config["clan"]["core"]["secrets"] = {} for service, service_facts in facts.items(): - nixos_config["clanCore"]["secrets"][service] = {} - nixos_config["clanCore"]["secrets"][service]["facts"] = {} + nixos_config["clan"]["core"]["secrets"][service] = {} + nixos_config["clan"]["core"]["secrets"][service]["facts"] = {} for fact, value in service_facts.items(): - nixos_config["clanCore"]["secrets"][service]["facts"][fact] = { + nixos_config["clan"]["core"]["secrets"][service]["facts"][fact] = { "value": value.decode() } return nixos_config @@ -107,16 +115,101 @@ def prepare_disk( return disk_img -def run_vm( +@contextmanager +def start_vm( + args: list[str], + packages: list[str], + extra_env: dict[str, str], + stdout: int | None = None, + stderr: int | None = None, + stdin: int | None = None, +) -> Iterator[subprocess.Popen]: + env = os.environ.copy() + env.update(extra_env) + cmd = nix_shell(packages, args) + with subprocess.Popen( + cmd, env=env, stdout=stdout, stderr=stderr, stdin=stdin + ) as process: + try: + yield process + finally: + process.terminate() + try: + # Fix me: This should in future properly shutdown the VM using qmp + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + + +class QemuVm: + def __init__( + self, + machine: Machine, + process: subprocess.Popen, + socketdir: Path, + ) -> None: + self.machine = machine + self.process = process + self.qmp_socket_file = socketdir / "qmp.sock" + self.qga_socket_file = socketdir / "qga.sock" + + # wait for vm to be up then connect and return qmp instance + @contextmanager + def qmp_connect(self) -> Iterator[QEMUMonitorProtocol]: + with QEMUMonitorProtocol( + address=str(os.path.realpath(self.qmp_socket_file)), + ) as qmp: + qmp.connect() + yield qmp 
+ + @contextmanager + def qga_connect(self, timeout_sec: float = 100) -> Iterator[QgaSession]: + start_time = time.time() + # try to reconnect a couple of times if connection refused + session = None + while time.time() - start_time < timeout_sec: + try: + session = QgaSession(str(self.qga_socket_file)) + break + except ConnectionRefusedError: + time.sleep(0.1) + continue + if session is None: + msg = ( + f"Timeout after {timeout_sec} seconds. Could not connect to QGA socket" + ) + raise ClanError(msg) + with session: + yield session + + def wait_up(self, timeout_sec: float = 60) -> None: + start_time = time.time() + while time.time() - start_time < timeout_sec: + if self.process.poll() is not None: + msg = f"VM failed to start. Qemu process exited with code {self.process.returncode}" + raise ClanError(msg) + if self.qmp_socket_file.exists(): + break + time.sleep(0.1) + + def wait_down(self) -> int: + return self.process.wait() + + +@contextmanager +def spawn_vm( vm: VmConfig, *, cachedir: Path | None = None, socketdir: Path | None = None, nix_options: list[str] | None = None, - portmap: list[tuple[int, int]] | None = None, -) -> None: + portmap: dict[int, int] | None = None, + stdout: int | None = None, + stderr: int | None = None, + stdin: int | None = None, +) -> Iterator[QemuVm]: if portmap is None: - portmap = [] + portmap = {} if nix_options is None: nix_options = [] with ExitStack() as stack:
vm.waypipe: packages.append("nixpkgs#virt-viewer") remote_viewer_mimetypes = module_root() / "vms" / "mimetypes" - env["XDG_DATA_DIRS"] = ( - f"{remote_viewer_mimetypes}:{env.get('XDG_DATA_DIRS', '')}" + extra_env["XDG_DATA_DIRS"] = ( + f"{remote_viewer_mimetypes}:{os.environ.get('XDG_DATA_DIRS', '')}" ) with ( start_waypipe(qemu_cmd.vsock_cid, f"[{vm.machine_name}] "), start_virtiofsd(virtiofsd_socket), + start_vm( + qemu_cmd.args, packages, extra_env, stdout=stdout, stderr=stderr + ) as process, ): - run( - nix_shell(packages, qemu_cmd.args), - env=env, - log=Log.BOTH, - error_msg=f"Could not start vm {machine}", - ) + qemu_vm = QemuVm(machine, process, socketdir) + qemu_vm.wait_up() + + try: + yield qemu_vm + finally: + try: + with qemu_vm.qmp_connect() as qmp: + qmp.command("system_powerdown") + qemu_vm.wait_down() + except OSError: + pass + # TODO: add a timeout here instead of waiting indefinitely + + +@dataclass +class RuntimeConfig: + cachedir: Path | None = None + socketdir: Path | None = None + nix_options: list[str] | None = None + portmap: dict[int, int] | None = None + command: list[str] | None = None + no_block: bool = False + + +def run_vm( + vm_config: VmConfig, + runtime_config: RuntimeConfig, +) -> CmdOut: + stdin = None + if runtime_config.command is not None: + stdin = subprocess.DEVNULL + with ( + spawn_vm( + vm_config, + cachedir=runtime_config.cachedir, + socketdir=runtime_config.socketdir, + nix_options=runtime_config.nix_options, + portmap=runtime_config.portmap, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + stdin=stdin, + ) as vm, + ThreadPoolExecutor(max_workers=1) as executor, + ): + future = executor.submit(handle_output, vm.process, Log.BOTH) + args: list[str] = vm.process.args # type: ignore[assignment] + + if runtime_config.command is not None: + with vm.qga_connect() as qga: + if runtime_config.no_block: + qga.run_nonblocking(runtime_config.command) + else: + qga.run(runtime_config.command) + + stdout_buf, stderr_buf = 
future.result() + + rc = vm.wait_down() + cmd_out = CmdOut( + stdout=stdout_buf, + stderr=stderr_buf, + cwd=Path.cwd(), + command_list=args, + returncode=vm.process.returncode, + msg=f"VM {vm_config.machine_name} exited with code {rc}", + env={}, + ) + if rc != 0: + raise ClanCmdError(cmd_out) + return cmd_out def run_command( @@ -212,9 +373,16 @@ def run_command( vm: VmConfig = inspect_vm(machine=machine_obj) - portmap = [p.split(":") for p in args.publish] + portmap = dict(p.split(":") for p in args.publish) - run_vm(vm, nix_options=args.option, portmap=portmap) + runtime_config = RuntimeConfig( + nix_options=args.option, + portmap=portmap, + command=args.command, + no_block=args.no_block, + ) + + run_vm(vm, runtime_config) def register_run_parser(parser: argparse.ArgumentParser) -> None: @@ -231,4 +399,15 @@ def register_run_parser(parser: argparse.ArgumentParser) -> None: default=[], help="Forward ports from host to guest", ) + parser.add_argument( + "--no-block", + action="store_true", + help="Do no block when running the command", + default=False, + ) + parser.add_argument( + "command", + nargs=argparse.REMAINDER, + help="command to run in the vm", + ) parser.set_defaults(func=lambda args: run_command(args)) diff --git a/pkgs/clan-cli/pyproject.toml b/pkgs/clan-cli/pyproject.toml index a9a6d7023..d388d2a64 100644 --- a/pkgs/clan-cli/pyproject.toml +++ b/pkgs/clan-cli/pyproject.toml @@ -33,7 +33,7 @@ testpaths = "tests" faulthandler_timeout = 60 log_level = "DEBUG" log_format = "%(levelname)s: %(message)s\n %(pathname)s:%(lineno)d::%(funcName)s" -addopts = "--cov . --cov-report term --cov-report html:.reports/html --no-cov-on-fail --durations 5 --color=yes --new-first -n auto" # Add --pdb for debugging +addopts = "--cov . 
--cov-report term --cov-report html:.reports/html --no-cov-on-fail --durations 5 --color=yes --new-first -W error -n auto" # Add --pdb for debugging norecursedirs = "tests/helpers" markers = ["impure", "with_core"] filterwarnings = "default::ResourceWarning" diff --git a/pkgs/clan-cli/tests/helpers/vms.py b/pkgs/clan-cli/tests/helpers/vms.py deleted file mode 100644 index 0b0247ea6..000000000 --- a/pkgs/clan-cli/tests/helpers/vms.py +++ /dev/null @@ -1,132 +0,0 @@ -import contextlib -import os -import socket -import sys -import threading -import traceback -from collections.abc import Iterator -from pathlib import Path -from time import sleep - -from clan_cli.dirs import vm_state_dir -from clan_cli.errors import ClanError -from clan_cli.qemu.qga import QgaSession -from clan_cli.qemu.qmp import QEMUMonitorProtocol - -from . import cli - - -def find_free_port() -> int: - """Find an unused localhost port from 1024-65535 and return it.""" - with contextlib.closing(socket.socket(type=socket.SOCK_STREAM)) as sock: - sock.bind(("127.0.0.1", 0)) - return sock.getsockname()[1] - - -class VmThread(threading.Thread): - def __init__(self, machine_name: str, ssh_port: int | None = None) -> None: - super().__init__() - self.machine_name = machine_name - self.ssh_port = ssh_port - self.exception: Exception | None = None - self.daemon = True - - def run(self) -> None: - try: - cli.run( - ["vms", "run", self.machine_name, "--publish", f"{self.ssh_port}:22"] - ) - except Exception as ex: - # print exception details - print(traceback.format_exc(), file=sys.stderr) - print(sys.exc_info()[2], file=sys.stderr) - self.exception = ex - - -def run_vm_in_thread(machine_name: str, ssh_port: int | None = None) -> VmThread: - # runs machine and prints exceptions - if ssh_port is None: - ssh_port = find_free_port() - - vm_thread = VmThread(machine_name, ssh_port) - vm_thread.start() - return vm_thread - - -# wait for qmp socket to exist -def wait_vm_up(machine_name: str, vm: VmThread, flake_url: 
str | None = None) -> None: - if flake_url is None: - flake_url = str(Path.cwd()) - socket_file = vm_state_dir(flake_url, machine_name) / "qmp.sock" - timeout: float = 600 - while True: - if vm.exception: - msg = "VM failed to start" - raise ClanError(msg) from vm.exception - if timeout <= 0: - msg = f"qmp socket {socket_file} not found. Is the VM running?" - raise TimeoutError(msg) - if socket_file.exists(): - break - sleep(0.1) - timeout -= 0.1 - - -# wait for vm to be down by checking if qmp socket is down -def wait_vm_down(machine_name: str, vm: VmThread, flake_url: str | None = None) -> None: - if flake_url is None: - flake_url = str(Path.cwd()) - socket_file = vm_state_dir(flake_url, machine_name) / "qmp.sock" - timeout: float = 300 - while socket_file.exists(): - if vm.exception: - msg = "VM failed to start" - raise ClanError(msg) from vm.exception - if timeout <= 0: - msg = f"qmp socket {socket_file} still exists. Is the VM down?" - raise TimeoutError(msg) - sleep(0.1) - timeout -= 0.1 - - -# wait for vm to be up then connect and return qmp instance -@contextlib.contextmanager -def qmp_connect( - machine_name: str, vm: VmThread, flake_url: str | None = None -) -> Iterator[QEMUMonitorProtocol]: - if flake_url is None: - flake_url = str(Path.cwd()) - state_dir = vm_state_dir(flake_url, machine_name) - wait_vm_up(machine_name, vm, flake_url) - with QEMUMonitorProtocol( - address=str(os.path.realpath(state_dir / "qmp.sock")), - ) as qmp: - qmp.connect() - yield qmp - - -# wait for vm to be up then connect and return qga instance -@contextlib.contextmanager -def qga_connect( - machine_name: str, vm: VmThread, flake_url: str | None = None -) -> Iterator[QgaSession]: - if flake_url is None: - flake_url = str(Path.cwd()) - state_dir = vm_state_dir(flake_url, machine_name) - wait_vm_up(machine_name, vm, flake_url) - - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - try: - # try to reconnect a couple of times if connection refused - socket_file = 
os.path.realpath(state_dir / "qga.sock") - for _ in range(100): - try: - sock.connect(str(socket_file)) - except ConnectionRefusedError: - sleep(0.1) - else: - break - sock.connect(str(socket_file)) - yield QgaSession(sock) - finally: - sock.close() diff --git a/pkgs/clan-cli/tests/test_dirs.py b/pkgs/clan-cli/tests/test_dirs.py index ac2626834..bc1e48e64 100644 --- a/pkgs/clan-cli/tests/test_dirs.py +++ b/pkgs/clan-cli/tests/test_dirs.py @@ -16,6 +16,7 @@ # (subdir / ".clan-flake").touch() # assert _get_clan_flake_toplevel() == subdir +from clan_cli.clan_uri import FlakeId from clan_cli.dirs import clan_key_safe, vm_state_dir @@ -24,12 +25,12 @@ def test_clan_key_safe() -> None: def test_vm_state_dir_identity() -> None: - dir1 = vm_state_dir("https://some.clan", "vm1") - dir2 = vm_state_dir("https://some.clan", "vm1") + dir1 = vm_state_dir(FlakeId("https://some.clan"), "vm1") + dir2 = vm_state_dir(FlakeId("https://some.clan"), "vm1") assert str(dir1) == str(dir2) def test_vm_state_dir_no_collision() -> None: - dir1 = vm_state_dir("/foo/bar", "vm1") - dir2 = vm_state_dir("https://some.clan", "vm1") + dir1 = vm_state_dir(FlakeId("/foo/bar"), "vm1") + dir2 = vm_state_dir(FlakeId("https://some.clan"), "vm1") assert str(dir1) != str(dir2) diff --git a/pkgs/clan-cli/tests/test_flake_with_core/flake.nix b/pkgs/clan-cli/tests/test_flake_with_core/flake.nix index c67837be5..6c94a2505 100644 --- a/pkgs/clan-cli/tests/test_flake_with_core/flake.nix +++ b/pkgs/clan-cli/tests/test_flake_with_core/flake.nix @@ -24,16 +24,6 @@ clan.core.networking.zerotier.controller.enable = true; networking.useDHCP = false; - - systemd.services.shutdown-after-boot = { - enable = true; - wantedBy = [ "multi-user.target" ]; - after = [ "multi-user.target" ]; - script = '' - #!/usr/bin/env bash - shutdown -h now - ''; - }; }; vm2 = { lib, ... 
}: diff --git a/pkgs/clan-cli/tests/test_vars_deployment.py b/pkgs/clan-cli/tests/test_vars_deployment.py index 22296909b..996d92d09 100644 --- a/pkgs/clan-cli/tests/test_vars_deployment.py +++ b/pkgs/clan-cli/tests/test_vars_deployment.py @@ -1,14 +1,17 @@ import json +from contextlib import ExitStack from pathlib import Path import pytest from age_keys import SopsSetup from clan_cli import cmd +from clan_cli.clan_uri import FlakeId +from clan_cli.machines.machines import Machine from clan_cli.nix import nix_eval, run +from clan_cli.vms.run import inspect_vm, spawn_vm from fixtures_flakes import generate_flake from helpers import cli from helpers.nixos_config import nested_dict -from helpers.vms import qga_connect, run_vm_in_thread, wait_vm_down from root import CLAN_CORE @@ -61,9 +64,10 @@ def test_vm_deployment( flake_template=CLAN_CORE / "templates" / "minimal", machine_configs={"m1_machine": machine1_config, "m2_machine": machine2_config}, ) - monkeypatch.chdir(flake.path) - sops_setup.init() - cli.run(["vars", "generate"]) + + sops_setup.init(flake.path) + cli.run(["vars", "generate", "--flake", str(flake.path)]) + # check sops secrets not empty for machine in ["m1_machine", "m2_machine"]: sops_secrets = json.loads( @@ -94,37 +98,31 @@ def test_vm_deployment( ).stdout.strip() assert "no-such-path" not in shared_secret_path # run nix flake lock - cmd.run(["nix", "flake", "lock"]) - vm_m1 = run_vm_in_thread("m1_machine") - vm_m2 = run_vm_in_thread("m2_machine") - with ( - qga_connect("m1_machine", vm_m1) as qga_m1, - qga_connect("m2_machine", vm_m2) as qga_m2, - ): + cmd.run(["nix", "flake", "lock"], cwd=flake.path) + + vm1_config = inspect_vm(machine=Machine("m1_machine", FlakeId(str(flake.path)))) + vm2_config = inspect_vm(machine=Machine("m2_machine", FlakeId(str(flake.path)))) + with ExitStack() as stack: + vm1 = stack.enter_context(spawn_vm(vm1_config)) + vm2 = stack.enter_context(spawn_vm(vm2_config)) + qga_m1 = stack.enter_context(vm1.qga_connect()) + 
qga_m2 = stack.enter_context(vm2.qga_connect()) # check my_secret is deployed - _, out, _ = qga_m1.run( - "cat /run/secrets/vars/m1_generator/my_secret", check=True - ) - assert out == "hello\n" + result = qga_m1.run(["cat", "/run/secrets/vars/m1_generator/my_secret"]) + assert result.stdout == "hello\n" # check shared_secret is deployed on m1 - _, out, _ = qga_m1.run( - "cat /run/secrets/vars/my_shared_generator/shared_secret", check=True + result = qga_m1.run( + ["cat", "/run/secrets/vars/my_shared_generator/shared_secret"] ) - assert out == "hello\n" + assert result.stdout == "hello\n" # check shared_secret is deployed on m2 - _, out, _ = qga_m2.run( - "cat /run/secrets/vars/my_shared_generator/shared_secret", check=True + result = qga_m2.run( + ["cat", "/run/secrets/vars/my_shared_generator/shared_secret"] ) - assert out == "hello\n" + assert result.stdout == "hello\n" # check no_deploy_secret is not deployed - returncode, out, _ = qga_m1.run( - "test -e /run/secrets/vars/my_shared_generator/no_deploy_secret", + result = qga_m1.run( + ["test", "-e", "/run/secrets/vars/my_shared_generator/no_deploy_secret"], check=False, ) - assert returncode != 0 - qga_m1.exec_cmd("poweroff") - qga_m2.exec_cmd("poweroff") - wait_vm_down("m1_machine", vm_m1) - wait_vm_down("m2_machine", vm_m2) - vm_m1.join() - vm_m2.join() + assert result.returncode != 0 diff --git a/pkgs/clan-cli/tests/test_vms_cli.py b/pkgs/clan-cli/tests/test_vms_cli.py index 7944fecf0..7957add0a 100644 --- a/pkgs/clan-cli/tests/test_vms_cli.py +++ b/pkgs/clan-cli/tests/test_vms_cli.py @@ -2,10 +2,12 @@ from pathlib import Path from typing import TYPE_CHECKING import pytest +from clan_cli.clan_uri import FlakeId +from clan_cli.machines.machines import Machine +from clan_cli.vms.run import inspect_vm, spawn_vm from fixtures_flakes import FlakeForTest, generate_flake from helpers import cli from helpers.nixos_config import nested_dict -from helpers.vms import qga_connect, qmp_connect, run_vm_in_thread, 
wait_vm_down from root import CLAN_CORE from stdout import CaptureOutput @@ -51,7 +53,7 @@ def test_run( "user1", ] ) - cli.run(["vms", "run", "vm1"]) + cli.run(["vms", "run", "--no-block", "vm1", "shutdown", "-h", "now"]) @pytest.mark.skipif(no_kvm, reason="Requires KVM") @@ -66,7 +68,7 @@ def test_vm_persistence( config["my_machine"]["systemd"]["services"]["logrotate-checkconf"]["enable"] = False config["my_machine"]["services"]["getty"]["autologinUser"] = "root" config["my_machine"]["clan"]["virtualisation"] = {"graphics": False} - config["my_machine"]["clan"]["networking"] = {"targetHost": "client"} + config["my_machine"]["clan"]["core"]["networking"] = {"targetHost": "client"} config["my_machine"]["clan"]["core"]["state"]["my_state"]["folders"] = [ # to be owned by root "/var/my-state", @@ -84,54 +86,40 @@ def test_vm_persistence( machine_configs=config, ) - monkeypatch.chdir(flake.path) + vm_config = inspect_vm(machine=Machine("my_machine", FlakeId(str(flake.path)))) - vm = run_vm_in_thread("my_machine") - - # wait for the VM to start and connect qga - with qga_connect("my_machine", vm) as qga: + with spawn_vm(vm_config) as vm, vm.qga_connect() as qga: # create state via qmp command instead of systemd service - qga.run("echo 'dream2nix' > /var/my-state/root", check=True) - qga.run("echo 'dream2nix' > /var/my-state/test", check=True) - qga.run("chown test /var/my-state/test", check=True) - qga.run("chown test /var/user-state", check=True) - qga.run("touch /var/my-state/rebooting", check=True) - qga.exec_cmd("poweroff") - - # wait for socket to be down (systemd service 'poweroff' rebooting machine) - wait_vm_down("my_machine", vm) - - vm.join() + qga.run(["/bin/sh", "-c", "echo 'dream2nix' > /var/my-state/root"]) + qga.run(["/bin/sh", "-c", "echo 'dream2nix' > /var/my-state/test"]) + qga.run(["/bin/sh", "-c", "chown test /var/my-state/test"]) + qga.run(["/bin/sh", "-c", "chown test /var/user-state"]) + qga.run_nonblocking(["shutdown", "-h", "now"]) ## start vm 
again - vm = run_vm_in_thread("my_machine") - - ## connect second time - with qga_connect("my_machine", vm) as qga: + with spawn_vm(vm_config) as vm, vm.qga_connect() as qga: # check state exists - qga.run("cat /var/my-state/test", check=True) + qga.run(["cat", "/var/my-state/test"]) # ensure root file is owned by root - qga.run("stat -c '%U' /var/my-state/root", check=True) + qga.run(["stat", "-c", "%U", "/var/my-state/root"]) # ensure test file is owned by test - qga.run("stat -c '%U' /var/my-state/test", check=True) + qga.run(["stat", "-c", "%U", "/var/my-state/test"]) # ensure /var/user-state is owned by test - qga.run("stat -c '%U' /var/user-state", check=True) + qga.run(["stat", "-c", "%U", "/var/user-state"]) # ensure that the file created by the service is still there and has the expected content - exitcode, out, err = qga.run("cat /var/my-state/test") - assert exitcode == 0, err - assert out == "dream2nix\n", out + result = qga.run(["cat", "/var/my-state/test"]) + assert result.stdout == "dream2nix\n", result.stdout # check for errors - exitcode, out, err = qga.run("cat /var/my-state/error") - assert exitcode == 1, out + result = qga.run(["cat", "/var/my-state/error"], check=False) + assert result.returncode == 1, result.stdout # check all systemd services are OK, or print details - exitcode, out, err = qga.run( - "systemctl --failed | tee /tmp/yolo | grep -q '0 loaded units listed' || ( cat /tmp/yolo && false )" + result = qga.run( + [ + "/bin/sh", + "-c", + "systemctl --failed | tee /tmp/log | grep -q '0 loaded units listed' || ( cat /tmp/log && false )", + ], ) - assert exitcode == 0, out - - with qmp_connect("my_machine", vm) as qmp: - qmp.command("system_powerdown") - vm.join() diff --git a/pkgs/clan-vm-manager/clan_vm_manager/components/vmobj.py b/pkgs/clan-vm-manager/clan_vm_manager/components/vmobj.py index ef58b2419..d87e1c867 100644 --- a/pkgs/clan-vm-manager/clan_vm_manager/components/vmobj.py +++ 
b/pkgs/clan-vm-manager/clan_vm_manager/components/vmobj.py @@ -158,14 +158,9 @@ class VMObject(GObject.Object): ) assert self.machine is not None - if self.machine.flake.is_local(): - state_dir = vm_state_dir( - flake_url=str(self.machine.flake.path), vm_name=self.machine.name - ) - else: - state_dir = vm_state_dir( - flake_url=self.machine.flake.url, vm_name=self.machine.name - ) + state_dir = vm_state_dir( + flake_url=self.machine.flake, vm_name=self.machine.name + ) self.qmp_wrap = QMPWrapper(state_dir) assert self.machine is not None yield self.machine