clan_lib: Add llm integration tests

Qubasa
2025-10-22 15:33:13 +02:00
parent 58b88e874f
commit 51141772b3
4 changed files with 428 additions and 0 deletions


@@ -87,6 +87,7 @@ in
# Container Tests
nixos-test-container = self.clanLib.test.containerTest ./container nixosTestArgs;
nixos-systemd-abstraction = self.clanLib.test.containerTest ./systemd-abstraction nixosTestArgs;
nixos-llm-test = self.clanLib.test.containerTest ./llm nixosTestArgs;
nixos-test-user-firewall-iptables = self.clanLib.test.containerTest ./user-firewall/iptables.nix nixosTestArgs;
nixos-test-user-firewall-nftables = self.clanLib.test.containerTest ./user-firewall/nftables.nix nixosTestArgs;
nixos-test-extra-python-packages = self.clanLib.test.containerTest ./test-extra-python-packages nixosTestArgs;
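This hunk registers the new ./llm check alongside the existing container tests. Assuming checks are exposed per system in the usual flake layout, the new test can then be built on its own with, for example, nix build .#checks.x86_64-linux.nixos-llm-test.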

checks/llm/default.nix Normal file

@@ -0,0 +1,83 @@
{ self, pkgs, ... }:
let
cli = self.packages.${pkgs.hostPlatform.system}.clan-cli-full;
ollama-model = pkgs.callPackage ./qwen3-4b-instruct.nix { };
in
{
name = "llm";
nodes = {
peer1 =
{ pkgs, ... }:
{
users.users.text-user = {
isNormalUser = true;
linger = true;
uid = 1000;
extraGroups = [ "systemd-journal" ];
};
# Set environment variables for user systemd
environment.extraInit = ''
if [ "$(id -u)" = "1000" ]; then
export XDG_RUNTIME_DIR="/run/user/1000"
export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus"
ollama_dir="$HOME/.ollama"
mkdir -p "$ollama_dir"
ln -sfn ${ollama-model}/models "$ollama_dir"/models
fi
'';
# Enable PAM for user systemd sessions
security.pam.services.systemd-user = {
startSession = true;
# Workaround for containers - use pam_permit to avoid helper binary issues
text = pkgs.lib.mkForce ''
account required pam_permit.so
session required pam_permit.so
session required pam_env.so conffile=/etc/pam/environment readenv=0
session required ${pkgs.systemd}/lib/security/pam_systemd.so
'';
};
environment.systemPackages = [
cli
pkgs.ollama
(cli.pythonRuntime.withPackages (
ps: with ps; [
pytest
pytest-xdist
(cli.pythonRuntime.pkgs.toPythonModule cli)
self.legacyPackages.${pkgs.hostPlatform.system}.nixosTestLib
]
))
];
};
};
testScript =
{ ... }:
''
start_all()
peer1.wait_for_unit("multi-user.target")
peer1.wait_for_unit("user@1000.service")
# Fix user journal permissions so text-user can read their own logs
peer1.succeed("chown text-user:systemd-journal /var/log/journal/*/user-1000.journal*")
peer1.succeed("chmod 640 /var/log/journal/*/user-1000.journal*")
# the -o addopts="" is needed to overwrite any args coming from pyproject.toml
# -p no:cacheprovider disables pytest's cacheprovider, which would otherwise try to write its cache into the read-only Nix store
cmd = "su - text-user -c 'pytest -s -n0 -m service_runner -p no:cacheprovider -o addopts=\"\" ${cli.passthru.sourceWithTests}/clan_lib/llm'"
print("Running tests with command: " + cmd)
# Run tests as text-user (environment variables are set automatically)
peer1.succeed(cmd)
'';
}
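A note on the pytest flags above: -n0 turns off pytest-xdist parallelism inside the container, and the empty addopts override keeps options from pyproject.toml out of the run. For -m service_runner to match anything, the marker also has to be declared to pytest; a minimal sketch of such a declaration (where clan-cli actually registers it is an assumption):

# conftest.py (sketch)
def pytest_configure(config):
    # Register the custom marker so "-m service_runner" can filter on it
    # without triggering PytestUnknownMarkWarning.
    config.addinivalue_line(
        "markers", "service_runner: tests that need the service runner / ollama"
    )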

checks/llm/qwen3-4b-instruct.nix Normal file

@@ -0,0 +1,70 @@
{ pkgs }:
let
# Manifest and blob URLs/hashes were obtained via https://github.com/Gholamrezadar/ollama-direct-downloader
# Download manifest
manifest = pkgs.fetchurl {
url = "https://registry.ollama.ai/v2/library/qwen3/manifests/4b-instruct";
# To recompute this hash, build once with a fake hash and copy the correct value from the mismatch error
hash = "sha256-Dtze80WT6sGqK+nH0GxDLc+BlFrcpeyi8nZiwY8Wi6A=";
};
# Download blobs
blob1 = pkgs.fetchurl {
url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:b72accf9724e93698c57cbd3b1af2d3341b3d05ec2089d86d273d97964853cd2";
hash = "sha256-tyrM+XJOk2mMV8vTsa8tM0Gz0F7CCJ2G0nPZeWSFPNI=";
};
blob2 = pkgs.fetchurl {
url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:85e4a5b7b8ef0e48af0e8658f5aaab9c2324c76c1641493f4d1e25fce54b18b9";
hash = "sha256-heSlt7jvDkivDoZY9aqrnCMkx2wWQUk/TR4l/OVLGLk=";
};
blob3 = pkgs.fetchurl {
url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:eade0a07cac7712787bbce23d12f9306adb4781d873d1df6e16f7840fa37afec";
hash = "sha256-6t4KB8rHcSeHu84j0S+TBq20eB2HPR324W94QPo3r+w=";
};
blob4 = pkgs.fetchurl {
url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:d18a5cc71b84bc4af394a31116bd3932b42241de70c77d2b76d69a314ec8aa12";
hash = "sha256-0YpcxxuEvErzlKMRFr05MrQiQd5wx30rdtaaMU7IqhI=";
};
blob5 = pkgs.fetchurl {
url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:0914c7781e001948488d937994217538375b4fd8c1466c5e7a625221abd3ea7a";
hash = "sha256-CRTHeB4AGUhIjZN5lCF1ODdbT9jBRmxeemJSIavT6no=";
};
in
pkgs.stdenv.mkDerivation {
pname = "ollama-qwen3-4b-instruct";
version = "1.0";
dontUnpack = true;
buildPhase = ''
mkdir -p $out/models/manifests/registry.ollama.ai/library/qwen3
mkdir -p $out/models/blobs
# Copy manifest
cp ${manifest} $out/models/manifests/registry.ollama.ai/library/qwen3/4b-instruct
# Copy blobs with correct names
cp ${blob1} $out/models/blobs/sha256-b72accf9724e93698c57cbd3b1af2d3341b3d05ec2089d86d273d97964853cd2
cp ${blob2} $out/models/blobs/sha256-85e4a5b7b8ef0e48af0e8658f5aaab9c2324c76c1641493f4d1e25fce54b18b9
cp ${blob3} $out/models/blobs/sha256-eade0a07cac7712787bbce23d12f9306adb4781d873d1df6e16f7840fa37afec
cp ${blob4} $out/models/blobs/sha256-d18a5cc71b84bc4af394a31116bd3932b42241de70c77d2b76d69a314ec8aa12
cp ${blob5} $out/models/blobs/sha256-0914c7781e001948488d937994217538375b4fd8c1466c5e7a625221abd3ea7a
'';
installPhase = ''
# buildPhase already created everything in $out
:
'';
meta = with pkgs.lib; {
description = "Qwen3 4B Instruct model for Ollama";
license = licenses.asl20;
platforms = platforms.all;
};
}
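The derivation reproduces ollama's on-disk model layout: a manifest under models/manifests/&lt;registry&gt;/&lt;library&gt;/&lt;model&gt;/&lt;tag&gt; plus content-addressed blobs named sha256-&lt;digest&gt;. That layout is what lets the extraInit symlink in default.nix stand in for an ollama pull. A minimal sketch (hypothetical helper, not part of this commit) for sanity-checking such a model directory:

import hashlib
from pathlib import Path

def verify_blobs(models_dir: Path) -> None:
    # Every blob file must be named after the sha256 digest of its own contents.
    for blob in (models_dir / "blobs").iterdir():
        digest = hashlib.sha256(blob.read_bytes()).hexdigest()
        assert blob.name == f"sha256-{digest}", f"digest mismatch: {blob.name}"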


@@ -0,0 +1,274 @@
import contextlib
import json
from collections.abc import Iterator
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from clan_lib.flake.flake import Flake
from clan_lib.llm.llm import process_chat_turn
from clan_lib.llm.service import create_llm_model, run_llm_service
from clan_lib.service_runner import create_service_manager

@pytest.fixture
def mock_flake() -> MagicMock:
"""Create a mock Flake object with test data."""
flake_mock = MagicMock(spec=Flake)
test_data_dir = Path(__file__).parent / "container_data"
def load_json(filename: str) -> dict | list:
"""Load and parse a JSON file from container_data directory."""
return json.loads((test_data_dir / filename).read_text())
# Configure flake.select to return values based on the argument
def select_side_effect(arg: str) -> dict | list:
# Handle staticModules readme requests dynamically
if arg.startswith(
"clanInternals.inventoryClass.staticModules.{"
) and arg.endswith("}.manifest.readme"):
# Extract service names from the pattern: {service1,service2,...}
services_part = arg.split("{")[1].split("}")[0]
requested_services = [s.strip() for s in services_part.split(",")]
# Load all VPN readmes (always returns a dict for this file)
all_readmes = load_json("vpns_readme.json")
assert isinstance(all_readmes, dict), (
"vpns_readme.json should contain a dict"
)
# Return only the requested services
return {
svc: all_readmes[svc]
for svc in requested_services
if svc in all_readmes
}
match arg:
case "clanInternals.inventoryClass.inventory.{instances,machines,meta}":
return load_json("inventory_instances_machines_meta.json")
case "clanInternals.inventoryClass.inventory.{tags}":
return load_json("inventory_tags.json")
case "clanInternals.inventoryClass.modulesPerSource":
return load_json("modules_per_source.json")
case "clanInternals.inventoryClass.staticModules":
return load_json("static_modules.json")
case _:
msg = f"Unexpected flake.select argument: {arg}"
raise ValueError(msg)
flake_mock.select.side_effect = select_side_effect
return flake_mock

@pytest.fixture
def mock_nix_shell() -> Iterator[MagicMock]:
"""Patch nix_shell function with test data."""
# Configure nix_shell to return values based on the arguments
def nix_shell_side_effect(packages: list[str], cmd: list[str]) -> list[str]:
match (tuple(packages), tuple(cmd)):
case (("ollama",), ("ollama", "pull", _)):
return ["ollama", "list"]
case (("ollama",), _):
return cmd
case _:
msg = f"Unexpected nix_shell arguments: packages={packages}, cmd={cmd}"
raise ValueError(msg)
with patch("clan_lib.llm.service.nix_shell") as mock:
mock.side_effect = nix_shell_side_effect
yield mock

@pytest.fixture
def llm_service() -> Iterator[None]:
"""Start LLM service and create model, ensuring cleanup."""
service_manager = create_service_manager()
try:
run_llm_service()
create_llm_model()
yield
finally:
# Always attempt to stop the service, even if setup failed
with contextlib.suppress(Exception):
service_manager.stop_service("ollama")

@pytest.mark.service_runner
@pytest.mark.usefixtures("mock_nix_shell", "llm_service")
def test_full_conversation_flow(mock_flake: MagicMock) -> None:
"""Comprehensive test that exercises the complete conversation flow with the actual LLM service.
This test simulates a realistic multi-turn conversation that covers:
- Discovery phase: Initial request and LLM gathering information
- Service selection phase: User choosing from available options
- Final decision phase: Configuring the selected service with specific parameters
- State transitions: pending_service_selection -> pending_final_decision -> completion
- Conversation history preservation across all turns
- Error handling and edge cases
"""
flake = mock_flake
# ========== TURN 1: Discovery Phase - Initial vague request ==========
print("\n=== TURN 1: Initial discovery request ===")
result = process_chat_turn(
user_request="What VPN options do I have?",
flake=flake,
provider="ollama",
)
# Verify discovery phase behavior
assert result.requires_user_response is True, (
"Should require user response in discovery"
)
assert len(result.conversation_history) >= 2, (
"Should have user + assistant messages"
)
assert result.conversation_history[0]["role"] == "user"
assert result.conversation_history[0]["content"] == "What VPN options do I have?"
assert result.conversation_history[-1]["role"] == "assistant"
assert len(result.assistant_message) > 0, "Assistant should provide a response"
# Should transition to service selection phase with pending state
assert "pending_service_selection" in result.session_state, (
"Should have pending service selection"
)
assert "readme_results" in result.session_state["pending_service_selection"]
# No instances yet
assert len(result.proposed_instances) == 0
assert result.error is None
print(f"Assistant: {result.assistant_message[:200]}...")
print(f"State: {list(result.session_state.keys())}")
print(f"History length: {len(result.conversation_history)}")
# ========== TURN 2: Service Selection Phase - User makes a choice ==========
print("\n=== TURN 2: User selects ZeroTier ===")
result = process_chat_turn(
user_request="I'll use ZeroTier please",
flake=flake,
conversation_history=list(result.conversation_history),
provider="ollama",
session_state=result.session_state,
)
# Verify conversation history growth and preservation
assert len(result.conversation_history) > 2, "History should grow"
assert result.conversation_history[0]["content"] == "What VPN options do I have?"
assert result.conversation_history[2]["content"] == "I'll use ZeroTier please"
# Should either ask for configuration details or provide direct config
# Most likely will ask for more details (pending_final_decision)
if result.requires_user_response:
# LLM is asking for configuration details
assert len(result.assistant_message) > 0
# Should transition to final decision phase
if "pending_final_decision" not in result.session_state:
# Might still be in service selection asking clarifications
assert "pending_service_selection" in result.session_state
else:
# LLM provided configuration immediately (less likely)
assert len(result.proposed_instances) > 0
assert result.proposed_instances[0]["module"]["name"] == "zerotier"
print(
f"Assistant: {result.assistant_message[:200] if result.assistant_message else 'No message'}..."
)
print(f"State: {list(result.session_state.keys())}")
print(f"Requires response: {result.requires_user_response}")
# ========== Continue conversation until we reach final decision or completion ==========
max_turns = 10
turn_count = 2
while result.requires_user_response and turn_count < max_turns:
turn_count += 1
print(f"\n=== TURN {turn_count}: Continuing conversation ===")
# Determine appropriate response based on current state
if "pending_service_selection" in result.session_state:
# Still selecting service
user_request = "Yes, ZeroTier"
elif "pending_final_decision" in result.session_state:
# Configuring the service
user_request = "Set up gchq-local as controller, qube-email as moon, and wintux as peer"
else:
# Generic continuation
user_request = "Yes, that sounds good. Use gchq-local as controller."
print(f"User: {user_request}")
result = process_chat_turn(
user_request=user_request,
flake=flake,
conversation_history=list(result.conversation_history),
provider="ollama",
session_state=result.session_state,
)
# Verify conversation history continues to grow
assert len(result.conversation_history) == (turn_count * 2), (
f"History should have {turn_count * 2} messages (turn {turn_count})"
)
# Verify history preservation
assert (
result.conversation_history[0]["content"] == "What VPN options do I have?"
)
print(
f"Assistant: {result.assistant_message[:200] if result.assistant_message else 'No message'}..."
)
print(f"State: {list(result.session_state.keys())}")
print(f"Requires response: {result.requires_user_response}")
print(f"Proposed instances: {len(result.proposed_instances)}")
# Check for completion
if not result.requires_user_response:
print("\n=== Conversation completed! ===")
break
# ========== Final Verification ==========
assert turn_count < max_turns, f"Conversation took too many turns ({turn_count})"
# If conversation completed, verify we have valid configuration
if not result.requires_user_response:
assert len(result.proposed_instances) > 0, (
"Should have at least one proposed instance"
)
instance = result.proposed_instances[0]
# Verify instance structure
assert "module" in instance
assert "name" in instance["module"]
assert instance["module"]["name"] in [
"zerotier",
"wireguard",
"yggdrasil",
"mycelium",
]
# Should have roles configuration
if "roles" in instance:
print(f"\nConfiguration roles: {list(instance['roles'].keys())}")
# Should not be in pending state anymore
assert "pending_service_selection" not in result.session_state
assert "pending_final_decision" not in result.session_state
assert result.error is None, f"Should not have error: {result.error}"
print(f"\nFinal instance: {instance['module']['name']}")
print(f"Total conversation turns: {turn_count}")
print(f"Final history length: {len(result.conversation_history)}")
else:
# Conversation didn't complete but should have made progress
assert len(result.conversation_history) > 2
assert result.error is None
print(f"\nConversation in progress after {turn_count} turns")
print(f"Current state: {list(result.session_state.keys())}")