diff --git a/checks/flake-module.nix b/checks/flake-module.nix
index 651f5a0e4..41b0a400b 100644
--- a/checks/flake-module.nix
+++ b/checks/flake-module.nix
@@ -87,6 +87,7 @@ in
        # Container Tests
        nixos-test-container = self.clanLib.test.containerTest ./container nixosTestArgs;
        nixos-systemd-abstraction = self.clanLib.test.containerTest ./systemd-abstraction nixosTestArgs;
+       nixos-llm-test = self.clanLib.test.containerTest ./llm nixosTestArgs;
        nixos-test-user-firewall-iptables = self.clanLib.test.containerTest ./user-firewall/iptables.nix nixosTestArgs;
        nixos-test-user-firewall-nftables = self.clanLib.test.containerTest ./user-firewall/nftables.nix nixosTestArgs;
        nixos-test-extra-python-packages = self.clanLib.test.containerTest ./test-extra-python-packages nixosTestArgs;
diff --git a/checks/llm/default.nix b/checks/llm/default.nix
new file mode 100644
index 000000000..c79beb0db
--- /dev/null
+++ b/checks/llm/default.nix
@@ -0,0 +1,83 @@
{ self, pkgs, ... }:

let

  cli = self.packages.${pkgs.hostPlatform.system}.clan-cli-full;

  ollama-model = pkgs.callPackage ./qwen3-4b-instruct.nix { };
in
{
  name = "llm";

  nodes = {
    peer1 =
      { pkgs, ... }:
      {

        users.users.text-user = {
          isNormalUser = true;
          linger = true;
          uid = 1000;
          extraGroups = [ "systemd-journal" ];
        };

        # Set environment variables for user systemd
        environment.extraInit = ''
          if [ "$(id -u)" = "1000" ]; then
            export XDG_RUNTIME_DIR="/run/user/1000"
            export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus"

            ollama_dir="$HOME/.ollama"
            mkdir -p "$ollama_dir"
            ln -sf ${ollama-model}/models "$ollama_dir"/models
          fi
        '';

        # Enable PAM for user systemd sessions
        security.pam.services.systemd-user = {
          startSession = true;
          # Workaround for containers - use pam_permit to avoid helper binary issues
          text = pkgs.lib.mkForce ''
            account required pam_permit.so
            session required pam_permit.so
            session required pam_env.so conffile=/etc/pam/environment readenv=0
            session required ${pkgs.systemd}/lib/security/pam_systemd.so
          '';
        };

        environment.systemPackages = [
          cli
          pkgs.ollama
          (cli.pythonRuntime.withPackages (
            ps: with ps; [
              pytest
              pytest-xdist
              (cli.pythonRuntime.pkgs.toPythonModule cli)
              self.legacyPackages.${pkgs.hostPlatform.system}.nixosTestLib
            ]
          ))
        ];
      };
  };

  testScript =
    { ... }:
    ''
      start_all()

      peer1.wait_for_unit("multi-user.target")
      peer1.wait_for_unit("user@1000.service")

      # Fix user journal permissions so text-user can read their own logs
      peer1.succeed("chown text-user:systemd-journal /var/log/journal/*/user-1000.journal*")
      peer1.succeed("chmod 640 /var/log/journal/*/user-1000.journal*")

      # the -o addopts="" is needed to override any addopts coming from pyproject.toml
      # -p no:cacheprovider disables pytest's cache provider, which would otherwise try to write into the nix store here
      cmd = "su - text-user -c 'pytest -s -n0 -m service_runner -p no:cacheprovider -o addopts=\"\" ${cli.passthru.sourceWithTests}/clan_lib/llm'"
      print("Running tests with command: " + cmd)

      # Run tests as text-user (environment variables are set automatically)
      peer1.succeed(cmd)
    '';
}
diff --git a/checks/llm/qwen3-4b-instruct.nix b/checks/llm/qwen3-4b-instruct.nix
new file mode 100644
index 000000000..fcff28ebc
--- /dev/null
+++ b/checks/llm/qwen3-4b-instruct.nix
@@ -0,0 +1,70 @@
{ pkgs }:

let
  # Manifest and blob URLs obtained via https://github.com/Gholamrezadar/ollama-direct-downloader

  # Download manifest
  manifest = pkgs.fetchurl {
    url = "https://registry.ollama.ai/v2/library/qwen3/manifests/4b-instruct";
    # Hash obtained trust-on-first-use: build once with a dummy hash and copy the value Nix reports
    hash = "sha256-Dtze80WT6sGqK+nH0GxDLc+BlFrcpeyi8nZiwY8Wi6A=";
  };

  # Download blobs
  blob1 = pkgs.fetchurl {
    url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:b72accf9724e93698c57cbd3b1af2d3341b3d05ec2089d86d273d97964853cd2";
    hash = "sha256-tyrM+XJOk2mMV8vTsa8tM0Gz0F7CCJ2G0nPZeWSFPNI=";
  };

  blob2 = pkgs.fetchurl {
    url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:85e4a5b7b8ef0e48af0e8658f5aaab9c2324c76c1641493f4d1e25fce54b18b9";
    hash = "sha256-heSlt7jvDkivDoZY9aqrnCMkx2wWQUk/TR4l/OVLGLk=";
  };

  blob3 = pkgs.fetchurl {
    url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:eade0a07cac7712787bbce23d12f9306adb4781d873d1df6e16f7840fa37afec";
    hash = "sha256-6t4KB8rHcSeHu84j0S+TBq20eB2HPR324W94QPo3r+w=";
  };

  blob4 = pkgs.fetchurl {
    url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:d18a5cc71b84bc4af394a31116bd3932b42241de70c77d2b76d69a314ec8aa12";
    hash = "sha256-0YpcxxuEvErzlKMRFr05MrQiQd5wx30rdtaaMU7IqhI=";
  };

  blob5 = pkgs.fetchurl {
    url = "https://registry.ollama.ai/v2/library/qwen3/blobs/sha256:0914c7781e001948488d937994217538375b4fd8c1466c5e7a625221abd3ea7a";
    hash = "sha256-CRTHeB4AGUhIjZN5lCF1ODdbT9jBRmxeemJSIavT6no=";
  };
in
pkgs.stdenv.mkDerivation {
  pname = "ollama-qwen3-4b-instruct";
  version = "1.0";

  dontUnpack = true;

  buildPhase = ''
    mkdir -p $out/models/manifests/registry.ollama.ai/library/qwen3
    mkdir -p $out/models/blobs

    # Copy manifest
    cp ${manifest} $out/models/manifests/registry.ollama.ai/library/qwen3/4b-instruct

    # Copy blobs with correct names
    cp ${blob1} $out/models/blobs/sha256-b72accf9724e93698c57cbd3b1af2d3341b3d05ec2089d86d273d97964853cd2
    cp ${blob2} $out/models/blobs/sha256-85e4a5b7b8ef0e48af0e8658f5aaab9c2324c76c1641493f4d1e25fce54b18b9
    cp ${blob3} $out/models/blobs/sha256-eade0a07cac7712787bbce23d12f9306adb4781d873d1df6e16f7840fa37afec
    cp ${blob4} $out/models/blobs/sha256-d18a5cc71b84bc4af394a31116bd3932b42241de70c77d2b76d69a314ec8aa12
    cp ${blob5} $out/models/blobs/sha256-0914c7781e001948488d937994217538375b4fd8c1466c5e7a625221abd3ea7a
  '';

  installPhase = ''
    # buildPhase already created everything in $out
    :
  '';

  meta = with pkgs.lib; {
    description = "Qwen3 4B Instruct model for Ollama";
    license = licenses.asl20;
    platforms = platforms.all;
  };
}
diff --git a/pkgs/clan-cli/clan_lib/llm/container_test.py b/pkgs/clan-cli/clan_lib/llm/container_test.py
new file mode 100644
index 000000000..c9f9be420
--- /dev/null
+++ b/pkgs/clan-cli/clan_lib/llm/container_test.py
@@ -0,0 +1,274 @@
import contextlib
import json
from collections.abc import Iterator
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
from clan_lib.flake.flake import Flake
from clan_lib.llm.llm import (
    process_chat_turn,
)
from clan_lib.llm.service import create_llm_model, run_llm_service
from clan_lib.service_runner import create_service_manager


@pytest.fixture
def mock_flake() -> MagicMock:
    """Create a mock Flake object with test data."""
    flake_mock = MagicMock(spec=Flake)
    test_data_dir = Path(__file__).parent / "container_data"

    def load_json(filename: str) -> dict | list:
        """Load and parse a JSON file from container_data directory."""
        return json.loads((test_data_dir / filename).read_text())

    # Configure flake.select to return values based on the argument
    def select_side_effect(arg: str) -> dict | list:
        # Handle staticModules readme requests dynamically
        if arg.startswith(
            "clanInternals.inventoryClass.staticModules.{"
        ) and arg.endswith("}.manifest.readme"):
            # Extract service names from the pattern: {service1,service2,...}
            services_part = arg.split("{")[1].split("}")[0]
            requested_services = [s.strip() for s in services_part.split(",")]

            # Load all VPN readmes (always returns a dict for this file)
            all_readmes = load_json("vpns_readme.json")
            assert isinstance(all_readmes, dict), (
                "vpns_readme.json should contain a dict"
            )

            # Return only the requested services
            return {
                svc: all_readmes[svc]
                for svc in requested_services
                if svc in all_readmes
            }

        match arg:
            case "clanInternals.inventoryClass.inventory.{instances,machines,meta}":
                return load_json("inventory_instances_machines_meta.json")
            case "clanInternals.inventoryClass.inventory.{tags}":
                return load_json("inventory_tags.json")
            case "clanInternals.inventoryClass.modulesPerSource":
                return load_json("modules_per_source.json")
            case "clanInternals.inventoryClass.staticModules":
                return load_json("static_modules.json")
            case _:
                msg = f"Unexpected flake.select argument: {arg}"
                raise ValueError(msg)

    flake_mock.select.side_effect = select_side_effect
    return flake_mock


@pytest.fixture
def mock_nix_shell() -> Iterator[MagicMock]:
    """Patch nix_shell function with test data."""

    # Configure nix_shell to return values based on the arguments
    def nix_shell_side_effect(packages: list[str], cmd: list[str]) -> list[str]:
        match (tuple(packages), tuple(cmd)):
            case (("ollama",), ("ollama", "pull", _)):
                return ["ollama", "list"]
            case (("ollama",), _):
                return cmd
            case _:
                msg = f"Unexpected nix_shell arguments: packages={packages}, cmd={cmd}"
                raise ValueError(msg)

    with patch("clan_lib.llm.service.nix_shell") as mock:
        mock.side_effect = nix_shell_side_effect
        yield mock


@pytest.fixture
def llm_service() -> Iterator[None]:
    """Start LLM service and create model, ensuring cleanup."""
    service_manager = create_service_manager()

    try:
        run_llm_service()
        create_llm_model()
        yield
    finally:
        # Always attempt to stop the service, even if setup failed
        with contextlib.suppress(Exception):
            service_manager.stop_service("ollama")


@pytest.mark.service_runner
@pytest.mark.usefixtures("mock_nix_shell", "llm_service")
def test_full_conversation_flow(mock_flake: MagicMock) -> None:
    """Comprehensive test that exercises the complete conversation flow with the actual LLM service.

    This test simulates a realistic multi-turn conversation that covers:
    - Discovery phase: Initial request and LLM gathering information
    - Service selection phase: User choosing from available options
    - Final decision phase: Configuring the selected service with specific parameters
    - State transitions: pending_service_selection -> pending_final_decision -> completion
    - Conversation history preservation across all turns
    - Error handling and edge cases
    """
    flake = mock_flake

    # ========== TURN 1: Discovery Phase - Initial vague request ==========
    print("\n=== TURN 1: Initial discovery request ===")
    result = process_chat_turn(
        user_request="What VPN options do I have?",
        flake=flake,
        provider="ollama",
    )

    # Verify discovery phase behavior
    assert result.requires_user_response is True, (
        "Should require user response in discovery"
    )
    assert len(result.conversation_history) >= 2, (
        "Should have user + assistant messages"
    )
    assert result.conversation_history[0]["role"] == "user"
    assert result.conversation_history[0]["content"] == "What VPN options do I have?"
    assert result.conversation_history[-1]["role"] == "assistant"
    assert len(result.assistant_message) > 0, "Assistant should provide a response"

    # Should transition to service selection phase with pending state
    assert "pending_service_selection" in result.session_state, (
        "Should have pending service selection"
    )
    assert "readme_results" in result.session_state["pending_service_selection"]

    # No instances yet
    assert len(result.proposed_instances) == 0
    assert result.error is None

    print(f"Assistant: {result.assistant_message[:200]}...")
    print(f"State: {list(result.session_state.keys())}")
    print(f"History length: {len(result.conversation_history)}")

    # ========== TURN 2: Service Selection Phase - User makes a choice ==========
    print("\n=== TURN 2: User selects ZeroTier ===")
    result = process_chat_turn(
        user_request="I'll use ZeroTier please",
        flake=flake,
        conversation_history=list(result.conversation_history),
        provider="ollama",
        session_state=result.session_state,
    )

    # Verify conversation history growth and preservation
    assert len(result.conversation_history) > 2, "History should grow"
    assert result.conversation_history[0]["content"] == "What VPN options do I have?"
    assert result.conversation_history[2]["content"] == "I'll use ZeroTier please"

    # Should either ask for configuration details or provide direct config
    # Most likely will ask for more details (pending_final_decision)
    if result.requires_user_response:
        # LLM is asking for configuration details
        assert len(result.assistant_message) > 0
        # Should transition to final decision phase
        if "pending_final_decision" not in result.session_state:
            # Might still be in service selection asking clarifications
            assert "pending_service_selection" in result.session_state
    else:
        # LLM provided configuration immediately (less likely)
        assert len(result.proposed_instances) > 0
        assert result.proposed_instances[0]["module"]["name"] == "zerotier"

    print(
        f"Assistant: {result.assistant_message[:200] if result.assistant_message else 'No message'}..."
+ ) + print(f"State: {list(result.session_state.keys())}") + print(f"Requires response: {result.requires_user_response}") + + # ========== Continue conversation until we reach final decision or completion ========== + max_turns = 10 + turn_count = 2 + + while result.requires_user_response and turn_count < max_turns: + turn_count += 1 + print(f"\n=== TURN {turn_count}: Continuing conversation ===") + + # Determine appropriate response based on current state + if "pending_service_selection" in result.session_state: + # Still selecting service + user_request = "Yes, ZeroTier" + elif "pending_final_decision" in result.session_state: + # Configuring the service + user_request = "Set up gchq-local as controller, qube-email as moon, and wintux as peer" + else: + # Generic continuation + user_request = "Yes, that sounds good. Use gchq-local as controller." + + print(f"User: {user_request}") + + result = process_chat_turn( + user_request=user_request, + flake=flake, + conversation_history=list(result.conversation_history), + provider="ollama", + session_state=result.session_state, + ) + + # Verify conversation history continues to grow + assert len(result.conversation_history) == (turn_count * 2), ( + f"History should have {turn_count * 2} messages (turn {turn_count})" + ) + + # Verify history preservation + assert ( + result.conversation_history[0]["content"] == "What VPN options do I have?" + ) + + print( + f"Assistant: {result.assistant_message[:200] if result.assistant_message else 'No message'}..." + ) + print(f"State: {list(result.session_state.keys())}") + print(f"Requires response: {result.requires_user_response}") + print(f"Proposed instances: {len(result.proposed_instances)}") + + # Check for completion + if not result.requires_user_response: + print("\n=== Conversation completed! ===") + break + + # ========== Final Verification ========== + assert turn_count < max_turns, f"Conversation took too many turns ({turn_count})" + + # If conversation completed, verify we have valid configuration + if not result.requires_user_response: + assert len(result.proposed_instances) > 0, ( + "Should have at least one proposed instance" + ) + instance = result.proposed_instances[0] + + # Verify instance structure + assert "module" in instance + assert "name" in instance["module"] + assert instance["module"]["name"] in [ + "zerotier", + "wireguard", + "yggdrasil", + "mycelium", + ] + + # Should have roles configuration + if "roles" in instance: + print(f"\nConfiguration roles: {list(instance['roles'].keys())}") + + # Should not be in pending state anymore + assert "pending_service_selection" not in result.session_state + assert "pending_final_decision" not in result.session_state + + assert result.error is None, f"Should not have error: {result.error}" + + print(f"\nFinal instance: {instance['module']['name']}") + print(f"Total conversation turns: {turn_count}") + print(f"Final history length: {len(result.conversation_history)}") + else: + # Conversation didn't complete but should have made progress + assert len(result.conversation_history) > 2 + assert result.error is None + print(f"\nConversation in progress after {turn_count} turns") + print(f"Current state: {list(result.session_state.keys())}")