Merge pull request 'vars/performance: aggregate selects over all machines and generators' (#5402) from dave into main

Reviewed-on: https://git.clan.lol/clan/clan-core/pulls/5402
This commit is contained in:
DavHau
2025-10-08 06:04:50 +00:00
5 changed files with 86 additions and 52 deletions

View File

@@ -75,13 +75,14 @@ class TestFlake(Flake):
def path(self) -> Path: def path(self) -> Path:
return self.test_dir return self.test_dir
def select_machine(self, machine_name: str, selector: str) -> Any: def machine_selector(self, machine_name: str, selector: str) -> str:
"""Select a nix attribute for a specific machine. """Create a selector for a specific machine.
Args: Args:
machine_name: The name of the machine machine_name: The name of the machine
selector: The attribute selector string relative to the machine config selector: The attribute selector string relative to the machine config
apply: Optional function to apply to the result Returns:
The full selector string for the machine
""" """
config = nix_config() config = nix_config()
@@ -89,9 +90,7 @@ class TestFlake(Flake):
test_system = system test_system = system
if system.endswith("-darwin"): if system.endswith("-darwin"):
test_system = system.rstrip("darwin") + "linux" test_system = system.rstrip("darwin") + "linux"
return f'checks."{test_system}".{self.check_attr}.machinesCross."{system}"."{machine_name}".{selector}'
full_selector = f'checks."{test_system}".{self.check_attr}.machinesCross.{system}."{machine_name}".{selector}'
return self.select(full_selector)
# we don't want to evaluate all machines of the flake. Only the ones defined in the test # we don't want to evaluate all machines of the flake. Only the ones defined in the test
def set_machine_names(self, machine_names: list[str]) -> None: def set_machine_names(self, machine_names: list[str]) -> None:

View File

@@ -1264,65 +1264,74 @@ def test_cache_misses_for_vars_operations(
flake: ClanFlake, flake: ClanFlake,
) -> None: ) -> None:
"""Test that vars operations result in minimal cache misses.""" """Test that vars operations result in minimal cache misses."""
# Set up first machine with two generators
config = flake.machines["my_machine"] = create_test_machine_config() config = flake.machines["my_machine"] = create_test_machine_config()
# Set up a simple generator with a public value # Set up two generators with public values
my_generator = config["clan"]["core"]["vars"]["generators"]["my_generator"] gen1 = config["clan"]["core"]["vars"]["generators"]["gen1"]
my_generator["files"]["my_value"]["secret"] = False gen1["files"]["value1"]["secret"] = False
my_generator["script"] = 'echo -n "test_value" > "$out"/my_value' gen1["script"] = 'echo -n "test_value1" > "$out"/value1'
gen2 = config["clan"]["core"]["vars"]["generators"]["gen2"]
gen2["files"]["value2"]["secret"] = False
gen2["script"] = 'echo -n "test_value2" > "$out"/value2'
# Add a second machine with the same generator configuration
flake.machines["other_machine"] = config.copy()
flake.refresh() flake.refresh()
monkeypatch.chdir(flake.path) monkeypatch.chdir(flake.path)
# Create a fresh machine object to ensure clean cache state # Create fresh machine objects to ensure clean cache state
machine = Machine(name="my_machine", flake=Flake(str(flake.path))) flake_obj = Flake(str(flake.path))
machine1 = Machine(name="my_machine", flake=flake_obj)
machine2 = Machine(name="other_machine", flake=flake_obj)
# Test 1: Running vars generate with a fresh cache should result in exactly 3 cache misses # Test 1: Running vars generate for BOTH machines simultaneously should still result in exactly 2 cache misses
# Expected cache misses: # Even though we have:
# 1. One for getting the list of generators # - 2 machines (my_machine and other_machine)
# 2. One for getting the final script of our test generator (my_generator) # - 2 generators per machine (gen1 and gen2)
# 3. One for getting the final script of the state version generator (added by default) # We still only get 2 cache misses when generating for both machines:
# TODO: The third cache miss is undesired in tests. disable state version module for tests # 1. One for getting the list of generators for both machines
# 2. One batched evaluation for getting all generator scripts for both machines
# The key insight: the system should batch ALL evaluations across ALL machines into a single nix eval
run_generators( run_generators(
machines=[machine], machines=[machine1, machine2],
generators=None, # Generate all generators=None, # Generate all
) )
# Print stack traces if we have more than 3 cache misses # Print stack traces if we have more than 2 cache misses
if machine.flake._cache_misses != 3: if flake_obj._cache_misses != 2:
machine.flake.print_cache_miss_analysis( flake_obj.print_cache_miss_analysis(
title="Cache miss analysis for vars generate" title="Cache miss analysis for vars generate"
) )
assert machine.flake._cache_misses == 2, ( assert flake_obj._cache_misses == 2, (
f"Expected exactly 2 cache misses for vars generate, got {machine.flake._cache_misses}" f"Expected exactly 2 cache misses for vars generate, got {flake_obj._cache_misses}"
) )
# Verify the value was generated correctly
var_value = get_machine_var(machine, "my_generator/my_value")
assert var_value.printable_value == "test_value"
# Test 2: List all vars should result in exactly 1 cache miss # Test 2: List all vars should result in exactly 1 cache miss
# Force cache invalidation (this also resets cache miss tracking) # Force cache invalidation (this also resets cache miss tracking)
invalidate_flake_cache(flake.path) invalidate_flake_cache(flake.path)
machine.flake.invalidate_cache() flake_obj.invalidate_cache()
stringify_all_vars(machine) stringify_all_vars(machine1)
assert machine.flake._cache_misses == 1, ( assert flake_obj._cache_misses == 1, (
f"Expected exactly 1 cache miss for vars list, got {machine.flake._cache_misses}" f"Expected exactly 1 cache miss for vars list, got {flake_obj._cache_misses}"
) )
# Test 3: Getting a specific var with a fresh cache should result in exactly 1 cache miss # Test 3: Getting a specific var with a fresh cache should result in exactly 1 cache miss
# Force cache invalidation (this also resets cache miss tracking) # Force cache invalidation (this also resets cache miss tracking)
invalidate_flake_cache(flake.path) invalidate_flake_cache(flake.path)
machine.flake.invalidate_cache() flake_obj.invalidate_cache()
var_value = get_machine_var(machine, "my_generator/my_value") # Only test gen1 for the get operation
assert var_value.printable_value == "test_value" var_value = get_machine_var(machine1, "gen1/value1")
assert var_value.printable_value == "test_value1"
assert machine.flake._cache_misses == 1, ( assert flake_obj._cache_misses == 1, (
f"Expected exactly 1 cache miss for vars get with fresh cache, got {machine.flake._cache_misses}" f"Expected exactly 1 cache miss for vars get with fresh cache, got {flake_obj._cache_misses}"
) )

View File

@@ -245,15 +245,19 @@ class Generator:
return sec_store.get(self, prompt.name).decode() return sec_store.get(self, prompt.name).decode()
return None return None
def final_script_selector(self, machine_name: str) -> str:
if self._flake is None:
msg = "Flake cannot be None"
raise ClanError(msg)
return self._flake.machine_selector(
machine_name, f'config.clan.core.vars.generators."{self.name}".finalScript'
)
def final_script(self, machine: "Machine") -> Path: def final_script(self, machine: "Machine") -> Path:
if self._flake is None: if self._flake is None:
msg = "Flake cannot be None" msg = "Flake cannot be None"
raise ClanError(msg) raise ClanError(msg)
output = Path( output = Path(self._flake.select(self.final_script_selector(machine.name)))
machine.select(
f'config.clan.core.vars.generators."{self.name}".finalScript',
),
)
if tmp_store := nix_test_store(): if tmp_store := nix_test_store():
output = tmp_store.joinpath(*output.parts[1:]) output = tmp_store.joinpath(*output.parts[1:])
return output return output

View File

@@ -1132,6 +1132,20 @@ class Flake:
return self._cache.select(selector) return self._cache.select(selector)
def machine_selector(self, machine_name: str, selector: str) -> str:
"""Create a selector for a specific machine.
Args:
machine_name: The name of the machine
selector: The attribute selector string relative to the machine config
Returns:
The full selector string for the machine
"""
config = nix_config()
system = config["system"]
return f'clanInternals.machines."{system}"."{machine_name}".{selector}'
def select_machine(self, machine_name: str, selector: str) -> Any: def select_machine(self, machine_name: str, selector: str) -> Any:
"""Select a nix attribute for a specific machine. """Select a nix attribute for a specific machine.
@@ -1141,11 +1155,7 @@ class Flake:
apply: Optional function to apply to the result apply: Optional function to apply to the result
""" """
config = nix_config() return self.select(self.machine_selector(machine_name, selector))
system = config["system"]
full_selector = f'clanInternals.machines."{system}"."{machine_name}".{selector}'
return self.select(full_selector)
def list_machines( def list_machines(
self, self,

View File

@@ -177,13 +177,25 @@ def run_generators(
for machine in machines: for machine in machines:
_ensure_healthy(machine=machine) _ensure_healthy(machine=machine)
# get the flake via any machine (they are all the same)
flake = machines[0].flake
def get_generator_machine(generator: Generator) -> Machine:
if generator.machine is None:
# return first machine if generator is not tied to a specific one
return machines[0]
return Machine(name=generator.machine, flake=flake)
# preheat the select cache, to reduce repeated calls during execution
selectors = []
for generator in generator_objects:
machine = get_generator_machine(generator)
selectors.append(generator.final_script_selector(machine.name))
flake.precache(selectors)
# execute generators # execute generators
for generator in generator_objects: for generator in generator_objects:
machine = ( machine = get_generator_machine(generator)
machines[0]
if generator.machine is None
else Machine(name=generator.machine, flake=machines[0].flake)
)
if check_can_migrate(machine, generator): if check_can_migrate(machine, generator):
migrate_files(machine, generator) migrate_files(machine, generator)
else: else: