From b3c6770f8d52db57254cbfad0c359356af787027 Mon Sep 17 00:00:00 2001 From: Serge Bazanski Date: Fri, 10 Sep 2021 22:27:24 +0000 Subject: [PATCH] ops, cluster: consolidate NixOS provisioning This moves the diff-and-activate logic from cluster/nix/provision.nix into ops/{provision,machines}.nix that can be used for both cluster machines and bgpwtf machines. The provisioning scripts now live per-NixOS-config, and anything under ops.machines.$fqdn now has a .config.passthru.hscloud.provision derivation which is that script. When run, it will attempt to deploy onto the target machine. There's also a top-level tool at `ops.provision` which builds all configurations / machines and can be called with the machine name/fqdn to call the corresponding provisioner script. clustercfg is changed to use the new provisioning logic. Change-Id: I258abce9e8e3db42af35af102f32ab7963046353 --- cluster/clustercfg/clustercfg.py | 10 +-- cluster/nix/provision.nix | 49 ------------- ops/README.md | 23 ++++++ ops/machines.nix | 119 ++++++++++++++++++++++--------- ops/provision.nix | 74 +++++++++++++++++++ 5 files changed, 189 insertions(+), 86 deletions(-) delete mode 100644 cluster/nix/provision.nix create mode 100644 ops/README.md create mode 100644 ops/provision.nix diff --git a/cluster/clustercfg/clustercfg.py b/cluster/clustercfg/clustercfg.py index 0adef403..d852d6ac 100644 --- a/cluster/clustercfg/clustercfg.py +++ b/cluster/clustercfg/clustercfg.py @@ -206,10 +206,12 @@ def nodestrap(args, nocerts=False): ca_admitomatic = ca.CA(ss, certs_root, 'admitomatic', 'admitomatic webhook CA') ca_admitomatic.make_cert('admitomatic-webhook', ou='Admitomatic Webhook', hosts=['admitomatic.admitomatic.svc']) - subprocess.check_call(["nix", "run", - "-f", local_root, - "cluster.nix.provision", - "-c", "provision-{}".format(fqdn.split('.')[0])]) + toplevel = subprocess.check_output([ + "nix-build", + local_root, + "-A", "ops.machines.\"" + fqdn + "\".config.passthru.hscloud.provision", + 
]).decode().strip() + subprocess.check_call([toplevel]) def usage(): diff --git a/cluster/nix/provision.nix b/cluster/nix/provision.nix deleted file mode 100644 index 7ab7e714..00000000 --- a/cluster/nix/provision.nix +++ /dev/null @@ -1,49 +0,0 @@ -{ hscloud, pkgs, ... }: - -with builtins; - -let - machines = (import ./defs-machines.nix); - configurations = builtins.listToAttrs (map (machine: { - name = machine.fqdn; - value = pkgs.nixos ({ config, pkgs, ... }: { - networking.hostName = machine.name; - imports = [ - ./modules/base.nix - ./modules/kubernetes.nix - ]; - }); - }) machines); - - scriptForMachine = machine: let - configuration = configurations."${machine.fqdn}"; - in '' - set -e - remote=root@${machine.fqdn} - echo "Configuration for ${machine.fqdn} is ${configuration.toplevel}" - nix copy --no-check-sigs -s --to ssh://$remote ${configuration.toplevel} - echo "/etc/systemd/system diff:" - ssh $remote diff -ur /var/run/current-system/etc/systemd/system ${configuration.toplevel}/etc/systemd/system || true - echo "" - echo "" - ssh $remote ${configuration.toplevel}/bin/switch-to-configuration dry-activate - read -p "Do you want to switch to this configuration? " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - ssh $remote ${configuration.toplevel}/bin/switch-to-configuration switch - fi - ''; - - provisioners = (map (machine: - pkgs.writeScriptBin "provision-${machine.name}" (scriptForMachine machine) - ) machines); - - provision = pkgs.writeScriptBin "provision" ( - '' - echo "Available provisioniers:" - '' + (concatStringsSep "\n" (map (machine: "echo ' provision-${machine.name}'") machines))); -in -pkgs.symlinkJoin { - name = "provision"; - paths = [ provision ] ++ provisioners; -} diff --git a/ops/README.md b/ops/README.md new file mode 100644 index 00000000..d31f767b --- /dev/null +++ b/ops/README.md @@ -0,0 +1,23 @@ +Operations +=== + +Deploying NixOS machines +--- + +Machine configurations are in `ops/machines.nix`. 
+ +Wrapper script to show all available machines and provision a single machine: + + $ $(nix-build -A ops.provision) + Available machines: + - bc01n01.hswaw.net + - bc01n02.hswaw.net + - dcr01s22.hswaw.net + - dcr01s24.hswaw.net + - edge01.waw.bgp.wtf + + $ $(nix-build -A ops.provision) edge01.waw.bgp.wtf + +This can be slow, as it evaluates/builds all machines' configs. If you just want to deploy one machine and possibly iterate faster: + + $ $(nix-build -A 'ops.machines."edge01.waw.bgp.wtf".config.passthru.hscloud.provision') diff --git a/ops/machines.nix b/ops/machines.nix index 0e632289..5401e30a 100644 --- a/ops/machines.nix +++ b/ops/machines.nix @@ -3,30 +3,41 @@ # This allows to have a common attrset of machines that can be deployed # in the same way. # -# Currently building/deployment is still done in a half-assed way: -# -# machine=edge01.waw.bgp.wtf -# d=$(nix-build -A 'ops.machines."'$machine'"'.toplevel) -# -# To then deploy derivation $d on $machine: -# -# nix-copy-closure --to root@$machine $d -# ssh root@$machine $d/bin/switch-to-configuration dry-activate -# ssh root@$machine $d/bin/switch-to-configuration test -# ssh root@$machine nix-env -p /nix/var/nix/profiles/system --set $d -# ssh root@$machine $d/bin/switch-to-configuration boot -# -# TODO(q3k): merge this with //cluster/clustercfg - this should be unified! +# For information about building/deploying machines see //ops/README.md. { hscloud, pkgs, ... }: let + # nixpkgs for cluster machines (.hswaw.net). Currently pinned to an old + # nixpkgs because NixOS modules for kubernetes changed enough that it's not + # super easy to use them as is. + # + # TODO(q3k): fix this: use an old nixpkgs for Kube modules while using + # hscloud nixpkgs for everything else. 
+ nixpkgsCluster = import (pkgs.fetchFromGitHub { + owner = "nixos"; + repo = "nixpkgs-channels"; + rev = "44ad80ab1036c5cc83ada4bfa451dac9939f2a10"; + sha256 = "1b61nzvy0d46cspy07szkc0rggacxiqg9v1py27pkqpj7rvawfsk"; + }) {}; + + # edge01 still lives on an old nixpkgs checkout. + # + # TODO(b/3): unpin and deploy. + nixpkgsBgpwtf = import (pkgs.fetchFromGitHub { + owner = "nixos"; + repo = "nixpkgs-channels"; + rev = "c59ea8b8a0e7f927e7291c14ea6cd1bd3a16ff38"; + sha256 = "1ak7jqx94fjhc68xh1lh35kh3w3ndbadprrb762qgvcfb8351x8v"; + }) {}; + # Stopgap measure to import //cluster/nix machine definitions into new - # //ops/machines infrastructure. + # //ops/ infrastructure. + # # TODO(q3k): inject defs-cluster-k0.nix / defs-machines.nix content via # nixos options instead of having module definitions loading it themselves, # deduplicate list of machines below with defs-machines.nix somehow. - mkClusterMachine = name: pkgs.nixos ({ config, pkgs, ... }: { + clusterMachineConfig = name: [({ config, pkgs, ...}: { # The hostname is used by //cluster/nix machinery to load the appropriate # config from defs-machines into defs-cluster-k0. networking.hostName = name; @@ -34,29 +45,71 @@ let ../cluster/nix/modules/base.nix ../cluster/nix/modules/kubernetes.nix ]; - }); + })]; + # mkMachine builds NixOS modules into a NixOS derivation, and injects + # passthru.hscloud.provision which deploys that configuration over SSH to a + # production machine. mkMachine = pkgs: paths: pkgs.nixos ({ config, pkgs, ... }: { imports = paths; + + config = let + name = config.networking.hostName; + domain = if (config.networking ? domain) && config.networking.domain != null then config.networking.domain else "hswaw.net"; + fqdn = name + "." 
+ domain; + toplevel = config.system.build.toplevel; + + runProvision = '' + #!/bin/sh + set -eu + remote=root@${fqdn} + echo "Configuration for ${fqdn} is ${toplevel}" + nix copy -s --to ssh://$remote ${toplevel} + + running="$(ssh $remote readlink -f /nix/var/nix/profiles/system)" + if [ "$running" == "${toplevel}" ]; then + echo "${fqdn} already running ${toplevel}." + else + echo "/etc/systemd/system diff:" + ssh $remote diff -ur /var/run/current-system/etc/systemd/system ${toplevel}/etc/systemd/system || true + echo "" + echo "" + echo "dry-activate diff:" + ssh $remote ${toplevel}/bin/switch-to-configuration dry-activate + read -p "Do you want to switch to this configuration? " -n 1 -r + echo + if ! [[ $REPLY =~ ^[Yy]$ ]]; then + exit 1 + fi + + echo -ne "\n\nswitch-to-configuration test...\n" + ssh $remote ${toplevel}/bin/switch-to-configuration test + fi + + echo -ne "\n\n" + read -p "Do you want to set this configuration as boot? " -n 1 -r + echo + if ! [[ $REPLY =~ ^[Yy]$ ]]; then + exit 1 + fi + + echo -ne "\n\nsetting system profile...\n" + ssh $remote nix-env -p /nix/var/nix/profiles/system --set ${toplevel} + + echo -ne "\n\nswitch-to-configuration boot...\n" + ssh $remote ${toplevel}/bin/switch-to-configuration boot + ''; + in { + passthru.hscloud.provision = pkgs.writeScript "provision-${fqdn}" runProvision; + }; }); - in { - "bc01n01.hswaw.net" = mkClusterMachine "bc01n01"; - "bc01n02.hswaw.net" = mkClusterMachine "bc01n02"; - "bc01n03.hswaw.net" = mkClusterMachine "bc01n03"; - "dcr01s22.hswaw.net" = mkClusterMachine "dcr01s22"; - "dcr01s24.hswaw.net" = mkClusterMachine "dcr01s24"; + "bc01n01.hswaw.net" = mkMachine nixpkgsCluster (clusterMachineConfig "bc01n01"); + "bc01n02.hswaw.net" = mkMachine nixpkgsCluster (clusterMachineConfig "bc01n02"); + "dcr01s22.hswaw.net" = mkMachine nixpkgsCluster (clusterMachineConfig "dcr01s22"); + "dcr01s24.hswaw.net" = mkMachine nixpkgsCluster (clusterMachineConfig "dcr01s24"); - # edge01 still lives on an old 
nixpkgs checkout. - # TODO(b/3): unpin and deploy. - "edge01.waw.bgp.wtf" = mkMachine ( - import (pkgs.fetchFromGitHub { - owner = "nixos"; - repo = "nixpkgs-channels"; - rev = "c59ea8b8a0e7f927e7291c14ea6cd1bd3a16ff38"; - sha256 = "1ak7jqx94fjhc68xh1lh35kh3w3ndbadprrb762qgvcfb8351x8v"; - }) {} - ) [ + "edge01.waw.bgp.wtf" = mkMachine nixpkgsBgpwtf [ ../bgpwtf/machines/edge01.waw.bgp.wtf.nix ../bgpwtf/machines/edge01.waw.bgp.wtf-hardware.nix ]; diff --git a/ops/provision.nix b/ops/provision.nix new file mode 100644 index 00000000..76054c42 --- /dev/null +++ b/ops/provision.nix @@ -0,0 +1,74 @@ +# Top-level wrapper script for calling per-machine provisioners. +# +# Given ops.machines."edge01.waw.bgp.wtf".config.passthru.hscloud.provision, +# this script lets you run it with: +# $ $(nix-build -A ops.provision) edge01.waw.bgp.wtf +# Or, to first list all available machines: +# $ $(nix-build -A ops.provision) +# +# The main logic of the provisioner script is in machines.nix. + +{ hscloud, pkgs, lib, ... }: + +with lib; with builtins; + +let + + # All machines from ops.machines, keyed by FQDN. + machines = filterAttrs (n: _: n != "__readTree") hscloud.ops.machines; + # Machines' provisioner scripts, keyed by machine FQDN. + machineProvisioners = mapAttrs (_: v: v.config.passthru.hscloud.provision) machines; + # List of machine FQDNs. + machineNames = attrNames machines; + + # User-friendly list of machines by FQDN. + machineList = concatStringsSep "\n" + (map + (name: " - ${name}") + machineNames); + + # Derivation containing bin/provision-FQDN symlinks to machines' provisioners. 
+ forest = pkgs.linkFarm "provision-forest" + (mapAttrsToList + (fqdn: p: { name = "bin/provision-${fqdn}"; path = p; }) + machineProvisioners); +in + +pkgs.writeScript "provision" '' + #!/bin/sh + name="$1" + + usage() { + echo >&2 "Usage: $0 machine|machine.hswaw.net" + echo >&2 "Available machines:" + echo >&2 "${machineList}" + } + + if [ -z "$name" ]; then + usage + exit 1 + fi + + provisioner="${forest}/bin/provision-$name" + if [ ! -e "$provisioner" ]; then + name="$name.hswaw.net" + provisioner="${forest}/bin/provision-$name" + fi + if [ ! -e "$provisioner" ]; then + usage + exit 1 + fi + # :^) + echo -ne "\e[34mh \e[31ms \e[33mc l \e[34mo \e[32mu \e[31md \e[0m" + echo "" + echo "Starting provisioner for $name..." + echo "" + echo "Too slow to evaluate? Equivalent faster command line that rebuilds just one node:" + echo " \$(nix-build -A 'ops.machines.\"$name\".config.passthru.hscloud.provision')" + echo "" + echo "Or, if you want to deploy the same configuration on different machines, just run" + echo "this script again without re-evaluating nix:" + echo " $0 $name" + echo "" + exec "$provisioner" +''