hscloud/ops/machines.nix
Serge Bazanski 55a486ae49 cluster: refactor nix machinery to fit //ops
This is a chonky refactor that gets rid of the previous cluster-centric
defs-* plain nix file setup.

Now, nodes are configured individually in plain nixos modules, and are
provided a view of all other nodes in the 'machines' attribute. Cluster
logic is moved into modules which inspect this array to find other nodes
within the same cluster.

Kubernetes options are not fully clusterified yet (ie., they are still
hardcoded to only provide the 'k0' cluster) but that can be fixed later.
The Ceph machinery is a good example of how that can be done.

The new NixOS configs are zero-diff against prod. While this is done
mostly by keeping the logic, we had to keep a few newly discovered
'bugs' around by adding some temporary options which keep things as they
are. These will be removed in a future CL, then introducing a diff (but
no functional changes, hopefully).

We also remove the nix eval from clustercfg as it was not used anymore
(basically since we refactored certs at some point).

Change-Id: Id79772a96249b0e6344046f96f9c2cb481c4e1f4
Reviewed-on: https://gerrit.hackerspace.pl/c/hscloud/+/1322
Reviewed-by: informatic <informatic@hackerspace.pl>
2022-06-19 11:48:52 +00:00

153 lines
5.6 KiB
Nix

# Top-level file aggregating all machines managed from hscloud.
#
# This allows us to have a common attrset of machines that can be deployed
# in the same way.
#
# For information about building/deploying machines see //ops/README.md.
{ hscloud, pkgs, ... }:
let
# Package set used to build cluster machines (.hswaw.net).
#
# It is imported from a pinned, old nixpkgs-channels revision because the
# NixOS modules for kubernetes changed enough that they are not super easy
# to use as is.
#
# TODO(q3k): fix this: use an old nixpkgs for Kube modules while using
# hscloud nixpkgs for everything else.
nixpkgsCluster =
  let
    # The pinned nixpkgs-channels checkout.
    pinnedSource = pkgs.fetchFromGitHub {
      owner = "nixos";
      repo = "nixpkgs-channels";
      rev = "44ad80ab1036c5cc83ada4bfa451dac9939f2a10";
      sha256 = "1b61nzvy0d46cspy07szkc0rggacxiqg9v1py27pkqpj7rvawfsk";
    };

    # Overlay providing a newer Ceph (16, Pacific, EOL 2023-06-01) than the
    # one shipped in the pinned nixpkgs (15, Octopus, EOL 2022-06-01).
    #
    # Reasons:
    #  1. It fixes a bug in which ceph-volume lvm create fails due to a
    #     rocksdb mismatch (https://tracker.ceph.com/issues/49815).
    #  2. We did not want to start out with an ancient version of Ceph at
    #     the time of deployment.
    #
    # Once nixpkgsCluster is unpinned past a version that contains this
    # Ceph, the overlay can be dropped.
    cephOverlay = self: super:
      let
        # Packages built from the local ./ceph expressions, using boost with
        # python3 support and lua 5.4 taken from the pinned package set.
        cephPackages = super.callPackages ./ceph {
          boost = super.boost17x.override { enablePython = true; python = super.python3; };
          lua = super.lua5_4;
        };
      in {
        inherit (cephPackages) ceph ceph-client;
        ceph-lib = cephPackages.ceph.lib;
      };
  in
    import pinnedSource {
      overlays = [ cephOverlay ];
    };
# Package set for edge01, which still lives on an old nixpkgs checkout.
# Imported with default arguments (no overlays or config).
#
# TODO(b/3): unpin and deploy.
nixpkgsBgpwtf =
  let
    pinnedSource = pkgs.fetchFromGitHub {
      owner = "nixos";
      repo = "nixpkgs-channels";
      rev = "c59ea8b8a0e7f927e7291c14ea6cd1bd3a16ff38";
      sha256 = "1ak7jqx94fjhc68xh1lh35kh3w3ndbadprrb762qgvcfb8351x8v";
    };
  in
    import pinnedSource {};
# Temporary nixpkgs checkout for the customs.hackerspace.pl migration.
# Imported with default arguments (no overlays or config).
#
# NOTE(review): this binding does not appear to be referenced anywhere in
# this file (customs.hackerspace.pl is built with the top-level pkgs) -
# confirm whether it can be removed.
nixpkgsCustoms =
  let
    pinnedSource = pkgs.fetchFromGitHub {
      owner = "nixos";
      repo = "nixpkgs";
      rev = "d12178b1c4a6ef1232c8c677573ba9db204e66ff";
      sha256 = "0p7df7yzi35kblxr5ks0rxxp9cfh269g88xpj60sdhdjvfnn6cp7";
    };
  in
    import pinnedSource {};
# mkMachine builds NixOS modules into a NixOS derivation.
#
# Arguments:
#   machines: value passed to every module as the 'machines' module argument
#             (callers in this file pass the attrset of all machines).
#   pkgs:     nixpkgs package set used to build this machine; its
#             pkgs.nixos function evaluates the modules.
#   paths:    list of NixOS module paths making up the configuration.
#
# On top of the given modules it:
#  1) injects passthru.hscloud.provision which deploys that configuration
#     over SSH to a production machine.
#  2) injects 'workspace' as a nixos module argument which points to the root
#     of the hscloud readTree object. It will contain whatever nixpkgs
#     checkout this file has been invoked with, ie. will not be 'mixed in'
#     with the pkgs argument.
#  3) injects 'machines' as a nixos module argument.
mkMachine = machines: pkgs: paths: pkgs.nixos ({ config, pkgs, ... }: {
  imports = paths;

  config = let
    name = config.networking.hostName;
    # Machines which do not set networking.domain (or set it to null) are
    # assumed to live under hswaw.net.
    domain =
      if (config.networking ? domain) && config.networking.domain != null
      then config.networking.domain
      else "hswaw.net";
    fqdn = name + "." + domain;
    toplevel = config.system.build.toplevel;

    # Interactive provisioning script, run on the operator's machine. It:
    #  - copies the system closure to root@<fqdn>,
    #  - if the target's system profile differs from the new toplevel, shows
    #    a diff of /etc/systemd/system and a dry-activate run, asks for
    #    confirmation and then runs 'switch-to-configuration test',
    #  - asks for confirmation, sets the system profile and runs
    #    'switch-to-configuration boot'.
    #
    # The script relies on bash-only features ('[[ ... =~ ... ]]',
    # 'read -p ... -n 1 -r', 'echo -ne', '==' inside '[ ]'), so it must be
    # interpreted by bash rather than whatever /bin/sh happens to be on the
    # operator's machine (eg. dash, where it would fail).
    runProvision = ''
      #!/usr/bin/env bash
      set -eu
      remote=root@${fqdn}
      echo "Configuration for ${fqdn} is ${toplevel}"
      nix copy -s --to ssh://$remote ${toplevel}
      running="$(ssh $remote readlink -f /nix/var/nix/profiles/system)"
      if [ "$running" == "${toplevel}" ]; then
      echo "${fqdn} already running ${toplevel}."
      else
      echo "/etc/systemd/system diff:"
      ssh $remote diff -ur /var/run/current-system/etc/systemd/system ${toplevel}/etc/systemd/system || true
      echo ""
      echo ""
      echo "dry-activate diff:"
      ssh $remote ${toplevel}/bin/switch-to-configuration dry-activate
      read -p "Do you want to switch to this configuration? " -n 1 -r
      echo
      if ! [[ $REPLY =~ ^[Yy]$ ]]; then
      exit 1
      fi
      echo -ne "\n\nswitch-to-configuration test...\n"
      ssh $remote ${toplevel}/bin/switch-to-configuration test
      fi
      echo -ne "\n\n"
      read -p "Do you want to set this configuration as boot? " -n 1 -r
      echo
      if ! [[ $REPLY =~ ^[Yy]$ ]]; then
      exit 1
      fi
      echo -ne "\n\nsetting system profile...\n"
      ssh $remote nix-env -p /nix/var/nix/profiles/system --set ${toplevel}
      echo -ne "\n\nswitch-to-configuration boot...\n"
      ssh $remote ${toplevel}/bin/switch-to-configuration boot
    '';
  in {
    passthru.hscloud.provision = pkgs.writeScript "provision-${fqdn}" runProvision;

    # TODO(q3k): this should be named hscloud, but that seems to not work. Debug and rename.
    _module.args.workspace = hscloud;
    _module.args.machines = machines;
  };
});
# mkClusterMachine builds a cluster machine: the machine-specific module at
# 'path' is evaluated after the shared cluster modules, against
# nixpkgsCluster. 'machines' is forwarded to mkMachine unchanged.
mkClusterMachine = machines: path:
  let
    # Modules shared by every cluster machine, in import order.
    sharedModules = [
      ../cluster/machines/modules/base.nix
      ../cluster/machines/modules/kube-controlplane.nix
      ../cluster/machines/modules/kube-dataplane.nix
      ../cluster/machines/modules/ceph.nix
    ];
  in
    mkMachine machines nixpkgsCluster (sharedModules ++ [ path ]);
# All machines managed from hscloud, keyed by fully qualified domain name.
#
# This is written as a function of its own fixed point: 'all' is the final
# attrset itself, which is handed to every machine so that each one gets a
# view of all the others (see the 'machines' module argument in mkMachine).
machines = all:
  let
    # Cluster machine built from a single machine-specific module.
    clusterNode = mkClusterMachine all;
    # Non-cluster machine built from an explicit package set and module list.
    standalone = mkMachine all;
  in {
    "bc01n01.hswaw.net" = clusterNode ../cluster/machines/bc01n01.hswaw.net.nix;
    "bc01n02.hswaw.net" = clusterNode ../cluster/machines/bc01n02.hswaw.net.nix;
    "dcr01s22.hswaw.net" = clusterNode ../cluster/machines/dcr01s22.hswaw.net.nix;
    "dcr01s24.hswaw.net" = clusterNode ../cluster/machines/dcr01s24.hswaw.net.nix;

    "edge01.waw.bgp.wtf" = standalone nixpkgsBgpwtf [
      ../bgpwtf/machines/edge01.waw.bgp.wtf.nix
      ../bgpwtf/machines/edge01.waw.bgp.wtf-hardware.nix
    ];

    "customs.hackerspace.pl" = standalone pkgs [
      ../hswaw/machines/customs.hackerspace.pl/configuration.nix
    ];
  };
# Tie the knot: feed the resulting attrset back in as its own argument.
in pkgs.lib.fix machines