# This runs Ceph on hscloud cluster(s).
#
# This lightly wraps the upstream NixOS ceph module, which is already fairly
# lightweight.
#
# Most importantly, it does _not_ attempt to do any cluster
# bootstrapping/maintenance. This means that any configuration action that
# does the following:
#  0. Bringing up a cluster
#  1. Adding/removing Mons
#  2. Changing a Mon IP address
#  3. Adding/removing OSDs
# ... must be done in tandem with manual operations on the affected nodes. For
# example, bootstrapping a cluster will involve keyring and monmap management,
# changing anything with mons will involve monmap management, adding new OSDs
# will require provisioning them with ceph-volume (a rough sketch of that step
# is at the bottom of this file), etc.
#
# This is in stark contrast to a fully-managed solution like Rook. Since we
# don't have hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to do reliably.

{ config, lib, pkgs, ... }:

with builtins;
with lib;

with ((import ../defs-cluster-k0.nix) config.networking.hostName);
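
# The rest of this module assumes roughly the following shape for the values
# pulled in from defs-cluster-k0.nix. This is an illustrative sketch only
# (values and machine names are placeholders, and only the attributes actually
# used below are listed), not the authoritative definition:
#
#   cephCluster = {
#     name = "...";   # cluster name, also used for the ${name}.conf symlink
#     fsid = "...";   # cluster UUID
#     # Mons, keyed by machine name (only the keys are used here).
#     mons = { somehost = { ... }; };
#     # OSDs, keyed by machine name; each machine carries a list of OSDs.
#     osds = {
#       somehost = [
#         { id = 0; uuid = "..."; path = "/dev/disk/by-id/..."; }
#       ];
#     };
#   };
#   # Per-machine metadata; at least .ipAddr is expected for every mon.
#   machinesByName = { somehost = { ipAddr = "..."; }; };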

let
  machineName = config.networking.hostName;
  isMon = hasAttr machineName cephCluster.mons;
  isOsd = hasAttr machineName cephCluster.osds;
  hasCeph = isMon || isOsd;

  # This NixOS Ceph option fragment is present on every machine that runs a
  # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
  # this machine.
  cephMonConfig = if isMon then {
    mon = {
      enable = true;
      daemons = [ machineName ];
    };
    mgr = {
      enable = true;
      daemons = [ machineName ];
    };
  } else {};

  # Same as cephMonConfig, but for OSDs.
  cephOsdConfig = if isOsd then {
    osd = {
      enable = true;
      daemons = map (el: "${toString el.id}") cephCluster.osds.${machineName};
    };
    rgw = {
      enable = true;
      daemons = [ "rook-k0.rgw.${machineName}" ];
    };
  } else {};

  # The full option fragment for services.ceph. It contains ceph.conf
  # fragments (in .global.*) and merges in ceph{Mon,Osd}Config.
  cephConfig = {
    enable = true;
    global = {
      fsid = cephCluster.fsid;
      clusterName = cephCluster.name;

      # Every Ceph node always attempts to connect to all mons.
      monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons);
      monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons);
    };
  } // cephMonConfig // cephOsdConfig;

  # Merge ceph-volume lvm activate into the ceph-osd-ID services.
  #
  # This is needed because the upstream module seems to have been written with
  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
  # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
  # contains everything that OSD needs to work.
  #
  # Bluestore is a bit different. Instead of a normal filesystem being
  # mounted, Ceph manages a block device fully using LVM (and in our case,
  # dmcrypt). Every bluestore volume needs to be 'activated' before it can be
  # used by an OSD. Activation takes care of the LVM and dmcrypt mounts, and
  # prepares the /var/lib/ceph/osd/$cluster-$id directory as if a filestore
  # was present there. However, instead of being a disk mount, it's a tmpfs
  # into which a bunch of files are dropped, loaded from the raw LVM device.
  #
  # To make the upstream NixOS OSD module work with bluestore, we do the
  # following:
  #  1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
  #     path. This gates the service on that device being present.
  #  2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
  #  3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
  #     which seems to look for them on PATH instead of being properly
  #     nixified).
  #
  # We also inject smartmontools into PATH for smartctl, which allows the OSD
  # to monitor device health.
  osdActivateServices = listToAttrs (map (el: let
    osdId = toString el.id;
    osdUuid = el.uuid;
    diskPath = el.path;
  in {
    name = "ceph-osd-${osdId}";
    value = {
      path = with pkgs; [
        lvm2
        cryptsetup
        smartmontools
      ];
      serviceConfig = {
        ExecStartPre = lib.mkForce [
          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
            #!/bin/sh
            set -e
            dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/"
            disk="${diskPath}"
            uuid="${osdUuid}"
            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
              echo "Volume $dir already activated, skipping..."
            else
              echo "Activating $dir with $disk, uuid $uuid..."
              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
            fi
          '')))
          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}"
        ];
      };
      unitConfig = {
        ConditionPathExists = lib.mkForce diskPath;
      };
    };
  }) (if isOsd then cephCluster.osds.${machineName} else []));

in rec {
  services.ceph = if hasCeph then cephConfig else {};

  environment.systemPackages = with pkgs; [
    ceph cryptsetup smartmontools
  ];

  systemd.services = osdActivateServices;

  # Hack: the upstream ceph module should generate ${clusterName}.conf instead
  # of ceph.conf; until it does, let's just symlink it.
  environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf";
}
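
# For reference, a rough sketch of the manual ceph-volume step mentioned in
# the header comment. This is not automated here, exact flags depend on the
# device layout and on whether dmcrypt is used, and the paths below are
# placeholders:
#
#   # On the machine that will host the new OSD:
#   ceph-volume lvm create --bluestore --dmcrypt --data /dev/disk/by-id/...
#   # Then read back the assigned OSD id and OSD fsid (uuid):
#   ceph-volume lvm list
#
# The resulting id, uuid and /dev/disk/by-id path are what make up an entry
# in cephCluster.osds.<machineName> in defs-cluster-k0.nix.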