# This runs Ceph on hscloud cluster(s).
#
# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
#
# Most importantly, it does _not_ attempt to do any cluster
# bootstrapping/maintenance. This means that any configuration action that
# does any of the following:
# 0. Bringing up a cluster
# 1. Adding/removing Mons
# 2. Changing a Mon IP address
# 3. Adding/removing OSDs
# ... must be done in tandem with manual operations on the affected nodes. For
# example, bootstrapping a cluster will involve keyring and monmap management,
# changing anything with mons will involve monmap management, adding new OSDs
# will require provisioning them with ceph-volume, etc.
#
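# As a rough, illustrative sketch (these are not commands this module runs,
# and the device paths / mon names / fsids below are made up), the manual side
# of that work looks something like:
#
#   # provision a new encrypted bluestore OSD on a fresh disk
#   ceph-volume lvm prepare --bluestore --dmcrypt --data /dev/disk/by-id/some-disk
#   # bring up a brand new cluster: initial mon keyring and monmap
#   ceph-authtool --create-keyring /tmp/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
#   monmaptool --create --add <mon-name> <mon-ip> --fsid <fsid> /tmp/monmap
#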
# This is in stark contrast to a fully-managed solution like rook. Since we
# don't have hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to do reliably.

{ config, lib, pkgs, ... }:

with builtins;
with lib;

with (( import ../defs-cluster-k0.nix ) config.networking.hostName);

let
  machineName = config.networking.hostName;
  isMon = hasAttr machineName cephCluster.mons;
  isOsd = hasAttr machineName cephCluster.osds;
  hasCeph = isMon || isOsd;
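
  # The cluster definitions come from defs-cluster-k0.nix (not shown here). As
  # a hedged sketch of the shape this module relies on: cephCluster.mons is an
  # attrset keyed by machine name, cephCluster.osds maps machine names to lists
  # of OSD descriptors, and machinesByName.<name>.ipAddr resolves mon
  # addresses. The binding below is illustrative only (made-up values) and is
  # not used anywhere in this module.
  exampleOsdDescriptor = {
    id = 0;                                        # numeric OSD ID, becomes ceph-osd-0
    uuid = "00000000-0000-0000-0000-000000000000"; # OSD volume UUID passed to ceph-volume
    path = "/dev/disk/by-id/example-disk";         # block device that gates the unit
  };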

  # This NixOS Ceph option fragment is present on every machine that runs a
  # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
  # this machine.
  cephMonConfig = if isMon then {
    mon = {
      enable = true;
      daemons = [ machineName ];
    };
    mgr = {
      enable = true;
      daemons = [ machineName ];
    };
  } else {};

  # Same as for cephMonConfig, but this time for OSDs.
  cephOsdConfig = if isOsd then {
    osd = {
      enable = true;
      daemons = map (el: "${toString el.id}") cephCluster.osds.${machineName};
    };
    rgw = {
      enable = true;
      daemons = [ "rook-k0.rgw.${machineName}" ];
    };
  } else {};

  # The full option fragment for services.ceph. It contains ceph.conf fragments
  # (in .global.*) and merges ceph{Mon,Osd}Config.
  cephConfig = {
    enable = true;
    global = {
      fsid = cephCluster.fsid;
      clusterName = cephCluster.name;

      # Every Ceph node always attempts to connect to all mons.
      monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons);
      monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons);
    };
  } // cephMonConfig // cephOsdConfig;
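
  # For orientation, the fragment above gets rendered by the upstream module
  # into a ceph.conf whose [global] section looks roughly like the following
  # (hostnames and addresses here are made up for illustration):
  #
  #   [global]
  #   fsid = <cephCluster.fsid>
  #   mon host = 10.0.0.1,10.0.0.2,10.0.0.3
  #   mon initial members = machine-a,machine-b,machine-c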

  # Merge ceph-volume lvm activate into ceph-osd-ID services.
  #
  # This is because the upstream module seems to have been written with
  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
  # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
  # contains everything for that OSD to work.
  #
  # Bluestore is a bit different. Instead of a normal filesystem being mounted,
  # Ceph manages a block device fully using LVM (and in our case, dmcrypt).
  # Every bluestore volume needs to be 'activated' before it can be used by an
  # OSD. Activation takes care of doing the LVM and dmcrypt mounts, and prepares
  # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
  # there. However, instead of being a disk mount, it's a tmpfs into which a
  # bunch of files are dropped, loaded from the LVM raw device.
  #
  # To make the upstream NixOS module's OSD units work with bluestore, we do
  # the following:
  # 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
  #    path. This gates the service on that device being present.
  # 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
  # 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
  #    which seems to look for them on PATH instead of being properly
  #    nixified).
  #
  # We also inject smartmontools into PATH for smartctl, which allows the OSD
  # to monitor device health.
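  #
  # After a successful activation, /var/lib/ceph/osd/<cluster>-<id> is a tmpfs
  # containing (roughly; exact contents depend on the Ceph release) files like:
  #   block   -> symlink to the dm-crypt/LVM block device
  #   keyring    the OSD's cephx key (this is what the script below checks for)
  #   type       contains "bluestore"
  #   whoami     contains the OSD ID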
  osdActivateServices = listToAttrs (map (el: let
    osdId = toString el.id;
    osdUuid = el.uuid;
    diskPath = el.path;
  in {
    name = "ceph-osd-${osdId}";
    value = {
      path = with pkgs; [
        lvm2
        cryptsetup
        smartmontools
      ];
      serviceConfig = {
        ExecStartPre = lib.mkForce [
          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
            #!/bin/sh
            set -e
            dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/"
            disk="${diskPath}"
            uuid="${osdUuid}"
            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
              echo "Volume $dir already activated, skipping..."
            else
              echo "Activating $dir with $disk, uuid $uuid..."
              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
            fi
          '')))

          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}"
        ];
      };
      unitConfig = {
        ConditionPathExists = lib.mkForce el.path;
      };
    };
  }) (if isOsd then cephCluster.osds.${machineName} else []));

in rec {
  services.ceph = if hasCeph then cephConfig else {};

  environment.systemPackages = with pkgs; [
    ceph cryptsetup smartmontools
  ];

  systemd.services = osdActivateServices;

  # Hack: the upstream ceph module should generate ${clusterName}.conf instead
  # of ceph.conf; for now, let's just symlink it.
  environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf";
}