diff --git a/cluster/nix/defs-cluster-k0.nix b/cluster/nix/defs-cluster-k0.nix index c3519cc1..cd0fcacf 100644 --- a/cluster/nix/defs-cluster-k0.nix +++ b/cluster/nix/defs-cluster-k0.nix @@ -10,8 +10,60 @@ in rec { fqdn = machineName + domain; machine = (builtins.head (builtins.filter (n: n.fqdn == fqdn) machines)); otherMachines = (builtins.filter (n: n.fqdn != fqdn) machines); + machinesByName = builtins.listToAttrs (map (m: { name = m.name; value = m; }) machines); inherit machines; + # Ceph cluster to run systemd modules for. + cephCluster = { + fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354"; + name = "k0"; + + # Map from node name to mon configuration (currently always empty). + # + # Each mon also runs a mgr daemon (which is a leader-elected kitchen + # sink^W^Whousekeeping service hanging off of a mon cluster). + # + # Consult the Ceph documentation + # (https://docs.ceph.com/en/pacific/rados/operations/add-or-rm-mons/) on + # how to actually carry out mon-related maintenance operations. + mons = { + bc01n02 = {}; + }; + + # Map from node name to list of disks on node. + # Each disk is: + # id: OSD numerical ID, eg. 0 for osd.0. You get this after running + # ceph-volume lvm create. + # path: Filesystem path for disk backing drive. This should be something + # in /dev/disk/by-id for safety. This is only used to gate OSD + # daemon startup by disk presence. + # uuid: OSD uuid/fsid. You get this after running ceph-volume lvm create. + # + # Quick guide how to set up a new OSD (but please refer to the Ceph manual): + # 0. Copy /var/lib/ceph/bootstrap-osd/k0.keyring from another OSD node to + # the new OSD node, if this is a new node. Remember to chown ceph:ceph + # chmod 0600! + # 1. nix-shell -p ceph lvm2 cryptsetup (if on a node that's not yet an OSD) + # 2. ceph-volume --cluster k0 lvm create --bluestore --data /dev/sdX --no-systemd --dmcrypt + # 3. The above will mount a tmpfs on /var/lib/ceph/osd/k0-X. X is the new + # osd id. 
A file named fsid inside this directory is the new OSD fsid/uuid. + # 4. Configure osds below with the above information, redeploy node from nix. + osds = { + dcr01s22 = [ + { id = 0; path = "/dev/disk/by-id/scsi-35000c500850293e3"; uuid = "314034c5-474c-4d0d-ba41-36a881c52560";} + { id = 1; path = "/dev/disk/by-id/scsi-35000c500850312cb"; uuid = "a7f1baa0-0fc3-4ab1-9895-67abdc29de03";} + { id = 2; path = "/dev/disk/by-id/scsi-35000c5008508e3ef"; uuid = "11ac8316-6a87-48a7-a0c7-74c3cef6c2fa";} + { id = 3; path = "/dev/disk/by-id/scsi-35000c5008508e23f"; uuid = "c6b838d1-b08c-4788-936c-293041ed2d4d";} + ]; + dcr01s24 = [ + { id = 4; path = "/dev/disk/by-id/scsi-35000c5008509199b"; uuid = "a2b4663d-bd8f-49b3-b0b0-195c56ba252f";} + { id = 5; path = "/dev/disk/by-id/scsi-35000c50085046abf"; uuid = "a2242989-ccce-4367-8813-519b64b5afdb";} + { id = 6; path = "/dev/disk/by-id/scsi-35000c5008502929b"; uuid = "7deac89c-22dd-4c2b-b3cc-43ff7f990fd6";} + { id = 7; path = "/dev/disk/by-id/scsi-35000c5008502a323"; uuid = "e305ebb3-9cac-44d2-9f1d-bbb72c8ab51f";} + ]; + }; + }; + pki = rec { make = (radix: name: rec { ca = ./../certs + "/ca-${radix}.crt"; diff --git a/cluster/nix/modules/ceph.nix b/cluster/nix/modules/ceph.nix new file mode 100644 index 00000000..bc3180f6 --- /dev/null +++ b/cluster/nix/modules/ceph.nix @@ -0,0 +1,145 @@ +# This runs Ceph on hscloud cluster(s). +# +# This lightly wraps the upstream NixOS ceph module, which is already fairly light. +# +# Most importantly, it does _not_ attempt to do any cluster +# bootstrapping/maintenance. This means, that any configuration action that +# does the following: +# 0. Bringing up a cluster +# 1. Adding/removing Mons +# 2. Changing a Mon IP address +# 3. Adding/removing OSDs +# ... must be done in tandem with manual operations on the affected nodes. 
For +# example, bootstrapping a cluster will involve keychain and monmap management, +# changing anything with mons will involve monmap management, adding new OSDs +# will require provisioning them with ceph-volume, etc. +# +# This is in stark contrast to a fully-managed solution like rook. Since we +# don't have hundreds of clusters, none of the above is automated, especially +# as that kind of automation is quite tricky to do reliably. + +{ config, lib, pkgs, ... }: + +with builtins; +with lib; + +with (( import ../defs-cluster-k0.nix ) config.networking.hostName); + +let + machineName = config.networking.hostName; + isMon = hasAttr machineName cephCluster.mons; + isOsd = hasAttr machineName cephCluster.osds; + hasCeph = isMon || isOsd; + + # This NixOS Ceph option fragment is present on every machine that runs a + # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on + # this machine. + cephMonConfig = if isMon then { + mon = { + enable = true; + daemons = [ machineName ]; + }; + mgr = { + enable = true; + daemons = [ machineName ]; + }; + } else {}; + + # Same as for cephMonConfig, but this time for OSDs. + cephOsdConfig = if isOsd then { + osd = { + enable = true; + daemons = map (el: "${toString el.id}") cephCluster.osds.${machineName}; + }; + } else {}; + + # The full option fragment for services.ceph. It contains ceph.conf fragments + # (in .global.*) and merges ceph{Mon,Osd}Config. + cephConfig = { + enable = true; + global = { + fsid = cephCluster.fsid; + clusterName = cephCluster.name; + + # Every Ceph node always attempts to connect to all mons. + monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons); + monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons); + }; + } // cephMonConfig // cephOsdConfig; + + # Merge ceph-volume lvm activate into ceph-osd-ID services. 
+ # + # This is because the upstream module seems to have been written with + # filestore in mind, not bluestore. Filestore is relatively simple: an xfs + # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, that in turn + # contains everything for that OSD to work. + # + # Bluestore is a bit different. Instead of a normal filesystem being mounted, + # Ceph manages a block device fully using LVM (and in our case, dmcrypt). + # Every bluestore volume needs to be 'activated' before it can be used by an + # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares + # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present + # there. However, instead of this being a diskmount, it's instead a tmpfs + # into which a bunch of files are dropped, loaded from the LVM raw device. + # + # To make the upstream NixOS module OSD work with bluestore, we do the following: + # 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id + # path. This gates the service on that device being present. + # 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed. + # 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume, + # which seems to look for them on PATH instead of being properly + # nixified). + # + # We also inject smartmontools into PATH for smartctl, which allows the OSD + # to monitor device health. + osdActivateServices = listToAttrs (map (el: let + osdId = toString el.id; + osdUuid = el.uuid; + diskPath = el.path; + in { + name = "ceph-osd-${osdId}"; + value = { + path = with pkgs; [ + lvm2 + cryptsetup + smartmontools + ]; + serviceConfig = { + ExecStartPre = lib.mkForce [ + ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" '' + #!/bin/sh + set -e + dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/" + disk="${el.path}" + uuid="${osdUuid}" + if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then + echo "Volume $dir already activated, skipping..." 
+ else + echo "Activating $dir with $disk, uuid $uuid..." + ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid + fi + + ''))) + + "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}" + ]; + }; + unitConfig = { + ConditionPathExists = lib.mkForce el.path; + }; + }; + }) (if isOsd then cephCluster.osds.${machineName} else [])); + +in rec { + services.ceph = if hasCeph then cephConfig else {}; + + environment.systemPackages = with pkgs; [ + ceph cryptsetup smartmontools + ]; + + systemd.services = osdActivateServices; + + # Hack - the upstream ceph module should generate ${clusterName}.conf instead + # of ceph.conf, let's just symlink it. + environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf"; +} diff --git a/ops/ceph/0000-fix-SPDK-build-env.patch b/ops/ceph/0000-fix-SPDK-build-env.patch new file mode 100644 index 00000000..a117408b --- /dev/null +++ b/ops/ceph/0000-fix-SPDK-build-env.patch @@ -0,0 +1,11 @@ +--- a/cmake/modules/BuildSPDK.cmake ++++ b/cmake/modules/BuildSPDK.cmake +@@ -35,7 +35,7 @@ macro(build_spdk) + # unset $CFLAGS, otherwise it will interfere with how SPDK sets + # its include directory. + # unset $LDFLAGS, otherwise SPDK will fail to mock some functions. 
+- BUILD_COMMAND env -i PATH=$ENV{PATH} CC=${CMAKE_C_COMPILER} ${make_cmd} EXTRA_CFLAGS="${spdk_CFLAGS}" ++ BUILD_COMMAND env -i PATH=$ENV{PATH} CC=${CMAKE_C_COMPILER} ${make_cmd} EXTRA_CFLAGS="${spdk_CFLAGS}" C_OPT="-mssse3" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "true") + unset(make_cmd) diff --git a/ops/ceph/COPYING b/ops/ceph/COPYING new file mode 100644 index 00000000..fe46c6a1 --- /dev/null +++ b/ops/ceph/COPYING @@ -0,0 +1,20 @@ +Copyright (c) 2003-2021 Eelco Dolstra and the Nixpkgs/NixOS contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/ops/ceph/README.md b/ops/ceph/README.md new file mode 100644 index 00000000..1a25652a --- /dev/null +++ b/ops/ceph/README.md @@ -0,0 +1,3 @@ +Ceph 16.2.4 backport from nixpkgs @ 2021-09-10. + +To be removed once nixpkgs on hscloud nodes is bumped past this version being available upstream. 
diff --git a/ops/ceph/default.nix b/ops/ceph/default.nix new file mode 100644 index 00000000..0ccc96ce --- /dev/null +++ b/ops/ceph/default.nix @@ -0,0 +1,254 @@ +{ lib, stdenv, runCommand, fetchurl +, ensureNewerSourcesHook +, cmake, pkg-config +, which, git +, boost +, libxml2, zlib, lz4 +, openldap, lttng-ust +, babeltrace, gperf +, gtest +, cunit, snappy +, makeWrapper +, leveldb, oathToolkit +, libnl, libcap_ng +, rdkafka +, nixosTests +, cryptsetup +, sqlite +, lua +, icu +, bzip2 +, doxygen +, graphviz +, fmt +, python3 + +# Optional Dependencies +, yasm ? null, fcgi ? null, expat ? null +, curl ? null, fuse ? null +, libedit ? null, libatomic_ops ? null +, libs3 ? null + +# Mallocs +, jemalloc ? null, gperftools ? null + +# Crypto Dependencies +, cryptopp ? null +, nss ? null, nspr ? null + +# Linux Only Dependencies +, linuxHeaders, util-linux, libuuid, udev, keyutils, rdma-core, rabbitmq-c +, libaio ? null, libxfs ? null, zfs ? null, liburing ? null +, ... +}: + +# We must have one crypto library +assert cryptopp != null || (nss != null && nspr != null); + +let + shouldUsePkg = pkg: if pkg != null && pkg.meta.available then pkg else null; + + optYasm = shouldUsePkg yasm; + optFcgi = shouldUsePkg fcgi; + optExpat = shouldUsePkg expat; + optCurl = shouldUsePkg curl; + optFuse = shouldUsePkg fuse; + optLibedit = shouldUsePkg libedit; + optLibatomic_ops = shouldUsePkg libatomic_ops; + optLibs3 = shouldUsePkg libs3; + + optJemalloc = shouldUsePkg jemalloc; + optGperftools = shouldUsePkg gperftools; + + optCryptopp = shouldUsePkg cryptopp; + optNss = shouldUsePkg nss; + optNspr = shouldUsePkg nspr; + + optLibaio = shouldUsePkg libaio; + optLibxfs = shouldUsePkg libxfs; + optZfs = shouldUsePkg zfs; + + hasRadosgw = optFcgi != null && optExpat != null && optCurl != null && optLibedit != null; + + + # Malloc implementation (can be jemalloc, tcmalloc or null) + malloc = if optJemalloc != null then optJemalloc else optGperftools; + + # We prefer nss over cryptopp + 
cryptoStr = if optNss != null && optNspr != null then "nss" else + if optCryptopp != null then "cryptopp" else "none"; + + cryptoLibsMap = { + nss = [ optNss optNspr ]; + cryptopp = [ optCryptopp ]; + none = [ ]; + }; + + getMeta = description: with lib; { + homepage = "https://ceph.com/"; + inherit description; + license = with licenses; [ lgpl21 gpl2 bsd3 mit publicDomain ]; + maintainers = with maintainers; [ adev ak johanot krav ]; + platforms = [ "x86_64-linux" "aarch64-linux" ]; + }; + + ceph-common = python.pkgs.buildPythonPackage rec{ + pname = "ceph-common"; + inherit src version; + + sourceRoot = "ceph-${version}/src/python-common"; + + checkInputs = [ python.pkgs.pytest ]; + propagatedBuildInputs = with python.pkgs; [ pyyaml six ]; + + meta = getMeta "Ceph common module for code shared by manager modules"; + }; + + python = python3.override { + packageOverrides = self: super: { + # scipy > 1.3 breaks diskprediction_local, leading to mgr hang on startup + # Bump once these issues are resolved: + # https://tracker.ceph.com/issues/42764 https://tracker.ceph.com/issues/45147 + scipy = super.scipy.overridePythonAttrs (oldAttrs: rec { + version = "1.3.3"; + src = oldAttrs.src.override { + inherit version; + sha256 = "02iqb7ws7fw5fd1a83hx705pzrw1imj7z0bphjsl4bfvw254xgv4"; + }; + doCheck = false; + }); + }; + }; + + ceph-python-env = python.withPackages (ps: [ + ps.sphinx + ps.flask + ps.cython + ps.setuptools + ps.virtualenv + # Libraries needed by the python tools + ps.Mako + ceph-common + ps.cherrypy + ps.cmd2 + ps.colorama + ps.python-dateutil + ps.jsonpatch + ps.pecan + ps.prettytable + ps.pyopenssl + ps.pyjwt + ps.webob + ps.bcrypt + ps.scipy + ps.six + ps.pyyaml + ]); + sitePackages = ceph-python-env.python.sitePackages; + + version = "16.2.4"; + src = fetchurl { + url = "http://download.ceph.com/tarballs/ceph-${version}.tar.gz"; + sha256 = "sha256-J6FVK7feNN8cGO5BSDlfRGACAzchmRUSWR+a4ZgeWy0="; + }; +in rec { + ceph = stdenv.mkDerivation { + pname = 
"ceph"; + inherit src version; + + patches = [ + ./0000-fix-SPDK-build-env.patch + ]; + + nativeBuildInputs = [ + cmake + pkg-config which git python.pkgs.wrapPython makeWrapper + python.pkgs.python # for the toPythonPath function + (ensureNewerSourcesHook { year = "1980"; }) + python + fmt + # for building docs/man-pages presumably + doxygen + graphviz + ]; + + buildInputs = cryptoLibsMap.${cryptoStr} ++ [ + boost ceph-python-env libxml2 optYasm optLibatomic_ops optLibs3 + malloc zlib openldap lttng-ust babeltrace gperf gtest cunit + snappy lz4 oathToolkit leveldb libnl libcap_ng rdkafka + cryptsetup sqlite lua icu bzip2 + ] ++ lib.optionals stdenv.isLinux [ + linuxHeaders util-linux libuuid udev keyutils liburing optLibaio optLibxfs optZfs + # ceph 14 + rdma-core rabbitmq-c + ] ++ lib.optionals hasRadosgw [ + optFcgi optExpat optCurl optFuse optLibedit + ]; + + pythonPath = [ ceph-python-env "${placeholder "out"}/${ceph-python-env.sitePackages}" ]; + + preConfigure ='' + substituteInPlace src/common/module.c --replace "/sbin/modinfo" "modinfo" + substituteInPlace src/common/module.c --replace "/sbin/modprobe" "modprobe" + substituteInPlace src/common/module.c --replace "/bin/grep" "grep" + + # for pybind/rgw to find internal dep + export LD_LIBRARY_PATH="$PWD/build/lib''${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH" + # install target needs to be in PYTHONPATH for "*.pth support" check to succeed + # set PYTHONPATH, so the build system doesn't silently skip installing ceph-volume and others + export PYTHONPATH=${ceph-python-env}/${sitePackages}:$lib/${sitePackages}:$out/${sitePackages} + patchShebangs src/script src/spdk src/test src/tools + ''; + + cmakeFlags = [ + "-DWITH_SYSTEM_ROCKSDB=OFF" # breaks Bluestore + "-DCMAKE_INSTALL_DATADIR=${placeholder "lib"}/lib" + + "-DWITH_SYSTEM_BOOST=ON" + "-DWITH_SYSTEM_GTEST=ON" + "-DMGR_PYTHON_VERSION=${ceph-python-env.python.pythonVersion}" + "-DWITH_SYSTEMD=OFF" + "-DWITH_TESTS=OFF" + "-DWITH_CEPHFS_SHELL=ON" + # TODO breaks 
with sandbox, tries to download stuff with npm + "-DWITH_MGR_DASHBOARD_FRONTEND=OFF" + # WITH_XFS has been set default ON from Ceph 16, keeping it optional in nixpkgs for now + ''-DWITH_XFS=${if optLibxfs != null then "ON" else "OFF"}'' + ] ++ lib.optional stdenv.isLinux "-DWITH_SYSTEM_LIBURING=ON"; + + postFixup = '' + wrapPythonPrograms + wrapProgram $out/bin/ceph-mgr --prefix PYTHONPATH ":" "$(toPythonPath ${placeholder "out"}):$(toPythonPath ${ceph-python-env})" + + # Test that ceph-volume exists since the build system has a tendency to + # silently drop it with misconfigurations. + test -f $out/bin/ceph-volume + ''; + + outputs = [ "out" "lib" "dev" "doc" "man" ]; + + doCheck = false; # uses pip to install things from the internet + + # Takes 7+h to build with 2 cores. + requiredSystemFeatures = [ "big-parallel" ]; + + meta = getMeta "Distributed storage system"; + + passthru.version = version; + passthru.tests = { inherit (nixosTests) ceph-single-node ceph-multi-node ceph-single-node-bluestore; }; + }; + + ceph-client = runCommand "ceph-client-${version}" { + meta = getMeta "Tools needed to mount Ceph's RADOS Block Devices"; + } '' + mkdir -p $out/{bin,etc,${sitePackages},share/bash-completion/completions} + cp -r ${ceph}/bin/{ceph,.ceph-wrapped,rados,rbd,rbdmap} $out/bin + cp -r ${ceph}/bin/ceph-{authtool,conf,dencoder,rbdnamer,syn} $out/bin + cp -r ${ceph}/bin/rbd-replay* $out/bin + cp -r ${ceph}/${sitePackages} $out/${sitePackages} + cp -r ${ceph}/etc/bash_completion.d $out/share/bash-completion/completions + # wrapPythonPrograms modifies .ceph-wrapped, so lets just update its paths + substituteInPlace $out/bin/ceph --replace ${ceph} $out + substituteInPlace $out/bin/.ceph-wrapped --replace ${ceph} $out + ''; +} diff --git a/ops/machines.nix b/ops/machines.nix index 5401e30a..9a54c56c 100644 --- a/ops/machines.nix +++ b/ops/machines.nix @@ -19,7 +19,28 @@ let repo = "nixpkgs-channels"; rev = "44ad80ab1036c5cc83ada4bfa451dac9939f2a10"; sha256 = 
"1b61nzvy0d46cspy07szkc0rggacxiqg9v1py27pkqpj7rvawfsk"; - }) {}; + }) { + overlays = [ + (self: super: rec { + # Use a newer version of Ceph (16, Pacific, EOL 2023-06-01) than in + # this nixpkgs (15, Octopus, EOL 2022-06-01). + # + # This is to: + # 1. Fix a bug in which ceph-volume lvm create fails due to a rocksdb + # mismatch (https://tracker.ceph.com/issues/49815) + # 2. At the time of deployment not start out with an ancient version + # of Ceph. + # + # Once we unpin nixpkgsCluster past a version that contains this Ceph, + # this can be unoverlayed. + inherit (super.callPackages ./ceph { + boost = super.boost17x.override { enablePython = true; python = super.python3; }; + lua = super.lua5_4; + }) ceph ceph-client; + ceph-lib = ceph.lib; + }) + ]; + }; # edge01 still lives on an old nixpkgs checkout. # @@ -44,6 +65,7 @@ let imports = [ ../cluster/nix/modules/base.nix ../cluster/nix/modules/kubernetes.nix + ../cluster/nix/modules/ceph.nix ]; })];