
cluster: deploy NixOS-based ceph

First pass at a non-rook-managed Ceph cluster. We call it k0 instead of
ceph-waw4, as we are now pretty much sure that we will always have a
one-kube-cluster-to-one-ceph-cluster correspondence, with different Ceph
pools for different media kinds (if at all).

For now this has one mon and spinning rust OSDs. This can be iterated on
to make it less terrible with time.

See b/6 for more details.

Change-Id: Ie502a232c700af93f33fcad9fa1c57058161aa11
master
q3k 2021-09-10 22:30:56 +00:00
parent 1dbefed537
commit 9848e7e15f
7 changed files with 508 additions and 1 deletion

@@ -10,8 +10,60 @@ in rec {
fqdn = machineName + domain;
machine = (builtins.head (builtins.filter (n: n.fqdn == fqdn) machines));
otherMachines = (builtins.filter (n: n.fqdn != fqdn) machines);
machinesByName = builtins.listToAttrs (map (m: { name = m.name; value = m; }) machines);
inherit machines;
# Ceph cluster to run systemd modules for.
cephCluster = {
fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354";
name = "k0";
# Map from node name to mon configuration (currently always empty).
#
# Each mon also runs a mgr daemon (which is a leader-elected kitchen
# sink^W^Whousekeeping service hanging off of a mon cluster).
#
# Consult the Ceph documentation
# (https://docs.ceph.com/en/pacific/rados/operations/add-or-rm-mons/) on
# how to actually carry out mon-related maintenance operations.
mons = {
bc01n02 = {};
};
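# Growing to more mons would just be more entries in this map (the second
# node name below is hypothetical), but the mon daemon itself must still be
# bootstrapped/removed by hand per the documentation linked above:
#   mons = {
#     bc01n02 = {};
#     bc01n03 = {};
#   };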
# Map from node name to list of disks on node.
# Each disk is:
# id: OSD numerical ID, e.g. 0 for osd.0. You get this after running
# ceph-volume lvm create.
# path: Filesystem path for disk backing drive. This should be something
# in /dev/disk/by-id for safety. This is only used to gate OSD
# daemon startup by disk presence.
# uuid: OSD uuid/fsid. You get this after running ceph-volume lvm create.
#
# Quick guide on how to set up a new OSD (but please refer to the Ceph manual):
# 0. Copy /var/lib/ceph/bootstrap-osd/k0.keyring from another OSD node to
# the new OSD node, if this is a new node. Remember to chown ceph:ceph and
# chmod 0600 the keyring!
# 1. nix-shell -p ceph lvm2 cryptsetup (if on a node that's not yet an OSD)
# 2. ceph-volume --cluster k0 lvm create --bluestore --data /dev/sdX --no-systemd --dmcrypt
# 3. The above will mount a tmpfs on /var/lib/ceph/osd/k0-X. X is the new
# osd id. A file named fsid inside this directory is the new OSD fsid/uuid.
# 4. Configure osds below with the above information (see the sketch below), then redeploy the node from nix.
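#
# As a concrete sketch of steps 1-4 (the device path, OSD id and uuid below
# are made-up examples, not values from this cluster):
#   nix-shell -p ceph lvm2 cryptsetup
#   ceph-volume --cluster k0 lvm create --bluestore --data /dev/sdX --no-systemd --dmcrypt
#   cat /var/lib/ceph/osd/k0-8/fsid    # prints the new OSD's uuid
# ...then add { id = 8; path = "/dev/disk/by-id/..."; uuid = "<that fsid>"; }
# to the owning node's list below and redeploy it from nix.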
osds = {
dcr01s22 = [
{ id = 0; path = "/dev/disk/by-id/scsi-35000c500850293e3"; uuid = "314034c5-474c-4d0d-ba41-36a881c52560";}
{ id = 1; path = "/dev/disk/by-id/scsi-35000c500850312cb"; uuid = "a7f1baa0-0fc3-4ab1-9895-67abdc29de03";}
{ id = 2; path = "/dev/disk/by-id/scsi-35000c5008508e3ef"; uuid = "11ac8316-6a87-48a7-a0c7-74c3cef6c2fa";}
{ id = 3; path = "/dev/disk/by-id/scsi-35000c5008508e23f"; uuid = "c6b838d1-b08c-4788-936c-293041ed2d4d";}
];
dcr01s24 = [
{ id = 4; path = "/dev/disk/by-id/scsi-35000c5008509199b"; uuid = "a2b4663d-bd8f-49b3-b0b0-195c56ba252f";}
{ id = 5; path = "/dev/disk/by-id/scsi-35000c50085046abf"; uuid = "a2242989-ccce-4367-8813-519b64b5afdb";}
{ id = 6; path = "/dev/disk/by-id/scsi-35000c5008502929b"; uuid = "7deac89c-22dd-4c2b-b3cc-43ff7f990fd6";}
{ id = 7; path = "/dev/disk/by-id/scsi-35000c5008502a323"; uuid = "e305ebb3-9cac-44d2-9f1d-bbb72c8ab51f";}
];
};
};
pki = rec {
make = (radix: name: rec {
ca = ./../certs + "/ca-${radix}.crt";

@@ -0,0 +1,145 @@
# This runs Ceph on hscloud cluster(s).
#
# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
#
# Most importantly, it does _not_ attempt to do any cluster
# bootstrapping/maintenance. This means that any configuration action that
# does any of the following:
# 0. Bringing up a cluster
# 1. Adding/removing Mons
# 2. Changing a Mon IP address
# 3. Adding/removing OSDs
# ... must be done in tandem with manual operations on the affected nodes. For
# example, bootstrapping a cluster will involve keyring and monmap management,
# changing anything with mons will involve monmap management, adding new OSDs
# will require provisioning them with ceph-volume, etc.
#
# This is in stark contrast to a fully-managed solution like rook. Since we
# don't have hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to do reliably.
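#
# As a rough sketch of what 'manual' means for bringing up a mon (paraphrased
# from the upstream Ceph manual-deployment guide; verify the flags there
# before use):
#   monmaptool --create --add bc01n02 <mon ip> --fsid <cluster fsid> /tmp/monmap
#   ceph-authtool --create-keyring /tmp/mon.keyring --gen-key -n mon. --cap mon 'allow *'
#   ceph-mon --cluster k0 --mkfs -i bc01n02 --monmap /tmp/monmap --keyring /tmp/mon.keyring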
{ config, lib, pkgs, ... }:
with builtins;
with lib;
with (( import ../defs-cluster-k0.nix ) config.networking.hostName);
let
machineName = config.networking.hostName;
isMon = hasAttr machineName cephCluster.mons;
isOsd = hasAttr machineName cephCluster.osds;
hasCeph = isMon || isOsd;
# This NixOS Ceph option fragment is present on every machine that runs a
# mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
# this machine.
cephMonConfig = if isMon then {
mon = {
enable = true;
daemons = [ machineName ];
};
mgr = {
enable = true;
daemons = [ machineName ];
};
} else {};
# Same as for cephMonConfig, but this time for OSDs.
cephOsdConfig = if isOsd then {
osd = {
enable = true;
daemons = map (el: "${toString el.id}") cephCluster.osds.${machineName};
};
} else {};
# The full option fragment for services.ceph. It contains ceph.conf fragments
# (in .global.*) and merges ceph{Mon,Osd}Config.
cephConfig = {
enable = true;
global = {
fsid = cephCluster.fsid;
clusterName = cephCluster.name;
# Every Ceph node always attempts to connect to all mons.
monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons);
monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons);
};
} // cephMonConfig // cephOsdConfig;
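# For illustration only (a sketch of the merged result, not verbatim output):
# on a mon node like bc01n02 the above evaluates to roughly
#   {
#     enable = true;
#     global = {
#       fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354";
#       clusterName = "k0";
#       monHost = "<bc01n02's ipAddr>";
#       monInitialMembers = "bc01n02";
#     };
#     mon = { enable = true; daemons = [ "bc01n02" ]; };
#     mgr = { enable = true; daemons = [ "bc01n02" ]; };
#   }
# which the upstream NixOS module then renders into /etc/ceph/ceph.conf.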
# Merge ceph-volume lvm activate into ceph-osd-ID services.
#
# This is because the upstream module seems to have been written with
# filestore in mind, not bluestore. Filestore is relatively simple: an xfs
# filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
# contains everything for that OSD to work.
#
# Bluestore is a bit different. Instead of a normal filesystem being mounted,
# Ceph manages a block device fully using LVM (and in our case, dmcrypt).
# Every bluestore volume needs to be 'activated' before it can be used by an
# OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares
# the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
# there. However, instead of a disk mount, it's a tmpfs
# into which a bunch of files are dropped, loaded from the LVM raw device.
#
# To make the upstream NixOS module OSD work with bluestore, we do the following:
# 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
# path. This gates the service on that device being present.
# 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
# 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
# which seems to look for them on PATH instead of being properly
# nixified).
#
# We also inject smartmontools into PATH for smartctl, which allows the OSD
# to monitor device health.
osdActivateServices = listToAttrs (map (el: let
osdId = toString el.id;
osdUuid = el.uuid;
diskPath = el.path;
in {
name = "ceph-osd-${osdId}";
value = {
path = with pkgs; [
lvm2
cryptsetup
smartmontools
];
serviceConfig = {
ExecStartPre = lib.mkForce [
("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
#!/bin/sh
set -e
dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/"
disk="${el.path}"
uuid="${osdUuid}"
if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
echo "Volume $dir already activated, skipping..."
else
echo "Activating $dir with $disk, uuid $uuid..."
${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
fi
'')))
"${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}"
];
};
unitConfig = {
ConditionPathExists = lib.mkForce el.path;
};
};
}) (if isOsd then cephCluster.osds.${machineName} else []));
in rec {
services.ceph = if hasCeph then cephConfig else {};
environment.systemPackages = with pkgs; [
ceph cryptsetup smartmontools
];
systemd.services = osdActivateServices;
# Hack - the upstream ceph module should generate ${clusterName}.conf instead
# of ceph.conf, let's just symlink it.
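# With the symlink in place, tools invoked with the cluster name find their
# config without an explicit -c path; for example (assuming an admin keyring
# is present on the node):
#   ceph --cluster k0 -s
#   ceph --cluster k0 osd tree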
environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf";
}

@@ -0,0 +1,11 @@
--- a/cmake/modules/BuildSPDK.cmake
+++ b/cmake/modules/BuildSPDK.cmake
@@ -35,7 +35,7 @@ macro(build_spdk)
# unset $CFLAGS, otherwise it will interfere with how SPDK sets
# its include directory.
# unset $LDFLAGS, otherwise SPDK will fail to mock some functions.
- BUILD_COMMAND env -i PATH=$ENV{PATH} CC=${CMAKE_C_COMPILER} ${make_cmd} EXTRA_CFLAGS="${spdk_CFLAGS}"
+ BUILD_COMMAND env -i PATH=$ENV{PATH} CC=${CMAKE_C_COMPILER} ${make_cmd} EXTRA_CFLAGS="${spdk_CFLAGS}" C_OPT="-mssse3"
BUILD_IN_SOURCE 1
INSTALL_COMMAND "true")
unset(make_cmd)

ops/ceph/COPYING Normal file

@@ -0,0 +1,20 @@
Copyright (c) 2003-2021 Eelco Dolstra and the Nixpkgs/NixOS contributors

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

ops/ceph/README.md Normal file

@@ -0,0 +1,3 @@
Ceph 16.2.4 backport from nixpkgs @ 2021-09-10.
To be removed once nixpkgs on hscloud nodes is bumped to a revision that already ships this version upstream.

ops/ceph/default.nix Normal file

@@ -0,0 +1,254 @@
{ lib, stdenv, runCommand, fetchurl
, ensureNewerSourcesHook
, cmake, pkg-config
, which, git
, boost
, libxml2, zlib, lz4
, openldap, lttng-ust
, babeltrace, gperf
, gtest
, cunit, snappy
, makeWrapper
, leveldb, oathToolkit
, libnl, libcap_ng
, rdkafka
, nixosTests
, cryptsetup
, sqlite
, lua
, icu
, bzip2
, doxygen
, graphviz
, fmt
, python3
# Optional Dependencies
, yasm ? null, fcgi ? null, expat ? null
, curl ? null, fuse ? null
, libedit ? null, libatomic_ops ? null
, libs3 ? null
# Mallocs
, jemalloc ? null, gperftools ? null
# Crypto Dependencies
, cryptopp ? null
, nss ? null, nspr ? null
# Linux Only Dependencies
, linuxHeaders, util-linux, libuuid, udev, keyutils, rdma-core, rabbitmq-c
, libaio ? null, libxfs ? null, zfs ? null, liburing ? null
, ...
}:
# We must have one crypto library
assert cryptopp != null || (nss != null && nspr != null);
let
shouldUsePkg = pkg: if pkg != null && pkg.meta.available then pkg else null;
optYasm = shouldUsePkg yasm;
optFcgi = shouldUsePkg fcgi;
optExpat = shouldUsePkg expat;
optCurl = shouldUsePkg curl;
optFuse = shouldUsePkg fuse;
optLibedit = shouldUsePkg libedit;
optLibatomic_ops = shouldUsePkg libatomic_ops;
optLibs3 = shouldUsePkg libs3;
optJemalloc = shouldUsePkg jemalloc;
optGperftools = shouldUsePkg gperftools;
optCryptopp = shouldUsePkg cryptopp;
optNss = shouldUsePkg nss;
optNspr = shouldUsePkg nspr;
optLibaio = shouldUsePkg libaio;
optLibxfs = shouldUsePkg libxfs;
optZfs = shouldUsePkg zfs;
hasRadosgw = optFcgi != null && optExpat != null && optCurl != null && optLibedit != null;
# Malloc implementation (can be jemalloc, tcmalloc or null)
malloc = if optJemalloc != null then optJemalloc else optGperftools;
# We prefer nss over cryptopp
cryptoStr = if optNss != null && optNspr != null then "nss" else
if optCryptopp != null then "cryptopp" else "none";
cryptoLibsMap = {
nss = [ optNss optNspr ];
cryptopp = [ optCryptopp ];
none = [ ];
};
getMeta = description: with lib; {
homepage = "https://ceph.com/";
inherit description;
license = with licenses; [ lgpl21 gpl2 bsd3 mit publicDomain ];
maintainers = with maintainers; [ adev ak johanot krav ];
platforms = [ "x86_64-linux" "aarch64-linux" ];
};
ceph-common = python.pkgs.buildPythonPackage rec {
pname = "ceph-common";
inherit src version;
sourceRoot = "ceph-${version}/src/python-common";
checkInputs = [ python.pkgs.pytest ];
propagatedBuildInputs = with python.pkgs; [ pyyaml six ];
meta = getMeta "Ceph common module for code shared by manager modules";
};
python = python3.override {
packageOverrides = self: super: {
# scipy > 1.3 breaks diskprediction_local, leading to mgr hang on startup
# Bump once these issues are resolved:
# https://tracker.ceph.com/issues/42764 https://tracker.ceph.com/issues/45147
scipy = super.scipy.overridePythonAttrs (oldAttrs: rec {
version = "1.3.3";
src = oldAttrs.src.override {
inherit version;
sha256 = "02iqb7ws7fw5fd1a83hx705pzrw1imj7z0bphjsl4bfvw254xgv4";
};
doCheck = false;
});
};
};
ceph-python-env = python.withPackages (ps: [
ps.sphinx
ps.flask
ps.cython
ps.setuptools
ps.virtualenv
# Libraries needed by the python tools
ps.Mako
ceph-common
ps.cherrypy
ps.cmd2
ps.colorama
ps.python-dateutil
ps.jsonpatch
ps.pecan
ps.prettytable
ps.pyopenssl
ps.pyjwt
ps.webob
ps.bcrypt
ps.scipy
ps.six
ps.pyyaml
]);
sitePackages = ceph-python-env.python.sitePackages;
version = "16.2.4";
src = fetchurl {
url = "http://download.ceph.com/tarballs/ceph-${version}.tar.gz";
sha256 = "sha256-J6FVK7feNN8cGO5BSDlfRGACAzchmRUSWR+a4ZgeWy0=";
};
in rec {
ceph = stdenv.mkDerivation {
pname = "ceph";
inherit src version;
patches = [
./0000-fix-SPDK-build-env.patch
];
nativeBuildInputs = [
cmake
pkg-config which git python.pkgs.wrapPython makeWrapper
python.pkgs.python # for the toPythonPath function
(ensureNewerSourcesHook { year = "1980"; })
python
fmt
# for building docs/man-pages presumably
doxygen
graphviz
];
buildInputs = cryptoLibsMap.${cryptoStr} ++ [
boost ceph-python-env libxml2 optYasm optLibatomic_ops optLibs3
malloc zlib openldap lttng-ust babeltrace gperf gtest cunit
snappy lz4 oathToolkit leveldb libnl libcap_ng rdkafka
cryptsetup sqlite lua icu bzip2
] ++ lib.optionals stdenv.isLinux [
linuxHeaders util-linux libuuid udev keyutils liburing optLibaio optLibxfs optZfs
# ceph 14
rdma-core rabbitmq-c
] ++ lib.optionals hasRadosgw [
optFcgi optExpat optCurl optFuse optLibedit
];
pythonPath = [ ceph-python-env "${placeholder "out"}/${ceph-python-env.sitePackages}" ];
preConfigure = ''
substituteInPlace src/common/module.c --replace "/sbin/modinfo" "modinfo"
substituteInPlace src/common/module.c --replace "/sbin/modprobe" "modprobe"
substituteInPlace src/common/module.c --replace "/bin/grep" "grep"
# for pybind/rgw to find internal dep
export LD_LIBRARY_PATH="$PWD/build/lib''${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH"
# install target needs to be in PYTHONPATH for "*.pth support" check to succeed
# set PYTHONPATH, so the build system doesn't silently skip installing ceph-volume and others
export PYTHONPATH=${ceph-python-env}/${sitePackages}:$lib/${sitePackages}:$out/${sitePackages}
patchShebangs src/script src/spdk src/test src/tools
'';
cmakeFlags = [
"-DWITH_SYSTEM_ROCKSDB=OFF" # breaks Bluestore
"-DCMAKE_INSTALL_DATADIR=${placeholder "lib"}/lib"
"-DWITH_SYSTEM_BOOST=ON"
"-DWITH_SYSTEM_GTEST=ON"
"-DMGR_PYTHON_VERSION=${ceph-python-env.python.pythonVersion}"
"-DWITH_SYSTEMD=OFF"
"-DWITH_TESTS=OFF"
"-DWITH_CEPHFS_SHELL=ON"
# TODO breaks with sandbox, tries to download stuff with npm
"-DWITH_MGR_DASHBOARD_FRONTEND=OFF"
# WITH_XFS has been set default ON from Ceph 16, keeping it optional in nixpkgs for now
''-DWITH_XFS=${if optLibxfs != null then "ON" else "OFF"}''
] ++ lib.optional stdenv.isLinux "-DWITH_SYSTEM_LIBURING=ON";
postFixup = ''
wrapPythonPrograms
wrapProgram $out/bin/ceph-mgr --prefix PYTHONPATH ":" "$(toPythonPath ${placeholder "out"}):$(toPythonPath ${ceph-python-env})"
# Test that ceph-volume exists since the build system has a tendency to
# silently drop it with misconfigurations.
test -f $out/bin/ceph-volume
'';
outputs = [ "out" "lib" "dev" "doc" "man" ];
doCheck = false; # uses pip to install things from the internet
# Takes 7+h to build with 2 cores.
requiredSystemFeatures = [ "big-parallel" ];
meta = getMeta "Distributed storage system";
passthru.version = version;
passthru.tests = { inherit (nixosTests) ceph-single-node ceph-multi-node ceph-single-node-bluestore; };
};
ceph-client = runCommand "ceph-client-${version}" {
meta = getMeta "Tools needed to mount Ceph's RADOS Block Devices";
} ''
mkdir -p $out/{bin,etc,${sitePackages},share/bash-completion/completions}
cp -r ${ceph}/bin/{ceph,.ceph-wrapped,rados,rbd,rbdmap} $out/bin
cp -r ${ceph}/bin/ceph-{authtool,conf,dencoder,rbdnamer,syn} $out/bin
cp -r ${ceph}/bin/rbd-replay* $out/bin
cp -r ${ceph}/${sitePackages} $out/${sitePackages}
cp -r ${ceph}/etc/bash_completion.d $out/share/bash-completion/completions
# wrapPythonPrograms modifies .ceph-wrapped, so lets just update its paths
substituteInPlace $out/bin/ceph --replace ${ceph} $out
substituteInPlace $out/bin/.ceph-wrapped --replace ${ceph} $out
'';
}

@@ -19,7 +19,28 @@ let
repo = "nixpkgs-channels";
rev = "44ad80ab1036c5cc83ada4bfa451dac9939f2a10";
sha256 = "1b61nzvy0d46cspy07szkc0rggacxiqg9v1py27pkqpj7rvawfsk";
}) {};
}) {
overlays = [
(self: super: rec {
# Use a newer version of Ceph (16, Pacific, EOL 2023-06-01) than in
# this nixpkgs (15, Octopus, EOL 2022-06-01).
#
# This is to:
# 1. Fix a bug in which ceph-volume lvm create fails due to a rocksdb
# mismatch (https://tracker.ceph.com/issues/49815)
# 2. At the time of deployment not start out with an ancient version
# of Ceph.
#
# Once we unpin nixpkgsCluster past a version that contains this Ceph,
# this can be unoverlayed.
inherit (super.callPackages ./ceph {
boost = super.boost17x.override { enablePython = true; python = super.python3; };
lua = super.lua5_4;
}) ceph ceph-client;
ceph-lib = ceph.lib;
})
];
};
# edge01 still lives on an old nixpkgs checkout.
#
@@ -44,6 +65,7 @@ let
imports = [
../cluster/nix/modules/base.nix
../cluster/nix/modules/kubernetes.nix
../cluster/nix/modules/ceph.nix
];
})];