cluster: replace docker with containerd

This removes Docker and docker-shim from our production kubernetes, and
moves over to containerd/CRI. The Docker integration within Kubernetes was
always somewhat fragile, and with 1.20 the dockershim integration was
deprecated (and later removed). CRI/containerd/runc is the new standard.

Change-Id: I98c89d5433f221b5fe766fcbef261fd72db530fe
changes/62/762/14
q3k 2021-02-13 17:42:48 +00:00
parent 4b613303b1
commit 765e369255
3 changed files with 186 additions and 30 deletions

View File

@ -0,0 +1,134 @@
# containerd configuration, passed via `containerd -c` from the systemd
# service definition (see kubelet.nix). Format version 2 (containerd >= 1.3).
version = 2
# Persistent and runtime state directories.
root = "/var/lib/containerd"
state = "/run/containerd"
plugin_dir = ""
disabled_plugins = []
required_plugins = []
oom_score = 0
# Local gRPC API socket consumed by kubelet and crictl.
# TCP serving stays disabled (empty tcp_address).
[grpc]
address = "/run/containerd/containerd.sock"
tcp_address = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216
[ttrpc]
address = ""
uid = 0
gid = 0
# Debug socket disabled; empty level means containerd's default log level.
[debug]
address = ""
uid = 0
gid = 0
level = ""
# Prometheus metrics endpoint disabled (empty address).
[metrics]
address = ""
grpc_histogram = false
[cgroup]
path = ""
[timeouts]
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
[plugins]
[plugins."io.containerd.gc.v1.scheduler"]
pause_threshold = 0.02
deletion_threshold = 0
mutation_threshold = 100
schedule_delay = "0s"
startup_delay = "100ms"
# CRI plugin: the API surface kubelet talks to
# (--container-runtime-endpoint in the kubelet unit).
[plugins."io.containerd.grpc.v1.cri"]
disable_tcp_service = true
stream_server_address = "127.0.0.1"
stream_server_port = "0"
stream_idle_timeout = "4h0m0s"
enable_selinux = false
selinux_category_range = 1024
# NOTE(review): upstream default pause image; the kubelet module also builds
# and seeds its own pause image — confirm which one pods actually use.
sandbox_image = "k8s.gcr.io/pause:3.2"
stats_collect_period = 10
# Legacy v1 cgroup toggle; the systemd cgroup driver is instead enabled
# per-runtime below via SystemdCgroup under the runc options.
systemd_cgroup = false
enable_tls_streaming = false
max_container_log_line_size = 16384
disable_cgroup = false
disable_apparmor = false
restrict_oom_score_adj = false
max_concurrent_downloads = 3
disable_proc_mount = false
unset_seccomp_profile = ""
tolerate_missing_hugetlb_controller = true
disable_hugetlb_controller = true
ignore_image_defined_volumes = false
[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "overlayfs"
default_runtime_name = "runc"
no_pivot = false
disable_snapshot_annotations = true
discard_unpacked_layers = false
# Deprecated default_runtime/untrusted_workload_runtime tables are left
# empty; actual runtimes are declared under .containerd.runtimes below.
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
# runc via the v2 shim; the runc binary itself comes from the service's
# PATH (`path = with pkgs; [ runc iptables ]` in the systemd unit).
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
# Use the systemd cgroup driver — matches --cgroup-driver=systemd on kubelet.
SystemdCgroup = true
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
# NOTE(review): conf_dir is /opt/cni/conf rather than the conventional
# /etc/cni/net.d — confirm the CNI config is actually deployed there.
conf_dir = "/opt/cni/conf"
max_conf_num = 1
conf_template = ""
# Registry mirrors: only the default docker.io upstream endpoint.
[plugins."io.containerd.grpc.v1.cri".registry]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
endpoint = ["https://registry-1.docker.io"]
[plugins."io.containerd.grpc.v1.cri".image_decryption]
key_model = ""
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
tls_cert_file = ""
tls_key_file = ""
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
# Legacy v1 runtime shim settings; the CRI runtimes above use the v2 shim.
[plugins."io.containerd.runtime.v1.linux"]
shim = "containerd-shim"
runtime = "runc"
runtime_root = ""
no_shim = false
shim_debug = false
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
# devmapper snapshotter left unconfigured (overlayfs is used, see above).
[plugins."io.containerd.snapshotter.v1.devmapper"]
root_path = ""
pool_name = ""
base_image_size = ""
async_remove = false

View File

@ -16,7 +16,7 @@ let
name = "pause";
tag = "latest";
contents = top.package.pause;
config.Cmd = "/bin/pause";
config.Cmd = ["/bin/pause"];
};
kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig;
@ -45,12 +45,6 @@ let
taints = concatMapStringsSep "," (v: "${v.key}=${v.value}:${v.effect}") (mapAttrsToList (n: v: v) cfg.taints);
in
{
imports = [
#(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "applyManifests" ] "")
#(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "cadvisorPort" ] "")
#(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "allowPrivileged" ] "")
];
# services/cluster/kubernetes/default.nix still wants to poke flannel,
# but since we nuke that module we have to add a fake tunable for it.
options.services.kubernetes.flannel = {
@ -203,15 +197,57 @@ in
(mkIf cfg.enable {
services.kubernetes.kubelet.seedDockerImages = [infraContainer];
# Drop crictl into administrative command line.
environment.systemPackages = with pkgs; [ cri-tools ];
# Force disable Docker.
virtualisation.docker.enable = false;
# TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to
# Kubelet 1.19.
systemd.enableUnifiedCgroupHierarchy = false;
# Run the containerd service. It exposes the CRI API that is consumed by
# crictl and Kubelet.
systemd.services.containerd = {
description = "containerd container runtime";
wantedBy = [ "kubernetes.target" ];
after = [ "network.target" ];
path = with pkgs; [ runc iptables ];
serviceConfig = {
Delegate = "yes";
KillMode = "process";
Restart = "always";
RestartSec = "5";
LimitNPROC = "infinity";
LimitCORE = "infinity";
# https://github.com/coreos/fedora-coreos-tracker/issues/329
LimitNOFILE = "1048576";
TasksMax = "infinity";
OOMScoreAdjust = "-999";
ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}";
};
};
systemd.services.kubelet = {
description = "Kubernetes Kubelet Service";
wantedBy = [ "kubernetes.target" ];
after = [ "network.target" "docker.service" "kube-apiserver.service" ];
path = with pkgs; [ gitMinimal openssh docker utillinux iproute ethtool thin-provisioning-tools iptables socat ] ++ top.path;
after = [ "network.target" "containerd.service" "kube-apiserver.service" ];
path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path;
# Mildly hacky - by moving over to OCI image build infrastructure in
# NixOS we should be able to get rid of the gunzip.
# TODO(q3k): figure this out, check if this is even being used by
# kubelet.
preStart = ''
${concatMapStrings (img: ''
echo "Seeding docker image: ${img}"
docker load <${img}
echo "Seeding OCI image: ${img}"
cp ${img} /tmp/image.tar.gz
rm -f /tmp/image.tar
gunzip /tmp/image.tar.gz
ctr -n=k8s.io images import /tmp/image.tar || true
rm /tmp/image.tar
'') cfg.seedDockerImages}
'';
serviceConfig = {
@ -221,6 +257,9 @@ in
Restart = "on-failure";
RestartSec = "1000ms";
ExecStart = ''${cfg.package}/bin/kubelet \
--cgroup-driver=systemd \
--container-runtime=remote \
--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
--address=${cfg.address} \
--authentication-token-webhook \
--authentication-token-webhook-cache-ttl="10s" \
@ -263,7 +302,8 @@ in
};
};
boot.kernelModules = ["br_netfilter"];
boot.kernelModules = [ "br_netfilter" "overlay" ];
boot.kernel.sysctl."net.ipv4.ip_forward" = "1";
services.kubernetes.kubelet.hostname = with config.networking;
mkDefault (hostName + optionalString (domain != null) ".${domain}");

View File

@ -30,24 +30,6 @@ in rec {
./kubelet.nix
];
# List services that you want to enable:
virtualisation.docker.enable = true;
virtualisation.docker.extraOptions = "--iptables=false --ip-masq=false --ip-forward=true";
# Docker 1.13 sets iptables FORWARD to DROP. Unfuck this.
systemd.services."docker-iptables-unfuck" = {
enable = true;
wantedBy = [ "kubernetes.target" ];
description = "Docker iptable Unfuck";
after = [ "docker.service" ];
requires = [ "docker.service" ];
path = [ pkgs.iptables ];
script = ''
iptables -P FORWARD ACCEPT
'';
serviceConfig.Type = "oneshot";
};
networking.firewall.enable = false;
# Point k8s apiserver address at ourselves, as every machine runs an apiserver with this cert name.