diff --git a/cluster/nix/modules/containerd.toml b/cluster/nix/modules/containerd.toml
new file mode 100644
index 00000000..b079637e
--- /dev/null
+++ b/cluster/nix/modules/containerd.toml
@@ -0,0 +1,134 @@
+version = 2
+root = "/var/lib/containerd"
+state = "/run/containerd"
+plugin_dir = ""
+disabled_plugins = []
+required_plugins = []
+oom_score = 0
+
+[grpc]
+  address = "/run/containerd/containerd.sock"
+  tcp_address = ""
+  tcp_tls_cert = ""
+  tcp_tls_key = ""
+  uid = 0
+  gid = 0
+  max_recv_message_size = 16777216
+  max_send_message_size = 16777216
+
+[ttrpc]
+  address = ""
+  uid = 0
+  gid = 0
+
+[debug]
+  address = ""
+  uid = 0
+  gid = 0
+  level = ""
+
+[metrics]
+  address = ""
+  grpc_histogram = false
+
+[cgroup]
+  path = ""
+
+[timeouts]
+  "io.containerd.timeout.shim.cleanup" = "5s"
+  "io.containerd.timeout.shim.load" = "5s"
+  "io.containerd.timeout.shim.shutdown" = "3s"
+  "io.containerd.timeout.task.state" = "2s"
+
+[plugins]
+  [plugins."io.containerd.gc.v1.scheduler"]
+    pause_threshold = 0.02
+    deletion_threshold = 0
+    mutation_threshold = 100
+    schedule_delay = "0s"
+    startup_delay = "100ms"
+  [plugins."io.containerd.grpc.v1.cri"]
+    disable_tcp_service = true
+    stream_server_address = "127.0.0.1"
+    stream_server_port = "0"
+    stream_idle_timeout = "4h0m0s"
+    enable_selinux = false
+    selinux_category_range = 1024
+    sandbox_image = "k8s.gcr.io/pause:3.2"
+    stats_collect_period = 10
+    systemd_cgroup = false
+    enable_tls_streaming = false
+    max_container_log_line_size = 16384
+    disable_cgroup = false
+    disable_apparmor = false
+    restrict_oom_score_adj = false
+    max_concurrent_downloads = 3
+    disable_proc_mount = false
+    unset_seccomp_profile = ""
+    tolerate_missing_hugetlb_controller = true
+    disable_hugetlb_controller = true
+    ignore_image_defined_volumes = false
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      snapshotter = "overlayfs"
+      default_runtime_name = "runc"
+      no_pivot = false
+      disable_snapshot_annotations = true
+      discard_unpacked_layers = false
+      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
+          runtime_type = "io.containerd.runc.v2"
+          runtime_engine = ""
+          runtime_root = ""
+          privileged_without_host_devices = false
+          base_runtime_spec = ""
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+            SystemdCgroup = true
+    [plugins."io.containerd.grpc.v1.cri".cni]
+      bin_dir = "/opt/cni/bin"
+      conf_dir = "/opt/cni/conf"
+      max_conf_num = 1
+      conf_template = ""
+    [plugins."io.containerd.grpc.v1.cri".registry]
+      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
+        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
+          endpoint = ["https://registry-1.docker.io"]
+    [plugins."io.containerd.grpc.v1.cri".image_decryption]
+      key_model = ""
+    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
+      tls_cert_file = ""
+      tls_key_file = ""
+  [plugins."io.containerd.internal.v1.opt"]
+    path = "/opt/containerd"
+  [plugins."io.containerd.internal.v1.restart"]
+    interval = "10s"
+  [plugins."io.containerd.metadata.v1.bolt"]
+    content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"] + no_prometheus = false + [plugins."io.containerd.runtime.v1.linux"] + shim = "containerd-shim" + runtime = "runc" + runtime_root = "" + no_shim = false + shim_debug = false + [plugins."io.containerd.runtime.v2.task"] + platforms = ["linux/amd64"] + [plugins."io.containerd.service.v1.diff-service"] + default = ["walking"] + [plugins."io.containerd.snapshotter.v1.devmapper"] + root_path = "" + pool_name = "" + base_image_size = "" + async_remove = false diff --git a/cluster/nix/modules/kubelet.nix b/cluster/nix/modules/kubelet.nix index f475b5ba..1a71b480 100644 --- a/cluster/nix/modules/kubelet.nix +++ b/cluster/nix/modules/kubelet.nix @@ -16,7 +16,7 @@ let name = "pause"; tag = "latest"; contents = top.package.pause; - config.Cmd = "/bin/pause"; + config.Cmd = ["/bin/pause"]; }; kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig; @@ -45,12 +45,6 @@ let taints = concatMapStringsSep "," (v: "${v.key}=${v.value}:${v.effect}") (mapAttrsToList (n: v: v) cfg.taints); in { - imports = [ - #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "applyManifests" ] "") - #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "cadvisorPort" ] "") - #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "allowPrivileged" ] "") - ]; - # services/cluster/kubernetes/default.nix still wants to poke flannel, # but since we nuke that module we have to add a fake tunable for it. options.services.kubernetes.flannel = { @@ -203,15 +197,57 @@ in (mkIf cfg.enable { services.kubernetes.kubelet.seedDockerImages = [infraContainer]; + # Drop crictl into administrative command line. + environment.systemPackages = with pkgs; [ cri-tools ]; + + # Force disable Docker. + virtualisation.docker.enable = false; + + # TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to + # Kubelet 1.19. + systemd.enableUnifiedCgroupHierarchy = false; + + # Run containerd service. This is exposes the CRI API that is consumed by + # crictl and Kubelet. + systemd.services.containerd = { + description = "containerd container runtime"; + wantedBy = [ "kubernetes.target" ]; + after = [ "network.target" ]; + path = with pkgs; [ runc iptables ]; + serviceConfig = { + Delegate = "yes"; + KillMode = "process"; + Restart = "always"; + RestartSec = "5"; + LimitNPROC = "infinity"; + LimitCORE = "infinity"; + # https://github.com/coreos/fedora-coreos-tracker/issues/329 + LimitNOFILE = "1048576"; + TasksMax = "infinity"; + OOMScoreAdjust = "-999"; + + ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}"; + }; + }; + systemd.services.kubelet = { description = "Kubernetes Kubelet Service"; wantedBy = [ "kubernetes.target" ]; - after = [ "network.target" "docker.service" "kube-apiserver.service" ]; - path = with pkgs; [ gitMinimal openssh docker utillinux iproute ethtool thin-provisioning-tools iptables socat ] ++ top.path; + after = [ "network.target" "containerd.service" "kube-apiserver.service" ]; + path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path; + + # Mildly hacky - by moving over to OCI image build infrastructure in + # NixOS we should be able to get rid of the gunzip. + # TODO(q3k): figure this out, check if this is even being used by + # kubelet. 
         preStart = ''
           ${concatMapStrings (img: ''
-            echo "Seeding docker image: ${img}"
-            docker load <${img}
+            echo "Seeding OCI image: ${img}"
+            cp ${img} /tmp/image.tar.gz
+            rm -f /tmp/image.tar
+            gunzip /tmp/image.tar.gz
+            ctr -n=k8s.io images import /tmp/image.tar || true
+            rm /tmp/image.tar
           '') cfg.seedDockerImages}
         '';
         serviceConfig = {
@@ -221,6 +257,9 @@ in
           Restart = "on-failure";
           RestartSec = "1000ms";
           ExecStart = ''${cfg.package}/bin/kubelet \
+            --cgroup-driver=systemd \
+            --container-runtime=remote \
+            --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
             --address=${cfg.address} \
             --authentication-token-webhook \
             --authentication-token-webhook-cache-ttl="10s" \
@@ -263,7 +302,8 @@ in
         };
       };

-      boot.kernelModules = ["br_netfilter"];
+      boot.kernelModules = [ "br_netfilter" "overlay" ];
+      boot.kernel.sysctl."net.ipv4.ip_forward" = "1";

       services.kubernetes.kubelet.hostname = with config.networking;
         mkDefault (hostName + optionalString (domain != null) ".${domain}");
diff --git a/cluster/nix/modules/kubernetes.nix b/cluster/nix/modules/kubernetes.nix
index 92e28de9..879c50fe 100644
--- a/cluster/nix/modules/kubernetes.nix
+++ b/cluster/nix/modules/kubernetes.nix
@@ -30,24 +30,6 @@ in rec {
     ./kubelet.nix
   ];

-  # List services that you want to enable:
-  virtualisation.docker.enable = true;
-  virtualisation.docker.extraOptions = "--iptables=false --ip-masq=false --ip-forward=true";
-
-  # Docker 1.13 sets iptables FORWARD to DROP. Unfuck this.
-  systemd.services."docker-iptables-unfuck" = {
-    enable = true;
-    wantedBy = [ "kubernetes.target" ];
-    description = "Docker iptable Unfuck";
-    after = [ "docker.service" ];
-    requires = [ "docker.service" ];
-    path = [ pkgs.iptables ];
-    script = ''
-      iptables -P FORWARD ACCEPT
-    '';
-    serviceConfig.Type = "oneshot";
-  };
-
   networking.firewall.enable = false;

   # Point k8s apiserver address at ourselves, as every machine runs an apiserver with this cert name.