local kube = import "../../../kube/kube.libsonnet";

{
    // Cluster sets up all cluster-specific monitoring resources in their own namespace.
    //
    // Currently this consists of a prometheus server that scrapes k8s nodes for kubelet
    // and cAdvisor metrics, and possibly ships metrics over to the global tier via the
    // configured upstreams.
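    //
    // A minimal instantiation sketch (all values hypothetical; the actual storage
    // class and upstream URL depend on the target deployment):
    //
    //   cluster: Cluster("k0") {
    //       cfg+: {
    //           username: "k0",
    //           storageClasses+: { prometheus: "example-ssd" },
    //           upstreams: [
    //               { remote: "https://global.example.org/api/v1/write", password: "secret" },
    //           ],
    //       },
    //   },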
    Cluster(name):: {
        local cluster = self,
        local cfg = cluster.cfg,
        cfg:: {
            name: name,
            namespace: "monitoring-cluster",

            images: {
                prometheus: "prom/prometheus:v2.18.1",
            },

            storageClasses: {
                prometheus: error "storageClasses.prometheus must be set",
            },

            // Username used to authenticate to upstreams.
            username: error "username must be set",

            // Global tier upstreams that this cluster should ship metrics off to.
            // List of:
            //   {
            //       remote: URL of upstream,
            //       password: password used to authenticate, in conjunction with cfg.username.
            //   }
            upstreams: [],
        },

        namespace: kube.Namespace(cfg.namespace),

        prometheus: {
            local prometheus = self,

            // Configuration that's going to be emitted as prometheus.yml and passed to the
            // prometheus server for this cluster.
            configuration:: {
                global: {
                    external_labels: {
                        cluster: cluster.cfg.name,
                    },
                },

                // Constructor for a Kubernetes scrape job that uses the pod's service account and
                // TLS configuration, selecting the given k8s scrape 'role'.
                local kubeScrapeConfig = function(name, role) {
                    job_name: name,
                    scheme: "https",
                    scrape_interval: "30s",
                    kubernetes_sd_configs: [ { role: role }, ],
                    tls_config: {
                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
                    },
                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
                },
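
                // For illustration, a job built by this constructor renders (via
                // std.manifestYamlDoc below) to roughly:
                //
                //   job_name: cluster_node_metrics
                //   scheme: https
                //   scrape_interval: 30s
                //   kubernetes_sd_configs:
                //     - role: node
                //   tls_config:
                //     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                //   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token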

                // When scraping node-based metrics (ie. node and cadvisor metrics) we contact
                // the metrics endpoints on the kubelet via the API server. This is done by
                // relabeling __address__ and __metrics_path__ to point at the k8s API server,
                // and at the API server proxy path to reach a node's metrics endpoint.
                //
                // This approach was lifted from the prometheus examples for Kubernetes, and
                // while the benefits outlined there do not matter that much to us (our
                // kubelets listen on public addresses, anyway), we still enjoy this approach
                // for the fact that we don't have to hardcode the kubelet TLS port.
                //
                // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
                //
                // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
                // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
                // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
                local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") {
                    relabel_configs: [
                        {
                            action: "labelmap",
                            regex: "__meta_kubernetes_node_label_(.+)",
                        },
                        {
                            action: "replace",
                            target_label: "__address__",
                            replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
                        },
                        {
                            target_label: "__metrics_path__",
                            source_labels: ["__meta_kubernetes_node_name"],
                            regex: "(.+)",
                            replacement: "/api/v1/nodes/${1}/proxy" + path,
                        },
                    ],
                },
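
                // For example, with cluster name "k0" and a node named "node1" (both
                // hypothetical), kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics")
                // ends up scraping:
                //   https://kubernetes.default.svc.k0.hswaw.net:443/api/v1/nodes/node1/proxy/metrics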

                // When scraping API server-colocated metrics (ie. metrics from nixos services running alongside
                // apiserver instances), we contact the metrics endpoints directly over the nodes' IP addresses
                // and an external port. The node IP addresses are discovered via Prometheus kubernetes endpoint
                // discovery, which selects all endpoints for the default/kubernetes service. This service is
                // backed by apiserver instances on public IP addresses. We can then rewrite the received port
                // to the port of the service we're interested in to reach that service.
                local kubeScrapeAPIServerColocated = function(name, port) kubeScrapeConfig(name, "endpoints") {
                    relabel_configs: [
                        // Select only endpoints that back the default/kubernetes service. These are all
                        // public IP addresses of nodes that run the API server.
                        {
                            action: "keep",
                            regex: "default;kubernetes;https",
                            source_labels: [
                                "__meta_kubernetes_namespace",
                                "__meta_kubernetes_service_name",
                                "__meta_kubernetes_endpoint_port_name",
                            ],
                        },
                    ] + (if port == 4001 then [] else [
                        // Replace the endpoint port with the requested port, unless the requested port is
                        // the apiserver's port 4001, which is the one returned by default for these endpoints.
                        {
                            action: "replace",
                            regex: "([^:]+):.+",
                            replacement: "$1:%d" % [port],
                            source_labels: [
                                "__address__",
                            ],
                            target_label: "__address__",
                        },
                    ]),
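                    // For a hypothetical endpoint address "203.0.113.7:4001", the rewrite above
                    // yields "203.0.113.7:4003" when port == 4003 (and similarly for 4005).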
                    // We disable server-side TLS certificate verification.
                    // Unfortunately, all apiserver-colocated services run with TLS certificates that do not have
                    // the right IP address SAN. Moreover, we can't override the TLS ServerName for a scrape
                    // target [1], so the only two choices we are left with are:
                    //  1) re-emit relevant certificates with IP address SANs that allow for access by IP.
                    //  2) disable TLS verification.
                    // We choose 2), knowing that if someone manages to hijack a target IP address they can end up
                    // stealing our bearer token and impersonating the service account with which Prometheus is
                    // running. In the long term, we hope for [1] to be resolved.
                    //
                    // TODO(q3k): revisit this once [1] gets fixed.
                    // [1] - https://github.com/prometheus/prometheus/issues/4827
                    tls_config: {
                        insecure_skip_verify: true,
                    },
                },

                scrape_configs: [
                    /// Scrape per-node metrics, proxied via the apiserver.
                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
                    kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics"),
                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
                    kubeScrapeNodeMetrics("cluster_cadvisor_metrics", "/metrics/cadvisor"),

                    /// Scrape apiserver-colocated ('master node') metrics, over nodes' public IP addresses.
                    /// (currently all nodes are 'master' nodes)
                    // Scrape Kubernetes apiserver metrics.
                    kubeScrapeAPIServerColocated("cluster_apiserver_metrics", 4001),
                    // Scrape Kubernetes controller-manager metrics.
                    kubeScrapeAPIServerColocated("cluster_controllermanager_metrics", 4003),
                    // Scrape Kubernetes scheduler metrics.
                    kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005),
                ],

                remote_write: [
                    {
                        url: u.remote,
                        basic_auth: {
                            username: cluster.cfg.username,
                            password: u.password,
                        },
                    }
                    for u in cluster.cfg.upstreams
                ],
            },

            configmap: kube.ConfigMap("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                data: {
                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
                },
            },

            sa: kube.ServiceAccount("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
            },

            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                rules: [
                    // Allow access to all metrics.
                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
                    // Allow access to node details for discovery.
                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
                    { apiGroups: [""], resources: ["endpoints", "services", "pods"], verbs: ["list", "watch", "get"], },
                    // Allow proxying to bare node HTTP to access per-node metrics endpoints.
                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
                ],
            },

            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                subjects_: [prometheus.sa],
                roleRef_: prometheus.cr,
            },

            deploy: kube.Deployment("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    template+: {
                        spec+: {
                            containers_: {
                                default: kube.Container("default") {
                                    image: cfg.images.prometheus,
                                    command: [
                                        "/bin/prometheus",
                                        "--config.file=/etc/prometheus/prometheus.yml",
                                        "--storage.tsdb.path=/prometheus",
                                        "--storage.tsdb.retention.size=10GB",
                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
                                        "--web.console.templates=/usr/share/prometheus/consoles",
                                        "--web.enable-lifecycle",
                                    ],
                                    resources: {
                                        requests: {
                                            memory: "3Gi",
                                            cpu: "100m",
                                        },
                                        limits: {
                                            memory: "3Gi",
                                            cpu: "1",
                                        },
                                    },
                                    volumeMounts_: {
                                        data: { mountPath: "/prometheus", },
                                        configmap: { mountPath: "/etc/prometheus", },
                                    },
                                },
                            },
                            serviceAccountName: prometheus.sa.metadata.name,
                            tolerations: [
                                { key: "CriticalAddonsOnly", operator: "Exists" },
                            ],
                            volumes_: {
                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
                                configmap: kube.ConfigMapVolume(prometheus.configmap),
                            },
                        },
                    },
                },
            },

            // Kubernetes metric storage volume.
            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    storageClassName: cfg.storageClasses.prometheus,
                    accessModes: ["ReadWriteOnce"],
                    resources: {
                        requests: {
                            storage: "16Gi",
                        },
                    },
                },
            },

            // Network Policy governing access to the prometheus server.
            np: kube.NetworkPolicy("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: kube.podLabelsSelector(prometheus.deploy) {
                    ingress_: {
                        // Deny all inbound traffic to pod.
                        // This will be augmented to allow access from some other pod/namespace
                        // in the future.
                    },
                    egress_: {
                        // Allow all outbound traffic from pod.
                        outboundAll: {},
                    },
                    policyTypes: ["Ingress", "Egress"],
                },
            },
        },
    },
}