hscloud/ops/monitoring/lib/cluster.libsonnet

local kube = import "../../../kube/kube.libsonnet";

{
    // Cluster sets up all cluster-specific monitoring resources in their own namespace.
    // Currently this consists of a prometheus server that scrapes k8s nodes for kubelet
    // and cAdvisor metrics.
    //
    // Arguments:
    //   name: name of the cluster. It is emitted as the 'cluster' external label,
    //         interpolated into the API server address used for scraping, and
    //         appended to the names of the ClusterRole/ClusterRoleBinding.
    //
    // Users must override cfg.storageClasses.prometheus, otherwise evaluating the
    // prometheus PVC fails with an error.
    Cluster(name):: {
        local cluster = self,
        local cfg = cluster.cfg,

        // Tunables for this cluster's monitoring stack. This is a hidden field, so it
        // is not emitted as part of the resulting object.
        cfg:: {
            name: name,
            // Namespace in which all namespaced resources below are created.
            namespace: "monitoring-cluster",

            // Container images used by the components below.
            images: {
                prometheus: "prom/prometheus:v2.18.1",
            },

            // Storage classes for persistent volumes. There is no default: this must
            // be overridden by the user.
            storageClasses: {
                prometheus: error "storageClasses.prometheus must be set",
            },
        },

        namespace: kube.Namespace(cfg.namespace),

        prometheus: {
            local prometheus = self,

            // Configuration that's going to be emitted as prometheus.yml and passed to the
            // prometheus server for this cluster.
            configuration:: {
                global: {
                    external_labels: {
                        cluster: cfg.name,
                    },
                },

                // Constructor for a Kubernetes scrape job that uses the pod's service account and
                // TLS configuration, selecting the given k8s scrape 'role'.
                //   name: value of the job_name of the resulting scrape job.
                //   role: role passed to kubernetes_sd_configs (eg. "node").
                local kubeScrapeConfig = function(name, role) {
                    job_name: name,
                    scheme: "https",
                    scrape_interval: "30s",
                    kubernetes_sd_configs: [ { role: role }, ],
                    tls_config: {
                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
                    },
                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
                },

                // Constructor for the relabeling rules shared by all node-based scrape jobs.
                // The rules copy node labels onto the target, point the target address at
                // the API server, and rewrite the metrics path to the API server proxy path
                // of the given per-node endpoint.
                //   endpoint: path of the endpoint on the node, relative to the node's proxy
                //             path (eg. "metrics" or "metrics/cadvisor").
                local nodeProxyRelabelConfigs = function(endpoint) [
                    {
                        action: "labelmap",
                        regex: "__meta_kubernetes_node_label_(.+)",
                    },
                    {
                        action: "replace",
                        target_label: "__address__",
                        replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cfg.name],
                    },
                    {
                        target_label: "__metrics_path__",
                        source_labels: ["__meta_kubernetes_node_name"],
                        regex: "(.+)",
                        // ${1} is left verbatim for prometheus to expand to the node name
                        // captured by the regex above.
                        replacement: "/api/v1/nodes/${1}/proxy/" + endpoint,
                    },
                ],

                scrape_configs: [
                    // When scraping node-based metrics (ie. node and cadvisor metrics) we contact
                    // the metrics endpoints on the kubelet via the API server. This is done by
                    // relabeling __address__ and __metrics_path__ to point at the k8s API server,
                    // and at the API server proxy path to reach a node's metrics endpoint.
                    //
                    // This approach was lifted from the prometheus examples for Kubernetes, and
                    // while the benefits outlined there do not matter that much to us (our
                    // kubelets listen on public addresses, anyway), we still enjoy this approach
                    // for the fact that we don't have to hardcode the kubelet TLS port.
                    //
                    // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
                    //
                    // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
                    // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
                    // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).

                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
                    kubeScrapeConfig("cluster_node_metrics", "node") {
                        relabel_configs: nodeProxyRelabelConfigs("metrics"),
                    },
                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
                    kubeScrapeConfig("cluster_cadvisor_metrics", "node") {
                        relabel_configs: nodeProxyRelabelConfigs("metrics/cadvisor"),
                    },
                ],
            },

            // ConfigMap carrying the rendered prometheus.yml; mounted into the prometheus
            // container at /etc/prometheus by the deployment below.
            configmap: kube.ConfigMap("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                data: {
                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
                },
            },

            // Service account that the prometheus server runs as. Its token and CA
            // certificate are what the scrape jobs above authenticate with.
            sa: kube.ServiceAccount("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
            },

            // Permissions needed by the prometheus server to discover and scrape nodes.
            // The name is suffixed with the cluster name.
            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                rules: [
                    // Allow access to all metrics.
                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
                    // Allow to access node details for discovery.
                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
                    // Allow to proxy to bare node HTTP to access per-node metrics endpoints.
                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
                ],
            },

            // Grants the cluster role above to the prometheus service account.
            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                subjects_: [prometheus.sa],
                roleRef_: prometheus.cr,
            },

            // The prometheus server itself.
            deploy: kube.Deployment("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    template+: {
                        spec+: {
                            containers_: {
                                default: kube.Container("default") {
                                    image: cfg.images.prometheus,
                                    command: [
                                        "/bin/prometheus",
                                        // Paths below must match the volumeMounts_ of this container.
                                        "--config.file=/etc/prometheus/prometheus.yml",
                                        "--storage.tsdb.path=/prometheus",
                                        // TODO(q3k): reduce this once we have a long-term storage
                                        // solution.
                                        "--storage.tsdb.retention.time=120d",
                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
                                        "--web.console.templates=/usr/share/prometheus/consoles",
                                        "--web.enable-lifecycle",
                                    ],
                                    resources: {
                                        requests: {
                                            memory: "256Mi",
                                            cpu: "100m",
                                        },
                                        limits: {
                                            memory: "1Gi",
                                            cpu: "1",
                                        },
                                    },
                                    // Keys refer to the entries of volumes_ below.
                                    volumeMounts_: {
                                        data: { mountPath: "/prometheus", },
                                        configmap: { mountPath: "/etc/prometheus", },
                                    },
                                },
                            },
                            serviceAccountName: prometheus.sa.metadata.name,
                            // Tolerate the CriticalAddonsOnly taint, regardless of its value.
                            tolerations: [
                                { key: "CriticalAddonsOnly", operator: "Exists" },
                            ],
                            volumes_: {
                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
                                configmap: kube.ConfigMapVolume(prometheus.configmap),
                            },
                        },
                    },
                },
            },

            // Kubernetes metric storage volume.
            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    // Raises an error unless overridden in cfg.
                    storageClassName: cfg.storageClasses.prometheus,
                    accessModes: ["ReadWriteOnce"],
                    resources: {
                        requests: {
                            storage: "32Gi",
                        },
                    },
                },
            },

            // Network Policy governing access to the prometheus server.
            np: kube.NetworkPolicy("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: kube.podLabelsSelector(prometheus.deploy) {
                    ingress_: {
                        // Deny all inbound traffic to pod.
                        // This will be augmented to allow access from some other pod/namespace
                        // in the future.
                    },
                    egress_: {
                        // Allow all outbound traffic from pod.
                        outboundAll: {},
                    },
                    policyTypes: ["Ingress", "Egress"],
                },
            },
        },
    },
}
ops/monitoring: split up jsonnet, add simple docs Change-Id: I8120958a6862411de0446896875766834457aba9 2020-06-06 15:04:07 +00:00			`local kube = import "../../../kube/kube.libsonnet";`
ops/metrics: basic cluster setup with prometheus We handwavingly plan on implementing monitoring as a two-tier system: - a 'global' component that is responsible for global aggregation, long-term storage and alerting. - multiple 'per-cluster' components, that collect metrics from Kubernetes clusters and export them to the global component. In addition, several lower tiers (collected by per-cluster components) might also be implemented in the future - for instance, specific to some subprojects. Here we start sketching out some basic jsonnet structure (currently all in a single file, with little parametrization) and a cluster-level prometheus server that scrapes Kubernetes Node and cAdvisor metrics. This review is mostly to get this committed as early as possible, and to make sure that the little existing Prometheus scrape configuration is sane. Change-Id: If37ac3b1243b8b6f464d65fee6d53080c36f992c 2020-06-06 10:35:06 +00:00
			`{`
ops/monitoring: split up jsonnet, add simple docs Change-Id: I8120958a6862411de0446896875766834457aba9 2020-06-06 15:04:07 +00:00			`// Cluster sets up all cluster-specific monitoring resources in their own namespace.`
			`// Currently this consists of a prometheus server that scrapes k8s nodes for kubelet`
			`// and cAdvisor metrics.`
ops/metrics: basic cluster setup with prometheus We handwavingly plan on implementing monitoring as a two-tier system: - a 'global' component that is responsible for global aggregation, long-term storage and alerting. - multiple 'per-cluster' components, that collect metrics from Kubernetes clusters and export them to the global component. In addition, several lower tiers (collected by per-cluster components) might also be implemented in the future - for instance, specific to some subprojects. Here we start sketching out some basic jsonnet structure (currently all in a single file, with little parametrization) and a cluster-level prometheus server that scrapes Kubernetes Node and cAdvisor metrics. This review is mostly to get this committed as early as possible, and to make sure that the little existing Prometheus scrape configuration is sane. Change-Id: If37ac3b1243b8b6f464d65fee6d53080c36f992c 2020-06-06 10:35:06 +00:00			`Cluster(name):: {`
			`local cluster = self,`
			`local cfg = cluster.cfg,`
			`cfg:: {`
			`name: name,`
			`namespace: "monitoring-cluster",`

			`images: {`
			`prometheus: "prom/prometheus:v2.18.1",`
			`},`

			`storageClasses: {`
ops/monitoring: split up jsonnet, add simple docs Change-Id: I8120958a6862411de0446896875766834457aba9 2020-06-06 15:04:07 +00:00			`prometheus: error "storageClasses.prometheus must be set",`
ops/metrics: basic cluster setup with prometheus We handwavingly plan on implementing monitoring as a two-tier system: - a 'global' component that is responsible for global aggregation, long-term storage and alerting. - multiple 'per-cluster' components, that collect metrics from Kubernetes clusters and export them to the global component. In addition, several lower tiers (collected by per-cluster components) might also be implemented in the future - for instance, specific to some subprojects. Here we start sketching out some basic jsonnet structure (currently all in a single file, with little parametrization) and a cluster-level prometheus server that scrapes Kubernetes Node and cAdvisor metrics. This review is mostly to get this committed as early as possible, and to make sure that the little existing Prometheus scrape configuration is sane. Change-Id: If37ac3b1243b8b6f464d65fee6d53080c36f992c 2020-06-06 10:35:06 +00:00			`},`
			`},`

			`namespace: kube.Namespace(cfg.namespace),`

			`prometheus: {`
			`local prometheus = self,`

			`// Configuration that's going to be emitted as prometheus.yml and passed to the`
			`// prometheus server for this cluster.`
			`configuration:: {`
			`global: {`
			`external_labels: {`
			`cluster: cluster.cfg.name,`
			`},`
			`},`

			`// Constructor for a Kubernetes scrape job that uses the pod's service account and`
			`// TLS configuration, selecting the given k8s scrape 'role'.`
			`local kubeScrapeConfig = function(name, role) {`
			`job_name: name,`
			`scheme: "https",`
			`scrape_interval: "30s",`
			`kubernetes_sd_configs: [ { role: role }, ],`
			`tls_config: {`
			`ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",`
			`},`
			`bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",`
			`},`

			`scrape_configs: [`
			`// When scraping node-based metrics (ie. node and cadvisor metrics) we contact`
			`// the metrics endpoints on the kubelet via the API server. This is done by`
			`// relabeling __address__ and __metrics_path__ to point at the k8s API server,`
			`// and at the API server proxy path to reach a node's metrics endpoint.`
			`//`
			`// This approach was lifted from the prometheus examples for Kubernetes, and`
			`// while the benefits outlined there do not matter that much to us (our`
			`// kubelets listen on public addresses, anyway), we still enjoy this approach`
			`// for the fact that we don't have to hardcode the kubelet TLS port.`
			`//`
			`// https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml`
			`//`
			`// When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as`
			`// our API server's TLS certificate only has a CN/SAN for its full FQDN, not`
			`// the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).`

			`// Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.`
			`kubeScrapeConfig("cluster_node_metrics", "node") {`
			`relabel_configs: [`
			`{`
			`action: "labelmap",`
			`regex: "__meta_kubernetes_node_label_(.+)",`
			`},`
			`{`
			`action: "replace",`
			`target_label: "__address__",`
			`replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],`
			`},`
			`{`
			`target_label: "__metrics_path__",`
			`source_labels: ["__meta_kubernetes_node_name"],`
			`regex: "(.+)",`
			`replacement: "/api/v1/nodes/${1}/proxy/metrics",`
			`},`
			`],`
			`},`
			`// Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.`
			`kubeScrapeConfig("cluster_cadvisor_metrics", "node") {`
			`relabel_configs: [`
			`{`
			`action: "labelmap",`
			`regex: "__meta_kubernetes_node_label_(.+)",`
			`},`
			`{`
			`action: "replace",`
			`target_label: "__address__",`
			`replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],`
			`},`
			`{`
			`target_label: "__metrics_path__",`
			`source_labels: ["__meta_kubernetes_node_name"],`
			`regex: "(.+)",`
			`replacement: "/api/v1/nodes/${1}/proxy/metrics/cadvisor",`
			`},`
			`],`
			`},`
			`],`
			`},`

			`configmap: kube.ConfigMap("prometheus-cluster") {`
			`metadata+: {`
			`namespace: cfg.namespace,`
			`},`
			`data: {`
			`"prometheus.yml": std.manifestYamlDoc(prometheus.configuration),`
			`},`
			`},`

			`sa: kube.ServiceAccount("prometheus-cluster") {`
			`metadata+: {`
			`namespace: cfg.namespace,`
			`},`
			`},`

			`cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {`
			`rules: [`
			`// Allow access to all metrics.`
			`{ nonResourceURLs: ["/metrics"], verbs: ["get"], },`
			`// Allow to access node details for discovery.`
			`{ apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },`
			`// Allow to proxy to bare node HTTP to access per-node metrics endpoints.`
			`{ apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },`
			`],`
			`},`

			`crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {`
			`subjects_: [prometheus.sa],`
			`roleRef_: prometheus.cr,`
			`},`

			`deploy: kube.Deployment("prometheus-cluster") {`
			`metadata+: {`
			`namespace: cfg.namespace,`
			`},`
			`spec+: {`
			`template+: {`
			`spec+: {`
			`containers_: {`
			`default: kube.Container("default") {`
			`image: cfg.images.prometheus,`
			`command: [`
			`"/bin/prometheus",`
			`"--config.file=/etc/prometheus/prometheus.yml",`
			`"--storage.tsdb.path=/prometheus",`
			`# TODO(q3k): reduce this once we have a long-term storage`
			`# solution.`
			`"--storage.tsdb.retention.time=120d",`
			`"--web.console.libraries=/usr/share/prometheus/console_libraries",`
			`"--web.console.templates=/usr/share/prometheus/consoles",`
			`"--web.enable-lifecycle",`
			`],`
			`resources: {`
			`requests: {`
			`memory: "256Mi",`
			`cpu: "100m",`
			`},`
			`limits: {`
			`memory: "1Gi",`
			`cpu: "1",`
			`},`
			`},`
			`volumeMounts_: {`
			`data: { mountPath: "/prometheus", },`
			`configmap: { mountPath: "/etc/prometheus", },`
			`},`
			`},`
			`},`
			`serviceAccountName: prometheus.sa.metadata.name,`
			`tolerations: [`
			`{ key: "CriticalAddonsOnly", operator: "Exists" },`
			`],`
			`volumes_: {`
			`data: kube.PersistentVolumeClaimVolume(prometheus.pvc),`
			`configmap: kube.ConfigMapVolume(prometheus.configmap),`
			`},`
			`},`
			`},`
			`},`
			`},`

			`// Kubernetes metric storage volume.`
			`pvc: kube.PersistentVolumeClaim("prometheus-cluster") {`
			`metadata+: {`
			`namespace: cfg.namespace,`
			`},`
			`spec+: {`
			`storageClassName: cfg.storageClasses.prometheus,`
			`accessModes: ["ReadWriteOnce"],`
			`resources: {`
			`requests: {`
			`storage: "32Gi",`
			`},`
			`},`
			`},`
			`},`

			`// Network Policy governing access to the prometheus server.`
			`np: kube.NetworkPolicy("prometheus-cluster") {`
			`metadata+: {`
			`namespace: cfg.namespace,`
			`},`
			`spec+: kube.podLabelsSelector(prometheus.deploy) {`
			`ingress_: {`
			`// Deny all inbound traffic to pod.`
			`// This will be augmented to allow access from some other pod/namespace`
			`// in the future.`
			`},`
			`egress_: {`
			`// Allow all outbound traffic from pod.`
			`outboundAll: {},`
			`},`
			`policyTypes: ["Ingress", "Egress"],`
			`},`
			`},`
			`},`
			`},`
			`}`