local kube = import "../../../kube/kube.libsonnet";

{
    // Cluster sets up all cluster-specific monitoring resources in their own namespace.
    //
    // Currently this consists of a prometheus server that scrapes k8s nodes for kubelet
    // and cAdvisor metrics, and possibly ships metrics over to the global tier via the
    // configured upstreams.
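    //
    // A minimal instantiation sketch (all values hypothetical; the actual storage
    // class and upstream URL depend on the target deployment):
    //
    //   cluster: Cluster("k0") {
    //       cfg+: {
    //           username: "k0",
    //           storageClasses+: { prometheus: "example-ssd" },
    //           upstreams: [
    //               { remote: "https://global.example.org/api/v1/write", password: "secret" },
    //           ],
    //       },
    //   },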
    Cluster(name):: {
        local cluster = self,
        local cfg = cluster.cfg,
        cfg:: {
            name: name,
            namespace: "monitoring-cluster",

            images: {
                prometheus: "prom/prometheus:v2.18.1",
            },

            storageClasses: {
                prometheus: error "storageClasses.prometheus must be set",
            },

            // Username used to authenticate to upstreams.
            username: error "username must be set",

            // Global tier upstreams that this cluster should ship metrics off to.
            // List of:
            //   {
            //       remote: URL of upstream,
            //       password: password used to authenticate, in conjunction with cfg.username.
            //   }
            upstreams: [],
        },

        namespace: kube.Namespace(cfg.namespace),

        prometheus: {
            local prometheus = self,

            // Configuration that's going to be emitted as prometheus.yml and passed to the
            // prometheus server for this cluster.
            configuration:: {
                global: {
                    external_labels: {
                        cluster: cluster.cfg.name,
                    },
                },

                // Constructor for a Kubernetes scrape job that uses the pod's service account and
                // TLS configuration, selecting the given k8s scrape 'role'.
                local kubeScrapeConfig = function(name, role) {
                    job_name: name,
                    scheme: "https",
                    scrape_interval: "30s",
                    kubernetes_sd_configs: [ { role: role }, ],
                    tls_config: {
                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
                    },
                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
                },
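
                // For illustration, a job built by this constructor renders (via
                // std.manifestYamlDoc below) to roughly:
                //
                //   job_name: cluster_node_metrics
                //   scheme: https
                //   scrape_interval: 30s
                //   kubernetes_sd_configs:
                //     - role: node
                //   tls_config:
                //     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                //   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token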

                // When scraping node-based metrics (ie. node and cadvisor metrics) we contact
                // the metrics endpoints on the kubelet via the API server. This is done by
                // relabeling __address__ and __metrics_path__ to point at the k8s API server,
                // and at the API server proxy path to reach a node's metrics endpoint.
                //
                // This approach was lifted from the prometheus examples for Kubernetes, and
                // while the benefits outlined there do not matter that much to us (our
                // kubelets listen on public addresses, anyway), we still enjoy this approach
                // for the fact that we don't have to hardcode the kubelet TLS port.
                //
                // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
                //
                // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
                // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
                // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
                local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") {
                    relabel_configs: [
                        {
                            action: "labelmap",
                            regex: "__meta_kubernetes_node_label_(.+)",
                        },
                        {
                            action: "replace",
                            target_label: "__address__",
                            replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
                        },
                        {
                            target_label: "__metrics_path__",
                            source_labels: ["__meta_kubernetes_node_name"],
                            regex: "(.+)",
                            replacement: "/api/v1/nodes/${1}/proxy" + path,
                        },
                    ],
                },
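
                // For example, with cluster name "k0" and a node named "node1" (both
                // hypothetical), kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics")
                // ends up scraping:
                //   https://kubernetes.default.svc.k0.hswaw.net:443/api/v1/nodes/node1/proxy/metrics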

                // When scraping API server-colocated metrics (ie. metrics from nixos services running alongside
                // apiserver instances), we contact the metrics endpoints directly over the nodes' IP addresses
                // and an external port. The node IP addresses are discovered via Prometheus kubernetes endpoint
                // discovery, which selects all endpoints for the default/kubernetes service. This service is
                // backed by apiserver instances on public IP addresses. We can then rewrite the received port
                // to the port of the service we're interested in to reach that service.
                local kubeScrapeAPIServerColocated = function(name, port) kubeScrapeConfig(name, "endpoints") {
                    relabel_configs: [
                        // Select only endpoints that back the default/kubernetes service. These are all
                        // public IP addresses of nodes that run the API server.
                        {
                            action: "keep",
                            regex: "default;kubernetes;https",
                            source_labels: [
                                "__meta_kubernetes_namespace",
                                "__meta_kubernetes_service_name",
                                "__meta_kubernetes_endpoint_port_name",
                            ],
                        },
                    ] + (if port == 4001 then [] else [
                        // Replace the endpoint port with the requested port, unless the requested port is
                        // the apiserver's port 4001, which is the one returned by default for these endpoints.
                        {
                            action: "replace",
                            regex: "([^:]+):.+",
                            replacement: "$1:%d" % [port],
                            source_labels: [
                                "__address__",
                            ],
                            target_label: "__address__",
                        },
                    ]),
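                    // For a hypothetical endpoint address "203.0.113.7:4001", the rewrite above
                    // yields "203.0.113.7:4003" when port == 4003 (and similarly for 4005).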
                    // We disable server-side TLS certificate verification.
                    // Unfortunately, all apiserver-colocated services run with TLS certificates that do not have
                    // the right IP address SAN. Moreover, we can't override the TLS ServerName for a scrape
                    // target [1], so the only two choices we are left with are:
                    //  1) re-emit relevant certificates with IP address SANs that allow for access by IP.
                    //  2) disable TLS verification.
                    // We choose 2), knowing that if someone manages to hijack a target IP address they can end up
                    // stealing our bearer token and impersonating the service account with which Prometheus is
                    // running. In the long term, we hope for [1] to be resolved.
                    //
                    // TODO(q3k): revisit this once [1] gets fixed.
                    // [1] - https://github.com/prometheus/prometheus/issues/4827
                    tls_config: {
                        insecure_skip_verify: true,
                    },
                },

                scrape_configs: [
                    /// Scrape per-node metrics, proxied via the apiserver.
                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
                    kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics"),
                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
                    kubeScrapeNodeMetrics("cluster_cadvisor_metrics", "/metrics/cadvisor"),

                    /// Scrape apiserver-colocated ('master node') metrics, over nodes' public IP addresses.
                    /// (currently all nodes are 'master' nodes)
                    // Scrape Kubernetes apiserver metrics.
                    kubeScrapeAPIServerColocated("cluster_apiserver_metrics", 4001),
                    // Scrape Kubernetes controller-manager metrics.
                    kubeScrapeAPIServerColocated("cluster_controllermanager_metrics", 4003),
                    // Scrape Kubernetes scheduler metrics.
                    kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005),
                ],

                remote_write: [
                    {
                        url: u.remote,
                        basic_auth: {
                            username: cluster.cfg.username,
                            password: u.password,
                        },
                    }
                    for u in cluster.cfg.upstreams
                ],
            },

            configmap: kube.ConfigMap("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                data: {
                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
                },
            },

            sa: kube.ServiceAccount("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
            },

            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                rules: [
                    // Allow access to all metrics.
                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
                    // Allow access to node details for discovery.
                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
                    { apiGroups: [""], resources: ["endpoints", "services", "pods"], verbs: ["list", "watch", "get"], },
                    // Allow proxying to bare node HTTP to access per-node metrics endpoints.
                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
                ],
            },

            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                subjects_: [prometheus.sa],
                roleRef_: prometheus.cr,
            },

            deploy: kube.Deployment("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    template+: {
                        spec+: {
                            containers_: {
                                default: kube.Container("default") {
                                    image: cfg.images.prometheus,
                                    command: [
                                        "/bin/prometheus",
                                        "--config.file=/etc/prometheus/prometheus.yml",
                                        "--storage.tsdb.path=/prometheus",
                                        "--storage.tsdb.retention.size=10GB",
                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
                                        "--web.console.templates=/usr/share/prometheus/consoles",
                                        "--web.enable-lifecycle",
                                    ],
                                    resources: {
                                        requests: {
                                            memory: "3Gi",
                                            cpu: "100m",
                                        },
                                        limits: {
                                            memory: "3Gi",
                                            cpu: "1",
                                        },
                                    },
                                    volumeMounts_: {
                                        data: { mountPath: "/prometheus", },
                                        configmap: { mountPath: "/etc/prometheus", },
                                    },
                                },
                            },
                            serviceAccountName: prometheus.sa.metadata.name,
                            tolerations: [
                                { key: "CriticalAddonsOnly", operator: "Exists" },
                            ],
                            volumes_: {
                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
                                configmap: kube.ConfigMapVolume(prometheus.configmap),
                            },
                        },
                    },
                },
            },

            // Kubernetes metric storage volume.
            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    storageClassName: cfg.storageClasses.prometheus,
                    accessModes: ["ReadWriteOnce"],
                    resources: {
                        requests: {
                            storage: "16Gi",
                        },
                    },
                },
            },

            // Network Policy governing access to the prometheus server.
            np: kube.NetworkPolicy("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: kube.podLabelsSelector(prometheus.deploy) {
                    ingress_: {
                        // Deny all inbound traffic to pod.
                        // This will be augmented to allow access from some other pod/namespace
                        // in the future.
                    },
                    egress_: {
                        // Allow all outbound traffic from pod.
                        outboundAll: {},
                    },
                    policyTypes: ["Ingress", "Egress"],
                },
            },
        },
    },
}