diff --git a/ops/monitoring/README.md b/ops/monitoring/README.md new file mode 100644 index 00000000..0594678d --- /dev/null +++ b/ops/monitoring/README.md @@ -0,0 +1,41 @@ +hscloud monitoring +================== + +Quick links +----------- + + - *Old Global Dashboard*: [monitoring.hackerspace.pl](https://monitoring.hackerspace.pl) - old monitoring system, unrelated to this one, configured using Chef at management.hackerspace.pl (long since dead). This setup is supposed to replace it. + +Architecture +------------ + +The hscloud monitoring solution is two-tiered: + + - at the *global* tier we run metrics aggregation, long-term storage, dashboard and alerting. + - at the *agent* tier we collect metrics from various sources (possibly even lower tiered agents). + +All agent-tier agents send metrics to all global instances. + + + .--------. .--------. '. + | global | | global | > - global tier + '--------' '--------' .' (contains 'global instances') + | '---. .---' | + | X | + | .---' '---. | + | | | | + .--------------. .--------------------. '. + | cluster | | hswaw-proxy | | + | k0.hswaw.net | | waw.hackerspace.pl | > - agent tier + '--------------' '--------------------' .' (contains 'agents') + + +Agent - cluster +--------------- + +Cluster agents are responsible for collecting Kubernetes cluster metrics. They run a prometheus server that scrapes kubelet/cadvisor/... metrics and send them off to global instances. + +Global Instances +---------------- + +Global instances run Victoria Metrics, ingest metrics from all agents, and perform long-term storage. In the future they will also run Grafana and AlertManager. diff --git a/ops/monitoring/k0.jsonnet b/ops/monitoring/k0.jsonnet index 028a4630..62810c5d 100644 --- a/ops/monitoring/k0.jsonnet +++ b/ops/monitoring/k0.jsonnet @@ -1,11 +1,39 @@ -local lib = import "lib.libsonnet"; +local cluster = import "lib/cluster.libsonnet"; +local global = import "lib/global.libsonnet"; + +// Monitoring tiers set up on k0.
See README for architectural background. { - cluster: lib.Cluster("k0") { - cfg+: { - storageClasses+: { - prometheus: "waw-hdd-redundant-3", - }, + local k0 = self, + local cfg = { + storageClasses+: { + prometheus: "waw-hdd-redundant-3", + victoria: "waw-hdd-redundant-3", }, }, + + // Cluster tier - prometheus. + cluster: cluster.Cluster("k0") { + cfg+: cfg { + username: "cluster-k0", + upstreams: [ + { password: std.split(importstr "secrets/plain/global-agent-cluster-k0", "\n")[0], remote: k0.global.internalIngestURL }, + ], + }, + }, + + // Global tier - victoria metrics. + global: global.Global("k0") { + cfg+: cfg { + hosts: { + globalAPI: "monitoring-global-api.k0.hswaw.net", + }, + agents: [ + // Ingestion from k0 cluster tier. + { username: k0.cluster.cfg.username, password: std.split(importstr "secrets/plain/global-agent-cluster-k0", "\n")[0], }, + // Access from q3k's test Grafana. + { username: "grafana", password: std.split(importstr "secrets/plain/global-agent-grafana", "\n")[0], }, + ], + }, + }, } diff --git a/ops/monitoring/lib.libsonnet b/ops/monitoring/lib.libsonnet deleted file mode 100644 index 61f49b49..00000000 --- a/ops/monitoring/lib.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local cluster = import "lib/cluster.libsonnet"; - -{ - Cluster: cluster.Cluster, -} diff --git a/ops/monitoring/lib/cluster.libsonnet b/ops/monitoring/lib/cluster.libsonnet index 9b64f05c..511d4262 100644 --- a/ops/monitoring/lib/cluster.libsonnet +++ b/ops/monitoring/lib/cluster.libsonnet @@ -2,8 +2,10 @@ local kube = import "../../../kube/kube.libsonnet"; { // Cluster sets up all cluster-specific monitoring resources in their own namespace. + // // Currently this consists of a prometheus server that scrapes k8s nodes for kubelet - // and cAdvisor metrics. + // and cAdvisor metrics, and possibly ships over metrics to the global tier via set + // upstreams. 
Cluster(name):: { local cluster = self, local cfg = cluster.cfg, @@ -18,6 +20,17 @@ local kube = import "../../../kube/kube.libsonnet"; storageClasses: { prometheus: error "storageClasses.prometheus must be set", }, + + // Username used to authenticate to upstreams. + username: error "username must be set", + + // Global tier upstreams that this cluster should ship metrics off to. + // List of + // { + // remote: URL of upstream + // password: password used to authenticate, in conjunction with cfg.username. + // } + upstreams: [], }, namespace: kube.Namespace(cfg.namespace), @@ -105,6 +118,17 @@ local kube = import "../../../kube/kube.libsonnet"; ], }, ], + + remote_write: [ + { + url: u.remote, + basic_auth: { + username: cluster.cfg.username, + password: u.password, + }, + } + for u in cluster.cfg.upstreams + ], }, configmap: kube.ConfigMap("prometheus-cluster") { @@ -152,9 +176,7 @@ local kube = import "../../../kube/kube.libsonnet"; "/bin/prometheus", "--config.file=/etc/prometheus/prometheus.yml", "--storage.tsdb.path=/prometheus", - # TODO(q3k): reduce this once we have a long-term storage - # solution. - "--storage.tsdb.retention.time=120d", + "--storage.tsdb.retention.size=10GB", "--web.console.libraries=/usr/share/prometheus/console_libraries", "--web.console.templates=/usr/share/prometheus/consoles", "--web.enable-lifecycle", @@ -198,7 +220,7 @@ local kube = import "../../../kube/kube.libsonnet"; accessModes: ["ReadWriteOnce"], resources: { requests: { - storage: "32Gi", + storage: "16Gi", }, }, }, diff --git a/ops/monitoring/lib/global.libsonnet b/ops/monitoring/lib/global.libsonnet new file mode 100644 index 00000000..dbdbebb1 --- /dev/null +++ b/ops/monitoring/lib/global.libsonnet @@ -0,0 +1,149 @@ +local kube = import "../../../kube/kube.libsonnet"; + +{ + // Global sets up a global tier instance of the hscloud monitoring infrastructure.
+ // + // This currently consists of Victoria Metrics, to which the agent tier sends metrics data via + // the prometheus remote_write protocol. + // Victoria Metrics is here used as a long-term storage solution. However, right now, it + // just keeps data locally on disk. In the future, S3 snapshots/backups should be introduced. + Global(name):: { + local global = self, + local cfg = global.cfg, + + cfg:: { + name: name, + namespace: "monitoring-global-%s" % [cfg.name], + + images: { + victoria: "victoriametrics/victoria-metrics:v1.40.0", + vmauth: "victoriametrics/vmauth:v1.40.0", + }, + + hosts: { + // DNS hostname that this global tier will use. Ingress will run under it. + globalAPI: error "hosts.globalAPI must be set", + }, + + storageClasses: { + // Storage class used for main data retention. + victoria: error "storageClasses.victoria must be set", + }, + + // A list of agents that will push metrics to this instance. + // List of: + // { + // username: the username that the agent will authenticate with + // password: the password that the agent will authenticate with + // } + agents: [], + }, + + // Generated URLs that agents should use to ship metrics over. Both require HTTP basic + // auth, configured via cfg.agents. + // The internal URL should be used for agents colocated in the same Kubernetes cluster. + internalIngestURL:: "http://%s/api/v1/write" % [global.victoria.serviceAPI.host_colon_port], + // The global URL should be used for agents sending data over the internet.
+ globalIngestURL:: "https://%s/api/v1/write" % [cfg.hosts.globalAPI], + + namespace: kube.Namespace(cfg.namespace), + local ns = global.namespace, + + victoria: { + local victoria = self, + + pvc: ns.Contain(kube.PersistentVolumeClaim("victoria-data")) { + spec+: { + storageClassName: cfg.storageClasses.victoria, + accessModes: ["ReadWriteOnce"], + resources: { + requests: { + storage: "64Gi", + }, + }, + }, + }, + + authSecret: ns.Contain(kube.Secret("vmauth")) { + data+: { + "config.yaml": std.base64(std.manifestJson({ + users: [ + { + username: a.username, + password: a.password, + url_prefix: "http://localhost:8428", + } + for a in cfg.agents + ], + }) + "\n") + }, + }, + + deploy: ns.Contain(kube.Deployment("victoria")) { + spec+: { + template+: { + spec+: { + containers_: { + default: kube.Container("default") { + image: cfg.images.victoria, + volumeMounts_: { + data: { mountPath: "/victoria-metrics-data", }, + }, + }, + vmauth: kube.Container("vmauth") { + image: cfg.images.vmauth, + command: [ + "/vmauth-prod", + "-auth.config", "/mnt/secret/config.yaml", + ], + volumeMounts_: { + secret: { mountPath: "/mnt/secret", }, + }, + ports_: { + api: { containerPort: 8427 } + }, + } + }, + volumes_: { + data: kube.PersistentVolumeClaimVolume(victoria.pvc), + secret: kube.SecretVolume(victoria.authSecret), + }, + }, + }, + }, + }, + + serviceAPI: ns.Contain(kube.Service("victoria-api")) { + target_pod: victoria.deploy.spec.template, + spec+: { + ports: [ + { name: "api", port: 8427, targetPort: 8427, protocol: "TCP" }, + ], + type: "ClusterIP", + }, + }, + + ingressAPI: ns.Contain(kube.Ingress("victoria-api")) { + metadata+: { + annotations+: { + "kubernetes.io/tls-acme": "true", + "certmanager.k8s.io/cluster-issuer": "letsencrypt-prod", + }, + }, + spec+: { + tls: [ + { hosts: [cfg.hosts.globalAPI], secretName: "ingress-tls" }, + ], + rules: [ + { + host: cfg.hosts.globalAPI, + http: { + paths: [ { path: "/", backend: { serviceName: 
victoria.serviceAPI.metadata.name, servicePort: 8427 } }, ], + }, + } + ], + }, + }, + }, + } +} diff --git a/ops/monitoring/secrets/cipher/global-agent-cluster-k0 b/ops/monitoring/secrets/cipher/global-agent-cluster-k0 new file mode 100644 index 00000000..29e715b8 --- /dev/null +++ b/ops/monitoring/secrets/cipher/global-agent-cluster-k0 @@ -0,0 +1,40 @@ +-----BEGIN PGP MESSAGE----- + +hQEMAzhuiT4RC8VbAQgAiZLuysTzxY8VM1wOAC7Hb0/3dHh0/5cFG1nOC6svnVt4 +NLZG0K+9uSuku76N/TZak1lk0pieeW9PE+FBDAAjUhGKS1/0qvZmG2Y5T3qs7pYf +0Zv68hKix88bEfK7yfF/t68cYB1F2ms/4Y5tCBuW3av8MI7XQifWdnwgokxbE6xY +yhGpII6zZemfA+kuMo4BRsyy2Z1xsKo7Ah64hQQUQFXwzr+i4hzpp2AeVlWAcFNj +IlHPxA02ZcBCtjz2DLShFN2s8WBenboM88eUfeKpRAbMMfGcycmpIt7uf6pZ1UJa +viTnfV1juqyXaMLECOBNYBhlMagjRIZ0CbM/5mn3Q4UBDANcG2tp6fXqvgEIAK2M +pbeD3JpNE6pRvQsAHuKObQ+Bm82CxZg2uS9OPwNm6l7ESROpCnTRU8ahHIJO2f1d +IMXzLO4M6QMb5FpAl2ixsT/SeZ9Z8NSxcl1ndByTRPQ3wSNfCV8wW7tXIWHzv1El +pjBRowEbitwuwFgfgk86lYdYLKRPefAPr4fFNQV6aGLSWdVMo6vdR/C78xDivduy +A79Fu64+nsKgOrKHkcxn4YyhFDTOt7avpCX3xAFDWoN7w3W5iQ/EQk+6SVnfsqjo +IqTcxcS1o1TxpEjyoBPgpAERFEJEjIE2Dpo8E5UjJDLHMtSJMMqrAW7nLZJimI+c +DSY83h5VtCzAnvjIXYeFAgwDodoT8VqRl4UBD/4ujIoPDkIZ/dLGbiwtlwZz4giY +2LbfxHq2nkwy3+V6fbAxp+GyK4lB4XiZia2lMeWk5UECK8z9fpAhGvrFaSwjXkvn +ig8LY4WFW8uxjtilJXxPBIqkl9EMyEZJaBFQ3d9icE2ZgPV7peXtZBYgZVD4fY7E +Pxi2CTRrILr628Vqpbo0GdB88NdDd24wzkec6rVVV8WktWbyNXvzzJE6BbkcdBj6 +DyfG55SKSQAjYfC3b8LYtcCZDXiidFMflDXraVuaoOWHKuAb8Mwhrz0TwdTVSH0G +xcnANQjBXf+TNfm8nrfLLSnmyili4qOgEuRQKeSJR+aiR3kDQhV0exd/jAJlcTcF ++QuLWEgpj95W+TD9s/EaVjRBIWs2TFvsUYlU+HZap0GzFiBQQubpK/EgJsh6Xz1/ +3mjDxG5vv/Wdti0ko3oul9koS24eNK07lwM6g/GwtLhT48h/Db/M/9/Vadx+T6KW +fEKTdYyn438hOVKqrKwIRLp98e++VbLIg7pQsH0YOFK0QMFk6N7IbYRSEBnuDD+F +W3VJ4wiP0dwM2LttHwFTapReaDkORYv70rvb3mplp4kCLF7AvUJJQaU1cncuLdwd +Sj3iDu8s0lf3lM0Y+SlB1xDkzHrOGmcC0I4JVH+padBDKCpAn/8IbFmlaiV1Xjtd +KTwB6+NcMtmqX+ObNYUCDAPiA8lOXOuz7wEP/A9JFmiFZ36mPnfjbA7OHz6U3zOT +71iHwJDJXXG4g83WKtOLTYaNAizjXVz7wInWbwigTK2uD38lzOfArbU7UaAP38yS 
+89xNff5YOQASO71AJutoulUbA43TFF0gVqtcqsYJO7Cx+DSrBwhHXtFsppOVeMsH +9CmxVskPnwyZGG1yAJJ+EnD+y+SGmUh97EyWH+UKNZ4fiXVnwNt1ffY6vFYz61d8 +AferfbvKy/9UN2gxn+QDVjDdyf5Qk14t6ljdBnO/RVbZNpU7pIRepttUFlCr87a+ +SZR+WGPOaedzq/8nu3rABEqOxyLd0W0c9eOFwAtLxszjQ3yakhDseH2xlpeqa5g1 +YzPFL79ywFDLjdfnPju8OBROEtyZD2mNrCqR846xIjPERjMhDYSS7DUI73iC7hrW +5hzfNj7ky1l0mYg3lfIdbtDrQO/HjnsYL3JA8WTcNHVEx3EpgEpQCz7g2TZAfpNs +E08pkn8hLuNg+PH1RvTFLTVflclfZsnTPu7np7TTE8O1OaA8qUG/P1nAXX+wX93a +a2uHQ39I3VZqPA7eRK+Gp6lDSBP1pbZbKz5tV/9glVPXKXY+bDamlWE7kgXbGOsY +zILK+jN9GFad4/gg9b7Upw5EdphnyAoob2SbrKbFN5ALyfFbgG+wFhFdb2oS7tCk +eLx5zEc+aQReFEQ30nMBLEWI+DfbN4nby1Ccp3bcOQvnSr9a0XJYSgFtcVve9aWw +/tYpNn0+fo1M1J+93UPjfjL9/ApDH3dDaS6L6WR8jn2EHALHUbwNuShBnDcCAlQe +/ZVaR2LGJprPHURChG2pognfRZhp+YK06diTwUHtyHir +=e/F1 +-----END PGP MESSAGE----- diff --git a/ops/monitoring/secrets/cipher/global-agent-grafana b/ops/monitoring/secrets/cipher/global-agent-grafana new file mode 100644 index 00000000..fd501e64 --- /dev/null +++ b/ops/monitoring/secrets/cipher/global-agent-grafana @@ -0,0 +1,40 @@ +-----BEGIN PGP MESSAGE----- + +hQEMAzhuiT4RC8VbAQf+NMwFZv9tVcUOo13hi7r2Z5V294dseTFk+q3nE//ZmWx6 +6LL70Ggdc3etozf9w6uriQG0wbrfy7XwOYkpFJYaJb2gut0xxG/Fw221ZGR8elpe +79FzveD0FUZK+UdixXMiqYQOiwUK5+RbjhKN+R3WjG5mrClHDeCG5WrXFPvT9wOX +dA3ED/ZczrNxvSbKeE1imFoCeudrC9/zo/CRmb0BHrIoOEe+vCe/MzN0s/fkiq1k +RyZZxJ0M6/oudlcexyaYJcTdBTW1ZMNmmZ9lWsBjmf5kTKGu1tDUuMU9RBJmtyYn +8euaTwwCOfZjz6vdKoGer07ftEyfbjDGuU6zOtN9Z4UBDANcG2tp6fXqvgEH/i++ +MbBnFbCOajtIN2xN8P6WiP6RjPmUKaKLJCltZHqPPYuULFWTa8uBIbfqVjtgFfoE +43eBgP+D1EQooq6O7NqWcoCY7LphwKx///oWsmeuiRy+wwQOGMV45tF31n944P1U +qZGhik8n7pxLkNaC5ohaucQJeaDSi7GuMATzBWdGY6lZkaNLUfrPmKXu9tyIaA4u +b80gPvFWc/9PgnS9rAcVPA7/8Il53EVJJsYK2/S7nDFKRTJfThId4cvvAFUXkuwW +hM6FEFcJdSW90qbhBGCwr3yfvSb3Je0k7gYrg9iQTLxRpIbG3Mbz9irEKno3alGe +8FvuAdmVn1AVyxNomT+FAgwDodoT8VqRl4UBD/0bGb+8AF3mTmEMTHAPPLUIxhu+ +ihb4Po6OxQ7u/UCMSHXhFQCqb4ytK2JsG2UhcIiYrQrZMVGQh3rGcfZESHpX1/Ol 
+rTwkjUZSSnek8M1hbEkS7PU2rePsXt/O07+zenqMO3pMeVsX+VLEGXRS6KZ9WXsc +X09iLyqBErgntaM7otMSZVSPV7VFEaIoVdPe4NZxudDMedeA0hr1BneaVUNVjMtQ +UE6ZVxzFSoqMnfsJY9/dn/uhHdv7qOhzw0ABINmDybI6IWNEaGzzSJC2HcHZjocY +h2Se4mzxjOz9X4CG28h8b9jFHRtSe4OiSAQYxwtZNxGR6kCt1PfP20oemPBg+LXF +T6+ledT5nkkaoztl5EOxKoh20BfbNOR2AWbPYuRLJ7OKF/dFDJ3PndPwSjEvPZDY +xpvHszqVlcMpleqM/iQILD33Nzz9RhtSZHQiGYuZsak4aeUWsz/3a6JhtcliK4Do +CyG3J1wVf0a+jsXlDr0M50qf+aY7k76zTqtfXcPSKypMeP0yZaYMPCnzHvFpj39u +u3NGOEiwv1WXMrUn59SuL9X6aP5s4D455E5JDuOFKrndN78CKqsCTPkNGOCU5F+f +B25lXY0yF22RLiHbWiAGcM+u4roi7qA8HWYly6lqOl3imk3D3NJP+ENGyoCxgfcQ +GIrl8z5fyE5GtJUgAoUCDAPiA8lOXOuz7wEP/3ZmA1njB/F1nu/vafx90O0A0Mmr +J7EvveK89W2P5JsZjEX/sVSurx1kY4U1Lofe00jdbsQNtfQ33/K2zr0Vb5G3VQxL +QnsksGO9MjywwVzpspuS+gQE2P6VU5YjpHXA15SJXkJV/SDvxoPSyPt5x3m0nU+9 +aj43mTgKdvTwSeDoEHzt54KRY01HOugpmmY/TZ5Wkeam2vNsCaSEYAdDTaSEG6j2 +OfxQf8X/A+7RDwyoVpjDg4LVAyHmcomWBeudEH4GkR+oGC7YQ39QEb4TA9h0ufUO +2N1XWIf+FraCFX5x9IeKoe9ZyInz8I24lRMRHHu1RGrluGHCjflVzuIfOnasy2yU +CX9EhvoL2IxfCLdY2XoBcrpCcTr+FKu4/5n4P3WN6TSWnQlvCypgfJ7zkjGWytQW +cChiouYvYLcW2q1opZgIu3J1i7QnS6qHzzhK3eejBF0DZinxs+q7cylVVAzrq5FF +p6+v6OXlZRbsIoBJg0kKKfUwqzoTwEFPvPMvoIxCmwSfQmlnemeXrXdahqbZXgIO +a4jMrlene6Asr7xVZ+9siv5plPogmvWco/950KmKlXOUEg439nADHzhTYPmai5YA +i9inb9B1sAcpbYQejUwnIx+W11qsyE2PAXHdj/mvVm1fzO/VJ+yGHtnKwfGtHS8x +v7vq6yCM2p4HPQLC0m4BFIvHf29iKhYDxjj3F0d//VEez/43+79bDDakmS5StD6P +DykfoaYRBijgUfhG4a4UbMBbpIuukwBI0EQoy+3Sca6Rx1Da5lIPXuMbb/N/c4rp +xL1OhySubTvzg2yTCfUoEuMWZFL1rwgT9lrKVg== +=Do8R +-----END PGP MESSAGE----- diff --git a/ops/monitoring/secrets/plain/.gitignore b/ops/monitoring/secrets/plain/.gitignore new file mode 100644 index 00000000..72e8ffc0 --- /dev/null +++ b/ops/monitoring/secrets/plain/.gitignore @@ -0,0 +1 @@ +*