
Merge changes I84873bc3,I1eedb190

* changes:
  ops/monitoring: deploy grafana
  ops/monitoring: scrape apiserver, scheduler, and controller-manager
q3k 2021-01-30 16:22:44 +00:00 committed by Gerrit Code Review
commit d82807e024
4 changed files with 309 additions and 61 deletions

View File

@@ -9,6 +9,7 @@ local global = import "lib/global.libsonnet";
storageClasses+: {
prometheus: "waw-hdd-redundant-3",
victoria: "waw-hdd-redundant-3",
grafana: "waw-hdd-redundant-3",
},
},
@@ -25,15 +26,22 @@ local global = import "lib/global.libsonnet";
// Global tier - victoria metrics.
global: global.Global("k0") {
cfg+: cfg {
oauth: {
clientId: "22659ba3-c8b2-4855-9553-f78884e0d743",
clientSecret: std.split(importstr "secrets/plain/global-oauth-client-secret", "\n")[0],
},
hosts: {
globalAPI: "monitoring-global-api.k0.hswaw.net",
globalDashboard: "monitoring-global-dashboard.k0.hswaw.net",
},
agents: [
// Ingestion from k0 cluster tier.
{ username: k0.cluster.cfg.username, password: std.split(importstr "secrets/plain/global-agent-cluster-k0", "\n")[0], },
// Access from q3k's test Grafana.
{ username: "grafana", password: std.split(importstr "secrets/plain/global-agent-grafana", "\n")[0], },
],
loopbackGrafanaUser: {
username: "grafana",
password: std.split(importstr "secrets/plain/global-agent-grafana", "\n")[0],
},
},
},
}

View File

@@ -60,63 +60,106 @@ local kube = import "../../../kube/kube.libsonnet";
bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
},
scrape_configs: [
// When scraping node-based metrics (ie. node and cadvisor metrics) we contact
// the metrics endpoints on the kubelet via the API server. This is done by
// relabeling __address__ and __metrics_path__ to point at the k8s API server,
// and at the API server proxy path to reach a node's metrics endpoint.
//
// This approach was lifted from the prometheus examples for Kubernetes, and
// while the benefits outlined there do not matter that much to us (our
// kubelets listen on public addresses, anyway), we still enjoy this approach
// for the fact that we don't have to hardcode the kubelet TLS port.
//
// https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
//
// When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
// our API server's TLS certificate only has a CN/SAN for its full FQDN, not
// the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
// Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
kubeScrapeConfig("cluster_node_metrics", "node") {
relabel_configs: [
{
action: "labelmap",
regex: "__meta_kubernetes_node_label_(.+)",
},
{
action: "replace",
target_label: "__address__",
replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
},
{
target_label: "__metrics_path__",
source_labels: ["__meta_kubernetes_node_name"],
regex: "(.+)",
replacement: "/api/v1/nodes/${1}/proxy/metrics",
},
],
// When scraping node-based metrics (ie. node and cadvisor metrics) we contact
// the metrics endpoints on the kubelet via the API server. This is done by
// relabeling __address__ and __metrics_path__ to point at the k8s API server,
// and at the API server proxy path to reach a node's metrics endpoint.
//
// This approach was lifted from the prometheus examples for Kubernetes, and
// while the benefits outlined there do not matter that much to us (our
// kubelets listen on public addresses, anyway), we still enjoy this approach
// for the fact that we don't have to hardcode the kubelet TLS port.
//
// https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
//
// When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
// our API server's TLS certificate only has a CN/SAN for its full FQDN, not
// the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") {
relabel_configs: [
{
action: "labelmap",
regex: "__meta_kubernetes_node_label_(.+)",
},
{
action: "replace",
target_label: "__address__",
replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
},
{
target_label: "__metrics_path__",
source_labels: ["__meta_kubernetes_node_name"],
regex: "(.+)",
replacement: "/api/v1/nodes/${1}/proxy" + path,
},
],
},
// When scraping API server-colocated metrics (ie. metrics from nixos services running alongside
// APIserver instances), we contact the metrics endpoints directly over the node's IP addresses
// and an external port. The node IP addresses are discovered via Prometheus kubernetes endpoint
// discovery which selects all endpoints for the default/kubernetes service. This service is
// backed by apiserver instances on public IP addresses. We can then rewrite the received port
// to the port of the service we're interested in to get to that service.
local kubeScrapeAPIServerColocated = function(name, port) kubeScrapeConfig(name, "endpoints") {
relabel_configs: [
// Select only endpoints that back the default/kubernetes service. These are all
// public IP addresses of nodes that run the API server.
{
action: "keep",
regex: "default;kubernetes;https",
source_labels: [
"__meta_kubernetes_namespace",
"__meta_kubernetes_service_name",
"__meta_kubernetes_endpoint_port_name",
],
},
] + (if port == 4001 then [] else [
// Replace endpoint port with requested port, if the requested port is not the apiserver's
// port 4001, which is the one returned by default for these endpoints.
{
action: "replace",
regex: "([^:]+):.+",
replacement: "$1:%d" % [port],
source_labels: [
"__address__",
],
target_label: "__address__",
},
]),
// We disable verification of the targets' TLS certificates.
// Unfortunately, all apiserver-colocated services run with TLS certificates that do not have
// the right IP address SAN. Unfortunately, we can't override the TLS ServerName for a scrape
// target [1], so the only two choices we are left with are:
// 1) re-emit relevant certificates with IP address SANs that allow for access by IP.
// 2) disable TLS verification.
// We choose 2), knowing that if someone manages to hijack a target IP address they can end up
// stealing our bearer token and impersonating the service account with which Prometheus is
// running. In the long term, we hope for [1] to be resolved.
//
// TODO(q3k): revisit this once [1] gets fixed.
// [1] - https://github.com/prometheus/prometheus/issues/4827
tls_config: {
insecure_skip_verify: true,
},
},
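// As a sketch of how this behaves: kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005)
// emits the keep rule above plus one replace rule, roughly
//   { action: "replace", source_labels: ["__address__"], regex: "([^:]+):.+",
//     replacement: "$1:4005", target_label: "__address__", },
// so a discovered endpoint address such as "203.0.113.10:4001" (placeholder IP) becomes
// "203.0.113.10:4005"; for port 4001 (the apiserver itself) no rewrite is added at all.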
scrape_configs: [
/// Scrape per-node metrics, proxied via the APIServer.
// Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics"),
// Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
kubeScrapeConfig("cluster_cadvisor_metrics", "node") {
relabel_configs: [
{
action: "labelmap",
regex: "__meta_kubernetes_node_label_(.+)",
},
{
action: "replace",
target_label: "__address__",
replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
},
{
target_label: "__metrics_path__",
source_labels: ["__meta_kubernetes_node_name"],
regex: "(.+)",
replacement: "/api/v1/nodes/${1}/proxy/metrics/cadvisor",
},
],
},
kubeScrapeNodeMetrics("cluster_cadvisor_metrics", "/metrics/cadvisor"),
/// Scrape apiserver-colocated ('master node') metrics over nodes' public IP addresses.
/// (currently all nodes are 'master' nodes)
// Scrape Kubernetes apiserver metrics.
kubeScrapeAPIServerColocated("cluster_apiserver_metrics", 4001),
// Scrape Kubernetes controller-manager metrics.
kubeScrapeAPIServerColocated("cluster_controllermanager_metrics", 4003),
// Scrape Kubernetes scheduler metrics.
kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005),
],
remote_write: [
@@ -152,6 +195,7 @@ local kube = import "../../../kube/kube.libsonnet";
{ nonResourceURLs: ["/metrics"], verbs: ["get"], },
// Allow to access node details for discovery.
{ apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
{ apiGroups: [""], resources: ["endpoints", "services", "pods"], verbs: ["list", "watch", "get"], },
// Allow to proxy to bare node HTTP to access per-node metrics endpoints.
{ apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
],
@@ -183,11 +227,11 @@ local kube = import "../../../kube/kube.libsonnet";
],
resources: {
requests: {
memory: "256Mi",
memory: "3Gi",
cpu: "100m",
},
limits: {
memory: "1Gi",
memory: "3Gi",
cpu: "1",
},
},
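
For illustration, here is a minimal sketch of roughly what the kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics") job defined earlier in this file evaluates to once manifested into the Prometheus configuration. This assumes, as its arguments suggest, that kubeScrapeConfig(name, "node") sets the job name and node-role service discovery; the scheme and bearer-token boilerplate are approximated from the surrounding file, and field order is not significant:

    // Approximate rendering of kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics") on k0.
    {
      job_name: "cluster_node_metrics",
      kubernetes_sd_configs: [ { role: "node" } ],
      scheme: "https",
      bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
      relabel_configs: [
        // Copy Kubernetes node labels onto the scraped series.
        { action: "labelmap", regex: "__meta_kubernetes_node_label_(.+)" },
        // Point the scrape at the API server instead of the kubelet...
        { action: "replace", target_label: "__address__", replacement: "kubernetes.default.svc.k0.hswaw.net:443" },
        // ...and reach the kubelet's metrics through the API server's node proxy path.
        { source_labels: ["__meta_kubernetes_node_name"], regex: "(.+)", target_label: "__metrics_path__", replacement: "/api/v1/nodes/${1}/proxy/metrics" },
      ],
    }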

View File

@@ -18,11 +18,13 @@ local kube = import "../../../kube/kube.libsonnet";
images: {
victoria: "victoriametrics/victoria-metrics:v1.40.0",
vmauth: "victoriametrics/vmauth:v1.40.0",
grafana: "grafana/grafana:7.2.1",
},
hosts: {
// DNS hostname that this global tier will use. Ingress will run under it.
globalAPI: error "hosts.globalAPI must be set",
globalDashboard: error "hosts.globalDashboard must be set",
},
storageClasses: {
@@ -30,6 +32,11 @@ local kube = import "../../../kube/kube.libsonnet";
victoria: error "storageClasses.victoria must be set",
},
oauth: {
clientId: error "oauth.clientId must be set",
clientSecret: error "oauth.clientSecret must be set",
},
// A list of agents that will push metrics to this instance.
// List of:
// {
@@ -41,10 +48,14 @@ local kube = import "../../../kube/kube.libsonnet";
// Generated URLs that agents should use to ship metrics over. Both require HTTP basic
// auth, configured via cfg.agents.
// The internal URL should be used for agents colocated in the same Kubernetes cluster.
// The internal URL that should be used for agents colocated in the same Kubernetes cluster.
internalIngestURL:: "http://%s/api/v1/write" % [global.victoria.serviceAPI.host_colon_port],
// The global URL should be used for agents sending data over the internet.
// The internal URL that should be used for readers colocated in the same Kubernetes cluster.
internalReadURL:: "http://%s/" % [global.victoria.serviceAPI.host_colon_port],
// The global URL that should be used for agents sending data over the internet.
globalIngestURL:: "https://%s/api/v1/write" % [cfg.hosts.globalAPI],
// The global URL that should be used for readers over the internet.
globalReadURL:: "https://%s" % [cfg.hosts.globalAPI],
namespace: kube.Namespace(cfg.namespace),
local ns = global.namespace,
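
With the k0 values from the first file in this change (hosts.globalAPI being monitoring-global-api.k0.hswaw.net), the generated global URLs above evaluate to, for example:

    globalIngestURL == "https://monitoring-global-api.k0.hswaw.net/api/v1/write"
    globalReadURL   == "https://monitoring-global-api.k0.hswaw.net"

while the internal variants point at the in-cluster victoria API service's host:port.
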
@@ -73,7 +84,7 @@ local kube = import "../../../kube/kube.libsonnet";
password: a.password,
url_prefix: "http://localhost:8428",
}
for a in cfg.agents
for a in (cfg.agents + [cfg.loopbackGrafanaUser])
],
}) + "\n")
},
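
Because the loopback Grafana user is appended to cfg.agents here, the credential that Grafana's provisioned datasource later sends as HTTP basic auth is also accepted by the auth proxy. A sketch of the extra users entry this generates (presumably consumed by vmauth, per the images block above; the password comes from secrets/plain/global-agent-grafana and is elided):

    {
      username: "grafana",
      password: "...",
      url_prefix: "http://localhost:8428",
    }
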
@@ -145,5 +156,150 @@ local kube = import "../../../kube/kube.libsonnet";
},
},
},
grafana: {
local grafana = self,
// grafana.ini, serialized to secret.
ini:: {
sections: {
"auth": {
"disable_login_form": true,
"oauth_auto_login": true,
},
"security": {
# We do not disable basic auth, as we want to use builtin
# users as API users (eg for config reload), but we want
# to disable the default admin:admin user.
"disable_initial_admin_creation": true,
},
"auth.generic_oauth": {
enabled: true,
client_id: cfg.oauth.clientId,
client_secret: cfg.oauth.clientSecret,
auth_url: "https://sso-v2.hackerspace.pl/oauth/authorize",
token_url: "https://sso-v2.hackerspace.pl/oauth/token",
api_url: "https://sso-v2.hackerspace.pl/api/1/userinfo",
scopes: "openid",
email_attribute_path: "email",
allow_sign_up: true,
role_attribute_path: "contains(groups, 'grafana-admin')",
},
"server": {
domain: cfg.hosts.globalDashboard,
root_url: "https://%s/" % [ cfg.hosts.globalDashboard ],
},
},
},
datasources:: {
apiVersion: 1,
datasources: [
{
name: "victoria-global",
type: "prometheus",
uid: "victoria-global",
isDefault: true,
url: global.internalReadURL,
basicAuth: true,
basicAuthUser: cfg.loopbackGrafanaUser.username,
secureJsonData: {
basicAuthPassword: cfg.loopbackGrafanaUser.password,
},
},
],
},
config: ns.Contain(kube.Secret("grafana-config")) {
data+: {
"grafana.ini": std.base64(std.manifestIni(grafana.ini)),
"datasources.yaml": std.base64(std.manifestYamlDoc(grafana.datasources)),
},
},
pvc: ns.Contain(kube.PersistentVolumeClaim("grafana-data")) {
spec+: {
storageClassName: cfg.storageClasses.grafana,
accessModes: ["ReadWriteOnce"],
resources: {
requests: {
storage: "8Gi",
},
},
},
},
deploy: ns.Contain(kube.Deployment("grafana")) {
spec+: {
template+: {
spec+: {
containers_: {
default: kube.Container("default") {
image: cfg.images.grafana,
ports_: {
public: { containerPort: 3000 },
},
env_: {
GF_PATHS_CONFIG: "/etc/hscloud-config/grafana.ini",
GF_PATHS_PROVISIONING: "/etc/hscloud-config/provisioning",
GF_PATHS_DATA: "/var/lib/grafana",
},
volumeMounts_: {
config: { mountPath: "/etc/hscloud-config", },
data: { mountPath: "/var/lib/grafana", },
},
resources: {
requests: { cpu: "100m", memory: "256M", },
limits: { cpu: "200m", memory: "512M", },
},
},
},
volumes_: {
data: kube.PersistentVolumeClaimVolume(grafana.pvc),
config: kube.SecretVolume(grafana.config) {
secret+: {
items: [
{ key: "grafana.ini", path: "grafana.ini", },
{ key: "datasources.yaml", path: "provisioning/datasources/datasources.yaml", },
],
},
},
},
},
},
},
},
service: ns.Contain(kube.Service("grafana-public")) {
target_pod: grafana.deploy.spec.template,
spec+: {
ports: [
{ name: "public", port: 3000, targetPort: 3000, protocol: "TCP" },
],
},
},
ingress: ns.Contain(kube.Ingress("grafana-public")) {
metadata+: {
annotations+: {
"kubernetes.io/tls-acme": "true",
"certmanager.k8s.io/cluster-issuer": "letsencrypt-prod",
},
},
spec+: {
tls: [
{ hosts: [cfg.hosts.globalDashboard], secretName: "ingress-grafana-tls" },
],
rules: [
{
host: cfg.hosts.globalDashboard,
http: {
paths: [ { path: "/", backend: { serviceName: grafana.service.metadata.name, servicePort: 3000 } }, ],
},
}
],
},
},
},
}
}
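
For reference, grafana.ini reaches the container by serializing the ini:: object above with std.manifestIni into the grafana-config secret and pointing GF_PATHS_CONFIG at the mounted file. A minimal sketch of that serialization, using just two of the sections and values from this change:

    std.manifestIni({
      sections: {
        "auth": { oauth_auto_login: true },
        "server": { domain: "monitoring-global-dashboard.k0.hswaw.net" },
      },
    })
    // renders roughly as:
    //   [auth]
    //   oauth_auto_login = true
    //   [server]
    //   domain = monitoring-global-dashboard.k0.hswaw.net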

View File

@@ -0,0 +1,40 @@
-----BEGIN PGP MESSAGE-----
hQEMAzhuiT4RC8VbAQf9Ei3B4VGp5X1sBBvdpD0P1gbZcOMuQrChLKf4WFTkJ31V
7iK88YzJXM1VN0/GdTS4xk30D9Bh6nkbyWqSQ6e5mI6rU06DHjEF4nH/rCVNNImx
2lsAfHkvyBYV2rzMD+v7o/WWcR0RzemtopJvJXahM39Dd4WKQEqilcvwFM3p/zAG
p9svNEpangRCw4viNeP8RzBIHl6d73gcLwYtlmmj/URR4hVh0QByvJE+8tZJaelg
D2ILnnv30If51H6iRjUSdQYiScPyAc0Ooe7nLNyiZJHe2unv1wpFK/ppW5nTLc6J
Jl3ku5k5Fza5GLImxT+r3LFaGCUZwI2Ilh+aixOd8YUBDANcG2tp6fXqvgEIAM4s
Vty4caVhY8wIK4shv+2N8VXxaa8AHBMycfsAdrMG7ohrVLBJcNCs2CfYDRcLLxXq
y/PU53hffCgg19g1np+8rsYis5JXS8Uqri/54T/S4cMid1UaCq2BIs+1A/9j780G
4GGArAFDS451t5QjWzXl2W0ZVTeTSVC3s93psht10cZt8APAxlefkoPwSbb2kYz5
CCOmUGGLwHB87xBl0jRZ55A2Qe77637YEvbRBr79OhztSIJ1WJjkNFLqOVbCDcR0
IH9kVES2fN/4KCI772P+Rmh330B13UHk9xnu1xEJsi57HjCof+zwGvmEfNrKtS9d
knHAlDPycEVnQMDVNUOFAgwDodoT8VqRl4UBD/902MbY7Psg+wm7s1ybsclWRA1q
lJToPhB1NeDhdh/9l51kWT5JvUjS6jCvoGHyJvnxXR6Ot3i+8mjEiHZf6amu5gvq
skvzQwt+XwtIOaUxJChfRhk+GoyT6EpSHXYDNWKfWPG4gUaM42o8S7BObyjGjwXE
kTf3bvw50YNqJo7DmSJ1yS/sY4/J9wWT0jz0jSc9PjpAI9qw8vbWSrfbMa7EWos3
ENyIDl0GlF5S13J5GtyOCQLh9TsHi+zCe/jhmu4uhSeHxyuGru+UvNE1ME0XIUAS
fUJ5dLIfdLH+ILBRBZ+G0XRT/3XkWlyhuRZf7ALU3tG1wXRV1evc0zv7kEcz2hQm
gUPXkZzcFIG1cO3r9FhBvAM86p+UHSdsXdRXSVWsH12QFDjv8ZollPzO3ZztQI6a
R6E3WQ1nyiFjVTHKrCus89UDqBtAiYujfuwLcDYP9wMBW7JpETd1qurccSnL3duh
3jkKGHeskQPkB9UrT1P66zUjT/gAFDy5/sfVxoO5y+jPAJS9owYrONAoQtTL0HcA
4ixmaDb3ZzBt1LAfDDlGSjt4agQVfVLeGPF/zrFS4GrqzPDREyfTAsYdokA+y0LM
XI6mSsHd01HPGpRbsE5ABOO88sqRnuD8KBxWpgaG+Z8zn1uuf7n1L2JRWpFcd8h/
C09qbhK0+9C80HBZqoUCDAPiA8lOXOuz7wEP/R81sepe2UgcwMuBQmrn30y+kN0i
93zhYDVJFYUF07b7ociu2OnGFCnFF3ZQNao3ZvSuKoCKkQvcf7mxHA9xkFjiwGAi
elhHDQcUt8IriosGNhSArujEZ1kc1Nk9MWQKRSLhVXNtdTrn4e15OPXO+AR7CszW
Kz9Mwo9BNPzu7Zwq1JfUOExpDPT6fPVHZNnzg3KU4s2HRcrLD9JEE2i2/VxbmszH
aTy+/1kF8hHSfRV0Q7NcjRAbztWrd47HqsWmmWzjcjnSKNV1n7P5AcB06Yjdf0+0
xEuehwseJs6OhL3MxCsQoFuM9xhm7W/rfGQe+JvJc9Hxb60AgoMGJ1GSHz8xhjyx
EOujnIabcUeOm0h0twEi98+OJTlKss1YPdcKMPCit7SJZX8k6t2deOp8t0x9R5hH
v30DRSVgNeqDkBK0dEouR3xLzNz8yardFqVpM88w4D/npUQ5RB6+1af5LFYrm4zG
kEit4bYdJVpfgt0ZRFoyWaAiAt07ARFmoWeQRRDrpbX+ddKAFmvHl0oyRy/QF2xx
P6YT8UyEDNraXchAf4cBjuCuiRyqVqaPAOLp3rKmEBBiXddRX9fsq24/9X5QY4o8
Kemf0fbH9ndsL4vPrJI/j7nvbgq2dpFuHlnFgE5EUEFoPcDI1GI6hUr5UUffnjzM
aOPp1vxrxhwQy0IG0nQBTdekgVaPiqP+AxVfQbSjz6zNotSJMPAvbx1aNWmxesXO
eZYeMaSVRnSHub97eb6hn167olrcrAzPxFssb7iTEQh2Xs6PeWbe0FsTz0Fim/yY
iIw5GlFw15/afo86hbDgrK0j2ZiafKvZYC2EtKoYGzAoxA==
=8iTI
-----END PGP MESSAGE-----