# Top level cluster configuration.

local kube = import "../../kube/kube.libsonnet";
local policies = import "../../kube/policies.libsonnet";

local calico = import "lib/calico.libsonnet";
local certmanager = import "lib/cert-manager.libsonnet";
local cockroachdb = import "lib/cockroachdb.libsonnet";
local coredns = import "lib/coredns.libsonnet";
local metallb = import "lib/metallb.libsonnet";
local metrics = import "lib/metrics.libsonnet";
local nginx = import "lib/nginx.libsonnet";
local prodvider = import "lib/prodvider.libsonnet";
local registry = import "lib/registry.libsonnet";
local rook = import "lib/rook.libsonnet";
local pki = import "lib/pki.libsonnet";

local Cluster(short, realm) = {
    local cluster = self,
    local cfg = cluster.cfg,

    short:: short,
    realm:: realm,
    fqdn:: "%s.%s" % [cluster.short, cluster.realm],
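    // For illustration: Cluster("k0", "hswaw.net") yields fqdn "k0.hswaw.net", which is
    // used below for the API server certificate CN, cluster DNS domains and the registry domain.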

    cfg:: {
        // Storage class used for internal services (like the registry). This must be
        // set to a valid storage class: either a cloud provider class (when running
        // on GKE & co.) or a storage class created using Rook.
        storageClassNameParanoid: error "storageClassNameParanoid must be set",
    },
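    // Set per cluster at instantiation time, e.g. (as done for k0 at the bottom of
    // this file):
    //   Cluster("k0", "hswaw.net") { cfg+: { storageClassNameParanoid: "waw-hdd-paranoid-2" } }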

    // These are required to let the API Server contact kubelets.
    crAPIServerToKubelet: kube.ClusterRole("system:kube-apiserver-to-kubelet") {
        metadata+: {
            annotations+: {
                "rbac.authorization.kubernetes.io/autoupdate": "true",
            },
            labels+: {
                "kubernetes.io/bootstrapping": "rbac-defaults",
            },
        },
        rules: [
            {
                apiGroups: [""],
                resources: ["nodes/%s" % r for r in [ "proxy", "stats", "log", "spec", "metrics" ]],
                verbs: ["*"],
            },
        ],
    },
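    // Without the above role being bound, requests that the API server proxies to
    // kubelets (e.g. kubectl logs / kubectl exec) can fail with authorization errors.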

    crbAPIServer: kube.ClusterRoleBinding("system:kube-apiserver") {
        roleRef: {
            apiGroup: "rbac.authorization.k8s.io",
            kind: "ClusterRole",
            name: cluster.crAPIServerToKubelet.metadata.name,
        },
        subjects: [
            {
                apiGroup: "rbac.authorization.k8s.io",
                kind: "User",
                // The API server authenticates with a certificate whose CN is equal to the cluster's FQDN.
                name: cluster.fqdn,
            },
        ],
    },

    // This ClusterRole is bound to all humans that log in via prodaccess/prodvider/SSO.
    // It should allow viewing of non-sensitive data for debuggability and openness.
    crViewer: kube.ClusterRole("system:viewer") {
        rules: [
            {
                apiGroups: [""],
                resources: [
                    "nodes",
                    "namespaces",
                    "pods",
                    "configmaps",
                    "services",
                ],
                verbs: ["list"],
            },
            {
                apiGroups: ["metrics.k8s.io"],
                resources: [
                    "nodes",
                    "pods",
                ],
                verbs: ["list"],
            },
            {
                apiGroups: ["apps"],
                resources: [
                    "statefulsets",
                ],
                verbs: ["list"],
            },
            {
                apiGroups: ["extensions"],
                resources: [
                    "deployments",
                    "ingresses",
                ],
                verbs: ["list"],
            },
        ],
    },

    // This ClusterRole is applied to all humans, scoped (via a RoleBinding) to their personal namespace.
    crFullInNamespace: kube.ClusterRole("system:admin-namespace") {
        rules: [
            {
                apiGroups: ["*"],
                resources: ["*"],
                verbs: ["*"],
            },
        ],
    },

    // This ClusterRoleBinding grants root access to cluster admins.
    crbAdmins: kube.ClusterRoleBinding("system:admins") {
        roleRef: {
            apiGroup: "rbac.authorization.k8s.io",
            kind: "ClusterRole",
            name: "cluster-admin",
        },
        subjects: [
            {
                apiGroup: "rbac.authorization.k8s.io",
                kind: "User",
                name: user + "@hackerspace.pl",
            } for user in [
                "q3k",
                "implr",
                "informatic",
            ]
        ],
    },

    // Cluster-wide pod security policies (from kube/policies.libsonnet).
    podSecurityPolicies: policies.Cluster {},

    // Namespaces exempted from the secure-by-default policies.
    allowInsecureNamespaces: [
        policies.AllowNamespaceInsecure("kube-system"),
        // TODO(q3k): fix this?
        policies.AllowNamespaceInsecure("ceph-waw2"),
        policies.AllowNamespaceInsecure("matrix"),
        policies.AllowNamespaceInsecure("registry"),
        policies.AllowNamespaceInsecure("internet"),
    ],

    // Allow all service accounts (thus all controllers) to create secure pods.
    crbAllowServiceAccountsSecure: kube.ClusterRoleBinding("policy:allow-all-secure") {
        roleRef_: cluster.podSecurityPolicies.secureRole,
        subjects: [
            {
                kind: "Group",
                apiGroup: "rbac.authorization.k8s.io",
                name: "system:serviceaccounts",
            }
        ],
    },
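    // (roleRef_ with a trailing underscore is kube.libsonnet convenience sugar: it
    // presumably expands the given role object into the corresponding roleRef triple.)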

    // Calico network fabric
    calico: calico.Environment {},

    // CoreDNS for this cluster.
    dns: coredns.Environment {
        cfg+: {
            cluster_domains: [
                "cluster.local",
                cluster.fqdn,
            ],
        },
    },
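    // With the above, services resolve under both domains, e.g.
    // kubernetes.default.svc.cluster.local and kubernetes.default.svc.k0.hswaw.net on k0
    // (the latter form is used by prodvider below).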

    // Metrics Server
    metrics: metrics.Environment {},

    // Metal Load Balancer
    metallb: metallb.Environment {
        cfg+: {
            addressPools: [
                { name: "public-v4-1", protocol: "layer2", addresses: ["185.236.240.50-185.236.240.63"] },
            ],
        },
    },
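    // In layer2 mode, MetalLB announces LoadBalancer IPs from this pool
    // (185.236.240.50-63) via ARP from whichever node currently owns the service.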

    // Main nginx Ingress Controller
    nginx: nginx.Environment {},

    // cert-manager, for issuing ACME (Let's Encrypt) certificates.
    certmanager: certmanager.Environment {},

    // Cluster-wide Let's Encrypt issuer, using HTTP-01 challenges.
    issuer: kube.ClusterIssuer("letsencrypt-prod") {
        spec: {
            acme: {
                server: "https://acme-v02.api.letsencrypt.org/directory",
                email: "bofh@hackerspace.pl",
                privateKeySecretRef: {
                    name: "letsencrypt-prod",
                },
                http01: {},
            },
        },
    },
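    // Ingresses obtain certificates by referencing this issuer; with cert-manager of
    // this era, presumably via the certmanager.k8s.io/cluster-issuer: "letsencrypt-prod"
    // annotation on the Ingress.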

    // Rook Ceph storage
    rook: rook.Operator {
        operator+: {
            spec+: {
                // TODO(q3k): Bring up the operator again when stability gets fixed
                // See: https://github.com/rook/rook/issues/3059#issuecomment-492378873
                replicas: 1,
            },
        },
    },

    // Docker registry
    registry: registry.Environment {
        cfg+: {
            domain: "registry.%s" % [cluster.fqdn],
            storageClassName: cfg.storageClassNameParanoid,
            objectStorageName: "waw-hdd-redundant-2-object",
        },
    },
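    // For k0 this evaluates to a registry serving at registry.k0.hswaw.net.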

    // TLS PKI machinery
    pki: pki.Environment(cluster.short, cluster.realm),
2019-08-29 18:12:24 +00:00
|
|
|
// Prodvider
|
2019-10-04 11:46:39 +00:00
|
|
|
prodvider: prodvider.Environment {
|
|
|
|
cfg+: {
|
|
|
|
apiEndpoint: "kubernetes.default.svc.%s" % [cluster.fqdn],
|
|
|
|
},
|
|
|
|
},
};


{
    k0: {
        local k0 = self,

        cluster: Cluster("k0", "hswaw.net") {
            cfg+: {
                storageClassNameParanoid: k0.ceph.blockParanoid.name,
            },
        },

        // CockroachDB cluster (crdb-waw1): one instance pinned to each bc01 node,
        // storing data on local disk under hostPath.
        cockroach: {
            waw2: cockroachdb.Cluster("crdb-waw1") {
                cfg+: {
                    topology: [
                        { name: "bc01n01", node: "bc01n01.hswaw.net" },
                        { name: "bc01n02", node: "bc01n02.hswaw.net" },
                        { name: "bc01n03", node: "bc01n03.hswaw.net" },
                    ],
                    hostPath: "/var/db/crdb-waw1",
                },
            },

            // Per-application CockroachDB clients.
            clients: {
                cccampix: k0.cockroach.waw2.Client("cccampix"),
                cccampixDev: k0.cockroach.waw2.Client("cccampix-dev"),
            },
        },

        ceph: {
            // waw1 cluster: dead as of 2019/08/06 (data corruption).
            // waw2 cluster:
            waw2: rook.Cluster(k0.cluster.rook, "ceph-waw2") {
                spec: {
                    mon: {
                        count: 3,
                        allowMultiplePerNode: false,
                    },
                    storage: {
                        useAllNodes: false,
                        useAllDevices: false,
                        config: {
                            databaseSizeMB: "1024",
                            journalSizeMB: "1024",
                        },
                        nodes: [
                            {
                                name: "bc01n01.hswaw.net",
                                location: "rack=dcr01 chassis=bc01 host=bc01n01",
                                devices: [ { name: "sda" } ],
                            },
                            {
                                name: "bc01n02.hswaw.net",
                                location: "rack=dcr01 chassis=bc01 host=bc01n02",
                                devices: [ { name: "sda" } ],
                            },
                            {
                                name: "bc01n03.hswaw.net",
                                location: "rack=dcr01 chassis=bc01 host=bc01n03",
                                devices: [ { name: "sda" } ],
                            },
                        ],
                    },

                    // Benji: a backup system based on backy2, running as a k8s CronJob
                    // (hourly, at :42). Each run backs up all PVCs' underlying Ceph RBD
                    // images to Wasabi (our offsite S3-compatible storage), enforces the
                    // retention policy latest3,hours48,days7,months12 (the latest 3
                    // backups, then one per hour for 48 hours, one per day for 7 days,
                    // one per month for 12 months), prunes expired backups, and saves
                    // benji's own sqlite3 metadata database to Wasabi (unencrypted,
                    // which is fine: the metadata only contains image and pool names).
                    // See: https://benji-backup.me/
                    benji:: {
                        metadataStorageClass: "waw-hdd-paranoid-2",
                        // importstr keeps the file's trailing newline; split on '\n' and
                        // take the first line.
                        encryptionPassword: std.split((importstr "../secrets/plain/k0-benji-encryption-password"), '\n')[0],
                        // Ceph pools to back up.
                        pools: [
                            "waw-hdd-redundant-2",
                            "waw-hdd-redundant-2-metadata",
                            "waw-hdd-paranoid-2",
                            "waw-hdd-yolo-2",
                        ],
                        s3Configuration: {
                            awsAccessKeyId: "RPYZIROFXNLQVU2WJ4R3",
                            awsSecretAccessKey: std.split((importstr "../secrets/plain/k0-benji-secret-access-key"), '\n')[0],
                            bucketName: "benji-k0-backups",
                            endpointUrl: "https://s3.eu-central-1.wasabisys.com/",
                        },
                    },
                },
            },

            // redundant block storage
            blockRedundant: rook.ECBlockPool(k0.ceph.waw2, "waw-hdd-redundant-2") {
                spec: {
                    failureDomain: "host",
                    erasureCoded: {
                        dataChunks: 2,
                        codingChunks: 1,
                    },
                },
            },
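            // (2+1 erasure coding survives the loss of any single host at 1.5x raw
            // storage cost, versus 3x for the replicated "paranoid" pool below.)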

            // paranoid block storage (3 replicas)
            blockParanoid: rook.ReplicatedBlockPool(k0.ceph.waw2, "waw-hdd-paranoid-2") {
                spec: {
                    failureDomain: "host",
                    replicated: {
                        size: 3,
                    },
                },
            },

            // yolo block storage (no replicas!)
            blockYolo: rook.ReplicatedBlockPool(k0.ceph.waw2, "waw-hdd-yolo-2") {
                spec: {
                    failureDomain: "host",
                    replicated: {
                        size: 1,
                    },
                },
            },

            // redundant S3-compatible object storage
            objectRedundant: rook.S3ObjectStore(k0.ceph.waw2, "waw-hdd-redundant-2-object") {
                spec: {
                    metadataPool: {
                        failureDomain: "host",
                        replicated: { size: 3 },
                    },
                    dataPool: {
                        failureDomain: "host",
                        erasureCoded: {
                            dataChunks: 2,
                            codingChunks: 1,
                        },
                    },
                },
            },
        },

        // Used for owncloud.hackerspace.pl, which for now lives on boston-packets.hackerspace.pl.
        nextcloud: kube._Object("ceph.rook.io/v1", "CephObjectStoreUser", "nextcloud") {
            metadata+: {
                namespace: "ceph-waw2",
            },
            spec: {
                store: "waw-hdd-redundant-2-object",
                displayName: "nextcloud",
            },
        },
    },
}