// k0.hswaw.net kubernetes cluster
// This defines the cluster as a single object.
// Use the sibling k0*.jsonnet 'view' files to actually apply the configuration.
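//
// A minimal sketch of what such a view file might look like (the filename and
// the selected subtree are illustrative assumptions, not the repository's
// actual view files):
//
//   // k0-example.jsonnet (hypothetical)
//   local k0 = import "k0.libsonnet";
//   k0.k0.cluster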

local kube = import "../../kube/kube.libsonnet";
local policies = import "../../kube/policies.libsonnet";

local cluster = import "cluster.libsonnet";

local cockroachdb = import "lib/cockroachdb.libsonnet";
local registry = import "lib/registry.libsonnet";
local rook = import "lib/rook.libsonnet";

{
    k0: {
        local k0 = self,

        cluster: cluster.Cluster("k0", "hswaw.net") {
            cfg+: {
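                // Default storage class for paranoid PVCs, backed by the
                // replicated (size 3) waw-hdd-paranoid-2 pool defined in
                // waw2Pools below.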
                storageClassNameParanoid: k0.ceph.waw2Pools.blockParanoid.name,
            },
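            // MetalLB speakers peer over BGP with the upstream router at
            // 185.236.240.33 (AS65001), announcing LoadBalancer IPs from the
            // address pools below as AS65002.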
            metallb+: {
                cfg+: {
                    peers: [
                        {
                            "peer-address": "185.236.240.33",
                            "peer-asn": 65001,
                            "my-asn": 65002,
                        },
                    ],
                    addressPools: [
                        {
                            name: "public-v4-1",
                            protocol: "bgp",
                            addresses: [
                                "185.236.240.48/28",
                            ],
                        },
                        {
                            name: "public-v4-2",
                            protocol: "bgp",
                            addresses: [
                                "185.236.240.112/28",
                            ],
                        },
                    ],
                },
            },
        },

        // Docker registry
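        // Served at registry.<cluster fqdn> (i.e. registry.k0.hswaw.net), with
        // image data kept in the waw-hdd-redundant-2-object radosgw store
        // defined in waw2Pools below.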
        registry: registry.Environment {
            cfg+: {
                domain: "registry.%s" % [k0.cluster.fqdn],
                storageClassName: k0.cluster.cfg.storageClassNameParanoid,
                objectStorageName: "waw-hdd-redundant-2-object",
            },
        },

        // CockroachDB, running on bc01n{01,02,03}.
        cockroach: {
            waw2: cockroachdb.Cluster("crdb-waw1") {
                cfg+: {
                    topology: [
                        { name: "bc01n01", node: "bc01n01.hswaw.net" },
                        { name: "bc01n02", node: "bc01n02.hswaw.net" },
                        { name: "bc01n03", node: "bc01n03.hswaw.net" },
                    ],
                    // Host path on SSD.
                    hostPath: "/var/db/crdb-waw1",
                },
            },
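            // Per-application database clients; each Client() presumably emits
            // the credentials/certificates its application uses to reach
            // crdb-waw1 (see lib/cockroachdb.libsonnet for the actual shape).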
            clients: {
                cccampix: k0.cockroach.waw2.Client("cccampix"),
                cccampixDev: k0.cockroach.waw2.Client("cccampix-dev"),
                buglessDev: k0.cockroach.waw2.Client("bugless-dev"),
                sso: k0.cockroach.waw2.Client("sso"),
            },
        },

        ceph: {
            // waw1 cluster - dead as of 2019/08/06, data corruption
            // waw2 cluster: shitty 7200RPM 2.5" HDDs
            waw2: rook.Cluster(k0.cluster.rook, "ceph-waw2") {
                spec: {
                    mon: {
                        count: 3,
                        allowMultiplePerNode: false,
                    },
                    storage: {
                        useAllNodes: false,
                        useAllDevices: false,
                        config: {
                            databaseSizeMB: "1024",
                            journalSizeMB: "1024",
                        },
                        nodes: [
                            {
                                name: "bc01n01.hswaw.net",
                                location: "rack=dcr01 chassis=bc01 host=bc01n01",
                                devices: [ { name: "sda" } ],
                            },
                            {
                                name: "bc01n02.hswaw.net",
                                location: "rack=dcr01 chassis=bc01 host=bc01n02",
                                devices: [ { name: "sda" } ],
                            },
                            {
                                name: "bc01n03.hswaw.net",
                                location: "rack=dcr01 chassis=bc01 host=bc01n03",
                                devices: [ { name: "sda" } ],
                            },
                        ],
                    },
                    // Benji (https://benji-backup.me/) backs up Ceph RBD volumes
                    // from Rook into Wasabi, our offsite S3-compatible storage
                    // provider. It runs as a Kubernetes CronJob every hour at 42
                    // minutes past, which:
                    //  - runs benji-pvc-backup, iterating over all PVCs in k8s
                    //    and backing up their PVs to Wasabi,
                    //  - runs benji enforce, marking backups outside the
                    //    retention policy (latest3,hours48,days7,months12) as to
                    //    be deleted,
                    //  - runs benji cleanup, removing unneeded backups,
                    //  - runs a custom script that backs up benji's sqlite3
                    //    database into Wasabi (unencrypted, but the metadata only
                    //    contains image/pool names, i.e. Ceph PV and pool names).
                    benji:: {
                        metadataStorageClass: "waw-hdd-paranoid-2",
                        encryptionPassword: std.split((importstr "../secrets/plain/k0-benji-encryption-password"), '\n')[0],
                        pools: [
                            "waw-hdd-redundant-2",
                            "waw-hdd-redundant-2-metadata",
                            "waw-hdd-paranoid-2",
                            "waw-hdd-yolo-2",
                        ],
                        s3Configuration: {
                            awsAccessKeyId: "RPYZIROFXNLQVU2WJ4R3",
                            awsSecretAccessKey: std.split((importstr "../secrets/plain/k0-benji-secret-access-key"), '\n')[0],
                            bucketName: "benji-k0-backups",
                            endpointUrl: "https://s3.eu-central-1.wasabisys.com/",
                        },
                    },
                },
            },

            waw2Pools: {
                // redundant block storage
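                // 2 data + 1 coding chunk erasure coding: tolerates the loss of
                // one host at 1.5x raw-space overhead (vs. 3x for 3-way
                // replication).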
                blockRedundant: rook.ECBlockPool(k0.ceph.waw2, "waw-hdd-redundant-2") {
                    spec: {
                        failureDomain: "host",
                        erasureCoded: {
                            dataChunks: 2,
                            codingChunks: 1,
                        },
                    },
                },
                // paranoid block storage (3 replicas)
                blockParanoid: rook.ReplicatedBlockPool(k0.ceph.waw2, "waw-hdd-paranoid-2") {
                    spec: {
                        failureDomain: "host",
                        replicated: {
                            size: 3,
                        },
                    },
                },
                // yolo block storage (no replicas!)
                blockYolo: rook.ReplicatedBlockPool(k0.ceph.waw2, "waw-hdd-yolo-2") {
                    spec: {
                        failureDomain: "host",
                        replicated: {
                            size: 1,
                        },
                    },
                },
                objectRedundant: rook.S3ObjectStore(k0.ceph.waw2, "waw-hdd-redundant-2-object") {
                    spec: {
                        metadataPool: {
                            failureDomain: "host",
                            replicated: { size: 3 },
                        },
                        dataPool: {
                            failureDomain: "host",
                            erasureCoded: {
                                dataChunks: 2,
                                codingChunks: 1,
                            },
                        },
                    },
                },
            },

            // waw3: 6TB SAS 3.5" HDDs
            waw3: rook.Cluster(k0.cluster.rook, "ceph-waw3") {
                spec: {
                    mon: {
                        count: 3,
                        allowMultiplePerNode: false,
                    },
                    storage: {
                        useAllNodes: false,
                        useAllDevices: false,
                        config: {
                            databaseSizeMB: "1024",
                            journalSizeMB: "1024",
                        },
                        nodes: [
                            {
                                name: "dcr01s22.hswaw.net",
                                location: "rack=dcr01 host=dcr01s22",
                                devices: [
                                    // https://github.com/rook/rook/issues/1228
                                    //{ name: "disk/by-id/wwan-0x" + wwan }
                                    //for wwan in [
                                    //    "5000c5008508c433",
                                    //    "5000c500850989cf",
                                    //    "5000c5008508f843",
                                    //    "5000c5008508baf7",
                                    //]
                                    { name: "sdn" },
                                    { name: "sda" },
                                    { name: "sdb" },
                                    { name: "sdc" },
                                ],
                            },
                            {
                                name: "dcr01s24.hswaw.net",
                                location: "rack=dcr01 host=dcr01s24",
                                devices: [
                                    // https://github.com/rook/rook/issues/1228
                                    //{ name: "disk/by-id/wwan-0x" + wwan }
                                    //for wwan in [
                                    //    "5000c5008508ee03",
                                    //    "5000c5008508c9ef",
                                    //    "5000c5008508df33",
                                    //    "5000c5008508dd3b",
                                    //]
                                    { name: "sdm" },
                                    { name: "sda" },
                                    { name: "sdb" },
                                    { name: "sdc" },
                                ],
                            },
                        ],
                    },
                    benji:: {
                        metadataStorageClass: "waw-hdd-redundant-3",
                        encryptionPassword: std.split((importstr "../secrets/plain/k0-benji-encryption-password"), '\n')[0],
                        pools: [
                            "waw-hdd-redundant-3",
                            "waw-hdd-redundant-3-metadata",
                            "waw-hdd-yolo-3",
                        ],
                        s3Configuration: {
                            awsAccessKeyId: "RPYZIROFXNLQVU2WJ4R3",
                            awsSecretAccessKey: std.split((importstr "../secrets/plain/k0-benji-secret-access-key"), '\n')[0],
                            bucketName: "benji-k0-backups-waw3",
                            endpointUrl: "https://s3.eu-central-1.wasabisys.com/",
                        },
                    },
                },
            },

            waw3Pools: {
                // redundant block storage
                blockRedundant: rook.ECBlockPool(k0.ceph.waw3, "waw-hdd-redundant-3") {
                    metadataReplicas: 2,
                    spec: {
                        failureDomain: "host",
                        replicated: {
                            size: 2,
                        },
                    },
                },
                // yolo block storage (low usage, no host redundancy)
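                // 12 data + 4 coding chunk erasure coding (~1.33x overhead),
                // placed per OSD rather than per host; losing a whole host can
                // take out more chunks than the code recovers from, hence no
                // host-level redundancy.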
                blockYolo: rook.ReplicatedBlockPool(k0.ceph.waw3, "waw-hdd-yolo-3") {
                    spec: {
                        failureDomain: "osd",
                        erasureCoded: {
                            dataChunks: 12,
                            codingChunks: 4,
                        },
                    },
                },
                objectRedundant: rook.S3ObjectStore(k0.ceph.waw3, "waw-hdd-redundant-3-object") {
                    spec: {
                        metadataPool: {
                            failureDomain: "host",
                            replicated: { size: 2 },
                        },
                        dataPool: {
                            failureDomain: "host",
                            replicated: { size: 2 },
                        },
                    },
                },
            },

            // Clients for S3/radosgw storage.
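            // For each CephObjectStoreUser, Rook generates radosgw credentials
            // and exposes them as a Secret in the same namespace (typically
            // named rook-ceph-object-user-<store>-<user>; treat the exact name
            // as an assumption, not something this file defines).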
            clients: {
                # Used for owncloud.hackerspace.pl, which for now lives on boston-packets.hackerspace.pl.
                nextcloudWaw3: kube.CephObjectStoreUser("nextcloud") {
                    metadata+: {
                        namespace: "ceph-waw3",
                    },
                    spec: {
                        store: "waw-hdd-redundant-3-object",
                        displayName: "nextcloud",
                    },
                },

                # nuke@hackerspace.pl's personal storage.
                nukePersonalWaw3: kube.CephObjectStoreUser("nuke-personal") {
                    metadata+: {
                        namespace: "ceph-waw3",
                    },
                    spec: {
                        store: "waw-hdd-redundant-3-object",
                        displayName: "nuke-personal",
                    },
                },

                # patryk@hackerspace.pl's ArmA3 mod bucket.
                cz2ArmaModsWaw3: kube.CephObjectStoreUser("cz2-arma3mods") {
                    metadata+: {
                        namespace: "ceph-waw3",
                    },
                    spec: {
                        store: "waw-hdd-redundant-3-object",
                        displayName: "cz2-arma3mods",
                    },
                },

                # Buckets for Spark pipelines.
                # TODO(implr): consider a second, yolo-backed one for temp data
                implrSparkWaw3: kube.CephObjectStoreUser("implr-spark") {
                    metadata+: {
                        namespace: "ceph-waw3",
                    },
                    spec: {
                        store: "waw-hdd-redundant-3-object",
                        displayName: "implr-spark",
                    },
                },
            },
        },

        # These are policies allowing insecure pods in some namespaces.
        # A lot of them are spurious and come from the fact that we deployed
        # these namespaces before we deployed the draconian PodSecurityPolicy
        # we have now. This should be fixed by setting up more granular
        # policies, or by fixing the workloads to not need whatever permission
        # bits they currently use.
        # TODO(q3k): fix this?
        unnecessarilyInsecureNamespaces: [
            policies.AllowNamespaceInsecure("ceph-waw2"),
            policies.AllowNamespaceInsecure("ceph-waw3"),
            policies.AllowNamespaceInsecure("matrix"),
            policies.AllowNamespaceInsecure("registry"),
            policies.AllowNamespaceInsecure("internet"),
            # TODO(implr): restricted policy with CAP_NET_ADMIN and tuntap, but no full root
            policies.AllowNamespaceInsecure("implr-vpn"),
        ],
    },
}