*: tear down ceph-waw2

It had crapped out to the point where the OSDs' spurious IOPS were
killing the performance of other disks colocated on the same M610 RAID
controllers. This made etcd _very_ slow, to the point of churning
through leader re-elections due to timeouts.

etcd/apiserver latencies; observe the difference at ~15:38:

https://object.ceph-waw3.hswaw.net/q3k-personal/4fbe8d4cfc8193cad307d487371b4e44358b931a7494aa88aff50b13fae9983c.png

I moved the gerrit/* and matrix/appservice-irc-freenode PVCs to
ceph-waw3 by hand. The rest were non-critical, so I removed them; they
can be recovered from benji backups if needed.
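
For reference, the destination pool on ceph-waw3 is not touched by this
change. Assuming it is declared with the same rook.ECBlockPool helper and
the same 2+1 erasure-coded, host-failure-domain profile as the waw2 pool
removed below, it would look roughly like the sketch here (the field name
blockRedundant and the EC parameters are assumptions, not the actual
definition):

    // Sketch only: the real waw-hdd-redundant-3 pool lives outside this
    // diff. Assumes the same rook.ECBlockPool helper and the same 2+1 EC,
    // host failure domain layout as the removed waw2 pool.
    blockRedundant: rook.ECBlockPool(k0.ceph.waw3, "waw-hdd-redundant-3") {
        spec: {
            failureDomain: "host",
            erasureCoded: {
                dataChunks: 2,
                codingChunks: 1,
            },
        },
    },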

Change-Id: Iffbe87aefc06d8324a82b958a579143b7dd9914c
q3k 2021-01-22 16:26:07 +01:00
parent 856b284e29
commit 61f978a0a0
3 changed files with 3 additions and 97 deletions

@@ -26,8 +26,7 @@ matrix {
     "irc-freenode": irc.AppServiceIrc("freenode") {
         cfg+: {
             image: cfg.images.appserviceIRC,
-            // TODO(q3k): move this appservice to waw-hdd-redundant-3
-            storageClassName: "waw-hdd-paranoid-2",
+            storageClassName: "waw-hdd-redundant-3",
             metadata: app.metadata("appservice-irc-freenode"),
             // TODO(q3k): add labels to blessed nodes
             nodeSelector: {

@@ -85,99 +85,7 @@ local rook = import "lib/rook.libsonnet";
     ceph: {
         // waw1 cluster - dead as of 2019/08/06, data corruption
-        // waw2 cluster: shitty 7200RPM 2.5" HDDs
-        waw2: rook.Cluster(k0.cluster.rook, "ceph-waw2") {
-            spec: {
-                // This cluster is quite broken. We're just keeping it around
-                // for the hell of it.
-                continueUpgradeAfterChecksEvenIfNotHealthy: true,
-                mon: {
-                    count: 3,
-                    allowMultiplePerNode: false,
-                },
-                storage: {
-                    useAllNodes: false,
-                    useAllDevices: false,
-                    config: {
-                        databaseSizeMB: "1024",
-                        journalSizeMB: "1024",
-                    },
-                    nodes: [
-                        {
-                            name: "bc01n01.hswaw.net",
-                            location: "rack=dcr01 chassis=bc01 host=bc01n01",
-                            devices: [ { name: "sda" } ],
-                        },
-                        {
-                            name: "bc01n02.hswaw.net",
-                            location: "rack=dcr01 chassis=bc01 host=bc01n02",
-                            devices: [ { name: "sda" } ],
-                        },
-                    ],
-                },
-                benji:: {
-                    metadataStorageClass: "waw-hdd-paranoid-2",
-                    encryptionPassword: std.split((importstr "../secrets/plain/k0-benji-encryption-password"), '\n')[0],
-                    pools: [
-                        "waw-hdd-redundant-2",
-                        "waw-hdd-redundant-2-metadata",
-                        "waw-hdd-paranoid-2",
-                        "waw-hdd-yolo-2",
-                    ],
-                    s3Configuration: {
-                        awsAccessKeyId: "RPYZIROFXNLQVU2WJ4R3",
-                        awsSecretAccessKey: std.split((importstr "../secrets/plain/k0-benji-secret-access-key"), '\n')[0],
-                        bucketName: "benji-k0-backups",
-                        endpointUrl: "https://s3.eu-central-1.wasabisys.com/",
-                    },
-                }
-            },
-        },
-        waw2Pools: {
-            // redundant block storage
-            blockRedundant: rook.ECBlockPool(k0.ceph.waw2, "waw-hdd-redundant-2") {
-                spec: {
-                    failureDomain: "host",
-                    erasureCoded: {
-                        dataChunks: 2,
-                        codingChunks: 1,
-                    },
-                },
-            },
-            // paranoid block storage (3 replicas)
-            blockParanoid: rook.ReplicatedBlockPool(k0.ceph.waw2, "waw-hdd-paranoid-2") {
-                spec: {
-                    failureDomain: "host",
-                    replicated: {
-                        size: 3,
-                    },
-                },
-            },
-            // yolo block storage (no replicas!)
-            blockYolo: rook.ReplicatedBlockPool(k0.ceph.waw2, "waw-hdd-yolo-2") {
-                spec: {
-                    failureDomain: "host",
-                    replicated: {
-                        size: 1,
-                    },
-                },
-            },
-            objectRedundant: rook.S3ObjectStore(k0.ceph.waw2, "waw-hdd-redundant-2-object") {
-                spec: {
-                    metadataPool: {
-                        failureDomain: "host",
-                        replicated: { size: 3 },
-                    },
-                    dataPool: {
-                        failureDomain: "host",
-                        erasureCoded: {
-                            dataChunks: 2,
-                            codingChunks: 1,
-                        },
-                    },
-                },
-            },
-        },
+        // waw2 cluster - dead as of 2021/01/22, torn down (horrible M610 RAID controllers are horrible)
         // waw3: 6TB SAS 3.5" HDDs
         waw3: rook.Cluster(k0.cluster.rook, "ceph-waw3") {
@@ -381,7 +289,6 @@ local rook = import "lib/rook.libsonnet";
     # bits they use, whatever those might be.
     # TODO(q3k): fix this?
     unnecessarilyInsecureNamespaces: [
-        policies.AllowNamespaceInsecure("ceph-waw2"),
         policies.AllowNamespaceInsecure("ceph-waw3"),
         policies.AllowNamespaceInsecure("matrix"),
         policies.AllowNamespaceInsecure("registry"),

@@ -11,7 +11,7 @@ local gerrit = import "gerrit.libsonnet";
        domain: "gerrit.hackerspace.pl",
        identity: "7b6244cf-e30b-42c5-ba91-c329ef4e6cf1",
-       storageClassName: "waw-hdd-paranoid-2",
+       storageClassName: "waw-hdd-redundant-3",
        secureSecret: "gerrit",
    },