diff --git a/cluster/clustercfg/BUILD b/cluster/clustercfg/BUILD index e207c4db..e7b308ef 100644 --- a/cluster/clustercfg/BUILD +++ b/cluster/clustercfg/BUILD @@ -1,19 +1,25 @@ -load("@pydeps//:requirements.bzl", "requirement") +load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library") -py_binary( - name = "clustercfg", - python_version = "PY3", +go_library( + name = "go_default_library", srcs = [ - "clustercfg.py", - "ca.py", + "cmd_admincreds.go", + "cmd_gencerts.go", + "main.go", ], - visibility = ["//visibility:public"], + importpath = "code.hackerspace.pl/hscloud/cluster/clustercfg", + visibility = ["//visibility:private"], deps = [ - requirement("cffi"), - requirement("fabric"), - requirement("idna"), - requirement("six"), - "//tools:secretstore_lib", - "//tools/hscloud:python", + "//cluster/clustercfg/certs:go_default_library", + "//go/workspace:go_default_library", + "@com_github_spf13_cobra//:go_default_library", + "@io_k8s_client_go//tools/clientcmd:go_default_library", + "@io_k8s_client_go//tools/clientcmd/api:go_default_library", ], ) + +go_binary( + name = "clustercfg", + embed = [":go_default_library"], + visibility = ["//visibility:public"], +) diff --git a/cluster/clustercfg/ca.py b/cluster/clustercfg/ca.py deleted file mode 100644 index 0107080a..00000000 --- a/cluster/clustercfg/ca.py +++ /dev/null @@ -1,315 +0,0 @@ -# encoding: utf-8 -from datetime import datetime, timezone -import json -import logging -import os -from six import StringIO -import subprocess -import tempfile - - -logger = logging.getLogger(__name__) - - -_std_subj = { - "C": "PL", - "ST": "Mazowieckie", - "L": "Warsaw", - "O": "Warsaw Hackerspace", - "OU": "clustercfg", -} - -_ca_csr = { - "CN": "Prototype Test Certificate Authority", - "key": { - "algo": "rsa", - "size": 2048 - }, - "names": [ _std_subj ], -} - -_ca_config = { - "signing": { - "default": { - "expiry": "168h" - }, - "profiles": { - "intermediate": { - "expiry": "8760h", - "usages": [ - "signing", - "key encipherment", - "cert sign", - "crl sign", - "server auth", - "client auth", - ], - "ca_constraint": { - "is_ca": True, - }, - }, - "server": { - "expiry": "8760h", - "usages": [ - "signing", - "key encipherment", - "server auth" - ] - }, - "client": { - "expiry": "8760h", - "usages": [ - "signing", - "key encipherment", - "client auth" - ] - }, - "client-server": { - "expiry": "8760h", - "usages": [ - "signing", - "key encipherment", - "server auth", - "client auth" - ] - } - } - } -} - - -class CAException(Exception): - pass - - -class CA(object): - def __init__(self, secretstore, certdir, short, cn): - self.ss = secretstore - self.cdir = certdir - self.short = short - self.cn = cn - self._init_ca() - - def __str__(self): - return 'CN={} ({})'.format(self.cn, self.short) - - @property - def _secret_key(self): - return 'ca-{}.key'.format(self.short) - - @property - def _cert(self): - return os.path.join(self.cdir, 'ca-{}.crt'.format(self.short)) - - @property - def cert_data(self): - with open(self._cert) as f: - return f.read() - - def _cfssl_call(self, args, obj=None, stdin=None): - p = subprocess.Popen(['cfssl'] + args, - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if obj is not None: - stdin = json.dumps(obj) - - outs, errs = p.communicate(stdin.encode()) - if p.returncode != 0: - raise Exception( - 'cfssl failed. stderr: %r, stdout: %r, code: %r' % ( - errs, outs, p.returncode)) - - out = json.loads(outs) - return out - - def _init_ca(self): - if self.ss.exists(self._secret_key): - return - - ca_csr = dict(_ca_csr) - ca_csr['CN'] = self.cn - - logger.info("{}: Generating CA...".format(self)) - out = self._cfssl_call(['gencert', '-initca', '-'], obj=ca_csr) - - f = self.ss.open(self._secret_key, 'w') - f.write(out['key']) - f.close() - - f = open(self._cert, 'w') - f.write(out['cert']) - f.close() - - def gen_key(self, hosts, o=_std_subj['O'], ou=_std_subj['OU'], save=None): - """お元気ですか?""" - cfg = { - "CN": hosts[0], - "hosts": hosts, - "key": { - "algo": "rsa", - "size": 4096, - }, - "names": [ - { - "C": _std_subj["C"], - "ST": _std_subj["ST"], - "L": _std_subj["L"], - "O": o, - "OU": ou, - }, - ], - } - cfg.update(_ca_config) - logger.info("{}: Generating key/CSR for {}".format(self, hosts)) - out = self._cfssl_call(['genkey', '-'], obj=cfg) - - key, csr = out['key'], out['csr'] - if save is not None: - logging.info("{}: Saving new key to secret {}".format(self, save)) - f = self.ss.open(save, 'w') - f.write(key) - f.close() - - return key, csr - - def gen_csr(self, key, hosts, o=_std_subj['O'], ou=_std_subj['OU']): - """ - Generate a CSR while already having a private key - for renewals, etc. - - TODO(q3k): this shouldn't be a CA method, but a cert method. - """ - cfg = { - "CN": hosts[0], - "hosts": hosts, - "key": { - "algo": "rsa", - "size": 4096, - }, - "names": [ - { - "C": _std_subj["C"], - "ST": _std_subj["ST"], - "L": _std_subj["L"], - "O": o, - "OU": ou, - }, - ], - } - cfg.update(_ca_config) - logger.info("{}: Generating CSR for {}".format(self, hosts)) - out = self._cfssl_call(['gencsr', '-key', key, '-'], obj=cfg) - - return out['csr'] - - def sign(self, csr, save=None, profile='client-server'): - logging.info("{}: Signing CSR".format(self)) - ca = self._cert - cakey = self.ss.plaintext(self._secret_key) - - config = tempfile.NamedTemporaryFile(mode='w') - json.dump(_ca_config, config) - config.flush() - - out = self._cfssl_call(['sign', '-ca=' + ca, '-ca-key=' + cakey, - '-profile='+profile, '-config='+config.name, '-'], stdin=csr) - cert = out['cert'] - if save is not None: - name = os.path.join(self.cdir, save) - logging.info("{}: Saving new certificate to {}".format(self, name)) - f = open(name, 'w') - f.write(cert) - f.close() - - config.close() - return cert - - def upload(self, c, remote_cert): - logger.info("Uploading CA {} to {}".format(self, remote_cert)) - c.put(local=self._cert, remote=remote_cert) - - def make_cert(self, *a, **kw): - return ManagedCertificate(self, *a, **kw) - - -class ManagedCertificate(object): - def __init__(self, ca, name, hosts, o=None, ou=None, profile='client-server'): - self.ca = ca - - self.hosts = hosts - self.name = name - self.key = '{}.key'.format(name) - self.cert = '{}.cert'.format(name) - self.o = o - self.ou = ou - self.profile = profile - - self.ensure() - - def __str__(self): - return '{}'.format(self.name) - - @property - def key_exists(self): - return self.ca.ss.exists(self.key) - - @property - def key_data(self): - f = open(self.ca.ss.open(self.key)) - d = f.read() - f.close() - return d - - @property - def key_path(self): - return self.ca.ss.plaintext(self.key) - - @property - def cert_path(self): - return os.path.join(self.ca.cdir, self.cert) - - @property - def cert_exists(self): - return os.path.exists(self.cert_path) - - @property - def cert_data(self): - with open(self.cert_path) as f: - return f.read() - - @property - def cert_expires_soon(self): - if not self.cert_exists: - return False - - out = self.ca._cfssl_call(['certinfo', '-cert', self.cert_path], stdin="") - not_after = datetime.strptime(out['not_after'], '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) - until = not_after - datetime.now(timezone.utc) - if until.days < 30: - return True - return False - - def ensure(self): - if self.key_exists and self.cert_exists and not self.cert_expires_soon: - return - - key = None - if not self.key_exists: - logger.info("{}: Generating key...".format(self)) - key, csr = self.ca.gen_key(self.hosts, o=self.o, ou=self.ou, save=self.key) - else: - logger.info("{}: Renewing certificate...".format(self)) - # Use already existing key - csr = self.ca.gen_csr(self.key_path, self.hosts, o=self.o, ou=self.ou) - self.ca.sign(csr, save=self.cert, profile=self.profile) - - def upload(self, c, remote_cert, remote_key, concat_ca=False): - logger.info("Uploading Cert {} to {} & {}".format(self, remote_cert, remote_key)) - if concat_ca: - f = StringIO(self.cert_data + self.ca.cert_data) - c.put(local=f, remote=remote_cert) - else: - c.put(local=self.cert_path, remote=remote_cert) - c.put(local=self.key_path, remote=remote_key) - - def upload_pki(self, c, pki, concat_ca=False): - self.upload(c, pki['cert'], pki['key'], concat_ca) diff --git a/cluster/clustercfg/certs/BUILD.bazel b/cluster/clustercfg/certs/BUILD.bazel new file mode 100644 index 00000000..c1156804 --- /dev/null +++ b/cluster/clustercfg/certs/BUILD.bazel @@ -0,0 +1,12 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "go_default_library", + srcs = [ + "certs.go", + "generator.go", + "x509.go", + ], + importpath = "code.hackerspace.pl/hscloud/cluster/clustercfg/certs", + visibility = ["//visibility:public"], +) diff --git a/cluster/clustercfg/certs/certs.go b/cluster/clustercfg/certs/certs.go new file mode 100644 index 00000000..75717175 --- /dev/null +++ b/cluster/clustercfg/certs/certs.go @@ -0,0 +1,288 @@ +package certs + +import ( + "net" + "time" +) + +// Certificates is the set of certificates required to run our Kubernetes +// production. +type Certificates struct { + CAs CAs + + ProdviderIntermediateCA *Certificate + + Global Global + PerNode map[string]PerNode +} + +type ensurer interface { + Ensure() error +} + +// Ensure checks that all the Kubernetes production certificates and keys are +// present on disk, generating them as necessary. +// +// If the user has not decrypted cluster/secrets, an error will be returned. +// However, deeper sync checks are not currently performed. +func (c *Certificates) Ensure() error { + sub := []ensurer{ + &c.CAs, + c.ProdviderIntermediateCA, + &c.Global, + } + for _, pn := range c.PerNode { + sub = append(sub, &pn) + } + for _, s := range sub { + if err := s.Ensure(); err != nil { + return err + } + } + return nil +} + +// CAs are the root certificate authorities we use. +type CAs struct { + // EtcdPeer is used by etcd member nodes to authenticate peers. + EtcdPeer *Certificate + // Etcd is used by etcd membrer nodes to authenticate clients (ie. + // kube-apiservers). + Etcd *Certificate + // Kube is the main Kubernetes 'identity' CA, used to identify components + // and users. + Kube *Certificate + // KubeFront is the proxy/aggregation CA used by external apiservices to + // authenticate incoming apiserver connections. + KubeFront *Certificate + // Admitomatic is the CA used by the admitomatic webhook to authenticate + // incoming apiserver connections. + Admitomatic *Certificate +} + +func (c *CAs) Ensure() error { + sub := []ensurer{ + c.EtcdPeer, + c.Etcd, + c.Kube, + c.KubeFront, + c.Admitomatic, + } + for _, s := range sub { + if err := s.Ensure(); err != nil { + return err + } + } + return nil +} + +// Global are all the non-per-node certificates we use. +type Global struct { + // EtcdKube is used by kubernetes apiservers to authenticate to etcd + // members. + EtcdKube *Certificate + + // KubeApiserver is used by kubernetes apiservers to authenticate to other + // kubernetes components/users. + KubeApiserver *Certificate + // KubeControllerManager is used by kubernetes controller managers to + // authenticate to the kubernetes apiservers. + KubeControllerManager *Certificate + // KubeScheduler is used by the kubernetes schedulers to authenticate to + // the kubernetes apiservers. + KubeScheduler *Certificate + + // KubefrontApiserver is used by the kubernetes apiserver to authenticate + // to external apiservices. + KubefrontApiserver *Certificate + + // AdmitomaticWebhook is used by the admitomatic webhook to authenticate to + // the Kubernetes apiservers. + AdmitomaticWebhook *Certificate +} + +func (g *Global) Ensure() error { + sub := []ensurer{ + g.EtcdKube, + g.KubeApiserver, + g.KubeControllerManager, + g.KubeScheduler, + g.KubefrontApiserver, + g.AdmitomaticWebhook, + } + for _, s := range sub { + if err := s.Ensure(); err != nil { + return err + } + } + return nil +} + +func (c *Certificates) MakeKubeEmergencyCreds(root, breadcrumb string) *Certificate { + return &Certificate{ + name: "emergency", + duration: 7 * 24 * time.Hour, + root: root, + kind: kindClient, + cn: "admin", + san: []string{"admin", breadcrumb}, + o: "system:masters", + issuer: c.CAs.Kube, + } +} + +// Per node are all the per-node certificates we use. +type PerNode struct { + // EtcdPeer is used by etcd members to authenticate to other etcd members. + EtcdPeer *Certificate + // EtcdClient is used by etcd members to authenticate to their clients. + EtcdClient *Certificate + + // Kubelet is used by kubelets to authenticate to other kubernetes + // components. + Kubelet *Certificate +} + +func (p *PerNode) Ensure() error { + sub := []ensurer{ + p.EtcdPeer, + p.EtcdClient, + p.Kubelet, + } + for _, s := range sub { + if err := s.Ensure(); err != nil { + return err + } + } + return nil +} + +func mkCA(root, name, cn string) *Certificate { + return &Certificate{ + name: name, + root: root, + kind: kindCA, + cn: cn, + } +} + +// Prepare builds our Certificates structure at a given location on the +// filesystem, for the given nodes. +// +// Calling Ensure() on the returned Certificates will actually engage +// generation logic. Before that, no disk accesses are performed. +func Prepare(root string, fqdns []string) Certificates { + certs := Certificates{ + CAs: CAs{ + EtcdPeer: mkCA(root, "ca-etcdpeer", "etcd peer ca"), + Etcd: mkCA(root, "ca-etcd", "etcd ca"), + Kube: mkCA(root, "ca-kube", "kubernetes main CA"), + KubeFront: mkCA(root, "ca-kubefront", "kubernetes frontend CA"), + Admitomatic: mkCA(root, "ca-admitomatic", "admitomatic webhook CA"), + }, + PerNode: make(map[string]PerNode), + } + + certs.ProdviderIntermediateCA = &Certificate{ + name: "ca-kube-prodvider", + root: root, + kind: kindProdvider, + cn: "kubernetes prodvider intermediate", + issuer: certs.CAs.Kube, + } + certs.Global = Global{ + EtcdKube: &Certificate{ + name: "etcd-kube", + root: root, + kind: kindClient, + cn: "kube etcd client certificate", + san: []string{"kube"}, + issuer: certs.CAs.Etcd, + }, + KubeApiserver: &Certificate{ + name: "kube-apiserver", + root: root, + kind: kindClientServer, + cn: "k0.hswaw.net", + san: []string{ + "k0.hswaw.net", + "kubernetes.default.svc.k0.hswaw.net", + }, + ips: []net.IP{ + {10, 10, 12, 1}, + }, + issuer: certs.CAs.Kube, + }, + KubeControllerManager: &Certificate{ + name: "kube-controllermanager", + root: root, + kind: kindClientServer, + cn: "system:kube-controller-manager", + san: []string{"system:kube-controller-manager"}, + o: "system:kube-controller-manager", + issuer: certs.CAs.Kube, + }, + KubeScheduler: &Certificate{ + name: "kube-scheduler", + root: root, + kind: kindClientServer, + cn: "system:kube-scheduler", + san: []string{"system:kube-scheduler"}, + o: "system:kube-scheduler", + issuer: certs.CAs.Kube, + }, + KubefrontApiserver: &Certificate{ + name: "kubefront-apiserver", + root: root, + kind: kindClientServer, + cn: "Kubernetes Frontend", + san: []string{"apiserver"}, + issuer: certs.CAs.KubeFront, + }, + AdmitomaticWebhook: &Certificate{ + name: "admitomatic-webhook", + root: root, + kind: kindServer, + cn: "Admitomatic Webhook", + san: []string{"admitomatic.admitomatic.svc"}, + issuer: certs.CAs.Admitomatic, + }, + } + for _, fqdn := range fqdns { + certs.PerNode[fqdn] = PerNode{ + EtcdPeer: &Certificate{ + name: "etcdpeer-" + fqdn, + root: root, + kind: kindClientServer, + cn: "node etcd peer certificate", + san: []string{fqdn}, + issuer: certs.CAs.EtcdPeer, + }, + EtcdClient: &Certificate{ + name: "etcd-" + fqdn, + root: root, + // etcd seems to need client too, as it's connecting to itself + // for... some reason? + // https://github.com/etcd-io/etcd/issues/9785 + kind: kindClientServer, + cn: "node etcd server certificate", + san: []string{fqdn}, + issuer: certs.CAs.Etcd, + }, + Kubelet: &Certificate{ + name: "kube-kubelet-" + fqdn, + root: root, + kind: kindClientServer, + cn: "system:node:" + fqdn, + o: "system:nodes", + san: []string{ + "system:node:" + fqdn, + fqdn, + }, + issuer: certs.CAs.Kube, + }, + } + } + + return certs +} diff --git a/cluster/clustercfg/certs/generator.go b/cluster/clustercfg/certs/generator.go new file mode 100644 index 00000000..7cd2c818 --- /dev/null +++ b/cluster/clustercfg/certs/generator.go @@ -0,0 +1,297 @@ +package certs + +import ( + "bytes" + "crypto" + "crypto/ed25519" + "crypto/rand" + "crypto/x509" + "encoding/pem" + "errors" + "fmt" + "log" + "math/big" + "net" + "os" + "path/filepath" + "time" +) + +// Certificate is a higher-level descriptor of an intent to generate a +// certificate and corresponding Ed25519 keypair on disk. +type Certificate struct { + // uniquer name for this cert, used to calculate filesystem paths. + name string + // root directory where all certs are stored. + root string + // duration used to determine TimeAfter. If not set, the certificate will + // never expire. + duration time.Duration + + kind certificateKind + + // cn is the subject common name that's going to be produced in the X.509 + // certificate. + cn string + // o is the subject organziation that's going to be produced in the X.509 + // certificate. + o string + // san are the DNS alternate names that are going to be produced in the + // X.509 certificate. + san []string + // ips are the IP alternate names that are going to be produced in the + // X.509 certificate. + ips []net.IP + + // issuer, if set, is the certificate that will sign this certificate. If + // not set, the certificate will be self-signed. + issuer *Certificate +} + +// Paths returns local filesystem paths to the CA certificate, certificate and +// key respectively. If the certificate is self signed, the CA path returned +// will be empty. These files might or might not live on the file system - you +// should first call Ensure to make sure they do. +func (c *Certificate) Paths() (caPath, certPath, keyPath string) { + if c.issuer != nil { + caPath = c.issuer.path(fileKindCert) + } + certPath = c.path(fileKindCert) + keyPath = c.path(fileKindKey) + return +} + +type certificateKind string + +const ( + kindServer certificateKind = "server" + kindClient certificateKind = "client" + kindClientServer certificateKind = "client-server" + kindCA certificateKind = "ca" + kindProdvider certificateKind = "prodvider" +) + +type fileKind string + +const ( + fileKindKey fileKind = "key" + fileKindKeyEncrypted fileKind = "key-encrypted" + fileKindCert fileKind = "cert" +) + +// path returns the path to the generated fileKind for this Certificate. +func (c *Certificate) path(k fileKind) string { + switch k { + case fileKindKeyEncrypted: + return filepath.Join(c.root, "secrets", "cipher", c.name+".key") + case fileKindKey: + return filepath.Join(c.root, "secrets", "plain", c.name+".key") + case fileKindCert: + // clustercfg.py compat: CA certs end in .crt, non-CA certs end in .cert. + // We're keeping this accidental convention to avoid spurious nix rebuilds + // when migrating. + // + // Feel free to fix it if it annoys you. + extension := ".cert" + if c.kind == kindCA { + extension = ".crt" + } + return filepath.Join(c.root, "certs", c.name+extension) + default: + panic("unexpected file kind type " + k) + } +} + +// ensureKey loads or generates-then-saves the private key for this +// Certificate. +func (c *Certificate) ensureKey() (crypto.Signer, error) { + path := c.path(fileKindKey) + _, err := os.Stat(path) + switch { + case err == nil: + return c.loadKey() + case errors.Is(err, os.ErrNotExist): + epath := c.path(fileKindKeyEncrypted) + if _, err = os.Stat(epath); err == nil { + return nil, fmt.Errorf("plaintext key at %q not found, but exists encrypted at %q - please decrypt using secretstore", path, epath) + } + return c.generateKey() + default: + return nil, fmt.Errorf("could not read key: %w", err) + } +} + +func (c *Certificate) loadKey() (crypto.Signer, error) { + path := c.path(fileKindKey) + bytes, err := os.ReadFile(path) + if err != nil { + return nil, err + } + block, _ := pem.Decode(bytes) + if block == nil { + return nil, fmt.Errorf("no PEM block found") + } + if block.Type != "PRIVATE KEY" { + return nil, fmt.Errorf("unexpected PEM block: %q", block.Type) + } + key, err := x509.ParsePKCS8PrivateKey(block.Bytes) + if err != nil { + return nil, err + } + if k, ok := key.(ed25519.PrivateKey); ok { + return k, nil + } + return nil, fmt.Errorf("not an ED25519 key") +} + +func (c *Certificate) generateKey() (crypto.Signer, error) { + _, priv, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + return nil, err + } + + pkcs8, err := x509.MarshalPKCS8PrivateKey(priv) + if err != nil { + return nil, err + } + + block := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: pkcs8}) + path := c.path(fileKindKey) + os.MkdirAll(filepath.Dir(path), 0700) + log.Printf("Saving %s key to %s ...", c.name, path) + if err := os.WriteFile(path, block, 0600); err != nil { + return nil, err + } + + return priv, nil +} + +// ensureCert loads or generates-then-saves the X.509 certificate for the +// Certificate. +func (c *Certificate) ensureCert() (*x509.Certificate, error) { + path := c.path(fileKindCert) + _, err := os.Stat(path) + switch { + case err == nil: + cert, err := c.loadCert() + switch err { + case nil: + return cert, nil + case errExpired: + return c.generateCert() + default: + return nil, err + } + case errors.Is(err, os.ErrNotExist): + return c.generateCert() + default: + return nil, fmt.Errorf("could not read cert: %w", err) + } +} + +func (c *Certificate) generateCert() (*x509.Certificate, error) { + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127) + serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) + if err != nil { + return nil, err + } + + notAfter := unknownNotAfter + if c.duration != 0 { + notAfter = time.Now().Add(c.duration) + } + template := c.template() + template.SerialNumber = serialNumber + template.NotBefore = time.Now() + template.NotAfter = notAfter + + parent := template + skey, err := c.ensureKey() + if err != nil { + return nil, fmt.Errorf("when ensuring key: %w", err) + } + pkey := skey.Public() + caskey := skey + if c.issuer != nil { + caskey, err = c.issuer.ensureKey() + if err != nil { + return nil, fmt.Errorf("when ensuring CA key: %w", err) + } + cacert, err := c.issuer.ensureCert() + if err != nil { + return nil, fmt.Errorf("when ensuring CA cert: %w", err) + } + parent = cacert + } + + bytes, err := x509.CreateCertificate(rand.Reader, template, parent, pkey, caskey) + if err != nil { + return nil, fmt.Errorf("issuing certificate failed: %w", err) + } + + block := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: bytes}) + path := c.path(fileKindCert) + os.MkdirAll(filepath.Dir(path), 0700) + log.Printf("Saving %s cert to %s ...", c.name, path) + if err := os.WriteFile(path, block, 0600); err != nil { + return nil, err + } + + return x509.ParseCertificate(bytes) +} + +// errExpired is returned if the cert exists on disk but has (nearly) expired. +var errExpired = errors.New("certificate expired") + +func (c *Certificate) loadCert() (*x509.Certificate, error) { + path := c.path(fileKindCert) + b, err := os.ReadFile(path) + if err != nil { + return nil, err + } + + block, _ := pem.Decode(b) + if block == nil { + return nil, fmt.Errorf("no PEM block found") + } + if block.Type != "CERTIFICATE" { + return nil, fmt.Errorf("unexpected PEM block: %q", block.Type) + } + cert, err := x509.ParseCertificate(block.Bytes) + if err != nil { + return nil, err + } + if time.Now().Add(time.Hour).After(cert.NotAfter) { + return nil, errExpired + } + pkey, ok := cert.PublicKey.(ed25519.PublicKey) + if !ok { + return nil, fmt.Errorf("not a ED25519 cert") + } + skey, err := c.ensureKey() + if err != nil { + return nil, fmt.Errorf("when ensuring key: %w", err) + } + if !bytes.Equal(pkey, skey.Public().(ed25519.PublicKey)) { + return nil, fmt.Errorf("issued for different key") + } + + template := c.template() + if err := compareCertData(template, cert); err != nil { + return nil, err + } + return cert, nil +} + +// Ensure makes sure the given Certificate (and all of its' issuers) have +// corresponding private keys and X.509 certificates on disk, generating things +// as necessary. +func (c *Certificate) Ensure() error { + cert, err := c.ensureCert() + if err != nil { + return fmt.Errorf("when ensuring cert %s: %w", c.name, err) + } + _ = cert + + return nil +} diff --git a/cluster/clustercfg/certs/x509.go b/cluster/clustercfg/certs/x509.go new file mode 100644 index 00000000..a64ec1a1 --- /dev/null +++ b/cluster/clustercfg/certs/x509.go @@ -0,0 +1,98 @@ +package certs + +import ( + "bytes" + "crypto/x509" + "crypto/x509/pkix" + "fmt" + "strings" + "time" +) + +var ( + // From RFC 5280 Section 4.1.2.5 + unknownNotAfter = time.Unix(253402300799, 0) +) + +// compareCertData returns an error if any of the 'important' bits of the two +// certificates differ. Those are the bits that we template ourselves, and that +// are not issue-dependent (ie. not time or serial or kid or ...). +func compareCertData(template, cert *x509.Certificate) error { + if want, got := template.Subject.String(), cert.Subject.String(); want != got { + return fmt.Errorf("issued for different subject, wanted %s, got %s", want, got) + } + if want, got := strings.Join(template.DNSNames, ","), strings.Join(cert.DNSNames, ","); want != got { + return fmt.Errorf("issued for different DNS names, wanted %s, got %s", want, got) + } + if want, got := len(template.IPAddresses), len(cert.IPAddresses); want != got { + return fmt.Errorf("issued for different IP addresses, wanted %v, got %v", want, got) + } else { + for i := 0; i < len(template.IPAddresses); i++ { + if want, got := template.IPAddresses[i], cert.IPAddresses[i]; !bytes.Equal(want, got) { + return fmt.Errorf("issued for different IP addresses, wanted %v, got %v", want, got) + } + } + } + if want, got := template.KeyUsage, cert.KeyUsage; want != got { + return fmt.Errorf("issued for different key usage, wanted %d, got %d", want, got) + } + if want, got := len(template.ExtKeyUsage), len(cert.ExtKeyUsage); want != got { + return fmt.Errorf("issued for different ext key usage, wanted %v, got %v", want, got) + } else { + for i := 0; i < len(template.ExtKeyUsage); i++ { + if want, got := template.ExtKeyUsage[i], cert.ExtKeyUsage[i]; want != got { + return fmt.Errorf("issued for different ext key usage, wanted %v, got %v", want, got) + } + } + } + if want, got := template.IsCA, cert.IsCA; want != got { + return fmt.Errorf("issued for different IsCA, wanted %v, got %v", want, got) + } + if want, got := template.BasicConstraintsValid, cert.BasicConstraintsValid; want != got { + return fmt.Errorf("issued for different basic constraints valid, wanted %v, got %v", want, got) + } + return nil +} + +// template builds an x509 'template' certificate, ie. makes an +// x509.Certificate with all the fields built up from the data contained in +// Certificate, but without any per-issue fields like times, serial number, +// etc. +func (c *Certificate) template() *x509.Certificate { + template := &x509.Certificate{ + Subject: pkix.Name{ + CommonName: c.cn, + }, + DNSNames: c.san, + IPAddresses: c.ips, + } + if c.o != "" { + template.Subject.Organization = []string{c.o} + } + switch c.kind { + case kindServer: + template.KeyUsage = x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment + template.ExtKeyUsage = []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth} + template.DNSNames = c.san + case kindClient: + template.KeyUsage = x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment + template.ExtKeyUsage = []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth} + template.DNSNames = c.san + case kindClientServer: + template.KeyUsage = x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment + template.ExtKeyUsage = []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth} + case kindCA: + template.IsCA = true + template.BasicConstraintsValid = true + template.KeyUsage = x509.KeyUsageCertSign | x509.KeyUsageCRLSign | x509.KeyUsageDigitalSignature + template.ExtKeyUsage = []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageOCSPSigning} + template.AuthorityKeyId = template.SubjectKeyId + case kindProdvider: + template.IsCA = true + template.BasicConstraintsValid = true + template.KeyUsage = x509.KeyUsageCertSign | x509.KeyUsageCRLSign | x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment + template.ExtKeyUsage = []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageOCSPSigning} + template.AuthorityKeyId = template.SubjectKeyId + } + return template +} diff --git a/cluster/clustercfg/clustercfg.py b/cluster/clustercfg/clustercfg.py deleted file mode 100644 index 30b87b82..00000000 --- a/cluster/clustercfg/clustercfg.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python - -from builtins import object - -import datetime -from io import BytesIO -import json -import logging -import os -import tempfile -import subprocess -import sys - -from cryptography import x509 -from cryptography.hazmat.backends import default_backend -import fabric - -from tools import secretstore -from tools.hscloud import lib as hscloud - -import ca - - -local_root = hscloud.workspace_location() - - -cluster = 'k0.hswaw.net' -ss = secretstore.SecretStore( - plain_root=os.path.join(local_root, 'cluster/secrets/plain'), - cipher_root=os.path.join(local_root, 'cluster/secrets/cipher')) - - -logger = logging.getLogger() -logger.setLevel(logging.INFO) -formatter = logging.Formatter('%(levelname)s - %(message)s') -sh = logging.StreamHandler() -sh.setFormatter(formatter) -logger.addHandler(sh) - - -def configure_k8s(username, ca, cert, key): - subprocess.check_call([ - 'kubectl', 'config', - 'set-cluster', 'admin.' + cluster, - '--certificate-authority=' + ca, - '--embed-certs=true', - '--server=https://' + cluster + ':4001', - ]) - subprocess.check_call([ - 'kubectl', 'config', - 'set-credentials', username, - '--client-certificate=' + cert, - '--client-key=' + key, - '--embed-certs=true', - ]) - subprocess.check_call([ - 'kubectl', 'config', - 'set-context', 'admin.' + cluster, - '--cluster=' + 'admin.' + cluster, - '--user=' + username, - ]) - subprocess.check_call([ - 'kubectl', 'config', - 'use-context', 'admin.' + cluster, - ]) - - -def admincreds(args): - if len(args) != 1: - sys.stderr.write("Usage: admincreds q3k\n") - return 1 - username = args[0] - print("") - print("WARNING WARNING WARNING WARNING WARNING WARNING") - print("===============================================") - print("") - print("You are requesting ADMIN credentials.") - print("") - print("You likely shouldn't be doing this, and") - print("instead should be using `prodaccess`.") - print("") - print("===============================================") - print("WARNING WARNING WARNING WARNING WARNING WARNING") - print("") - - ## Make kube certificates. - certs_root = os.path.join(local_root, 'cluster/certs') - ca_kube = ca.CA(ss, certs_root, 'kube', 'kubernetes main CA') - - local_key = os.path.join(local_root, '.kubectl/admin.key') - local_crt = os.path.join(local_root, '.kubectl/admin.crt') - - kubectl = os.path.join(local_root, '.kubectl') - if not os.path.exists(kubectl): - os.mkdir(kubectl) - - generate_cert = False - if not os.path.exists(local_key): - generate_cert = True - - if os.path.exists(local_crt): - with open(local_crt, 'rb') as f: - b = f.read() - cert = x509.load_pem_x509_certificate(b, default_backend()) - delta = cert.not_valid_after - datetime.datetime.now() - logger.info("admin: existing cert expiry: {}".format(delta)) - if delta.total_seconds() < 3600 * 24: - logger.info("admin: expires soon, regenerating") - generate_cert = True - else: - generate_cert = True - - if not generate_cert: - return configure_k8s(username, ca_kube._cert, local_crt, local_key) - - key, csr = ca_kube.gen_key(hosts=['admin', username], o='system:masters', ou='Kube Admin Account') - crt = ca_kube.sign(csr) - - with open(local_key, 'w') as f: - f.write(key) - - with open(local_crt, 'w') as f: - f.write(crt) - - configure_k8s(username, ca_kube._cert, local_crt, local_key) - - -def nodestrap(args, nocerts=False): - if len(args) != 1: - sys.stderr.write("Usage: nodestrap bc01n01.hswaw.net\n") - return 1 - fqdn = args[0] - - logger.info("Nodestrapping {}...".format(fqdn)) - r = fabric.Connection('root@{}'.format(fqdn)) - - if not nocerts: - certs_root = os.path.join(local_root, 'cluster/certs') - - # Make etcd peer certificate for node. - ca_etcd_peer = ca.CA(ss, certs_root, 'etcdpeer', 'etcd peer ca') - ca_etcd_peer.make_cert('etcdpeer-{}'.format(fqdn), hosts=[fqdn], ou='node etcd peer certificate') - - # Make etcd server certificate for node and client certificate for kube. - ca_etcd = ca.CA(ss, certs_root, 'etcd', 'etcd ca') - - ca_etcd.make_cert('etcd-{}'.format(fqdn), hosts=[fqdn], ou='node etcd server certificate') - - ca_etcd.make_cert('etcd-kube', hosts=['kube'], ou='kube etcd client certificate') - - ca_etcd.make_cert('etcd-root', hosts=['root'], ou='root etcd client certificate') - - ca_etcd.make_cert('etcd-calico', hosts=['calico'], ou='root etcd client certificate') - - ## Make kube certificates. - ca_kube = ca.CA(ss, certs_root, 'kube', 'kubernetes main CA') - - # Make prodvider intermediate CA. - ca_kube.make_cert('ca-kube-prodvider', o='Warsaw Hackerspace', ou='kubernetes prodvider intermediate', hosts=['kubernetes prodvider intermediate CA'], profile='intermediate').ensure() - - # Make kubelet certificate (per node). - ca_kube.make_cert('kube-kubelet-'+fqdn, o='system:nodes', ou='Kubelet', hosts=['system:node:'+fqdn, fqdn]) - - # Make apiserver certificate. - ca_kube.make_cert('kube-apiserver', ou='Kubernetes API', hosts=[cluster, 'kubernetes.default.svc.'+cluster, '10.10.12.1']) - - # Make service accounts decryption key (as cert for consistency). - ca_kube.make_cert('kube-serviceaccounts', ou='Kubernetes Service Accounts Signer', hosts=['serviceaccounts']) - - # Make kube component certificates. - kube_components = ['controllermanager', 'scheduler', 'proxy'] - for k in kube_components: - # meh - if k == 'controllermanager': - o = 'system:kube-controller-manager' - else: - o = 'system:kube-'+k - ou = 'Kubernetes Component '+k - c = ca_kube.make_cert('kube-'+k, ou=ou, o=o, hosts=[o,]) - - ## Make kubefront certificates. - ca_kubefront = ca.CA(ss, certs_root, 'kubefront', 'kubernetes frontend CA') - ca_kubefront.make_cert('kubefront-apiserver', ou='Kubernetes Frontend', hosts=['apiserver']) - - ## Make admitomatic (admission controller) certificates. - ca_admitomatic = ca.CA(ss, certs_root, 'admitomatic', 'admitomatic webhook CA') - ca_admitomatic.make_cert('admitomatic-webhook', ou='Admitomatic Webhook', hosts=['admitomatic.admitomatic.svc']) - - toplevel = subprocess.check_output([ - "nix-build", - local_root, - "-A", "ops.machines.\"" + fqdn + "\".config.passthru.hscloud.provision", - ]).decode().strip() - subprocess.check_call([toplevel]) - - -def usage(): - sys.stderr.write("Usage: clustercfg \n") - - -def main(): - if len(sys.argv) < 2: - usage() - return 1 - - mode = sys.argv[1] - if mode == "nodestrap": - return nodestrap(sys.argv[2:]) - elif mode == "nodestrap-nocerts": - return nodestrap(sys.argv[2:], nocerts=True) - elif mode == "admincreds": - return admincreds(sys.argv[2:]) - elif mode == "smoketest": - sys.stdout.write("Smoke test passed.") - return 0 - else: - usage() - return 1 - -if __name__ == '__main__': - sys.exit(main() or 0) diff --git a/cluster/clustercfg/cmd_admincreds.go b/cluster/clustercfg/cmd_admincreds.go new file mode 100644 index 00000000..110c54d5 --- /dev/null +++ b/cluster/clustercfg/cmd_admincreds.go @@ -0,0 +1,109 @@ +package main + +import ( + "fmt" + "log" + "os" + "os/user" + "path/filepath" + + "github.com/spf13/cobra" + "k8s.io/client-go/tools/clientcmd" + clientapi "k8s.io/client-go/tools/clientcmd/api" + + "code.hackerspace.pl/hscloud/cluster/clustercfg/certs" + "code.hackerspace.pl/hscloud/go/workspace" +) + +var admincredsCmd = &cobra.Command{ + Use: "admincreds", + Short: "Acquire emergency Kubernetes credentials", + Long: ` +Use secretstore secrets to generate a Kubernetes system:masters keypair and +certificate. Only for use in emergencies. + +Your local username and hostname will make part of the cert and can be used +for auditing of accesses to apiservers. +`, + Run: func(cmd *cobra.Command, args []string) { + ws, err := workspace.Get() + if err != nil { + log.Fatalf("Could not figure out workspace: %v", err) + } + + uname := "UNKNOWN" + if u, err := user.Current(); err == nil { + uname = u.Username + } + hostname := "UNKNOWN" + if h, err := os.Hostname(); err == nil { + hostname = h + } + breadcrumb := fmt.Sprintf("%s@%s", uname, hostname) + + root := filepath.Join(ws, "cluster") + path := filepath.Join(ws, ".kubectl", "admincreds") + c := certs.Prepare(root, nil) + creds := c.MakeKubeEmergencyCreds(path, breadcrumb) + _ = creds + + log.Printf("") + log.Printf("WARNING WARNING WARNING WARNING WARNING WARNING") + log.Printf("===============================================") + log.Printf("") + log.Printf("You are requesting ADMIN credentials.") + log.Printf("") + log.Printf("You likely shouldn't be doing this, and") + log.Printf("instead should be using `prodaccess`.") + log.Printf("") + log.Printf("===============================================") + log.Printf("WARNING WARNING WARNING WARNING WARNING WARNING") + log.Printf("") + + log.Printf("Issuing certs...") + if err := creds.Ensure(); err != nil { + log.Fatalf("Failed: %v", err) + } + + log.Printf("Configuring kubectl...") + caPath, certPath, keyPath := creds.Paths() + if err := installKubeletConfig(caPath, certPath, keyPath, "emergency.k0"); err != nil { + log.Fatalf("Failed: %v", err) + } + + log.Fatalf("Done. Use kubectl --context=emergency.k0") + }, +} + +func installKubeletConfig(caPath, certPath, keyPath, configName string) error { + ca := clientcmd.NewDefaultPathOptions() + config, err := ca.GetStartingConfig() + if err != nil { + return fmt.Errorf("getting initial config failed: %w", err) + } + + config.AuthInfos[configName] = &clientapi.AuthInfo{ + ClientCertificate: certPath, + ClientKey: keyPath, + } + + config.Clusters[configName] = &clientapi.Cluster{ + CertificateAuthority: caPath, + Server: "https://k0.hswaw.net:4001", + } + + config.Contexts[configName] = &clientapi.Context{ + AuthInfo: configName, + Cluster: configName, + Namespace: "default", + } + + if err := clientcmd.ModifyConfig(ca, *config, true); err != nil { + return fmt.Errorf("modifying config failed: %w", err) + } + return nil +} + +func init() { + rootCmd.AddCommand(admincredsCmd) +} diff --git a/cluster/clustercfg/cmd_gencerts.go b/cluster/clustercfg/cmd_gencerts.go new file mode 100644 index 00000000..6cb6431b --- /dev/null +++ b/cluster/clustercfg/cmd_gencerts.go @@ -0,0 +1,63 @@ +package main + +import ( + "log" + "path/filepath" + "strings" + + "github.com/spf13/cobra" + + "code.hackerspace.pl/hscloud/cluster/clustercfg/certs" + "code.hackerspace.pl/hscloud/go/workspace" +) + +var flagFQDNs []string + +var gencertsCmd = &cobra.Command{ + Use: "gencerts", + Short: "(re)generate keys/certs for k0 cluster", + Long: ` +If you're adding a new cluster node, run this. It will populate //cluster/secrets +and //cluster/certificates with new certs/keys. + +By default, the nodes to generate certificates for are automatically discovered +by querying the local Nix machines defined in //ops, looking for anything that +has hscloud.kube.controller.enabled. That can be slow and/or incorrect. To override +node names, set --fqdn (either comma-separate them or repeat flags). +`, + Run: func(cmd *cobra.Command, args []string) { + ws, err := workspace.Get() + if err != nil { + log.Fatalf("Could not figure out workspace: %v", err) + } + path := filepath.Join(ws, "cluster") + + fqdns := flagFQDNs + if len(fqdns) == 0 { + log.Printf("--fqdn not set, figuring out machines from Nix...") + err = workspace.EvalHscloudNix(cmd.Context(), &fqdns, "ops.exports.kubeMachineNames") + if err != nil { + log.Fatalf("Could not figure out Kubernetes machine FQDNs: %v", err) + } + } + + for _, fqdn := range fqdns { + parts := strings.Split(fqdn, ".") + if len(parts) != 3 || parts[1] != "hswaw" || parts[2] != "net" { + log.Fatalf("Invalid FQDN %q: must be xxx.hswaw.net.", fqdn) + } + } + + log.Printf("Machines: --fqdn %s", strings.Join(fqdns, ",")) + c := certs.Prepare(path, fqdns) + if err := c.Ensure(); err != nil { + log.Fatalf("Failed: %v", err) + } + log.Printf("Done.") + }, +} + +func init() { + gencertsCmd.Flags().StringSliceVar(&flagFQDNs, "fqdn", nil, "List of machine FQDNs to generate certs for. If not set, will be automatically figured out from Nix modules in local checkout (slow).") + rootCmd.AddCommand(gencertsCmd) +} diff --git a/cluster/clustercfg/main.go b/cluster/clustercfg/main.go new file mode 100644 index 00000000..823932d5 --- /dev/null +++ b/cluster/clustercfg/main.go @@ -0,0 +1,20 @@ +package main + +import ( + "fmt" + "os" + + "github.com/spf13/cobra" +) + +var rootCmd = &cobra.Command{ + Use: "clustercfg", + Short: "admin management tool for k0 cluster", +} + +func main() { + if err := rootCmd.Execute(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} diff --git a/cluster/doc/admin.md b/cluster/doc/admin.md index 097e749d..2f9ba18d 100644 --- a/cluster/doc/admin.md +++ b/cluster/doc/admin.md @@ -29,8 +29,9 @@ Provisioning nodes - bring up a new node with nixos, the configuration doesn't matter and will be nuked anyway - - edit cluster/nix/defs-machines.nix - - `bazel run //cluster/clustercfg nodestrap bc01nXX.hswaw.net` + - add machine to cluster/machines and ops/machines.nix + - generate certs with `bazel run //cluster/clustercfg gencerts` + - deploy using ops (see ops/README.md) Applying kubecfg state ---------------------- diff --git a/ops/exports.nix b/ops/exports.nix new file mode 100644 index 00000000..33566070 --- /dev/null +++ b/ops/exports.nix @@ -0,0 +1,14 @@ +{ hscloud, pkgs, hscloudForPkgs, ... }: + +{ + # Used by clustercfg to figure out which machines need kube certs. + kubeMachineNames = let + isKubeMachine = n: value: + n != "__readTree" && + (builtins.hasAttr "hscloud" value.options) && + (builtins.hasAttr "kube" value.options.hscloud) && + value.options.hscloud.kube.control.enable.value; + machines = pkgs.lib.filterAttrs isKubeMachine hscloud.ops.machines; + names = pkgs.lib.mapAttrsToList (name: _: name) machines; + in names; +} diff --git a/ops/monitoring/lib/cluster.libsonnet b/ops/monitoring/lib/cluster.libsonnet index 00aa7922..c742df01 100644 --- a/ops/monitoring/lib/cluster.libsonnet +++ b/ops/monitoring/lib/cluster.libsonnet @@ -75,7 +75,7 @@ local kube = import "../../../kube/kube.libsonnet"; // // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as // our API server's TLS certificate only has a CN/SAN for its full FQDN, not - // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py). + // the .svc.cluster.local shorthand (see //cluster/clustercfg). local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") { relabel_configs: [ {