// Pod security policies and the RBAC objects that grant their use.
//
// `Cluster` holds three PodSecurityPolicy objects (insecure, secure,
// mostlysecure), each paired with a ClusterRole that allows `use` of that
// policy. `AllowNamespaceInsecure` and `AllowNamespaceMostlySecure` build a
// RoleBinding in a given namespace that binds the matching ClusterRole to the
// `system:serviceaccounts` group.
local kube = import "kube.libsonnet";

{
    local policies = self,

    policyNameAllowInsecure: "policy:allow-insecure",
    policyNameAllowSecure: "policy:allow-secure",
    policyNameAllowMostlySecure: "policy:allow-mostlysecure",

    // egrep 'define CAP_[A-Z_]+.+[0-9]+$' include/linux/capability.h | cut -d' ' -f 2 | tr '\n' ','
    local allCapsStr = 'CAP_CHOWN,CAP_DAC_OVERRIDE,CAP_DAC_READ_SEARCH,CAP_FOWNER,CAP_FSETID,CAP_KILL,CAP_SETGID,CAP_SETUID,CAP_SETPCAP,CAP_LINUX_IMMUTABLE,CAP_NET_BIND_SERVICE,CAP_NET_BROADCAST,CAP_NET_ADMIN,CAP_NET_RAW,CAP_IPC_LOCK,CAP_IPC_OWNER,CAP_SYS_MODULE,CAP_SYS_RAWIO,CAP_SYS_CHROOT,CAP_SYS_PTRACE,CAP_SYS_PACCT,CAP_SYS_ADMIN,CAP_SYS_BOOT,CAP_SYS_NICE,CAP_SYS_RESOURCE,CAP_SYS_TIME,CAP_SYS_TTY_CONFIG,CAP_MKNOD,CAP_LEASE,CAP_AUDIT_WRITE,CAP_AUDIT_CONTROL,CAP_SETFCAP,CAP_MAC_OVERRIDE,CAP_MAC_ADMIN,CAP_SYSLOG,CAP_WAKE_ALARM,CAP_BLOCK_SUSPEND,CAP_AUDIT_READ',

    // Removes the 4-character `CAP_` prefix from a capability name.
    local stripCapPrefix(name) = std.substr(name, 4, std.length(name) - 4),

    // Split by `,`, remove CAP_ prefix, turn into unique set.
    // Empty elements are skipped: the command above ends its output with a
    // `,`, so a verbatim paste would otherwise produce an empty name.
    local capNames = std.filter(function(name) name != '', std.split(allCapsStr, ',')),
    local allCaps = std.set(std.map(stripCapPrefix, capNames)),

    // Builds a ClusterRole called `roleName` whose single rule allows `use`
    // of the PodSecurityPolicy called `policyName`.
    local usePolicyRole(roleName, policyName) = kube.ClusterRole(roleName) {
        rules: [
            {
                apiGroups: ['policy'],
                resources: ['podsecuritypolicies'],
                verbs: ['use'],
                resourceNames: [policyName],
            },
        ],
    },

    // Builds `{ rb: RoleBinding }` named "policy:allow-<level>-in-<namespace>",
    // placed in `namespace`, binding `role` to the `system:serviceaccounts`
    // group.
    local allowInNamespace(level, role, namespace) = {
        rb: kube.RoleBinding("policy:allow-" + level + "-in-" + namespace) {
            metadata+: {
                namespace: namespace,
            },
            roleRef_: role,
            subjects: [
                {
                    kind: "Group",
                    apiGroup: "rbac.authorization.k8s.io",
                    name: "system:serviceaccounts",
                },
            ],
        },
    },

    Cluster: {
        local cluster = self,

        // Insecure: allowing creation of these pods allows you to pwn the
        // entire cluster.
        insecure: kube._Object("policy/v1beta1", "PodSecurityPolicy", "insecure") {
            spec: {
                privileged: true,
                allowPrivilegeEscalation: true,
                allowedCapabilities: ['*'],
                volumes: ['*'],
                hostNetwork: true,
                hostPorts: [
                    { max: 40000, min: 1 },
                ],
                hostIPC: true,
                hostPID: true,
                runAsUser: {
                    rule: 'RunAsAny',
                },
                seLinux: {
                    rule: 'RunAsAny',
                },
                supplementalGroups: {
                    rule: 'RunAsAny',
                },
                fsGroup: {
                    rule: 'RunAsAny',
                },
            },
        },
        insecureRole: usePolicyRole(policies.policyNameAllowInsecure, 'insecure'),

        // Secure: very limited subset of security policy, everyone is allowed
        // to spawn containers of this kind.
        secure: kube._Object("policy/v1beta1", "PodSecurityPolicy", "secure") {
            spec: {
                privileged: false,
                // Required to prevent escalations to root.
                allowPrivilegeEscalation: false,
                // Drop every capability as defense in depth. Note that
                // runAsUser below does not enforce non-root, so this is not
                // redundant with a non-root requirement.
                requiredDropCapabilities: ["ALL"],
                // Allow core volume types.
                volumes: [
                    'configMap',
                    'emptyDir',
                    'projected',
                    'secret',
                    'downwardAPI',
                    'persistentVolumeClaim',
                ],
                hostNetwork: false,
                hostIPC: false,
                hostPID: false,
                runAsUser: {
                    // Allow to run as root - docker, we trust you here.
                    rule: 'RunAsAny',
                },
                seLinux: {
                    rule: 'RunAsAny',
                },
                supplementalGroups: {
                    rule: 'MustRunAs',
                    ranges: [
                        {
                            // Forbid adding the root group.
                            min: 1,
                            max: 65535,
                        },
                    ],
                },
                fsGroup: {
                    rule: 'MustRunAs',
                    ranges: [
                        {
                            // Forbid adding the root group.
                            min: 1,
                            max: 65535,
                        },
                    ],
                },
                readOnlyRootFilesystem: false,
            },
        },
        secureRole: usePolicyRole(policies.policyNameAllowSecure, 'secure'),

        // MostlySecure: like secure, but allows for setuid inside containers
        // and enough filesystem access to run apt.
        mostlySecure: cluster.secure {
            metadata+: {
                name: "mostlysecure",
            },
            spec+: {
                // Drop everything apart from the capabilities listed here.
                // std.setDiff requires both operands to be sets, so the
                // keep-list is normalised with std.set rather than relying on
                // it being written in sorted order.
                requiredDropCapabilities: std.setDiff(allCaps, std.set([
                    "CHOWN",
                    "DAC_OVERRIDE",
                    "FOWNER",
                    "LEASE",
                    "SETGID",
                    "SETUID",
                ])),
                supplementalGroups: {
                    // Allow running as root gid - we allow running as root
                    // uid anyway, as we trust our container runtime.
                    rule: 'MustRunAs',
                    ranges: [
                        {
                            min: 0,
                            max: 65535,
                        },
                    ],
                },
                fsGroup: {
                    // Allow setting the fsGroup to 0, as all filesystem mounts
                    // are trusted anyway.
                    rule: 'MustRunAs',
                    ranges: [
                        {
                            min: 0,
                            max: 65535,
                        },
                    ],
                },
            },
        },
        mostlySecureRole: usePolicyRole(policies.policyNameAllowMostlySecure, 'mostlysecure'),
    },

    // Allow insecure access to all service accounts in a given namespace.
    AllowNamespaceInsecure(namespace):
        allowInNamespace("insecure", policies.Cluster.insecureRole, namespace),

    // Allow mostlysecure access to all service accounts in a given namespace.
    AllowNamespaceMostlySecure(namespace):
        allowInNamespace("mostlysecure", policies.Cluster.mostlySecureRole, namespace),
}