hscloud/cluster/kube/lib/calico-bird.cfg.template

k0.hswaw.net: pass metallb through Calico (2020-09-20 22:52:57 +00:00)

Previously, we had the following setup:

                        .-----------.
                        | .....     |
                      .-----------.-|
                      | dcr01s24  | |
                    .-----------.-| |
                    | dcr01s22  | | |
                .---|-----------| |-'
    .--------.  |   |---------. | |
    | dcsw01 | <----| metallb | |-'
    '--------'      |---------' |
                    '-----------'

Ie., each metallb on each node directly talked to dcsw01 over BGP to
announce ExternalIPs to our L3 fabric.

Now, we rejigger the configuration to instead have Calico's BIRD instances
talk BGP to dcsw01, and have metallb talk locally to Calico.

                     .-------------------------.
                     | dcr01s24                |
                     |-------------------------|
    .--------.       |---------.   .---------. |
    | dcsw01 | <-----| Calico  |<--| metallb | |
    '--------'       |---------'   '---------' |
                     '-------------------------'

This makes Calico announce our pod/service networks into our L3 fabric!

Calico and metallb talk to each other over 127.0.0.1 (they both run with
host networking), but that requires one side to flip to passive mode. We
chose to do that with Calico, by overriding its BIRD config and
special-casing any 127.0.0.1 peer to enable passive mode.

We also override Calico's other BIRD template (bird_ipam.cfg) to fiddle
with the kernel programming filter (ie. the to-kernel-routing-table
filter), where we disable programming unreachable routes. This is because
routes coming from metallb have their next-hop set to 127.0.0.1, which
makes bird mark them as unreachable. Unreachable routes in the kernel
would break local access to ExternalIPs, eg. registry access from
containerd.

All routes pass through without route reflectors and a full mesh, as we
use eBGP over private ASNs in our fabric.

We also have to make Calico aware of metallb pools - otherwise, routes
announced by metallb end up being filtered out by Calico.

This is all mildly hacky. Here's hoping that Calico will some day gain
metallb-like functionality, ie. IPAM for externalIPs/LoadBalancers/...

There seems, however, to be one problem with this change (but I'm not
fixing it yet, as it's not critical): metallb would previously only
announce IPs from nodes that were serving a given service. Now, the
Calico internal mesh makes those IPs appear reachable from every node.
This can probably be fixed by disabling the local mesh and enabling route
reflection on dcsw01 (to recreate the mesh by routing through dcsw01). Or
maybe by some more hacking of the Calico BIRD config :/.

Change-Id: I3df1f6ae7fa1911dd53956ced3b073581ef0e836
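The bird_ipam.cfg override described in the commit message lives in a
separate file and is not reproduced here. As a rough illustration only,
here is a minimal sketch of the kind of change it makes, in BIRD 1.x
filter syntax, assuming Calico's stock filter name
calico_kernel_programming (which the kernel protocol below references)
and eliding the upstream filter body:

    filter calico_kernel_programming {
      # Sketch: routes learned from metallb carry a 127.0.0.1 next-hop,
      # which BIRD resolves as unreachable. Programming those into the
      # kernel would break local access to ExternalIPs (eg. registry access
      # from containerd), so reject them before the kernel export.
      if dest = RTD_UNREACHABLE then reject;
      # ... upstream Calico kernel-programming logic would continue here ...
      accept;
    }
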
# This is forked from bird.cfg.template from calico running on k0.hswaw.net on 2020/09/21.
# Changed vs. upstream (C-f HSCLOUD):
# - set 'passive on' on 127.0.0.1 neighbors, used for establishing connectivity
# with metallb.
# Generated by confd
include "bird_aggr.cfg";
include "bird_ipam.cfg";
{{- $node_ip_key := printf "/host/%s/ip_addr_v4" (getenv "NODENAME")}}{{$node_ip := getv $node_ip_key}}
{{- $router_id := getenv "CALICO_ROUTER_ID" ""}}
{{- $node_name := getenv "NODENAME"}}
router id {{if eq "hash" ($router_id) -}}
{{hashToIPv4 $node_name}};
{{- else -}}
{{if ne "" ($router_id)}}{{$router_id}}{{else}}{{$node_ip}}{{end}};
{{- end}}
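# Example of the rendered statement above (hypothetical values: NODENAME's
# ip_addr_v4 is 10.0.0.24 and CALICO_ROUTER_ID is unset, so the node IP is
# used):
#   router id 10.0.0.24;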
{{- define "LOGGING"}}
{{- $node_logging_key := printf "/host/%s/loglevel" (getenv "NODENAME")}}
{{- if exists $node_logging_key}}
{{- $logging := getv $node_logging_key}}
{{- if eq $logging "debug"}}
debug all;
{{- else if ne $logging "none"}}
debug { states };
{{- end}}
{{- else if exists "/global/loglevel"}}
{{- $logging := getv "/global/loglevel"}}
{{- if eq $logging "debug"}}
debug all;
{{- else if ne $logging "none"}}
debug { states };
{{- end}}
{{- else}}
debug { states };
{{- end}}
{{- end}}
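# The LOGGING template above expands, per protocol, to "debug all;" when the
# per-node or global loglevel is "debug", to nothing when it is "none", and
# to "debug { states };" otherwise (including when no loglevel is set).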
# Configure synchronization between routing tables and kernel.
protocol kernel {
learn; # Learn all alien routes from the kernel
persist; # Don't remove routes on bird shutdown
scan time 2; # Scan kernel routing table every 2 seconds
import all;
export filter calico_kernel_programming; # Default is export none
graceful restart; # Turn on graceful restart to reduce potential flaps in
# routes when reloading BIRD configuration. With a full
# automatic mesh there is no way to prevent BGP from
# flapping, since multiple nodes update their BGP
# configuration at the same time, and GR is not guaranteed
# to work correctly in this scenario.
}
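# HSCLOUD note: calico_kernel_programming is defined in bird_ipam.cfg
# (included at the top of this file), which is overridden alongside this
# template so that unreachable routes - ie. the 127.0.0.1-next-hop routes
# learned from metallb - are not programmed into the kernel (see the commit
# message and sketch above).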
# Watch interface up/down events.
protocol device {
{{- template "LOGGING"}}
scan time 2; # Scan interfaces every 2 seconds
}
protocol direct {
{{- template "LOGGING"}}
interface -"cali*", -"kube-ipvs*", "*"; # Exclude cali* and kube-ipvs* but
# include everything else. In
# IPVS-mode, kube-proxy creates a
# kube-ipvs0 interface. We exclude
# kube-ipvs0 because this interface
# gets an address for every in use
# cluster IP. We use static routes
# for when we legitimately want to
# export cluster IPs.
}
{{if eq "" ($node_ip)}}# IPv4 disabled on this node.
{{else}}{{$node_as_key := printf "/host/%s/as_num" (getenv "NODENAME")}}
# Template for all BGP clients
template bgp bgp_template {
{{- $as_key := or (and (exists $node_as_key) $node_as_key) "/global/as_num"}}
{{- $node_as_num := getv $as_key}}
{{- template "LOGGING"}}
description "Connection to BGP peer";
local as {{$node_as_num}};
multihop;
gateway recursive; # This should be the default, but just in case.
import all; # Import all routes, since we don't know what the upstream
# topology is and therefore have to trust the ToR/RR.
export filter calico_export_to_bgp_peers; # Only want to export routes for workloads.
source address {{$node_ip}}; # The local address we use for the TCP connection
add paths on;
graceful restart; # See comment in kernel section about graceful restart.
connect delay time 2;
connect retry time 5;
error wait time 5,30;
}
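# Example of the rendered template above (hypothetical values: global as_num
# 65000, node IP 10.0.0.24; options identical on every peering are elided):
#   template bgp bgp_template {
#     description "Connection to BGP peer";
#     local as 65000;
#     source address 10.0.0.24;
#     ...
#   }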
# ------------- Node-to-node mesh -------------
{{- $node_cid_key := printf "/host/%s/rr_cluster_id" (getenv "NODENAME")}}
{{- $node_cluster_id := getv $node_cid_key}}
{{if (json (getv "/global/node_mesh")).enabled}}
{{range $host := lsdir "/host"}}
{{$onode_as_key := printf "/host/%s/as_num" .}}
{{$onode_ip_key := printf "/host/%s/ip_addr_v4" .}}{{if exists $onode_ip_key}}{{$onode_ip := getv $onode_ip_key}}
{{$nums := split $onode_ip "."}}{{$id := join $nums "_"}}
# For peer {{$onode_ip_key}}
{{if eq $onode_ip ($node_ip) }}# Skipping ourselves ({{$node_ip}})
{{else if ne "" $onode_ip}}protocol bgp Mesh_{{$id}} from bgp_template {
neighbor {{$onode_ip}} as {{if exists $onode_as_key}}{{getv $onode_as_key}}{{else}}{{getv "/global/as_num"}}{{end}};
{{- /*
Make the peering unidirectional. This avoids a race where
- peer A opens a connection and begins a graceful restart
- before the restart completes, peer B opens its connection
- peer A sees the new connection and aborts the graceful restart, causing a route flap.
*/ -}}
{{if gt $onode_ip $node_ip}}
passive on; # Mesh is unidirectional, peer will connect to us.
{{- end}}
}{{end}}{{end}}{{end}}
{{else}}
# Node-to-node mesh disabled
{{end}}
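# Example of a rendered mesh peering (hypothetical values: local node
# 10.0.0.24, peer 10.0.0.22, shared global as_num 65000; "passive on" is
# only emitted when the peer's IP string compares greater than the local
# one, so exactly one side of each pair initiates the connection):
#   protocol bgp Mesh_10_0_0_22 from bgp_template {
#     neighbor 10.0.0.22 as 65000;
#   }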
# ------------- Global peers -------------
{{if ls "/global/peer_v4"}}
{{range gets "/global/peer_v4/*"}}{{$data := json .Value}}
{{$nums := split $data.ip "."}}{{$id := join $nums "_"}}
# For peer {{.Key}}
{{- if eq $data.ip ($node_ip) }}
# Skipping ourselves ({{$node_ip}})
{{- else}}
protocol bgp Global_{{$id}} from bgp_template {
{{if eq $data.ip ("127.0.0.1")}}passive on; # HSCLOUD {{end}}
neighbor {{$data.ip}} as {{$data.as_num}};
{{- if and (eq $data.as_num $node_as_num) (ne "" ($node_cluster_id)) (ne $data.rr_cluster_id ($node_cluster_id))}}
rr client;
rr cluster id {{$node_cluster_id}};
{{- end}}
}
{{- end}}
{{end}}
{{else}}# No global peers configured.{{end}}
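# Example of the rendered HSCLOUD metallb peering (hypothetical values: a
# /global/peer_v4 entry of {"ip": "127.0.0.1", "as_num": 65000}, no route
# reflection):
#   protocol bgp Global_127_0_0_1 from bgp_template {
#     passive on; # HSCLOUD
#     neighbor 127.0.0.1 as 65000;
#   }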
# ------------- Node-specific peers -------------
{{$node_peers_key := printf "/host/%s/peer_v4" (getenv "NODENAME")}}
{{if ls $node_peers_key}}
{{range gets (printf "%s/*" $node_peers_key)}}{{$data := json .Value}}
{{$nums := split $data.ip "."}}{{$id := join $nums "_"}}
# For peer {{.Key}}
{{- if eq $data.ip ($node_ip) }}
# Skipping ourselves ({{$node_ip}})
{{- else}}
protocol bgp Node_{{$id}} from bgp_template {
neighbor {{$data.ip}} as {{$data.as_num}};
{{- if and (eq $data.as_num $node_as_num) (ne "" ($node_cluster_id)) (ne $data.rr_cluster_id ($node_cluster_id))}}
rr client;
rr cluster id {{$node_cluster_id}};
{{- end}}
}
{{- end}}
{{end}}
{{else}}# No node-specific peers configured.{{end}}
{{end}}{{/* End of IPv4 enable check */}}