// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port. This allows a decision to be made after finding
 * the first socket. An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */
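
/*
 * Illustrative userspace sketch (not part of this file): a reuseport group
 * is formed by binding several sockets to the same address and port with
 * SO_REUSEPORT set before bind(). The helper name below is hypothetical and
 * error handling is omitted.
 *
 *	int open_reuseport_udp(const struct sockaddr_in *addr)
 *	{
 *		int one = 1;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *		bind(fd, (const struct sockaddr *)addr, sizeof(*addr));
 *		return fd;
 *	}
 *
 * Each socket bound this way is placed into the shared socks[] array through
 * reuseport_alloc()/reuseport_add_sock() below.
 */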

#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

static DEFINE_SPINLOCK(reuseport_lock);

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
	unsigned int size = sizeof(struct sock_reuseport) +
			    sizeof(struct sock *) * max_socks;
	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

	if (!reuse)
		return NULL;

	reuse->max_socks = max_socks;

	RCU_INIT_POINTER(reuse->prog, NULL);
	return reuse;
}

int reuseport_alloc(struct sock *sk)
{
	struct sock_reuseport *reuse;

	/* bh lock used since this function call may precede hlist lock in
	 * soft irq of receive path or setsockopt from process context
	 */
	spin_lock_bh(&reuseport_lock);

	/* Allocation attempts can occur concurrently via the setsockopt path
	 * and the bind/hash path. Nothing to do when we lose the race.
	 */
	if (rcu_dereference_protected(sk->sk_reuseport_cb,
				      lockdep_is_held(&reuseport_lock)))
		goto out;

	reuse = __reuseport_alloc(INIT_SOCKS);
	if (!reuse) {
		spin_unlock_bh(&reuseport_lock);
		return -ENOMEM;
	}

	reuse->socks[0] = sk;
	reuse->num_socks = 1;
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
	spin_unlock_bh(&reuseport_lock);

	return 0;
}
EXPORT_SYMBOL(reuseport_alloc);

static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
	struct sock_reuseport *more_reuse;
	u32 more_socks_size, i;

	more_socks_size = reuse->max_socks * 2U;
	if (more_socks_size > U16_MAX)
		return NULL;

	more_reuse = __reuseport_alloc(more_socks_size);
	if (!more_reuse)
		return NULL;

	more_reuse->max_socks = more_socks_size;
	more_reuse->num_socks = reuse->num_socks;
	more_reuse->prog = reuse->prog;

	memcpy(more_reuse->socks, reuse->socks,
	       reuse->num_socks * sizeof(struct sock *));

	for (i = 0; i < reuse->num_socks; ++i)
		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
				   more_reuse);

	/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
	 * that reuse and more_reuse can temporarily share a reference
	 * to prog.
	 */
	kfree_rcu(reuse, rcu);
	return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
	struct sock_reuseport *reuse;

	reuse = container_of(head, struct sock_reuseport, rcu);
	if (reuse->prog)
		bpf_prog_destroy(reuse->prog);
	kfree(reuse);
}

/**
 * reuseport_add_sock - Add a socket to the reuseport group of another.
 * @sk: New socket to add to the group.
 * @sk2: Socket belonging to the existing reuseport group.
 * May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2)
{
	struct sock_reuseport *old_reuse, *reuse;

	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
		int err = reuseport_alloc(sk2);

		if (err)
			return err;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					      lockdep_is_held(&reuseport_lock));
	if (old_reuse && old_reuse->num_socks != 1) {
		spin_unlock_bh(&reuseport_lock);
		return -EBUSY;
	}

	if (reuse->num_socks == reuse->max_socks) {
		reuse = reuseport_grow(reuse);
		if (!reuse) {
			spin_unlock_bh(&reuseport_lock);
			return -ENOMEM;
		}
	}

	reuse->socks[reuse->num_socks] = sk;
	/* paired with smp_rmb() in reuseport_select_sock() */
	smp_wmb();
	reuse->num_socks++;
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	spin_unlock_bh(&reuseport_lock);

	if (old_reuse)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
	return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
	struct sock_reuseport *reuse;
	int i;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

	for (i = 0; i < reuse->num_socks; i++) {
		if (reuse->socks[i] == sk) {
			reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
			reuse->num_socks--;
			if (reuse->num_socks == 0)
				call_rcu(&reuse->rcu, reuseport_free_rcu);
			break;
		}
	}
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
			    struct bpf_prog *prog, struct sk_buff *skb,
			    int hdr_len)
{
	struct sk_buff *nskb = NULL;
	u32 index;

	if (skb_shared(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return NULL;
		skb = nskb;
	}

	/* temporarily advance data past protocol header */
	if (!pskb_pull(skb, hdr_len)) {
		kfree_skb(nskb);
		return NULL;
	}
	index = bpf_prog_run_save_cb(prog, skb);
	__skb_push(skb, hdr_len);

	consume_skb(nskb);

	if (index >= socks)
		return NULL;

	return reuse->socks[index];
}
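
/*
 * Illustrative sketch (not part of this file): a classic BPF program, as
 * attached with SO_ATTACH_REUSEPORT_CBPF, produces the index consumed by
 * run_bpf() above. The protocol header has already been pulled when the
 * program runs, so absolute loads see the payload. This example loads the
 * first 32-bit payload word, reduces it modulo an assumed group size of
 * four, and returns it; a return value >= the number of sockets makes
 * run_bpf() return NULL so the caller falls back to hash selection.
 *
 *	struct sock_filter code[] = {
 *		{ BPF_LD  | BPF_W   | BPF_ABS, 0, 0, 0 },
 *		{ BPF_ALU | BPF_MOD | BPF_K,   0, 0, 4 },
 *		{ BPF_RET | BPF_A,             0, 0, 0 },
 *	};
 */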

/**
 * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 * @sk: First socket in the group.
 * @hash: When no BPF filter is available, use this hash to select.
 * @skb: skb to run through BPF filter.
 * @hdr_len: BPF filter expects skb data pointer at payload data. If
 * the skb does not yet point at the payload, this parameter represents
 * how far the pointer needs to advance to reach the payload.
 * Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
				   u32 hash,
				   struct sk_buff *skb,
				   int hdr_len)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *prog;
	struct sock *sk2 = NULL;
	u16 socks;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);

	/* if memory allocation failed or add call is not yet complete */
	if (!reuse)
		goto out;

	prog = rcu_dereference(reuse->prog);
	socks = READ_ONCE(reuse->num_socks);
	if (likely(socks)) {
		/* paired with smp_wmb() in reuseport_add_sock() */
		smp_rmb();

		if (prog && skb)
			sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);

		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2)
			sk2 = reuse->socks[reciprocal_scale(hash, socks)];
	}

out:
	rcu_read_unlock();
	return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

struct bpf_prog *
reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_prog = rcu_dereference_protected(reuse->prog,
					     lockdep_is_held(&reuseport_lock));
	rcu_assign_pointer(reuse->prog, prog);
	spin_unlock_bh(&reuseport_lock);

	return old_prog;
}
EXPORT_SYMBOL(reuseport_attach_prog);
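
/*
 * Illustrative userspace sketch (not part of this file): reuseport_attach_prog()
 * is reached through setsockopt() with SO_ATTACH_REUSEPORT_CBPF (a classic BPF
 * program, e.g. the filter array sketched above) or SO_ATTACH_REUSEPORT_EBPF
 * (an eBPF program fd). The program is stored in the shared sock_reuseport, so
 * it applies to the whole group. Error handling is omitted and "code" refers
 * to the earlier example array.
 *
 *	struct sock_fprog fprog = {
 *		.len	= 3,
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
 *		   &fprog, sizeof(fprog));
 */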