// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port. This allows a decision to be made after finding
 * the first socket. An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */

#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

static DEFINE_SPINLOCK(reuseport_lock);

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
        unsigned int size = sizeof(struct sock_reuseport) +
                            sizeof(struct sock *) * max_socks;
        struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

        if (!reuse)
                return NULL;

        reuse->max_socks = max_socks;

        RCU_INIT_POINTER(reuse->prog, NULL);
        return reuse;
}

int reuseport_alloc(struct sock *sk)
{
        struct sock_reuseport *reuse;

        /* bh lock used since this function call may precede hlist lock in
         * soft irq of receive path or setsockopt from process context
         */
        spin_lock_bh(&reuseport_lock);

        /* Allocation attempts can occur concurrently via the setsockopt path
         * and the bind/hash path. Nothing to do when we lose the race.
         */
        if (rcu_dereference_protected(sk->sk_reuseport_cb,
                                      lockdep_is_held(&reuseport_lock)))
                goto out;

        reuse = __reuseport_alloc(INIT_SOCKS);
        if (!reuse) {
                spin_unlock_bh(&reuseport_lock);
                return -ENOMEM;
        }

        reuse->socks[0] = sk;
        reuse->num_socks = 1;
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
        spin_unlock_bh(&reuseport_lock);

        return 0;
}
EXPORT_SYMBOL(reuseport_alloc);

static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
        struct sock_reuseport *more_reuse;
        u32 more_socks_size, i;

        more_socks_size = reuse->max_socks * 2U;
        if (more_socks_size > U16_MAX)
                return NULL;

        more_reuse = __reuseport_alloc(more_socks_size);
        if (!more_reuse)
                return NULL;

        more_reuse->max_socks = more_socks_size;
        more_reuse->num_socks = reuse->num_socks;
        more_reuse->prog = reuse->prog;

        memcpy(more_reuse->socks, reuse->socks,
               reuse->num_socks * sizeof(struct sock *));

        for (i = 0; i < reuse->num_socks; ++i)
                rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
                                   more_reuse);

        /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
         * that reuse and more_reuse can temporarily share a reference
         * to prog.
         */
        kfree_rcu(reuse, rcu);
        return more_reuse;
}

/**
 * reuseport_add_sock - Add a socket to the reuseport group of another.
 * @sk:  New socket to add to the group.
 * @sk2: Socket belonging to the existing reuseport group.
 * May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2)
{
        struct sock_reuseport *reuse;

        if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
                int err = reuseport_alloc(sk2);

                if (err)
                        return err;
        }

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
                                            lockdep_is_held(&reuseport_lock)),
                  "socket already in reuseport group");

        if (reuse->num_socks == reuse->max_socks) {
                reuse = reuseport_grow(reuse);
                if (!reuse) {
                        spin_unlock_bh(&reuseport_lock);
                        return -ENOMEM;
                }
        }

        reuse->socks[reuse->num_socks] = sk;
        /* paired with smp_rmb() in reuseport_select_sock() */
        smp_wmb();
        reuse->num_socks++;
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

        spin_unlock_bh(&reuseport_lock);

        return 0;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
        struct sock_reuseport *reuse;

        reuse = container_of(head, struct sock_reuseport, rcu);
        if (reuse->prog)
                bpf_prog_destroy(reuse->prog);
        kfree(reuse);
}

void reuseport_detach_sock(struct sock *sk)
{
        struct sock_reuseport *reuse;
        int i;

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

        for (i = 0; i < reuse->num_socks; i++) {
                if (reuse->socks[i] == sk) {
                        reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
                        reuse->num_socks--;
                        if (reuse->num_socks == 0)
                                call_rcu(&reuse->rcu, reuseport_free_rcu);
                        break;
                }
        }
        spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
                            struct bpf_prog *prog, struct sk_buff *skb,
                            int hdr_len)
{
        struct sk_buff *nskb = NULL;
        u32 index;

        if (skb_shared(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return NULL;
                skb = nskb;
        }

        /* temporarily advance data past protocol header */
        if (!pskb_pull(skb, hdr_len)) {
                kfree_skb(nskb);
                return NULL;
        }
        index = bpf_prog_run_save_cb(prog, skb);
        __skb_push(skb, hdr_len);

        consume_skb(nskb);

        if (index >= socks)
                return NULL;

        return reuse->socks[index];
}
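
/*
 * Illustrative sketch (not used by this file): the kind of classic BPF
 * program a user could attach with the SO_ATTACH_REUSEPORT_CBPF socket
 * option so that run_bpf() above yields a socket index. The program
 * loads the ancillary "current CPU" value, reduces it modulo the number
 * of listeners (NUM_LISTENERS is an assumption for the example, not a
 * symbol defined here), and returns it:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
 *		{ BPF_ALU | BPF_MOD | BPF_K, 0, 0, NUM_LISTENERS },
 *		{ BPF_RET | BPF_A, 0, 0, 0 },
 *	};
 *
 * run_bpf() treats the return value as an index into socks[]; a value
 * greater than or equal to the current socket count selects nothing.
 */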

/**
 * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 * @sk: First socket in the group.
 * @hash: When no BPF filter is available, use this hash to select.
 * @skb: skb to run through BPF filter.
 * @hdr_len: BPF filter expects skb data pointer at payload data. If
 * the skb does not yet point at the payload, this parameter represents
 * how far the pointer needs to advance to reach the payload.
 * Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
                                   u32 hash,
                                   struct sk_buff *skb,
                                   int hdr_len)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *prog;
        struct sock *sk2 = NULL;
        u16 socks;

        rcu_read_lock();
        reuse = rcu_dereference(sk->sk_reuseport_cb);

        /* if memory allocation failed or add call is not yet complete */
        if (!reuse)
                goto out;

        prog = rcu_dereference(reuse->prog);
        socks = READ_ONCE(reuse->num_socks);
        if (likely(socks)) {
                /* paired with smp_wmb() in reuseport_add_sock() */
                smp_rmb();

                if (prog && skb)
                        sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
                else
                        sk2 = reuse->socks[reciprocal_scale(hash, socks)];
        }

out:
        rcu_read_unlock();
        return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

struct bpf_prog *
reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *old_prog;

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        old_prog = rcu_dereference_protected(reuse->prog,
                                             lockdep_is_held(&reuseport_lock));
        rcu_assign_pointer(reuse->prog, prog);
        spin_unlock_bh(&reuseport_lock);

        return old_prog;
}
EXPORT_SYMBOL(reuseport_attach_prog);
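
/*
 * The sketch below is a separate userspace illustration of the API this
 * file backs; it is not part of the kernel build above. It is a minimal
 * example, assuming the uapi socket options SO_REUSEPORT and
 * SO_ATTACH_REUSEPORT_CBPF are exposed by the libc headers (older libcs
 * may need <asm/socket.h> or the constants defined by hand). The port,
 * address family, and listener count are arbitrary choices for the
 * example, not values taken from this file.
 */
#include <arpa/inet.h>
#include <linux/filter.h>	/* struct sock_filter/sock_fprog, BPF_*, SKF_AD_* */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#define NUM_LISTENERS	4	/* size of the reuseport group we build */

/* Open one TCP listener that joins the port's reuseport group. */
static int open_reuseport_listener(unsigned short port)
{
        struct sockaddr_in addr = {
                .sin_family = AF_INET,
                .sin_addr.s_addr = htonl(INADDR_ANY),
                .sin_port = htons(port),
        };
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        /* SO_REUSEPORT must be set before bind() to join the group. */
        if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) ||
            bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
            listen(fd, 128)) {
                close(fd);
                return -1;
        }
        return fd;
}

int main(void)
{
        /* Same CPU-based selection program sketched after run_bpf(). */
        struct sock_filter code[] = {
                { BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
                { BPF_ALU | BPF_MOD | BPF_K, 0, 0, NUM_LISTENERS },
                { BPF_RET | BPF_A, 0, 0, 0 },
        };
        struct sock_fprog prog = {
                .len = sizeof(code) / sizeof(code[0]),
                .filter = code,
        };
        int fds[NUM_LISTENERS];
        int i;

        for (i = 0; i < NUM_LISTENERS; i++) {
                fds[i] = open_reuseport_listener(7777);
                if (fds[i] < 0) {
                        perror("listener");
                        return 1;
                }
        }

        /*
         * Attaching to any one member installs the program on the shared
         * sock_reuseport struct (reuseport_attach_prog() above), so each
         * incoming connection is steered by the filter's return value.
         */
        if (setsockopt(fds[0], SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
                       &prog, sizeof(prog))) {
                perror("SO_ATTACH_REUSEPORT_CBPF");
                return 1;
        }

        /* accept() on the fds as usual; selection now happens per packet. */
        pause();
        return 0;
}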