/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>

struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	struct hlist_node hash_node;
	struct rcu_head rcu;
	u32 hash;
	char key[0] __aligned(8);
};

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	struct bpf_htab *htab;
	int err, i;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8) +
			  htab->map.value_size;

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	if ((u64) htab->n_buckets * sizeof(struct bucket) +
	    (u64) htab->elem_size * htab->map.max_entries >=
	    U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
				   htab->elem_size * htab->map.max_entries,
				   PAGE_SIZE) >> PAGE_SHIFT;

	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
				      GFP_USER | __GFP_NOWARN);

	if (!htab->buckets) {
		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
		if (!htab->buckets)
			goto free_htab;
	}

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	atomic_set(&htab->count, 0);

	return &htab->map;

free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-NULL, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old;
	struct hlist_head *head;
	struct bucket *b;
	unsigned long flags;
	u32 key_size;
	int ret;

	if (map_flags > BPF_EXIST)
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* allocate new element outside of lock */
	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!l_new)
		return -ENOMEM;

	key_size = map->key_size;

	memcpy(l_new->key, key, key_size);
	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

	l_new->hash = htab_map_hash(l_new->key, key_size);
	b = __select_bucket(htab, l_new->hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);

	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
		/* if elem with this 'key' doesn't exist and we've reached
		 * max_entries limit, fail insertion of new elem
		 */
		ret = -E2BIG;
		goto err;
	}

	if (l_old && map_flags == BPF_NOEXIST) {
		/* elem already exists */
		ret = -EEXIST;
		goto err;
	}

	if (!l_old && map_flags == BPF_EXIST) {
		/* elem doesn't exist, cannot update it */
		ret = -ENOENT;
		goto err;
	}

	/* add new element to the head of the list, so that concurrent
	 * search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		kfree_rcu(l_old, rcu);
	} else {
		atomic_inc(&htab->count);
	}
	raw_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	kfree(l_new);
	return ret;
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		atomic_dec(&htab->count);
		kfree_rcu(l, rcu);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			atomic_dec(&htab->count);
			kfree(l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of kfree_rcu() callbacks for elements of this map may not have
	 * executed. It's ok. Proceed to free residual elements and map itself
	 */
	delete_all_elements(htab);
	kvfree(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	return 0;
}
late_initcall(register_htab_map);
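
For context, a minimal userspace sketch of driving this map type through the bpf(2) syscall. It assumes the BPF_MAP_CREATE / BPF_MAP_UPDATE_ELEM / BPF_MAP_LOOKUP_ELEM commands and the union bpf_attr field names from the uapi <linux/bpf.h> of this period; the helpers sys_bpf() and ptr_to_u64() are local to the example, and libc is assumed to provide no bpf() wrapper, hence the raw syscall().

	#include <linux/bpf.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdio.h>

	/* convert a user pointer to the __aligned_u64 fields of bpf_attr */
	static __u64 ptr_to_u64(const void *ptr)
	{
		return (__u64)(unsigned long)ptr;
	}

	static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
	{
		return syscall(__NR_bpf, cmd, attr, size);
	}

	int main(void)
	{
		union bpf_attr attr;
		__u64 key = 1, value = 42, out;
		int map_fd;

		/* BPF_MAP_CREATE: these are the "mandatory map attributes"
		 * validated in htab_map_alloc()
		 */
		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_HASH;
		attr.key_size = sizeof(key);
		attr.value_size = sizeof(value);
		attr.max_entries = 1024;
		map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
		if (map_fd < 0)
			return 1;

		/* insert-only update; BPF_ANY/BPF_NOEXIST/BPF_EXIST select the
		 * map_flags branches in htab_map_update_elem()
		 */
		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = ptr_to_u64(&key);
		attr.value = ptr_to_u64(&value);
		attr.flags = BPF_NOEXIST;
		if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
			return 1;

		/* lookup copies out the value located by htab_map_lookup_elem() */
		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = ptr_to_u64(&key);
		attr.value = ptr_to_u64(&out);
		if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
			return 1;

		printf("value = %llu\n", (unsigned long long)out);
		close(map_fd);
		return 0;
	}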