| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| */ |
| |
| #ifdef COMPAT_CANNOT_DEPRECIATE_BH_RCU |
| /* We normally alias all non-_bh functions to the _bh ones in the compat layer, |
| * but that's not appropriate here, where we actually do want non-_bh ones. |
| */ |
| #undef synchronize_rcu |
| #define synchronize_rcu old_synchronize_rcu |
| #undef call_rcu |
| #define call_rcu old_call_rcu |
| #undef rcu_barrier |
| #define rcu_barrier old_rcu_barrier |
| #endif |
| |
| #include "ratelimiter.h" |
| #include <linux/siphash.h> |
| #include <linux/mm.h> |
| #include <linux/slab.h> |
| #include <net/ip.h> |
| |
| static struct kmem_cache *entry_cache; |
| static hsiphash_key_t key; |
| static spinlock_t table_lock = __SPIN_LOCK_UNLOCKED("ratelimiter_table_lock"); |
| static DEFINE_MUTEX(init_lock); |
| static u64 init_refcnt; /* Protected by init_lock, hence not atomic. */ |
| static atomic_t total_entries = ATOMIC_INIT(0); |
| static unsigned int max_entries, table_size; |
| static void wg_ratelimiter_gc_entries(struct work_struct *); |
| static DECLARE_DEFERRABLE_WORK(gc_work, wg_ratelimiter_gc_entries); |
| static struct hlist_head *table_v4; |
| #if IS_ENABLED(CONFIG_IPV6) |
| static struct hlist_head *table_v6; |
| #endif |
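
/* All state here is global and shared by every WireGuard device: two
 * power-of-two-sized arrays of RCU-protected hash buckets, one per address
 * family, with a deferrable work item garbage-collecting idle entries once
 * per second.
 */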
| |
/* One bucket entry per (net, source address) pair: the full saddr for IPv4,
 * or the /64 prefix for IPv6.
 */
struct ratelimiter_entry {
	u64 last_time_ns, tokens, ip;	/* tokens is nanoseconds of credit */
	void *net;			/* compared only, never dereferenced */
	spinlock_t lock;		/* protects last_time_ns and tokens */
	struct hlist_node hash;
	struct rcu_head rcu;
};
| |
enum {
	PACKETS_PER_SECOND = 20,
	PACKETS_BURSTABLE = 5,
	PACKET_COST = NSEC_PER_SEC / PACKETS_PER_SECOND, /* ns per token */
	TOKEN_MAX = PACKET_COST * PACKETS_BURSTABLE /* a full bucket */
};
| |
/* RCU callback: runs only after a grace period, and decrements total_entries
 * only once the memory is actually returned, so the counter bounds live
 * allocations.
 */
static void entry_free(struct rcu_head *rcu)
| { |
| kmem_cache_free(entry_cache, |
| container_of(rcu, struct ratelimiter_entry, rcu)); |
| atomic_dec(&total_entries); |
| } |
| |
/* Unlink under table_lock; RCU readers walking the bucket may still see the
 * entry until the grace period elapses, hence the deferred free.
 */
static void entry_uninit(struct ratelimiter_entry *entry)
| { |
| hlist_del_rcu(&entry->hash); |
| call_rcu(&entry->rcu, entry_free); |
| } |
| |
/* Runs every second via deferrable work, dropping entries idle for longer
 * than a second. Calling this function with a NULL work uninits all entries.
 */
| static void wg_ratelimiter_gc_entries(struct work_struct *work) |
| { |
| const u64 now = ktime_get_coarse_boottime_ns(); |
| struct ratelimiter_entry *entry; |
| struct hlist_node *temp; |
| unsigned int i; |
| |
| for (i = 0; i < table_size; ++i) { |
| spin_lock(&table_lock); |
| hlist_for_each_entry_safe(entry, temp, &table_v4[i], hash) { |
| if (unlikely(!work) || |
| now - entry->last_time_ns > NSEC_PER_SEC) |
| entry_uninit(entry); |
| } |
| #if IS_ENABLED(CONFIG_IPV6) |
| hlist_for_each_entry_safe(entry, temp, &table_v6[i], hash) { |
| if (unlikely(!work) || |
| now - entry->last_time_ns > NSEC_PER_SEC) |
| entry_uninit(entry); |
| } |
| #endif |
| spin_unlock(&table_lock); |
| if (likely(work)) |
| cond_resched(); |
| } |
| if (likely(work)) |
| queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); |
| } |
| |
/* Returns true if the packet may pass: each (net, source address) pair is
 * allowed 20 packets per second with a burst of 5, and a previously unseen
 * address is admitted outright, unless the table is already at max_entries.
 */
bool wg_ratelimiter_allow(struct sk_buff *skb, struct net *net)
| { |
	/* We only take the bottom half of the net pointer, so that we can hash
	 * 3 words in the end: this word plus, in the IPv6 case, the two halves
	 * of the /64 prefix. This way, siphash's len param fits into the final
	 * u32, and we don't incur an extra round.
	 */
| const u32 net_word = (unsigned long)net; |
| struct ratelimiter_entry *entry; |
| struct hlist_head *bucket; |
| u64 ip; |
| |
| if (skb->protocol == htons(ETH_P_IP)) { |
| ip = (u64 __force)ip_hdr(skb)->saddr; |
| bucket = &table_v4[hsiphash_2u32(net_word, ip, &key) & |
| (table_size - 1)]; |
| } |
| #if IS_ENABLED(CONFIG_IPV6) |
| else if (skb->protocol == htons(ETH_P_IPV6)) { |
| /* Only use 64 bits, so as to ratelimit the whole /64. */ |
| memcpy(&ip, &ipv6_hdr(skb)->saddr, sizeof(ip)); |
| bucket = &table_v6[hsiphash_3u32(net_word, ip >> 32, ip, &key) & |
| (table_size - 1)]; |
| } |
| #endif |
| else |
| return false; |
| rcu_read_lock(); |
| hlist_for_each_entry_rcu(entry, bucket, hash) { |
| if (entry->net == net && entry->ip == ip) { |
| u64 now, tokens; |
| bool ret; |
| /* Quasi-inspired by nft_limit.c, but this is actually a |
| * slightly different algorithm. Namely, we incorporate |
| * the burst as part of the maximum tokens, rather than |
| * as part of the rate. |
| */ |
| spin_lock(&entry->lock); |
| now = ktime_get_coarse_boottime_ns(); |
| tokens = min_t(u64, TOKEN_MAX, |
| entry->tokens + now - |
| entry->last_time_ns); |
| entry->last_time_ns = now; |
| ret = tokens >= PACKET_COST; |
| entry->tokens = ret ? tokens - PACKET_COST : tokens; |
| spin_unlock(&entry->lock); |
| rcu_read_unlock(); |
| return ret; |
| } |
| } |
| rcu_read_unlock(); |
| |
| if (atomic_inc_return(&total_entries) > max_entries) |
| goto err_oom; |
| |
| entry = kmem_cache_alloc(entry_cache, GFP_KERNEL); |
| if (unlikely(!entry)) |
| goto err_oom; |
| |
| entry->net = net; |
| entry->ip = ip; |
| INIT_HLIST_NODE(&entry->hash); |
| spin_lock_init(&entry->lock); |
| entry->last_time_ns = ktime_get_coarse_boottime_ns(); |
| entry->tokens = TOKEN_MAX - PACKET_COST; |
| spin_lock(&table_lock); |
| hlist_add_head_rcu(&entry->hash, bucket); |
| spin_unlock(&table_lock); |
| return true; |
| |
| err_oom: |
| atomic_dec(&total_entries); |
| return false; |
| } |
| |
/* Reference-counted: only the first call allocates the tables, starts the
 * garbage collector, and seeds the hsiphash key. Pairs with
 * wg_ratelimiter_uninit().
 */
int wg_ratelimiter_init(void)
| { |
| mutex_lock(&init_lock); |
	/* Already initialized by an earlier caller; just take a reference. */
	if (++init_refcnt != 1)
		goto out;
| |
| entry_cache = KMEM_CACHE(ratelimiter_entry, 0); |
| if (!entry_cache) |
| goto err; |
| |
| /* xt_hashlimit.c uses a slightly different algorithm for ratelimiting, |
| * but what it shares in common is that it uses a massive hashtable. So, |
| * we borrow their wisdom about good table sizes on different systems |
| * dependent on RAM. This calculation here comes from there. |
| */ |
| table_size = (totalram_pages() > (1U << 30) / PAGE_SIZE) ? 8192 : |
| max_t(unsigned long, 16, roundup_pow_of_two( |
| (totalram_pages() << PAGE_SHIFT) / |
| (1U << 14) / sizeof(struct hlist_head))); |
| max_entries = table_size * 8; |
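
	/* For illustration, with 4 KiB pages on a 64-bit machine: more than
	 * 1 GiB of RAM gets the 8192-bucket cap, while 512 MiB works out to
	 * roundup_pow_of_two(512 MiB / 16384 / 8) = 4096 buckets and thus
	 * max_entries = 32768.
	 */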
| |
| table_v4 = kvzalloc(table_size * sizeof(*table_v4), GFP_KERNEL); |
| if (unlikely(!table_v4)) |
| goto err_kmemcache; |
| |
| #if IS_ENABLED(CONFIG_IPV6) |
| table_v6 = kvzalloc(table_size * sizeof(*table_v6), GFP_KERNEL); |
| if (unlikely(!table_v6)) { |
| kvfree(table_v4); |
| goto err_kmemcache; |
| } |
| #endif |
| |
| queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); |
| get_random_bytes(&key, sizeof(key)); |
| out: |
| mutex_unlock(&init_lock); |
| return 0; |
| |
| err_kmemcache: |
| kmem_cache_destroy(entry_cache); |
| err: |
| --init_refcnt; |
| mutex_unlock(&init_lock); |
| return -ENOMEM; |
| } |
| |
/* Drops a reference; the last caller cancels the garbage collector, flushes
 * all remaining entries, and waits for pending RCU callbacks before freeing
 * the tables.
 */
void wg_ratelimiter_uninit(void)
| { |
| mutex_lock(&init_lock); |
	/* Bail if never initialized, or if other users still hold a reference. */
	if (!init_refcnt || --init_refcnt)
		goto out;
| |
| cancel_delayed_work_sync(&gc_work); |
| wg_ratelimiter_gc_entries(NULL); |
| rcu_barrier(); |
| kvfree(table_v4); |
| #if IS_ENABLED(CONFIG_IPV6) |
| kvfree(table_v6); |
| #endif |
| kmem_cache_destroy(entry_cache); |
| out: |
| mutex_unlock(&init_lock); |
| } |
| |
| #include "selftest/ratelimiter.c" |