// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2019-2020 Arm Ltd.

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kernel.h>

#include <net/checksum.h>

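/*
 * 64-bit ones'-complement addition: add the two words and feed any carry
 * out of bit 63 back into bit 0 ("end-around carry").
 */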
static u64 accumulate(u64 sum, u64 data)
{
	sum += data;
	if (sum < data)
		sum += 1;
	return sum;
}

/*
 * We over-read the buffer and this makes KASAN unhappy. Instead, disable
 * instrumentation and call kasan explicitly.
 */
unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
{
	unsigned int offset, shift, sum;
	const u64 *ptr;
	u64 data, sum64 = 0;

	if (unlikely(len <= 0))
		return 0;

	offset = (unsigned long)buff & 7;
	/*
	 * This is to all intents and purposes safe, since rounding down cannot
	 * result in a different page or cache line being accessed, and @buff
	 * should absolutely not be pointing to anything read-sensitive. We do,
	 * however, have to be careful not to piss off KASAN, which means using
	 * unchecked reads to accommodate the head and tail, for which we'll
	 * compensate with an explicit check up-front.
	 */
	kasan_check_read(buff, len);
	ptr = (u64 *)(buff - offset);
	len = len + offset - 8;
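	/*
	 * From here on, len counts the bytes remaining beyond the first
	 * (possibly partial) dword; it is zero or negative when the whole
	 * buffer fits in that first dword.
	 */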

	/*
	 * Head: zero out any excess leading bytes. Shifting back by the same
	 * amount should be at least as fast as any other way of handling the
	 * odd/even alignment, and means we can ignore it until the very end.
	 */
	shift = offset * 8;
	data = *ptr++;
	data = (data >> shift) << shift;

	/*
	 * Body: straightforward aligned loads from here on (the paired loads
	 * underlying the quadword type still only need dword alignment). The
	 * main loop strictly excludes the tail, so the second loop will always
	 * run at least once.
	 */
	while (unlikely(len > 64)) {
		__uint128_t tmp1, tmp2, tmp3, tmp4;

		tmp1 = *(__uint128_t *)ptr;
		tmp2 = *(__uint128_t *)(ptr + 2);
		tmp3 = *(__uint128_t *)(ptr + 4);
		tmp4 = *(__uint128_t *)(ptr + 6);

		len -= 64;
		ptr += 8;

		/* This is the "don't dump the carry flag into a GPR" idiom */
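		/*
		 * Adding a quadword to a copy of itself rotated by 64 bits
		 * leaves the 64-bit ones'-complement sum of its two halves
		 * (including the end-around carry) in the upper half. The
		 * four partial sums are then merged pairwise and finally
		 * folded into sum64 the same way.
		 */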
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | sum64;
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		sum64 = tmp1 >> 64;
	}
	while (len > 8) {
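		/*
		 * data lags one dword behind the loads so that the final
		 * dword is never accumulated prematurely and can have its
		 * over-read bytes masked off by the tail handling below.
		 */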
		__uint128_t tmp;

		sum64 = accumulate(sum64, data);
		tmp = *(__uint128_t *)ptr;

		len -= 16;
		ptr += 2;

		data = tmp >> 64;
		sum64 = accumulate(sum64, tmp);
	}
	if (len > 0) {
		sum64 = accumulate(sum64, data);
		data = *ptr;
		len -= 8;
	}
	/*
	 * Tail: zero any over-read bytes similarly to the head, again
	 * preserving odd/even alignment.
	 */
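	/* len is in [-7, 0] here, so shift covers the 0..56 over-read bits. */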
	shift = len * -8;
	data = (data << shift) >> shift;
	sum64 = accumulate(sum64, data);

	/* Finally, folding */
	sum64 += (sum64 >> 32) | (sum64 << 32);
	sum = sum64 >> 32;
	sum += (sum >> 16) | (sum << 16);
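	/*
	 * With an odd offset the data was effectively summed byte-swapped;
	 * the ones'-complement sum of byte-swapped data is the byte-swap of
	 * the sum, so undo it here.
	 */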
	if (offset & 1)
		return (u16)swab32(sum);

	return sum >> 16;
}

__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
			const struct in6_addr *daddr,
			__u32 len, __u8 proto, __wsum csum)
{
	__uint128_t src, dst;
	u64 sum = (__force u64)csum;

	src = *(const __uint128_t *)saddr->s6_addr;
	dst = *(const __uint128_t *)daddr->s6_addr;

	sum += (__force u32)htonl(len);
	sum += (u32)proto << 24;
	src += (src >> 64) | (src << 64);
	dst += (dst >> 64) | (dst << 64);
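	/*
	 * Same rotate-and-add idiom as in do_csum(): the upper 64 bits of
	 * src and dst now hold the folded sum of each 128-bit address.
	 */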

	sum = accumulate(sum, src >> 64);
	sum = accumulate(sum, dst >> 64);

	sum += ((sum >> 32) | (sum << 32));
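	/*
	 * The upper 32 bits now hold the folded 32-bit sum; csum_fold()
	 * reduces it to the final 16-bit checksum.
	 */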
	return csum_fold((__force __wsum)(sum >> 32));
}
EXPORT_SYMBOL(csum_ipv6_magic);