/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard do_csum() function
 *
 * Return: a 64bit quantity containing the 16bit Internet checksum
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int)
 *
 * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
 *		Data locality study on the checksum buffer.
 *		More optimization cleanup - remove excessive stop bits.
 * 02/04/08	David Mosberger <davidm@hpl.hp.com>
 *		More cleanup and tuning.
 * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
 *		Clean up and optimize the software pipeline, loading two
 *		back-to-back 8-byte words per loop. Clean up the initialization
 *		for the loop. Support the cases where load latency = 1 or 2.
 *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
 */

#include <asm/asmmacro.h>

//
// Theory of operations:
//	The goal is to go as quickly as possible to the point where
//	we can checksum 16 bytes/loop. Before reaching that point we must
//	take care of incorrect alignment of first byte.
//
//	The code hereafter also takes care of the "tail" part of the buffer
//	before entering the core loop, if any. The checksum is a sum so it
//	allows us to commute operations. So we do the "head" and "tail"
//	first to finish at full speed in the body. Once we get the head and
//	tail values, we feed them into the pipeline, very handy initialization.
//
//	Of course we deal with the special case where the whole buffer fits
//	into one 8 byte word. In this case we have only one entry in the pipeline.
//
//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
//	possible load latency and also to accommodate for head and tail.
//
//	The end of the function deals with folding the checksum from 64bits
//	down to 16bits taking care of the carry.
//
//	This version avoids synchronization in the core loop by also using a
//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
//
//	 wordx[] (x=1,2)
//	|---|
//	|   | 0			: new value loaded in pipeline
//	|---|
//	|   | -			: in transit data
//	|---|
//	|   | LOAD_LATENCY	: current value to add to checksum
//	|---|
//	|   | LOAD_LATENCY+1	: previous value added to checksum
//	|---|			(previous iteration)
//
//	resultx[] (x=1,2)
//	|---|
//	|   | 0			: initial value
//	|---|
//	|   | LOAD_LATENCY-1	: new checksum
//	|---|
//	|   | LOAD_LATENCY	: previous value of checksum
//	|---|
//	|   | LOAD_LATENCY+1	: final checksum when out of the loop
//	|---|
//
//
//	See RFC1071 "Computing the Internet Checksum" for various techniques for
//	calculating the Internet checksum.
//
// NOT YET DONE:
//	- Maybe another algorithm which would take care of the folding at the
//	  end in a different manner
//	- Work with people more knowledgeable than me on the network stack
//	  to figure out if we could not split the function depending on the
//	  type of packet or alignment we get. Like the ip_fast_csum() routine
//	  where we know we have at least 20bytes worth of data to checksum.
//	- Do a better job of handling small packets.
//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
//	  on the data that buffer points to (partly because the checksum is often preceded by
//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
//	  the data is already in the cache.
//

// Static scratch registers used by do_csum.
#define saved_pfs	r11
#define hmask		r16
#define tmask		r17
#define first1		r18
#define firstval	r19
#define firstoff	r20
#define last		r21
#define lastval		r22
#define lastoff		r23
#define saved_lc	r24
#define saved_pr	r25
#define tmp1		r26
#define tmp2		r27
#define tmp3		r28
#define carry1		r29
#define carry2		r30
#define first2		r31

// Input registers.
#define buf		in0
#define len		in1

#define LOAD_LATENCY	2	// XXX fix me

#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
#endif

#define PIPE_DEPTH			(LOAD_LATENCY+2)
#define ELD	p[LOAD_LATENCY]		// end of load
#define ELD_1	p[LOAD_LATENCY+1]	// and next stage

// unsigned long do_csum(unsigned char *buf,long len)
//
// Register usage, as set up by the alloc/.rotr/.rotp directives below:
//	in0 (buf), in1 (len)	: the two inputs
//	word1[], word2[]	: rotating registers receiving the loaded words
//	result1[], result2[]	: rotating registers holding the two running sums
//	p[], pC1[], pC2[]	: rotating predicates (pipeline stage / carry pending)
//	p15			: set when buf is an odd address (bytes swapped on exit)
//
// saved_pfs, saved_pr and saved_lc hold the entry values of ar.pfs, pr and
// ar.lc; they are written back on the common exit path (.do_csum_exit).
// A length that is zero or negative returns 0 before any of them is modified.

GLOBAL_ENTRY(do_csum)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,2,16,0,16
	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
	mov ret0=r0		// in case we have zero length
	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
	;;
	add tmp1=buf,len	// buf+len: address one past the last byte
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates (rotation)
(p6)	br.ret.spnt.many rp	// return if zero or negative length

	mov hmask=-1		// initialize head mask
	tbit.nz p15,p0=buf,0	// is buf an odd address?
	and first1=-8,buf	// 8-byte align down address of first1 element

	and firstoff=7,buf	// how many bytes off for first1 element
	mov tmask=-1		// initialize tail mask

	;;
	adds tmp2=-1,tmp1	// buf+len-1: address of the last byte
	and lastoff=7,tmp1	// how many bytes off for last element
	;;
	sub tmp1=8,lastoff	// complement to lastoff
	and last=-8,tmp2	// address of word containing last byte
	;;
	sub tmp3=last,first1	// tmp3=distance from first1 to last
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// save lc
	cmp.eq p8,p9=last,first1	// everything fits in one word ?

	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
	shl tmp2=firstoff,3	// number of bits
	;;
(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
	shl tmp1=tmp1,3		// number of bits
(p9)	adds tmp3=-8,tmp3	// effectively loaded
	;;
(p8)	mov lastval=r0		// we don't need lastval if first1==last
	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
	;;
	.body
#define count tmp3		// from here on tmp3 counts the 8-byte words left to sum

(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
(p9)	and word2[0]=lastval,tmask	// mask the last word as appropriate
	shr.u count=count,3	// how many 8-byte?
	;;
	// If count is odd, finish this 8-byte word so that we can
	// load two back-to-back 8-byte words per loop thereafter.
	and word1[0]=firstval,hmask	// and mask it as appropriate
	tbit.nz p10,p11=count,0		// if (count is odd)
	;;
(p8)	mov result1[0]=word1[0]
(p9)	add result1[0]=word1[0],word2[0]
	;;
	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
	;;
(p6)	adds result1[0]=1,result1[0]	// end-around carry: add the carry-out back in
(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
(p11)	br.cond.dptk .do_csum16		// if (count is even)

	// Here count is odd.
	ld8 word1[1]=[first1],8		// load an 8-byte word
	cmp.eq p9,p10=1,count		// if (count == 1)
	adds count=-1,count		// loaded an 8-byte word
	;;
	add result1[0]=result1[0],word1[1]
	;;
	cmp.ltu p6,p0=result1[0],word1[1]
	;;
(p6)	adds result1[0]=1,result1[0]
(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
	// Fall through to calculate the checksum, feeding result1[0] as
	// the initial value in result1[0].
	//
	// Calculate the checksum loading two 8-byte words per loop.
	//
.do_csum16:
	add first2=8,first1	// second load stream starts one word after the first
	shr.u count=count,1	// we do 16 bytes per loop
	;;
	adds count=-1,count
	mov carry1=r0
	mov carry2=r0
	brp.loop.imp 1f,2f
	;;
	mov ar.ec=PIPE_DEPTH
	mov ar.lc=count	// set lc
	mov pr.rot=1<<16
	// result1[0] must be initialized in advance.
	mov result2[0]=r0
	;;
	.align 32
1:
(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(pC1[1])adds carry1=1,carry1
(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(pC2[1])adds carry2=1,carry2
(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2:
(p[0])	ld8 word1[0]=[first1],16
(p[0])	ld8 word2[0]=[first2],16
	br.ctop.sptk 1b
	;;
	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
(pC1[1])adds carry1=1,carry1	// since we miss the last one
(pC2[1])adds carry2=1,carry2
	;;
	// Add the accumulated carries back into each running sum.
	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
	;;
	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
	;;
(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
	;;
	// Merge the two running sums into result1[0], again with end-around carry.
	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
	;;
	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
	;;
(p6)	adds result1[0]=1,result1[0]
	;;
.do_csum_exit:
	//
	// now fold 64 into 16 bits taking care of carry
	// that's not very good because it has lots of sequentiality
	//
	mov tmp3=0xffff
	zxt4 tmp1=result1[0]
	shr.u tmp2=result1[0],32
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add ret0=tmp1,tmp2
	mov pr=saved_pr,0xffffffffffff0000
	;;
	// if buf was odd then swap bytes
	mov ar.pfs=saved_pfs		// restore ar.ec
(p15)	mux1 ret0=ret0,@rev		// reverse word
	;;
	mov ar.lc=saved_lc
(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
	br.ret.sptk.many rp

//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
//	not much better than the original. So keep the original there so that
//	someone else can challenge.
//
//	shr.u word1[0]=result1[0],32
//	zxt4 result1[0]=result1[0]
//	;;
//	add result1[0]=result1[0],word1[0]
//	;;
//	zxt2 result2[0]=result1[0]
//	extr.u word1[0]=result1[0],16,16
//	shr.u carry1=result1[0],32
//	;;
//	add result2[0]=result2[0],word1[0]
//	;;
//	add result2[0]=result2[0],carry1
//	;;
//	extr.u ret0=result2[0],16,16
//	;;
//	add ret0=ret0,result2[0]
//	;;
//	zxt2 ret0=ret0
//	mov ar.pfs=saved_pfs		 // restore ar.ec
//	mov pr=saved_pr,0xffffffffffff0000
//	;;
//	// if buf was odd then swap bytes
//	mov ar.lc=saved_lc
//(p15)	mux1 ret0=ret0,@rev		// reverse word
//	;;
//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
//	br.ret.sptk.many rp

END(do_csum)