/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * memcmp - compare two memory areas (when the two areas have different
 * offsets within an 8-byte word, unaligned accesses are left to the
 * hardware).
 *
 * Parameters:
 *	x0 - const memory area 1 pointer
 *	x1 - const memory area 2 pointer
 *	x2 - the maximal compare byte length
 * Returns:
 *	x0 - zero when x2 is zero or all compared bytes are equal; otherwise
 *	     (byte from area 1) - (byte from area 2) for the first differing
 *	     pair, both bytes zero-extended, so the sign gives the ordering.
 *
 * Registers written: x0-x4, x6-x13 and the condition flags.
 * The stack pointer is never referenced.
 *
 * Three strategies are used, chosen from the low three address bits:
 *   - both pointers 8-byte aligned:       .Lloop_aligned
 *   - same non-zero offset in both:        .Lmutual_align -> .Lloop_aligned
 *   - different offsets:                   .Lmisaligned8
 *
 * CPU_LE()/CPU_BE() are not defined in this file; they are expected to emit
 * their argument only when building for the matching endianness.
 */

/* Parameters and result. */
src1		.req	x0
src2		.req	x1
limit		.req	x2
result		.req	x0	/* Same register as src1: written only on exit. */

/* Internal variables. */
data1		.req	x3
data1w		.req	w3
data2		.req	x4
data2w		.req	w4
diff		.req	x6
endloop		.req	x7
tmp1		.req	x8
tmp2		.req	x9
tmp3		.req	x10
pos		.req	x11
limit_wd	.req	x12
mask		.req	x13

ENTRY(memcmp)
	cbz	limit, .Lret0
	eor	tmp1, src1, src2
	tst	tmp1, #7		/* Do the low 3 address bits differ? */
	b.ne	.Lmisaligned8
	ands	tmp1, src1, #7		/* tmp1 = common offset within a Dword. */
	b.ne	.Lmutual_align
	/* limit_wd = index of the last Dword that holds valid bytes. */
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow. */
	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords. */
	/*
	 * The input source addresses are at an alignment boundary.
	 * Directly compare eight bytes each time.
	 */
.Lloop_aligned:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned:
	/*
	 * SUBS borrows (carry clear) only when limit_wd was already zero,
	 * i.e. this was the last Dword; CSINV then forces endloop to ~0 so
	 * the loop exits even when the data are equal.
	 */
	subs	limit_wd, limit_wd, #1
	eor	diff, data1, data2	/* Non-zero if differences found. */
	csinv	endloop, diff, xzr, cs	/* Last Dword or differences. */
	cbz	endloop, .Lloop_aligned

	/* Not reached the limit (limit_wd >= 0), must have found a diff. */
	tbz	limit_wd, #63, .Lnot_limit

	/*
	 * This was the last Dword. When limit % 8 == 0 all eight bytes of it
	 * are significant, so no masking is needed (diff may be zero here).
	 */
	ands	limit, limit, #7
	b.eq	.Lnot_limit
	/*
	 * Fewer than 8 bytes remain. Extract the valid data from the last
	 * eight bytes: clear the bytes past the limit in both operands and
	 * set them in diff, so that the search in .Lnot_limit stops either at
	 * a real difference or at the first (now equal) byte past the limit.
	 */
	lsl	limit, limit, #3	/* bytes -> bits. */
	mov	mask, #~0
CPU_BE( lsr	mask, mask, limit )
CPU_LE( lsl	mask, mask, limit )
	bic	data1, data1, mask
	bic	data2, data2, mask

	orr	diff, diff, mask
	b	.Lnot_limit

.Lmutual_align:
	/*
	 * Sources are mutually aligned, but are not currently at an
	 * alignment boundary. Round down the addresses and then mask off
	 * the bytes that precede the start point.
	 */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	/*
	 * The Dword count must cover limit + tmp1 bytes, but that sum could
	 * overflow, so it is built from the split quotient/remainder of
	 * (limit - 1) instead:
	 *   limit_wd = (limit - 1) / 8 + (((limit - 1) % 8) + tmp1) / 8
	 */
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow. */
	and	tmp3, limit_wd, #7
	lsr	limit_wd, limit_wd, #3
	add	tmp3, tmp3, tmp1
	add	limit_wd, limit_wd, tmp3, lsr #3
	/*
	 * Only (limit & 7) is read after this point (in the last-Dword
	 * masking above), so a wrap-around of this addition is harmless.
	 */
	add	limit, limit, tmp1	/* Adjust the limit for the extra. */

	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits. */
	/*
	 * Register-specified shifts use the amount modulo 64, so shifting
	 * by -bits is the same as shifting by (64 - bits).
	 */
	neg	tmp1, tmp1
	mov	tmp2, #~0
	/*
	 * Build a mask of the bytes that precede the start address and force
	 * them to 0xff in both operands so they always compare equal.
	 */
	/* Big-endian. Early bytes are at MSB. */
CPU_BE( lsl	tmp2, tmp2, tmp1 )
	/* Little-endian. Early bytes are at LSB. */
CPU_LE( lsr	tmp2, tmp2, tmp1 )

	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	.Lstart_realigned

	/* src1 and src2 have different alignment offsets. */
.Lmisaligned8:
	cmp	limit, #8
	b.lo	.Ltiny8proc		/* limit < 8: compare byte by byte. */

	/* tmp1/tmp2 = bytes up to the next 8-byte boundary, in 1..8. */
	and	tmp1, src1, #7
	neg	tmp1, tmp1
	add	tmp1, tmp1, #8		/* Valid length in the first 8 bytes of src1. */
	and	tmp2, src2, #7
	neg	tmp2, tmp2
	add	tmp2, tmp2, #8		/* Valid length in the first 8 bytes of src2. */
	/* tmp3 = tmp1 - tmp2: non-zero here, its sign tells which is larger. */
	subs	tmp3, tmp1, tmp2
	csel	pos, tmp1, tmp2, hi	/* Choose the maximum. */

	/* pos <= 8 <= limit, so this cannot underflow. */
	sub	limit, limit, pos
	/* Compare the leading bytes in the first 8-byte segment, one by one. */
.Ltinycmp:
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	pos, pos, #1
	/*
	 * While bytes remain (pos != 0) compare the pair; once pos reaches
	 * zero force NZCV to 0b0000 ("not equal") so that the loop ends.
	 */
	ccmp	data1w, data2w, #0, ne	/* NZCV = 0b0000. */
	b.eq	.Ltinycmp
	cbnz	pos, 1f			/* Diff occurred before the last byte. */
	cmp	data1w, data2w		/* Last byte: was it really different? */
	b.eq	.Lstart_align
1:
	sub	result, data1, data2
	ret

.Lstart_align:
	lsr	limit_wd, limit, #3	/* Whole Dwords left to compare. */
	cbz	limit_wd, .Lremain8

	/* src1 is already aligned when tmp1 was the maximum (tmp3 > 0). */
	tst	src1, #7
	b.eq	.Lrecal_offset
	/*
	 * Otherwise tmp3 < 0 and src1 is -tmp3 bytes past a boundary. Move
	 * both pointers back by that amount (re-comparing bytes already known
	 * to be equal) so that src1 becomes aligned, and give the bytes back
	 * to limit.
	 */
	add	src1, src1, tmp3	/* Move src1 back to the alignment boundary. */
	add	src2, src2, tmp3
	sub	limit, limit, tmp3
	lsr	limit_wd, limit, #3
	cbz	limit_wd, .Lremain8
	/* Load 8 bytes from the aligned src1 (src2 is unaligned). */
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8

	subs	limit_wd, limit_wd, #1
	eor	diff, data1, data2	/* Non-zero if differences found. */
	/* endloop = diff while Dwords remain, ~0 once limit_wd hits zero. */
	csinv	endloop, diff, xzr, ne
	cbnz	endloop, .Lunequal_proc
	/* How far the current src2 is past its alignment boundary. */
	and	tmp3, tmp3, #7

.Lrecal_offset:	/* src1 is aligned now; src2 is tmp3 bytes past a boundary. */
	neg	pos, tmp3
.Lloopcmp_proc:
	/*
	 * Divide the eight bytes into two parts. First, step back to the
	 * src2 alignment boundary (pos = -tmp3), load eight bytes from both
	 * sources there and compare. If all 8 bytes are equal, then start
	 * the second part's comparison at the current pointers. Otherwise
	 * finish the comparison.
	 * This special handling is intended to guarantee that all accesses
	 * stay inside the task's address space and avoid out-of-range reads.
	 */
	ldr	data1, [src1,pos]
	ldr	data2, [src2,pos]
	eor	diff, data1, data2	/* Non-zero if differences found. */
	cbnz	diff, .Lnot_limit

	/* The second part. */
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	eor	diff, data1, data2	/* Non-zero if differences found. */
	subs	limit_wd, limit_wd, #1
	csinv	endloop, diff, xzr, ne	/* If limit_wd is 0, finish the compare. */
	cbz	endloop, .Lloopcmp_proc
.Lunequal_proc:
	/* Out of whole Dwords with equal data: handle the trailing bytes. */
	cbz	diff, .Lremain8

	/*
	 * A difference occurred in the latest comparison, or (with diff
	 * possibly zero) the aligned loop ended on a full last Dword.
	 */
.Lnot_limit:
	/*
	 * For little endian, byte-reverse so that the lowest-addressed bytes
	 * end up in the most significant bits; the following CLZ can then
	 * find how many leading bits are equal.
	 */
CPU_LE( rev	diff, diff )
CPU_LE( rev	data1, data1 )
CPU_LE( rev	data2, data2 )

	/*
	 * The MS-non-zero bit of DIFF marks either the first bit
	 * that is different, or the end of the significant data.
	 * Shifting left now will bring the critical information into the
	 * top bits.
	 * If diff is zero, CLZ returns 64; the register shifts below use the
	 * amount modulo 64, so nothing moves and equal data yield zero.
	 */
	clz	pos, diff
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/*
	 * We need to zero-extend (char is unsigned) the value and then
	 * perform a signed subtraction.
	 */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret

.Lremain8:
	/* Limit % 8 == 0 => all data are equal. */
	ands	limit, limit, #7
	b.eq	.Lret0

	/* Byte-by-byte compare of the last 1..7 bytes (limit != 0 on entry). */
.Ltiny8proc:
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1

	/* Continue only while bytes remain and the current pair is equal. */
	ccmp	data1w, data2w, #0, ne	/* NZCV = 0b0000. */
	b.eq	.Ltiny8proc
	/* Zero if the loop ended on an equal final pair, else the difference. */
	sub	result, data1, data2
	ret
.Lret0:
	mov	result, #0
	ret
ENDPROC(memcmp)