/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */

#define L(label) .L ## label

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data1h		x4
#define data2		x5
#define data2w		w5
#define data2h		x6
#define tmp1		x7
#define tmp2		x8

/*
 * __pi_memcmp - compare two memory regions.
 *
 * In:	x0 (src1)   = start of the first buffer
 *	x1 (src2)   = start of the second buffer
 *	x2 (limit)  = number of bytes to compare
 * Out:	w0 (result) = 0 when all compared bytes are equal.  Otherwise the
 *		      sign follows the unsigned comparison of the first
 *		      differing byte (src1 byte minus src2 byte): the word
 *		      paths return -1 or 1, the byte loop returns the byte
 *		      difference itself.
 * Clobbers: x0-x7 and the NZCV flags.  Leaf routine, no stack usage.
 *
 * Every path keeps its loads inside [src, src + limit).  Tails are handled
 * by re-reading bytes that were already compared (overlapping unaligned
 * loads) rather than by reading past the end.
 */
SYM_FUNC_START(__pi_memcmp)
	/* limit -= 8; a borrow means fewer than 8 bytes in total.  */
	subs	limit, limit, 8
	b.lo	L(less8)

	/* Bytes 0-7.  */
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	b.ne	L(return)

	/*
	 * limit = length - 16.  Positive means more than 16 bytes.
	 * NOTE(review): this is a signed test, so a length with bit 63 set
	 * after the subtraction would take the short path below.
	 */
	subs	limit, limit, 8
	b.gt	L(more16)

	/*
	 * 8-16 bytes: the pointers are already 8 past the start and limit is
	 * in [-8..0], so this reloads the final 8 bytes of each buffer.
	 * L(return) performs the comparison.
	 */
	ldr	data1, [src1, limit]
	ldr	data2, [src2, limit]
	b	L(return)

L(more16):
	/* Bytes 8-15.  */
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	b.ne	L(return)

	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
	   strings.  */
	subs	limit, limit, 16
	b.ls	L(last_bytes)

	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
	   try to align, so limit it only to strings larger than 128 bytes.  */
	cmp	limit, 96
	b.ls	L(loop16)

	/*
	 * Align src1 and adjust src2 with bytes not yet done.  Both pointers
	 * move back by the same amount (src1 & 15) and limit grows by it, so
	 * up to 15 bytes that already compared equal are compared again.
	 */
	and	tmp1, src1, 15
	add	limit, limit, tmp1
	sub	src1, src1, tmp1
	sub	src2, src2, tmp1

	/* Loop performing 16 bytes per iteration using aligned src1.
	   Limit is pre-decremented by 16 and must be larger than zero.
	   Exit if <= 16 bytes left to do or if the data is not equal.  */
	.p2align 4
L(loop16):
	ldp	data1, data1h, [src1], 16
	ldp	data2, data2h, [src2], 16
	subs	limit, limit, 16
	/*
	 * Chained conditional compares: the low words are compared only while
	 * limit is still above zero, the high words only if the low words
	 * matched.  When a condition fails NZCV is forced to 0 (reads as
	 * "ne"), which ends the loop.
	 */
	ccmp	data1, data2, 0, hi
	ccmp	data1h, data2h, 0, eq
	b.eq	L(loop16)

	/*
	 * The loop ends on a mismatch or on running out of length, so check
	 * the pair that was just loaded before moving on to the tail.
	 */
	cmp	data1, data2
	b.ne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2
	b.ne	L(return)

	/*
	 * Compare last 1-16 bytes using unaligned access.  limit is <= 0
	 * here, so adding it steps the pointers back to 16 bytes before the
	 * end of each buffer.
	 */
L(last_bytes):
	add	src1, src1, limit
	add	src2, src2, limit
	ldp	data1, data1h, [src1]
	ldp	data2, data2h, [src2]
	cmp	data1, data2
	b.ne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/*
	 * Little-endian: byte-reverse so that the lowest-addressed byte is
	 * the most significant one and the unsigned compare below is decided
	 * by the first differing byte in memory.
	 */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
L(ret_eq):
	/* result = (data1 != data2), negated when data1 is lower (unsigned).  */
	cset	result, ne
	cneg	result, result, lo
	ret

	.p2align 4
	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
L(less8):
	/* limit += 4; no carry out means fewer than 4 bytes in total.  */
	adds	limit, limit, 4
	b.lo	L(less4)
	/*
	 * 4-7 bytes: compare the first word.  The 32-bit loads zero the upper
	 * halves of data1/data2, so the 64-bit compare in L(return) is valid.
	 */
	ldr	data1w, [src1], 4
	ldr	data2w, [src2], 4
	cmp	data1w, data2w
	b.ne	L(return)
	sub	limit, limit, 4
L(less4):
	/* After this limit is the number of bytes left (0-3).  */
	adds	limit, limit, 4
	/* Nothing left: Z is set, so L(ret_eq) produces 0.  */
	b.eq	L(ret_eq)
L(byte_loop):
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	subs	limit, limit, 1
	/* Compare only while bytes remain; otherwise force "ne" to exit.  */
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
	/* Difference of the last byte pair loaded (0 if they were equal).  */
	sub	result, data1w, data2w
	ret
SYM_FUNC_END(__pi_memcmp)
SYM_FUNC_ALIAS_WEAK(memcmp, __pi_memcmp)
EXPORT_SYMBOL_NOKASAN(memcmp)