1b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */ 21da177e4SLinus Torvalds/* 31da177e4SLinus Torvalds * 41da177e4SLinus Torvalds * Optimized version of the standard strlen() function 51da177e4SLinus Torvalds * 61da177e4SLinus Torvalds * 71da177e4SLinus Torvalds * Inputs: 81da177e4SLinus Torvalds * in0 address of string 91da177e4SLinus Torvalds * 101da177e4SLinus Torvalds * Outputs: 111da177e4SLinus Torvalds * ret0 the number of characters in the string (0 if empty string) 121da177e4SLinus Torvalds * does not count the \0 131da177e4SLinus Torvalds * 141da177e4SLinus Torvalds * Copyright (C) 1999, 2001 Hewlett-Packard Co 151da177e4SLinus Torvalds * Stephane Eranian <eranian@hpl.hp.com> 161da177e4SLinus Torvalds * 171da177e4SLinus Torvalds * 09/24/99 S.Eranian add speculation recovery code 181da177e4SLinus Torvalds */ 191da177e4SLinus Torvalds 20*ab03e604SMasahiro Yamada#include <linux/export.h> 211da177e4SLinus Torvalds#include <asm/asmmacro.h> 221da177e4SLinus Torvalds 231da177e4SLinus Torvalds// 241da177e4SLinus Torvalds// 251da177e4SLinus Torvalds// This is an enhanced version of the basic strlen. it includes a combination 261da177e4SLinus Torvalds// of compute zero index (czx), parallel comparisons, speculative loads and 271da177e4SLinus Torvalds// loop unroll using rotating registers. 281da177e4SLinus Torvalds// 291da177e4SLinus Torvalds// General Ideas about the algorithm: 301da177e4SLinus Torvalds// The goal is to look at the string in chunks of 8 bytes. 311da177e4SLinus Torvalds// so we need to do a few extra checks at the beginning because the 321da177e4SLinus Torvalds// string may not be 8-byte aligned. In this case we load the 8byte 331da177e4SLinus Torvalds// quantity which includes the start of the string and mask the unused 341da177e4SLinus Torvalds// bytes with 0xff to avoid confusing czx. 351da177e4SLinus Torvalds// We use speculative loads and software pipelining to hide memory 361da177e4SLinus Torvalds// latency and do read ahead safely. This way we defer any exception. 371da177e4SLinus Torvalds// 381da177e4SLinus Torvalds// Because we don't want the kernel to be relying on particular 391da177e4SLinus Torvalds// settings of the DCR register, we provide recovery code in case 401da177e4SLinus Torvalds// speculation fails. The recovery code is going to "redo" the work using 411da177e4SLinus Torvalds// only normal loads. If we still get a fault then we generate a 421da177e4SLinus Torvalds// kernel panic. Otherwise we return the strlen as usual. 431da177e4SLinus Torvalds// 441da177e4SLinus Torvalds// The fact that speculation may fail can be caused, for instance, by 451da177e4SLinus Torvalds// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., 461da177e4SLinus Torvalds// a NaT bit will be set if the translation is not present. The normal 471da177e4SLinus Torvalds// load, on the other hand, will cause the translation to be inserted 481da177e4SLinus Torvalds// if the mapping exists. 491da177e4SLinus Torvalds// 501da177e4SLinus Torvalds// It should be noted that we execute recovery code only when we need 511da177e4SLinus Torvalds// to use the data that has been speculatively loaded: we don't execute 521da177e4SLinus Torvalds// recovery code on pure read ahead data. 531da177e4SLinus Torvalds// 541da177e4SLinus Torvalds// Remarks: 551da177e4SLinus Torvalds// - the cmp r0,r0 is used as a fast way to initialize a predicate 561da177e4SLinus Torvalds// register to 1. This is required to make sure that we get the parallel 571da177e4SLinus Torvalds// compare correct. 581da177e4SLinus Torvalds// 591da177e4SLinus Torvalds// - we don't use the epilogue counter to exit the loop but we need to set 601da177e4SLinus Torvalds// it to zero beforehand. 611da177e4SLinus Torvalds// 621da177e4SLinus Torvalds// - after the loop we must test for Nat values because neither the 631da177e4SLinus Torvalds// czx nor cmp instruction raise a NaT consumption fault. We must be 641da177e4SLinus Torvalds// careful not to look too far for a Nat for which we don't care. 651da177e4SLinus Torvalds// For instance we don't need to look at a NaT in val2 if the zero byte 661da177e4SLinus Torvalds// was in val1. 671da177e4SLinus Torvalds// 681da177e4SLinus Torvalds// - Clearly performance tuning is required. 691da177e4SLinus Torvalds// 701da177e4SLinus Torvalds// 711da177e4SLinus Torvalds// 721da177e4SLinus Torvalds#define saved_pfs r11 731da177e4SLinus Torvalds#define tmp r10 741da177e4SLinus Torvalds#define base r16 751da177e4SLinus Torvalds#define orig r17 761da177e4SLinus Torvalds#define saved_pr r18 771da177e4SLinus Torvalds#define src r19 781da177e4SLinus Torvalds#define mask r20 791da177e4SLinus Torvalds#define val r21 801da177e4SLinus Torvalds#define val1 r22 811da177e4SLinus Torvalds#define val2 r23 821da177e4SLinus Torvalds 831da177e4SLinus TorvaldsGLOBAL_ENTRY(strlen) 841da177e4SLinus Torvalds .prologue 851da177e4SLinus Torvalds .save ar.pfs, saved_pfs 861da177e4SLinus Torvalds alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8 871da177e4SLinus Torvalds 881da177e4SLinus Torvalds .rotr v[2], w[2] // declares our 4 aliases 891da177e4SLinus Torvalds 901da177e4SLinus Torvalds extr.u tmp=in0,0,3 // tmp=least significant 3 bits 911da177e4SLinus Torvalds mov orig=in0 // keep trackof initial byte address 921da177e4SLinus Torvalds dep src=0,in0,0,3 // src=8byte-aligned in0 address 931da177e4SLinus Torvalds .save pr, saved_pr 941da177e4SLinus Torvalds mov saved_pr=pr // preserve predicates (rotation) 951da177e4SLinus Torvalds ;; 961da177e4SLinus Torvalds 971da177e4SLinus Torvalds .body 981da177e4SLinus Torvalds 991da177e4SLinus Torvalds ld8 v[1]=[src],8 // must not speculate: can fail here 1001da177e4SLinus Torvalds shl tmp=tmp,3 // multiply by 8bits/byte 1011da177e4SLinus Torvalds mov mask=-1 // our mask 1021da177e4SLinus Torvalds ;; 1031da177e4SLinus Torvalds ld8.s w[1]=[src],8 // speculatively load next 1041da177e4SLinus Torvalds cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and 1051da177e4SLinus Torvalds sub tmp=64,tmp // how many bits to shift our mask on the right 1061da177e4SLinus Torvalds ;; 1071da177e4SLinus Torvalds shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part 1081da177e4SLinus Torvalds mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) 1091da177e4SLinus Torvalds ;; 1101da177e4SLinus Torvalds add base=-16,src // keep track of aligned base 1111da177e4SLinus Torvalds or v[1]=v[1],mask // now we have a safe initial byte pattern 1121da177e4SLinus Torvalds ;; 1131da177e4SLinus Torvalds1: 1141da177e4SLinus Torvalds ld8.s v[0]=[src],8 // speculatively load next 1151da177e4SLinus Torvalds czx1.r val1=v[1] // search 0 byte from right 1161da177e4SLinus Torvalds czx1.r val2=w[1] // search 0 byte from right following 8bytes 1171da177e4SLinus Torvalds ;; 1181da177e4SLinus Torvalds ld8.s w[0]=[src],8 // speculatively load next to next 1191da177e4SLinus Torvalds cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 1201da177e4SLinus Torvalds cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 1211da177e4SLinus Torvalds(p6) br.wtop.dptk 1b // loop until p6 == 0 1221da177e4SLinus Torvalds ;; 1231da177e4SLinus Torvalds // 1241da177e4SLinus Torvalds // We must return try the recovery code iff 1251da177e4SLinus Torvalds // val1_is_nat || (val1==8 && val2_is_nat) 1261da177e4SLinus Torvalds // 1271da177e4SLinus Torvalds // XXX Fixme 1281da177e4SLinus Torvalds // - there must be a better way of doing the test 1291da177e4SLinus Torvalds // 1301da177e4SLinus Torvalds cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) 1311da177e4SLinus Torvalds tnat.nz p6,p7=val1 // test NaT on val1 1321da177e4SLinus Torvalds(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT 1331da177e4SLinus Torvalds ;; 1341da177e4SLinus Torvalds // 1351da177e4SLinus Torvalds // if we come here p7 is true, i.e., initialized for // cmp 1361da177e4SLinus Torvalds // 1371da177e4SLinus Torvalds cmp.eq.and p7,p0=8,val1// val1==8? 1381da177e4SLinus Torvalds tnat.nz.and p7,p0=val2 // test NaT if val2 1391da177e4SLinus Torvalds(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT 1401da177e4SLinus Torvalds ;; 1411da177e4SLinus Torvalds(p8) mov val1=val2 // the other test got us out of the loop 1421da177e4SLinus Torvalds(p8) adds src=-16,src // correct position when 3 ahead 1431da177e4SLinus Torvalds(p9) adds src=-24,src // correct position when 4 ahead 1441da177e4SLinus Torvalds ;; 1451da177e4SLinus Torvalds sub ret0=src,orig // distance from base 1461da177e4SLinus Torvalds sub tmp=8,val1 // which byte in word 1471da177e4SLinus Torvalds mov pr=saved_pr,0xffffffffffff0000 1481da177e4SLinus Torvalds ;; 1491da177e4SLinus Torvalds sub ret0=ret0,tmp // adjust 1501da177e4SLinus Torvalds mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what 1511da177e4SLinus Torvalds br.ret.sptk.many rp // end of normal execution 1521da177e4SLinus Torvalds 1531da177e4SLinus Torvalds // 1541da177e4SLinus Torvalds // Outlined recovery code when speculation failed 1551da177e4SLinus Torvalds // 1561da177e4SLinus Torvalds // This time we don't use speculation and rely on the normal exception 1571da177e4SLinus Torvalds // mechanism. that's why the loop is not as good as the previous one 1581da177e4SLinus Torvalds // because read ahead is not possible 1591da177e4SLinus Torvalds // 1601da177e4SLinus Torvalds // IMPORTANT: 1611da177e4SLinus Torvalds // Please note that in the case of strlen() as opposed to strlen_user() 1621da177e4SLinus Torvalds // we don't use the exception mechanism, as this function is not 1631da177e4SLinus Torvalds // supposed to fail. If that happens it means we have a bug and the 1641da177e4SLinus Torvalds // code will cause of kernel fault. 1651da177e4SLinus Torvalds // 1661da177e4SLinus Torvalds // XXX Fixme 1671da177e4SLinus Torvalds // - today we restart from the beginning of the string instead 1681da177e4SLinus Torvalds // of trying to continue where we left off. 1691da177e4SLinus Torvalds // 1701da177e4SLinus Torvalds.recover: 1711da177e4SLinus Torvalds ld8 val=[base],8 // will fail if unrecoverable fault 1721da177e4SLinus Torvalds ;; 1731da177e4SLinus Torvalds or val=val,mask // remask first bytes 1741da177e4SLinus Torvalds cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop 1751da177e4SLinus Torvalds ;; 1761da177e4SLinus Torvalds // 1771da177e4SLinus Torvalds // ar.ec is still zero here 1781da177e4SLinus Torvalds // 1791da177e4SLinus Torvalds2: 1801da177e4SLinus Torvalds(p6) ld8 val=[base],8 // will fail if unrecoverable fault 1811da177e4SLinus Torvalds ;; 1821da177e4SLinus Torvalds czx1.r val1=val // search 0 byte from right 1831da177e4SLinus Torvalds ;; 1841da177e4SLinus Torvalds cmp.eq p6,p0=8,val1 // val1==8 ? 1851da177e4SLinus Torvalds(p6) br.wtop.dptk 2b // loop until p6 == 0 1861da177e4SLinus Torvalds ;; // (avoid WAW on p63) 1871da177e4SLinus Torvalds sub ret0=base,orig // distance from base 1881da177e4SLinus Torvalds sub tmp=8,val1 1891da177e4SLinus Torvalds mov pr=saved_pr,0xffffffffffff0000 1901da177e4SLinus Torvalds ;; 1911da177e4SLinus Torvalds sub ret0=ret0,tmp // length=now - back -1 1921da177e4SLinus Torvalds mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what 1931da177e4SLinus Torvalds br.ret.sptk.many rp // end of successful recovery code 1941da177e4SLinus TorvaldsEND(strlen) 195e007c533SAl ViroEXPORT_SYMBOL(strlen) 196