/*
 * arch/alpha/lib/ev67-strchr.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Return the address of a given character within a null-terminated
 * string, or null if it is not found.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 *
 * Register use inside strchr:
 *	a0	- string address on entry (only read)
 *	a1	- character on entry; rebuilt into 8 copies of its low byte
 *	v0	- quad-aligned scan pointer, finally the return value
 *	t0	- quadword under test, then the combined hit mask
 *	t1	- quad XOR pattern (bytes equal to c become zero); scratch
 *	t2	- cmpbge mask: one bit per NUL byte
 *	t3	- low byte of c at first, then one bit per byte equal to c
 *	t4	- mask of first-quad bytes that lie before the string
 *	t5, a2, a3 - scratch for the pattern; a2 later holds the cttz index
 *
 * The instructions are laid out in groups of four; keep the order as is.
 * bis/bic are the architectural mnemonics behind the or/andnot aliases.
 */
#include <asm/export.h>
#include <asm/regdef.h>

	.set noreorder
	.set noat

	.align 4
	.globl strchr
	.ent strchr
strchr:
	.frame sp, 0, ra
	.prologue 0

	ldq_u	t0, 0(a0)	# L : fetch the quad holding s[0], Latency=3
	and	a1, 0xff, t3	# E : t3 = 00000000000000ch
	insbl	a1, 1, t5	# U : t5 = 000000000000ch00
	insbl	a1, 7, a2	# U : a2 = ch00000000000000

	insbl	t3, 6, a3	# U : a3 = 00ch000000000000
	bis	t5, t3, a1	# E : a1 = 000000000000chch
	bic	a0, 7, v0	# E : v0 = s rounded down to a quad boundary
	lda	t4, -1		# E : all ones, seed for the garbage mask

	mskqh	t4, a0, t4	# U : keep only the part of the quad from s on
	bis	a2, a3, a2	# E : a2 = chch000000000000
	inswl	a1, 2, t5	# E : t5 = 00000000chch0000
	inswl	a1, 4, a3	# E : a3 = 0000chch00000000

	bis	a1, a2, a1	# E : a1 = chch00000000chch
	bis	a3, t5, t5	# E : t5 = 0000chchchch0000
	cmpbge	zero, t0, t2	# E : t2 = one bit per NUL byte
	cmpbge	zero, t4, t4	# E : t4 = one bit per garbage byte

	/*
	 * Every instruction of this group needs the result of the one
	 * before it, so it is fully serialized and stalls a lot.
	 */
	bis	t5, a1, a1	# E : a1 = chchchchchchchch
	xor	t0, a1, t1	# E : bytes equal to c turn into zero
	cmpbge	zero, t1, t3	# E : t3 = one bit per byte equal to c
	bis	t2, t3, t0	# E : t0 = NUL bits | match bits

	bic	t0, t4, t0	# E : discard hits in the garbage bytes
	cttz	t0, a2		# U0 : speculative index of the first hit
	nop			# E :
	bne	t0, $hit	# U : first quad already contains a hit

	/*
	 * Main scan, one aligned quadword per iteration.
	 *
	 * Each pass stalls heavily on the load.  Little can be done about
	 * that short of unrolling several times - open question from the
	 * original author: is that safe to do in kernel space, or would
	 * exception handling recovery code be needed to make it so?
	 */
$scan:	ldq	t0, 8(v0)	# L : next quad, Latency=3
	addq	v0, 8, v0	# E : v0 now addresses the quad just loaded
	xor	t0, a1, t1	# E : bytes equal to c turn into zero
	cmpbge	zero, t0, t2	# E : t2 = one bit per NUL byte

	cmpbge	zero, t1, t3	# E : t3 = one bit per byte equal to c
	bis	t2, t3, t0	# E : t0 = NUL bits | match bits
	cttz	t3, a2		# U0 : speculative index of the first match
	beq	t0, $scan	# U : neither c nor NUL here, keep going

	/*
	 * t0 has at least one bit set.  Its lowest bit marks the first
	 * byte that is either c or the terminator; t3 tells which.
	 */
$hit:	negq	t0, t1		# E : t0 & -t0 keeps only the lowest set bit
	and	t0, t1, t0	# E :
	and	t0, t3, t1	# E : nonzero iff that first hit is c
	addq	v0, a2, v0	# E : quad address + byte index from cttz

	cmoveq	t1, zero, v0	# E : first hit is not c: return null
				#     (two mapping slots, latency = 2)
	nop			# presumably padding for the last group
	nop
	ret			# L0 :

	.end strchr
	EXPORT_SYMBOL(strchr)