xref: /openbmc/linux/arch/alpha/lib/ev6-clear_user.S (revision f3c78e94)
1b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
21da177e4SLinus Torvalds/*
31da177e4SLinus Torvalds * arch/alpha/lib/ev6-clear_user.S
41da177e4SLinus Torvalds * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
51da177e4SLinus Torvalds *
61da177e4SLinus Torvalds * Zero user space, handling exceptions as we go.
71da177e4SLinus Torvalds *
81da177e4SLinus Torvalds * We have to make sure that $0 is always up-to-date and contains the
91da177e4SLinus Torvalds * right "bytes left to zero" value (and that it is updated only _after_
101da177e4SLinus Torvalds * a successful store).  There is also some rather minor exception setup
111da177e4SLinus Torvalds * stuff.
121da177e4SLinus Torvalds *
131da177e4SLinus Torvalds * Much of the information about 21264 scheduling/coding comes from:
141da177e4SLinus Torvalds *	Compiler Writer's Guide for the Alpha 21264
151da177e4SLinus Torvalds *	abbreviated as 'CWG' in other comments here
161da177e4SLinus Torvalds *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
171da177e4SLinus Torvalds * Scheduling notation:
181da177e4SLinus Torvalds *	E	- either cluster
191da177e4SLinus Torvalds *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
201da177e4SLinus Torvalds *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
211da177e4SLinus Torvalds * Try not to change the actual algorithm if possible for consistency.
221da177e4SLinus Torvalds * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
231da177e4SLinus Torvalds * From perusing the source code context where this routine is called, it is
241da177e4SLinus Torvalds * a fair assumption that significant fractions of entire pages are zeroed, so
251da177e4SLinus Torvalds * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
261da177e4SLinus Torvalds * ASSUMPTION:
271da177e4SLinus Torvalds *	The believed purpose of only updating $0 after a store is that a signal
281da177e4SLinus Torvalds *	may come along during the execution of this chunk of code, and we don't
291da177e4SLinus Torvalds *	want to leave a hole (and we also want to avoid repeating lots of work)
301da177e4SLinus Torvalds */
311da177e4SLinus Torvalds
32*f3c78e94SMasahiro Yamada#include <linux/export.h>
331da177e4SLinus Torvalds/*
 * EX(insn) - emit one instruction that is allowed to fault.
 *
 * The instruction is assembled in place under the local label 99.  The
 * macro then switches to the __ex_table section and emits one table
 * entry made of two 32-bit words:
 *   - ".long 99b - ."  : the offset from the entry back to the instruction
 *   - an "lda $31, ..." : an instruction word whose displacement field is
 *                         ($exception - 99b), i.e. the distance from the
 *                         instruction to this file's $exception label
 * and finally returns to the previous section with .previous.
 *
 * The lda is never executed from the table; it is only a container for
 * the displacement.  Presumably the kernel's fault handler looks the
 * faulting PC up in __ex_table and resumes at insn + displacement --
 * confirm against the arch/alpha exception table code.
 *
 * Allow an exception for an insn; exit if we get one.
 */
341da177e4SLinus Torvalds#define EX(x,y...)			\
351da177e4SLinus Torvalds	99: x,##y;			\
361da177e4SLinus Torvalds	.section __ex_table,"a";	\
371da177e4SLinus Torvalds	.long 99b - .;			\
381da177e4SLinus Torvalds	lda $31, $exception-99b($31); 	\
391da177e4SLinus Torvalds	.previous
401da177e4SLinus Torvalds
411da177e4SLinus Torvalds	.set noat
421da177e4SLinus Torvalds	.set noreorder
431da177e4SLinus Torvalds	.align 4
441da177e4SLinus Torvalds
/*
 * __clear_user - zero a byte range, tolerating a fault on any access.
 *
 * In:	$16 = destination address (any alignment)
 *	$17 = number of bytes to zero
 * Out:	$0  = number of bytes still left to zero (0 if all were cleared)
 * Uses:	$1-$5 as scratch; $16 is advanced as the clear progresses.
 *	Returns through $26; no stack frame is set up.
 *
 * Every access to the destination is wrapped in EX(), which records
 * $exception as the recovery point for that instruction.  $exception is
 * simply the return, so a fault hands back whatever $0 holds at that
 * moment.  $0 is therefore never decremented ahead of the stores it
 * accounts for.
 *
 * Phases:
 *   1. misaligned head	- ldq_u / mskql / stq_u read-modify-write
 *   2. $alignmod64	- single quads until the destination is 0mod64
 *   3. $do_wh64	- 64 bytes (8 quads) per trip, with wh64 hints
 *   4. $onequad	- remaining whole quads
 *   5. $onebyte	- remaining 1..7 bytes, one stb each
 */
4585250231SAl Viro	.globl __clear_user
4685250231SAl Viro	.ent __clear_user
4785250231SAl Viro	.frame	$30, 0, $26
481da177e4SLinus Torvalds	.prologue 0
491da177e4SLinus Torvalds
501da177e4SLinus Torvalds				# Pipeline info : Slotting & Comments
5185250231SAl Viro__clear_user:
/* $0 = $17: the byte count becomes the running "bytes left" result. */
5285250231SAl Viro	and	$17, $17, $0
5385250231SAl Viro	and	$16, 7, $4	# .. E  .. ..	: find dest head misalignment
541da177e4SLinus Torvalds	beq	$0, $zerolength # U  .. .. ..	:  U L U L
551da177e4SLinus Torvalds
/*
 * $1 = (length + head misalignment) / 8.  If the head is misaligned this
 * count includes the partial head quad, which is taken off again below.
 */
561da177e4SLinus Torvalds	addq	$0, $4, $1	# .. .. .. E	: bias counter
571da177e4SLinus Torvalds	and	$1, 7, $2	# .. .. E  ..	: number of misaligned bytes in tail
581da177e4SLinus Torvalds# Note - we never actually use $2, so this is a moot computation
591da177e4SLinus Torvalds# and we can rewrite this later...
601da177e4SLinus Torvalds	srl	$1, 3, $1	# .. E  .. ..	: number of quadwords to clear
611da177e4SLinus Torvalds	beq	$4, $headalign	# U  .. .. ..	: U L U L
621da177e4SLinus Torvalds
631da177e4SLinus Torvalds/*
641da177e4SLinus Torvalds * Head is not aligned.  Write (8 - $4) bytes to head of destination
6585250231SAl Viro * This means $16 is known to be misaligned
 *
 * If the whole request ends inside this first quad ($1 == 0) it is
 * handed to the byte loop with $16 and $0 untouched.  Otherwise mskql
 * zeroes the bytes of the loaded quad from the misaligned offset upward
 * and the result is written back, and $0 is reduced by the (8 - $4)
 * bytes just cleared.
 *
 * The final bic rounds $16 down to the next quad boundary.  The stq_u
 * stores that follow ignore the low three address bits and would not
 * care, but the stb tail loop uses the exact address, so without the
 * bic a misaligned start with a partial tail clears the wrong bytes.
661da177e4SLinus Torvalds */
6785250231SAl Viro	EX( ldq_u $5, 0($16) )	# .. .. .. L	: load dst word to mask back in
681da177e4SLinus Torvalds	beq	$1, $onebyte	# .. .. U  ..	: sub-word store?
6985250231SAl Viro	mskql	$5, $16, $5	# .. U  .. ..	: take care of misaligned head
7085250231SAl Viro	addq	$16, 8, $16	# E  .. .. .. 	: L U U L
711da177e4SLinus Torvalds
7285250231SAl Viro	EX( stq_u $5, -8($16) )	# .. .. .. L	:
731da177e4SLinus Torvalds	subq	$1, 1, $1	# .. .. E  ..	:
741da177e4SLinus Torvalds	addq	$0, $4, $0	# .. E  .. ..	: bytes left -= 8 - misalignment
751da177e4SLinus Torvalds	subq	$0, 8, $0	# E  .. .. ..	: U L U L
	bic	$16, 7, $16	# E :		: drop the misalignment, $16 is now 0mod8
761da177e4SLinus Torvalds
771da177e4SLinus Torvalds	.align	4
781da177e4SLinus Torvalds/*
791da177e4SLinus Torvalds * (The .align directive ought to be a moot point)
 * (16 instructions precede it on the head path, so it adds no padding)
801da177e4SLinus Torvalds * values upon initial entry to the loop
811da177e4SLinus Torvalds * $1 is number of quadwords to clear (zero is a valid value)
821da177e4SLinus Torvalds * $2 is number of trailing bytes (0..7) ($2 never used...)
8385250231SAl Viro * $16 is known to be aligned 0mod8
 *
 * $2 is reloaded here with the offset of $16 inside its 64-byte block
 * and $3 with that offset minus 64, a negative count of the bytes
 * needed to reach the next 64-byte boundary.
841da177e4SLinus Torvalds */
851da177e4SLinus Torvalds$headalign:
861da177e4SLinus Torvalds	subq	$1, 16, $4	# .. .. .. E	: If < 16, we can not use the huge loop
8785250231SAl Viro	and	$16, 0x3f, $2	# .. .. E  ..	: Forward work for huge loop
881da177e4SLinus Torvalds	subq	$2, 0x40, $3	# .. E  .. ..	: bias counter (huge loop)
891da177e4SLinus Torvalds	blt	$4, $trailquad	# U  .. .. ..	: U L U L
901da177e4SLinus Torvalds
911da177e4SLinus Torvalds/*
921da177e4SLinus Torvalds * We know that we're going to do at least 16 quads, which means we are
931da177e4SLinus Torvalds * going to be able to use the large block clear loop at least once.
941da177e4SLinus Torvalds * Figure out how many quads we need to clear before we are 0mod64 aligned
951da177e4SLinus Torvalds * so we can use the wh64 instruction.
 *
 * The already-aligned test is made on $2 (the offset itself).  $3 is
 * that offset minus 64 and so can never be zero; testing it would send
 * an aligned destination through eight single-quad stores first.
961da177e4SLinus Torvalds */
971da177e4SLinus Torvalds
981da177e4SLinus Torvalds	nop			# .. .. .. E
991da177e4SLinus Torvalds	nop			# .. .. E  ..
1001da177e4SLinus Torvalds	nop			# .. E  .. ..
1011da177e4SLinus Torvalds	beq	$2, $bigalign	# U  .. .. ..	: U L U L : Aligned 0mod64
1021da177e4SLinus Torvalds
/* One quad per trip; $3 climbs by 8 from its negative bias up to zero. */
1031da177e4SLinus Torvalds$alignmod64:
10485250231SAl Viro	EX( stq_u $31, 0($16) )	# .. .. .. L
1051da177e4SLinus Torvalds	addq	$3, 8, $3	# .. .. E  ..
1061da177e4SLinus Torvalds	subq	$0, 8, $0	# .. E  .. ..
1071da177e4SLinus Torvalds	nop			# E  .. .. ..	: U L U L
1081da177e4SLinus Torvalds
1091da177e4SLinus Torvalds	nop			# .. .. .. E
1101da177e4SLinus Torvalds	subq	$1, 1, $1	# .. .. E  ..
11185250231SAl Viro	addq	$16, 8, $16	# .. E  .. ..
1121da177e4SLinus Torvalds	blt	$3, $alignmod64	# U  .. .. ..	: U L U L
1131da177e4SLinus Torvalds
1141da177e4SLinus Torvalds$bigalign:
1151da177e4SLinus Torvalds/*
1161da177e4SLinus Torvalds * $0 is the number of bytes left
1171da177e4SLinus Torvalds * $1 is the number of quads left
11885250231SAl Viro * $16 is aligned 0mod64
1191da177e4SLinus Torvalds * we know that we'll be taking a minimum of one trip through the loop
1201da177e4SLinus Torvalds * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
1211da177e4SLinus Torvalds * We are _not_ going to update $0 after every single store.  That
1221da177e4SLinus Torvalds * would be silly, because there will be cross-cluster dependencies
1231da177e4SLinus Torvalds * no matter how the code is scheduled.  By doing it in slightly
1241da177e4SLinus Torvalds * staggered fashion, we can still do this loop in 5 fetches
1251da177e4SLinus Torvalds * The worst case will be doing two extra quads in some future execution,
1261da177e4SLinus Torvalds * in the event of an interrupted clear.
1271da177e4SLinus Torvalds * Assumes the wh64 needs to be for 2 trips through the loop in the future
1281da177e4SLinus Torvalds * The wh64 is issued for the starting destination address for trip +2
1291da177e4SLinus Torvalds * through the loop, and if there are less than two trips left, the target
1301da177e4SLinus Torvalds * address will be for the current trip.
1311da177e4SLinus Torvalds */
1321da177e4SLinus Torvalds	nop			# E :
1331da177e4SLinus Torvalds	nop			# E :
1341da177e4SLinus Torvalds	nop			# E :
13585250231SAl Viro	bis	$16,$16,$3	# E : U L U L : Initial wh64 address is dest
1361da177e4SLinus Torvalds	/* This might actually help for the current trip... */
1371da177e4SLinus Torvalds
/*
 * One trip stores 8 quads (64 bytes): $1 drops by 8 and $0 by 64 in
 * total, in steps of 8, 16, 16, 16 and 8 that never run ahead of the
 * stores.  $4 = (quads left at the top of the trip) - 16 decides whether
 * another full trip follows.
 *
 * NOTE(review): when the cmovlt fires, $3 holds this trip's base address,
 * and the next trip's wh64 names that block after it has been written.
 * Confirm wh64 on an already-written block is harmless on the 21264.
 */
1381da177e4SLinus Torvalds$do_wh64:
1391da177e4SLinus Torvalds	wh64	($3)		# .. .. .. L1	: memory subsystem hint
1401da177e4SLinus Torvalds	subq	$1, 16, $4	# .. .. E  ..	: Forward calculation - repeat the loop?
14185250231SAl Viro	EX( stq_u $31, 0($16) )	# .. L  .. ..
1421da177e4SLinus Torvalds	subq	$0, 8, $0	# E  .. .. ..	: U L U L
1431da177e4SLinus Torvalds
14485250231SAl Viro	addq	$16, 128, $3	# E : Target address of wh64
14585250231SAl Viro	EX( stq_u $31, 8($16) )	# L :
14685250231SAl Viro	EX( stq_u $31, 16($16) )	# L :
1471da177e4SLinus Torvalds	subq	$0, 16, $0	# E : U L L U
1481da177e4SLinus Torvalds
1491da177e4SLinus Torvalds	nop			# E :
15085250231SAl Viro	EX( stq_u $31, 24($16) )	# L :
15185250231SAl Viro	EX( stq_u $31, 32($16) )	# L :
1521da177e4SLinus Torvalds	subq	$0, 168, $5	# E : U L L U : two trips through the loop left?
1531da177e4SLinus Torvalds	/* 168 = 192 - 24, since we've already completed some stores */
1541da177e4SLinus Torvalds
1551da177e4SLinus Torvalds	subq	$0, 16, $0	# E :
15685250231SAl Viro	EX( stq_u $31, 40($16) )	# L :
15785250231SAl Viro	EX( stq_u $31, 48($16) )	# L :
15885250231SAl Viro	cmovlt	$5, $16, $3	# E : U L L U : Latency 2, extra mapping cycle
1591da177e4SLinus Torvalds
1601da177e4SLinus Torvalds	subq	$1, 8, $1	# E :
1611da177e4SLinus Torvalds	subq	$0, 16, $0	# E :
16285250231SAl Viro	EX( stq_u $31, 56($16) )	# L :
1631da177e4SLinus Torvalds	nop			# E : U L U L
1641da177e4SLinus Torvalds
1651da177e4SLinus Torvalds	nop			# E :
1661da177e4SLinus Torvalds	subq	$0, 8, $0	# E :
16785250231SAl Viro	addq	$16, 64, $16	# E :
1681da177e4SLinus Torvalds	bge	$4, $do_wh64	# U : U L U L
1691da177e4SLinus Torvalds
1701da177e4SLinus Torvalds$trailquad:
1711da177e4SLinus Torvalds	# zero to 16 quadwords left to store, plus any trailing bytes
1721da177e4SLinus Torvalds	# $1 is the number of quadwords left to go.
1731da177e4SLinus Torvalds	#
1741da177e4SLinus Torvalds	nop			# .. .. .. E
1751da177e4SLinus Torvalds	nop			# .. .. E  ..
1761da177e4SLinus Torvalds	nop			# .. E  .. ..
1771da177e4SLinus Torvalds	beq	$1, $trailbytes	# U  .. .. ..	: U L U L : Only 0..7 bytes to go
1781da177e4SLinus Torvalds
/* One quad per trip until $1 reaches zero. */
1791da177e4SLinus Torvalds$onequad:
18085250231SAl Viro	EX( stq_u $31, 0($16) )	# .. .. .. L
1811da177e4SLinus Torvalds	subq	$1, 1, $1	# .. .. E  ..
1821da177e4SLinus Torvalds	subq	$0, 8, $0	# .. E  .. ..
1831da177e4SLinus Torvalds	nop			# E  .. .. ..	: U L U L
1841da177e4SLinus Torvalds
1851da177e4SLinus Torvalds	nop			# .. .. .. E
1861da177e4SLinus Torvalds	nop			# .. .. E  ..
18785250231SAl Viro	addq	$16, 8, $16	# .. E  .. ..
1881da177e4SLinus Torvalds	bgt	$1, $onequad	# U  .. .. ..	: U L U L
1891da177e4SLinus Torvalds
1901da177e4SLinus Torvalds	# We have an unknown number of bytes left to go.
1911da177e4SLinus Torvalds$trailbytes:
1921da177e4SLinus Torvalds	nop			# .. .. .. E
1931da177e4SLinus Torvalds	nop			# .. .. E  ..
1941da177e4SLinus Torvalds	nop			# .. E  .. ..
1951da177e4SLinus Torvalds	beq	$0, $zerolength	# U  .. .. ..	: U L U L
1961da177e4SLinus Torvalds
1971da177e4SLinus Torvalds	# $0 contains the number of bytes left to clear (1..7 on every
1981da177e4SLinus Torvalds	# path that reaches here), so we will use $0 as the loop counter
1991da177e4SLinus Torvalds	# We know for a fact that $0 > 0 due to previous context
	# stb uses the exact byte address, so $16 must be byte-accurate here
2001da177e4SLinus Torvalds$onebyte:
20185250231SAl Viro	EX( stb $31, 0($16) )	# .. .. .. L
2021da177e4SLinus Torvalds	subq	$0, 1, $0	# .. .. E  ..	:
20385250231SAl Viro	addq	$16, 1, $16	# .. E  .. ..	:
2041da177e4SLinus Torvalds	bgt	$0, $onebyte	# U  .. .. ..	: U L U L
2051da177e4SLinus Torvalds
/*
 * Common exit.  Normal completion arrives with $0 == 0; a faulting EX()
 * instruction is redirected here with $0 holding the bytes not yet cleared.
 */
2061da177e4SLinus Torvalds$zerolength:
2071da177e4SLinus Torvalds$exception:			# Destination for exception recovery(?)
2081da177e4SLinus Torvalds	nop			# .. .. .. E	:
2091da177e4SLinus Torvalds	nop			# .. .. E  ..	:
2101da177e4SLinus Torvalds	nop			# .. E  .. ..	:
21185250231SAl Viro	ret	$31, ($26), 1	# L0 .. .. ..	: L U L U
21285250231SAl Viro	.end __clear_user
21385250231SAl Viro	EXPORT_SYMBOL(__clear_user)
214