1*b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds * linux/arch/alpha/lib/memcpy.c
41da177e4SLinus Torvalds *
51da177e4SLinus Torvalds * Copyright (C) 1995 Linus Torvalds
61da177e4SLinus Torvalds */
71da177e4SLinus Torvalds
81da177e4SLinus Torvalds /*
91da177e4SLinus Torvalds * This is a reasonably optimized memcpy() routine.
101da177e4SLinus Torvalds */
111da177e4SLinus Torvalds
/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually.
 */
181da177e4SLinus Torvalds
191da177e4SLinus Torvalds #include <linux/types.h>
2000fc0e0dSAl Viro #include <linux/export.h>
211da177e4SLinus Torvalds
/*
 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
 * with a macro so that we can fix it up later.
 */
/*
 * ALIGN_DEST_TO8_UP - copy single bytes upward until d is 8-byte aligned.
 *
 * d, s and n must be modifiable integer lvalues (the callers in this file
 * pass addresses as unsigned long and the remaining count as long); all
 * three are updated in place.
 *
 * NOTE: the expansion executes a bare "return;" when the count runs out
 * before d is aligned, so it can only be used inside a void function.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesized so that the macro acts as exactly one statement, e.g. as
 * the unbraced body of an if/else.
 */
#define ALIGN_DEST_TO8_UP(d,s,n) \
	do { \
		while ((d) & 7) { \
			if ((n) <= 0) return; \
			(n)--; \
			*(char *) (d) = *(char *) (s); \
			(d)++; (s)++; \
		} \
	} while (0)
/*
 * ALIGN_DEST_TO8_DN - copy single bytes downward until d is 8-byte aligned.
 *
 * d and s point one past the next byte to copy; both are pre-decremented
 * before each byte is moved. d, s and n must be modifiable integer lvalues
 * and are updated in place.
 *
 * NOTE: the expansion executes a bare "return;" when the count runs out
 * before d is aligned, so it can only be used inside a void function.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesized so that the macro acts as exactly one statement.
 */
#define ALIGN_DEST_TO8_DN(d,s,n) \
	do { \
		while ((d) & 7) { \
			if ((n) <= 0) return; \
			(n)--; \
			(d)--; (s)--; \
			*(char *) (d) = *(char *) (s); \
		} \
	} while (0)
401da177e4SLinus Torvalds
/*
 * This should similarly be done with ldq_u*2/mask/stq. The destination
 * is aligned, but we don't fill in a full quad-word.
 */
/*
 * DO_REST_UP - copy the remaining n bytes one at a time, moving upward.
 *
 * d, s and n must be modifiable integer lvalues; on exit d and s have been
 * advanced by the number of bytes copied and n is 0 (or unchanged if it
 * was already <= 0, in which case nothing is copied).
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesized so that the macro acts as exactly one statement, e.g. as
 * the unbraced body of an if/else.
 */
#define DO_REST_UP(d,s,n) \
	do { \
		while ((n) > 0) { \
			(n)--; \
			*(char *) (d) = *(char *) (s); \
			(d)++; (s)++; \
		} \
	} while (0)
/*
 * DO_REST_DN - copy the remaining n bytes one at a time, moving downward.
 *
 * d and s point one past the next byte to copy; both are pre-decremented
 * before each byte is moved. d, s and n must be modifiable integer lvalues;
 * nothing is copied when n is already <= 0.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesized so that the macro acts as exactly one statement, e.g. as
 * the unbraced body of an if/else.
 */
#define DO_REST_DN(d,s,n) \
	do { \
		while ((n) > 0) { \
			(n)--; \
			(d)--; (s)--; \
			*(char *) (d) = *(char *) (s); \
		} \
	} while (0)
571da177e4SLinus Torvalds
/*
 * This should be done with ldq/mask/stq. The source and destination are
 * aligned, but we don't fill in a full quad-word.
 */
/*
 * Tail copies for the mutually-aligned case. For now these simply forward
 * to the generic byte-wise tail macros.
 */
#define DO_REST_ALIGNED_UP(dst, src, cnt)	DO_REST_UP(dst, src, cnt)
#define DO_REST_ALIGNED_DN(dst, src, cnt)	DO_REST_DN(dst, src, cnt)
641da177e4SLinus Torvalds
/*
 * This does unaligned memory copies. We want to avoid storing to
 * an unaligned address, as that would do a read-modify-write cycle.
 * We also want to avoid reading the same unaligned source quad-word twice.
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
/*
 * Upward copy of n bytes from address s to address d for the case where
 * the two addresses cannot be brought to 8-byte alignment together.
 *
 * The destination is first aligned bytewise; after that every store is a
 * full aligned quadword built from two neighbouring source quadwords
 * (loaded with ldq_u and combined with extql/extqh keyed on s). The
 * second quadword of one iteration is carried over as the first of the
 * next, so each source quadword is loaded only once.
 */
static inline void __memcpy_unaligned_up(unsigned long d, unsigned long s,
					 long n)
{
	/* Returns straight out of this function if n runs out here. */
	ALIGN_DEST_TO8_UP(d, s, n);

	/* Bias by one quadword so the loop only has to test against zero. */
	n -= 8;
	if (n >= 0) {
		unsigned long cur, next;

		__asm__("ldq_u %0,%1"
			: "=r" (cur)
			: "m" (*(unsigned long *) s));
		do {
			unsigned long upper;

			__asm__("ldq_u %0,%1"
				: "=r" (next)
				: "m" (*(unsigned long *) (s + 8)));
			n -= 8;
			__asm__("extql %1,%2,%0"
				: "=r" (cur)
				: "r" (cur), "r" (s));
			__asm__("extqh %1,%2,%0"
				: "=r" (upper)
				: "r" (next), "r" (s));
			s += 8;
			*(unsigned long *) d = cur | upper;
			d += 8;
			/* Reuse the quadword just loaded in the next round. */
			cur = next;
		} while (n >= 0);
	}

	/* Remove the bias again and finish the short tail bytewise. */
	n += 8;
	DO_REST_UP(d, s, n);
}
991da177e4SLinus Torvalds
/*
 * Downward copy of n bytes from address s to address d, one byte at a
 * time, starting with the highest-addressed byte and ending with the
 * lowest.
 *
 * A count of zero or less copies nothing, matching the other helpers in
 * this file (a plain "while (n--)" would keep running on a negative count
 * until n wrapped around).
 */
static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
					  long n)
{
	/* I don't understand AXP assembler well enough for this. -Tim */
	s += n;
	d += n;
	while (n-- > 0) {
		* (char *) --d = * (char *) --s;
	}
}
1091da177e4SLinus Torvalds
/*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but using a floating point register
 * for the move seems to slow things down (very small difference, though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
/*
 * Upward copy of n bytes from address s to address d. memcpy() in this
 * file uses it when d and s agree in their low three address bits, so
 * aligning d also aligns s.
 *
 * Leading bytes are copied singly until d is 8-byte aligned, the bulk is
 * moved one quadword per iteration, and any tail shorter than a quadword
 * is copied singly again.
 */
static inline void __memcpy_aligned_up(unsigned long d, unsigned long s,
				       long n)
{
	/* Returns straight out of this function if n runs out here. */
	ALIGN_DEST_TO8_UP(d, s, n);

	/* n is biased by -8 inside the loop so the test is against zero. */
	for (n -= 8; n >= 0; n -= 8) {
		unsigned long quad;

		/* Explicit ldq keeps the value in an integer register. */
		__asm__("ldq %0,%1"
			: "=r" (quad)
			: "m" (*(unsigned long *) s));
		s += 8;
		*(unsigned long *) d = quad;
		d += 8;
	}

	/* Remove the bias again and finish the short tail. */
	n += 8;
	DO_REST_ALIGNED_UP(d, s, n);
}
/*
 * Downward counterpart of __memcpy_aligned_up(): copies n bytes from
 * address s to address d starting at the top of both regions and working
 * toward lower addresses.
 *
 * Trailing bytes are copied singly until the (descending) destination
 * pointer is 8-byte aligned, the bulk is moved one quadword per
 * iteration, and whatever is left at the bottom is copied singly.
 */
static inline void __memcpy_aligned_dn(unsigned long d, unsigned long s,
				       long n)
{
	/* Start one past the last byte of each region. */
	d += n;
	s += n;

	/* Returns straight out of this function if n runs out here. */
	ALIGN_DEST_TO8_DN(d, s, n);

	/* n is biased by -8 inside the loop so the test is against zero. */
	for (n -= 8; n >= 0; n -= 8) {
		unsigned long quad;

		s -= 8;
		__asm__("ldq %0,%1"
			: "=r" (quad)
			: "m" (*(unsigned long *) s));
		d -= 8;
		*(unsigned long *) d = quad;
	}

	/* Remove the bias again and finish the short remainder. */
	n += 8;
	DO_REST_ALIGNED_DN(d, s, n);
}
1521da177e4SLinus Torvalds
/*
 * memcpy - copy n bytes from src to dest, always working upward.
 *
 * Picks the quadword-at-a-time helper when dest and src share the same
 * offset within an 8-byte word (their low three address bits are equal),
 * and the shifting/merging helper otherwise.
 *
 * Returns dest.
 */
void *memcpy(void *dest, const void *src, size_t n)
{
	unsigned long to = (unsigned long) dest;
	unsigned long from = (unsigned long) src;

	if ((to ^ from) & 7)
		__memcpy_unaligned_up(to, from, n);
	else
		__memcpy_aligned_up(to, from, n);

	return dest;
}
EXPORT_SYMBOL(memcpy);
164