blob: 8aedbb5f4b86ec327383fa0d0b08023bc8ba89e5 [file] [log] [blame]
Paul Mackerras14cf11a2005-09-26 16:04:21 +10001/*
2 * Memory copy functions for 32-bit PowerPC.
3 *
4 * Copyright (C) 1996-2005 Paul Mackerras.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
Paul Mackerras14cf11a2005-09-26 16:04:21 +100011#include <asm/processor.h>
12#include <asm/cache.h>
13#include <asm/errno.h>
14#include <asm/ppc_asm.h>
Al Viro9445aa12016-01-13 23:33:46 -050015#include <asm/export.h>
Paul Mackerras14cf11a2005-09-26 16:04:21 +100016
/*
 * Copy 16 bytes as four words from 4(r4) to 4(r6), using r7-r10 as
 * scratch.  The last load and the last store are update forms, so both
 * r4 and r6 end up advanced by 16.
 */
17#define COPY_16_BYTES \
18 lwz r7,4(r4); \
19 lwz r8,8(r4); \
20 lwz r9,12(r4); \
21 lwzu r10,16(r4); \
22 stw r7,4(r6); \
23 stw r8,8(r6); \
24 stw r9,12(r6); \
25 stwu r10,16(r6)
26
/*
 * Same 16-byte copy as COPY_16_BYTES, but each of the eight memory
 * accesses gets its own numeric label 8<n>0 .. 8<n>7 (loads are 0-3,
 * stores are 4-7) so that COPY_16_BYTES_EXCODE(n) can emit one
 * exception-table entry per access.
 */
27#define COPY_16_BYTES_WITHEX(n) \
288 ## n ## 0: \
29 lwz r7,4(r4); \
308 ## n ## 1: \
31 lwz r8,8(r4); \
328 ## n ## 2: \
33 lwz r9,12(r4); \
348 ## n ## 3: \
35 lwzu r10,16(r4); \
368 ## n ## 4: \
37 stw r7,4(r6); \
388 ## n ## 5: \
39 stw r8,8(r6); \
408 ## n ## 6: \
41 stw r9,12(r6); \
428 ## n ## 7: \
43 stwu r10,16(r6)
44
/*
 * Fixup code and exception-table entries for COPY_16_BYTES_WITHEX(n).
 * A fault on one of the four loads (8<n>0 .. 8<n>3) resumes at 9<n>0,
 * a fault on one of the four stores (8<n>4 .. 8<n>7) resumes at 9<n>1.
 * Both fixups subtract 16 * n from r5 and then branch to 104f (load
 * fault) or 105f (store fault) respectively.
 */
45#define COPY_16_BYTES_EXCODE(n) \
469 ## n ## 0: \
47 addi r5,r5,-(16 * n); \
48 b 104f; \
499 ## n ## 1: \
50 addi r5,r5,-(16 * n); \
51 b 105f; \
Nicholas Piggin24bfa6a2016-10-13 16:42:53 +110052 EX_TABLE(8 ## n ## 0b,9 ## n ## 0b); \
53 EX_TABLE(8 ## n ## 1b,9 ## n ## 0b); \
54 EX_TABLE(8 ## n ## 2b,9 ## n ## 0b); \
55 EX_TABLE(8 ## n ## 3b,9 ## n ## 0b); \
56 EX_TABLE(8 ## n ## 4b,9 ## n ## 1b); \
57 EX_TABLE(8 ## n ## 5b,9 ## n ## 1b); \
58 EX_TABLE(8 ## n ## 6b,9 ## n ## 1b); \
59 EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)
Paul Mackerras14cf11a2005-09-26 16:04:21 +100060
/* Code section, followed by N_SO stabs entries naming the source directory and file. */
61 .text
62 .stabs "arch/powerpc/lib/",N_SO,0,0,0f
Sean MacLennan025c0182010-09-01 07:21:21 +000063 .stabs "copy_32.S",N_SO,0,0,0f
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000640:
65
/*
 * Cache line geometry used by the routines below: size in bytes, the
 * matching shift count, and the mask of the byte offset within a line.
 */
Stephen Rothwell7dffb722005-10-17 11:50:32 +100066CACHELINE_BYTES = L1_CACHE_BYTES
67LG_CACHELINE_BYTES = L1_CACHE_SHIFT
68CACHELINE_MASK = (L1_CACHE_BYTES-1)
Paul Mackerras14cf11a2005-09-26 16:04:21 +100069
LEROY Christophedf087e42015-05-19 12:07:48 +020070/*
71 * Use dcbz on the complete cache lines in the destination
72 * to set them to zero. This requires that the destination
73 * area is cacheable. -- paulus
LEROY Christophe400c47d2015-09-16 12:04:53 +020074 *
75 * During early init, cache might not be active yet, so dcbz cannot be used.
76 * We therefore skip the optimised block that uses dcbz. This jump is
77 * replaced by a nop once cache is active. This is done in machine_init()
LEROY Christophedf087e42015-05-19 12:07:48 +020078 */
/*
 * memset: r3 = destination, r4 = fill byte, r5 = length in bytes.
 * r3 is never written below, so it still holds the destination on return.
 * r6 is the running store pointer, kept 4 below the next word to write
 * because the word stores are update forms (stwu r4,4(r6)).
 */
LEROY Christophe5b2a32e2015-05-19 12:07:50 +020079_GLOBAL(memset)
/* replicate the low byte of r4 into all four bytes of the word */
LEROY Christophec152f142015-05-19 12:07:52 +020080 rlwimi r4,r4,8,16,23
81 rlwimi r4,r4,16,0,15
82
LEROY Christophedf087e42015-05-19 12:07:48 +020083 addi r6,r3,-4
84 cmplwi 0,r5,4
/* fewer than 4 bytes: go straight to the byte loop */
85 blt 7f
/* store the first word at the (possibly unaligned) start; r6 = dest afterwards */
86 stwu r4,4(r6)
/* cr0 still holds the length compare: return if the length was exactly 4 */
87 beqlr
/* round r6 down to a word boundary and add the dropped bytes back to r5 */
88 andi. r0,r6,3
89 add r5,r0,r5
90 subf r6,r0,r6
LEROY Christophec152f142015-05-19 12:07:52 +020091 cmplwi 0,r4,0
92 bne 2f /* Use normal procedure if r4 is not zero */
Al Viro9445aa12016-01-13 23:33:46 -050093EXPORT_SYMBOL(memset)
LEROY Christophe400c47d2015-09-16 12:04:53 +020094_GLOBAL(memset_nocache_branch)
95 b 2f /* Skip optimised block until cache is enabled */
LEROY Christophec152f142015-05-19 12:07:52 +020096
/* r7 = offset of r6 within its cache line, r8 = that offset + remaining length */
LEROY Christophedf087e42015-05-19 12:07:48 +020097 clrlwi r7,r6,32-LG_CACHELINE_BYTES
98 add r8,r7,r5
99 srwi r9,r8,LG_CACHELINE_BYTES
100 addic. r9,r9,-1 /* total number of complete cachelines */
101 ble 2f
/* r0 = number of word stores needed to reach the next cache line boundary */
102 xori r0,r7,CACHELINE_MASK & ~3
103 srwi. r0,r0,2
104 beq 3f
105 mtctr r0
1064: stwu r4,4(r6)
107 bdnz 4b
/* zero r9 whole cache lines with dcbz; r7 = 4 compensates for r6 being 4 low */
1083: mtctr r9
109 li r7,4
11010: dcbz r7,r6
111 addi r6,r6,CACHELINE_BYTES
112 bdnz 10b
/* r5 = bytes left in the last partial line, +4 because label 2 stores (r5 / 4) - 1 words */
113 clrlwi r5,r8,32-LG_CACHELINE_BYTES
114 addi r5,r5,4
LEROY Christophedf087e42015-05-19 12:07:48 +0200115
/* word loop: bdz decrements first, so (r5 / 4) - 1 words are stored */
LEROY Christophec152f142015-05-19 12:07:52 +02001162: srwi r0,r5,2
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000117 mtctr r0
118 bdz 6f
1191: stwu r4,4(r6)
120 bdnz 1b
/* trailing 0-3 bytes */
1216: andi. r5,r5,3
1227: cmpwi 0,r5,0
123 beqlr
124 mtctr r5
/* stbu pre-increments by 1, so bias r6 from "next byte - 4" to "next byte - 1" */
125 addi r6,r6,3
1268: stbu r4,1(r6)
127 bdnz 8b
128 blr
129
LEROY Christophedf087e42015-05-19 12:07:48 +0200130/*
131 * This version uses dcbz on the complete cache lines in the
132 * destination area to reduce memory traffic. This requires that
133 * the destination area is cacheable.
134 * We only use this version if the source and dest don't overlap.
135 * -- paulus.
LEROY Christophe1cd03892015-09-16 12:04:51 +0200136 *
137 * During early init, cache might not be active yet, so dcbz cannot be used.
138 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
139 * replaced by a nop once cache is active. This is done in machine_init()
LEROY Christophedf087e42015-05-19 12:07:48 +0200140 */
/*
 * memmove / memcpy: r3 = destination, r4 = source, r5 = length in bytes.
 * r3 is never written in this block. r4 and r6 are the running source and
 * destination pointers, both kept 4 below the next word to access.
 */
LEROY Christophe0b05e2d2015-05-19 12:07:55 +0200141_GLOBAL(memmove)
/* destination above source (unsigned compare): copy from the end downwards */
142 cmplw 0,r3,r4
143 bgt backwards_memcpy
144 /* fall through */
145
146_GLOBAL(memcpy)
/*
 * Unconditional detour around the dcbz-based path below; this is the jump
 * that the header comment says is replaced by a nop once the cache is active.
 */
LEROY Christophe1cd03892015-09-16 12:04:51 +0200147 b generic_memcpy
LEROY Christophedf087e42015-05-19 12:07:48 +0200148 add r7,r3,r5 /* test if the src & dst overlap */
149 add r8,r4,r5
150 cmplw 0,r4,r7
151 cmplw 1,r3,r8
152 crand 0,0,4 /* cr0.lt &= cr1.lt */
LEROY Christophe0b05e2d2015-05-19 12:07:55 +0200153 blt generic_memcpy /* if regions overlap */
LEROY Christophedf087e42015-05-19 12:07:48 +0200154
155 addi r4,r4,-4
156 addi r6,r3,-4
157 neg r0,r3
158 andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
159 beq 58f
160
161 cmplw 0,r5,r0 /* is this more than total to do? */
162 blt 63f /* if not much to do */
163 andi. r8,r0,3 /* get it word-aligned first */
164 subf r5,r0,r5
165 mtctr r8
166 beq+ 61f
16770: lbz r9,4(r4) /* do some bytes */
LEROY Christophedf087e42015-05-19 12:07:48 +0200168 addi r4,r4,1
169 addi r6,r6,1
/* r6 was already bumped, so 3(r6) is the byte that was at 4(r6) before */
LEROY Christophe295ffb42015-05-19 12:07:57 +0200170 stb r9,3(r6)
LEROY Christophedf087e42015-05-19 12:07:48 +0200171 bdnz 70b
17261: srwi. r0,r0,2
173 mtctr r0
174 beq 58f
17572: lwzu r9,4(r4) /* do some words */
176 stwu r9,4(r6)
177 bdnz 72b
178
17958: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
180 clrlwi r5,r5,32-LG_CACHELINE_BYTES
181 li r11,4
182 mtctr r0
183 beq 63f
/* per cache line: dcbz the destination line at r6 + 4, then fill it 16 bytes at a time */
18453:
185 dcbz r11,r6
186 COPY_16_BYTES
187#if L1_CACHE_BYTES >= 32
188 COPY_16_BYTES
189#if L1_CACHE_BYTES >= 64
190 COPY_16_BYTES
191 COPY_16_BYTES
192#if L1_CACHE_BYTES >= 128
193 COPY_16_BYTES
194 COPY_16_BYTES
195 COPY_16_BYTES
196 COPY_16_BYTES
197#endif
198#endif
199#endif
200 bdnz 53b
201
/* remaining whole words */
20263: srwi. r0,r5,2
203 mtctr r0
204 beq 64f
20530: lwzu r0,4(r4)
206 stwu r0,4(r6)
207 bdnz 30b
208
/* remaining 0-3 bytes */
20964: andi. r0,r5,3
210 mtctr r0
211 beq+ 65f
/* lbzu/stbu pre-increment by 1, so move both pointers to "next byte - 1" */
LEROY Christophe295ffb42015-05-19 12:07:57 +0200212 addi r4,r4,3
213 addi r6,r6,3
21440: lbzu r0,1(r4)
215 stbu r0,1(r6)
LEROY Christophedf087e42015-05-19 12:07:48 +0200216 bdnz 40b
21765: blr
Al Viro9445aa12016-01-13 23:33:46 -0500218EXPORT_SYMBOL(memcpy)
219EXPORT_SYMBOL(memmove)
LEROY Christophedf087e42015-05-19 12:07:48 +0200220
/*
 * Forward copy that does not use dcbz: r3 = destination, r4 = source,
 * r5 = length in bytes.  r3 is never written.  r4 and r6 are kept 4 below
 * the next word to access.  The main loop moves 8 bytes per iteration.
 */
Michael Ellermanb4c6afd2016-03-16 21:36:06 +1100221generic_memcpy:
/* r7 = number of 8-byte chunks */
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000222 srwi. r7,r5,3
223 addi r6,r3,-4
224 addi r4,r4,-4
225 beq 2f /* if less than 8 bytes to do */
226 andi. r0,r6,3 /* get dest word aligned */
227 mtctr r7
228 bne 5f
2291: lwz r7,4(r4)
230 lwzu r8,8(r4)
231 stw r7,4(r6)
232 stwu r8,8(r6)
233 bdnz 1b
234 andi. r5,r5,7
/* at most 7 bytes left: one word if possible, then single bytes */
2352: cmplwi 0,r5,4
236 blt 3f
237 lwzu r0,4(r4)
238 addi r5,r5,-4
239 stwu r0,4(r6)
2403: cmpwi 0,r5,0
241 beqlr
242 mtctr r5
/* lbzu/stbu pre-increment by 1, so move both pointers to "next byte - 1" */
243 addi r4,r4,3
244 addi r6,r6,3
2454: lbzu r0,1(r4)
246 stbu r0,1(r6)
247 bdnz 4b
248 blr
/* destination not word aligned: copy 4 - (dest & 3) bytes, then recompute the chunk count */
2495: subfic r0,r0,4
250 mtctr r0
2516: lbz r7,4(r4)
252 addi r4,r4,1
253 stb r7,4(r6)
254 addi r6,r6,1
255 bdnz 6b
256 subf r5,r0,r5
/* r7 = r5 >> 3 */
257 rlwinm. r7,r5,32-3,3,31
258 beq 2b
259 mtctr r7
260 b 1b
261
/*
 * Copy from the end towards the start: r3 = destination, r4 = source,
 * r5 = length in bytes.  r3 is never written.  r6 and r4 start one past
 * the end of the destination and source and are pre-decremented by the
 * update-form accesses.
 */
262_GLOBAL(backwards_memcpy)
263 rlwinm. r7,r5,32-3,3,31 /* r7 = r5 >> 3 */
264 add r6,r3,r5
265 add r4,r4,r5
/* fewer than 8 bytes: skip the 8-byte loop */
266 beq 2f
/* r0 = misalignment of the destination end pointer */
267 andi. r0,r6,3
268 mtctr r7
269 bne 5f
2701: lwz r7,-4(r4)
271 lwzu r8,-8(r4)
272 stw r7,-4(r6)
273 stwu r8,-8(r6)
274 bdnz 1b
275 andi. r5,r5,7
/* at most 7 bytes left: one word if possible, then single bytes */
2762: cmplwi 0,r5,4
277 blt 3f
278 lwzu r0,-4(r4)
279 subi r5,r5,4
280 stwu r0,-4(r6)
2813: cmpwi 0,r5,0
282 beqlr
283 mtctr r5
2844: lbzu r0,-1(r4)
285 stbu r0,-1(r6)
286 bdnz 4b
287 blr
/* copy r0 bytes so the destination pointer becomes word aligned, then recompute the chunk count */
2885: mtctr r0
2896: lbzu r7,-1(r4)
290 stbu r7,-1(r6)
291 bdnz 6b
292 subf r5,r0,r5
/* r7 = r5 >> 3 */
293 rlwinm. r7,r5,32-3,3,31
294 beq 2b
295 mtctr r7
296 b 1b
297
/*
 * __copy_tofrom_user: r3 = destination, r4 = source, r5 = length in bytes.
 * Every load and store that moves data has an exception-table entry.
 * On the no-fault path r3 is set to 0 before returning (label 65).
 * On a fault the fixup code computes the number of bytes not copied into
 * r3; see the comment above label 99.
 * r4 and r6 are the running source and destination pointers, both kept
 * 4 below the next word to access.
 */
298_GLOBAL(__copy_tofrom_user)
299 addi r4,r4,-4
300 addi r6,r3,-4
301 neg r0,r3
302 andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
303 beq 58f
304
305 cmplw 0,r5,r0 /* is this more than total to do? */
306 blt 63f /* if not much to do */
307 andi. r8,r0,3 /* get it word-aligned first */
308 mtctr r8
309 beq+ 61f
31070: lbz r9,4(r4) /* do some bytes */
31171: stb r9,4(r6)
312 addi r4,r4,1
313 addi r6,r6,1
314 bdnz 70b
/* r5 is only reduced here, after the byte loop; fixup 90 relies on that */
31561: subf r5,r0,r5
316 srwi. r0,r0,2
317 mtctr r0
318 beq 58f
31972: lwzu r9,4(r4) /* do some words */
32073: stwu r9,4(r6)
321 bdnz 72b
322
Nicholas Piggin24bfa6a2016-10-13 16:42:53 +1100323 EX_TABLE(70b,100f)
324 EX_TABLE(71b,101f)
325 EX_TABLE(72b,102f)
326 EX_TABLE(73b,103f)
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000327
32858: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
329 clrlwi r5,r5,32-LG_CACHELINE_BYTES
330 li r11,4
331 beq 63f
332
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000333 /* Here we decide how far ahead to prefetch the source */
/* r3 is reused as the dcbt offset from r4; r7 = number of lines to prefetch ahead */
334 li r3,4
335 cmpwi r0,1
336 li r7,0
337 ble 114f
338 li r7,1
339#if MAX_COPY_PREFETCH > 1
340 /* Heuristically, for large transfers we prefetch
341 MAX_COPY_PREFETCH cachelines ahead. For small transfers
342 we prefetch 1 cacheline ahead. */
343 cmpwi r0,MAX_COPY_PREFETCH
344 ble 112f
345 li r7,MAX_COPY_PREFETCH
346112: mtctr r7
347111: dcbt r3,r4
348 addi r3,r3,CACHELINE_BYTES
349 bdnz 111b
350#else
351 dcbt r3,r4
352 addi r3,r3,CACHELINE_BYTES
353#endif /* MAX_COPY_PREFETCH > 1 */
354
/*
 * Copy r0 - r7 lines in this pass and leave r7 in r0 as the number of
 * lines still to do in a second pass (see the "bne 114b" below).
 */
355114: subf r8,r7,r0
356 mr r0,r7
357 mtctr r8
358
35953: dcbt r3,r4
36054: dcbz r11,r6
Nicholas Piggin24bfa6a2016-10-13 16:42:53 +1100361 EX_TABLE(54b,105f)
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000362/* the main body of the cacheline loop */
363 COPY_16_BYTES_WITHEX(0)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000364#if L1_CACHE_BYTES >= 32
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000365 COPY_16_BYTES_WITHEX(1)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000366#if L1_CACHE_BYTES >= 64
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000367 COPY_16_BYTES_WITHEX(2)
368 COPY_16_BYTES_WITHEX(3)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000369#if L1_CACHE_BYTES >= 128
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000370 COPY_16_BYTES_WITHEX(4)
371 COPY_16_BYTES_WITHEX(5)
372 COPY_16_BYTES_WITHEX(6)
373 COPY_16_BYTES_WITHEX(7)
374#endif
375#endif
376#endif
377 bdnz 53b
/* if lines were held back (r0 != 0), loop once more with r7 = 0 so all of them are copied */
378 cmpwi r0,0
379 li r3,4
380 li r7,0
381 bne 114b
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000382
/* remaining whole words */
38363: srwi. r0,r5,2
384 mtctr r0
385 beq 64f
38630: lwzu r0,4(r4)
38731: stwu r0,4(r6)
388 bdnz 30b
389
/* remaining 0-3 bytes */
39064: andi. r0,r5,3
391 mtctr r0
392 beq+ 65f
39340: lbz r0,4(r4)
39441: stb r0,4(r6)
395 addi r4,r4,1
396 addi r6,r6,1
397 bdnz 40b
/* no fault: return 0 */
39865: li r3,0
399 blr
400
/*
 * Fixup code.  Each entry sets r9 (0 = read fault, 1 = write fault) and
 * r3 (shift applied to the loop counter), then joins the common tail at
 * 99 or 106.
 */
401/* read fault, initial single-byte copy */
402100: li r9,0
403 b 90f
404/* write fault, initial single-byte copy */
405101: li r9,1
/* r5 has not been reduced yet at this point; take off the r8 alignment bytes */
40690: subf r5,r8,r5
407 li r3,0
408 b 99f
409/* read fault, initial word copy */
410102: li r9,0
411 b 91f
412/* write fault, initial word copy */
413103: li r9,1
41491: li r3,2
415 b 99f
416
417/*
418 * this stuff handles faults in the cacheline loop and branches to either
419 * 104f (if in read part) or 105f (if in write part), after updating r5
420 */
421 COPY_16_BYTES_EXCODE(0)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000422#if L1_CACHE_BYTES >= 32
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000423 COPY_16_BYTES_EXCODE(1)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000424#if L1_CACHE_BYTES >= 64
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000425 COPY_16_BYTES_EXCODE(2)
426 COPY_16_BYTES_EXCODE(3)
Stephen Rothwell7dffb722005-10-17 11:50:32 +1000427#if L1_CACHE_BYTES >= 128
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000428 COPY_16_BYTES_EXCODE(4)
429 COPY_16_BYTES_EXCODE(5)
430 COPY_16_BYTES_EXCODE(6)
431 COPY_16_BYTES_EXCODE(7)
432#endif
433#endif
434#endif
435
436/* read fault in cacheline loop */
437104: li r9,0
438 b 92f
439/* fault on dcbz (effectively a write fault) */
440/* or write fault in cacheline loop */
441105: li r9,1
/* line count = lines left in this pass (ctr) + lines held back in r0 */
44292: li r3,LG_CACHELINE_BYTES
443 mfctr r8
444 add r0,r0,r8
445 b 106f
446/* read fault in final word loop */
447108: li r9,0
448 b 93f
449/* write fault in final word loop */
450109: li r9,1
45193: andi. r5,r5,3
452 li r3,2
453 b 99f
454/* read fault in final byte loop */
455110: li r9,0
456 b 94f
457/* write fault in final byte loop */
/*
 * Numeric label 111 is also defined in the prefetch loop above; the
 * EX_TABLE(41b,111b) entry below refers to this later definition.
 */
458111: li r9,1
45994: li r5,0
460 li r3,0
461/*
462 * At this stage the number of bytes not copied is
463 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
464 */
46599: mfctr r0
466106: slw r3,r0,r3
467 add. r3,r3,r5
468 beq 120f /* shouldn't happen */
469 cmpwi 0,r9,0
/* write fault: return with r3 = number of bytes not copied */
470 bne 120f
471/* for a read fault, first try to continue the copy one byte at a time */
472 mtctr r3
473130: lbz r0,4(r4)
474131: stb r0,4(r6)
475 addi r4,r4,1
476 addi r6,r6,1
477 bdnz 130b
478/* loop finished, or a load at 130 faulted again: return the count left in ctr */
479132: mfctr r3
Paul Mackerras14cf11a2005-09-26 16:04:21 +1000480120: blr
481
Nicholas Piggin24bfa6a2016-10-13 16:42:53 +1100482 EX_TABLE(30b,108b)
483 EX_TABLE(31b,109b)
484 EX_TABLE(40b,110b)
485 EX_TABLE(41b,111b)
486 EX_TABLE(130b,132b)
487 EX_TABLE(131b,120b)
Nicholas Piggin24bfa6a2016-10-13 16:42:53 +1100488
Al Viro9445aa12016-01-13 23:33:46 -0500489EXPORT_SYMBOL(__copy_tofrom_user)
Al Viro9445aa12016-01-13 23:33:46 -0500489EXPORT_SYMBOL(__copy_tofrom_user)