/*
 * linux/arch/e2k/lib/recovery_string.S
 *
 * (File-listing metadata: 466 lines, 9.7 KiB.  Note the "ArmAsm" tag
 * from the listing is wrong - this is E2K/Elbrus VLIW assembly.)
 */
#include <asm/e2k_api.h>
.section ".text", "ax"
/*
 * $recovery_memcpy_8(dst, src, len, strd_opcode, ldrd_opcode, prefetch)
 *
 * Copy 'len' bytes from 'src' to 'dst' in 8-byte quantums, using the
 * caller-supplied ldrd/strd opcodes.  %dr14 tracks the number of bytes
 * safely copied so that, if a page fault interrupts the copy, the
 * fixup entry $recovery_memcpy_fault (below) can report progress.
 * On success %dr0 = len (every exit path does 'addd %dr2, 0x0, %dr0'
 * just before 'ct %ctpr3'); the old "Does not return a value" note was
 * stale.
 */
.global $recovery_memcpy_8
.type recovery_memcpy_8,@function
$recovery_memcpy_8:
.ignore ld_st_style
/*
* dr0 - dst
* dr1 - src
* dr2 - len
* dr3 - strd opcode
* dr4 - ldrd opcode
* r5 - enable prefetching
*
* Returns 'len' in dr0; on a page fault the fixup handler
* returns the number of bytes copied instead.
*/
/* Set up the register window, the rotating register area (%db[..],
 * base 0x8) and rotating predicates.  %dr10/%dr11 (and %dr12/%dr13
 * below) become the ldrd opcode at source offsets 0x0/0x8/0x10/0x18. */
{
setwd wsz = 0x14, nfx = 0x1
ipd 1
disp %ctpr2, very_small_size
setbn rsz = 0xb, rbs = 0x8, rcur = 0x0
setbp psz = 0x1
/* dr14 holds the number of copied bytes
* in case pagefault happens */
addd,4 0x0, 0x0, %dr14
addd,3 %dr4, 0x0, %dr10
addd,5 %dr4, 0x8, %dr11
}
{
ipd 1
disp %ctpr1, small_size
addd,5 %dr4, 0x10, %dr12
/* %pred26 == 'true' if 'size' is zero (i.e. 'size' >= 8) */
cmpedb,0 %dr2, 0x0, %pred26
/* %pred27 == 'false' if 'size' >= 16 bytes */
cmpbdb,1 %dr2, 0x10, %pred27
/* %pred28 == 'false' if 'size' >= 24 bytes */
cmpbdb,3 %dr2, 0x18, %pred28
/* %pred25 == 'true' if 'size' <= 32 bytes */
cmpledb,4 %dr2, 0x20, %pred25
}
/* 'return' only primes %ctpr3 with the return target; the actual
 * jump happens at a later 'ct %ctpr3'. */
{
return %ctpr3
addd,5 %dr4, 0x18, %dr13
/* %pred12 == 'true' if 'size' < 0x40 bytes */
cmpbdb,0 %dr2, 0x40, %pred12
/* %pred29 == 'false' if 'size' >= 32 bytes */
cmpbdb,1 %dr2, 0x20, %pred29
/* If %pred6 is 'false' then the remaining 16-bytes
* tail has to be copied after the main copying loop
* which copies data in 32-bytes blocks. */
cmpandedb,3 %dr2, 0x10, %pred6
/* %pred7 == 'size' < 192 (minimum allowed size
* for the optimized copying algorithm - 6 cachelines
* for unrolling) */
cmpbdb,4 %dr2, 0xc0, %pred7
}
/* %dr6..%dr9 = strd opcode at destination offsets 0x0/0x8/0x10/0x18;
 * start the first (predicated) loads of up to 32 bytes right away. */
{
/* %pred8 == 'true' if 'size' is a multiple of 16 */
cmpandedb,1 %dr2, 0x8, %pred8
addd,0 %dr3, 0x0, %dr6
addd,2 %dr3, 0x10, %dr8
addd,4 %dr3, 0x8, %dr7
ldrd,3 [ %dr1 + %dr10 ], %db[10] ? ~ %pred26
ldrd,5 [ %dr1 + %dr11 ], %db[11] ? ~ %pred27
}
/* size <= 32 bytes: go store what was just loaded. */
{
addd,4 %dr3, 0x18, %dr9
ldrd,3 [ %dr1 + %dr12 ], %db[22] ? ~ %pred28
ldrd,5 [ %dr1 + %dr13 ], %db[23] ? ~ %pred29
ct %ctpr2 ? %pred25
}
/* %dr5/%dr4 = running source load pointers, %dr3 = src + len (end),
 * %db[0] = first prefetch cursor (src + 0x100). */
{
ipd 0
disp %ctpr2, copy_tail_small
addd,1 %dr1, 0x100, %db[0]
/* Check whether prefetching is disabled */
cmpesb,4 %r5, 0, %pred15
addd,0 %dr1, 0x20, %dr5
addd,2 %dr1, 0x40, %dr4
addd,5 %dr1, %dr2, %dr3
/* If the block is small, use simple loop without unrolling */
ct %ctpr1 ? %pred7
}
/* NOTE: %ctpr3 is re-targeted here (it is primed with 'return' again
 * before the final exit). Keep filling the software pipeline: the next
 * 4 doublewords per bundle go into the rotating %db area. */
{
ipd 0
disp %ctpr3, skip_prefetch_loop
cmpbdb,4 %dr2, 0x2c0, %pred2
ldrd,0 [ %dr5 + %dr10 ], %db[8]
ldrd,2 [ %dr5 + %dr11 ], %db[9]
ldrd,3 [ %dr5 + %dr12 ], %db[20]
ldrd,5 [ %dr5 + %dr13 ], %db[21]
addd %dr5, 0x40, %dr5
}
{
ipd 0
disp %ctpr1, prefetch
cmpbdb,4 %dr2, 0xe0, %pred3
ldrd,0 [ %dr4 + %dr10 ], %db[6]
ldrd,2 [ %dr4 + %dr11 ], %db[7]
ldrd,3 [ %dr4 + %dr12 ], %db[18]
ldrd,5 [ %dr4 + %dr13 ], %db[19]
addd %dr4, 0x40, %dr4
}
{
ipd 1
disp %ctpr2, copy
ldrd,0 [ %dr5 + %dr10 ], %db[4]
ldrd,2 [ %dr5 + %dr11 ], %db[5]
ldrd,3 [ %dr5 + %dr12 ], %db[16]
ldrd,5 [ %dr5 + %dr13 ], %db[17]
addd %dr5, 0x40, %dr5
}
/* %db[1] = second prefetch cursor; skip the prefetch loop entirely
 * when prefetching is disabled (%pred15) or size < 0x2c0 (%pred2). */
{
cmpbdb,4 %dr2, 0x4c0, %pred0
addd,1 %dr1, 0x100, %db[1]
ldrd,0 [ %dr4 + %dr10 ], %db[2]
ldrd,2 [ %dr4 + %dr11 ], %db[3]
ldrd,3 [ %dr4 + %dr12 ], %db[14]
ldrd,5 [ %dr4 + %dr13 ], %db[15]
ct %ctpr3 ? %pred15
}
/* Load the src block into the L2 cache - prefetching to L1
* is neither practical (only 1 line is fetched per cycle)
* nor needed (this loop is unrolled enough to do not worry
* about latency). */
{
subd,4 %dr3, 0x5c0, %dr4
ldb,sm [ %dr1 + 0xc0 ] MAS_BYPASS_L1_CACHE, %empty ? ~ %pred3
ldb,sm [ %dr1 + 0xe0 ] MAS_BYPASS_L1_CACHE, %empty ? ~ %pred3
ct %ctpr3 ? %pred2
}
/* Prefetch loop: two interleaved streams (%db[0], %db[1]) touch one
 * cacheline per 0x40 bytes, advancing 0x200 bytes per stream per
 * iteration, until fewer than 0x5c0 bytes remain unprefetched. */
prefetch:
{
/* pred1 = dr4 < db[0] =
* = dr1 + dr2 - 0x5c0 < dr1 + prefetched =
* = dr2 - prefetched < 0x5c0 =
* = size - prefetched < 0x5c0 */
cmpbdb,4 %dr4, %db[0], %pred1
ldb,0,sm [ %db[0] + 0 ] MAS_BYPASS_L1_CACHE, %empty
ldb,2,sm [ %db[0] + 0x40 ] MAS_BYPASS_L1_CACHE, %empty
ldb,3,sm [ %db[0] + 0x80 ] MAS_BYPASS_L1_CACHE, %empty
ldb,5,sm [ %db[0] + 0xc0 ] MAS_BYPASS_L1_CACHE, %empty
addd %db[0], 0x200, %db[0]
}
{
ldb,0,sm [ %db[1] + 0x100 ] MAS_BYPASS_L1_CACHE, %empty
ldb,2,sm [ %db[1] + 0x140 ] MAS_BYPASS_L1_CACHE, %empty
ldb,3,sm [ %db[1] + 0x180 ] MAS_BYPASS_L1_CACHE, %empty
ldb,5,sm [ %db[1] + 0x1c0 ] MAS_BYPASS_L1_CACHE, %empty
addd %db[1], 0x200, %db[1]
abp abpf = 1, abpt = 1
ct %ctpr1 ? ~ %pred0
}
skip_prefetch_loop:
/* Copy the page */
/* %dr14 is pre-biased by -0x20 here because the loop below adds 0x20
 * back at its top - see the "If trap happens" comment in the loop. */
{
ipd 1
disp %ctpr1, copy_tail
subd,5 %dr14, 0x20, %dr14
ldb,0,sm [ %db[0] + 0 ] (MAS_LOAD_SPEC | MAS_BYPASS_L1_CACHE), %empty ? ~ %pred15
ldb,2,sm [ %db[0] + 0x40 ] (MAS_LOAD_SPEC | MAS_BYPASS_L1_CACHE), %empty ? ~ %pred15
cmpbdb,3 %dr2, 0xe0, %pred0
/* dr3 = dr1 + dr2 - 0x60 */
subd,4 %dr3, 0x60, %dr3
}
/* Main software-pipelined loop, 32 bytes stored per iteration: load
 * the next 4 doublewords into the rotating %db area while storing
 * ones loaded on earlier iterations; abn/abp rotate the register and
 * predicate bases each time around. */
copy:
{
cmpldb,4 %dr3, %dr5, %pred1
ldrd,0 [ %dr5 + %dr10 ], %db[0]
ldrd,2 [ %dr5 + %dr11 ], %db[1]
ldrd,3 [ %dr5 + %dr12 ], %db[12]
ldrd,5 [ %dr5 + %dr13 ], %db[13]
addd %dr5, 0x20, %dr5
}
{
/* If trap happens on previous instruction %dr14
* will be negative, so we check for that in trap
* handler. */
addd,3 %dr14, 0x20, %dr14
strd,2 [ %dr0 + %dr6 ], %db[10]
strd,5 [ %dr0 + %dr7 ], %db[11]
addd,1 %dr6, 0x20, %dr6
addd,4 %dr7, 0x20, %dr7
}
{
strd,2 [ %dr0 + %dr8 ], %db[22]
strd,5 [ %dr0 + %dr9 ], %db[23]
addd,1 %dr8, 0x20, %dr8
addd,4 %dr9, 0x20, %dr9
abn abnf = 1, abnt = 1
abp abpf = 1, abpt = 1
ct %ctpr2 ? ~ %pred0
}
/* Copy the remaining tail */
/* Re-prime %ctpr3 with the return target and (predicated on the size
 * bits) load the optional 16-byte and 8-byte tails. */
{
subd,1 %dr2, 0x60, %dr3
ldrd,0 [ %dr5 + %dr10 ], %db[0] ? ~ %pred6
ldrd,2 [ %dr5 + %dr11 ], %db[1] ? ~ %pred6
addd,3 %dr10, 0x10, %dr10 ? ~ %pred6
cmpedb 0x0, 0x0, %pred0
return %ctpr3
}
{
ldrd,3 [ %dr5 + %dr10 ], %dr13 ? ~ %pred8
}
/* Drain the pipeline: store the data already loaded, then the
 * 16-byte (%pred6) and 8-byte (%pred8) tails. */
copy_tail:
{
addd,3 %dr14, 0x20, %dr14
cmpbesb %r6, %r3, %pred1
strd,2 [ %dr0 + %dr6 ], %db[10]
strd,5 [ %dr0 + %dr7 ], %db[11]
addd,1 %dr6, 0x20, %dr6
addd,4 %dr7, 0x20, %dr7
}
{
strd,2 [ %dr0 + %dr8 ], %db[22]
strd,5 [ %dr0 + %dr9 ], %db[23]
addd,1 %dr8, 0x20, %dr8
addd,4 %dr9, 0x20, %dr9
abn abnf = 1, abnt = 1
abp abpf = 1, abpt = 1
ct %ctpr1 ? %pred0
}
{
addd,3 %dr14, 0x20, %dr14
strd,2 [ %dr0 + %dr6 ], %db[10] ? ~ %pred6
strd,5 [ %dr0 + %dr7 ], %db[11] ? ~ %pred6
addd,1 %dr6, 0x10, %dr6 ? ~ %pred6
}
{
addd,3 %dr14, 0x10, %dr14 ? ~ %pred6
strd [ %dr0 + %dr6 ], %dr13 ? ~ %pred8
}
/* Success: return 'len' in %dr0 (kept apart from the stores above so
 * a fault on them cannot hit the returning instruction). */
{
addd,3 %dr2, 0x0, %dr0
ct %ctpr3
}
/* size <= 32 bytes: just store the up-to-four doublewords that were
 * loaded speculatively at function entry, under the same predicates. */
very_small_size:
{
strd [ %dr0 + %dr6 ], %db[10] ? ~ %pred26
strd [ %dr0 + %dr7 ], %db[11] ? ~ %pred27
}
{
addd,0 %dr14, 0x10, %dr14 ? ~ %pred27
strd [ %dr0 + %dr8 ], %db[22] ? ~ %pred28
strd [ %dr0 + %dr9 ], %db[23] ? ~ %pred29
}
{
/* Return should not be in the same instruction
* with memory access, otherwise we will return
* on page fault and page fault handler will
* return from our caller. */
addd,3 %dr2, 0x0, %dr0
ct %ctpr3
}
/* Medium-size path (32 < size < 0xc0): plain 32-bytes-per-iteration
 * loop without prefetching or deep unrolling. */
small_size:
{
ipd 0
disp %ctpr1, copy_small
cmpbdb %dr2, 0x60, %pred0
subd,4 %dr3, 0x60, %dr3
subd,3 %dr14, 0x20, %dr14 ? ~ %pred12
ct %ctpr2 ? %pred12
}
copy_small:
{
cmpldb,4 %dr3, %dr5, %pred1
ldrd,0 [ %dr5 + %dr10 ], %db[8]
ldrd,3 [ %dr5 + %dr11 ], %db[9]
ldrd,2 [ %dr5 + %dr12 ], %db[20]
ldrd,5 [ %dr5 + %dr13 ], %db[21]
addd %dr5, 0x20, %dr5
}
{
/* If trap happens on previous instruction %dr14
* will be negative, so we check for that in trap
* handler. */
addd,3 %dr14, 0x20, %dr14
strd,2 [ %dr0 + %dr6 ], %db[10]
strd,5 [ %dr0 + %dr7 ], %db[11]
addd,1 %dr6, 0x20, %dr6
addd,4 %dr7, 0x20, %dr7
}
{
strd,2 [ %dr0 + %dr8 ], %db[22]
strd,5 [ %dr0 + %dr9 ], %db[23]
addd,1 %dr8, 0x20, %dr8
addd,4 %dr9, 0x20, %dr9
abn abnf = 1, abnt = 1
abp abpf = 1, abpt = 1
ct %ctpr1 ? ~ %pred0
}
/* Drain for the small path: final loads then the tail stores, with
 * the 16-byte (%pred6) and 8-byte (%pred8) remainders. */
copy_tail_small:
{
addd,4 %dr14, 0x20, %dr14 ? ~ %pred12
ldrd,0 [ %dr5 + %dr10 ], %db[8] ? ~ %pred6
ldrd,3 [ %dr5 + %dr11 ], %db[9] ? ~ %pred6
addd,1 %dr10, 0x10, %dr10 ? ~ %pred6
}
{
ldrd,2 [ %dr5 + %dr10 ], %dr13 ? ~ %pred8
}
{
strd,2 [ %dr0 + %dr6 ], %db[10]
strd,5 [ %dr0 + %dr7 ], %db[11]
addd,1 %dr6, 0x20, %dr6
addd,4 %dr7, 0x20, %dr7
}
{
addd,3 %dr14, 0x10, %dr14
strd,2 [ %dr0 + %dr8 ], %db[22]
strd,5 [ %dr0 + %dr9 ], %db[23]
}
{
addd,3 %dr14, 0x10, %dr14
strd,2 [ %dr0 + %dr6 ], %db[8] ? ~ %pred6
strd,5 [ %dr0 + %dr7 ], %db[9] ? ~ %pred6
addd,1 %dr6, 0x10, %dr6 ? ~ %pred6
}
{
addd,3 %dr14, 0x10, %dr14 ? ~ %pred6
strd,2 [ %dr0 + %dr6 ], %dr13 ? ~ %pred8
}
/* Success: return 'len' (kept separate from stores, see above). */
{
addd,3 %dr2, 0x0, %dr0
ct %ctpr3
}
.size $recovery_memcpy_8, . - $recovery_memcpy_8
/*
 * Fault fixup entry for $recovery_memcpy_8.
 *
 * Jumped to by the trap handler when a page fault interrupts the copy.
 * Returns in %dr0 the number of bytes safely copied: %dr14 when it is
 * non-negative, otherwise 0 (%dr14 is transiently negative when a fault
 * hits the pre-biased load bundle - see the "If trap happens on previous
 * instruction" comments in the copy loops).
 */
.global $recovery_memcpy_fault
.global $.recovery_memcpy_fault
$recovery_memcpy_fault:
$.recovery_memcpy_fault:
.ignore ld_st_style
{
nop
/* %pred0 == 'true' if %dr14 < 0 (fault before the first store of
 * the current block completed) */
cmpldb %dr14, 0, %pred0
return %ctpr3
}
{
/* %dr0 = (%dr14 < 0) ? 0 : %dr14 */
addd 0, 0, %dr0 ? %pred0
addd 0, %dr14, %dr0 ? ~ %pred0
ct %ctpr3
}
.size $recovery_memcpy_fault, . - $recovery_memcpy_fault
/*
 * $recovery_memset_8(dst, value, tag, size, strd_opcode)
 *
 * Fill 'size' bytes at 'dst' with the 8-byte 'value' carrying tag 'tag'
 * (puttagd merges the tag into the data word), storing through the
 * caller-supplied strd opcode.  Main loop writes 32 bytes per
 * iteration; store_tail handles the remaining 16-byte and 8-byte
 * pieces under %pred2/%pred3.
 */
.global $recovery_memset_8
$recovery_memset_8:
.ignore ld_st_style
/*
* dr0 - dst
* dr1 - value
* dr2 - tag
* dr3 - size
* dr4 - strd opcode
*/
{
ipd 0
disp %ctpr2, store_tail
setbp psz = 0x1
}
{
ipd 0
disp %ctpr1, store
/* %pred4 == 'true' if 'size' <= 0x18: go straight to the tail */
cmpbesb,0 %r3, 0x18, %pred4
/* %pred2 == 'true' if the 0x10 size bit is clear (no 16-byte tail) */
cmpandesb,1 %r3, 0x10, %pred2
puttagd,2 %dr1, %dr2, %dr1
}
{
return %ctpr3
/* %pred0 == 'true' if 'size' < 0x40 (last main-loop iteration) */
cmpbsb,0 %r3, 0x40, %pred0
/* %pred3 == 'true' if the 0x8 size bit is clear (no 8-byte tail) */
cmpandesb,1 %r3, 0x8, %pred3
}
{
subs,1 %r3, 0x60, %r3
addd,5 %dr4, 0x8, %dr5
}
/* %dr4..%dr7 = strd opcode at destination offsets 0x0/0x8/0x10/0x18 */
{
addd,1 %dr4, 0x10, %dr6
addd,4 %dr4, 0x18, %dr7
ct %ctpr2 ? %pred4
}
/* Main loop: four tagged doubleword stores (32 bytes) per iteration */
store:
{
cmplsb %r3, %r4, %pred1
strd,2 [ %dr0 + %dr4 ], %dr1
strd,5 [ %dr0 + %dr5 ], %dr1
addd,1 %dr4, 0x20, %dr4
addd,4 %dr5, 0x20, %dr5
}
{
strd,2 [ %dr0 + %dr6 ], %dr1
strd,5 [ %dr0 + %dr7 ], %dr1
addd,1 %dr6, 0x20, %dr6
addd,4 %dr7, 0x20, %dr7
abp abpf = 1, abpt = 1
ct %ctpr1 ? ~ %pred0
}
/* Tail: optional 16-byte then 8-byte remainders */
store_tail:
{
strd,2 [ %dr0 + %dr4 ], %dr1 ? ~ %pred2
strd,5 [ %dr0 + %dr5 ], %dr1 ? ~ %pred2
addd,1 %dr4, 0x10, %dr4 ? ~ %pred2
}
{
strd,2 [ %dr0 + %dr4 ], %dr1 ? ~ %pred3
ct %ctpr3
}
.size $recovery_memset_8, . - $recovery_memset_8