472 lines
11 KiB
ArmAsm
472 lines
11 KiB
ArmAsm
/* Overlay manager for SPU.
|
|
|
|
Copyright (C) 2006-2017 Free Software Foundation, Inc.
|
|
|
|
This file is part of the GNU Binutils.
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
|
|
MA 02110-1301, USA. */
|
|
|
|
/* MFC DMA definitions: constants written to the MFC channels by the
   transfer loop and the tag-wait code in do_load. */
|

/* Command value written to $MFC_Cmd for every transfer. */
#define MFC_GET_CMD 0x40
|

/* Upper bound on the size requested by one DMA command; the transfer
   loop splits larger overlays into chunks of at most this size. */
#define MFC_MAX_DMA_SIZE 0x4000
|

/* Value written to $MFC_WrTagUpdate before $MFC_RdTagStat is read. */
#define MFC_TAG_UPDATE_ALL 2
|

/* Tag written to $MFC_TagId; 1 << MFC_TAG_ID is used as the tag mask. */
#define MFC_TAG_ID 0
|
|
|
|
/* Register usage.

   Each hardware register is given several alias names, one per role it
   plays at some point in the code.  Aliases that expand to the same
   register share storage, so writing one destroys the value held under
   any other alias of that register.

   reserved1..reserved5 ($75..$79) are used without being saved anywhere
   in this file (presumably they are set aside for the overlay manager
   by the toolchain -- TODO confirm).  save1..save3 ($74..$72) are
   stored below $sp before use and reloaded before control leaves;
   save4 ($71) is only saved and used when OVLY_IRQ_SAVE is defined. */
|

/* $75: also the incoming parameter register of __ovly_load (parm). */
#define reserved1 $75
|

#define parm $75
|

#define tab1 reserved1
|

#define tab2 reserved1
|

#define vma reserved1
|

#define oldvma reserved1
|

#define newmask reserved1
|

#define map reserved1
|

|

/* $76: offsets, flags and other short-lived temporaries. */
#define reserved2 $76
|

#define off1 reserved2
|

#define off2 reserved2
|

#define present1 reserved2
|

#define present2 reserved2
|

#define sz reserved2
|

#define cmp reserved2
|

#define add64 reserved2
|

#define cgbits reserved2
|

#define off3 reserved2
|

#define off4 reserved2
|

#define addr4 reserved2
|

#define off5 reserved2
|

#define tagstat reserved2
|

|

/* $77 */
#define reserved3 $77
|

#define size1 reserved3
|

#define size2 reserved3
|

#define rv3 reserved3
|

#define ealo reserved3
|

#define cmd reserved3
|

#define off64 reserved3
|

#define tab3 reserved3
|

#define tab4 reserved3
|

#define tab5 reserved3
|

|

/* $78: holds the overlay index (ovl / newovl) among other roles. */
#define reserved4 $78
|

#define ovl reserved4
|

#define rv2 reserved4
|

#define rv5 reserved4
|

#define cgshuf reserved4
|

#define newovl reserved4
|

#define irqtmp1 reserved4
|

#define irqtmp2 reserved4
|

|

/* $79: only ever used as the final branch target. */
#define reserved5 $79
|

#define target reserved5
|

|

/* $74: spilled to -16($sp). */
#define save1 $74
|

#define rv4 save1
|

#define rv7 save1
|

#define tagid save1
|

#define maxsize save1
|

#define pbyte save1
|

#define pbit save1
|

|

/* $73: spilled to -32($sp). */
#define save2 $73
|

#define cur save2
|

#define rv6 save2
|

#define osize save2
|

#define zovl save2
|

#define oldovl save2
|

#define newvma save2
|

|

/* $72: spilled to -48($sp). */
#define save3 $72
|

#define rv1 save3
|

#define ea64 save3
|

#define buf3 save3
|

#define genwi save3
|

#define newmap save3
|

#define oldmask save3
|

|

/* $71: spilled to -64($sp), OVLY_IRQ_SAVE builds only. */
#define save4 $71
|

#define irq_stat save4
|
|
|
|
/* Constant shuffle patterns and the current-overlay variable.  All
   three are 16-byte objects placed in .text. */
.text
|

.align 4
|

.type __rv_pattern, @object
|

.size __rv_pattern, 16
|

/* shufb control used by __ovly_load to build a new $lr: word 0 comes
   from word 0 of the first source (rv1), word 1 from word 0 of the
   second source (cur), and the 0x80 bytes produce zero in words 2-3. */
__rv_pattern:
|

.word 0x00010203, 0x10111213, 0x80808080, 0x80808080
|

|

.type __cg_pattern, @object
|

.size __cg_pattern, 16
|

/* shufb control used by the transfer loop: moves word 1 of the carry
   vector into word 0 and zeroes the other words, so the carry out of
   the low address word is fed into the high word by addx. */
__cg_pattern:
|

.word 0x04050607, 0x80808080, 0x80808080, 0x80808080
|

|

.type __ovly_current, @object
|

.size __ovly_current, 16
|

/* Quadword written with the requested overlay index by __ovly_return
   and __ovly_load, and read back by __ovly_load and do_load. */
__ovly_current:
|

.space 16
|
|
|
|
/*
|
|
* __ovly_return - stub for returning from overlay functions.
|
|
*
|
|
* On entry the four slots of $lr are:
|
|
* __ovly_return, prev ovl index, caller return addr, undefined.
|
|
*
|
|
* Load the previous overlay and jump to the caller return address.
|
|
* Updates __ovly_current.
|
|
*/
|
|
.align 4
|

.global __ovly_return
|

.type __ovly_return, @function
|

__ovly_return:
|

/* tab1 = table base biased by one 16-byte entry, so that index 1
   selects the first entry of _ovly_table. */
ila tab1, _ovly_table - 16 # 0,2 0
|

/* Shift slot 1 of $lr (previous overlay index) into slot 0. */
shlqbyi ovl, $lr, 4 # 1,4 0
|

#nop
|

/* Shift slot 2 of $lr (caller return address) into slot 0. */
shlqbyi target, $lr, 8 # 1,4 1
|

#nop; lnop
|

#nop; lnop
|

/* off1 = ovl * 16, the byte offset of the table entry. */
shli off1, ovl, 4 # 0,4 4
|

#lnop
|

#nop
|

/* Hint the "bi target" located at ovly_ret9. */
hbr ovly_ret9, target # 1,15 5
|

#nop; lnop
|

#nop; lnop
|

#nop
|

/* vma = the 16-byte _ovly_table entry for the overlay to return to. */
lqx vma, tab1, off1 # 1,6 8
|

#ifdef OVLY_IRQ_SAVE
|

nop
|

/* do_load uses save4 as irq_stat in this configuration; spill it. */
stqd save4, -64($sp) # 1,6 9
|

#else
|

#nop; lnop
|

#endif
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop
|

/* Rotate the second word of the entry (.size) into slot 0. */
rotqbyi size1, vma, 4 # 1,4 14
|

#nop
|

/* Spill save3..save1 below $sp; do_load overwrites them and reloads
   them from these slots before branching to target. */
stqd save3, -48($sp) # 1,6 15
|

#nop
|

stqd save2, -32($sp) # 1,6 16
|

#nop
|

stqd save1, -16($sp) # 1,6 17
|

/* Low bit of .size is the "overlay present" flag. */
andi present1, size1, 1 # 0,2 18
|

/* Record the overlay being returned to as the current one. */
stqr ovl, __ovly_current # 1,6 18
|

#nop; lnop
|

#nop
|

/* Flag clear: the overlay must be loaded before returning into it. */
brz present1, do_load # 1,4 20
|

ovly_ret9:
|

#nop
|

/* Overlay already present: jump straight to the caller. */
bi target # 1,4 21
|
|
|
|
/*
|
|
* __ovly_load - copy an overlay partition to local store.
|
|
*
|
|
* On entry $75 points to a word consisting of the overlay index in
|
|
* the top 14 bits, and the target address in the bottom 18 bits.
|
|
*
|
|
* Sets up $lr to return via __ovly_return. If $lr is already set
|
|
* to return via __ovly_return, don't change it. In that case we
|
|
* have a tail call from one overlay function to another.
|
|
* Updates __ovly_current.
|
|
*/
|
|
.align 3
|

.global __ovly_load
|

.type __ovly_load, @function
|

__ovly_load:
|

/* Two entry sequences, chosen by the size of the calling stub.  Both
   leave: tab2 = biased table base, rv1 = address of __ovly_return,
   cur = old __ovly_current, rv2 = ($lr == rv1) per word, rv3 =
   __rv_pattern, vma = table entry for ovl, save1..save3 spilled, and
   __ovly_current updated to ovl. */
#if OVL_STUB_SIZE == 8
|

########
|

#nop
|

/* Load the quadword that contains the word parm points at. */
lqd target, 0(parm) # 1,6 -11
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop
|

/* Rotate by the address so the addressed word lands in slot 0. */
rotqby target, target, parm # 1,4 -5
|

ila tab2, _ovly_table - 16 # 0,2 -4
|

stqd save3, -48($sp) # 1,6 -4
|

#nop
|

stqd save2, -32($sp) # 1,6 -3
|

#nop
|

stqd save1, -16($sp) # 1,6 -2
|

/* ovl = word >> 18, i.e. the overlay index held in the top 14 bits. */
rotmi ovl, target, -18 # 0,4 -1
|

/* Hint the "bi target" located at ovly_load9. */
hbr ovly_load9, target # 1,15 -1
|

ila rv1, __ovly_return # 0,2 0
|

#lnop
|

#nop; lnop
|

#nop
|

/* Fetch the old current-overlay value before it is overwritten. */
lqr cur, __ovly_current # 1,6 2
|

/* off2 = ovl * 16, the byte offset of the table entry. */
shli off2, ovl, 4 # 0,4 3
|

stqr ovl, __ovly_current # 1,6 3
|

/* Word 0 of rv2 is all ones when $lr already points at __ovly_return. */
ceq rv2, $lr, rv1 # 0,2 4
|

lqr rv3, __rv_pattern # 1,6 4
|

#nop; lnop
|

#nop; lnop
|

#nop
|

lqx vma, tab2, off2 # 1,6 7
|

########
|

#else /* OVL_STUB_SIZE == 16 */
|

########
|

/* This variant reads ovl and target without setting them; presumably
   the 16-byte stub loads both before branching here -- TODO confirm. */
ila tab2, _ovly_table - 16 # 0,2 0
|

stqd save3, -48($sp) # 1,6 0
|

ila rv1, __ovly_return # 0,2 1
|

stqd save2, -32($sp) # 1,6 1
|

shli off2, ovl, 4 # 0,4 2
|

lqr cur, __ovly_current # 1,6 2
|

nop
|

stqr ovl, __ovly_current # 1,6 3
|

ceq rv2, $lr, rv1 # 0,2 4
|

lqr rv3, __rv_pattern # 1,6 4
|

#nop
|

hbr ovly_load9, target # 1,15 5
|

#nop
|

lqx vma, tab2, off2 # 1,6 6
|

#nop
|

stqd save1, -16($sp) # 1,6 7
|

########
|

#endif
|

|

#nop; lnop
|

#nop; lnop
|

#nop
|

/* rv4 = { __ovly_return, old current overlay, 0, 0 }. */
shufb rv4, rv1, cur, rv3 # 1,4 10
|

#nop
|

/* Expand the compare result into a select mask for selb below. */
fsmb rv5, rv2 # 1,4 11
|

#nop
|

/* Shift $lr right by 8 bytes: the return address moves to slot 2. */
rotqmbyi rv6, $lr, -8 # 1,4 12
|

#nop
|

/* Rotate the second word of the entry (.size) into slot 0. */
rotqbyi size2, vma, 4 # 1,4 13
|

#nop
|

/* rv1 is dead now; start restoring the spilled registers. */
lqd save3, -48($sp) # 1,6 14
|

#nop; lnop
|

/* rv7 = { __ovly_return, old overlay, caller return addr, ... },
   the $lr layout that __ovly_return expects. */
or rv7, rv4, rv6 # 0,2 16
|

lqd save2, -32($sp) # 1,6 16
|

/* Low bit of .size is the "overlay present" flag. */
andi present2, size2, 1 # 0,2 17
|

#ifdef OVLY_IRQ_SAVE
|

/* do_load uses save4 as irq_stat in this configuration; spill it. */
stqd save4, -64($sp) # 1,6 17
|

#else
|

lnop # 1,0 17
|

#endif
|

/* Keep $lr where the mask is set (tail call from an overlay function),
   otherwise install rv7. */
selb $lr, rv7, $lr, rv5 # 0,2 18
|

lqd save1, -16($sp) # 1,6 18
|

#nop
|

/* Flag clear: the overlay must be loaded first. */
brz present2, do_load # 1,4 19
|

ovly_load9:
|

#nop
|

/* Overlay already present: jump to the requested function. */
bi target # 1,4 20
|
|
|
|
/* If we get here, we are about to load a new overlay.
|
|
* "vma" contains the relevant entry from _ovly_table[].
|
|
* extern struct {
|
|
* u32 vma;
|
|
* u32 size;
|
|
* u32 file_offset;
|
|
* u32 buf;
|
|
* } _ovly_table[];
|
|
*/
|
|
/* do_load: entered from __ovly_return / __ovly_load with vma holding
   the _ovly_table entry to load, target holding the final branch
   address, __ovly_current already updated, and save1..save3 (plus
   save4 under OVLY_IRQ_SAVE) spilled below $sp. */
.align 3
|

.global __ovly_load_event
|

.type __ovly_load_event, @function
|

__ovly_load_event:
|

do_load:
|

#ifdef OVLY_IRQ_SAVE
|

ila irqtmp1, do_load10 # 0,2 -5
|

/* Rotate the third word of the entry (file_offset) into slot 0. */
rotqbyi sz, vma, 8 # 1,4 -5
|

#nop
|

/* Capture machine status; bit 0 is tested at the end to decide whether
   to branch with interrupts re-enabled. */
rdch irq_stat, $SPU_RdMachStat # 1,6 -4
|

#nop
|

/* Branch to the next instruction with interrupts disabled. */
bid irqtmp1 # 1,4 -3
|

do_load10:
|

nop
|

#else
|

#nop
|

/* Rotate the third word of the entry (file_offset) into slot 0. */
rotqbyi sz, vma, 8 # 1,4 0
|

#endif
|

/* osize = bytes still to transfer (the .size word in slot 0). */
rotqbyi osize, vma, 4 # 1,4 1
|

#nop
|

/* ea64 = quadword at _EAR_ (defined outside this file); words 0 and 1
   are used as the high and low halves of the effective address. */
lqa ea64, _EAR_ # 1,6 2
|

#nop
|

lqr cgshuf, __cg_pattern # 1,6 3
|

|

/* We could predict the branch at the end of this loop by adding a few
|

instructions, and there are plenty of free cycles to do so without
|

impacting loop execution time. However, it doesn't make a great
|

deal of sense since we need to wait for the dma to complete anyway. */
|

/* Each pass advances ea64 by sz (file_offset on the first pass, the
   previous chunk size afterwards) and queues one DMA of at most
   MFC_MAX_DMA_SIZE bytes to local-store address vma. */
__ovly_xfer_loop:
|

#nop
|

/* off64 = { 0, sz, ... }: sz moved to the low address word. */
rotqmbyi off64, sz, -4 # 1,4 4
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

/* 64-bit add ea64 += off64: generate carries, move the low-word carry
   to the high word, then add with carry. */
cg cgbits, ea64, off64 # 0,2 8
|

#lnop
|

#nop; lnop
|

#nop
|

shufb add64, cgbits, cgbits, cgshuf # 1,4 10
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

addx add64, ea64, off64 # 0,2 14
|

#lnop
|

ila maxsize, MFC_MAX_DMA_SIZE # 0,2 15
|

lnop
|

/* Copy the sum back into ea64 (add64 shares a register with sz/cmp). */
ori ea64, add64, 0 # 0,2 16
|

/* ealo = low address word rotated into slot 0. */
rotqbyi ealo, add64, 4 # 1,4 16
|

cgt cmp, osize, maxsize # 0,2 17
|

wrch $MFC_LSA, vma # 1,6 17
|

#nop; lnop
|

/* sz = maxsize when osize > maxsize, otherwise osize. */
selb sz, osize, maxsize, cmp # 0,2 19
|

wrch $MFC_EAH, ea64 # 1,6 19
|

ila tagid, MFC_TAG_ID # 0,2 20
|

wrch $MFC_EAL, ealo # 1,6 20
|

ila cmd, MFC_GET_CMD # 0,2 21
|

wrch $MFC_Size, sz # 1,6 21
|

/* osize -= sz */
sf osize, sz, osize # 0,2 22
|

wrch $MFC_TagId, tagid # 1,6 22
|

/* Advance the local-store destination by the chunk just queued. */
a vma, vma, sz # 0,2 23
|

/* Writing the command channel is the last step of queuing the DMA. */
wrch $MFC_Cmd, cmd # 1,6 23
|

#nop
|

brnz osize, __ovly_xfer_loop # 1,4 24
|

|

/* Now update our data structures while waiting for DMA to complete.
|

Low bit of .size needs to be cleared on the _ovly_table entry
|

corresponding to the evicted overlay, and set on the entry for the
|

newly loaded overlay. Note that no overlay may in fact be evicted
|

as _ovly_buf_table[] starts with all zeros. Don't zap .size entry
|

for zero index! Also of course update the _ovly_buf_table entry. */
|

#nop
|

lqr newovl, __ovly_current # 1,6 25
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

/* off3 = newovl * 16, offset of the new overlay's table entry. */
shli off3, newovl, 4 # 0,4 31
|

#lnop
|

ila tab3, _ovly_table - 16 # 0,2 32
|

#lnop
|

#nop
|

/* pbyte = 0xff in the low byte of word 1 (the .size word), 0 elsewhere. */
fsmbi pbyte, 0x100 # 1,4 33
|

#nop; lnop
|

#nop
|

/* Reload the entry; vma was advanced by the transfer loop. */
lqx vma, tab3, off3 # 1,6 35
|

#nop; lnop
|

/* pbit = 1 in word 1 only: the "present" bit of .size. */
andi pbit, pbyte, 1 # 0,2 37
|

lnop
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

/* Mark the new overlay present. */
or newvma, vma, pbit # 0,2 41
|

/* buf3 = fourth word of the entry (.buf) rotated into slot 0. */
rotqbyi buf3, vma, 12 # 1,4 41
|

#nop; lnop
|

#nop
|

stqx newvma, tab3, off3 # 1,6 43
|

#nop; lnop
|

/* off4 = buf * 4; with the -4 bias below, buf 1 selects the first
   4-byte entry of _ovly_buf_table. */
shli off4, buf3, 2 # 1,4 45
|

#lnop
|

ila tab4, _ovly_buf_table - 4 # 0,2 46
|

#lnop
|

#nop; lnop
|

#nop; lnop
|

#nop
|

/* map = quadword containing this buffer's _ovly_buf_table word. */
lqx map, tab4, off4 # 1,6 49
|

#nop
|

/* genwi = shuffle control that inserts a word at address tab4+off4. */
cwx genwi, tab4, off4 # 1,4 50
|

a addr4, tab4, off4 # 0,2 51
|

#lnop
|

#nop; lnop
|

#nop; lnop
|

#nop; lnop
|

#nop
|

/* oldovl = overlay index previously recorded for this buffer. */
rotqby oldovl, map, addr4 # 1,4 55
|

#nop
|

/* newmap = map with this buffer's word replaced by newovl. */
shufb newmap, newovl, map, genwi # 0,4 56
|

/* newmask = tag mask with only the MFC_TAG_ID bit set. */
#if MFC_TAG_ID < 16
|

ila newmask, 1 << MFC_TAG_ID # 0,2 57
|

#else
|

ilhu newmask, 1 << (MFC_TAG_ID - 16) # 0,2 57
|

#endif
|

#lnop
|

#nop; lnop
|

#nop; lnop
|

stqd newmap, 0(addr4) # 1,6 60
|

|

/* Save app's tagmask, wait for DMA complete, restore mask. */
|

ila tagstat, MFC_TAG_UPDATE_ALL # 0,2 61
|

rdch oldmask, $MFC_RdTagMask # 1,6 61
|

#nop
|

wrch $MFC_WrTagMask, newmask # 1,6 62
|

#nop
|

wrch $MFC_WrTagUpdate, tagstat # 1,6 63
|

#nop
|

/* The value read is not used afterwards; the read is the wait. */
rdch tagstat, $MFC_RdTagStat # 1,6 64
|

#nop
|

sync # 1,4 65
|

/* Any hint prior to the sync is lost. A hint here allows the branch
|

to complete 15 cycles after the hint. With no hint the branch will
|

take 18 or 19 cycles. */
|

ila tab5, _ovly_table - 16 # 0,2 66
|

hbr do_load99, target # 1,15 66
|

/* off5 = oldovl * 16, offset of the evicted overlay's table entry. */
shli off5, oldovl, 4 # 0,4 67
|

wrch $MFC_WrTagMask, oldmask # 1,6 67
|

/* zovl word 0 = all ones when no overlay was evicted (oldovl == 0). */
ceqi zovl, oldovl, 0 # 0,2 68
|

#lnop
|

#nop; lnop
|

#nop
|

/* Expand the compare result into a word select mask so it also covers
   word 1, where pbit lives. */
fsm zovl, zovl # 1,4 70
|

#nop
|

lqx oldvma, tab5, off5 # 1,6 71
|

#nop
|

/* Restore the spilled registers as their last aliases die. */
lqd save3, -48($sp) # 1,6 72
|

#nop; lnop
|

/* Drop pbit when oldovl == 0 so the store below changes nothing. */
andc pbit, pbit, zovl # 0,2 74
|

lqd save2, -32($sp) # 1,6 74
|

#ifdef OVLY_IRQ_SAVE
|

ila irqtmp2, do_load90 # 0,2 75
|

#lnop
|

/* Isolate bit 0 of the machine status captured on entry. */
andi irq_stat, irq_stat, 1 # 0,2 76
|

#lnop
|

#else
|

#nop; lnop
|

#nop; lnop
|

#endif
|

/* Clear the present bit in the evicted overlay's .size word. */
andc oldvma, oldvma, pbit # 0,2 77
|

lqd save1, -16($sp) # 1,6 77
|

nop # 0,0 78
|

#lnop
|

#nop
|

stqx oldvma, tab5, off5 # 1,6 79
|

#nop
|

#ifdef OVLY_IRQ_SAVE
|

/* If bit 0 was set on entry, branch to do_load90 re-enabling
   interrupts; otherwise fall through with them still disabled. */
binze irq_stat, irqtmp2 # 1,4 80
|

do_load90:
|

#nop
|

lqd save4, -64($sp) # 1,6 84
|

#else
|

#nop; lnop
|

#endif
|

|

/* Global label on a nop just before the final branch (presumably a
   hook where a debugger can place a breakpoint -- TODO confirm). */
.global _ovly_debug_event
|

.type _ovly_debug_event, @function
|

_ovly_debug_event:
|

nop
|

/* Branch to target address. */
|

do_load99:
|

bi target # 1,4 81/85
|

|

.size __ovly_load, . - __ovly_load
|