.type  emuCpuRun, %function
.globl emuCpuRun
.globl emuCpuRunCodeStart
.globl emuCpuRunCodeEnd


.syntax unified
.thumb


.globl emuCoreInit
.type  emuCoreInit, %function					//mark as a function symbol, same as is done for emuCpuRun
// emuCoreInit: init hook for this core. Nothing needs setting up here, so it just returns to the caller.
.thumb_func										//make sure the symbol is flagged as thumb code, so taking its address yields a usable (odd) pointer
emuCoreInit:
	bx lr

emuCpuRunCodeStart:



//this is a port of the older m3 core. m3 core has since been much improved, and this file has some low hanging fruit too...



////A NOTE ABOUT DISPATCH
// as per ARMv6M docs, ALU reads of PC read the instruction's address + 4, and ALU writes to PC ignore the bottom bit. we use these to implement dispatch
// we get the bits we need to dispatch on, shifted left by 1, and we do not care what is in the low bit (that not caring saves a cycle each dispatch)
// we can then do "ADD PC, Rx" followed by a "NOP", and we'll jump into a table of 2-byte instrs placed right after the NOP, landing on the entry whose index is Rx's value >> 1


//BL is often used for long branches since B is limited

//reg usage:
// r0  = temp
// r1  = temp
// r2  = temp (often instr)
// r3  = temp
// r4  = state
// r5  = temp
// r6  = temp
// r7  = temp
// r8  = next_instr label
// r10 = sr
// r11 = instrPtr
// r12 = dp_dispatch




// emuCpuRun: entry point of the interpreter.
//  in: r0 = pointer to the emulated cpu state (kept in r4 from here on)
//  state layout as used in this file: emulated reg N at offset N * 4, so 0x38 = LR and 0x3c = PC; 0x40 = SR
//  sets up the long-lived registers from the "reg usage" list above, then enters the fetch loop
//  NOTE(review): only lr is saved here while r4..r12 get overwritten - confirm that the caller or the exit path (emu_out, not visible here) accounts for that
emuCpuRun:
	push   {lr}									// so we can use "BL" for branching
	mov    r4, r0								//r4 = state
	ldr    r0, [r4, #0x40]						//r10 = sr (high regs cannot be loaded directly, hence the trip through r0)
	mov    r10, r0
	ldr    r1, [r4, #0x3c]						//r1 = address of first instr to run
	ldr    r0, =dp_dispatch						//could have been clobbered
	mov    r12, r0
	ldr    r0, =next_instr						//r8 = next_instr, so that handlers can finish with "mov pc, r8"
	mov    r8, r0
	b      next_instr_pc_is_in_r1

pc_changed_maybe_thumb:							//the PC slot was just written with a value whose low (thumb) bit has not been checked yet
	ldr    r1, [r4, #0x3c]						//get pc
	lsrs   r0, r1, #1							//test bottom bit (it lands in our C)
	bcc    next_instr_pc_is_in_r1				//bit clear: ARM code, go fetch from r1
	bl     emu_out								//bit set: thumb target, leave via emu_out (BL used for its range; presumably does not come back here)

pc_changed_not_thumb:							//the PC slot was just written with an address that is already known to be ARM code
	ldr    r1, [r4, #0x3c]						//get pc
	mov    r11, r1

next_instr:										//handlers come back here via "mov pc, r8"; r11 = address of the instr to run
normal_fetch:									//expected results: r2 = [instrAdr], [state,0x3c] = instrAddr + 8
	mov    r1, r11

next_instr_pc_is_in_r1:							//entry for callers that already have the address of the instr to run in r1
	ldmia  r1!, {r2}							//r2 = instr, r1 = address of the instr after it
	mov    r11, r1
	adds   r0, r1, #4							//emulated PC reads as instr address + 8
	str    r0, [r4, #0x3c]

	lsrs   r0, r2, #28							//isolate cc & dispatch based on cc
	cmp    r0, #0x0e							//most common
	beq    cc_al
	lsls   r0, #3								//4 instrs per cc
	add    pc, r0								//PC reads as the address just past the NOP, so we land on cc_eq + cc * 8
	nop

// condition code handlers. reached by the "add pc, r0" above with r0 = cc * 8, so the handlers
// must stay in encoding order (EQ = 0 ... LE = 13, unused AL slot = 14, NV = 15) and each one
// must assemble to exactly 8 bytes. r10 holds SR with N,Z,C,V in bits 31,30,29,28.
// on a failed condition we skip the instr: r1 still holds the address of the next instr, so we just refetch from there
cc_eq:
	mov    r0, r10								//get SR
	lsrs   r0, #31								//shift SR.Z into our C
	bcs    cc_al
	b      next_instr_pc_is_in_r1

cc_ne:
	mov    r0, r10								//get SR
	lsrs   r0, #31								//shift SR.Z into our C
	bcc    cc_al
	b      next_instr_pc_is_in_r1

cc_cs:
	mov    r0, r10								//get SR
	lsrs   r0, #30								//shift SR.C into our C
	bcs    cc_al
	b      next_instr_pc_is_in_r1

cc_cc:
	mov    r0, r10								//get SR
	lsrs   r0, #30								//shift SR.C into our C
	bcc    cc_al
	b      next_instr_pc_is_in_r1

cc_mi:
	mov    r0, r10								//get SR
	lsrs   r0, #32								//shift SR.N into our C
	bcs    cc_al
	b      next_instr_pc_is_in_r1

cc_pl:
	mov    r0, r10								//get SR
	lsrs   r0, #32								//shift SR.N into our C
	bcc    cc_al
	b      next_instr_pc_is_in_r1

cc_vs:
	mov    r0, r10								//get SR
	lsrs   r0, #29								//shift SR.V into our C
	bcs    cc_al
	b      next_instr_pc_is_in_r1

cc_vc:
	mov    r0, r10								//get SR
	lsrs   r0, #29								//shift SR.V into our C
	bcc    cc_al
	b      next_instr_pc_is_in_r1

//the multi-flag conditions load SR into the real APSR and let a native conditional branch evaluate them (MSR is 4 bytes, so these have 3 instrs each)
cc_hi:
	msr    APSR_nzcvq, r10						//get SR into APSR
	bhi    cc_al
	b      next_instr_pc_is_in_r1

cc_ls:
	msr    APSR_nzcvq, r10						//get SR into APSR
	bls    cc_al
	b      next_instr_pc_is_in_r1

cc_ge:
	msr    APSR_nzcvq, r10						//get SR into APSR
	bge    cc_al
	b      next_instr_pc_is_in_r1

cc_lt:
	msr    APSR_nzcvq, r10						//get SR into APSR
	blt    cc_al
	b      next_instr_pc_is_in_r1

cc_gt:
	msr    APSR_nzcvq, r10						//get SR into APSR
	bgt    cc_al
	b      next_instr_pc_is_in_r1

cc_le:
	msr    APSR_nzcvq, r10						//get SR into APSR
	ble    cc_al
	b      next_instr_pc_is_in_r1

cc_al_not_used:									//unreached (AL is caught by the cmp before the dispatch); only here to keep cc_nv at slot 15
	nop
	nop
	nop
	nop

cc_nv:											//NV - special instrs
	lsrs   r0, r2, #25							//instr[31..25] == 1111101 means BLX imm
	cmp    r0, #0x7D
	bne    not_blx_imm

instr_is_blx_imm:								//BLX imm: always switches to thumb, so we set LR, store the target in the PC slot and leave
	mov    r3, r11								//addr of next instr
	str    r3, [r4, #0x38]						//save LR
	adds   r1, r3, #4							//base for further addition
	lsls   r3, r2, #8
	asrs   r3, #6
	adds   r1, r3								//pc += sext(imm24) << 2 
	lsls   r2, #7								//isolate instr[24] (the H bit) and move it to bit 1
	lsrs   r2, #31
	lsls   r2, #1
	adds   r1, r2								//pc[1] += instr[24]
	adds   r1, #1								//set low bit (mandatory)
	str    r1, [r4, #0x3c]
	bl     emu_out

not_blx_imm:
	ldr    r1, =0xFD70F000						//maybe a PLD ?
	ands   r2, r1								//keep only the bits that are fixed in a PLD encoding
	ldr    r1, =0xF550F000
	cmp    r2, r1
	beq    next_instr							//PLD is only a hint: treat it as a NOP
	
	udf    #0x00								//anything else in the NV space is not supported

cc_al:											//condition passed: decode the instr in r2
	lsls   r0, r2, #4						//dispatch on instr[24..27] (r0 = instr[23..27]; the low bit, instr[23], is ignored by the PC write)
	lsrs   r0, #27
	add    pc, r0
	nop
	b dp_reg		//instr[27..24] = 0000
	b dp_reg		//0001
	b dp_imm		//0010
	b dp_imm		//0011
	b mem_imm		//0100
	b mem_imm		//0101
	b mem_reg		//0110
	b mem_reg		//0111
	b mem_mul		//LDM/STM
	b mem_mul		//LDM/STM
	b inst_b		//1010
	b inst_bl		//1011
	b cc_udf		//1100
	b cc_udf		//1101
	b cc_udf		//1110
	bl inst_swi		//ok to use BL since it is the last one

.ltorg

cc_udf:											//instr[27..24] = 1100..1110: not handled by this core, trap
	udf    #0x00

inst_bl:										//BL: record the return address, then branch exactly like B
	mov    r0, r11								//r11 = address of the instr after this one
	str    r0, [r4, #0x38]						//LR <- return address
	//fallthrough

inst_b:											//B: target = instr address + 8 + (sext(imm24) << 2)
	lsls   r1, r2, #8							//drop the top 8 bits of instr ...
	asrs   r1, #6								//... and come back down sign-extended, leaving sext(imm24) << 2
	adds   r1, #4								//r11 is instr address + 4, so 4 more makes the + 8
	add    r1, r11								//r1 = destination
	b      next_instr_pc_is_in_r1

mem_imm:										//LDR/STR/LDRB/STRB with immediate offset: addr_mode value = imm12
	lsls   r1, r2, #20							//r1 = instr[11..0]
	lsrs   r1, #20
	b      adr_mode_2_common

mem_reg:										//LDR/STR/LDRB/STRB with (shifted) register offset: r1 = Rm's value, r3 = shift amt, then dispatch on shift type
	lsls   r3, r2, #20							//get shift amt   instr[7..11]
	lsrs   r3, #27
	lsls   r0, r2, #25							//get shift type and zero bit   instr[4..6], LSL 1 for dispatch
	lsrs   r0, #28
	lsls   r1, r2, #28							//get Rm * 4
	lsrs   r1, #26
	ldr    r1, [r4, r1]							//get Rm's value
	add    pc, r0								//dispatch on instr[4..6]
	nop
	b a2_lsl									//instr[6..4] = 000
	b a2_und									//instr[4] set is not a valid addressing mode here
	b a2_lsr									//010
	b a2_und
	b a2_asr									//100
	b a2_und
	b a2_ror									//110 (could be RRX)
	b a2_und

a2_und:											//register-offset load/store with instr[4] set: not supported, trap
	udf    #0x00

a2_lsr_32:										//an encoded shift amt of 0 means LSR #32
	lsrs   r1, #32
	b      adr_mode_2_common

a2_lsr:											//offset = Rm LSR #amt
	cmp    r3, #0
	beq    a2_lsr_32
	lsrs   r1, r3
	b      adr_mode_2_common

a2_asr_32:										//an encoded shift amt of 0 means ASR #32
	asrs   r1, #32
	b      adr_mode_2_common

a2_asr:											//offset = Rm ASR #amt
	cmp    r3, #0
	beq    a2_asr_32
	asrs   r1, r3
	b      adr_mode_2_common

a2_ror:											//offset = Rm ROR #amt; an encoded shift amt of 0 means RRX
	cmp    r3, #0
	beq    a2_rrx
	rors   r1, r3
	b      adr_mode_2_common

a2_rrx:											//RRX: since carry out is dropped, this is basically an LSR and then top bit coming from C flag (aka SR & 0x20000000UL)
	mov    r0, r10								//get SR
	lsrs   r0, #29								//SR.C is now bit 0 of r0 ...
	lsls   r0, #31								//... and now it is the top bit, with everything else zero
	lsrs   r1, #1								//LSR by 1
	orrs   r1, r0								//insert top bit
	b      adr_mode_2_common

a2_lsl:											//most common case so it gets the fallthrough (& thus speed improvement)
	lsls   r1, r3								//offset = Rm LSL #amt (an amt of 0 leaves Rm as is)
	//fallthrough

// common tail for single-register loads/stores.
//  in:  r1 = addr_mode value (offset), r2 = instr
//  out to the handlers: r3 = Rn (index), r5 = Rd (index), r6 = Rn's value, r1 unchanged
//  handlers are named after the instr's P,U,B,W,L bits (instr[24..20])
adr_mode_2_common:								//adr mode value is in r1
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r5, r2, #16							//get Rd -> r5
	lsrs   r5, #28
	lsls   r6, r3, #2							//get Rn's value -> r6
	ldr    r6, [r4, r6]
	lsls   r0, r2, #7							//get 'PUBWL' bits LSL 1 -> r0
	lsrs   r0, #26
	add    pc, r0								//dispatch on r0
	nop

	b a2_00000							//STR   Rd, [Rn], #-addr_mode
	b a2_00001							//LDR   Rd, [Rn], #-addr_mode
	b a2_und_2							//STRT  Rd, [Rn], #-addr_mode - not supported
	b a2_und_2							//LDRT  Rd, [Rn], #-addr_mode - not supported
	b a2_00100							//STRB  Rd, [Rn], #-addr_mode
	b a2_00101							//LDRB  Rd, [Rn], #-addr_mode
	b a2_und_2							//STRBT Rd, [Rn], #-addr_mode - not supported
	b a2_und_2							//LDRBT Rd, [Rn], #-addr_mode - not supported
	b a2_01000							//STR   Rd, [Rn], #+addr_mode
	b a2_01001							//LDR   Rd, [Rn], #+addr_mode
	b a2_und_2							//STRT  Rd, [Rn], #+addr_mode - not supported
	b a2_und_2							//LDRT  Rd, [Rn], #+addr_mode - not supported
	b a2_01100							//STRB  Rd, [Rn], #+addr_mode
	b a2_01101							//LDRB  Rd, [Rn], #+addr_mode
	b a2_und_2							//STRBT Rd, [Rn], #+addr_mode - not supported
	b a2_und_2							//LDRBT Rd, [Rn], #+addr_mode - not supported
	b a2_10000							//STR   Rd, [Rn, #-addr_mode]
	b a2_10001							//LDR   Rd, [Rn, #-addr_mode]
	b a2_10010							//STR   Rd, [Rn, #-addr_mode]!
	b a2_10011							//LDR   Rd, [Rn, #-addr_mode]!
	b a2_10100							//STRB  Rd, [Rn, #-addr_mode]
	b a2_10101							//LDRB  Rd, [Rn, #-addr_mode]
	b a2_10110							//STRB  Rd, [Rn, #-addr_mode]!
	b a2_10111							//LDRB  Rd, [Rn, #-addr_mode]!
	b a2_11000							//STR   Rd, [Rn, #+addr_mode]
	b a2_11001							//LDR   Rd, [Rn, #+addr_mode]
	b a2_11010							//STR   Rd, [Rn, #+addr_mode]!
	b a2_11011							//LDR   Rd, [Rn, #+addr_mode]!
	b a2_11100							//STRB  Rd, [Rn, #+addr_mode]
	b a2_11101							//LDRB  Rd, [Rn, #+addr_mode]
	b a2_11110							//STRB  Rd, [Rn, #+addr_mode]!
	b a2_11111							//LDRB  Rd, [Rn, #+addr_mode]!

a2_00000:										// STR   Rd, [Rn], #-addr_mode
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	str    r0, [r6]								//store at the unmodified base
	subs   r6, r1								//post-index: base -= addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_00001:										// LDR   Rd, [Rn], #-addr_mode
	lsls   r0, r5, #2							//r0 = Rd's offset in state (r5 is kept for the PC check)
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r7, [r6]								//load from the unmodified base
	str    r7, [r4, r0]							//Rd <- loaded word
	subs   r6, r1								//post-index: base -= addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	cmp    r5, #15								//was PC the destination?
	beq    pc_loaded
	mov    pc, r8								//no: go to next instr
pc_loaded:										//common tail for word loads into PC: r7 = the value that was loaded
	lsrs   r7, #1								//low bit -> our C (set means thumb address)
	bcs    j_emu_out_0							//thumb target: go directly out
	b      pc_changed_not_thumb					//ARM target: continue emulation, no need to retest the low bit

a2_00100:										// STRB  Rd, [Rn], #-addr_mode
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	strb   r0, [r6]								//store its low byte at the unmodified base
	subs   r6, r1								//post-index: base -= addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_00101:										// LDRB  Rd, [Rn], #-addr_mode
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldrb   r0, [r6]								//load byte (zero-extended) from the unmodified base
	str    r0, [r4, r5]							//Rd <- loaded byte (loading PC using this instr is undefined so we do not check for it)
	subs   r6, r1								//post-index: base -= addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_01000:										// STR   Rd, [Rn], #+addr_mode
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	str    r0, [r6]								//store at the unmodified base
	adds   r6, r1								//post-index: base += addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_01001:										// LDR   Rd, [Rn], #+addr_mode
	lsls   r0, r5, #2							//r0 = Rd's offset in state (r5 is kept for the PC check)
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r7, [r6]								//load from the unmodified base
	str    r7, [r4, r0]							//Rd <- loaded word
	adds   r6, r1								//post-index: base += addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	cmp    r5, #15								//was PC the destination?
	beq    pc_loaded							//yes: r7 still has the loaded value for the thumb check
	mov    pc, r8								//no: go to next instr

a2_01100:										// STRB  Rd, [Rn], #+addr_mode
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	strb   r0, [r6]								//store its low byte at the unmodified base
	adds   r6, r1								//post-index: base += addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_01101:										// LDRB  Rd, [Rn], #+addr_mode
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldrb   r0, [r6]								//load byte (zero-extended) from the unmodified base
	str    r0, [r4, r5]							//Rd <- loaded byte (loading PC using this instr is undefined so we do not check for it)
	adds   r6, r1								//post-index: base += addr_mode
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

j_emu_out_0:									//trampoline: lets the short conditional branch in pc_loaded reach emu_out
	bl     emu_out

a2_10000:										// STR   Rd, [Rn, #-addr_mode]
	subs   r6, r1								//r6 = Rn - addr_mode (Rn itself is not updated)
	lsls   r5, #2								//r5 = Rd's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	str    r0, [r6]								//execute the store
	mov    pc, r8								//go to next instr

a2_10001:										// LDR   Rd, [Rn, #-addr_mode]
	lsls   r0, r5, #2							//r0 = Rd's offset in state (r5 is kept for the PC check)
	subs   r6, r1								//r6 = Rn - addr_mode (Rn itself is not updated)
	ldr    r7, [r6]								//execute the load
	str    r7, [r4, r0]							//Rd <- loaded word
	cmp    r5, #15								//was PC the destination?
	beq    pc_loaded							//yes: r7 still has the loaded value for the thumb check
	mov    pc, r8								//no: go to next instr

a2_10010:										// STR   Rd, [Rn, #-addr_mode]!
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value (read before Rn is written back)
	subs   r6, r1								//pre-index: base -= addr_mode
	str    r0, [r6]								//execute the store
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_10011:										// LDR   Rd, [Rn, #-addr_mode]!
	lsls   r0, r5, #2							//r0 = Rd's offset in state (r5 is kept for the PC check)
	lsls   r3, #2								//r3 = Rn's offset in state
	subs   r6, r1								//pre-index: base -= addr_mode
	ldr    r7, [r6]								//execute the load
	str    r7, [r4, r0]							//Rd <- loaded word
	str    r6, [r4, r3]							//Rn <- updated base
	cmp    r5, #15								//was PC the destination?
	beq    pc_loaded							//yes: r7 still has the loaded value for the thumb check
	mov    pc, r8								//no: go to next instr

a2_10100:										// STRB  Rd, [Rn, #-addr_mode]
	subs   r6, r1								//r6 = Rn - addr_mode (Rn itself is not updated)
	lsls   r5, #2								//r5 = Rd's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	strb   r0, [r6]								//store its low byte
	mov    pc, r8								//go to next instr

a2_10101:										// LDRB  Rd, [Rn, #-addr_mode]
	lsls   r5, #2								//r5 = Rd's offset in state
	subs   r6, r1								//r6 = Rn - addr_mode (Rn itself is not updated)
	ldrb   r0, [r6]								//load byte (zero-extended)
	str    r0, [r4, r5]							//Rd <- loaded byte (loading PC using this instr is undefined so we do not check for it)
	mov    pc, r8								//go to next instr

a2_10110:										// STRB  Rd, [Rn, #-addr_mode]!
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value (read before Rn is written back)
	subs   r6, r1								//pre-index: base -= addr_mode
	strb   r0, [r6]								//store its low byte
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_10111:										// LDRB  Rd, [Rn, #-addr_mode]!
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	subs   r6, r1								//pre-index: base -= addr_mode
	ldrb   r0, [r6]								//load byte (zero-extended)
	str    r0, [r4, r5]							//Rd <- loaded byte (loading PC using this instr is undefined so we do not check for it)
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_11000:										// STR   Rd, [Rn, #+addr_mode]
	lsls   r5, #2								//r5 = Rd's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	str    r0, [r6, r1]							//store at Rn + addr_mode (Rn itself is not updated)
	mov    pc, r8								//go to next instr

a2_11001:										// LDR   Rd, [Rn, #+addr_mode]
	lsls   r0, r5, #2							//r0 = Rd's offset in state (r5 is kept for the PC check)
	ldr    r7, [r6, r1]							//load from Rn + addr_mode (Rn itself is not updated)
	str    r7, [r4, r0]							//Rd <- loaded word
	cmp    r5, #15								//was PC the destination?
	beq    pc_loaded							//yes: r7 still has the loaded value for the thumb check
	mov    pc, r8								//no: go to next instr

a2_11010:										// STR   Rd, [Rn, #+addr_mode]!
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value (read before Rn is written back)
	adds   r6, r1								//pre-index: base += addr_mode
	str    r0, [r6]								//execute the store
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_11011:										// LDR   Rd, [Rn, #+addr_mode]!
	lsls   r0, r5, #2							//r0 = Rd's offset in state (r5 is kept for the PC check)
	lsls   r3, #2								//r3 = Rn's offset in state
	adds   r6, r1								//pre-index: base += addr_mode
	ldr    r7, [r6]								//execute the load
	str    r7, [r4, r0]							//Rd <- loaded word
	str    r6, [r4, r3]							//Rn <- updated base
	cmp    r5, #15								//was PC the destination?
	beq    pc_loaded							//yes: r7 still has the loaded value for the thumb check
	mov    pc, r8								//no: go to next instr

a2_11100:										// STRB  Rd, [Rn, #+addr_mode]
	lsls   r5, #2								//r5 = Rd's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value
	strb   r0, [r6, r1]							//store its low byte at Rn + addr_mode (Rn itself is not updated)
	mov    pc, r8								//go to next instr

a2_11101:										// LDRB  Rd, [Rn, #+addr_mode]
	lsls   r5, #2								//r5 = Rd's offset in state
	ldrb   r0, [r6, r1]							//load byte (zero-extended) from Rn + addr_mode
	str    r0, [r4, r5]							//Rd <- loaded byte (loading PC using this instr is undefined so we do not check for it)
	mov    pc, r8								//go to next instr

a2_11110:										// STRB  Rd, [Rn, #+addr_mode]!
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	ldr    r0, [r4, r5]							//r0 = Rd's value (read before Rn is written back)
	adds   r6, r1								//pre-index: base += addr_mode
	strb   r0, [r6]								//store its low byte
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_11111:										// LDRB  Rd, [Rn, #+addr_mode]!
	lsls   r5, #2								//r5 = Rd's offset in state
	lsls   r3, #2								//r3 = Rn's offset in state
	adds   r6, r1								//pre-index: base += addr_mode
	ldrb   r0, [r6]								//load byte (zero-extended)
	str    r0, [r4, r5]							//Rd <- loaded byte (loading PC using this instr is undefined so we do not check for it)
	str    r6, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//go to next instr

a2_und_2:										//the "T" forms (STRT/LDRT/STRBT/LDRBT) are not supported, trap
	udf    #0x00

// LDM/STM decode.
//  in:  r2 = instr
//  out to the handlers: r3 = Rn (index), r7 = Rn's value (the running address), r2 = instr
//  the table has one entry per P,U,S,W,L combination (instr[24..20])
mem_mul:
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r7, r3, #2							//get Rn's value -> r7
	ldr    r7, [r4, r7]


	lsls   r0, r2, #7							//get 'PUSWL' bits LSL 1 -> r0    (we check "W" later actually for code dedup)
	lsrs   r0, #26
	add    pc, r0								//dispatch on r0
	nop

	b a4_stmda									//STMDA Rn, {}
	b a4_ldmda									//LDMDA Rn, {}
	b a4_stmda									//STMDA Rn!, {}
	b a4_ldmda									//LDMDA Rn!, {}
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmda_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmda_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	b a4_stmia									//STMIA Rn, {}
	b a4_ldmia									//LDMIA Rn, {}
	b a4_stmia									//STMIA Rn!, {}
	b a4_ldmia									//LDMIA Rn!, {}
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmia_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmia_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	b a4_stmdb									//STMDB Rn, {}
	b a4_ldmdb									//LDMDB Rn, {}
	b a4_stmdb									//STMDB Rn!, {}
	b a4_ldmdb									//LDMDB Rn!, {}
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmdb_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmdb_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	b a4_stmib									//STMIB Rn, {}
	b a4_ldmib									//LDMIB Rn, {}
	b a4_stmib									//STMIB Rn!, {}
	b a4_ldmib									//LDMIB Rn!, {}
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmib_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	b a4_undef									//STM with "S" bit unsupported
	b a4_ldmib_s								//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)

a4_stmda:										//STMDA: walk the reg list from r15 down, storing at r7 and then stepping r7 down by 4
	lsls   r5, r2, #15							//move bits to almost top pos in r5
	movs   r0, #64								//start with r15
a4_stmda_loop:
	subs   r0, #4								//calc which reg we're targetting
	lsls   r5, #1								//see if we intend to use it
	beq    a4_done								//no more bits? we're done
	bpl    a4_stmda_loop						//if not interested in this reg, go deal with next one
	ldr    r6, [r4, r0]							//get i-th reg's value -> r6
	str    r6, [r7]								//store and adjust Rn's value
	subs   r7, #4
	b      a4_stmda_loop

a4_ldmda_s:										//LDMDA with the "S" bit set
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	//the test must look at C: after a right shift by 16 the N flag is always clear,
	//so branching on N would send every such instr to a4_undef
	lsrs   r5, r2, #16							//shift instr[15] ("PC is in the list") into our C
	bcc    a4_undef								//PC not in the list: not the case we handle
	//fallthrough to normal ldmda code below
#endif

a4_ldmda:										//LDMDA: walk the reg list from r15 down, loading from r7 and then stepping r7 down by 4
	lsls   r5, r2, #15							//move bits to almost top pos in r5
	movs   r0, #64								//start with r15
a4_ldmda_loop:
	subs   r0, #4								//calc which reg we're targetting
	lsls   r5, #1								//see if we intend to use it
	beq    a4_done_load							//no more bits? we're done
	bpl    a4_ldmda_loop						//if not interested in this reg, go deal with next one
	ldr    r6, [r7]								//load and adjust Rn's value
	subs   r7, #4
	str    r6, [r4, r0]							//set i-th reg's value <- r6
	b      a4_ldmda_loop

a4_stmia:										//STMIA: walk the reg list from r0 up, storing at r7 and then stepping r7 up by 4
	uxth   r5, r2								//just bottom bits please
	movs   r0, #0								//start with r0
	b      1f
a4_stmia_loop:
	adds   r0, #4								//calc which reg we're targetting
1:
	lsrs   r5, #1								//see if we intend to use it
	bcc    2f
	ldr    r6, [r4, r0]							//get i-th reg's value -> r6
	stmia  r7!, {r6}							//store and adjust Rn's value
2:
	cmp    r5, #0								//any regs left in the list?
	bne    a4_stmia_loop
	b      a4_done

a4_ldmia_s:										//LDMIA with the "S" bit set
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	//the test must look at C: after a right shift by 16 the N flag is always clear,
	//so branching on N would send every such instr to a4_undef
	lsrs   r5, r2, #16							//shift instr[15] ("PC is in the list") into our C
	bcc    a4_undef								//PC not in the list: not the case we handle
	//fallthrough to normal ldmia code below
#endif

a4_ldmia:										//LDMIA: walk the reg list from r0 up, loading from r7 and then stepping r7 up by 4
	uxth   r5, r2								//just bottom bits please
	movs   r0, #0								//start with r0
	b      1f
a4_ldmia_loop:
	adds   r0, #4								//calc which reg we're targetting
1:
	lsrs   r5, #1								//see if we intend to use it
	bcc    2f
	ldmia  r7!, {r6}							//load and adjust Rn's value
	str    r6, [r4, r0]							//set i-th reg's value -> r6
2:
	cmp    r5, #0								//any regs left in the list?
	bne    a4_ldmia_loop
	cmp    r0, #60								//check right here is we just loaded PC (since it would have been loaded last and the check is thus easy. this check is correct: the loop only gets as far as offset 60 if bit 15 of the list was set)
	bne    a4_done								//writeback but do not flag pc for retest
	b      a4_done_load							//writebackand retest if pc was loaded

a4_stmdb:										//STMDB: walk the reg list from r15 down, stepping r7 down by 4 and then storing at it
	lsls   r5, r2, #15							//move bits to almost top pos in r5
	movs   r0, #64								//start with r15
a4_stmdb_loop:
	subs   r0, #4								//calc which reg we're targetting
	lsls   r5, #1								//see if we intend to use it
	beq    a4_done								//no more bits? we're done
	bpl    a4_stmdb_loop						//if not interested in this reg, go deal with next one
	ldr    r6, [r4, r0]							//get i-th reg's value -> r6
	subs   r7, #4								//store and adjust Rn's value
	str    r6, [r7]
	b      a4_stmdb_loop

a4_ldmdb_s:										//LDMDB with the "S" bit set
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	//the test must look at C: after a right shift by 16 the N flag is always clear,
	//so branching on N would send every such instr to a4_undef
	lsrs   r5, r2, #16							//shift instr[15] ("PC is in the list") into our C
	bcc    a4_undef								//PC not in the list: not the case we handle
	//fallthrough to normal ldmdb code below
#endif

a4_ldmdb:										//LDMDB: walk the reg list from r15 down, stepping r7 down by 4 and then loading from it
	lsls   r5, r2, #15							//move bits to almost top pos in r5
	movs   r0, #64								//start with r15
a4_ldmdb_loop:
	subs   r0, #4								//calc which reg we're targetting
	lsls   r5, #1								//see if we intend to use it
	beq    a4_done_load							//no more bits? we're done
	bpl    a4_ldmdb_loop						//if not interested in this reg, go deal with next one
	subs   r7, #4								//load and adjust Rn's value
	ldr    r6, [r7]
	str    r6, [r4, r0]							//set i-th reg's value <- r6
	b      a4_ldmdb_loop

a4_stmib:										//STMIB: walk the reg list from r0 up, stepping r7 up by 4 and then storing at it
	uxth   r5, r2								//just bottom bits please
	movs   r0, #0								//start with r0
	b      1f
a4_stmib_loop:
	adds   r0, #4								//calc which reg we're targetting
1:
	lsrs   r5, #1								//see if we intend to use it
	bcc    2f
	ldr    r6, [r4, r0]							//get i-th reg's value -> r6
	adds   r7, #4								//store and adjust Rn's value
	str    r6, [r7]
2:
	cmp    r5, #0								//any regs left in the list?
	bne    a4_stmib_loop
	b      a4_done

a4_ldmib_s:										//LDMIB with the "S" bit set
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	//the test must look at C: after a right shift by 16 the N flag is always clear,
	//so branching on N would send every such instr to a4_undef
	lsrs   r5, r2, #16							//shift instr[15] ("PC is in the list") into our C
	bcc    a4_undef								//PC not in the list: not the case we handle
	//fallthrough to normal ldmib code below
#endif

a4_ldmib:										//LDMIB: walk the reg list from r0 up, stepping r7 up by 4 and then loading from it
	uxth   r5, r2								//just bottom bits please
	movs   r0, #0								//start with r0
	b      1f
a4_ldmib_loop:
	adds   r0, #4								//calc which reg we're targetting
1:
	lsrs   r5, #1								//see if we intend to use it
	bcc    2f
	adds   r7, #4								//load and adjust Rn's value
	ldr    r6, [r7]
	str    r6, [r4, r0]							//set i-th reg's value -> r6
2:
	cmp    r5, #0								//any regs left in the list?
	bne    a4_ldmib_loop
	b      a4_done_load							//no more bits? we're done

a4_undef:										//unsupported LDM/STM form (the "S" bit cases), trap
	udf   #0x00

a4_done_load:									//end of an LDM: optional writeback, then see if PC was in the list. r2 = instr, r3 = Rn, r7 = final address
	lsrs   r0, r2, #22							//check for wbak (instr[21] lands in our C)
	bcc    1f
	lsls   r0, r3, #2							//write back Rn's value <- r7
	str    r7, [r4, r0]
1:
	lsrs   r2, #16								//instr[15] (PC in the list) lands in our C
	bcs    j_pc_changed_maybe_thumb
	mov    pc, r8								//go to next instr
j_pc_changed_maybe_thumb:						//trampoline: pc_changed_maybe_thumb is too far for a conditional branch
	b      pc_changed_maybe_thumb

a4_done:										//end of an STM (or of an LDMIA that did not load PC): optional writeback only. r2 = instr, r3 = Rn, r7 = final address
	lsrs   r0, r2, #22							//check for wbak
	bcc    1f
	lsls   r0, r3, #2							//write back Rn's value <- r7
	str    r7, [r4, r0]
1:
	mov    pc, r8								//go to next instr

// data processing, immediate operand.
//  out to dp_dispatch: r0 = operand value (imm8 rotated right by 2 * rot), r1 = flags word whose C bit is the shifter carry out
dp_imm:
	uxtb   r0, r2								//get imm8
	lsls   r1, r2, #20							//get rot amt
	lsrs   r1, #28
	beq    dp_imm_no_rot
	lsls   r1, #1								//the encoded amt is in units of 2 bits
dp_imm_has_rot:
	rors   r0, r1
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch
dp_imm_no_rot:
	mov    r1, r10								//grab existing C bit as shifter carry out
	mov    pc, r12								//goto dp_dispatch

// instr[27..25] = 000: data processing with register operand, plus the multiply / swap / extra load-store encodings.
//  out to the handlers: r6 = Rm (index), r3 = instr[11..7] (shift imm, or Rs << 1 when shifting by register), r2 = instr
dp_reg:
	lsls   r6, r2, #28							//get Rm -> r6
	lsrs   r6, #28
	lsls   r3, r2, #20							//get imm or "Rs << 1" -> r3
	lsrs   r3, #27
	lsls   r5, r2, #24							//grab bits 4..7 which include shift and other vals that tell us what this is. LSL the whole thing by 1 for dispatch
	lsrs   r5, #27
	add    pc, r5								//dispatch on shift type and other bits
	nop
	
dp_reg_sh_disp:
	b dp_r_lsli
	b dp_r_lslr
	b dp_r_lsri
	b dp_r_lsrr
	b dp_r_asri
	b dp_r_asrr
	b dp_r_rori									//could be RRX
	b dp_r_rorr
	b dp_r_lsli
	b dp_mul_swp								// table 3.2 (SWP, SWPB, MUL, MLA, UMULL, UMLAL, SMULL, SMLAL)
	b dp_r_lsri
	b dp_halfw 									// table 3.2 (LDRH,STRH)
	b dp_r_asri
	b dp_ldrd_sb								// table 3.2 (LDRD, LDRSB)
	b dp_r_rori									//could be RRX
	bl dp_strd_sh								// table 3.2 (STRD, LDRSH) (BL is ok since this is the last one in the table)


dp_r_lsli:										//operand = Rm LSL #imm. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	cmp    r3, #0
	beq    dp_r_lsli_noshift
	lsls   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch
dp_r_lsli_noshift:								//LSL #0: operand is Rm as is
	mov    r1, r10								//grab existing C bit as shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_lslr:										//operand = Rm LSL Rs. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	lsls   r3, #1								//get Rs's value's low 8 bits -> r3
	ldrb   r3, [r4, r3]
	mov    r1, r10								//put SR.C into our C first: a shift by 0 leaves C alone,
	lsrs   r1, #30								// and in that case the shifter carry out must be the existing C
	lsls   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_lsri_32:									//an encoded shift amt of 0 means LSR #32
	lsrs   r0, #32
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_lsri:										//operand = Rm LSR #imm. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	cmp    r3, #0
	beq    dp_r_lsri_32
	lsrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_lsrr:										//operand = Rm LSR Rs. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	lsls   r3, #1								//get Rs's value's low 8 bits -> r3
	ldrb   r3, [r4, r3]
	mov    r1, r10								//put SR.C into our C first: a shift by 0 leaves C alone,
	lsrs   r1, #30								// and in that case the shifter carry out must be the existing C
	lsrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_asri_32:									//an encoded shift amt of 0 means ASR #32
	asrs   r0, #32
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_asri:										//operand = Rm ASR #imm. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	cmp    r3, #0
	beq    dp_r_asri_32
	asrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_asrr:										//operand = Rm ASR Rs. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	lsls   r3, #1								//get Rs's value's low 8 bits -> r3
	ldrb   r3, [r4, r3]
	mov    r1, r10								//put SR.C into our C first: a shift by 0 leaves C alone,
	lsrs   r1, #30								// and in that case the shifter carry out must be the existing C
	asrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_rori:										//operand = Rm ROR #imm (an encoded amt of 0 means RRX). out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	cmp    r3, #0
	beq	   dp_r_rrx
	rors   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

dp_r_rrx:										//C-M0 lacks RRx so we do it the hard way. in: r0 = Rm's value. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsrs   r0, #1								//shift right to generate proper carry
	mrs    r1, APSR								//grab shifter carry out
	mov    r3, r10								//grab existing C bit
	lsrs   r3, #29								//SR.C is now bit 0 of r3 ...
	lsls   r3, #31								//... and now it is the top bit, with everything else zero
	orrs   r0, r3								//insert top bit
	mov    pc, r12								//goto dp_dispatch

dp_r_rorr:										//operand = Rm ROR Rs. out: r0 = operand, r1 = flags word whose C bit is the shifter carry out
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	lsls   r3, #1								//get Rs's value's low 8 bits -> r3
	ldrb   r3, [r4, r3]
	mov    r1, r10								//put SR.C into our C first: a rotate by 0 leaves C alone,
	lsrs   r1, #30								// and in that case the shifter carry out must be the existing C
	rors   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	mov    pc, r12								//goto dp_dispatch

// multiplies and swaps (instr[7..4] = 1001).
//  out to the handlers: r0 = Rm's value, r2 = instr
//  the table has one entry per value of instr[24..20]
dp_mul_swp:
	lsls   r0, r6, #2							//get Rm's value -> r0
	ldr    r0, [r4, r0]
	lsls   r3, r2, #7							//isolate dispatchable bits, LSL 1 for dispatch
	lsrs   r3, #26
	add    pc, r3								//dispatch
	nop
	b dp_mul									//instr[24..20] = 00000
	b dp_muls
	b dp_mla
	b dp_mlas
	b dp_mul_und								//001xx
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und
	b dp_umull									//01000
	b dp_umulls
	b dp_umlal
	b dp_umlals
	b dp_smull									//01100
	b dp_smulls
	b dp_smlal
	b dp_smlals
	b dp_swp									//10000
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und
	b dp_swpb									//10100
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und								//11000 and up
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und
	b dp_mul_und

dp_mul:											//MUL Rd, Rm, Rs. in: r0 = Rm's value, r2 = instr
	lsls   r3, r2, #12							//r3 = Rd (instr[19..16]) ...
	lsrs   r3, #28
	lsls   r3, #2								//... turned into its offset in state
	lsls   r1, r2, #20							//r1 = Rs (instr[11..8]) ...
	lsrs   r1, #28
	lsls   r1, #2								//... turned into its offset in state
	ldr    r1, [r4, r1]							//r1 = Rs's value
	muls   r1, r0								//r1 = Rs * Rm
	str    r1, [r4, r3]							//Rd <- product
	mov    pc, r8								//go to next instr

dp_muls:										//MULS Rd, Rm, Rs. in: r0 = Rm's value, r2 = instr. updates N and Z in SR, leaves C and V as they were
	lsls   r1, r2, #20							//get Rs -> r1
	lsrs   r1, #28
	lsls   r1, #2								//get Rs's value -> r1
	ldr    r1, [r4, r1]
	msr    APSR_nzcvq, r10						//grab existing CPSR -> APSR (must come after the shifts above: they change C, and muls leaves C and V as they are)
	muls   r1, r0
	mrs    r10, APSR							//grab resulting CPSR into r10
	lsls   r0, r2, #12							//get Rd -> r0
	lsrs   r0, #28
	lsls   r0, #2								//set Rd's value <- r1
	str    r1, [r4, r0]
	mov    pc, r8

dp_mla:											//MLA Rd, Rm, Rs, Rn. in: r0 = Rm's value, r2 = instr
	lsls   r5, r2, #16							//r5 = Rn (instr[15..12]) ...
	lsrs   r5, #28
	lsls   r5, #2								//... turned into its offset in state
	ldr    r5, [r4, r5]							//r5 = Rn's value (the addend)
	lsls   r3, r2, #12							//r3 = Rd (instr[19..16]) ...
	lsrs   r3, #28
	lsls   r3, #2								//... turned into its offset in state
	lsls   r1, r2, #20							//r1 = Rs (instr[11..8]) ...
	lsrs   r1, #28
	lsls   r1, #2								//... turned into its offset in state
	ldr    r1, [r4, r1]							//r1 = Rs's value
	muls   r1, r0								//r1 = Rs * Rm
	adds   r1, r5								//r1 += Rn
	str    r1, [r4, r3]							//Rd <- result
	mov    pc, r8								//go to next instr

dp_mlas:										//T2 has no flag-setting version of this, so we synthesize it. Luckily nobody uses this. in: r0 = Rm's value, r2 = instr
	lsls   r1, r2, #20							//get Rs -> r1
	lsrs   r1, #28
	lsls   r1, #2								//get Rs's value -> r1
	ldr    r1, [r4, r1]
	lsls   r5, r2, #16							//get Rn -> r5
	lsrs   r5, #28
	lsls   r5, #2								//get Rn's value -> r5
	ldr    r5, [r4, r5]
	muls   r1, r0
	adds   r1, r5
	msr    APSR_nzcvq, r10						//existing CPSR -> APSR, so that whatever the next instr leaves alone keeps its emulated value
	tst    r1, r1								//calculate condition codes for result
	mrs    r10, APSR							//grab resulting CPSR into r10
	lsls   r0, r2, #12							//get Rd -> r0
	lsrs   r0, #28
	lsls   r0, #2								//set Rd's value <- r1
	str    r1, [r4, r0]
	mov    pc, r8

dp_smull:										//SMULL RdLo, RdHi, Rm, Rs. in: r0 = Rm's value, r2 = instr
	lsls   r1, r2, #20							//get Rs -> r1
	lsrs   r1, #28
	lsls   r1, #2								//get Rs's value -> r1
	ldr    r1, [r4, r1]
	push   {r2}									//keep instr across the call (presumably do_smull clobbers r2 - it is not visible here)
	bl     do_smull								//lo = r1, hi = r3, a = r0, b = r1, out is r3:r1
	pop    {r2}
	lsls   r0, r2, #16							//get RdLo -> r0
	lsrs   r0, #28
	lsls   r0, #2								//set RdLo's value <- r1
	str    r1, [r4, r0]
	lsls   r0, r2, #12							//get RdHi -> r0
	lsrs   r0, #28
	lsls   r0, #2								//set RdHi's value <- r3
	str    r3, [r4, r0]
	mov    pc, r8

dp_smulls:										//SMULLS RdLo, RdHi, Rm, Rs. No flag-setting long multiply exists on the host, so N/Z are rebuilt by hand from the 64-bit result
	lsls   r1, r2, #20							//Rs field (instr bits 11..8) -> r1
	lsrs   r1, #28
	lsls   r1, #2
	ldr    r1, [r4, r1]							//r1 = value of Rs
	push   {r2}									//instruction word must survive the helper call
	bl     do_smull								//a = r0, b = r1 in; r3:r1 = high:low out
	pop    {r2}
	lsls   r0, r2, #16							//RdLo field (instr bits 15..12) -> r0
	lsrs   r0, #28
	lsls   r0, #2
	str    r1, [r4, r0]							//RdLo <- low word
	lsls   r0, r2, #12							//RdHi field (instr bits 19..16) -> r0
	lsrs   r0, #28
	lsls   r0, #2
	str    r3, [r4, r0]							//RdHi <- high word
	//start from the current SR with its top two bits (N, Z) stripped
	mov    r2, r10
	lsls   r2, #2
	lsrs   r2, #2
	//build a 2-bit value in r0: bit 1 = sign of the high word, bit 0 = "whole result is zero"
	lsrs   r0, r3, #31
	lsls   r0, #1
	orrs   r1, r3								//Z of this ORR tells whether both halves are zero
	bne    1f
	adds   r0, #1
1:
	lsls   r0, #30								//move the pair up to bits 31..30
	orrs   r2, r0								//and merge it into the stripped SR
	mov    r10, r2								//commit the new SR
	mov    pc, r8								//back to next_instr

dp_smlal:										//SMLAL RdLo, RdHi, Rm, Rs: signed multiply-accumulate into the 64-bit pair RdHi:RdLo via do_smlal. r0 is expected to carry Rm's value on entry
	//low half of the accumulator
	lsls   r3, r2, #16							//RdLo field (instr bits 15..12) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//kept as a word offset for the store below
	ldr    r5, [r4, r3]							//r5 = current RdLo
	//second multiplicand
	lsls   r1, r2, #20							//Rs field (instr bits 11..8) -> r1
	lsrs   r1, #28
	lsls   r1, #2
	ldr    r1, [r4, r1]							//r1 = value of Rs
	//high half of the accumulator
	lsls   r6, r2, #12							//RdHi field (instr bits 19..16) -> r6
	lsrs   r6, #28
	lsls   r6, #2								//kept as a word offset for the store below
	ldr    r7, [r4, r6]							//r7 = current RdHi
	push   {r3}									//RdLo offset is saved around the helper call
	bl     do_smlal								//lo = r5, hi = r7, a = r0, b = r1
	pop    {r3}
	str    r7, [r4, r6]							//RdHi <- r7
	str    r5, [r4, r3]							//RdLo <- r5
	mov    pc, r8								//back to next_instr

dp_smlals:										//SMLALS RdLo, RdHi, Rm, Rs. As dp_smlal, then N/Z are rebuilt by hand from the 64-bit result since the host has no flag-setting form
	//low half of the accumulator
	lsls   r3, r2, #16							//RdLo field (instr bits 15..12) -> r3
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r5, [r4, r3]							//r5 = current RdLo
	//second multiplicand
	lsls   r1, r2, #20							//Rs field (instr bits 11..8) -> r1
	lsrs   r1, #28
	lsls   r1, #2
	ldr    r1, [r4, r1]							//r1 = value of Rs
	//high half of the accumulator
	lsls   r6, r2, #12							//RdHi field (instr bits 19..16) -> r6
	lsrs   r6, #28
	lsls   r6, #2
	ldr    r7, [r4, r6]							//r7 = current RdHi
	push   {r3}									//RdLo offset is saved around the helper call
	bl     do_smlal								//lo = r5, hi = r7, a = r0, b = r1
	pop    {r3}
	str    r7, [r4, r6]							//RdHi <- r7
	str    r5, [r4, r3]							//RdLo <- r5
	//start from the current SR with its top two bits (N, Z) stripped
	mov    r2, r10
	lsls   r2, #2
	lsrs   r2, #2
	//build a 2-bit value in r0: bit 1 = sign of the high word, bit 0 = "whole result is zero"
	lsrs   r0, r7, #31
	lsls   r0, #1
	orrs   r5, r7								//Z of this ORR tells whether both halves are zero
	bne    1f
	adds   r0, #1
1:
	lsls   r0, #30								//move the pair up to bits 31..30
	orrs   r2, r0								//and merge it into the stripped SR
	mov    r10, r2								//commit the new SR
	mov    pc, r8								//back to next_instr

dp_umull:										//UMULL RdLo, RdHi, Rm, Rs: unsigned 32x32 -> 64 multiply done by the do_umull helper. r0 presumably holds Rm's value on entry (confirm against the dispatcher)
	lsls   r1, r2, #20							//isolate the Rs field (instr bits 11..8) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//scale to a word offset and fetch Rs's value -> r1
	ldr    r1, [r4, r1]
	push   {r2}									//keep the instruction word across the call (presumably do_umull clobbers r2 - confirm)
	bl     do_umull								//inputs a = r0, b = r1; 64-bit product comes back as r3 (high) : r1 (low)
	pop    {r2}
	lsls   r0, r2, #16							//isolate the RdLo field (instr bits 15..12) -> r0
	lsrs   r0, #28
	lsls   r0, #2								//scale to a word offset and store the low word into RdLo
	str    r1, [r4, r0]
	lsls   r0, r2, #12							//isolate the RdHi field (instr bits 19..16) -> r0
	lsrs   r0, #28
	lsls   r0, #2								//scale to a word offset and store the high word into RdHi
	str    r3, [r4, r0]
	mov    pc, r8								//continue at next_instr

dp_umulls:										//UMULLS RdLo, RdHi, Rm, Rs. No flag-setting long multiply exists on the host, so N/Z are rebuilt by hand from the 64-bit result
	lsls   r1, r2, #20							//Rs field (instr bits 11..8) -> r1
	lsrs   r1, #28
	lsls   r1, #2
	ldr    r1, [r4, r1]							//r1 = value of Rs
	push   {r2}									//instruction word must survive the helper call
	bl     do_umull								//a = r0, b = r1 in; r3:r1 = high:low out
	pop    {r2}
	lsls   r0, r2, #16							//RdLo field (instr bits 15..12) -> r0
	lsrs   r0, #28
	lsls   r0, #2
	str    r1, [r4, r0]							//RdLo <- low word
	lsls   r0, r2, #12							//RdHi field (instr bits 19..16) -> r0
	lsrs   r0, #28
	lsls   r0, #2
	str    r3, [r4, r0]							//RdHi <- high word
	//start from the current SR with its top two bits (N, Z) stripped
	mov    r2, r10
	lsls   r2, #2
	lsrs   r2, #2
	//build a 2-bit value in r0: bit 1 = top bit of the high word, bit 0 = "whole result is zero"
	lsrs   r0, r3, #31
	lsls   r0, #1
	orrs   r1, r3								//Z of this ORR tells whether both halves are zero
	bne    1f
	adds   r0, #1
1:
	lsls   r0, #30								//move the pair up to bits 31..30
	orrs   r2, r0								//and merge it into the stripped SR
	mov    r10, r2								//commit the new SR
	mov    pc, r8								//back to next_instr

dp_umlal:										//UMLAL RdLo, RdHi, Rm, Rs: unsigned multiply-accumulate into the 64-bit pair RdHi:RdLo via do_umlal. r0 is expected to carry Rm's value on entry
	//low half of the accumulator
	lsls   r3, r2, #16							//RdLo field (instr bits 15..12) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//kept as a word offset for the store below
	ldr    r5, [r4, r3]							//r5 = current RdLo
	//second multiplicand
	lsls   r1, r2, #20							//Rs field (instr bits 11..8) -> r1
	lsrs   r1, #28
	lsls   r1, #2
	ldr    r1, [r4, r1]							//r1 = value of Rs
	//high half of the accumulator
	lsls   r6, r2, #12							//RdHi field (instr bits 19..16) -> r6
	lsrs   r6, #28
	lsls   r6, #2								//kept as a word offset for the store below
	ldr    r7, [r4, r6]							//r7 = current RdHi
	push   {r3}									//RdLo offset is saved around the helper call
	bl     do_umlal								//equivalent of "umlal r5, r7, r0, r1"
	pop    {r3}
	str    r7, [r4, r6]							//RdHi <- r7
	str    r5, [r4, r3]							//RdLo <- r5
	mov    pc, r8								//back to next_instr

dp_umlals:										//UMLALS RdLo, RdHi, Rm, Rs. As dp_umlal, then N/Z are rebuilt by hand from the 64-bit result since the host has no flag-setting form
	//low half of the accumulator
	lsls   r3, r2, #16							//RdLo field (instr bits 15..12) -> r3
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r5, [r4, r3]							//r5 = current RdLo
	//second multiplicand
	lsls   r1, r2, #20							//Rs field (instr bits 11..8) -> r1
	lsrs   r1, #28
	lsls   r1, #2
	ldr    r1, [r4, r1]							//r1 = value of Rs
	//high half of the accumulator
	lsls   r6, r2, #12							//RdHi field (instr bits 19..16) -> r6
	lsrs   r6, #28
	lsls   r6, #2
	ldr    r7, [r4, r6]							//r7 = current RdHi
	push   {r3}									//RdLo offset is saved around the helper call
	bl     do_umlal								//equivalent of "umlal r5, r7, r0, r1"
	pop    {r3}
	str    r7, [r4, r6]							//RdHi <- r7
	str    r5, [r4, r3]							//RdLo <- r5
	//start from the current SR with its top two bits (N, Z) stripped
	mov    r2, r10
	lsls   r2, #2
	lsrs   r2, #2
	//build a 2-bit value in r0: bit 1 = top bit of the high word, bit 0 = "whole result is zero"
	lsrs   r0, r7, #31
	lsls   r0, #1
	orrs   r5, r7								//Z of this ORR tells whether both halves are zero
	bne    1f
	adds   r0, #1
1:
	lsls   r0, #30								//move the pair up to bits 31..30
	orrs   r2, r0								//and merge it into the stripped SR
	mov    r10, r2								//commit the new SR
	mov    pc, r8								//back to next_instr

dp_swp:											//SWP Rd, Rm, [Rn]: word at [Rn] -> Rd while Rm's value -> [Rn]. r0 presumably holds Rm's value on entry (confirm against the dispatcher)
	lsls   r1, r2, #12							//isolate the Rn field (instr bits 19..16) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//scale to a word offset and fetch Rn's value (the address) -> r1
	ldr    r1, [r4, r1]
	
	#ifdef HAVE_v8M_BASE

	1:
		ldrex  r3, [r1]							//exclusive load of the old word -> r3
		strex  r5, r0, [r1]						//try to store the new word; r5 = 0 on success
		cmp    r5, #0
		bne    1b								//exclusivity lost: retry
	
	#else
	
		//without ldrex/strex the swap is made uninterruptible by masking IRQs.
		//PRIMASK is saved and restored (r5 is a temp) instead of a blind "cpsie i", so that
		//interrupts are not switched on behind the back of a caller that had them masked
		mrs    r5, PRIMASK
		cpsid  i
		ldr    r3, [r1]							//old word -> r3
		str    r0, [r1]							//new word -> memory
		msr    PRIMASK, r5
	
	#endif
	
	lsls   r1, r2, #16							//isolate the Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//scale to a word offset and store the old memory word into Rd
	str    r3, [r4, r1]
	mov    pc, r8								//continue at next_instr

dp_swpb:										//SWPB Rd, Rm, [Rn]: byte at [Rn] -> Rd (zero-extended) while the low byte of Rm's value -> [Rn]. r0 presumably holds Rm's value on entry (confirm against the dispatcher)
	lsls   r1, r2, #12							//isolate the Rn field (instr bits 19..16) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//scale to a word offset and fetch Rn's value (the address) -> r1
	ldr    r1, [r4, r1]
	
	#ifdef HAVE_v8M_BASE

	1:
		ldrexb r3, [r1]							//exclusive load of the old byte -> r3
		strexb r5, r0, [r1]						//try to store the new byte; r5 = 0 on success
		cmp    r5, #0
		bne    1b								//exclusivity lost: retry
	
	#else
	
		//without ldrex/strex the swap is made uninterruptible by masking IRQs.
		//PRIMASK is saved and restored (r5 is a temp) instead of a blind "cpsie i", so that
		//interrupts are not switched on behind the back of a caller that had them masked
		mrs    r5, PRIMASK
		cpsid  i
		ldrb   r3, [r1]							//old byte -> r3
		strb   r0, [r1]							//new byte -> memory
		msr    PRIMASK, r5
	
	#endif

	lsls   r1, r2, #16							//isolate the Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//scale to a word offset and store the old memory byte into Rd
	str    r3, [r4, r1]
	mov    pc, r8								//continue at next_instr

dp_mul_und:										//landing pad for multiply/swap dispatch slots that have no handler
	udf    #0									//permanently undefined instruction: trap on the host

dp_halfw:										//halfword load/store: second-level dispatch on instr bits 24..20 ('PUiWL')
	lsls   r1, r2, #7							//get 'PUiWL' LSL 1-> r1 (the low bit is instr bit 19 and is ignored by the PC write)
	lsrs   r1, #26
	add    pc, r1								//dispatch on 'PUiWL': lands on the matching 2-byte branch below
	nop											//padding so that table entry 0 sits exactly at PC + 4
	b dp_hw_00000								//STRH Rd, [Rn], -Rm
	b dp_hw_00001								//LDRH Rd, [Rn], -Rm
	b dp_hw_undef
	b dp_hw_undef
	b dp_hw_00100								//STRH Rd, [Rn], -#imm
	b dp_hw_00101								//LDRH Rd, [Rn], -#imm
	b dp_hw_undef
	b dp_hw_undef
	b dp_hw_01000								//STRH Rd, [Rn], +Rm
	b dp_hw_01001								//LDRH Rd, [Rn], +Rm
	b dp_hw_undef
	b dp_hw_undef
	b dp_hw_01100								//STRH Rd, [Rn], +#imm
	b dp_hw_01101								//LDRH Rd, [Rn], +#imm
	b dp_hw_undef
	b dp_hw_undef
	b dp_hw_10000								//STRH Rd, [Rn, -Rm]
	b dp_hw_10001								//LDRH Rd, [Rn, -Rm]
	b dp_hw_10010								//STRH Rd, [Rn, -Rm]!
	b dp_hw_10011								//LDRH Rd, [Rn, -Rm]!
	b dp_hw_10100								//STRH Rd, [Rn, -#imm]
	b dp_hw_10101								//LDRH Rd, [Rn, -#imm]
	b dp_hw_10110								//STRH Rd, [Rn, -#imm]!
	b dp_hw_10111								//LDRH Rd, [Rn, -#imm]!
	b dp_hw_11000								//STRH Rd, [Rn, +Rm]
	b dp_hw_11001								//LDRH Rd, [Rn, +Rm]
	b dp_hw_11010								//STRH Rd, [Rn, +Rm]!
	b dp_hw_11011								//LDRH Rd, [Rn, +Rm]!
	b dp_hw_11100								//STRH Rd, [Rn, +#imm]
	b dp_hw_11101								//LDRH Rd, [Rn, +#imm]
	b dp_hw_11110								//STRH Rd, [Rn, +#imm]!  (this slot used to jump to the LDRH handler dp_hw_11111)
	b dp_hw_11111								//LDRH Rd, [Rn, +#imm]!

dp_hw_undef:									//landing pad for the 'PUiWL' combinations the halfword table marks as undefined
	udf    #0									//permanently undefined instruction: trap on the host

dp_hw_00000:									//STRH Rd, [Rn], -Rm : store at Rn, then Rn -= Rm
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7 before r6 is reused
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value (store address)
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	strh   r6, [r5]								//halfword store at the original base
	subs   r5, r7								//post-index: base -= Rm
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_00001:									//LDRH Rd, [Rn], -Rm : load from Rn, then Rn -= Rm
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	ldrh   r6, [r5]								//zero-extended halfword load at the original base
	str    r6, [r4, r1]							//Rd <- loaded value
	subs   r5, r7								//post-index: base -= Rm
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_00100:									//STRH Rd, [Rn], -#imm : store at Rn, then Rn -= imm8
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value (store address)
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	strh   r6, [r5]								//halfword store at the original base
	subs   r5, r7								//post-index: base -= imm8
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_00101:									//LDRH Rd, [Rn], -#imm : load from Rn, then Rn -= imm8
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	ldrh   r6, [r5]								//zero-extended halfword load at the original base
	str    r6, [r4, r1]							//Rd <- loaded value
	subs   r5, r7								//post-index: base -= imm8
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_01000:									//STRH Rd, [Rn], +Rm : store at Rn, then Rn += Rm
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7 before r6 is reused
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value (store address)
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	strh   r6, [r5]								//halfword store at the original base
	adds   r5, r7								//post-index: base += Rm
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_01001:									//LDRH Rd, [Rn], +Rm : load from Rn, then Rn += Rm
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	ldrh   r6, [r5]								//zero-extended halfword load at the original base
	str    r6, [r4, r1]							//Rd <- loaded value
	adds   r5, r7								//post-index: base += Rm
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_01100:									//STRH Rd, [Rn], +#imm : store at Rn, then Rn += imm8
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value (store address)
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	strh   r6, [r5]								//halfword store at the original base
	adds   r5, r7								//post-index: base += imm8
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_01101:									//LDRH Rd, [Rn], +#imm : load from Rn, then Rn += imm8
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	ldrh   r6, [r5]								//zero-extended halfword load at the original base
	str    r6, [r4, r1]							//Rd <- loaded value
	adds   r5, r7								//post-index: base += imm8
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_hw_10000:									//STRH Rd, [Rn, -Rm] : store at Rn - Rm, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7 before r6 is reused
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	subs   r5, r7								//effective address = Rn - Rm
	strh   r6, [r5]								//halfword store; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_10001:									//LDRH Rd, [Rn, -Rm] : load from Rn - Rm, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value
	subs   r5, r7								//effective address = Rn - Rm
	ldrh   r6, [r5]								//zero-extended halfword load
	str    r6, [r4, r1]							//Rd <- loaded value; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_10010:									//STRH Rd, [Rn, -Rm]! : Rn -= Rm, then store at the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7 before r6 is reused
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	subs   r5, r7								//effective address = Rn - Rm
	strh   r6, [r5]								//halfword store
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_hw_10011:									//LDRH Rd, [Rn, -Rm]! : Rn -= Rm, then load from the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value
	subs   r5, r7								//effective address = Rn - Rm
	ldrh   r6, [r5]								//zero-extended halfword load
	str    r6, [r4, r1]							//Rd <- loaded value
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_hw_10100:									//STRH Rd, [Rn, -#imm] : store at Rn - imm8, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	subs   r5, r7								//effective address = Rn - imm8
	strh   r6, [r5]								//halfword store; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_10101:									//LDRH Rd, [Rn, -#imm] : load from Rn - imm8, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value
	subs   r5, r7								//effective address = Rn - imm8
	ldrh   r6, [r5]								//zero-extended halfword load
	str    r6, [r4, r1]							//Rd <- loaded value; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_10110:									//STRH Rd, [Rn, -#imm]! : Rn -= imm8, then store at the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	subs   r5, r7								//effective address = Rn - imm8
	strh   r6, [r5]								//halfword store
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_hw_10111:									//LDRH Rd, [Rn, -#imm]! : Rn -= imm8, then load from the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value
	subs   r5, r7								//effective address = Rn - imm8
	ldrh   r6, [r5]								//zero-extended halfword load
	str    r6, [r4, r1]							//Rd <- loaded value
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_hw_11000:									//STRH Rd, [Rn, +Rm] : store at Rn + Rm, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7 before r6 is reused
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	strh   r6, [r5, r7]							//the host's register-offset form does the addition; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_11001:									//LDRH Rd, [Rn, +Rm] : load from Rn + Rm, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldrh   r6, [r5, r7]							//the host's register-offset form does the addition
	str    r6, [r4, r1]							//Rd <- loaded value; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_11010:									//STRH Rd, [Rn, +Rm]! : Rn += Rm, then store at the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7 before r6 is reused
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	adds   r5, r7								//effective address = Rn + Rm
	strh   r6, [r5]								//halfword store
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_hw_11011:									//LDRH Rd, [Rn, +Rm]! : Rn += Rm, then load from the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value
	adds   r5, r7								//effective address = Rn + Rm
	ldrh   r6, [r5]								//zero-extended halfword load
	str    r6, [r4, r1]							//Rd <- loaded value
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_hw_11100:									//STRH Rd, [Rn, +#imm] : store at Rn + imm8, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	strh   r6, [r5, r7]							//the host's register-offset form does the addition; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_11101:									//LDRH Rd, [Rn, +#imm] : load from Rn + imm8, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldrh   r6, [r5, r7]							//the host's register-offset form does the addition
	str    r6, [r4, r1]							//Rd <- loaded value; no writeback
	mov    pc, r8								//back to next_instr

dp_hw_11110:									//STRH Rd, [Rn, +#imm]! : Rn += imm8, then store at the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r6, r1, #2							//word offset of Rd -> r6
	ldr    r5, [r4, r3]							//r5 = Rn's value
	ldr    r6, [r4, r6]							//r6 = Rd's value (data)
	adds   r5, r7								//effective address = Rn + imm8
	strh   r6, [r5]								//halfword store
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_hw_11111:									//LDRH Rd, [Rn, +#imm]! : Rn += imm8, then load from the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value
	adds   r5, r7								//effective address = Rn + imm8
	ldrh   r6, [r5]								//zero-extended halfword load
	str    r6, [r4, r1]							//Rd <- loaded value
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_ldrd_sb:										//LDRD / LDRSB: second-level dispatch on instr bits 24..20 ('PUiWL'); L = 0 selects LDRD, L = 1 selects LDRSB
	lsls   r1, r2, #7							//get 'PUiWL' LSL 1-> r1 (the low bit is instr bit 19 and is ignored by the PC write)
	lsrs   r1, #26
	add    pc, r1								//dispatch on 'PUiWL': lands on the matching 2-byte branch below
	nop											//padding so that table entry 0 sits exactly at PC + 4
	b dp_sd_00000								//LDRD  Rd, [Rn], -Rm
	b dp_sd_00001								//LDRSB Rd, [Rn], -Rm
	b dp_sd_undef
	b dp_sd_undef
	b dp_sd_00100								//LDRD  Rd, [Rn], -#imm
	b dp_sd_00101								//LDRSB Rd, [Rn], -#imm
	b dp_sd_undef
	b dp_sd_undef
	b dp_sd_01000								//LDRD  Rd, [Rn], +Rm
	b dp_sd_01001								//LDRSB Rd, [Rn], +Rm
	b dp_sd_undef
	b dp_sd_undef
	b dp_sd_01100								//LDRD  Rd, [Rn], +#imm
	b dp_sd_01101								//LDRSB Rd, [Rn], +#imm
	b dp_sd_undef
	b dp_sd_undef
	b dp_sd_10000								//LDRD  Rd, [Rn, -Rm]
	b dp_sd_10001								//LDRSB Rd, [Rn, -Rm]
	b dp_sd_10010								//LDRD  Rd, [Rn, -Rm]!
	b dp_sd_10011								//LDRSB Rd, [Rn, -Rm]!
	b dp_sd_10100								//LDRD  Rd, [Rn, -#imm]
	b dp_sd_10101								//LDRSB Rd, [Rn, -#imm]
	b dp_sd_10110								//LDRD  Rd, [Rn, -#imm]!
	b dp_sd_10111								//LDRSB Rd, [Rn, -#imm]!
	b dp_sd_11000								//LDRD  Rd, [Rn, +Rm]
	b dp_sd_11001								//LDRSB Rd, [Rn, +Rm]
	b dp_sd_11010								//LDRD  Rd, [Rn, +Rm]!
	b dp_sd_11011								//LDRSB Rd, [Rn, +Rm]!
	b dp_sd_11100								//LDRD  Rd, [Rn, +#imm]
	b dp_sd_11101								//LDRSB Rd, [Rn, +#imm]
	b dp_sd_11110								//LDRD  Rd, [Rn, +#imm]!  (this slot used to jump to the LDRSB handler; NOTE(review): assumes dp_sd_11110 is defined further down like its siblings - confirm)
	b dp_sd_11111								//LDRSB Rd, [Rn, +#imm]!

dp_sd_undef:									//landing pad for the 'PUiWL' combinations the LDRD/LDRSB table marks as undefined
	udf    #0									//permanently undefined instruction: trap on the host

dp_sd_00000:									//LDRD Rd, [Rn], -Rm : load two words from Rn into Rd and Rd+1, then Rn -= Rm
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	ldr    r6, [r5, #0x00]						//perform the doubleload (r2, the instruction word, is no longer needed)
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart, so the slot of Rd is at state + Rd * 4 (a *2 scale hit the wrong registers for any Rd but r0)
	adds   r1, r4
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	subs   r5, r7								//post-index: base -= Rm
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to next_instr

dp_sd_00001:									//LDRSB Rd, [Rn], -Rm : sign-extended byte load from Rn, then Rn -= Rm
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	movs   r6, #0								//the host only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r6, [r5, r6]
	str    r6, [r4, r1]							//Rd <- loaded value
	subs   r5, r7								//post-index: base -= Rm
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_sd_00100:									//LDRD Rd, [Rn], -#imm : load two words from Rn into Rd and Rd+1, then Rn -= imm8
	lsls   r7, r2, #20							//calculate imm -> r7: high nibble from instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry
	lsls   r1, r2, #16							//get Rd (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	ldr    r6, [r5, #0x00]						//perform the doubleload (r2, the instruction word, is no longer needed)
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart, so the slot of Rd is at state + Rd * 4 (a *2 scale hit the wrong registers for any Rd but r0)
	adds   r1, r4
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	subs   r5, r7								//post-index: base -= imm8
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to next_instr

dp_sd_00101:									//LDRSB Rd, [Rn], -#imm : sign-extended byte load from Rn, then Rn -= imm8
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	movs   r6, #0								//the host only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r6, [r5, r6]
	str    r6, [r4, r1]							//Rd <- loaded value
	subs   r5, r7								//post-index: base -= imm8
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_sd_01000:									//LDRD Rd, [Rn], +Rm : load two words from Rn into Rd and Rd+1, then Rn += Rm
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	ldr    r6, [r5, #0x00]						//perform the doubleload (r2, the instruction word, is no longer needed)
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart, so the slot of Rd is at state + Rd * 4 (a *2 scale hit the wrong registers for any Rd but r0)
	adds   r1, r4
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	adds   r5, r7								//post-index: base += Rm
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to next_instr

dp_sd_01001:									//LDRSB Rd, [Rn], +Rm : sign-extended byte load from Rn, then Rn += Rm
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	movs   r6, #0								//the host only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r6, [r5, r6]
	str    r6, [r4, r1]							//Rd <- loaded value
	adds   r5, r7								//post-index: base += Rm
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_sd_01100:									//LDRD Rd, [Rn], +#imm : load two words from Rn into Rd and Rd+1, then Rn += imm8
	lsls   r7, r2, #20							//calculate imm -> r7: high nibble from instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry
	lsls   r1, r2, #16							//get Rd (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	ldr    r6, [r5, #0x00]						//perform the doubleload (r2, the instruction word, is no longer needed)
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart, so the slot of Rd is at state + Rd * 4 (a *2 scale hit the wrong registers for any Rd but r0)
	adds   r1, r4
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	adds   r5, r7								//post-index: base += imm8
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to next_instr

dp_sd_01101:									//LDRSB Rd, [Rn], +#imm : sign-extended byte load from Rn, then Rn += imm8
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value (load address)
	movs   r6, #0								//the host only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r6, [r5, r6]
	str    r6, [r4, r1]							//Rd <- loaded value
	adds   r5, r7								//post-index: base += imm8
	str    r5, [r4, r3]							//Rn <- updated base
	mov    pc, r8								//back to next_instr

dp_sd_10000:									//LDRD Rd, [Rn, -Rm] : load two words from Rn - Rm into Rd and Rd+1, Rn unchanged
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//pre-index: effective address = Rn - Rm
	ldr    r6, [r5, #0x00]						//perform the doubleload (r2, the instruction word, is no longer needed)
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart, so the slot of Rd is at state + Rd * 4 (a *2 scale hit the wrong registers for any Rd but r0)
	adds   r1, r4
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	mov    pc, r8								//back to next_instr

dp_sd_10001:									//LDRSB Rd, [Rn, -Rm] : sign-extended byte load from Rn - Rm, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value
	subs   r5, r7								//effective address = Rn - Rm
	movs   r6, #0								//the host only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r6, [r5, r6]
	str    r6, [r4, r1]							//Rd <- loaded value; no writeback
	mov    pc, r8								//back to next_instr

dp_sd_10010:									//LDRD Rd, [Rn, -Rm]! : Rn -= Rm, then load two words from the new Rn into Rd and Rd+1
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//pre-index: effective address = Rn - Rm
	ldr    r6, [r5, #0x00]						//perform the doubleload (r2, the instruction word, is no longer needed)
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart, so the slot of Rd is at state + Rd * 4 (a *2 scale hit the wrong registers for any Rd but r0)
	adds   r1, r4
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to next_instr

dp_sd_10011:									//LDRSB Rd, [Rn, -Rm]! : Rn -= Rm, then sign-extended byte load from the new Rn
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r6, #2							//r6 is expected to hold the Rm index on entry; fetch Rm's value -> r7
	ldr    r7, [r4, r7]
	ldr    r5, [r4, r3]							//r5 = Rn's value
	subs   r5, r7								//effective address = Rn - Rm
	movs   r6, #0								//the host only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r6, [r5, r6]
	str    r6, [r4, r1]							//Rd <- loaded value
	str    r5, [r4, r3]							//writeback: Rn <- effective address
	mov    pc, r8								//back to next_instr

dp_sd_10100:									//LDRD Rd, [Rn, -#imm] : load two words from Rn - imm8 into Rd and Rd+1, Rn unchanged
	lsls   r7, r2, #20							//calculate imm -> r7: high nibble from instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry
	lsls   r1, r2, #16							//get Rd (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//pre-index: effective address = Rn - imm8
	ldr    r6, [r5, #0x00]						//perform the doubleload (r2, the instruction word, is no longer needed)
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart, so the slot of Rd is at state + Rd * 4 (a *2 scale hit the wrong registers for any Rd but r0)
	adds   r1, r4
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	mov    pc, r8								//back to next_instr

dp_sd_10101:									//LDRSB Rd, [Rn, -#imm] : sign-extended byte load from Rn - imm8, Rn unchanged
	lsls   r3, r2, #12							//Rn field (instr bits 19..16) -> r3
	lsrs   r3, #28
	lsls   r3, #2								//as a word offset into the register file
	lsls   r1, r2, #16							//Rd field (instr bits 15..12) -> r1
	lsrs   r1, #28
	lsls   r1, #2								//as a word offset, ready for the result store
	lsls   r7, r2, #20							//high nibble of the offset = instr bits 11..8
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6								//low nibble is expected in r6 on entry; r7 = imm8
	ldr    r5, [r4, r3]							//r5 = Rn's value
	subs   r5, r7								//effective address = Rn - imm8
	movs   r6, #0								//the host only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r6, [r5, r6]
	str    r6, [r4, r1]							//Rd <- loaded value; no writeback
	mov    pc, r8								//back to next_instr

dp_sd_10110:									//LDRD Rd, [Rn, -#imm]!
	//Pre-indexed doubleword load with writeback: addr = Rn - imm8; Rd = [addr]; Rd+1 = [addr + 4]; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//predecrement: r5 = effective address
	ldr    r6, [r5, #0x00]						//perform the doubleload
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart in the state, so scale Rd by 4 (was wrongly scaled by 2)
	adds   r1, r4								//r1 = address of Rd's slot
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to the fetch loop

dp_sd_10111:									//LDRSB Rd, [Rn, -#imm]!
	//Pre-indexed signed-byte load with writeback: addr = Rn - imm8; Rd = sign-extended byte at addr; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = Rd * 4: byte offset of Rd's slot in the state
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r7, [r4, r0]							//r7 = Rn's value
	subs   r7, r1								//r7 = effective address (Rn - imm)
	movs   r5, #0								//Cortex-M0 only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r5, [r7, r5]
	str    r5, [r4, r3]							//Rd <- loaded value
	str    r7, [r4, r0]							//Rn <- effective address (writeback, done last)
	mov    pc, r8								//back to the fetch loop

dp_sd_11000:									//LDRD Rd, [Rn, +Rm]
	//Offset doubleword load, no writeback: addr = Rn + Rm; Rd = [addr]; Rd+1 = [addr + 4].
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r6, #2							//get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	adds   r5, r7								//preincrement: r5 = effective address
	ldr    r6, [r5, #0x00]						//perform the doubleload
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart in the state, so scale Rd by 4 (was wrongly scaled by 2)
	adds   r1, r4								//r1 = address of Rd's slot
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	mov    pc, r8								//back to the fetch loop

dp_sd_11001:									//LDRSB Rd, [Rn, +Rm]
	//Offset signed-byte load, no writeback: Rd = sign-extended byte at (Rn + Rm).
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = Rd * 4: byte offset of Rd's slot in the state
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r7, [r4, r0]							//r7 = Rn's value
	ldrsb  r5, [r7, r1]							//the host [reg, reg] form performs the Rn + Rm addition for us
	str    r5, [r4, r3]							//Rd <- loaded value
	mov    pc, r8								//back to the fetch loop

dp_sd_11010:									//LDRD Rd, [Rn, +Rm]!
	//Pre-indexed doubleword load with writeback: addr = Rn + Rm; Rd = [addr]; Rd+1 = [addr + 4]; Rn = addr.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r6, #2							//get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	adds   r5, r7								//preincrement: r5 = effective address
	ldr    r6, [r5, #0x00]						//perform the doubleload
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart in the state, so scale Rd by 4 (was wrongly scaled by 2)
	adds   r1, r4								//r1 = address of Rd's slot
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to the fetch loop

dp_sd_11011:									//LDRSB Rd, [Rn, +Rm]!
	//Pre-indexed signed-byte load with writeback: addr = Rn + Rm; Rd = sign-extended byte at addr; Rn = addr.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = Rd * 4: byte offset of Rd's slot in the state
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r7, [r4, r0]							//r7 = Rn's value
	adds   r7, r1								//r7 = effective address (Rn + Rm)
	movs   r5, #0								//Cortex-M0 only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r5, [r7, r5]
	str    r5, [r4, r3]							//Rd <- loaded value
	str    r7, [r4, r0]							//Rn <- effective address (writeback, done last)
	mov    pc, r8								//back to the fetch loop

dp_sd_11100:									//LDRD Rd, [Rn, +#imm]
	//Offset doubleword load, no writeback: addr = Rn + imm8; Rd = [addr]; Rd+1 = [addr + 4].
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	adds   r5, r7								//preincrement: apply the immediate (previously computed but never added)
	ldr    r6, [r5, #0x00]						//perform the doubleload
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart in the state, so scale Rd by 4 (was wrongly scaled by 2)
	adds   r1, r4								//r1 = address of Rd's slot
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	mov    pc, r8								//back to the fetch loop

dp_sd_11101:									//LDRSB Rd, [Rn, +#imm]
	//Offset signed-byte load, no writeback: Rd = sign-extended byte at (Rn + imm8).
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = Rd * 4: byte offset of Rd's slot in the state
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r7, [r4, r0]							//r7 = Rn's value
	ldrsb  r5, [r7, r1]							//the host [reg, reg] form performs the Rn + imm addition for us
	str    r5, [r4, r3]							//Rd <- loaded value
	mov    pc, r8								//back to the fetch loop

dp_sd_11110:									//LDRD Rd, [Rn, +#imm]!
	//Pre-indexed doubleword load with writeback: addr = Rn + imm8; Rd = [addr]; Rd+1 = [addr + 4]; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	adds   r5, r7								//preincrement: r5 = effective address
	ldr    r6, [r5, #0x00]						//perform the doubleload
	ldr    r2, [r5, #0x04]
	lsls   r1, #2								//register slots are 4 bytes apart in the state, so scale Rd by 4 (was wrongly scaled by 2)
	adds   r1, r4								//r1 = address of Rd's slot
	str    r6, [r1, #0x00]						//store the two words into (Rd, Rd+1)
	str    r2, [r1, #0x04]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to the fetch loop

dp_sd_11111:									//LDRSB Rd, [Rn, +#imm]!
	//Pre-indexed signed-byte load with writeback: addr = Rn + imm8; Rd = sign-extended byte at addr; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = Rd * 4: byte offset of Rd's slot in the state
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r7, [r4, r0]							//r7 = Rn's value
	adds   r7, r1								//r7 = effective address (Rn + imm)
	movs   r5, #0								//Cortex-M0 only has the [reg, reg] form of LDRSB, so use a zero index
	ldrsb  r5, [r7, r5]
	str    r5, [r4, r3]							//Rd <- loaded value
	str    r7, [r4, r0]							//Rn <- effective address (writeback, done last)
	mov    pc, r8								//back to the fetch loop

dp_strd_sh:										// table 3.2 (STRD, LDRSH)
	//Second-level dispatch for STRD / LDRSH on instr[24:20] ('P','U','i','W','L').
	//r1 = instr[24:19], i.e. the 5-bit selector shifted left by 1 with a don't-care low bit;
	//"add pc, r1" then lands on the matching 2-byte branch below (see the dispatch note at the top of the file).
	//The table must stay exactly 32 entries long, one 16-bit "b" per selector value, in ascending order.
	lsls   r1, r2, #7							//get 'PUiWL' LSL 1-> r1
	lsrs   r1, #26
	add    pc, r1								//dispatch on 'PUiWL'
	nop
	b dp_hd_00000								//STRD  Rd, [Rn], -Rm
	b dp_hd_00001								//LDRSH Rd, [Rn], -Rm
	b dp_hd_undef
	b dp_hd_undef
	b dp_hd_00100								//STRD  Rd, [Rn], -#imm
	b dp_hd_00101								//LDRSH Rd, [Rn], -#imm
	b dp_hd_undef
	b dp_hd_undef
	b dp_hd_01000								//STRD  Rd, [Rn], +Rm
	b dp_hd_01001								//LDRSH Rd, [Rn], +Rm
	b dp_hd_undef
	b dp_hd_undef
	b dp_hd_01100								//STRD  Rd, [Rn], +#imm
	b dp_hd_01101								//LDRSH Rd, [Rn], +#imm
	b dp_hd_undef
	b dp_hd_undef
	b dp_hd_10000								//STRD  Rd, [Rn, -Rm]
	b dp_hd_10001								//LDRSH Rd, [Rn, -Rm]
	b dp_hd_10010								//STRD  Rd, [Rn, -Rm]!
	b dp_hd_10011								//LDRSH Rd, [Rn, -Rm]!
	b dp_hd_10100								//STRD  Rd, [Rn, -#imm]
	b dp_hd_10101								//LDRSH Rd, [Rn, -#imm]
	b dp_hd_10110								//STRD  Rd, [Rn, -#imm]!
	b dp_hd_10111								//LDRSH Rd, [Rn, -#imm]!
	b dp_hd_11000								//STRD  Rd, [Rn, +Rm]
	b dp_hd_11001								//LDRSH Rd, [Rn, +Rm]
	b dp_hd_11010								//STRD  Rd, [Rn, +Rm]!
	b dp_hd_11011								//LDRSH Rd, [Rn, +Rm]!
	b dp_hd_11100								//STRD  Rd, [Rn, +#imm]
	b dp_hd_11101								//LDRSH Rd, [Rn, +#imm]
	b dp_hd_11110								//STRD  Rd, [Rn, +#imm]! (previously routed to the LDRSH handler dp_hd_11111)
	b dp_hd_11111								//LDRSH Rd, [Rn, +#imm]!

dp_hd_undef:									//target of the unused slots in the dp_strd_sh table
	udf    #0									//permanently-undefined host instruction: trap

dp_hd_00000:									//STRD Rd, [Rn], -Rm
	//Post-indexed doubleword store: [Rn] = Rd; [Rn + 4] = Rd+1; then Rn = Rn - Rm.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value = store address
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	subs   r7, r1								//post-decrement the base by Rm
	str    r7, [r4, r0]							//Rn <- updated base
	mov    pc, r8								//back to the fetch loop

dp_hd_00001:									//LDRSH Rd, [Rn], -Rm
	//Post-indexed signed-halfword load: Rd = sign-extended halfword at Rn; then Rn = Rn - Rm.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r6, #2							//get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	subs   r5, r7								//postdecrement
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to the fetch loop

dp_hd_00100:									//STRD Rd, [Rn], -#imm
	//Post-indexed doubleword store: [Rn] = Rd; [Rn + 4] = Rd+1; then Rn = Rn - imm8.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value = store address
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	subs   r7, r1								//post-decrement the base by imm
	str    r7, [r4, r0]							//Rn <- updated base
	mov    pc, r8								//back to the fetch loop

dp_hd_00101:									//LDRSH Rd, [Rn], -#imm
	//Post-indexed signed-halfword load: Rd = sign-extended halfword at Rn; then Rn = Rn - imm8.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	subs   r5, r7								//postdecrement
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to the fetch loop

dp_hd_01000:									//STRD Rd, [Rn], +Rm
	//Post-indexed doubleword store: [Rn] = Rd; [Rn + 4] = Rd+1; then Rn = Rn + Rm.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value = store address
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	adds   r7, r1								//post-increment the base by Rm
	str    r7, [r4, r0]							//Rn <- updated base
	mov    pc, r8								//back to the fetch loop

dp_hd_01001:									//LDRSH Rd, [Rn], +Rm
	//Post-indexed signed-halfword load: Rd = sign-extended halfword at Rn; then Rn = Rn + Rm.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r6, #2							//get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	adds   r5, r7								//postincrement
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to the fetch loop

dp_hd_01100:									//STRD Rd, [Rn], +#imm
	//Post-indexed doubleword store: [Rn] = Rd; [Rn + 4] = Rd+1; then Rn = Rn + imm8.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value = store address
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	adds   r7, r1								//post-increment the base by imm
	str    r7, [r4, r0]							//Rn <- updated base
	mov    pc, r8								//back to the fetch loop

dp_hd_01101:									//LDRSH Rd, [Rn], +#imm
	//Post-indexed signed-halfword load: Rd = sign-extended halfword at Rn; then Rn = Rn + imm8.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	adds   r5, r7								//postincrement
	str    r5, [r4, r3]							//set Rn's value <- r5
	mov    pc, r8								//back to the fetch loop

dp_hd_10000:									//STRD Rd, [Rn, -Rm]
	//Offset doubleword store, no writeback: addr = Rn - Rm; [addr] = Rd; [addr + 4] = Rd+1.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value
	subs   r7, r1								//r7 = effective address (Rn - Rm)
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	mov    pc, r8								//back to the fetch loop

dp_hd_10001:									//LDRSH Rd, [Rn, -Rm]
	//Offset signed-halfword load, no writeback: Rd = sign-extended halfword at (Rn - Rm).
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r6, #2							//get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//predecrement: r5 = effective address
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	mov    pc, r8								//back to the fetch loop

dp_hd_10010:									//STRD Rd, [Rn, -Rm]!
	//Pre-indexed doubleword store with writeback: addr = Rn - Rm; [addr] = Rd; [addr + 4] = Rd+1; Rn = addr.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value
	subs   r7, r1								//r7 = effective address (Rn - Rm)
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	str    r7, [r4, r0]							//Rn <- effective address (writeback)
	mov    pc, r8								//back to the fetch loop

dp_hd_10011:									//LDRSH Rd, [Rn, -Rm]!
	//Pre-indexed signed-halfword load with writeback: addr = Rn - Rm; Rd = sign-extended halfword at addr; Rn = addr.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r6, #2							//get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//predecrement: r5 = effective address
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to the fetch loop

dp_hd_10100:									//STRD Rd, [Rn, -#imm]
	//Offset doubleword store, no writeback: addr = Rn - imm8; [addr] = Rd; [addr + 4] = Rd+1.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value
	subs   r7, r1								//r7 = effective address (Rn - imm)
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	mov    pc, r8								//back to the fetch loop

dp_hd_10101:									//LDRSH Rd, [Rn, -#imm]
	//Offset signed-halfword load, no writeback: Rd = sign-extended halfword at (Rn - imm8).
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//predecrement: r5 = effective address
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	mov    pc, r8								//back to the fetch loop

dp_hd_10110:									//STRD Rd, [Rn, -#imm]!
	//Pre-indexed doubleword store with writeback: addr = Rn - imm8; [addr] = Rd; [addr + 4] = Rd+1; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value
	subs   r7, r1								//r7 = effective address (Rn - imm)
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	str    r7, [r4, r0]							//Rn <- effective address (writeback)
	mov    pc, r8								//back to the fetch loop

dp_hd_10111:									//LDRSH Rd, [Rn, -#imm]!
	//Pre-indexed signed-halfword load with writeback: addr = Rn - imm8; Rd = sign-extended halfword at addr; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	subs   r5, r7								//predecrement: r5 = effective address
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to the fetch loop

dp_hd_11000:									//STRD Rd, [Rn, +Rm]
	//Offset doubleword store, no writeback: addr = Rn + Rm; [addr] = Rd; [addr + 4] = Rd+1.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value
	adds   r7, r1								//r7 = effective address (Rn + Rm)
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	mov    pc, r8								//back to the fetch loop

dp_hd_11001:									//LDRSH Rd, [Rn, +Rm]
	//Offset signed-halfword load, no writeback: Rd = sign-extended halfword at (Rn + Rm).
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = Rd * 4: byte offset of Rd's slot in the state
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r7, [r4, r0]							//r7 = Rn's value
	ldrsh  r5, [r7, r1]							//the host [reg, reg] form performs the Rn + Rm addition for us
	str    r5, [r4, r3]							//Rd <- loaded value
	mov    pc, r8								//back to the fetch loop

dp_hd_11010:									//STRD Rd, [Rn, +Rm]!
	//Pre-indexed doubleword store with writeback: addr = Rn + Rm; [addr] = Rd; [addr + 4] = Rd+1; Rn = addr.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r6, #2							//r1 = current value of Rm
	ldr    r1, [r4, r1]
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value
	adds   r7, r1								//r7 = effective address (Rn + Rm)
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	str    r7, [r4, r0]							//Rn <- effective address (writeback)
	mov    pc, r8								//back to the fetch loop

dp_hd_11011:									//LDRSH Rd, [Rn, +Rm]!
	//Pre-indexed signed-halfword load with writeback: addr = Rn + Rm; Rd = sign-extended halfword at addr; Rn = addr.
	//In: r2 = instruction word, r6 = Rm field (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r6, #2							//get Rm's value -> r7
	ldr    r7, [r4, r7]
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	adds   r5, r7								//preincrement: r5 = effective address
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to the fetch loop

dp_hd_11100:									//STRD Rd, [Rn, +#imm]
	//Offset doubleword store, no writeback: addr = Rn + imm8; [addr] = Rd; [addr + 4] = Rd+1.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	adds   r5, r7								//preincrement: apply the immediate (previously computed but never added)
	lsls   r1, #2								//calc where we'll get the two regs
	adds   r1, r4
	ldr    r6, [r1, #0x00]						//get the two words from (Rd, Rd+1)
	ldr    r2, [r1, #0x04]
	str    r6, [r5, #0x00]						//perform the doublestore
	str    r2, [r5, #0x04]
	mov    pc, r8								//back to the fetch loop

dp_hd_11101:									//LDRSH Rd, [Rn, +#imm]
	//Offset signed-halfword load, no writeback: Rd = sign-extended halfword at (Rn + imm8).
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = Rd * 4: byte offset of Rd's slot in the state
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r7, [r4, r0]							//r7 = Rn's value
	ldrsh  r5, [r7, r1]							//the host [reg, reg] form performs the Rn + imm addition for us
	str    r5, [r4, r3]							//Rd <- loaded value
	mov    pc, r8								//back to the fetch loop

dp_hd_11110:									//STRD Rd, [Rn, +#imm]!
	//Pre-indexed doubleword store with writeback: addr = Rn + imm8; [addr] = Rd; [addr + 4] = Rd+1; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r1, r2, #20							//r1 = (instr[11:8] << 4) | r6, the 8-bit offset
	lsrs   r1, #28
	lsls   r1, #4
	orrs   r1, r6
	lsls   r0, r2, #12							//r0 = Rn * 4: byte offset of Rn's slot in the state
	lsrs   r0, #28
	lsls   r0, #2
	lsls   r3, r2, #16							//r3 = address of Rd's slot (r4 + Rd * 4)
	lsrs   r3, #28
	lsls   r3, #2
	adds   r3, r4
	ldr    r7, [r4, r0]							//r7 = Rn's value
	adds   r7, r1								//r7 = effective address (Rn + imm)
	ldr    r5, [r3, #0x00]						//fetch the register pair (Rd, Rd+1)
	ldr    r6, [r3, #0x04]
	str    r5, [r7, #0x00]						//write both words to memory
	str    r6, [r7, #0x04]
	str    r7, [r4, r0]							//Rn <- effective address (writeback)
	mov    pc, r8								//back to the fetch loop

dp_hd_11111:									//LDRSH Rd, [Rn, +#imm]!
	//Pre-indexed signed-halfword load with writeback: addr = Rn + imm8; Rd = sign-extended halfword at addr; Rn = addr.
	//In: r2 = instruction word, r6 = low nibble of the immediate (prepared before dispatch, outside this block), r4 = state.
	lsls   r7, r2, #20							//calculate imm -> r7: (instr[11:8] << 4) | r6
	lsrs   r7, #28
	lsls   r7, #4
	orrs   r7, r6
	lsls   r1, r2, #16							//get Rd -> r1
	lsrs   r1, #28
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//premultiply Rn by 4
	ldr    r5, [r4, r3]							//get Rn's value -> r5
	adds   r5, r7								//preincrement: r5 = effective address
	movs   r6, #0								//perform the load (C-M0 can only do LDRSH in [reg+reg] form so we must first create a zero)
	ldrsh  r6, [r5, r6]							//halfword load (was ldrsb, which only fetched and sign-extended one byte)
	lsls   r1, #2								//set Rd's value <- r6
	str    r6, [r4, r1]
	str    r5, [r4, r3]							//set Rn's value <- r5 (writeback)
	mov    pc, r8								//back to the fetch loop

//Data-processing dispatcher: selects the handler for one ARM data-processing instruction.
//The selector is instr[25:20] (bit 25, the 4-bit opcode, and the S bit). r3 receives instr[25:19],
//i.e. that selector shifted left by 1 with a don't-care low bit, and "add pc, r3" lands on the matching
//2-byte branch below (see the dispatch note at the top of the file). The table must therefore stay
//exactly 64 entries of 16-bit "b", in ascending selector order. Clobbers r3.
dp_dispatch:									//r0 has op2, r1 has APSR with C bit set to shifter carry out, r2 still has instr
	lsls   r3, r2, #6							//get [[25]opcode..S] LSL 1 for dispatch
	lsrs   r3, #25
	add    pc, r3								//dispatch on it
	nop
	//entries 0..31: instr[25] = 0
	b dp_and
	b dp_ands
	b dp_eor
	b dp_eors
	b dp_sub
	b dp_subs
	b dp_rsb
	b dp_rsbs
	b dp_add
	b dp_adds
	b dp_adc
	b dp_adcs
	b dp_sbc
	b dp_sbcs
	b dp_rsc
	b dp_rscs
	b dp_0x10									//MRS Rx, CPSR or edsp instrs
	b dp_tst
	b dp_0x12									//MSR CPSR, Rx or BX or BLX or BKPT or edsp instrs
	b dp_teq
	b dp_0x14									//MRS Rx, SPSR or edsp instrs
	b dp_cmp
	b dp_0x16									//MSR SPSR, Rx or CLZ or edsp instrs
	b dp_cmn
	b dp_orr
	b dp_orrs
	b dp_mov
	b dp_movs
	b dp_bic
	b dp_bics
	b dp_mvn
	b dp_mvns
	//entries 32..63: instr[25] = 1; same handlers, except the S=0 compare-class slots, which are MSR-immediate or undefined
	b dp_and
	b dp_ands
	b dp_eor
	b dp_eors
	b dp_sub
	b dp_subs
	b dp_rsb
	b dp_rsbs
	b dp_add
	b dp_adds
	b dp_adc
	b dp_adcs
	b dp_sbc
	b dp_sbcs
	b dp_rsc
	b dp_rscs
	b dp_udf
	b dp_tst
	b dp_msri									//MSR CPSR, imm
	b dp_teq
	b dp_udf
	b dp_cmp
	b dp_udf									//MSR SPSR, imm - unsupported
	b dp_cmn
	b dp_orr
	b dp_orrs
	b dp_mov
	b dp_movs
	b dp_bic
	b dp_bics
	b dp_mvn
	b dp_mvns

dp_udf:											//target of the undefined/unsupported slots in the dp_dispatch table
	udf    #0									//permanently-undefined host instruction: trap

dp_and:											//AND Rd, Rn, op2 (flags untouched)
	//In: r0 = op2, r2 = instruction word, r4 = state. Emulated SR (r10) is not modified.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r7, r2, #12							//r7 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r7, #28
	lsls   r7, #2
	ldr    r7, [r4, r7]
	ands   r7, r0								//result = Rn & op2
	str    r7, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc (bl used only for its range)

dp_ands:										//ANDS Rd, Rn, op2
	//In: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instruction word, r4 = state.
	//Out: r10 = host flags after the ANDS, with C replaced by the shifter carry-out.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r3, r2, #12							//r3 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r3, [r4, r3]
	msr    APSR_nzcvq, r10						//preload host flags from the emulated SR so bits the ANDS leaves alone carry over
	ands   r3, r0								//result = Rn & op2
	mrs    r7, APSR								//r7 = flags after the operation
	str    r3, [r4, r6]							//Rd <- result
	movs   r2, #1								//r2 = 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, #29
	bics   r7, r2								//drop the host's C
	ands   r1, r2								//isolate the shifter carry-out
	orrs   r7, r1								//and put it in C's place
	mov    r10, r7								//commit the new emulated SR
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_eor:											//EOR Rd, Rn, op2 (flags untouched)
	//In: r0 = op2, r2 = instruction word, r4 = state. Emulated SR (r10) is not modified.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r7, r2, #12							//r7 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r7, #28
	lsls   r7, #2
	ldr    r7, [r4, r7]
	eors   r7, r0								//result = Rn ^ op2
	str    r7, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_eors:										//EORS Rd, Rn, op2
	//In: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instruction word, r4 = state.
	//Out: r10 = host flags after the EORS, with C replaced by the shifter carry-out.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r3, r2, #12							//r3 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r3, [r4, r3]
	msr    APSR_nzcvq, r10						//preload host flags from the emulated SR so bits the EORS leaves alone carry over
	eors   r3, r0								//result = Rn ^ op2
	mrs    r7, APSR								//r7 = flags after the operation
	str    r3, [r4, r6]							//Rd <- result
	movs   r2, #1								//r2 = 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, #29
	bics   r7, r2								//drop the host's C
	ands   r1, r2								//isolate the shifter carry-out
	orrs   r7, r1								//and put it in C's place
	mov    r10, r7								//commit the new emulated SR
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_orr:											//ORR Rd, Rn, op2 (flags untouched)
	//In: r0 = op2, r2 = instruction word, r4 = state. Emulated SR (r10) is not modified.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r7, r2, #12							//r7 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r7, #28
	lsls   r7, #2
	ldr    r7, [r4, r7]
	orrs   r7, r0								//result = Rn | op2
	str    r7, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_orrs:										//ORRS Rd, Rn, op2
	//In: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instruction word, r4 = state.
	//Out: r10 = host flags after the ORRS, with C replaced by the shifter carry-out.
	msr    APSR_nzcvq, r10						//move it to APSR
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//get Rn's value -> r3
	ldr    r3, [r4, r3]
	orrs   r3, r0
	mrs    r7, APSR								//grab resulting CPSR into r7
	lsls   r0, r2, #16							//get Rd -> r0
	lsrs   r0, 28
	lsls   r2, r0, #2							//set Rd's value <- r3 (clobbers r2 aka "instr")
	str    r3, [r4, r2]
	movs   r2, #1								//create 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, #29
	bics   r7, r2								//clear sr's C
	ands   r2, r1								//grab C from shifter carry out
	orrs   r7, r2								//insert it into sr's C bit's spot
	mov    r10, r7								//set CPSR <- r7
	cmp    r0, #15								//did we write pc?
	bne    j_next_instr_2						//if not, normal next instr
	bl     pc_changed_not_thumb					//if yes, handle that, like every sibling handler (this path used to be commented out, so a write to pc was ignored)

j_next_instr_2:									//nearby landing pad for the short-range "bne j_next_instr_2" exits of the surrounding dp_* handlers
	mov    pc, r8								//r8 holds the next_instr label: resume the fetch loop

dp_bic:											//BIC Rd, Rn, op2 (flags untouched)
	//In: r0 = op2, r2 = instruction word, r4 = state. Emulated SR (r10) is not modified.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r7, r2, #12							//r7 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r7, #28
	lsls   r7, #2
	ldr    r7, [r4, r7]
	bics   r7, r0								//result = Rn & ~op2
	str    r7, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_bics:										//BICS Rd, Rn, op2
	//In: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instruction word, r4 = state.
	//Out: r10 = host flags after the BICS, with C replaced by the shifter carry-out.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r3, r2, #12							//r3 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r3, [r4, r3]
	msr    APSR_nzcvq, r10						//preload host flags from the emulated SR so bits the BICS leaves alone carry over
	bics   r3, r0								//result = Rn & ~op2
	mrs    r7, APSR								//r7 = flags after the operation
	str    r3, [r4, r6]							//Rd <- result
	movs   r2, #1								//r2 = 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, #29
	bics   r7, r2								//drop the host's C
	ands   r1, r2								//isolate the shifter carry-out
	orrs   r7, r1								//and put it in C's place
	mov    r10, r7								//commit the new emulated SR
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_mvn:											//MVN Rd, op2 (flags untouched)
	//In: r0 = op2, r2 = instruction word, r4 = state. Emulated SR (r10) is not modified.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	mvns   r0, r0								//result = ~op2
	str    r0, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_mvns:										//MVNS Rd, op2
	//In: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instruction word, r4 = state.
	//Out: r10 = host flags after the MVNS, with C replaced by the shifter carry-out.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	msr    APSR_nzcvq, r10						//preload host flags from the emulated SR so bits the MVNS leaves alone carry over
	mvns   r3, r0								//result = ~op2
	mrs    r7, APSR								//r7 = flags after the operation
	str    r3, [r4, r6]							//Rd <- result
	movs   r2, #1								//r2 = 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, #29
	bics   r7, r2								//drop the host's C
	ands   r1, r2								//isolate the shifter carry-out
	orrs   r7, r1								//and put it in C's place
	mov    r10, r7								//commit the new emulated SR
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_sub:											//SUB Rd, Rn, op2 (flags untouched)
	//In: r0 = op2, r2 = instruction word, r4 = state. Emulated SR (r10) is not modified.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r7, r2, #12							//r7 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r7, #28
	lsls   r7, #2
	ldr    r7, [r4, r7]
	subs   r7, r0								//result = Rn - op2
	str    r7, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_subs:										//SUBS Rd, Rn, op2
	//In: r0 = op2, r2 = instruction word, r4 = state.
	//Out: r10 = host APSR as left by the SUBS.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r3, r2, #12							//r3 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r3, #28
	lsls   r3, #2
	ldr    r3, [r4, r3]
	subs   r3, r0								//result = Rn - op2, host flags updated
	mrs    r7, APSR								//capture them before anything else touches the flags
	mov    r10, r7								//commit the new emulated SR
	str    r3, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_add:											//ADD Rd, Rn, op2 (flags untouched)
	//In: r0 = op2, r2 = instruction word, r4 = state. Emulated SR (r10) is not modified.
	lsls   r5, r2, #16							//r5 = Rd number (instr[15:12])
	lsrs   r5, #28
	lsls   r6, r5, #2							//r6 = Rd * 4: byte offset of Rd's slot in the state
	lsls   r7, r2, #12							//r7 = Rn's value (slot at instr[19:16] * 4)
	lsrs   r7, #28
	lsls   r7, #2
	ldr    r7, [r4, r7]
	adds   r7, r0								//result = Rn + op2
	str    r7, [r4, r6]							//Rd <- result
	cmp    r5, #15								//was the destination the pc?
	bne    j_next_instr_2						//no: carry on with the next instruction
	bl     pc_changed_not_thumb					//yes: refetch from the new pc

dp_adds:										//ADDS Rd, Rn, op2. in: r0 = op2, r2 = instr, r4 = state. out: r10 (sr) <- resulting flags
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//get Rn's value -> r3
	ldr    r3, [r4, r3]
	adds   r3, r0								//r3 = Rn + op2, host flags now hold the guest's result flags
	mrs    r7, APSR								//grab resulting CPSR into r7 (before the shifts below clobber the flags)
	lsls   r0, r2, #16							//get Rd -> r0
	lsrs   r0, #28
	lsls   r2, r0, #2							//set Rd's value <- r3 (clobbers r2 aka "instr")
	str    r3, [r4, r2]
	mov    r10, r7								//set CPSR <- r7
	cmp    r0, #15								//did we write pc?
	bne    j_next_instr_2						//if not, normal next instr
	bl     pc_changed_not_thumb					//if yes, handle that (BL used as a long branch, does not come back)

dp_rsb:											//RSB: Rd = op2 - Rn, guest flags untouched. in: r0 = op2, r2 = instr, r4 = state
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	subs   r0, r0, r3							//r0 = op2 - Rn
	lsls   r3, r2, #16							//r3 = instr bits 12..15 (Rd index)
	lsrs   r3, r3, #28
	lsls   r2, r3, #2							//r2 = Rd's byte offset (instr is not needed any more)
	str    r0, [r4, r2]							//Rd <- result
	cmp    r3, #15								//was pc the destination?
	bne    j_next_instr_3						//no: carry on with the next instr
	bl     pc_changed_not_thumb					//yes: refetch from the new pc (BL used as a long branch)

dp_rsbs:										//RSBS: Rd = op2 - Rn, sr updated. in: r0 = op2, r2 = instr, r4 = state
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	subs   r0, r0, r3							//r0 = op2 - Rn, host flags = guest's result flags
	mrs    r7, APSR								//capture them right away, later shifts would destroy them
	lsls   r3, r2, #16							//r3 = instr bits 12..15 (Rd index)
	lsrs   r3, r3, #28
	lsls   r2, r3, #2							//r2 = Rd's byte offset (instr is not needed any more)
	str    r0, [r4, r2]							//Rd <- result
	mov    r10, r7								//sr <- captured flags
	cmp    r3, #15								//was pc the destination?
	bne    j_next_instr_3						//no: carry on with the next instr
	bl     pc_changed_not_thumb					//yes: refetch from the new pc (BL used as a long branch)

dp_adc:											//ADC Rd, Rn, op2 (guest flags untouched). in: r0 = op2, r2 = instr, r4 = state, r10 = sr
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//get Rn's value -> r3
	ldr    r3, [r4, r3]
	msr    APSR_nzcvq, r10						//get CPSR -> APSR (so adcs consumes the guest's carry)
	adcs   r3, r0								//r3 = Rn + op2 + C
	lsls   r0, r2, #16							//get Rd -> r0
	lsrs   r0, #28
	lsls   r2, r0, #2							//set Rd's value <- r3 (clobbers r2 aka "instr")
	str    r3, [r4, r2]
	cmp    r0, #15								//did we write pc?
	bne    j_next_instr_3						//if not, normal next instr
	bl     pc_changed_not_thumb					//if yes, handle that (BL used as a long branch, does not come back)

dp_adcs:										//ADCS Rd, Rn, op2. in: r0 = op2, r2 = instr, r4 = state, r10 = sr. out: r10 <- resulting flags
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//get Rn's value -> r3
	ldr    r3, [r4, r3]
	msr    APSR_nzcvq, r10						//get CPSR -> APSR (so adcs consumes the guest's carry)
	adcs   r3, r0								//r3 = Rn + op2 + C, host flags now hold the guest's result flags
	mrs    r7, APSR								//grab resulting CPSR into r7 (before the shifts below clobber the flags)
	lsls   r0, r2, #16							//get Rd -> r0
	lsrs   r0, #28
	lsls   r2, r0, #2							//set Rd's value <- r3 (clobbers r2 aka "instr")
	str    r3, [r4, r2]
	mov    r10, r7								//set CPSR <- r7
	cmp    r0, #15								//did we write pc?
	bne    j_next_instr_3						//if not, normal next instr
	bl     pc_changed_not_thumb					//if yes, handle that (BL used as a long branch, does not come back)

dp_sbc:											//SBC Rd, Rn, op2 (guest flags untouched). in: r0 = op2, r2 = instr, r4 = state, r10 = sr
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//get Rn's value -> r3
	ldr    r3, [r4, r3]
	msr    APSR_nzcvq, r10						//get CPSR -> APSR (so sbcs consumes the guest's carry)
	sbcs   r3, r0								//r3 = Rn - op2 - !C
	lsls   r0, r2, #16							//get Rd -> r0
	lsrs   r0, #28
	lsls   r2, r0, #2							//set Rd's value <- r3 (clobbers r2 aka "instr")
	str    r3, [r4, r2]
	cmp    r0, #15								//did we write pc?
	bne    j_next_instr_3						//if not, normal next instr
	bl     pc_changed_not_thumb					//if yes, handle that (BL used as a long branch, does not come back)

dp_sbcs:										//SBCS Rd, Rn, op2. in: r0 = op2, r2 = instr, r4 = state, r10 = sr. out: r10 <- resulting flags
	lsls   r3, r2, #12							//get Rn -> r3
	lsrs   r3, #28
	lsls   r3, #2								//get Rn's value -> r3
	ldr    r3, [r4, r3]
	msr    APSR_nzcvq, r10						//get CPSR -> APSR (so sbcs consumes the guest's carry)
	sbcs   r3, r0								//r3 = Rn - op2 - !C, host flags now hold the guest's result flags
	mrs    r7, APSR								//grab resulting CPSR into r7 (before the shifts below clobber the flags)
	lsls   r0, r2, #16							//get Rd -> r0
	lsrs   r0, #28
	lsls   r2, r0, #2							//set Rd's value <- r3 (clobbers r2 aka "instr")
	str    r3, [r4, r2]
	mov    r10, r7								//set CPSR <- r7
	cmp    r0, #15								//did we write pc?
	bne    j_next_instr_3						//if not, normal next instr
	bl     pc_changed_not_thumb					//if yes, handle that (BL used as a long branch, does not come back)

j_next_instr_3:									//trampoline to next_instr: target of the "bne j_next_instr_3" in the surrounding handlers
	mov    pc, r8								//r8 holds the address of the next_instr label

dp_rsc:											//RSC: Rd = op2 - Rn - !C, guest flags untouched. in: r0 = op2, r2 = instr, r4 = state, r10 = sr
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	msr    APSR_nzcvq, r10						//bring the guest's carry into the host flags
	sbcs   r0, r0, r3							//r0 = op2 - Rn - !C
	lsls   r3, r2, #16							//r3 = instr bits 12..15 (Rd index)
	lsrs   r3, r3, #28
	lsls   r2, r3, #2							//r2 = Rd's byte offset (instr is not needed any more)
	str    r0, [r4, r2]							//Rd <- result
	cmp    r3, #15								//was pc the destination?
	bne    j_next_instr_3						//no: carry on with the next instr
	bl     pc_changed_not_thumb					//yes: refetch from the new pc (BL used as a long branch)

dp_rscs:										//RSCS: Rd = op2 - Rn - !C, sr updated. in: r0 = op2, r2 = instr, r4 = state, r10 = sr
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	msr    APSR_nzcvq, r10						//bring the guest's carry into the host flags
	sbcs   r0, r0, r3							//r0 = op2 - Rn - !C, host flags = guest's result flags
	mrs    r7, APSR								//capture them right away, later shifts would destroy them
	lsls   r3, r2, #16							//r3 = instr bits 12..15 (Rd index)
	lsrs   r3, r3, #28
	lsls   r2, r3, #2							//r2 = Rd's byte offset (instr is not needed any more)
	str    r0, [r4, r2]							//Rd <- result
	mov    r10, r7								//sr <- captured flags
	cmp    r3, #15								//was pc the destination?
	bne    j_next_instr_3						//no: carry on with the next instr
	bl     pc_changed_not_thumb					//yes: refetch from the new pc (BL used as a long branch)

dp_mov:											//MOV: Rd = op2, guest flags untouched. in: r0 = op2 (already computed), r2 = instr, r4 = state
	lsls   r3, r2, #16							//r3 = instr bits 12..15 (Rd index)
	lsrs   r3, r3, #28
	lsls   r2, r3, #2							//r2 = Rd's byte offset (instr is not needed any more)
	str    r0, [r4, r2]							//Rd <- op2
	cmp    r3, #15								//was pc the destination?
	bne    j_next_instr_3						//no: carry on with the next instr
	bl     pc_changed_not_thumb					//yes: refetch from the new pc (BL used as a long branch)

dp_movs:										//MOVS: Rd = op2; N,Z from op2, C from shifter, V kept. in: r0 = op2, r1 = shifter carry out (bit 29), r2 = instr
	msr    APSR_nzcvq, r10						//start from the guest's flags so that V carries through
	tst    r0, r0								//N,Z <- op2
	mrs    r7, APSR								//r7 = flags so far
	lsls   r3, r2, #16							//r3 = instr bits 12..15 (Rd index)
	lsrs   r3, r3, #28
	lsls   r2, r3, #2							//r2 = Rd's byte offset (instr is not needed any more)
	str    r0, [r4, r2]							//Rd <- op2
	movs   r2, #1								//r2 = 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, r2, #29
	bics   r7, r7, r2							//drop whatever C we had
	ands   r2, r2, r1							//isolate the shifter's carry out
	orrs   r7, r7, r2							//and put it in C's place
	mov    r10, r7								//sr <- new flags
	cmp    r3, #15								//was pc the destination?
	bne    j_next_instr_3						//no: carry on with the next instr
	bl     pc_changed_not_thumb					//yes: refetch from the new pc (BL used as a long branch)

dp_cmp:											//CMP Rn, op2: sr <- flags of (Rn - op2). in: r0 = op2, r2 = instr, r4 = state
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	cmp    r3, r0								//host compare yields exactly the flags the guest wants
	mrs    r7, APSR
	mov    r10, r7								//sr <- those flags
	mov    pc, r8								//on to next_instr

dp_cmn:											//CMN Rn, op2: sr <- flags of (Rn + op2). in: r0 = op2, r2 = instr, r4 = state
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	cmn    r3, r0								//host compare-negative yields exactly the flags the guest wants
	mrs    r7, APSR
	mov    r10, r7								//sr <- those flags
	mov    pc, r8								//on to next_instr

dp_tst:											//TST Rn, op2: N,Z from (Rn & op2), C from shifter, V kept. in: r0 = op2, r1 = shifter carry out (bit 29), r2 = instr
	msr    APSR_nzcvq, r10						//start from the guest's flags so that V carries through
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	tst    r3, r0								//N,Z <- Rn & op2
	mrs    r7, APSR								//r7 = flags so far (C in here is junk from the shifts above, replaced below)
	movs   r2, #1								//r2 = 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, r2, #29
	bics   r7, r7, r2							//drop that C
	ands   r2, r2, r1							//isolate the shifter's carry out
	orrs   r7, r7, r2							//and put it in C's place
	mov    r10, r7								//sr <- new flags
	mov    pc, r8								//on to next_instr

dp_teq:											//TEQ Rn, op2: N,Z from (Rn ^ op2), C from shifter, V kept. in: r0 = op2, r1 = shifter carry out (bit 29), r2 = instr
	msr    APSR_nzcvq, r10						//start from the guest's flags so that V carries through
	lsls   r3, r2, #12							//r3 = instr bits 16..19 (Rn index)
	lsrs   r3, r3, #28
	lsls   r3, r3, #2							//index -> byte offset into state
	ldr    r3, [r4, r3]							//r3 = Rn's value
	eors   r3, r3, r0							//N,Z <- Rn ^ op2 (the xor result itself is thrown away)
	mrs    r7, APSR								//r7 = flags so far (C in here is junk from the shifts above, replaced below)
	movs   r2, #1								//r2 = 1 << 29 aka ARM_SR_BIT_C
	lsls   r2, r2, #29
	bics   r7, r7, r2							//drop that C
	ands   r2, r2, r1							//isolate the shifter's carry out
	orrs   r7, r7, r2							//and put it in C's place
	mov    r10, r7								//sr <- new flags
	mov    pc, r8								//on to next_instr

dp_0x10:										//table 3.3: MRS Rx, CPSR or edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
												//in: r2 = instr. sub-dispatch on instr bits 4..7 through the 16 two-byte branches below
	lsls   r3, r2, #24							//get bits 4..7 for dispatch, LSL 1 for dispatch
	lsrs   r3, #27
	add    pc, r3								//dispatch: pc reads as this instr + 4, which is the first table entry
	nop											//fills the 2 bytes between the add and the table
	b dp_mrs									//0000: MRS
	b dp_x_undef								//0001
	b dp_x_undef								//0010
	b dp_x_undef								//0011
	b dp_x_undef								//0100
	b dp_qadd									//0101: QADD
	b dp_x_undef								//0110
	b dp_x_undef								//0111
	b dp_smlabb									//1000: SMLABB
	b dp_x_undef								//1001
	b dp_smlatb									//1010: SMLATB
	b dp_x_undef								//1011
	b dp_smlabt									//1100: SMLABT
	b dp_x_undef								//1101
	b dp_smlatt									//1110: SMLATT
	b dp_x_undef								//1111

dp_0x14:										//table 3.3: edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
												//in: r2 = instr. sub-dispatch on instr bits 4..7 through the 16 two-byte branches below
	lsls   r3, r2, #24							//get bits 4..7 for dispatch, LSL 1 for dispatch
	lsrs   r3, #27
	add    pc, r3								//dispatch: pc reads as this instr + 4, which is the first table entry
	nop											//fills the 2 bytes between the add and the table
	b dp_x_undef								//0000
	b dp_x_undef								//0001
	b dp_x_undef								//0010
	b dp_x_undef								//0011
	b dp_x_undef								//0100
	b dp_qdadd									//0101: QDADD
	b dp_x_undef								//0110
	b dp_x_undef								//0111
	b dp_smlalbb								//1000: SMLALBB
	b dp_x_undef								//1001
	b dp_smlaltb								//1010: SMLALTB
	b dp_x_undef								//1011
	b dp_smlalbt								//1100: SMLALBT
	b dp_x_undef								//1101
	b dp_smlaltt								//1110: SMLALTT
	b dp_x_undef								//1111

dp_0x16:										//table 3.3: CLZ or edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
												//in: r2 = instr. sub-dispatch on instr bits 4..7 through the 16 two-byte branches below
	lsls   r3, r2, #24							//get bits 4..7 for dispatch, LSL 1 for dispatch
	lsrs   r3, #27
	add    pc, r3								//dispatch: pc reads as this instr + 4, which is the first table entry
	nop											//fills the 2 bytes between the add and the table
	b dp_x_undef								//0000
	b dp_clz									//0001: CLZ
	b dp_x_undef								//0010
	b dp_x_undef								//0011
	b dp_x_undef								//0100
	b dp_qdsub									//0101: QDSUB (QSUB belongs to the dp_0x12 group, not this one)
	b dp_x_undef								//0110
	b dp_x_undef								//0111
	b dp_smulbb									//1000: SMULBB
	b dp_x_undef								//1001
	b dp_smultb									//1010: SMULTB
	b dp_x_undef								//1011
	b dp_smulbt									//1100: SMULBT
	b dp_x_undef								//1101
	b dp_smultt									//1110: SMULTT
	b dp_x_undef								//1111

dp_clz:											//CLZ Rd, Rm: counts leading zeroes by shifting Rm's bits out the top one at a time. in: r2 = instr, r4 = state. uses r5 as counter
	lsls   r1, r2, #28							//r1 = instr bits 0..3 (Rm index)
	lsrs   r1, r1, #28
	lsls   r1, r1, #2							//index -> byte offset into state
	ldr    r1, [r4, r1]							//r1 = Rm's value

	//plain bit-at-a-time loop, nothing fancy
	movs   r5, #0								//r5 = zeroes seen so far
1:
	lsls   r1, r1, #1							//next bit from the top -> C
	bcs    2f									//it was a one: r5 is the answer
	adds   r5, #1
	cmp    r5, #32								//ran out of bits (value was 0)? then the answer is 32
	bne    1b
2:

	lsls   r0, r2, #16							//r0 = instr bits 12..15 (Rd index)
	lsrs   r0, r0, #28
	lsls   r0, r0, #2							//index -> byte offset into state
	str    r5, [r4, r0]							//Rd <- count
	mov    pc, r8								//on to next_instr

dp_0x12:										//table 3.3: MSR CPSR, Rx or BX or BLX or BKPT or edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
												//in: r2 = instr. sub-dispatch on instr bits 4..7 through the 16 two-byte branches below
	lsls   r3, r2, #24							//get bits 4..7 for dispatch, LSL 1 for dispatch
	lsrs   r3, #27
	add    pc, r3								//dispatch: pc reads as this instr + 4, which is the first table entry
	nop											//fills the 2 bytes between the add and the table
	b dp_msrr									//0000: MSR CPSR, Rm
	b dp_bx										//0001: BX
	b dp_x_undef								//0010
	b dp_blx									//0011: BLX (register)
	b dp_x_undef								//0100
	b dp_qsub									//0101: QSUB
	b dp_x_undef								//0110
	b dp_x_undef								//0111 (this table has no BKPT handler, it lands on undef)
	b dp_smlawb									//1000: SMLAWB
	b dp_x_undef								//1001
	b dp_smulwb									//1010: SMULWB
	b dp_x_undef								//1011
	b dp_smlawt									//1100: SMLAWT
	b dp_x_undef								//1101
	b dp_smulwt									//1110: SMULWT
	b dp_x_undef								//1111

dp_x_undef:										//shared target for every unassigned slot of the dp_0x10/dp_0x12/dp_0x14/dp_0x16 dispatch tables
	udf    #0x00								//no recovery path here: execute a permanently-undefined host instruction
	
dp_msrr:										//MSR CPSR_<fields>, Rm: fetch Rm, then share dp_msri's tail. in: r2 = instr, r4 = state
	lsls   r0, r2, #28							//r0 = instr bits 0..3 (Rm index)
	lsrs   r0, r0, #28
	lsls   r0, r0, #2							//index -> byte offset into state
	ldr    r0, [r4, r0]							//r0 = Rm's value
	//falls straight into dp_msri
dp_msri:										//in: r0 = value to write, r2 = instr. only field mask [3] is acted upon
	lsrs   r1, r2, #20							//instr bit 19 (field mask [3]) -> C
	bcc    1f									//not requested: nothing to write
	lsrs   r0, r0, #24							// r0 &= 0xFF000000
	lsls   r0, r0, #24
	mov    r10, r0								//sr <- top byte of the value
1:
	mov    pc, r8								//on to next_instr

dp_mrs:											//MRS Rd, CPSR: Rd = (sr & 0xFF000000) + 0x10. in: r2 = instr, r4 = state, r10 = sr
	mov    r0, r10								//r0 = sr
	lsrs   r0, r0, #24							//keep only the top byte...
	lsls   r0, r0, #24							//...i.e. r0 &= 0xFF000000
	adds   r0, #0x10							//mode field reads back as user mode
	lsls   r1, r2, #16							//r1 = instr bits 12..15 (Rd index)
	lsrs   r1, r1, #28
	lsls   r1, r1, #2							//index -> byte offset into state
	str    r0, [r4, r1]							//Rd <- composed CPSR value
	mov    pc, r8								//on to next_instr

dp_blx:											//BLX Rm: lr = address of the following instr, pc = Rm. in: r2 = instr, r4 = state
	ldr    r0, [r4, #0x3c]						//stored pc is this instr + 8...
	subs   r0, #4								//...so this is the return address
	lsls   r1, r2, #28							//r1 = instr bits 0..3 (Rm index)
	lsrs   r1, r1, #28
	lsls   r1, r1, #2							//index -> byte offset into state
	ldr    r1, [r4, r1]							//r1 = Rm's value, read before LR gets overwritten
	str    r0, [r4, #0x38]						//LR <- return address
	str    r1, [r4, #0x3c]						//pc <- Rm
	lsrs   r1, r1, #1							//target's bit 0 -> C (cheaper here, the value is already in a reg)
	bcc    j_pc_changed_not_thumb_3				//bit 0 clear: keep interpreting at the new pc
	b      emu_out								//bit 0 set: leave the emulator
	
dp_bx:											//BX Rm: pc = Rm. in: r2 = instr, r4 = state
	lsls   r1, r2, #28							//r1 = instr bits 0..3 (Rm index)
	lsrs   r1, r1, #28
	lsls   r1, r1, #2							//index -> byte offset into state
	ldr    r1, [r4, r1]							//r1 = Rm's value
	str    r1, [r4, #0x3c]						//pc <- Rm
	lsrs   r1, r1, #1							//target's bit 0 -> C (cheaper here, the value is already in a reg)
	bcc    j_pc_changed_not_thumb_3				//bit 0 clear: keep interpreting at the new pc
	b      emu_out								//bit 0 set: leave the emulator


j_pc_changed_not_thumb_3:						//trampoline: target of the "bcc" in dp_blx/dp_bx
	bl     pc_changed_not_thumb					//BL used purely as a long branch (lr is scratch, the real return address was pushed at emuCpuRun entry)

//MRS:		cccc 0001 0000 1111 dddd 0000 0000 0000
//QADD:		cccc 0001 0000 nnnn dddd 0000 0101 mmmm
//SMLAxy	cccc 0001 0000 dddd nnnn ssss 1yx0 mmmm

//MSR r:	cccc 0001 0010 ffff 1111 0000 0000 mmmm
//BX:		cccc 0001 0010 1111 1111 1111 0001 mmmm
//BLX:		cccc 0001 0010 1111 1111 1111 0011 mmmm
//QSUB:		cccc 0001 0010 nnnn dddd 0000 0101 mmmm
//SMLAWy	cccc 0001 0010 dddd nnnn ssss 1y00 mmmm
//SMULWy	cccc 0001 0010 dddd nnnn ssss 1y10 mmmm


//QDADD:	cccc 0001 0100 nnnn dddd 0000 0101 mmmm
//SMLALxy	cccc 0001 0100 dddd nnnn ssss 1yx0 mmmm

//CLZ:		cccc 0001 0110 1111 dddd 1111 0001 mmmm
//QDSUB:	cccc 0001 0110 nnnn dddd 0000 0101 mmmm
//SMULxy	cccc 0001 0110 dddd nnnn ssss 1yx0 mmmm

												//enhanced-DSP instructions: none of them is implemented.
												//every label below names the same address, so all of them end in the udf at the bottom
dp_qadd:
dp_qsub:
dp_qdadd:
dp_qdsub:
dp_smlabb:
dp_smlabt:
dp_smlatb:
dp_smlatt:
dp_smlalbb:
dp_smlalbt:
dp_smlaltb:
dp_smlaltt:
dp_smulbb:
dp_smulbt:
dp_smultb:
dp_smultt:
dp_smlawb:
dp_smulwb:
dp_smlawt:
dp_smulwt:
				//these all fall through to undef
	udf   #01									//immediate differs from the #0x00 used by dp_x_undef and inst_swi_udf

inst_swi:										//SWI handler. in: r2 = instr. only "SWI 0x123456" is accepted (handled by is_semihosting), anything else is undefined
	ldr    r0, =0xF1234560						//SWI opcode nibble 0xF + imm24 0x123456, shifted left by 4
	lsls   r1, r2, #4							//shift the condition code out so that it does not take part in the compare
	cmp    r1, r0
	beq    is_semihosting
												//any other SWI number falls through to the udf below

inst_swi_udf:									//also reached from is_semihosting when the handler reports failure
	udf    #0x00

is_semihosting:						//calls kernelSemihostingHandle with: r0..r3 = pointers to the guest's r0..r3 slots in state,
									//and 4 stack words = the guest's r12, sp, lr, pc values. a zero return value is treated as failure
									//NOTE(review): handler's exact C prototype is not visible here - confirm against its declaration
	mov    r0, r4					//point to r12
	adds   r0, #4 * 12
	ldmia  r0, {r0-r3}				//load r12,sp,lr,pc
	push   {r0-r3}					//push them as args
	mov    r0, r4					//point to r0...r3
	adds   r1, r0, #4
	adds   r2, r1, #4
	adds   r3, r2, #4
	bl     kernelSemihostingHandle	//remember we're on a tiny stack and this is likely to blow it
	cmp    r0, #0
	beq    inst_swi_udf				//verify success (on failure the pushed params stay on the stack, we end in udf anyway)
	add    sp, #0x10				//pop params off the stack that we had pushed
	ldr    r0, =dp_dispatch			//could have been clobbered
	mov    r12, r0
	mov    pc, r8					//on to next_instr

emu_out:							//leave the interpreter: write the cached sr back into state and return to emuCpuRun's caller
	mov    r0, r10
	str    r0, [r4, #0x40]			//store sr
	mov    r0, r4					//r0 = state pointer (same value emuCpuRun received in r0)
	pop    {pc}						//pops the lr that emuCpuRun pushed on entry, so the stack must be balanced here




//multiplies for c-m0


do_smull:	//signed 32x32->64 multiply on top of do_umull. in: a = r0, b = r1. out: r3:r1 (hi:lo). clobbers r0, r2, r5
	mov    r5, r0
	orrs   r5, r1			//N set <=> at least one operand is negative
	bpl    do_umull			//neither is: plain unsigned multiply, which returns straight to our caller
	tst    r1, r0
	bmi    negate_both		//both are: product of the two magnitudes is already the answer

	//exactly one operand is negative: multiply magnitudes, then negate the 64-bit product
	tst    r0, r0
	bpl    negate_r1
negate_r0:
	rsbs   r0, r0, #0
	b      negation_done
negate_r1:
	rsbs   r1, r1, #0
negation_done:
	push   {lr}
	bl     do_umull
	//r3:r1 = 0 - r3:r1
	movs   r5, r3
	movs   r3, #0
	rsbs   r1, r1, #0		//lo = 0 - lo, borrow is left in C
	sbcs   r3, r5			//hi = 0 - hi - borrow
	pop    {pc}

negate_both:
	rsbs   r0, r0, #0
	rsbs   r1, r1, #0
	//falls through into do_umull

do_umull:	//unsigned 32x32->64 multiply out of four 16x16 products. in: a = r0, b = r1. out: r3:r1 (hi:lo). clobbers r0, r2; r5-r7 preserved
	push   {r5, r6, r7}
	movs   r2, #0			//constant 0, so that "adcs r3, r2" just folds the carry into hi

	lsrs   r5, r0, #16		//a.hi * b.hi
	lsrs   r6, r1, #16
	muls   r5, r6
	lsrs   r3, r5, #16		//r3:r5 = (a.hi * b.hi) << 16: accumulator is kept 16 bits low until both middle terms are in
	lsls   r5, r5, #16

	lsrs   r6, r0, #16		//+ a.hi * b.lo
	uxth   r7, r1
	muls   r6, r7
	adds   r5, r5, r6
	adcs   r3, r2

	lsrs   r6, r1, #16		//+ a.lo * b.hi
	uxth   r7, r0
	muls   r6, r7
	adds   r5, r5, r6
	adcs   r3, r2

	lsls   r3, r3, #16		//r3:r5 <<= 16, bringing hi and middle terms to their final position
	lsrs   r7, r5, #16
	orrs   r3, r7
	lsls   r5, r5, #16

	uxth   r0, r0			//+ a.lo * b.lo, low word ends up in r1
	uxth   r1, r1
	muls   r1, r0
	adds   r1, r1, r5
	adcs   r3, r2

	pop    {r5, r6, r7}
	bx     lr


do_smlal:	//signed multiply-accumulate long: r7:r5 += (signed) r0 * r1. clobbers whatever do_smull clobbers, plus r1, r3
	push   {r5, r7, lr}				//do_smull trashes r5, so park the accumulator on the stack
	bl     do_smull					//r3:r1 = a * b
	pop    {r5, r7}					//accumulator back, lr stays on the stack for the final pop
	adds   r5, r5, r1				//lo += product.lo
	adcs   r7, r7, r3				//hi += product.hi + carry
	pop    {pc}

do_umlal:	//unsigned multiply-accumulate long: r7:r5 += (unsigned) r0 * r1. clobbers whatever do_umull clobbers, plus r1, r3
	push   {r5, r7, lr}				//accumulator and return address go on the stack across the call
	bl     do_umull					//r3:r1 = a * b
	pop    {r5, r7}					//accumulator back, lr stays on the stack for the final pop
	adds   r5, r5, r1				//lo += product.lo
	adcs   r7, r7, r3				//hi += product.hi + carry
	pop    {pc}


emuCpuRunCodeEnd:



