//Not a real include file. It is included directly into emuJit to provide the proper pattern matcher


/*
 * Unsigned divide-by-10 helper. The pattern matcher in this file emits a jump to it
 * in place of a recognised "ADS udiv10" routine.
 *
 * This is a naked function: the asm below is the entire body and returns by itself
 * with "bx lr". The C parameter `val` is never referenced by name - the asm reads r0.
 *
 * In:  r0 = dividend
 * Out: r0 = quotient, r1 = remainder. r2 and r3 are overwritten.
 *
 * The division is a multiply by 0xcccccccd (ceil(2^35 / 10)) followed by a total
 * right shift of 35: 32 from taking the high word of the product, 3 from the lsrs.
 */
static void __attribute__((naked)) jitPatMatchFuncUdiv10(uint32_t val)
{
	//unsigned: divide r0 by 10, produce quotient in r0, remainder in r1
	asm volatile(
		"	ldr   r3, =0xcccccccd			\n\t"	//reciprocal constant, loaded from the literal pool below
		"	umull r2, r3, r3, r0			\n\t"	//r3:r2 = 64-bit product r3 * r0, only the high word (r3) is used
		"	lsrs  r3, #3					\n\t"	//quotient
		"	add   r2, r3, r3, lsl #2		\n\t"	//r2 = quotient * 5
		"	sub   r1, r0, r2, lsl #1		\n\t"	//r1 = remainder (dividend - quotient * 10)
		"	mov   r0, r3					\n\t"	//r0 = quotient
		"	bx    lr						\n\t" 
		"	.ltorg							\n\t"	//place the literal pool here, right after the return
	);
}

/*
 * Signed divide-by-10 helper. The pattern matcher in this file emits a jump to it
 * in place of a recognised "ADS sdiv10" routine.
 *
 * This is a naked function: the asm below is the entire body and returns by itself
 * with "bx lr". The C parameter `val` is never referenced by name - the asm reads r0.
 *
 * In:  r0 = dividend
 * Out: r0 = quotient, r1 = remainder (dividend - quotient * 10). r2 and r3 are overwritten.
 *
 * The division is a signed multiply by 0x66666667 (ceil(2^34 / 10)); the high word of
 * the product is shifted right arithmetically by 2 and then corrected by the sign of
 * the dividend.
 */
static void __attribute__((naked)) jitPatMatchFuncSdiv10(int32_t val)
{
	//signed: divide r0 by 10, produce quotient in r0, remainder in r1
	asm volatile(
		"	ldr   r2, =0x66666667			\n\t"	//reciprocal constant, loaded from the literal pool below
		"	asrs  r3, r0, #31				\n\t"	//sign of param: 0 for non-negative, -1 for negative
		"	smull r1, r2, r2, r0			\n\t"	//r2:r1 = signed 64-bit product r2 * r0, only the high word (r2) is used
		"	rsb   r3, r3, r2, asr #2		\n\t"	//quotient = (r2 >> 2) - sign, i.e. +1 for negative dividends
		"	add   r2, r3, r3, lsl #2		\n\t"	//r2 = quotient * 5
		"	sub   r1, r0, r2, lsl #1		\n\t"	//r1 = remainder
		"	mov   r0, r3					\n\t"	//r0 = quotient
		"	bx    lr						\n\t"
		"	.ltorg							\n\t"	//place the literal pool here, right after the return
 	);
}

/*
 * Try to recognise one of a fixed set of known ARM code sequences starting at `code`
 * and, on a match, emit a hand-written replacement into `dest` instead of translating
 * the sequence instruction by instruction.
 *
 * Patterns checked, in order:
 *   - a direct SysLinkerStub invocation (treated as fatal)
 *   - the lazy-loaded library per-entry call stub (two variants)
 *   - ADS udivmod / sdivmod
 *   - CodeWarrior's fast unsigned divide
 *   - ADS udiv10 / sdiv10
 *   - ADS memcpy
 *   - "long ARM call"            (LDR R12, =ofst ; ADD PC, PC, R12)
 *   - "long interworking call"   (LDR R12, =ofst ; ADD R12, R12, PC ; BX R12)
 *
 * tuP        - in/out: the currently open TU. The library-stub path may insert this TU
 *              and replace *tuP with a newly allocated one.
 * dest       - emit buffer that receives the replacement code. The library-stub path may
 *              re-initialise it to point at the new TU's code.
 * code       - pointer to the first ARM instruction word to examine.
 * tcFlushedP - if non-NULL, set to true when allocating the new TU reported a flush.
 *
 * Returns EmitErrNone when a pattern matched and was emitted, EmitErrInvalidInput when
 * nothing matched, EmitErrNoSpace when the TU allocation reported a flush, or whatever
 * error a failing emitter call returned.
 *
 * NOTE(review): EMIT() and EMIT_TO() are macros defined elsewhere; they presumably use
 * the locals `dest` and `now` and return from this function on failure - confirm.
 * Under SUPPORT_ICACHE_FLUSH every successful match also records the matched source
 * length (in words) in (*tuP)->srcLen.
 */
static enum EmitStatus jitPatternMatcherMatch(struct TU **tuP, struct EmitBuf *dest, const uint32_t *code, bool *tcFlushedP)
{
	struct EmitBuf savedSpace;
	enum EmitStatus now;
	int32_t mod;
	uint32_t t;
	
	//we HAVE to match & fix Linker Stub since it sometimes violates the ABI. We match its callers too, since our stub sets the "thumb supported" bit
	
	//match LinkerStub invocation
	mod = jitPatternMatchLinkerStubInvocation(code);
	if (mod != -1) {
		
		fatal("SysLinkerStub found at 0x%08x for module ID 0x%08x. we do NOT expect this to ever be directly called! this shouldn't happen\n", code, mod);
		return EmitErrNotEncodeable;
	}
	
	//normal callsite looks like this and handles libraries of up to 256 entries
	// LDR    R12, [R9]
	// LDR    R12, [R12, #moduleid * 4]
	// LDR    R12, [R12, #globalsTableOfst]
	// CMP    R12, #0
	// ADDNE  PC, R12, #funcnum * 4
	// STMFD  SP!, {PC}
	// B      per_lib_stub
	
	//ique was seen using this instead for entrypts whose indices were >= 256
	// LDR             R12, [R9]
	// LDR             R12, [R12,#0x1BC]
	// LDR             R12, [R12,#0x28]
	// CMP             R12, #0
	// ADDNE           R12, R12, #0x3C
	// STMEQFD         SP!, {PC}
	// BEQ             per_lib_stub
	// ADD             PC, R12, #0x400
	
	//the first four instructions are common to both variants. Both load offsets must be multiples of 4 (mask 0x003)
	if (code[0] == 0xe599c000 && (code[1] & 0xfffff003) == 0xe59cc000 && (code[2] & 0xfffff003) == 0xe59cc000 && code[3] == 0xe35c0000) {
	
		//linker stub invocation likely - > check further
		int32_t libTrapNo = -1;		//byte offset into the table until it is divided by 4 below; -1 = no variant matched
		uint32_t codeLen = 0;
		
		//short variant: ADDNE PC, R12, #imm ; STMFD SP!, {PC} ; B per_lib_stub
		if ((code[4] & 0xfffff000) == 0x128cf000 && code[5] == 0xe92d8000 && (code[6] & 0xff000000) == 0xea000000) {
			
			libTrapNo = armShifterImmDecode(code[4] & 0xFFF);
			codeLen = 7 * sizeof(uint32_t);
		}
		//long variant: ADDNE R12, R12, #imm ; STMEQFD SP!, {PC} ; BEQ per_lib_stub ; ADD PC, R12, #imm2
		else if ((code[4] & 0xfffff000) == 0x128cc000 && code[5] == 0x092d8000 && (code[6] & 0xff000000) == 0x0a000000 && (code[7] & 0xfffff000) == 0xe28cf000) {
			
			libTrapNo = armShifterImmDecode(code[4] & 0xFFF) + armShifterImmDecode(code[7] & 0xFFF);
			codeLen = 8 * sizeof(uint32_t);
		}
		(void)codeLen;	//only read under SUPPORT_ICACHE_FLUSH
		
		if (libTrapNo >= 0) {
		
			uint32_t allegedOwnModuleId = (code[1] & 0xfff) >> 2;
			uint32_t globalsOffset = code[2] & 0xfff;
			const uint32_t *perLibStubAddr, *commonStub;
			
			//code[6] is the B/BEQ to the per-library stub in both variants
			perLibStubAddr = (uint32_t*)jitWorkOutArmBranchTarget(&code[6]);
			
			//now verify func offset is divisible by 4 and that the branch target has the expected three-instruction shape
			if (!(libTrapNo & 3) && perLibStubAddr[0] == 0xe28fc004 && perLibStubAddr[1] == 0xe92d1000 && (perLibStubAddr[2] & 0xff000000) == 0xea000000) {
			
				int32_t modId;
				
				libTrapNo /= 4;		//from here on this is an entry index, not a byte offset
				commonStub = (uint32_t*)jitWorkOutArmBranchTarget(&perLibStubAddr[2]);
				modId = jitPatternMatchLinkerStubInvocation(commonStub);
				
				//the common stub must be a linker stub invocation for the same module id this call site uses
				if (modId >= 0 && (uint32_t)modId == allegedOwnModuleId) {	//match!
					
					struct TU* perLibStub = jitTuFindByExactAddr((uintptr_t)perLibStubAddr);
					
					logt("Found LazyLoaded lib call stub for func %u and common stub at 0x%08x\n", libTrapNo, commonStub);
					
					if (!perLibStub) {
						
						uint32_t maxCodeWords;
						bool tcFlushed = false;
						
						logt("PerLibStub not found - creating translation\n");
						
						//we do not need to actually translate - we know what is there, so just generate code...
						//we'll reuse currently open TU since we must close it before we can close another
						perLibStub = *tuP;
						(*tuP)->baseAddr = (uintptr_t)perLibStubAddr;	// reset addr. maxCodeWords is still valid
						#ifdef SUPPORT_ICACHE_FLUSH
							(*tuP)->srcLen = 0x30 / sizeof(uint32_t);	//lib stub is that big always
						#endif
						
						//LDR r12, =perLibStubAddr + 0x0c (where descriptor is)
						now = jitEmitLoadImmToReg(dest, 12, (uintptr_t)(perLibStubAddr + 3), false, false, false);
						if (now != EmitErrNone)
							return now;
						
						//push {r12}
						now = jitEmitImmMemStr(dest, EmitCcAl, (uintptr_t)code, 12, EMIT_REG_NO_SP, -4, EmitAdrModeIndexWbak, EmitSzWord);
						if (now != EmitErrNone)
							return now;

						//and now jump to common stub (our LinkerStubCallout takes its place)
						now = jitEmitJumpToAbsThumbAddrNotInTu(dest, (uintptr_t)&LinkerStubCallout);
						if (now != EmitErrNone)
							return now;
						
						//spill any literals we might have
						now = jitPrvSpillLiterals(jitGetState(true), dest, false);
						if (now != EmitErrNone)
							return now;
						
						jitTuInsert(*tuP, dest);
						
						//now create a new tu for our actual translation
						*tuP = jitTuAllocate(&maxCodeWords, &tcFlushed);
						if (tcFlushed) {
							//report the flush to the caller and give up on this attempt
							if (tcFlushedP)
								*tcFlushedP = true;
							return EmitErrNoSpace;
						}
						
						(*tuP)->baseAddr = (uintptr_t)code;
						emitBufferInit(dest, (*tuP)->code, maxCodeWords * sizeof(uint16_t));
						
						now = jitEmitTuPrologue(dest, (*tuP)->baseAddr);
						if (now != EmitErrNone){
							loge("prologue failed");
							return now;
						}
					}
					
					logt("now actually translating per-entry stub\n");
					
					//LDR R12, [R9]
					EMIT(LLloadImm, 12, 9, 0, EmitSzWord, false, EmitAdrModeIndex);
					
					//LDR R12, [R12, #moduleid * 4]
					EMIT(LLloadImm, 12, 12, allegedOwnModuleId * 4, EmitSzWord, false, EmitAdrModeIndex);
					
					//LDR R12, [R12, #globalsTableOfst]
					EMIT(LLloadImm, 12, 12, globalsOffset, EmitSzWord, false, EmitAdrModeIndex);
					
					//CMP R12, #0
					EMIT(LLcmpImm, 12, 0);
					
					//entries at index 0x400 and up get their byte offset split: the part above the low 12 bits is
					//added to R12 first, the rest goes into the load's immediate
					if (libTrapNo >= 0x400) {
						
						uint32_t preAdd = (libTrapNo * 4) &~ 0x0fff;
						
						//ITT NE	//covers both the ADD below and the following load
						EMIT(LLitt, EmitCcNe);
						
						//ADDNE R12, R12, #(libTrapNo * 4) &~ 0xfff
						EMIT(LLaddImm, 12, 12, preAdd, EmitLeaveFlags, true);
						
						libTrapNo -= preAdd / 4;	//note: the log line at the end prints this reduced value
					}
					else {
						
						//IT NE
						EMIT(LLit, EmitCcNe);
					}
					
					//LDRNE PC, [R12, #libTrapNo * 4]  // load pc (jump to instr)
					EMIT(LLloadImm, EMIT_REG_NO_PC, 12, libTrapNo * 4, EmitSzWord, false, EmitAdrModeIndex);
					
					//LDR R12, =code	//address of this call site (the original comment called it ARM_STUB_INVOCATION_FUNC)
					now = jitEmitLoadImmToReg(dest, 12, (uint32_t)code, false, false, false);
					if (now != EmitErrNone)
						return now;
					
					//push {r12}
					EMIT(HLpush, 1 << 12);
					
					//B per_lib_stub	//the translated one, either found above or just created
					EMIT(HLjump, (uintptr_t)perLibStub->code);
					
					logt("Translated lib stub for call %u from client %u at 0x%08x\n", libTrapNo, allegedOwnModuleId, code);
					
					#ifdef SUPPORT_ICACHE_FLUSH
						(*tuP)->srcLen = codeLen / sizeof(uint32_t);
					#endif
					return EmitErrNone;
				}
			}
		}
	}

	//ADS's udivmod
	//the routine is matched as: "first" words, then one BCS (top byte 0x2A) whose target is not compared, then "second" words
	{
		static const uint32_t first[] = {0xE3A02000, 0xE070C1A1, 0x3A000020, 0xE070C421, 0x3A00000F, 0xE1A00400, 0xE38224FF, 0xE070C221, 0x3A000017, 0xE070C421, 0x3A000009, 0xE1A00400, 0xE38228FF, 0xE070C421, 0x21A00400, 0x23822CFF, 0xE070C221, 0x3A00000E, 0xE270C000};
		static const uint32_t second[] = {0x21A00420, 0xE070C3A1, 0x20411380, 0xE0A22002, 0xE070C321, 0x20411300, 0xE0A22002, 0xE070C2A1, 0x20411280, 0xE0A22002, 0xE070C221, 0x20411200, 0xE0A22002, 0xE070C1A1, 0x20411180, 0xE0A22002, 0xE070C121, 0x20411100, 0xE0A22002, 0xE070C0A1, 0x20411080, 0xE0A22002, 0xE070C001, 0x20411000, 0xE0B22002, 0x2AFFFFE5, 0xE1A00002, 0xE12FFF1E};
		
		if (!memcmp(code, first, sizeof(first)) && (code[sizeof(first) / 4] >> 24) == 0x2A && !memcmp(code + sizeof(first) / 4 + 1, second, sizeof(second))) {
			
			logt("ADS udivmod matched at 0x%08x\n", code);
			//(r1 / r0) -> (r0 = quo, r1 = rem)
			
			//save space for "CBZ r0, div_by_zero_lbl"
			EMIT(SaveSpace, &savedSpace, 1);
			
			//UDIV r2, r1, r0
			EMIT(LLudiv, 2, 1, 0);
			
			//MLS r1, r2, r0, r1	//r1 = r1 - r2 * r0 = remainder
			EMIT(LLmlsReg, 1, 2, 0, 1);
			
			//MOV r0, r2
			EMIT(LLmov, 0, 2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			
			//BX LR
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			//emit the CBZ now that we know how long our code was
			EMIT_TO(LLcbz, &savedSpace, 0, emitGetPtrToJumpHere(dest));
			
			//div_by_zero_lbl: goto div zero dest (the target of the BCS that sits between "first" and "second")
			now = jitEmitJumpToArm(dest, EmitCcAl, jitWorkOutArmBranchTarget(&code[sizeof(first) / 4]), NULL);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = (sizeof(first) + sizeof(second) + 4) / sizeof(uint32_t);
			#endif
			return EmitErrNone;
		}
	}
	
	//ADS's sdivmod
	//same layout as udivmod above: "first" words, one BCS, "second" words
	{
		static const uint32_t first[] = {0xE2102480, 0x42600000, 0xE0323041, 0x22611000, 0xE070C1A1, 0x3A000020, 0xE070C421, 0x3A00000F, 0xE1A00400, 0xE38224FF, 0xE070C221, 0x3A000017, 0xE070C421, 0x3A000009, 0xE1A00400, 0xE38228FF, 0xE070C421, 0x21A00400, 0x23822CFF, 0xE070C221, 0x3A00000E, 0xE270C000};
		static const uint32_t second[] = {0x21A00420, 0xE070C3A1, 0x20411380, 0xE0A22002, 0xE070C321, 0x20411300, 0xE0A22002, 0xE070C2A1, 0x20411280, 0xE0A22002, 0xE070C221, 0x20411200, 0xE0A22002, 0xE070C1A1, 0x20411180, 0xE0A22002, 0xE070C121, 0x20411100, 0xE0A22002, 0xE070C0A1, 0x20411080, 0xE0A22002, 0xE070C001, 0x20411000, 0xE0B22002, 0x2AFFFFE5, 0xE0320FC3, 0xE0800FA3, 0x22611000, 0xE12FFF1E};
		
		if (!memcmp(code, first, sizeof(first)) && (code[sizeof(first) / 4] >> 24) == 0x2A && !memcmp(code + sizeof(first) / 4 + 1, second, sizeof(second))) {
			
			logt("ADS sdivmod matched at 0x%08x\n", code);
			//(r1 / r0) -> (r0 = quo, r1 = rem)
			
			//save space for "CBZ r0, div_by_zero_lbl"
			EMIT(SaveSpace, &savedSpace, 1);
			
			//SDIV r2, r1, r0
			EMIT(LLsdiv, 2, 1, 0);
			
			//MLS r1, r2, r0, r1	//r1 = r1 - r2 * r0 = remainder
			EMIT(LLmlsReg, 1, 2, 0, 1);
			
			//MOV r0, r2
			EMIT(LLmov, 0, 2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			
			//BX LR
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			//emit the CBZ now that we know how long our code was
			EMIT_TO(LLcbz, &savedSpace, 0, emitGetPtrToJumpHere(dest));
			
			//div_by_zero_lbl: goto div zero dest (the target of the BCS that sits between "first" and "second")
			now = jitEmitJumpToArm(dest, EmitCcAl, jitWorkOutArmBranchTarget(&code[sizeof(first) / 4]), NULL);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = (sizeof(first) + sizeof(second) + 4) / sizeof(uint32_t);
			#endif
			
			return EmitErrNone;
		}
	}

	//codewarrior's fast udiv
	{
		static const uint32_t match[] = {0xE3510000, 0x012FFF1E, 0xE1500001, 0x31A01000, 0x33A00000, 0x312FFF1E, 0xE3A0201C, 0xE1A03220, 0xE1510623, 0xD2422010, 0xD1A03823, 0xE1510223, 0xD2422008, 0xD1A03423, 0xE1510003, 0xD2422004, 0xD1A03223, 0xE1A00210, 0xE2611000, 0xE0900000, 0xE0822082, 0xE08FF102, 0xE1A00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE0B13083, 0x30433001, 0xE0B00000, 0xE1A01003, 0xE12FFF1E, };
		
		if (!memcmp(code, match, sizeof(match))) {
			
			logt("CW's _u32_div_f matched at 0x%08x\n", code);
			//(r0 / r1) -> (r0 = quo, r1 = rem), if div by zero, return numerator
			
			//save space for "CBZ r1, exit_lbl"
			EMIT(SaveSpace, &savedSpace, 1);
			
			//UDIV r2, r0, r1
			EMIT(LLudiv, 2, 0, 1);
			
			//MLS r1, r2, r1, r0	//r1 = r0 - r2 * r1 = remainder
			EMIT(LLmlsReg, 1, 2, 1, 0);
			
			//MOV r0, r2
			EMIT(LLmov, 0, 2, EmitShiftLsl, 0, EmitFlagsDoNotCare, false);
			
			//emit the CBZ now that we know how long our code was. it lands on the BX LR below, leaving r0 untouched
			EMIT_TO(LLcbz, &savedSpace, 1, emitGetPtrToJumpHere(dest));
			
			//exit_lbl: BX LR
			now = jitEmitBxReg(dest, EmitCcAl, (uintptr_t)code, EMIT_REG_NO_LR);
			if (now != EmitErrNone)
				return now;
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = sizeof(match) / sizeof(uint32_t);
			#endif
			
			return EmitErrNone;
		}
	}
	

	//ADS's udiv10
	{
		static const uint32_t match[] = {0xE240100A, 0xE0400120, 0xE0800220, 0xE0800420, 0xE0800820, 0xE1A001A0, 0xE0802100, 0xE0511082, 0x52800001, 0x4281100A, 0xE12FFF1E};
		
		if (!memcmp(code, match, sizeof(match))) {
			
			logt("ADS udiv10 matched at 0x%08x\n", code);
			//(r0 / 10) -> (r0 = quo, r1 = rem)
			
			//the whole routine is replaced by a jump to our own helper, which returns to the caller itself
			EMIT(HLjump, (uintptr_t)&jitPatMatchFuncUdiv10);
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = sizeof(match) / sizeof(uint32_t);
			#endif
			return EmitErrNone;
		}
	}

	//ADS's sdiv10
	{
		static const uint32_t match[] = {0xE1B03000, 0x42600000, 0xE240100A, 0xE0400120, 0xE0800220, 0xE0800420, 0xE0800820, 0xE1A001A0, 0xE0802100, 0xE0511082, 0x52800001, 0x4281100A, 0xE1B03003, 0x42600000, 0x42611000, 0xE12FFF1E};
		
		if (!memcmp(code, match, sizeof(match))) {
			
			logt("ADS sdiv10 matched at 0x%08x\n", code);
			
			//the whole routine is replaced by a jump to our own helper, which returns to the caller itself
			EMIT(HLjump, (uintptr_t)&jitPatMatchFuncSdiv10);
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = sizeof(match) / sizeof(uint32_t);
			#endif
			return EmitErrNone;
		}
	}

	//ADS's memcpy
	{
		static const uint32_t match[] = {0xE92D4010, 0xE2522020, 0x3A000005, 0x28B15018, 0x28A05018, 0x28B15018, 0x28A05018, 0x22522020, 0x2AFFFFF9, 0xE1B0CE02, 0x28B15018, 0x28A05018, 0x48B10018, 0x48A00018, 0xE8BD4010, 0xE1B0CF02, 0x24913004, 0x24803004, 0x012FFF1E, 0xE1B02F82, 0x44D12001, 0x24D13001, 0x24D1C001, 0x44C02001, 0x24C03001, 0x24C0C001, 0xE12FFF1E};
		
		if (!memcmp(code, match, sizeof(match))) {
			
			logt("ADS aligned memcpy matched at 0x%08x\n", code);
			
			//replaced by a jump to the native memcpy
			EMIT(HLjump, (uintptr_t)&memcpy);
			
			#ifdef SUPPORT_ICACHE_FLUSH
				(*tuP)->srcLen = sizeof(match) / sizeof(uint32_t);
			#endif
			return EmitErrNone;
		}
	}
	
	
	//pattern match:
	//	long arm calls:
	//		LDR             R12, =func_offset //we check that this is a multiple of 4
	//		ADD             PC, PC, R12
	//the ADD is accepted with its two source operands in either order
	if (((t = code[0]) & 0xff7ff003) == 0xe51fc000 && (code[1] == 0xe08ff00c || code[1] == 0xe08cf00f)) {
		
		int32_t ofst = t & 0xfff;
		uint32_t fromReg = 0;
		
		//we'd like to only load PC once, which we can do IFF the offset is not -0x1000
		//we also should make sure that r12 ends up where it needs to be
		
		//bit 23 clear means the literal is at a negative offset from PC
		if (!(t & 0x00800000))
			ofst = -ofst;
	
		//adjust ofst to be from "PC_VAL" of second instr, this might push it beyond range of our normal load
		ofst -= 0x04;
	
		//PUSH {r0, r1}		//r1 is never modified below; its stack slot is overwritten with the jump target
		EMIT(HLpush, 0x0003);
		
		//LDR r0, =PC_VAL	//as seen at second instruction
		now = jitEmitLoadImmToReg(dest, 0, (uintptr_t)(code + 3), false, false, false);
		if (now != EmitErrNone)
			return now;
		
		//offsets of -0x100 and below get rebased: add 0x1000 to the base so the load's offset becomes non-negative
		if (ofst <= -0x100) {
			
			//ADD r12, r0, #0x1000 
			EMIT(LLaddImm, 12, 0, 0x1000, EmitLeaveFlags, false);
			fromReg = 12;
			ofst += 0x1000;
		}
		
		//LDR r12, [fromReg, #ofst]
		EMIT(LLloadImm, 12, fromReg, ofst, EmitSzWord, false, EmitAdrModeIndex);
			
		//ADD r0, r12		//r0 = PC_VAL + func_offset = jump target
		EMIT(LLaddReg, 0, 0, 12, EmitShiftLsl, 0, EmitLeaveFlags, false);
		
		//STR r0, [sp, #4]	//into the slot above the pushed r0
		EMIT(LLstoreImm, 0, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);

		//pop {r0}
		EMIT(HLpop, 0x0001);
		
		//POP_noninterworking {pc}
		EMIT(HLjump, (uintptr_t)&jitPrvPopPcArmOnlyCallout);
		
		#ifdef SUPPORT_ICACHE_FLUSH
			(*tuP)->srcLen = 8 / sizeof(uint32_t);
		#endif
		
		return EmitErrNone;
	}
	
	//pattern match:
	//	and long interworking calls:
	//		LDR             R12, =func_offset //we check that this is a multiple of 4
	//		ADD             R12, R12, PC
	//		BX              R12
	//the ADD is accepted with its two source operands in either order
	if (((t = code[0]) & 0xff7ff003) == 0xe51fc000 && (code[1] == 0xe08fc00c || code[1] == 0xe08cc00f) && code[2] == 0xe12fff1c) {
		
		int32_t ofst = t & 0xfff;
		uint32_t fromReg = 0, ea;
		
		
		//we'd like to only load PC once, which we can do IFF the offset is not -0x1000
		//we also should make sure that r12 ends up where it needs to be
		
		//bit 23 clear means the literal is at a negative offset from PC
		if (!(t & 0x00800000))
			ofst = -ofst;
		
		ea = ofst + (uintptr_t)(code + 2);	//get addr of imm
		
		//LDR r12, =&imm &~ 0xfff	//since we can use const in instr and this way less const to load
		now = jitEmitLoadImmToReg(dest, 12, ea &~ 0x0fff, false, false, false);
		if (now != EmitErrNone)
			return now;
		
		//LDR r12, [r12, #&imm & 0xfff]
		EMIT(LLloadImm, 12, 12, ea & 0x0fff, EmitSzWord, false, EmitAdrModeIndex);

		//ADD r12, r12, PC_VAL	//as seen at second instruction
		now = jitEmitArbitraryImmAdd(dest, 12, 12, (uintptr_t)(code + 3), false);
		if (now != EmitErrNone)
			return now;

		//BX r12
		EMIT(HLjump, (uintptr_t)&jitPrvBxR12Callout);
		
		#ifdef SUPPORT_ICACHE_FLUSH
			(*tuP)->srcLen = 12 / sizeof(uint32_t);
		#endif
		
		return EmitErrNone;
	}
	
	//nothing matched
	return EmitErrInvalidInput;
}


/*
 * Peephole matcher: look at the ARM instruction(s) at *codeP and, if they form one of a
 * few known multi-instruction idioms, emit a better replacement into `dest` than the
 * instruction-by-instruction translation would produce.
 *
 * Idioms checked, in order:
 *   - MOV Rx, Ry, LSL #a ; MOV Rx, Rx, {LSR|ASR} #b      -> {U,S}XT{B,H} or {U,S}BFX
 *   - LDR Rx, [PC, Ry, shift] ; ADD PC, PC, Rx           (PACE-style switch)
 *   - LDR Rx, =imm ; ADD Rx, Rx, PC  (or ADD Rx, PC, Rx) -> load + add of a constant
 *   - CMP Rx, #n ; ADDLS PC, PC, Rx, LSL #2 ; B ...      -> table branch
 *   - MOV R12, SP ; PUSH {..., R11, R12, LR, PC}         (gcc frame-pointer prologue)
 *   - LDMDB R11, {..., R11, SP, LR|PC}                   (gcc frame-pointer epilogue)
 *
 * dest       - emit buffer that receives the replacement code.
 * codeP      - in/out: points at the current ARM instruction; advanced past every
 *              instruction consumed by a successful match.
 * terminateP - set to true when the replaced sequence unconditionally leaves the TU
 *              (both switch forms, and the unconditional immediate-form epilogue).
 *              It is never written otherwise.
 *
 * Returns EmitErrNone on a successful match, EmitErrInvalidInput when nothing matched,
 * or the error from a failing emitter call.
 *
 * NOTE(review): EMIT() and EMIT_TO() are macros defined elsewhere; they presumably use
 * the locals `dest` and `now` and return from this function on failure - confirm.
 */
static enum EmitStatus jitPeepholeCodeReader(struct EmitBuf *dest, uint32_t **codeP, bool *terminateP)
{
	static const char* strCc[] = {"EQ","NE","CS","CC","MI","PL","VS","VC","HI","LS","GE","LT","GT","LE", "AL","NV"};
	uint32_t *code = *codeP, instr, instr2;
	enum EmitStatus now;
	
	//emit {u,s}bfx/{u,s}xt{b,h} for
	//	ubfx: MOV Rx, Ry, LSL #num1   ;  MOV Rx, Rx, LSR #num2		x may be == y
	//	sbfx: MOV Rx, Ry, LSL #num1   ;  MOV Rx, Rx, ASR #num2		x may be == y
	{
		enum EmitShiftType shiftType1, shiftType2;
		uint8_t rdNo1, rmNo1, shiftAmt1;
		uint8_t rdNo2, rmNo2, shiftAmt2;
		enum EmitCc cc1, cc2;
		
		//both instrs must carry the same condition, the second must shift the first one's result in place,
		//and the right shift must be at least as large as the left shift
		if (jitParseArmMovToRegFomRegShiftImm(code[0], &rdNo1, &rmNo1, &shiftType1, &shiftAmt1, &cc1) &&
					(rdNo1 != EMIT_REG_NO_PC || cc1 != EmitCcAl) /* make sure next instr is even safe to read */ &&
					jitParseArmMovToRegFomRegShiftImm(code[1], &rdNo2, &rmNo2, &shiftType2, &shiftAmt2, &cc2) && cc1 == cc2 &&
					cc1 != EmitCcNv &&  shiftType1 == EmitShiftLsl && (shiftType2 == EmitShiftLsr || shiftType2 == EmitShiftAsr) &&
					shiftAmt2 >= shiftAmt1 && shiftAmt2 < 32 &&  rmNo2 == rdNo1 && rdNo2 == rmNo2 &&
					rmNo1 != EMIT_REG_NO_PC && rmNo1 != EMIT_REG_NO_SP && rdNo1 != EMIT_REG_NO_PC && rdNo1 != EMIT_REG_NO_SP) {
			
			//we have a match - emit a potentially conditional {u/s}bfx
			//the pair keeps source bits [31 - num1 : num2 - num1], which is (32 - num2) bits wide
			uint32_t msbDesired = 31 - shiftAmt1;
			uint32_t numBitsDesired = 32 - shiftAmt2;
			uint32_t lsbDesired = msbDesired - numBitsDesired + 1;
			bool isUnsigned = (shiftType2 == EmitShiftLsr);
			
			//handle conditionality
			if (cc1 != EmitCcAl) {
				EMIT(LLit, cc1);
			}
			
			//if the conditions for a using a {US}XT{HB} are met, use those
			//technically we could use rotation, but those are in no way more useful than
			//using a bitfield extract (well, except a *xth with a 24 bit rotate, but
			//nobody ever needs that)
			if (lsbDesired == 0 && (numBitsDesired == 8 || numBitsDesired == 16)) {
				
				logt("converting MOV%s R%u, R%u, LSL #%u; MOV%s R%u, R%u, %cSR #%u => %cXT%c R%u, R%u\n",
					(cc1 == EmitCcAl) ? "" : strCc[cc1], rdNo1, rmNo1, shiftAmt1,
					(cc2 == EmitCcAl) ? "" : strCc[cc2], rdNo2, rmNo2, isUnsigned ? 'L' : 'A', shiftAmt2,
					isUnsigned ? 'U' : 'S', (numBitsDesired == 8) ? 'B' : 'H', rdNo2, rmNo1);
				
				EMIT(LLextend, rdNo2, rmNo1, 0, numBitsDesired == 8, isUnsigned);
			}
			else {
				logt("converting MOV%s R%u, R%u, LSL #%u; MOV%s R%u, R%u, %cSR #%u => %cBFX R%u, R%u, #%u, #%u\n",
					(cc1 == EmitCcAl) ? "" : strCc[cc1], rdNo1, rmNo1, shiftAmt1,
					(cc2 == EmitCcAl) ? "" : strCc[cc2], rdNo2, rmNo2, (isUnsigned) ? 'L' : 'A', shiftAmt2,
					isUnsigned ? 'U' : 'S', rdNo2, rmNo1, lsbDesired, numBitsDesired);
				
				EMIT(LLbfx, rdNo2, rmNo1, lsbDesired, numBitsDesired, isUnsigned, false);
			}
			
			(*codeP) += 2;
			return EmitErrNone;
		}
	}
	
	//emit sane code for instrs of the form:  LDR Rx, [PC,Ry,shift] ; ADD PC, PC, Rx - it is commonly seen in pace
	if (((instr = code[0]) & 0xffff0010) == 0xe79f0000 && ((instr >> 12) & 0xf) != EMIT_REG_NO_PC && (code[1] & 0xfffffff0) == 0xe08ff000) {
		
		uint32_t ldrOfstReg = instr & 0x0f, ldrDstReg = (instr >> 12) & 0x0f, addSrcReg = code[1] & 0x0f, shiftAmt = (instr >> 7) & 0x1f;
		enum EmitShiftType shiftType = (enum EmitShiftType)((instr >> 5) & 3);
		
		if (ldrDstReg == addSrcReg && ldrDstReg != EMIT_REG_NO_SP && ldrDstReg != EMIT_REG_NO_PC && ldrOfstReg != EMIT_REG_NO_SP && ldrOfstReg != EMIT_REG_NO_PC) {
			
			//a scratch reg that is neither the index nor the destination of the load
			uint32_t tmpRegPcVal = jitUtilPickLowestClearBit((1 << ldrOfstReg) | (1 << ldrDstReg));
			
			//push {lr, tmpRegPcVal}	//the second stack slot is overwritten with the jump target below
			EMIT(HLpush, (1 << tmpRegPcVal) + (1 << EMIT_REG_NO_LR));
			
			//LDR tmpRegPcVal, =PC_VAL	//as seen at first instruction
			now = jitEmitLoadImmToReg(dest, tmpRegPcVal, (uintptr_t)(code + 2), false, false, false);
			if (now != EmitErrNone)
				return now;
			
			//LDR Rx, [tmpRegPcVal, Ry, shift]
			now = jitEmitRegRegMemLdr(dest, EmitCcAl, (uintptr_t)code, false, ldrDstReg, tmpRegPcVal, true, ldrOfstReg, shiftType, shiftAmt, EmitAdrModeIndex, EmitSzWord);
			if (now != EmitErrNone)
				return now;

			//ADD tmpRegPcVal, #4		//now it is PC_VAL as seen at second instruction
			EMIT(LLaddImm, tmpRegPcVal, tmpRegPcVal, 4, EmitLeaveFlags, false);
			
			//ADD tmpRegPcVal, ldrDstReg
			EMIT(LLaddReg, tmpRegPcVal, tmpRegPcVal, ldrDstReg, EmitShiftLsl, 0, EmitLeaveFlags, false);

			//STR tmpRegPcVal, [SP, #4]
			EMIT(LLstoreImm, tmpRegPcVal, EMIT_REG_NO_SP, sizeof(uint32_t), EmitSzWord, EmitAdrModeIndex);
			
			//POP {tmpRegPcVal}
			EMIT(HLpop, (1 << tmpRegPcVal));
			
			//POP_noninterworking {pc}
			EMIT(HLjump, (uintptr_t)&jitPrvPopPcArmOnlyCallout);
			
			logt("PACE-style switch detected\n");
			
			*terminateP = true;
			(*codeP) += 2;
			
			return EmitErrNone;
		}
	}
			
	//match things of the form: LDR Rx, =imm; ADD Rx, Rx, PC	(also ADD Rx, PC, Rx)
	if (((instr = code[0]) & 0xff7f0000) == 0xe51f0000 && ((instr >> 12) & 0x0f) != EMIT_REG_NO_PC && ((instr2 = code[1]) & 0xfff00ff0) == 0xe0800000) {
		
		uint32_t i1D = (instr >> 12) & 0x0f, i2D = (instr2 >> 12) & 0x0f, i2N = (instr2 >> 16) & 0x0f, i2M = instr2 & 0x0f;
		
		//the ADD must have Rx as destination and {Rx, PC} as its sources, in either order.
		//the second alternative used to test i2M against PC as well, which can never hold together with
		//i2D == i2M and i2D != PC, so the "ADD Rx, PC, Rx" form was never matched
		if (i2D == i1D && i1D != EMIT_REG_NO_SP && i1D != EMIT_REG_NO_PC && ((i2D == i2N && i2M == EMIT_REG_NO_PC) || (i2D == i2M && i2N == EMIT_REG_NO_PC))) {
			
			int32_t ofstToVal = 8 + ((instr & 0x00800000) ? (instr & 0xFFF) : -(instr & 0xFFF));	//FROM FIRST INSTR
			uint32_t val = code[ofstToVal / 4] + (uintptr_t)(code + 1 + 2);		//value as of translation time, used for logging only
			
			logt("matched and replacing at 0x%08x:\n"
					" LDR R%u, [PC, #%c0x%03x]\n"
					" ADD R%u, %s%u%s\n"
					"\t with\n"
					" ldr R%u = 0x%08x\n",
				code,
				i1D, (instr & 0x00800000) ? '+' : '-', (instr & 0xFFF),
				i2D, (i2N == EMIT_REG_NO_PC) ? "PC, R" : "R", i2D, (i2N == EMIT_REG_NO_PC) ? "" : ", PC",
				i2D, val);
			
			//the literal is still read at run time rather than folded into a constant
			
			//load ea for imm
			now = jitEmitLoadImmToReg(dest, i1D, ((uintptr_t)code) + ofstToVal, false, false, false);
			if (now != EmitErrNone)
				return now;
			
			//load imm
			now = jitEmitImmMemLdr(dest, EmitCcAl, (uintptr_t)(code + 2), false, i2D, i2D, 0, EmitAdrModeIndex, EmitSzWord);
			if (now != EmitErrNone)
				return now;
			
			//add pc	//PC_VAL as seen at second instruction
			now = jitEmitArbitraryImmAdd(dest, i2D, i2D, (uintptr_t)(code + 1 + 2), false);
			if (now != EmitErrNone)
				return now;

			(*codeP) += 2;
			return EmitErrNone;
		}
	}
	
	//match switches of the form: CMP Rx, #yy; ADDLS PC, PC, Rx, LSL #2; B default_label; B label....   (x times)
	if ((((instr = code[0]) & 0xfff0ff00) == 0xe3500000) && (((instr2 = code[1]) & 0xfffffff0) == 0x908ff100)) {
		
		uint32_t i, regNo = (instr >> 16) & 0x0F, ncases = (instr & 0xff) + 1;
		bool valid = true;
		
		if (regNo != EMIT_REG_NO_PC && regNo != EMIT_REG_NO_SP && (instr2 & 0x0f) == regNo && ncases <= MAX_SWITCH_SIZE) {
			
			//the default branch plus every case must be an unconditional B
			for (i = 0; i < ncases + 1; i++) {
				
				if ((code[2 + i] >> 24) != 0xea) {
					valid = false;
					break;
				}
			}
			
			if (valid) {		//we have deduced that this switch is worth optimizing
				
				struct EmitTableBranchState tbbs;
				uint8_t *dstb;
				
				//CMP regNo, #ncases - 1
				EMIT(LLcmpImm, regNo, ncases - 1);
				
				//BHI default	//code[2] is the "B default_label"
				now = jitEmitJumpToArm(dest, EmitCcHi, jitWorkOutArmBranchTarget(code + 2), NULL);
				if (now != EmitErrNone)
					return now;
				
				//TBB [PC, regNo]
				EMIT(LLtableBranch, &tbbs, regNo, false, ncases);
				
				for (i = 0; i < ncases; i++) {
					
					now = emitLLtableBranchSetCase(&tbbs, i, emitGetPtrToJumpHere(dest));
					if (now != EmitErrNone)
						return now;
					
					//if the tc entry exists this will be a short branch, if not it will be a UDF we'll replace later
					now = jitEmitJumpToArm(dest, EmitCcAl, jitWorkOutArmBranchTarget(code + 3 + i), NULL);
					if (now != EmitErrNone)
						return now;
				}
				
				*terminateP = true;
				(*codeP) += 2 + ncases + 1;		//CMP, ADDLS, default branch and one branch per case
				
				return EmitErrNone;
			}
		}
	}

	//match function prologue for gcc code with frame pointers. existing codegen does ok at it, but we can do better
	//code looks like this: MOV R12, SP; PUSH {..regs..,R11,R12,LR,PC}; SUB R11, R12, #4
	//in reality the pushed PC  value is never used, but for correctness we MUST push it
	//we match just the first two instrs and replace them since the third is fine to translate as is
	//it is shown here just for completeness since it is the hallmark of the frame entry
	if (code[0] == 0xe1a0c00d && ((instr = code[1]) & 0xfffff800) == 0xe92dd800) {
		
		//mov r12, #PC_VAL_AT_SECOND_INSTR
		now = jitEmitLoadImmToReg(dest, 12, (uintptr_t)(code + 3), false, false, false);
		if (now != EmitErrNone)
			return now;
		
		//push {r12}		//aka  str r12, [sp, #-4]!
		EMIT(HLpush, 1 << 12);
		
		//add r12, sp, #4	//what r12 would have been
		EMIT(LLaddImm, 12, EMIT_REG_NO_SP, 4, EmitLeaveFlags, false);
		
		//push the remaining regs (guaranteed valid reg set for "push")
		EMIT(HLpush, instr & 0x7fff);

		(*codeP) += 2;
		return EmitErrNone;
	}
	
	//Match function epilogue for gcc code with frame pointers. It is hard to translate and oftentimes we can do better. There are two main
	//forms. The immediate form and the delayed form. Immediate form returns right away (loads PC), while the delayed form loads the return
	//address into LR, in preparation for a later "MOV PC, LR". This epilogue can be conditional, and we need to handle that. As an extra
	//point of interest, "Bejeweled!" uses the LDM Rx, {...}^" form of this instr despite that kind not being valid in userspace code. Testing
	//on real hardware indicated that the "^" is simply ignored. So, without further ado, the two forms of this return are as follows. Delayed:
	//	LDMDB R11, {..other regs maybe..,R11,SP,LR}
	//and the immediate:
	//	LDMDB R11, {..other regs maybe..,R11,SP,PC}
	//Our normal translations for LDMDB will work, but they use a complex sp-swapping mechanism which, while correct, is overcomplex for this
	//case since we likely know where SP will end up. The idea here is to pre-load SP, see if it indeed matches the value we expect, and if so,
	//go to a better fast path. If not, execute the normal slow path for LDMDB which handles all possible cases. This strategy produces a large
	//translation (25 bytes avg), but it is faster than anything else we can do, so we live with it. This fast path will not be usable if the
	// func uses alloca() or pops a different number of regs than it pushed (these are not common).
	if ((((instr = code[0]) & 0x0fbff800) == 0x091ba800 || ((instr & 0x0fbff800) == 0x091b6800)) && (instr >> 28) != EmitCcNv) {
		
		uint32_t nPoppedRegs = jitPrvPopcount16(instr & 0xffff);
		uint32_t normalRegs = instr & 0x47ff, tmpReg1, tmpReg2;		//r0..r10 and lr: regs we may use as scratch
		struct EmitBuf jumpOver, fastPathCbnz, fastPathExit;
		bool finalForm = !!(instr & (1 << EMIT_REG_NO_PC));
		enum EmitCc cc = (enum EmitCc)(instr >> 28);
		
		if (cc != EmitCcAl) {
		
			//save space for a jump-over
			EMIT(SaveSpace, &jumpOver, 1);
		}
		else if (finalForm){
			
			//unconditional final form is the end of the TU
			*terminateP = true;
		}
		
		//our fast-case availability check requires a loreg, but if we have two regs (second need not be low) it can be more efficient
		
		//if there are no loregs, push r0 (we can pop it as part of doing all this). the fast path *IS* worth it
		if (!(normalRegs & 0xff)) {
			
			//sub sp, sp, #4		//we do not know how far sp is, so we might not have space - be sure - make it!
			EMIT(LLsubImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, 4, EmitLeaveFlags, false);
			
			//str r0, [r11, #-4 * (nPoppedRegs + 1)]	//the slot just below the lowest one the LDMDB reads
			EMIT(LLstoreImm, 0, 11, -4 * (nPoppedRegs + 1), EmitSzWord, EmitAdrModeIndex);
			
			//from here on treat r0 as one more reg in the list
			instr |= 1;
			normalRegs |= 1;
			nPoppedRegs++;
		}

		tmpReg2 = tmpReg1 = __builtin_ctz(normalRegs);	//this is our guaranteed loreg
		if (normalRegs & (normalRegs - 1))
			tmpReg2 =__builtin_ctz(normalRegs & (normalRegs - 1));
		
		//tmpReg2 == tmpReg1 if we only found one reg
		//NOTE(review): tmpReg2 can be r8..r10 or lr here, and it is later handed to LLcbnz. whether that emitter
		//accepts a high register is not visible from this file - confirm
		
		//ldr tmpReg1, [r11, #-8]	//load sp value we expect to pop
		EMIT(LLloadImm, tmpReg1, 11, -8, EmitSzWord, false, EmitAdrModeIndex);

		//sub tmpReg2, tmpReg1, #4	//we expect it to be precisely 4 more than r11
		EMIT(LLsubImm, tmpReg2, tmpReg1, sizeof(uint32_t), EmitLeaveFlags, false);

		//sub tmpReg2, r11
		EMIT(LLsubReg, tmpReg2, tmpReg2, 11, EmitShiftLsl, 0, EmitLeaveFlags, false);

		//here if tmpReg2 is 0, fast path can proceed. a "cbnz tmpReg2" bails us out to the slow path otherwise
		EMIT(SaveSpace, &fastPathCbnz, 1);
		
		//FAST PATH BEGINS!!!
		
		//if tmpReg1 and tmpReg2 are the same, we need to reload it
		if (tmpReg1 == tmpReg2) {
			
			//ldr tmpReg1, [r11, #-8]	//load sp value we expect to pop
			EMIT(LLloadImm, tmpReg1, 11, -8, EmitSzWord, false, EmitAdrModeIndex);
		}
		
		//sub tmpReg1, tmpReg1, #4 * (nPoppedRegs + 1)	//address of the lowest slot to pop
		EMIT(LLsubImm, tmpReg1, tmpReg1, sizeof(uint32_t) * (nPoppedRegs + 1), EmitLeaveFlags, false);

		//mov SP, tmpReg1
		EMIT(LLmov, EMIT_REG_NO_SP, tmpReg1, EmitShiftLsl, 0, EmitLeaveFlags, false);

		//pop {regs up to and including r11}, this clobbers tmpReg1 and tmpReg2
		EMIT(HLldmia, EMIT_REG_NO_SP, instr & 0x0fff, true);

		//we now need to pop lr or pc, as needed, but also advance sp one more time
		//basically we need: "LDR LR, [SP], #8]" or "LDR PC, [SP], #8]". for LR we can get that using jitEmitImmMemLdr
		//but for pc a more optimized path is advised
		if (finalForm) {
			
			//no need to skip the "sp" slot on stack since our callout does that for us
			
			now = jitEmitJumpToAbsThumbAddrNotInTu(dest, (uintptr_t)&jitPrvPopPcAndAdvanceSpCallout);
			if (now != EmitErrNone)
				return now;
		}
		else {		//LR
			
			//add sp, sp, #4	 //skip "sp" slot on stack
			EMIT(LLaddImm, EMIT_REG_NO_SP, EMIT_REG_NO_SP, sizeof(uint32_t), EmitLeaveFlags, false);
		
			//ldr lr, [sp], #8
			now = jitEmitImmMemLdr(dest, EmitCcAl, (uintptr_t)code, false, EMIT_REG_NO_LR, EMIT_REG_NO_SP, 2 * sizeof(uint32_t), EmitAdrModePostindex, EmitSzWord);
			if (now != EmitErrNone)
				return now;
			
			//if we got this far and it is not a final load, save the space for a branch over the slow path
			EMIT(SaveSpace, &fastPathExit, 1);
		}
		
		//fill in the cbnz in that falls through to the upcoming slow path code
		EMIT_TO(LLcbnz, &fastPathCbnz, tmpReg2, emitGetPtrToJumpHere(dest));
		
		//we need a slow path in case fast path fails
		now = jitEmitLdmdb(dest, EmitCcAl, (uintptr_t)code, 11, instr & 0xffff, false);
		if (now != EmitErrNone)
			return now;
	
		//if this instr was conditional, emit the conditional branch up top
		if (cc != EmitCcAl)
			EMIT_TO(LLbranch, &jumpOver, emitGetPtrToJumpHere(dest), emitCcInvert(cc));
		
		//if fast path succeeds in non-final form, we need a jump to here, do it
		if (!finalForm)
			EMIT_TO(LLbranch, &fastPathExit, emitGetPtrToJumpHere(dest), EmitCcAl);
		
		(*codeP) +=1;			//yup...all this for one instruction
		return EmitErrNone;
	}
	
	//no match?
	return EmitErrInvalidInput;
}

