//marks a symbol as weak
#define WEAK __attribute__ ((weak))
//declares a weak alias of the symbol whose name is f (f is stringified by the macro)
#define ALIAS(f) __attribute__ ((weak, alias (#f)))

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "machSpecific.h"
#include "msioComms.h"
#include "memmap.h"
#include "printf.h"
#include "timers.h"
#include "entry.h"
#include "msio.h"
#include "boot.h"
#include "heap.h"
#include "irqs.h"
#include "cpu.h"
#include "mpu.h"


//Fallback interrupt handler. The VEC_() macro in this file makes every vector a weak alias
//of this function, so any vector without its own definition ends up here.
//All it does is hand off to cpuIrqDefaultHandler() and then issue a barrier.
void __attribute__((used)) IntDefaultHandler(void)
{
	cpuIrqDefaultHandler();
	asm volatile("DSB 0x0f");		//c-m4f erratum
}

//VEC_(nm, pfx) declares "void <nm><pfx>(void)" as a weak alias of IntDefaultHandler.
//VEC() produces names ending in "Handler", VECI() produces names ending in "IRQHandler".
//Because the aliases are weak, a non-weak definition of the same name elsewhere replaces the default.
#define VEC_(nm, pfx)	void nm##pfx(void) __attribute__ ((weak, alias ("IntDefaultHandler"))) 
#define VEC(nm)		VEC_(nm, Handler)
#define VECI(nm)	VEC_(nm, IRQHandler)


//core exception handlers
VEC(NMI_);
VEC(HardFault_);
VEC(MemManage_);
VEC(BusFault_);
VEC(UsageFault_);
VEC(SVC_);
VEC(PendSV_);
VEC(SysTick_);


//MSIO handler (its slot in __ISR_VECTORS is marked FAKE there)
VECI(MSIO_);

//chip-level interrupt handlers
VECI(WWDG_);
VECI(PVD_PVM_);
VECI(RTC_TAMP_STAMP_CSS_LSE_);
VECI(RTC_WKUP_);
VECI(FLASH_);
VECI(RCC_);
VECI(EXTI0_);
VECI(EXTI1_);
VECI(EXTI2_);
VECI(EXTI3_);
VECI(EXTI4_);
VECI(DMA1_Stream0_);
VECI(DMA1_Stream1_);
VECI(DMA1_Stream2_);
VECI(DMA1_Stream3_);
VECI(DMA1_Stream4_);
VECI(DMA1_Stream5_);
VECI(DMA1_Stream6_);
VECI(ADC_);
VECI(FDCAN1_IT0_);
VECI(FDCAN2_IT0_);
VECI(FDCAN1_IT1_);
VECI(FDCAN2_IT1_);
VECI(EXTI9_5_);
VECI(TIM1_BRK_);
VECI(TIM1_UP_);
VECI(TIM1_TRG_COM_);
VECI(TIM1_CC_);
VECI(TIM2_);
VECI(TIM3_);
VECI(TIM4_);
VECI(I2C1_EV_);
VECI(I2C1_ER_);
VECI(I2C2_EV_);
VECI(I2C2_ER_);
VECI(SPI1_);
VECI(SPI2_);
VECI(USART1_);
VECI(USART2_);
VECI(USART3_);
VECI(EXTI15_10_);
VECI(RTC_Alarm_);
VECI(DFSDM2_);
VECI(TIM8_BRK_TIM12_);
VECI(TIM8_UP_TIM13_);
VECI(TIM8_TRG_COM_TIM14_);
VECI(TIM8_CC_);
VECI(DMA1_Stream7_);
VECI(FMC_);
VECI(SDMMC1_);
VECI(TIM5_);
VECI(SPI3_);
VECI(UART4_);
VECI(UART5_);
VECI(TIM6_DAC_);
VECI(TIM7_);
VECI(DMA2_Stream0_);
VECI(DMA2_Stream1_);
VECI(DMA2_Stream2_);
VECI(DMA2_Stream3_);
VECI(DMA2_Stream4_);
VECI(FDCAN_CAL_);
VECI(DFSDM1_FLT4_);
VECI(DFSDM1_FLT5_);
VECI(DFSDM1_FLT6_);
VECI(DFSDM1_FLT7_);
VECI(DMA2_Stream5_);
VECI(DMA2_Stream6_);
VECI(DMA2_Stream7_);
VECI(USART6_);
VECI(I2C3_EV_);
VECI(I2C3_ER_);
VECI(OTG_HS_EP1_OUT_);
VECI(OTG_HS_EP1_IN_);
VECI(OTG_HS_WKUP_);
VECI(OTG_HS_);
VECI(DCMI_PSSI_);
VECI(CRYP_);
VECI(HASH_RNG_);
VECI(FPU_);
VECI(UART7_);
VECI(UART8_);
VECI(SPI4_);
VECI(SPI5_);
VECI(SPI6_);
VECI(SAI1_);
VECI(LTDC_);
VECI(LTDC_ER_);
VECI(DMA2D_);
VECI(SAI2_);
VECI(OCTOSPI1_);
VECI(LPTIM1_);
VECI(CEC_);
VECI(I2C4_EV_);
VECI(I2C4_ER_);
VECI(SPDIF_RX_);
VECI(DMAMUX1_OVR_);
VECI(DFSDM1_FLT0_);
VECI(DFSDM1_FLT1_);
VECI(DFSDM1_FLT2_);
VECI(DFSDM1_FLT3_);
VECI(SWPMI1_);
VECI(TIM15_);
VECI(TIM16_);
VECI(TIM17_);
VECI(MDIOS_WKUP_);
VECI(MDIOS_);
VECI(JPEG_);
VECI(MDMA_);
VECI(SDMMC2_);
VECI(HSEM1_);
VECI(DAC2_);
VECI(DMAMUX2_OVR_);
VECI(BDMA2_Channel0_);
VECI(BDMA2_Channel1_);
VECI(BDMA2_Channel2_);
VECI(BDMA2_Channel3_);
VECI(BDMA2_Channel4_);
VECI(BDMA2_Channel5_);
VECI(BDMA2_Channel6_);
VECI(BDMA2_Channel7_);
VECI(COMP_);
VECI(LPTIM2_);
VECI(LPTIM3_);
VECI(UART9_);
VECI(USART10_);
VECI(LPUART1_);
VECI(WWDG_RST_);
VECI(CRS_);
VECI(ECC_);
VECI(DTS_);
VECI(WAKEUP_PIN_);
VECI(OCTOSPI2_);
VECI(OTFDEC1_);
VECI(OTFDEC2_);
VECI(BDMA1_);
VECI(GFXMMU_);



//Interrupt vector table, placed in the ".ramvecs" section.
//machInit() points SCB->VTOR at this array in the STAGE_INIT_SET_VTOR stage.
//The position of each entry is what selects the interrupt it serves, so the order
//here must not be changed. Entries set to 0 have no handler installed.
//aligned by linker script as needed
__attribute__ ((section(".ramvecs"))) void (*__ISR_VECTORS[]) (void) =
{
	0,		// unused: initial sp
	0,		// unused: reset handler
	NMI_Handler,
	HardFault_Handler,
	MemManage_Handler,
	BusFault_Handler,
	UsageFault_Handler,
	0,
	0,
	0,
	0,
	SVC_Handler,		// SVCall handler
	0,					// Reserved
	0,					// Reserved
	PendSV_Handler,		// The PendSV handler
	SysTick_Handler,	// The SysTick handler
	
	// Chip Level - STM32H7
	WWDG_IRQHandler,
	PVD_PVM_IRQHandler,
	RTC_TAMP_STAMP_CSS_LSE_IRQHandler,
	RTC_WKUP_IRQHandler,
	FLASH_IRQHandler,
	RCC_IRQHandler,
	EXTI0_IRQHandler,
	EXTI1_IRQHandler,
	EXTI2_IRQHandler,
	EXTI3_IRQHandler,
	EXTI4_IRQHandler,
	DMA1_Stream0_IRQHandler,
	DMA1_Stream1_IRQHandler,
	DMA1_Stream2_IRQHandler,
	DMA1_Stream3_IRQHandler,
	DMA1_Stream4_IRQHandler,
	DMA1_Stream5_IRQHandler,
	DMA1_Stream6_IRQHandler,
	ADC_IRQHandler,
	FDCAN1_IT0_IRQHandler,
	FDCAN2_IT0_IRQHandler,
	FDCAN1_IT1_IRQHandler,
	FDCAN2_IT1_IRQHandler,
	EXTI9_5_IRQHandler,
	TIM1_BRK_IRQHandler,
	TIM1_UP_IRQHandler,
	TIM1_TRG_COM_IRQHandler,
	TIM1_CC_IRQHandler,
	TIM2_IRQHandler,
	TIM3_IRQHandler,
	TIM4_IRQHandler,
	I2C1_EV_IRQHandler,
	I2C1_ER_IRQHandler,
	I2C2_EV_IRQHandler,
	I2C2_ER_IRQHandler,
	SPI1_IRQHandler,
	SPI2_IRQHandler,
	USART1_IRQHandler,
	USART2_IRQHandler,
	USART3_IRQHandler,
	EXTI15_10_IRQHandler,
	RTC_Alarm_IRQHandler,
	DFSDM2_IRQHandler,
	TIM8_BRK_TIM12_IRQHandler,
	TIM8_UP_TIM13_IRQHandler,
	TIM8_TRG_COM_TIM14_IRQHandler,
	TIM8_CC_IRQHandler,
	DMA1_Stream7_IRQHandler,
	FMC_IRQHandler,
	SDMMC1_IRQHandler,
	TIM5_IRQHandler,
	SPI3_IRQHandler,
	UART4_IRQHandler,
	UART5_IRQHandler,
	TIM6_DAC_IRQHandler,
	TIM7_IRQHandler,
	DMA2_Stream0_IRQHandler,
	DMA2_Stream1_IRQHandler,
	DMA2_Stream2_IRQHandler,
	DMA2_Stream3_IRQHandler,
	DMA2_Stream4_IRQHandler,
	MSIO_IRQHandler,							//FAKE (not a chip interrupt name; machInit() sets its priority through MSIO_IRQn)
	0,
	FDCAN_CAL_IRQHandler,
	DFSDM1_FLT4_IRQHandler,
	DFSDM1_FLT5_IRQHandler,
	DFSDM1_FLT6_IRQHandler,
	DFSDM1_FLT7_IRQHandler,
	DMA2_Stream5_IRQHandler,
	DMA2_Stream6_IRQHandler,
	DMA2_Stream7_IRQHandler,
	USART6_IRQHandler,
	I2C3_EV_IRQHandler,
	I2C3_ER_IRQHandler,
	OTG_HS_EP1_OUT_IRQHandler,
	OTG_HS_EP1_IN_IRQHandler,
	OTG_HS_WKUP_IRQHandler,
	OTG_HS_IRQHandler,
	DCMI_PSSI_IRQHandler,
	CRYP_IRQHandler,
	HASH_RNG_IRQHandler,
	FPU_IRQHandler,
	UART7_IRQHandler,
	UART8_IRQHandler,
	SPI4_IRQHandler,
	SPI5_IRQHandler,
	SPI6_IRQHandler,
	SAI1_IRQHandler,
	LTDC_IRQHandler,
	LTDC_ER_IRQHandler,
	DMA2D_IRQHandler,
	SAI2_IRQHandler,
	OCTOSPI1_IRQHandler,
	LPTIM1_IRQHandler,
	CEC_IRQHandler,
	I2C4_EV_IRQHandler,
	I2C4_ER_IRQHandler,
	SPDIF_RX_IRQHandler,
	0,
	0,
	0,
	0,
	DMAMUX1_OVR_IRQHandler,
	0,
	0,
	0,
	0,
	0,
	0,
	0,
	DFSDM1_FLT0_IRQHandler,
	DFSDM1_FLT1_IRQHandler,
	DFSDM1_FLT2_IRQHandler,
	DFSDM1_FLT3_IRQHandler,
	0,
	SWPMI1_IRQHandler,
	TIM15_IRQHandler,
	TIM16_IRQHandler,
	TIM17_IRQHandler,
	MDIOS_WKUP_IRQHandler,
	MDIOS_IRQHandler,
	JPEG_IRQHandler,
	MDMA_IRQHandler,
	0,
	SDMMC2_IRQHandler,
	HSEM1_IRQHandler,
	0,
	DAC2_IRQHandler,
	DMAMUX2_OVR_IRQHandler,
	BDMA2_Channel0_IRQHandler,
	BDMA2_Channel1_IRQHandler,
	BDMA2_Channel2_IRQHandler,
	BDMA2_Channel3_IRQHandler,
	BDMA2_Channel4_IRQHandler,
	BDMA2_Channel5_IRQHandler,
	BDMA2_Channel6_IRQHandler,
	BDMA2_Channel7_IRQHandler,
	COMP_IRQHandler,
	LPTIM2_IRQHandler,
	LPTIM3_IRQHandler,
	UART9_IRQHandler,
	USART10_IRQHandler,
	LPUART1_IRQHandler,
	WWDG_RST_IRQHandler,
	CRS_IRQHandler,
	ECC_IRQHandler,
	0,
	DTS_IRQHandler,
	0,
	WAKEUP_PIN_IRQHandler,
	OCTOSPI2_IRQHandler,
	OTFDEC1_IRQHandler,
	OTFDEC2_IRQHandler,
	GFXMMU_IRQHandler,
	BDMA1_IRQHandler,
};


//Configure all GPIO ports A..I in one pass.
//Each port's register values are accumulated in local variables by the helper macros below
//and written to the hardware once at the end by APPLYPORT(). Pins that are never mentioned
//keep the defaults chosen in DEFPORT(): MODER bits all ones, everything else zero.
static void gpioInit(void)
{
	#define QSPI_SPEED		3
	#define MSIO_SPEED		3
	#define NAND_SPI_SPEED	3
	
	//if all goes well, this will compile down to just a few assignments, while letting us write readable code
	
	//building blocks
	//all shifted constants are unsigned: with a plain int, things like (3 << 30) or (1 << 31) overflow a signed int (undefined behaviour)
	#define DEFPORT(_nm)				uint32_t moder_##_nm = 0xffffffff, otyper_##_nm = 0, ospeedr_##_nm = 0, pupdr_##_nm = 0, bsrr_##_nm = 0; uint64_t afr_##_nm = 0
	//BSRR is written before MODER so that an output pin already has its requested level latched when it starts driving (no glitch on pins that must start high)
	#define APPLYPORT(_nm)				if (bsrr_##_nm) {GPIO##_nm->BSRR = bsrr_##_nm;} GPIO##_nm->MODER = moder_##_nm; GPIO##_nm->OTYPER = otyper_##_nm; GPIO##_nm->OSPEEDR = ospeedr_##_nm; GPIO##_nm->PUPDR = pupdr_##_nm; GPIO##_nm->AFR[0] = afr_##_nm; GPIO##_nm->AFR[1] = afr_##_nm >> 32;
	#define SETMODE(_nm, _idx, _mode)	moder_##_nm = (moder_##_nm &~ (3u << ((_idx) * 2))) | ((uint32_t)(_mode) << ((_idx) * 2))
	#define SETOTYPE(_nm, _idx, _otype)	otyper_##_nm = (otyper_##_nm &~ (1u << (_idx))) | ((uint32_t)(_otype) << (_idx))
	#define SETSPEED(_nm, _idx, _speed)	ospeedr_##_nm = (ospeedr_##_nm &~ (3u << ((_idx) * 2))) | ((uint32_t)(_speed) << ((_idx) * 2))
	#define SETPUPD(_nm, _idx, _pupd)	pupdr_##_nm = (pupdr_##_nm &~ (3u << ((_idx) * 2))) | ((uint32_t)(_pupd) << ((_idx) * 2))
	#define SETAFR(_nm, _idx, _afr)		afr_##_nm = (afr_##_nm &~ (0x0full << ((_idx) * 4))) | (((uint64_t)(_afr)) << ((_idx) * 4))
	//low half of BSRR sets a pin, high half resets it; clear both bits for the pin, then pick one
	#define SET_VAL(_nm, _idx, _hi)		bsrr_##_nm = (bsrr_##_nm &~ (0x10001u << (_idx))) | (1u << ((_idx) + ((_hi) ? 0 : 16)))
	
	//common cases
	#define CFG_AFR_PIN(_nm, _idx, _afr, _speed)	SETMODE(_nm, _idx, 2); SETSPEED(_nm, _idx, _speed); SETAFR(_nm, _idx, _afr)
	#define CFG_OUT_PIN(_nm, _idx, _speed, _state)	SETMODE(_nm, _idx, 1); SETSPEED(_nm, _idx, _speed); SET_VAL(_nm, _idx, _state)
	#define CFG_IN_PIN(_nm, _idx, _pupd)			SETMODE(_nm, _idx, 0); SETPUPD(_nm, _idx, _pupd)
	
	//define ports
	DEFPORT(A);
	DEFPORT(B);
	DEFPORT(C);
	DEFPORT(D);
	DEFPORT(E);
	DEFPORT(F);
	DEFPORT(G);
	DEFPORT(H);
	DEFPORT(I);
	
	//SWD: pull-up on A13, pull-down on A14
	//(the second SETPUPD used to name pin 13 again, which replaced A13's pull-up with a pull-down and left A14 without any pull)
	CFG_AFR_PIN(A, 13, 0, 3);
	SETPUPD(A, 13, 1);
	CFG_AFR_PIN(A, 14, 0, 3);
	SETPUPD(A, 14, 2);
	
	//QSPI
	CFG_AFR_PIN(B, 10, 9, QSPI_SPEED);
	CFG_AFR_PIN(B, 2, 9, QSPI_SPEED);
	CFG_AFR_PIN(B, 1, 11, QSPI_SPEED);
	CFG_AFR_PIN(B, 0, 11, QSPI_SPEED);
	CFG_AFR_PIN(C, 2, 9, QSPI_SPEED);
	CFG_AFR_PIN(A, 1, 9, QSPI_SPEED);
	
	//debug uart
	//NOTE(review): A11/A12 are reconfigured as plain outputs in the "debug spi3" section below, and the later setting is the one that gets applied
	CFG_AFR_PIN(A, 11, 6, 1);
	CFG_AFR_PIN(A, 12, 6, 1);
	
	//MSIO
	CFG_IN_PIN(A, 0, 0);
	CFG_AFR_PIN(A, 15, 5, MSIO_SPEED);
	CFG_AFR_PIN(B, 12, 5, MSIO_SPEED);
	CFG_AFR_PIN(A, 5, 5, MSIO_SPEED);
	CFG_AFR_PIN(A, 9, 5, MSIO_SPEED);
	CFG_AFR_PIN(B, 4, 5, MSIO_SPEED);
	CFG_AFR_PIN(B, 14, 5, MSIO_SPEED);
	CFG_AFR_PIN(C, 11, 6, MSIO_SPEED);
	CFG_AFR_PIN(B, 3, 6, MSIO_SPEED);
	CFG_AFR_PIN(A, 4, 6, MSIO_SPEED);
	CFG_IN_PIN(C, 10, 0);
	CFG_IN_PIN(A, 10, 0);
	CFG_IN_PIN(B, 15, 2);	//pull down on data is needed
	
	//debug spi3
	CFG_OUT_PIN(A, 11, 3, 0);
	CFG_OUT_PIN(A, 12, 3, 0);
	
	//SPI NAND
	CFG_AFR_PIN(A, 6, 8, NAND_SPI_SPEED);	//miso
	CFG_AFR_PIN(B, 5, 8, NAND_SPI_SPEED);	//mosi
	CFG_AFR_PIN(C, 12, 5, NAND_SPI_SPEED);	//sclk
	CFG_OUT_PIN(C, 15, NAND_SPI_SPEED, 1);	//nCS
	
	
	//apply values
	APPLYPORT(A);
	APPLYPORT(B);
	APPLYPORT(C);
	APPLYPORT(D);
	APPLYPORT(E);
	APPLYPORT(F);
	APPLYPORT(G);
	APPLYPORT(H);
	APPLYPORT(I);
	
	//cleanup
	#undef CFG_IN_PIN
	#undef CFG_OUT_PIN
	#undef CFG_AFR_PIN
	#undef SET_VAL
	#undef SETAFR
	#undef SETPUPD
	#undef SETSPEED
	#undef SETOTYPE
	#undef SETMODE
	#undef APPLYPORT
	#undef DEFPORT
}

//Bring up the clock tree: enable peripheral clocks, set flash wait states and the supply
//configuration, then move the system clock from HSI to PLL1.
//The statements are order-dependent (e.g. the code switches to HSI before touching the PLL and
//waits for PLL1RDY before selecting the PLL), so keep the sequence intact when editing.
static void clockTreeInit(void)
{
	//enable clocks to important places
	RCC->CKGAENR = 0;	//TODO: clock gate most things
	RCC->AHB1ENR = RCC_AHB1ENR_DMA1EN | RCC_AHB1ENR_CRCEN;
	RCC->AHB2ENR = RCC_AHB2ENR_RNGEN | RCC_AHB2ENR_AHBSRAM2EN | RCC_AHB2ENR_AHBSRAM1EN;
	RCC->AHB3ENR = RCC_AHB3ENR_IOMNGREN | RCC_AHB3ENR_OSPI2EN | RCC_AHB3ENR_FMCEN | RCC_AHB3ENR_GFXMMUEN;
	RCC->AHB4ENR = RCC_AHB4ENR_GPIOAEN | RCC_AHB4ENR_GPIOBEN | RCC_AHB4ENR_GPIOCEN | RCC_AHB4ENR_GPIODEN | RCC_AHB4ENR_GPIOEEN | RCC_AHB4ENR_GPIOFEN | RCC_AHB4ENR_GPIOGEN | RCC_AHB4ENR_GPIOHEN | RCC_AHB4ENR_GPIOIEN | RCC_AHB4ENR_GPIOJEN | RCC_AHB4ENR_GPIOKEN;
	RCC->APB1LENR = RCC_APB1LENR_SPI2EN | RCC_APB1LENR_SPI3EN | RCC_APB1LENR_UART4EN | RCC_APB1LENR_TIM2EN | RCC_APB1LENR_TIM5EN;
	RCC->APB2ENR = RCC_APB2ENR_SPI1EN;
	RCC->APB4ENR = RCC_APB4ENR_SYSCFGEN;
	
	//set flash wait states to 6
	FLASH->ACR = 0x36;	//for 280mhz
	
	//begin underdocumented SHIT
	//these three CR3 bit masks are defined locally (presumably absent from the device header in use - confirm)
#define PWR_CR3_SMPSLEVEL	0x30
#define PWR_CR3_SMPSEXTHP	0x08
#define PWR_CR3_SMPSEN		0x04

	//clear all supply-selection bits, then set only SMPSEN; spin until ACTVOSRDY is reported
	PWR->CR3 = (PWR->CR3 &~ (PWR_CR3_SMPSLEVEL | PWR_CR3_SMPSEXTHP | PWR_CR3_SMPSEN | PWR_CR3_LDOEN | PWR_CR3_BYPASS)) | PWR_CR3_SMPSEN;
	while(!(PWR->CSR1 & PWR_CSR1_ACTVOSRDY));
	//end underdocumented shit
	
	//VOS0
	PWR->SRDCR = PWR_SRDCR_VOS_1 | PWR_SRDCR_VOS_0;
	while(!(PWR->SRDCR & PWR_SRDCR_VOSRDY));	
	
	//first go to safe settings: HSI
	RCC->CR = RCC_CR_HSION;													//HSI on, PLL off
	RCC->CFGR = (RCC->CFGR &~ RCC_CFGR_SW_Msk) | RCC_CFGR_SW_HSI;			//switch to HSI
	
	//set up PLL1 to use HSI/32 = 2MHz as reference
	RCC->PLLCKSELR = RCC_PLLCKSELR_DIVM1_5 | RCC_PLLCKSELR_PLLSRC_HSI;
	
	//configure PLL1.P output, PLL1's range, produce 560MHz, output that over 2
	//PLL1.Q feeds spi units 1 2 and 3, same speed
	//NOTE(review): the P/Q/R fields are written as 1 for a divide-by-2, i.e. "divider minus one"; if the N field
	//uses the same encoding, (CPU_CLOCK_RATE / 1000000) multiplies by one more than intended - confirm against the reference manual
	RCC->PLLCFGR = RCC_PLLCFGR_DIVP1EN | RCC_PLLCFGR_DIVQ1EN | RCC_PLLCFGR_PLL1RGE_0;
	RCC->PLL1DIVR = ((CPU_CLOCK_RATE / 1000000) << RCC_PLL1DIVR_N1_Pos) | (1 << RCC_PLL1DIVR_P1_Pos) | (1 << RCC_PLL1DIVR_Q1_Pos) | (1 << RCC_PLL1DIVR_R1_Pos);
	
	//turn it on
	RCC->CR |= RCC_CR_PLL1ON;
	
	//while it is coming online, set up clock prescalers (all APBs at 140, all AHBs/AXI/CPU at 280)
	//NOTE(review): the CD prescalers use the "_0" bit macros while the SRD prescaler uses "_2"; if these are the usual
	//single-bit field definitions the two do not select the same divider - verify the CD APBs really run at half rate
	RCC->CDCFGR1 = RCC_CDCFGR1_CDPPRE_0;
	RCC->CDCFGR2 = RCC_CDCFGR2_CDPPRE1_0 | RCC_CDCFGR2_CDPPRE2_0;
	RCC->SRDCFGR = RCC_SRDCFGR_SRDPPRE_2;
	
	//wait for PLL to stabilize and then switch to it
	while (!(RCC->CR & RCC_CR_PLL1RDY));
	RCC->CFGR = (RCC->CFGR &~ RCC_CFGR_SW_Msk) | RCC_CFGR_SW_PLL1;			//switch to PLL
}

//Invalidate both CPU caches, then enable the instruction cache followed by the data cache
static void cachesInit(void)
{
	SCB_InvalidateDCache();
	SCB_InvalidateICache();
	SCB_EnableICache();
	SCB_EnableDCache();
}

//Set up UART4 as the debug UART: FIFO mode, transmitter and receiver on, 8x oversampling
static void dbgUartInit(void)
{
	UART4->CR1 = 0;			//everything off (including UE) before reconfiguring
	UART4->BRR = 0x10;		//baud rate divider; the resulting rate depends on the UART4 kernel clock
	UART4->CR1 = USART_CR1_FIFOEN | USART_CR1_TE | USART_CR1_RE | USART_CR1_OVER8;
	UART4->CR1 |= USART_CR1_UE;	//enable the unit in a separate write, after the configuration above
}

//Configure the GFXMMU: flush/invalidate its cache, program the four buffer base addresses and
//fill 1024 look-up-table entries, each with the same first word and an offset of i * 4096
static void mmuInit(void)
{
	uint_fast16_t i;
	
	GFXMMU->CR = GFXMMU_CR_PD;	//no prefetch, our accesses are not cacheable or bufferable, but we have cache here
	GFXMMU->CCR = GFXMMU_CCR_FI;
	while (GFXMMU->CCR & GFXMMU_CCR_FI);	//wait for the FI bit to read back as zero
	
	//each buffer is 4M
	//NOTE(review): B1CR and B2CR are given the same base (OCTOSPI2_BASE + 4M); confirm whether B2CR is intentionally a duplicate
	GFXMMU->B0CR = OCTOSPI2_BASE;
	GFXMMU->B1CR = OCTOSPI2_BASE + 0x0400000;
	GFXMMU->B2CR = OCTOSPI2_BASE + 0x0400000;
	GFXMMU->B3CR = 0x24000000;
	
	//two LUT words per line: [2i] is the per-line control word, [2i + 1] is the line's offset
	for (i = 0; i < 1024; i++) {
		
		GFXMMU->LUT[i * 2 + 0] = 0x00ff0001;	//whole line
		GFXMMU->LUT[i * 2 + 1] = i * 4096;
	}
}

//Idle hook: executes WFI followed by three NOPs
void machIdle(void)
{
	
	//XXX: disable for now to assist debugging
	//NOTE(review): despite the remark above, the wfi below is NOT disabled - the remark looks stale
	
	asm volatile("wfi\n\tnop\n\tnop\n\tnop");
}

//Fully program one MPU region.
//  idx  - region number, merged into the low bits of the RBAR value
//  addr - region base address
//  cfg  - value stored verbatim in RASR (permissions, memory type, size, enable, SRD bits)
void mpuRegCfg(uint32_t idx, uint32_t addr, uint32_t cfg)
{
	//bit 4 (0x10) is set together with the region number so that the RBAR write itself picks the region
	const uint32_t rbarVal = idx | 0x10 | addr;
	
	MPU->RBAR = rbarVal;
	MPU->RASR = cfg;
}

//Rewrite only the attribute word of an MPU region, leaving its base address alone.
//  idx - region number, selected through RNR
//  cfg - new value for RASR
void mpuRegCfgPermOnly(uint32_t idx, uint32_t cfg)
{
	MPU->RNR = idx;		//select the region first...
	MPU->RASR = cfg;	//...so that this write lands in that region
}

//Switch the storage RAM area (MPU region 3 at 0x25000000) between read-write and read-only.
//  writeable - true: user and supervisor may write; false: both are read-only
void machSetStorageAreaWriteable(bool writeable)
{
	uint32_t perms;
	
	if (writeable)
		perms = MPU_PERM_U_RW_S_RW;
	else
		perms = MPU_PERM_U_RO_S_RO;
	
	//an 8MB region with its 7th subregion disabled: 7M region
	mpuRegCfg(3, 0x25000000, perms | MPU_MEM_TYPE_RAM | MPU_FLAG_ENABLED | MPU_REGION_SZ_8MB | MPU_SRD_7th);
}

//If we are currently running in the HardFault handler (IPSR == 3), lower ourselves to
//BusFault level so that interrupts can be taken again; otherwise return immediately.
//This is done by hand-building an exception stack frame whose saved PSR carries exception
//number 5 (bus fault) and whose saved PC is our return address, and then performing an
//exception return through 0xfffffff1. Execution resumes in the caller.
//Only the PSR and PC slots of the frame are filled in; the six slots below them are merely
//reserved, so r0-r3, r12 and lr hold whatever was on the stack once we are back in the caller.
//Naked: there is no compiler-generated prologue/epilogue, the asm is the whole function.
static void __attribute__((naked)) dropPriv(void)
{
	asm volatile(
		"	mrs  r1, ipsr			\n\t"
		"	ubfx r1, r1, #0, #9		\n\t"
		"	cmp  r1, #3				\n\t"	//hard fault
		"	it   ne					\n\t"
		"	bxne lr					\n\t"	//not in a hard fault: nothing to do
		"	ldr  r0, =0x01000005	\n\t"	//bus fault
		"	bic  lr, #1				\n\t"	//return address without the thumb bit, for the frame's PC slot
		"	push {r0}				\n\t"	//PSR slot
		"	push {lr}				\n\t"	//PC slot
		"	sub  sp, sp, #4*6		\n\t"	//room for the remaining six frame words
		"	ldr  r0, =0xfffffff1	\n\t"	//this line used to end in "\n\r" (stray carriage return)
		"	bx   r0					\n\t"
		".ltorg						\n\t"
	);
}

//Bring up the external QSPI memory on OCTOSPI2:
//  1. route pins via OCTOSPIM and configure the controller at a slow clock
//  2. send enter-quad then exit-quad so the chip ends up in single-line mode
//  3. read and check the chip ID (fatal() if nothing answers or the ID is unknown)
//  4. raise the clock, reset the chip, enter quad mode
//  5. program the memory-mapped read and write commands and switch to memory-mapped mode
//  6. fill the whole 8MB window at 0x70000000 with 0xaa
//NOTE(review): nothing in this function ever clears the TCF flag, yet several waits below test only TCF.
//If the flag stays set after the first completed command, those later waits do not actually wait - confirm.
static void qspiInit(void)	//our max speed is 109mhz at 3.3v
{
	uint8_t chipid[8];
	uint_fast16_t i;
	
	//we use OSPI2 despite it not being officially present. #sosueme
	
	OCTOSPIM->PCR[0] = 0x07050333;	//this is actually for #1
	OCTOSPIM->PCR[1] = OCTOSPIM_PCR_IOLEN | OCTOSPIM_PCR_NCSEN | OCTOSPIM_PCR_CLKEN;	//this is actually for #2
	
	//as per docs
	SYSCFG->CCCSR &=~ SYSCFG_CCCSR_EN;
	
	//memory-mapped mode command templates (fully parenthesized, no trailing semicolon, so they are safe in any expression)
	#define TCR_RD		((6 << OCTOSPI_TCR_DCYC_Pos) | OCTOSPI_TCR_SSHIFT)
	#define CCR_RD		(OCTOSPI_CCR_DMODE_1 | OCTOSPI_CCR_DMODE_0 | OCTOSPI_CCR_ADSIZE_1 | OCTOSPI_CCR_ADMODE_1 | OCTOSPI_CCR_ADMODE_0 | OCTOSPI_CCR_IMODE_1 | OCTOSPI_CCR_IMODE_0)	//data on 4 lines, 24-bit address, on 4 lines, one-byte instruction on 4 lines)
	#define IR_RD		(0xeb)
	
	#define TCR_WR		((0 << OCTOSPI_TCR_DCYC_Pos) | OCTOSPI_TCR_SSHIFT)
	#define CCR_WR		(OCTOSPI_CCR_DQSE | OCTOSPI_CCR_DMODE_1 | OCTOSPI_CCR_DMODE_0 | OCTOSPI_CCR_ADSIZE_1 | OCTOSPI_CCR_ADMODE_1 | OCTOSPI_CCR_ADMODE_0 | OCTOSPI_CCR_IMODE_1 | OCTOSPI_CCR_IMODE_0)	//data on 4 lines, 24-bit address, on 4 lines, one-byte instruction on 4 lines, DQSE required due to erratum 2.6.8
	#define IR_WR		(0x02)
	
	//config io manager
	
	//config OCTSPI
	OCTOSPI2->DCR1 = OCTOSPI_DCR1_MTYP_1 | (22 << OCTOSPI_DCR1_DEVSIZE_Pos) | OCTOSPI_DCR1_DLYBYP | (1 << OCTOSPI_DCR1_CSHT_Pos);	//standard mode, 8MB device, no delay, mode 0
	OCTOSPI2->DCR3 = (5 << OCTOSPI_DCR3_CSBOUND_Pos);		//in theory: 1K hard address boundary. in practice we need to drain one cache line at a time!
	OCTOSPI2->DCR4 = (482 << OCTOSPI_DCR4_REFRESH_Pos);		//allow a refresh at least every 4us
	OCTOSPI2->TCR = OCTOSPI_TCR_SSHIFT;
	OCTOSPI2->LPTR = 1;
	OCTOSPI2->CR = OCTOSPI_CR_EN;
		
	//clock is kernel clock over 128
	OCTOSPI2->DCR2 = (127 << OCTOSPI_DCR2_PRESCALER_Pos);
	asm volatile("dsb sy");
	asm volatile("isb sy");

	//chip might be in quad mode or not in quad mode we do not know
	//so first ask for quad mode using a single-line command...
	OCTOSPI2->CR &=~ OCTOSPI_CR_FMODE;	//write mode
	OCTOSPI2->CCR = OCTOSPI_CCR_IMODE_0;	//one-byte instruction on 1 line
	OCTOSPI2->IR = 0x35;		//ENTER QUAD MODE
	while ((OCTOSPI2->SR & (OCTOSPI_SR_BUSY | OCTOSPI_SR_TCF)) != OCTOSPI_SR_TCF);
	
	//...then exit quad mode using a four-line command
	//exit quad mode - chip might still be in quad mode if it is powered on from before. pretty safe to issue an exit quad command in quad mode
	OCTOSPI2->CR &=~ OCTOSPI_CR_FMODE;	//write mode
	OCTOSPI2->CCR = OCTOSPI_CCR_IMODE_0 | OCTOSPI_CCR_IMODE_1;	//one-byte instruction on 4 lines
	OCTOSPI2->IR = 0xf5;		//EXIT QUAD MODE
	while ((OCTOSPI2->SR & (OCTOSPI_SR_BUSY | OCTOSPI_SR_TCF)) != OCTOSPI_SR_TCF);

	//read ID
	OCTOSPI2->CR = (OCTOSPI2->CR &~ OCTOSPI_CR_FMODE) | OCTOSPI_CR_FMODE_0;	//read mode
	OCTOSPI2->DLR = sizeof(chipid) - 1;		//length
	OCTOSPI2->CCR = OCTOSPI_CCR_DMODE_0 | OCTOSPI_CCR_ADSIZE_1 | OCTOSPI_CCR_ADMODE_0 | OCTOSPI_CCR_IMODE_0;	//data on single line, 24-bit address, on single line, one-byte instruction on single line
	OCTOSPI2->IR = 0x9f;	//READ ID
	OCTOSPI2->AR = 0;		//addr is zero
	
	//pull the ID out of the data register one byte at a time, waiting for FIFO data before each read
	for (i = 0; i < sizeof(chipid); i++) {
		
		while (!(OCTOSPI2->SR & OCTOSPI_SR_FLEVEL));
		chipid[i] = *(volatile uint8_t*)&OCTOSPI2->DR;
	}
	while ((OCTOSPI2->SR & (OCTOSPI_SR_BUSY | OCTOSPI_SR_TCF)) != OCTOSPI_SR_TCF);
	
	logi(" chip id [%u]: %02x %02x %02x %02x %02x %02x %02x %02x\n", i,
		chipid[0], chipid[1], chipid[2], chipid[3],
		chipid[4], chipid[5], chipid[6], chipid[7]);
		
	if (!chipid[0] && !chipid[1] && !chipid[2] && !chipid[3] && !chipid[4] && !chipid[5] && !chipid[6] && !chipid[7]) {
		fatal("QSPI: no chip here\n");
	}
	
	//only IDs starting with 0x0d 0x5d are accepted
	if (chipid[0] != 0x0d || chipid[1] != 0x5d) {
		fatal("QSPI: not a known memory chip\n");
	}
	
	logi("QSPI: will use this 8MB chip\n");
	
	OCTOSPI2->DCR2 = (2 << OCTOSPI_DCR2_PRESCALER_Pos); //clock is kernel clock over 3 (93 MHz-ish)
	asm volatile("dsb sy");
	asm volatile("isb sy");
	
	//reset it
	OCTOSPI2->CR &=~ OCTOSPI_CR_FMODE;	//write mode
	OCTOSPI2->CCR = OCTOSPI_CCR_IMODE_0;	//one-byte instruction on single line
	OCTOSPI2->IR = 0x66;	//RESET ENABLE
	while (!(OCTOSPI2->SR & OCTOSPI_SR_TCF));

	//XXX: this was not needed before
	while (OCTOSPI2->SR & OCTOSPI_SR_FLEVEL)
		(void)*(volatile uint8_t*)&OCTOSPI2->DR;
	
	OCTOSPI2->IR = 0x99;		//RESET PERFORM
	while (!(OCTOSPI2->SR & OCTOSPI_SR_TCF));

	//XXX: this was not needed before
	while (OCTOSPI2->SR & OCTOSPI_SR_FLEVEL)
		(void)*(volatile uint8_t*)&OCTOSPI2->DR;
	
	OCTOSPI2->IR = 0x35;		//ENTER QUAD MODE
	while (!(OCTOSPI2->SR & OCTOSPI_SR_TCF));

	//XXX: this was not needed before
	while (OCTOSPI2->SR & OCTOSPI_SR_FLEVEL)
		(void)*(volatile uint8_t*)&OCTOSPI2->DR;
	
	logi("QSPI config commands...\n");
	//configure read commands for MMIO mode
	OCTOSPI2->TCR = TCR_RD;
	OCTOSPI2->CCR = CCR_RD;
	OCTOSPI2->IR = IR_RD;
	
	//configure write commands for MMIO mode
	OCTOSPI2->WTCR = TCR_WR;
	OCTOSPI2->WCCR = CCR_WR;
	OCTOSPI2->WIR = IR_WR;
	
	logi("QSPI mmio on...\n");
	//enable MMIO mode
	OCTOSPI2->CR |= OCTOSPI_CR_FMODE_1 | OCTOSPI_CR_FMODE_0;	//MMIO mode
	asm volatile("dsb sy");
	asm volatile("isb sy");
	
	//for now
	//(the log says "zero" but the fill byte is 0xaa)
	logi("QSPI  zero...\n");
	memset((void*)0x70000000, 0xaa, 8 << 20);
	
	logi("QSPI  ready\n");
	
	//cleanup
	#undef IR_WR
	#undef CCR_WR
	#undef TCR_WR
	#undef IR_RD
	#undef CCR_RD
	#undef TCR_RD
}

//Machine-specific init hook, called once per boot stage.
//  stage - which STAGE_* step to perform; stages not listed below are ignored
//  data  - stage-specific pointer; only STAGE_INIT_INTERRUPTS reads it (as struct MachInitDataInterrupts)
//Stages handled here:
//  STAGE_INIT_EARLY        - quiesce interrupts, then clocks, caches, GPIO, GFXMMU, QSPI and debug UART
//  STAGE_INIT_SET_VTOR     - point VTOR at __ISR_VECTORS
//  STAGE_SETUP_HEAPS       - register the CCM and static memory heaps
//  STAGE_INIT_MPU          - program MPU regions 0..15 and enable the MPU
//  STAGE_INIT_INTERRUPTS   - assign interrupt priorities and start MSIO comms
//  STAGE_CRASH_LOCKS_BREAK - crash-time cleanup so that comms can still be serviced
void __attribute__((used)) machInit(uint32_t stage, const void* data)
{
	if (stage == STAGE_INIT_EARLY) {	//no globals/vectors/anything yet!!!
		
		uint_fast16_t i;
		
		//XXX: fix what our loader may have caused, not needed normally here
		for (i = 0; i < CPU_NUM_IRQS; i++)
			NVIC_DisableIRQ(i);
		SysTick->CTRL = 0;
		asm volatile("cpsie if");
		
		//enable debugging help (this costs power)
		//NOTE(review): loge() is used here before clockTreeInit() and dbgUartInit() have run - confirm the log path does not depend on them
		loge("warning: setting debugging helper bits at huge costs to power\n");
		DBGMCU->CR |= DBGMCU_CR_DBG_CKSRDEN | DBGMCU_CR_DBG_CKCDEN | DBGMCU_CR_DBG_TRACECKEN | DBGMCU_CR_DBG_STOPSRD | DBGMCU_CR_DBG_STANDBYSRD | DBGMCU_CR_DBG_STANDBYCD | DBGMCU_CR_DBG_STOPCD | DBGMCU_CR_DBG_SLEEPCD;
		loge("warning: set debugging helper bits at huge costs to power\n");
		
		/*
			STM32H7 has an issue in the QSPI controller wherein it hangs the bus when using QSPI in memory
			mapped mode. It is not really clear how or why. Marking the controller as Device Memory or Strongly
			Ordered memory avoids the issue, but both of those memory types are uncached. BAD!
			
			This is a cleverer workaround - it limits the number of AXI outstanding writes that the CPU may
			have from the stock 32 to 1. Yes! To ALL Targets, even though only QSPI is the issue. Trying to
			do the same on the target side (QSPI) does *NOT* fix the issue, so we're forced to limit at the
			CPU itself. Ouch!
			
			Also the QSPI controller cannot really be properly written to EVER. Writes get lost, cache or not
			piping it through GFXMMU avoids this at a speed cost
		*/
		GPV->AXI_INI2_FN_MOD = 2;
		loge("bus set up\n");
		
		clockTreeInit();
		cachesInit();
		gpioInit();
		mmuInit();
		qspiInit();
		dbgUartInit();
	}
	else if (stage == STAGE_INIT_SET_VTOR) {
		
		SCB->VTOR = (uint32_t)__ISR_VECTORS;
	}
	else if (stage == STAGE_SETUP_HEAPS) {
		
		//register CCM first so it is checked first
		kheapRegisterHeap(HAL_CCM_MEM_BASE, HAL_CCM_MEM_SIZE, MEM_USABLE_AS_STACK | MEM_FAST);
		kheapRegisterHeap(HAL_STATIC_MEM_BASE, HAL_STATIC_MEM_SIZE, MEM_USABLE_AS_STACK | MEM_USABLE_FOR_DMA | MEM_USABLE_FOR_EXEC | MEM_FAST);
	}
	else if (stage == STAGE_INIT_MPU) {

		//regions are programmed in ascending number; several of them overlap on purpose
		//(e.g. region 10 sits inside region 1, region 7 inside region 6)

		//everything is disabled unless otherwise specified
		mpuRegCfg(0, 0x00000000, MPU_PERM_U_XX_S_XX | MPU_PERM_NX | MPU_MEM_TYPE_UNCACHED_RAM | MPU_FLAG_ENABLED | MPU_REGION_SZ_4GB);
		
		//ITCM (jit tc and msio fast code) worry not, NULL region defined later...
		mpuRegCfg(1, 0x00000000, MPU_PERM_U_RW_S_RW | MPU_MEM_TYPE_UNCACHED_RAM | MPU_FLAG_ENABLED | MPU_REGION_SZ_64KB);
		
		//dyn ram region maps 0x25800000 + 0x00800000, SRD limits that to 0x25b00000 + 0x00200000, we need 0x25b00000 + 0x0015FE00
		mpuRegCfg(2, 0x25800000, MPU_PERM_U_RW_S_RW | MPU_MEM_TYPE_RAM | MPU_FLAG_ENABLED | MPU_REGION_SZ_8MB | MPU_SRD_7th | MPU_SRD_6th | MPU_SRD_5th | MPU_SRD_2nd | MPU_SRD_1st | MPU_SRD_0th);
		
		//storage ram
		//(region 3, starts out read-only)
		machSetStorageAreaWriteable(false);
		
		//Periphs
		mpuRegCfg(4, 0x40000000, MPU_PERM_U_RW_S_RW | MPU_MEM_TYPE_DEVICE | MPU_PERM_NX | MPU_FLAG_ENABLED | MPU_REGION_SZ_512MB);
		
		//VRAM's temp space (overlaid by vram that is protected occasionally by disp.c)
		mpuRegCfg(5, 0x24000000, MPU_PERM_U_RW_S_RW | MPU_MEM_TYPE_RAM | MPU_PERM_NX | MPU_FLAG_ENABLED | MPU_REGION_SZ_1MB | MPU_SRD_1st | MPU_SRD_0th);
		
		//AHB ram (hal static mem)
		mpuRegCfg(6, 0x30000000, MPU_PERM_U_RW_S_RW | MPU_MEM_TYPE_RAM | MPU_FLAG_ENABLED | MPU_REGION_SZ_128KB);
		
		//uncached chunk of ahb memory for msio
		mpuRegCfg(7, 0x30000000, MPU_PERM_U_RW_S_RW | MPU_MEM_TYPE_UNCACHED_RAM | MPU_FLAG_ENABLED | MPU_REGION_SZ_4KB | MPU_SRD_7th | MPU_SRD_6th);

		//DTCM ram (stacks, kernel data and bss), ZWT
		mpuRegCfg(8, 0x20000000, MPU_PERM_U_RW_S_RW | MPU_MEM_TYPE_UNCACHED_RAM | MPU_PERM_NX | MPU_FLAG_ENABLED | MPU_REGION_SZ_128KB);
		
		//ROM
		mpuRegCfg(9, 0x08000000, MPU_PERM_U_RO_S_RO | MPU_MEM_TYPE_ROM | MPU_FLAG_ENABLED | MPU_REGION_SZ_2MB);
		
		//a region for NULL faulting
		mpuRegCfg(10, 0x00000000, MPU_PERM_U_XX_S_XX | MPU_PERM_NX | MPU_MEM_TYPE_UNCACHED_RAM | MPU_FLAG_ENABLED | MPU_REGION_SZ_1KB);

		//CPUID
		mpuRegCfg(11, UID_BASE, MPU_PERM_U_RO_S_RO | MPU_MEM_TYPE_ROM | MPU_FLAG_ENABLED | MPU_REGION_SZ_32B);
		
		//spare
		mpuRegCfg(12, 0, 0);
		
		//spare
		mpuRegCfg(13, 0, 0);
		
		//region 14 is used for display tracking (the high region number is needed to overrule all other rules)
		mpuRegCfg(14, 0, 0);
		
		//region 15 is used for stack guard
		mpuRegCfg(15, 0, 0);
		
		//SCS/SCB/etc in 0xE0000000..0xE0100000 is always accessed using the default map
		
		//mpu on
		MPU->CTRL = MPU_CTRL_ENABLE_Msk | MPU_CTRL_HFNMIENA_Msk;
	}
	else if (stage == STAGE_INIT_INTERRUPTS) {
		
		//priorities below are all derived from the values handed to us in *data
		const struct MachInitDataInterrupts *info = (const struct MachInitDataInterrupts*)data;
		uint32_t normalPrio = info->lowestAllowablePrio + 3;
				
		//msio irqs need HWSVC prio
		NVIC_SetPriority(EXTI0_IRQn, info->hardwareServicesPrio);
		NVIC_SetPriority(EXTI15_10_IRQn, info->hardwareServicesPrio);
		
		//the irq used to service parsed packets is high prio
		NVIC_SetPriority(MSIO_IRQn, info->lowestAllowablePrio);
		
		//scheduler timer interrupt is high prio too so nobody else can interrupt it (and more importantly - it will not interrupt syscalls and vice versa)
		NVIC_SetPriority(TIM2_IRQn, info->schedulingTimerPrio);
		
		//audio is realtime and thus highest prio
		NVIC_SetPriority(DMA2_Stream5_IRQn, info->lowestAllowablePrio + 1);
		
		//lcd copying int is high prio as well. safe since it doesnt touch any structs
		NVIC_SetPriority(TIM5_IRQn, info->lowestAllowablePrio + 2);
		
		//set all HW ints to medium prio
		NVIC_SetPriority(RTC_WKUP_IRQn, normalPrio);
		NVIC_SetPriority(RTC_Alarm_IRQn, normalPrio);
		
		msioCommsInit();
	}
	else if (stage == STAGE_CRASH_LOCKS_BREAK) {
		
		//tell it to not send input to PalmOS (who knows what state it's in)
		msioCommsBreakLocks();
		
		//scheduling timer off and lcd refresh timer off
		NVIC_DisableIRQ(TIM2_IRQn);
		NVIC_DisableIRQ(TIM5_IRQn);
		NVIC_ClearPendingIRQ(TIM2_IRQn);
		NVIC_ClearPendingIRQ(TIM5_IRQn);
		
		//set comms irq prio high enough
		dropPriv();	//if hard fault, drop to bus-fault prio to allow interrupts
		
		NVIC_SetPriority(EXTI0_IRQn, 0);
		NVIC_SetPriority(EXTI15_10_IRQn, 0);
		
		//with its interrupt disabled above, the TIM5 handler is invoked once by hand
		TIM5_IRQHandler();
	}
}

//Spin until at least `ticks` timer ticks have elapsed, as measured by timerGetTime()
static void machBusyWaitDelay(uint64_t ticks)
{
	const uint64_t t0 = timerGetTime();
	
	for (;;) {
		
		uint64_t elapsed = timerGetTime() - t0;
		
		if (elapsed >= ticks)
			break;
	}
}

//Busy-wait for the given number of milliseconds
void machBusyWaitDelayMsec(uint32_t msec)
{
	//widen before multiplying so large delays cannot overflow 32 bits
	const uint64_t ticks = (uint64_t)msec * TIMER_TICKS_PER_MSEC;
	
	machBusyWaitDelay(ticks);
}

//Busy-wait for at least the given number of microseconds.
//The tick count is computed as usec * ticks-per-msec / 1000 with the multiplication done first
//(in 64 bits) and the result rounded up. Dividing TIMER_TICKS_PER_MSEC by 1000 first would
//truncate whenever it is not a multiple of 1000 (and collapse to zero below 1000), making the
//delay too short. When it is a multiple of 1000 the result is the same as before.
void machBusyWaitDelayUsec(uint32_t usec)
{
	machBusyWaitDelay(((uint64_t)usec * (TIMER_TICKS_PER_MSEC) + 999) / 1000);
}

//Provide hardware-derived ROM tokens. Only the 'snum' (serial number) token is handled.
//  name  - four-character token code
//  dataP - if non-NULL, receives a pointer to the token data
//  szP   - if non-NULL, receives the token size in bytes
//Returns true if the token was provided, false for any other token name.
//The serial number is 12 characters (not NUL-terminated), built once from the words at UID_BASE and then cached.
bool hwMaybeGetRomToken(uint32_t name, const void **dataP, uint16_t *szP)
{
	static char snumStr[12];
	
	if (name != CREATE_4CC('s','n','u','m'))
		return false;
	
	//first character still zero means the string has not been generated yet
	if (snumStr[0] == 0) {
		
		static const char *base32 = "23456789ABCDEFGHJKLMNPQRSTUVWXYZ";
		volatile uint32_t *uid = (volatile uint32_t*)UID_BASE;
		uint32_t lo, hi, pos = 0;
		uint64_t bits;
		
		//fold the three ID words into 64 bits
		lo = uid[0] ^ uid[1];
		hi = uid[1] ^ uid[2];
		bits = (((uint64_t)hi) << 32) + lo;
		
		//emit five bits per character, least significant first
		while (pos < sizeof(snumStr)) {
			
			snumStr[pos] = base32[bits & 31];
			pos++;
			bits >>= 5;
		}
	}
	
	if (dataP)
		*dataP = snumStr;
	if (szP)
		*szP = sizeof(snumStr);
	
	return true;
}

//Report the hardware misc flags taken from the boot parameters returned by msioCommsWaitForContinueBoot().
//  miscFlagsP    - if non-NULL, receives hwrMiscFlags
//  extMiscFlagsP - if non-NULL, receives hwrMiscExtFlags
void hwGetMiscFlags(uint16_t *miscFlagsP, uint16_t *extMiscFlagsP)
{
	const struct MsioPktBootParams *bootParams = msioCommsWaitForContinueBoot();

	if (miscFlagsP) {
		*miscFlagsP = bootParams->hwrMiscFlags;
	}
	
	if (extMiscFlagsP) {
		*extMiscFlagsP = bootParams->hwrMiscExtFlags;
	}
}

//Return the clock rate of the requested device.
//The CPU and the timer both report CPU_CLOCK_RATE; any other device yields -1.
int32_t cpuGetClockRate(enum ClockRateDevice dev)
{
	if (dev == CpuClockRate || dev == TimerClockRate)
		return CPU_CLOCK_RATE;
	
	return -1;
}

//Power-control hook. No selector is supported on this machine: the arguments are ignored,
//nothing is written through oldValP, and the function always returns false.
bool hwPwrCtl(uint32_t selector, const uint32_t *newValP, uint32_t *oldValP)
{
	(void)selector;
	(void)newValP;
	(void)oldValP;
	
	return false;
}

//Sleep hook. Real sleep is not implemented: this only logs, delays via SysTaskDelay(10000),
//logs again and then calls dalModifyWakeFlags(DAL_WAKE_FLAG_GENERAL, 0)
void machSleep(void)
{
	logi("pretending to sleep\n");
	SysTaskDelay(10000);
	logi("waking up\n");
	dalModifyWakeFlags(DAL_WAKE_FLAG_GENERAL, 0);
	//nothing yet
}

//Device reset hook. Not implemented on this machine: the argument is ignored and fatal() is called.
void deviceReset(bool doHardReset)
{
	(void)doHardReset;
	
	fatal("not implemented: reset\n");
}

//Machine-specific PACE dispatch hook. This machine provides no calls: all arguments are
//ignored, *ret68kP is left untouched and sysErrNotAllowed is returned.
Err machinePaceDispatch(EmulStateRef ref, uint16_t call, Err *ret68kP)
{
	(void)ref;
	(void)call;
	(void)ret68kP;
	
	return sysErrNotAllowed;
}

