#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "audiohw.h"
#include "printf.h"
#include "heap.h"
#include "irqs.h"
#include "ral.h"
#include "cpu.h"
#include "../dma_driver/DmaDriver.h"

//Number of 16-bit entries in the circular DMA buffer. It is refilled one half (AUDIO_BUF_SZ / 2 samples) at a
// time while DMA plays the other half, so each half must be a multiple of 8 samples (see audioOutHwConvertSamples)
#define AUDIO_BUF_SZ				2048

//our cpu speed is precisely 4096 * high quality sampling rate speed
// (one PWM period is AUDIO_MAX_VALID_VAL timer counts, since audioOutHwInit sets ARR to AUDIO_MAX_VALID_VAL - 1)

//#define LOWER_QUALITY			//uses lower sampling rate and thus less CPU




//AUDIO_BIT_DEPTH is the PWM resolution in bits: the converter saturates samples to this many bits.
//AUDIO_NATIVE_RATE is the rate reported to the client by audioOutHwInit.
//LOWER_QUALITY trades one extra bit of depth (13 vs 12) for half the sample rate.
//OVERCLOCK selects the 48000/24000 rates instead of 44100/22050.
#ifdef LOWER_QUALITY
	#define AUDIO_BIT_DEPTH						13
	#ifdef OVERCLOCK
		#define AUDIO_NATIVE_RATE					AudioRate24000
	#else
		#define AUDIO_NATIVE_RATE					AudioRate22050
	#endif
#else
	#define AUDIO_BIT_DEPTH						12
	#ifdef OVERCLOCK
		#define AUDIO_NATIVE_RATE					AudioRate48000
	#else
		#define AUDIO_NATIVE_RATE					AudioRate44100
	#endif
#endif

//AUDIO_MAX_VALID_VAL: number of timer counts in one PWM period (ARR is programmed to this minus one)
//AUDIO_MIDDLE_VALUE: 50% duty cycle; used as the "silence" level and as the offset added to signed samples
#define AUDIO_MAX_VALID_VAL						(1 << AUDIO_BIT_DEPTH)
#define AUDIO_MIDDLE_VALUE						(AUDIO_MAX_VALID_VAL / 2)



//client callback saved by audioOutHwInit. audioOutHwPrvReqData calls it with "true" to obtain the source
// samples for the next half-buffer, and then again with "false" once they have been converted
static AudioOutHwReadyForMoreSamplesF mReadyForSamplesF;
//last state requested through audioOutHwSetState (read and updated there with irqs off)
static bool mCurOnState = false;
//DMA stream handle obtained from DmaLibStreamReserve(2, 5) in audioOutHwInit
static DmaStream mDmaStream;
//AUDIO_BUF_SZ entries of PWM duty values, allocated as MEM_USABLE_FOR_DMA; DMA copies them to TIM1->CCR1
static uint16_t *mDmaMem;

//Converts "num" signed 32-bit samples at "src" into unsigned 16-bit PWM duty values at "dst".
// Per sample: arithmetic shift right by (24 - AUDIO_BIT_DEPTH), signed-saturate to AUDIO_BIT_DEPTH bits,
// then add AUDIO_MIDDLE_VALUE, which yields a value in 0 .. AUDIO_MAX_VALID_VAL - 1.
// Eight samples are handled per loop iteration: 8 words loaded, pairs packed into 4 words, 4 words stored.
//
//this func assumes that num is divisible by 8, else things will break!
// (the loop subtracts 8 from num and only stops when it reaches exactly zero, so num == 0 is not safe either)
//it also assumes that destination is 4(!!!)-byte aligned on v7E
// (results are written with a multi-word store)
//
//The function is naked: the asm uses r0/r1/r2 directly as dst/src/num (the ARM procedure-call register
// assignment), saves r4-r9 and lr itself, and returns by popping pc.
//NOTE(review): the source samples are presumably 24-bit-range signed values (hence the 24 - depth shift) - confirm with the mixer
//NOTE(review): GCC documents that only basic asm is safe inside naked functions; this uses extended asm
// with immediate ("I") operands only - confirm it still builds as intended on toolchain upgrades
static void __attribute__((naked)) audioOutHwConvertSamples(uint16_t *dst, const int32_t *src, uint32_t num)
{
	asm volatile(
		"	push    {r4-r9, lr}				\n\t"
		//r12 = AUDIO_MIDDLE_VALUE, the offset added to every sample
		"	mov     r12, %2					\n\t"
		"1:									\n\t"
		//load 8 source samples and advance src
		"	ldmia   r1!, {r3-r9,lr}			\n\t"
		//scale down and clamp each sample to a signed AUDIO_BIT_DEPTH-bit value
		"	ssat    r3, %0, r3, asr %1		\n\t"
		"	ssat    r4, %0, r4, asr %1		\n\t"
		"	ssat    r5, %0, r5, asr %1		\n\t"
		"	ssat    r6, %0, r6, asr %1		\n\t"
		"	ssat    r7, %0, r7, asr %1		\n\t"
		"	ssat    r8, %0, r8, asr %1		\n\t"
		"	ssat    r9, %0, r9, asr %1		\n\t"
		"	ssat    lr, %0, lr, asr %1		\n\t"
		//re-center around the middle so that values are non-negative
		"	add     r3, r12					\n\t"
		"	add     r4, r12					\n\t"
		"	add     r5, r12					\n\t"
		"	add     r6, r12					\n\t"
		"	add     r7, r12					\n\t"
		"	add     r8, r12					\n\t"
		"	add     r9, r12					\n\t"
		"	add     lr, r12					\n\t"
		//pack two 16-bit results per word: earlier sample in the low half, later sample in the high half
		"	pkhbt   r3,  r3,  r4, LSL #16	\n\t"
		"	pkhbt   r5,  r5,  r6, LSL #16	\n\t"
		"	pkhbt   r7,  r7,  r8, LSL #16	\n\t"
		"	pkhbt   r9,  r9,  lr, LSL #16	\n\t"
		//store the 8 packed results and advance dst
		"	stmia   r0!, {r3, r5, r7, r9}	\n\t"
		//8 samples done; loop until the count hits exactly zero
		"	subs    r2, #8					\n\t"
		"	bne     1b						\n\t"
		"	pop     {r4-r9, pc}				\n\t"
		".ltorg								\n\t"
		:
		:"I"(AUDIO_BIT_DEPTH), "I"(24 - AUDIO_BIT_DEPTH), "I"(AUDIO_MIDDLE_VALUE)
		:"memory","cc"
	);
}

//Refills one half of the DMA buffer with freshly converted samples from the client.
// secondHalf: false -> fill entries [0, AUDIO_BUF_SZ / 2), true -> fill entries [AUDIO_BUF_SZ / 2, AUDIO_BUF_SZ)
//The client callback is invoked with "true" to get the source samples and with "false" after conversion.
//Everything (including reads of our globals) happens between ralSetSafeR9() and ralRestoreR9().
static void audioOutHwPrvReqData(bool secondHalf)
{
	const uint32_t savedR9 = ralSetSafeR9();
	const int32_t *samples = mReadyForSamplesF(true);
	uint16_t *halfStart = mDmaMem;

	if (secondHalf)
		halfStart += AUDIO_BUF_SZ / 2;

	audioOutHwConvertSamples(halfStart, samples, AUDIO_BUF_SZ / 2);
	mReadyForSamplesF(false);

	ralRestoreR9(savedR9);
}

//DMA stream interrupt handler (registered in audioOutHwInit with NULL user data).
// half-transfer: DMA has just finished the first half, so refill the first half
// transfer-done: DMA has just finished the second half, so refill the second half
//If both flags are reported at once, only the half-transfer case is serviced.
static void audioOutHwPrvDmaIrq(void* userData, uint32_t strmSta)
{
	(void)userData;		//unused

	if (strmSta & DMA_STRM_IRQ_HALF) {
		audioOutHwPrvReqData(false);
		return;
	}

	if (strmSta & DMA_STRM_IRQ_DONE)
		audioOutHwPrvReqData(true);
}

//Connects (enabled) or disconnects (!enabled) the two PWM pins, A8 and B13, by rewriting only their MODER fields.
// enabled:  mode 2 (alternate function; the AF number itself is programmed once in audioOutHwInit)
// disabled: mode 0 (input; the pulldowns programmed in audioOutHwInit then apply)
//Do not set what is already set, or you'll get clicks - callers must only invoke this on a real state change.
static void audioOutHwGpioCfg(bool enabled)
{
	const uint32_t pinA8Pos = 8 * 2, pinB13Pos = 13 * 2;	//each pin owns a 2-bit field in MODER
	const uint32_t modeBits = enabled ? 2 : 0;

	GPIOA->MODER = (GPIOA->MODER &~ (3u << pinA8Pos)) | (modeBits << pinA8Pos);
	GPIOB->MODER = (GPIOB->MODER &~ (3u << pinB13Pos)) | (modeBits << pinB13Pos);
}

//Sets up TIM1 channel 1 as a PWM audio output fed by a circular DMA stream, and tells the caller what format we want.
// readyForSamplesF:  client callback used later (from the DMA irq) to fetch samples; saved in mReadyForSamplesF
// numSamplesPerBufP: receives AUDIO_BUF_SZ / 2 (client provides half a hardware buffer at a time)
// nativeRateP:       receives AUDIO_NATIVE_RATE
// nativeStereoP:     receives false (output is mono)
//Returns true on success. Returns false if the DMA stream could not be reserved or configured, or if the
// buffer could not be allocated; in the latter cases the reserved stream is released again. The out-params are
// only written on success. Output stays off (pins are inputs) until audioOutHwSetState(true) is called.
//
//value in CCR is how many cycles out of 4096/8192 are to be high. so accepted values are 0..4096/8192
bool audioOutHwInit(AudioOutHwReadyForMoreSamplesF readyForSamplesF, uint32_t *numSamplesPerBufP, enum AudioSampleRate* nativeRateP, bool *nativeStereoP)
{
	static const struct DmaStreamUserCfg audioDmaCfg = {
		.magic = CFG_STRUCT_MAGIX,
		.chan = 6,
		.circBuf = 1,									//wrap around forever; we refill behind the DMA
		.prio = 2,
		.perSz = __builtin_ctz(sizeof(*mDmaMem)),		//log2 of the element size in bytes (16-bit items)
		.memSz = __builtin_ctz(sizeof(*mDmaMem)),
		.memIncr = true,
		.toMem = 0,										//direction is memory -> peripheral (TIM1->CCR1)
		.numItems = AUDIO_BUF_SZ,
	};
	
	TIM1->CR1 = TIM_CR1_ARPE | TIM_CR1_URS; 			//upcount, edge-aligned mode, ARR preloaded, dma req only on overflow
	TIM1->CR2 = TIM_CR2_CCDS;							//dma req when update event occurs
	TIM1->SMCR = 0;
	TIM1->CCMR1 = TIM_CCMR1_OC1M_2 | TIM_CCMR1_OC1M_1 | TIM_CCMR1_OC1PE;	//PWM mode with high for as many cycles as CCR says
	TIM1->CCER = TIM_CCER_CC1NE | TIM_CCER_CC1E;		//OC1 and OC1N both enabled for output, active high	(XXX: might one need to be active low?)
	
	TIM1->RCR = 0;										//do a dma request every 1 PWM cycle
	
	TIM1->CNT = 0;
	TIM1->ARR = AUDIO_MAX_VALID_VAL - 1;				//count is inclusive so range is 0..AUDIO_MAX_VALID_VAL - 1
	TIM1->CCR1 = AUDIO_MIDDLE_VALUE;					//duty cycle is not inclusive
	
	TIM1->CR1 |= TIM_CR1_CEN;							//turn timer on
	TIM1->BDTR = TIM_BDTR_MOE;							//pwm output on

	//audioOutHwConvertSamples processes 8 samples per loop and is called on half a buffer at a time
	if ((AUDIO_BUF_SZ / 2) & 7)
		fatal("we REQUIRE hardware buffer to be a multiple of 8 samples in size for speed\n");
	
	mReadyForSamplesF = readyForSamplesF;
	
	mDmaStream = DmaLibStreamReserve(2, 5);
	if (!mDmaStream) {
		logw("Audio failed to grab our DMA stream\n");
		goto out_err;
	}
	
	if (!DmaLibStreamConfigure(mDmaStream, &audioDmaCfg)) {
		logw("Audio failed to configure our stream\n");
		goto out_free_dma;
	}

	if (!DmaLibStreamSetIrqHandler(mDmaStream, audioOutHwPrvDmaIrq, NULL)) {
		logw("Audio failed to configure irq handler\n");
		goto out_free_dma;
	}

	//DMA writes each sample straight into the PWM duty register
	if (!DmaLibStreamSetPeriphAddr(mDmaStream, (uintptr_t)&TIM1->CCR1)) {
		logw("Audio failed to configure irq DADDR\n");
		goto out_free_dma;
	}

	mDmaMem = kheapAllocEx(AUDIO_BUF_SZ * sizeof(*mDmaMem), MEM_USABLE_FOR_DMA);
	if (!mDmaMem) {
		logw("Audio failed to get buffer memory\n");
		goto out_free_dma;
	}
	
	*numSamplesPerBufP = AUDIO_BUF_SZ / 2;	//client writes half at a time (while dma plays the other half)
	*nativeRateP = AUDIO_NATIVE_RATE;
	*nativeStereoP = false;
	
	//gpio configs (things that do not change, and thus needed for "off" state)
	//B13 & A8, input, very high speed, AF=1, pulldown
	// (audioOutHwGpioCfg later only flips MODER between input and alternate function)
	
	//input
	GPIOA->MODER &=~ (3 << (8 * 2));
	GPIOB->MODER &=~ (3 << (13 * 2));
	//very fast
	GPIOA->OSPEEDR |= (3 << (8 * 2));
	GPIOB->OSPEEDR |= (3 << (13 * 2));
	//pull down
	GPIOA->PUPDR = (GPIOA->PUPDR &~ (3 << (8 * 2))) | (2 << (8 * 2));
	GPIOB->PUPDR = (GPIOB->PUPDR &~ (3 << (13 * 2))) | (2 << (13 * 2));
	//AFR1 (pins 8..15 live in AFR[1], 4 bits per pin)
	GPIOA->AFR[1] = (GPIOA->AFR[1] &~ (0x0f << ((8 - 8) * 4))) | (0x01 << ((8 - 8) * 4));
	GPIOB->AFR[1] = (GPIOB->AFR[1] &~ (0x0f << ((13 - 8) * 4))) | (0x01 << ((13 - 8) * 4));
	
	return true;

out_free_dma:
	if (!DmaLibStreamRelease(mDmaStream))
		logw("Audio failed to release our dma stream\n");

out_err:
	return false;
}

//Turns audio output on or off. Does nothing if the requested state equals the current one.
// on:  rewind DMA to the start of the buffer, pre-fill the buffer with silence, enable the stream and its
//      half/done irqs, connect the pins, and finally let the timer issue DMA requests
// off: disconnect the pins, stop timer DMA requests, disable the stream irqs and the stream, and park the
//      duty cycle at the middle level
//Failures of the DMA library calls are logged but otherwise ignored.
void audioOutHwSetState(bool on)
{
	irq_state_t irqSta;
	bool stateFlipped;
	uint16_t *cur, *end;
	
	//test-and-update of the current state must not be interrupted
	irqSta = irqsAllOff();
	stateFlipped = (!mCurOnState != !on);
	mCurOnState = on;
	irqsRestoreState(irqSta);

	if (!stateFlipped)
		return;
	
	if (on) {
		
		//every turn-on we do this to make sure it starts from start of buffer
		if (!DmaLibStreamSetMemAddr(mDmaStream, 0, (uintptr_t)mDmaMem))
			logw("Audio failed to configure irq SADDR\n");
		
		//start with silence, let audio mixer do its thing for a bit
		cur = mDmaMem;
		end = cur + AUDIO_BUF_SZ;
		while (cur != end)
			*cur++ = AUDIO_MIDDLE_VALUE;
		
		if (!DmaLibStreamSetEnabled(mDmaStream, true))
			logw("Audio failed to enable dma\n");

		if (!DmaLibStreamSetIrqState(mDmaStream, DMA_STRM_IRQ_HALF | DMA_STRM_IRQ_DONE))
			logw("Audio failed to enable dma irqs\n");
		
		audioOutHwGpioCfg(true);
		
		//enable DMA requests
		TIM1->DIER = TIM_DIER_UDE;
		return;
	}
	
	//turning off
	audioOutHwGpioCfg(false);
	
	TIM1->DIER = 0;						//turn off dma reqs
	TIM1->CCR1 = AUDIO_MIDDLE_VALUE;	//set to middle level

	if (!DmaLibStreamSetIrqState(mDmaStream, 0))
		logw("Audio failed to disable dma irqs\n");
	
	//a transfer that was still in flight may have overwritten CCR1, so write the middle level again
	TIM1->CCR1 = AUDIO_MIDDLE_VALUE;
	
	if (!DmaLibStreamSetEnabled(mDmaStream, false))
		logw("Audio failed to disable dma\n");
	
	//and once more now that the stream is disabled, then trigger an update event
	TIM1->CCR1 = AUDIO_MIDDLE_VALUE;
	TIM1->EGR = TIM_EGR_UG;
}

//Init entry point for "simple tone only" audio. This driver does not expect it to be called:
// it logs a warning naming this function and reports failure.
bool audioOnlySimpleOutInit(void)
{
	const bool initialized = false;		//always fails here

	logw("Unexpected call to %s\n", __func__);

	return initialized;
}

//Tone generation for "simple tone only" audio. This driver does not expect it to be called:
// any call is reported through fatal(). Both parameters are ignored.
void audioOnlySimpleTone(uint32_t freq, uint32_t amp)
{
	(void)freq;
	(void)amp;

	fatal("Unexpected call to %s\n", __func__);
}

//We do mono output (stereo is doable, but we do mono).
// We need a fast timer for fast sample rates, so we need one on APB2 (fast) or we must set the TIMPRE bit.
// For self-DMA the timer must be capable of triggering DMA, and for output at least one of its channels must
// connect to an unused, available GPIO. Keep in mind that timers that are up-clockable using the TIMPRE bit still
// have a slower interface clock and thus are less preferable (though by a very very very small amount).
//
//always fast timers: 1, 8, 9, 10, 11
//timers we can clock up using TIMPRE: 2, 3, 4, 5, 6, 7, 12, 13, 14
//
//let's remove all timers that cannot do PWM out
//
//always fast timers: 1, 8, 9, 10, 11
//timers we can clock up using TIMPRE: 2, 3, 4, 5, 12, 13, 14
//
//let's remove all timers that cannot trigger dma
//
//always fast timers: 1, 8
//timers we can clock up using TIMPRE: 2, 3, 4, 5
//
//now we'll annotate the pins and dma channels each can use (~ = inverted)
// keep in mind some of these do not exist on our package (annotated as "*")
//
//	TIMER		OUT PINS							DMA CHs
//	1			~A7,~B0,~B1,~B13, ~E8,E9			2.1,2.2,2.3,2.4,2.6
//				~E10,E11,~E12,E13,E14
//
//	2			A0,A1,A2,A3,A5,A15,B3,B10,B11		1.1,1.5,1.6,1.7
//
//	3			A6,A7,B0,B1,B4,B5,C6,C7,C8,C9		1.2,1.4,1.5,1.7
//
//	4			B6,B7,B8,B9,D12,D13,D14,D15			1.0,1.3,1.7
//
//	5			A0,A1,A2,A3,*H10,*H11,*H12,*I0		1.0,1.1,1.2,1.3,1.4
//
//	8			~A5,~A7,~B0,~B1,~B14,~B15,C6,		2.2,2.3,2.4,2.7
//				C7,C8,C9,*~H13,*~H14,*~H15
//				*I2,*I5,*I6,*I7
//
//We use pins B13(TIM1_CH1N) and A8(TIM1_CH1), thus TIM1 channel 1 and DMA 2 stream 5 (channel 6)



