#include "screenCompress.h"



//number of hash buckets. a 32-bit source word is hashed as (word % SCREEN_HASH_PRIME_FACTOR)
#define SCREEN_HASH_PRIME_FACTOR		769
//number of candidate pointers kept per bucket. they are overwritten round-robin
#define SCREEN_HASH_WAYS				3

//work memory for the compressor. the two members are never needed at the same time, so they share storage:
// hist is used first to count byte values while picking the marker byte,
// mTab is then zeroed and used as the match hash table
union CompressData {
	const uint32_t *mTab[SCREEN_HASH_PRIME_FACTOR][SCREEN_HASH_WAYS];	//[bucket][way] -> pointer to an earlier source word, NULL if the slot is empty
	uint32_t hist[256];													//occurrence count for each possible byte value
};

//zero-fill nWords consecutive 32-bit words starting at dst, using word stores
//the loop stores first and only then decrements and tests the counter, so passing
// nWords == 0 would make the counter wrap around instead of doing nothing
static inline void __attribute__((section(".ramcode"))) screenCompressPrvLocalMemSetWordAligned(void *dst, uint32_t nWords)	//do not call with zero size
{
	asm volatile(
		"1:					\n\t"
		"	stmia %0!, {%2}	\n\t"	//store the zero word at dst, advance dst past it
		"	subs  %1, #1	\n\t"	//one fewer word to go, sets flags
		"	bne   1b		\n\t"	//loop until the counter reaches zero
		:"+l"(dst), "+l"(nWords)	//both are modified by the loop, hence read-write operands
		:"l"(0)						//the value being stored
		:"memory","cc"				//writes memory behind the compiler's back and changes the flags
	);
}

//write one literal byte to the output, escaping it if it collides with the marker
// any value other than the marker is written as-is (1 byte)
// the marker value itself is written as the pair {marker, 0} (2 bytes)
//returns the output pointer advanced past whatever was written
static inline uint8_t* __attribute__((section(".ramcode"))) screenCompressPrvCompressEmitEscapedByte(uint8_t *dst, uint8_t val, uint8_t marker)
{
	uint8_t *out = dst;
	
	//the value byte always goes out first. when it equals the marker this IS the marker byte
	*out++ = val;
	
	//a zero right after the marker flags it as a literal
	if (val == marker)
		*out++ = 0;
	
	return out;
}

//write val as a variable-length integer of one or two bytes
// val < 0x80:  a single byte holding val
// otherwise:   first byte is 0x80 + (val >> 8), second byte is the low 8 bits of val
//the two-byte form only has room for 15 bits. larger values are not checked for here
//returns the output pointer advanced past whatever was written
static inline uint8_t* __attribute__((section(".ramcode"))) screenCompressPrvCompressEmitVLI(uint8_t *dst, uint32_t val)
{
	uint8_t *out = dst;
	
	//long form gets a leading byte with the top bit set plus the high bits of the value
	if (val >= 0x80)
		*out++ = (val >> 8) + 0x80;
	
	//low 8 bits close out both forms
	*out++ = val;
	
	return out;
}

//remember ptr as a match candidate for the word it points to
//the bucket is picked by (*ptr % SCREEN_HASH_PRIME_FACTOR) and the way inside the bucket by replacementIdx
//returns the way index to use for the next insertion (cycles 0 .. SCREEN_HASH_WAYS - 1)
static inline uint_fast8_t __attribute__((section(".ramcode"))) screenCompressPrvCompressAddToHash(union CompressData *cd, const uint32_t *ptr, uint_fast8_t replacementIdx)
{
	uint_fast8_t nextIdx = replacementIdx + 1;
	
	cd->mTab[*ptr % SCREEN_HASH_PRIME_FACTOR][replacementIdx] = ptr;
	
	//wrap around once we run off the last way
	return (nextIdx == SCREEN_HASH_WAYS) ? 0 : nextIdx;
}
/*
uint32_t __attribute__((naked, section(".ramcode"))) screenCompress(uint8_t *dst, const uint32_t *src, uint32_t srcLen, bool byteswap)	//src len must be multiple of 4, buffers must be 4 byte aligned
{
	static union CompressData mData;
	
	asm volatile(
		"	lsrs	r2, #2				\n\t"
		"	push	{r4-r10, lr}		\n\t"
		"	ldr		r4, =%0				\n\t"
		"	mov		r12, r3				\n\t"
		
		//zero hist
		"	movs    r5, #0				\n\t"
		"	movs    r6, #0				\n\t"
		"	movs    r7, #128			\n\t"
		"1:								\n\t"
		"	subs    r7, #1				\n\t"
		"	stmia	r4!, {r5, r6}		\n\t"
		"	bne		1b					\n\t"
		"	sub		r4, #1024			\n\t"
		
		//collect hist
		"	mov		r5, r1				\n\t"
		"	mov		r6, r2				\n\t"
		"1:								\n\t"
		"	ldrb	r7, [r5], #1		\n\t"
		"	lsls	r7, #2				\n\t"
		"	ldr		r3, [r4, r7]		\n\t"
		"	adds	r3, #1				\n\t"
		"	subs	r6, #1				\n\t"
		"	str		r3, [r4, r7]		\n\t"
		"	bne		1b					\n\t"
		
		//find marker -> r5
		"	movs	r5, #0				\n\t"
		"	ldmia	r4!, {r6}			\n\t"
		"	movs	r7, #255			\n\t"
		"1:								\n\t"
		"	ldmia	r4!, {r3}			\n\t"
		"	cmp		r3, r6				\n\t"
		"	itt		le					\n\t"
		"	rsble	r5, r7, #256		\n\t"
		"	movle	r6, r3				\n\t"
		"	subs	r7, #1				\n\t"
		"	bne		1b					\n\t"
		"	sub		r4, #1024			\n\t"
		
		//erase hash table
		"	movs    r6, #0				\n\t"
		"	movs    r7, #0				\n\t"
		"	mov		r3, %3 / 2			\n\t"
		"1:								\n\t"
		"	subs	r3, #1				\n\t"
		"	stmia	r4!, {r6,r7}		\n\t"
		"	bne		1b					\n\t"
		
		//compress
		"1:								\n\t"
		
		// get value and hash it, get pointer to first hash entry
		"	ldr		r4, =%0				\n\t"
		"	ldmia	r1!, {r3}			\n\t"
		"	mov		r6, %1				\n\t"
		"	udiv	r6, r3, r6			\n\t"
		"	add		r4, r4, r6, lsl #2	\n\t"
		
		//check all hash chains
		
		"	mov		r8, #0				\n\t"	//bestLen
		"	mov		r9, #0				\n\t"	//bestOfst
		
		"	movs	r6, %2				\n\t"
		"2:								\n\t"
		"	ldr		r5, [r4]			\n\t"
		"	cbz		r5, 3f				\n\t"
		
		//chain nonempty - check first word
		"	ldmia	r5!, {r7}			\n\t"
		"	cmp		r5, r3				\n\t"
		"	bne		3f					\n\t"
		
		//first word matches - check max length. honor input length limit too
		"	subs	r10, r2, #1			\n\t"
		"	beq		5f					\n\t"
		"4:								\n\t"
		"	ldmia	r5!, {r7}			\n\t"
		"	ldmia	r1!, {r3}			\n\t"
		"	cmp		r3, r7				\n\t"
		"	ittt	ne					\n\t"		//at end, we want pointers just past last thing that DID match
		"	subne	r5, #4				\n\t"
		"	subne	r1, #4				\n\t"
		"	bne		5f					\n\t"
		"	subs	r10, #1				\n\t"
		"	bne		4b					\n\t"
		
		//r1 now points just past match (in src)
		//r5 now points just past match (in match)
		"5:								\n\t"
		"	ldr		r3, [r4]			\n\t"
		"	subs	r3,	r5, r3			\n\t"	//matchLen
		"	subs	r7, r1, r5			\n\t"	//matchOfst
		"	cmp		r3, r8				\n\t"
		"	it		eq					\n\t"
		"	cmpeq	r9, r7				\n\t"
		"	itt		cs					\n\t"
		"	movcs	r8, r3				\n\t"
		"	movcs	r9, r7				\n\t"
		
		"3:								\n\t"
		"	subs	r6, #2				\n\t"
		"	add		r4, %1 * 4			\n\t"
		"	bne		2b					\n\t"
	
	
		"	pop		{r4-r10, pc}		\n\t"
		:
		:"i"(&mData), "i"(SCREEN_HASH_PRIME_FACTOR), "i"(SCREEN_HASH_WAYS), "i"(SCREEN_HASH_PRIME_FACTOR * SCREEN_HASH_WAYS)
		:"memory", "cc", "r0", "r1", "r2", "r3", "r12"
	);
}
*/


//compress srcLen bytes of word data at src into dst. returns the number of bytes written to dst
//
//stream layout produced here:
// byte 0 is the marker: the least frequent byte value in the input (lowest value wins a tie)
// after that, one record per step:
//  literal word: the 4 bytes of the word, low byte first (after the optional byte swap).
//                only the first byte is escaped: if it equals the marker, a 0 byte follows it
//  match:        marker, offset, 2 length bytes
//                offset is 1..0x7fff words back from the current position, written as 1 or 2 bytes.
//                its first byte is never 0, which tells a match apart from an escaped literal
//                length is 2..32768 words
//
//byteswap: when set, the bytes inside each 16-bit half of a literal word are swapped (rev16) before output.
// matching is always done on the unswapped input
//
//dst is NOT bounds checked. worst case is 5 bytes per input word plus the leading marker byte
//the work area is a function-local static, so this function is not reentrant
uint32_t __attribute__((noinline, section(".ramcode"))) screenCompress(uint8_t *dst, const uint32_t *src, uint32_t srcLen, bool byteswap)	//src len must be multiple of 4, buffers must be 4 byte aligned
{
	const uint32_t *srcEnd = src + srcLen / sizeof(uint32_t);
	uint_fast8_t j, marker, replacementIdx = 0;
	const uint8_t *srcB = (const uint8_t*)src;
	static union CompressData mData;
	uint8_t *dstStart = dst;
	uint32_t i, bestVal;
	
	//pick a marker and emit it
	//the rarest byte value is used so that as few literals as possible need escaping
	screenCompressPrvLocalMemSetWordAligned(mData.hist, sizeof(mData.hist) / sizeof(uint32_t));
	for (i = 0; i < srcLen; i++)
		mData.hist[*srcB++]++;
	for (marker = 0, bestVal = mData.hist[marker], i = 1; i < 256; i++) {
		
		if (bestVal > mData.hist[i]) {
			bestVal = mData.hist[i];
			marker = i;
		}
	}
	*dst++ = marker;
		
	//compress
	//hist and mTab share storage, the histogram is dead from here on
	screenCompressPrvLocalMemSetWordAligned(mData.mTab, sizeof(mData.mTab) / sizeof(uint32_t));
	
	while (src < srcEnd) {
		
		uint32_t bestLen = 0, bestOst = 0, hashIdx = *src % SCREEN_HASH_PRIME_FACTOR;
		
		//search for candidates
		for (j = 0; j < SCREEN_HASH_WAYS; j++) {
			
			const uint32_t *cand = mData.mTab[hashIdx][j], *us = src;
			uint32_t ofst, thisLen;
			
			//skip empty slots and candidates too far back for the offset encoding
			if (!cand || (ofst = us - cand) >= 0x8000)
				continue;
			
			//cand always trails us, so it may run into the words being matched. it never passes srcEnd
			while (us < srcEnd && *cand++ == *us)
				us++;
			thisLen = us - src;
			
			//longest match wins. among equally long ones the nearest wins, since it may fit a 1-byte offset
			if (thisLen > bestLen || (thisLen == bestLen && ofst < bestOst)) {
				
				bestLen = thisLen;
				bestOst = ofst;
			}
		}
		
		//bestLen and bestOst are in units of u32
		
		//every consumed word gets entered into the hash exactly once, and only after the search above
		//so a word can never match itself. entering the first word of a match twice would point two of the
		//SCREEN_HASH_WAYS slots of its bucket at the same position and throw away an older candidate for nothing
		
		if (bestLen < 2) {	//no joy with a match - emit bytes
			
			uint32_t val = *src;
			
			replacementIdx = screenCompressPrvCompressAddToHash(&mData, src, replacementIdx);
			src++;
			
			if (byteswap)
				asm("rev16 %0, %1":"=r"(val):"r"(val));
			
			//only the first byte of a literal can be mistaken for the marker, so only it is escaped
			dst = screenCompressPrvCompressEmitEscapedByte(dst, val, marker);
			*dst++ = val >> 8;
			*dst++ = val >> 16;
			*dst++ = val >> 24;
		}
		else {				//we have a match - emit it
					
			*dst++ = marker;
			dst = screenCompressPrvCompressEmitVLI(dst, bestOst);
			
			//we emit a byte for the partial-round word copies (as an instruction offset) and a byte for the total copy rounds (of 128 words each)
			//number of partials is 1..128
			//this means that max words copied is 255 * 128 + 128 = 32768
			
			if (bestLen >= 32768)
				bestLen = 32768;
			
			if (bestLen % 128) {	//partial used
				*dst++ = 2 * (128 - bestLen % 128);
				*dst++ = bestLen / 128;
			}
			else {				//full used, so remove one round
				*dst++ = 0;
				*dst++ = bestLen / 128 - 1;
			}
			
			//enter every word covered by the match, including the first one
			for (i = 0; i < bestLen; i++, src++)
				replacementIdx = screenCompressPrvCompressAddToHash(&mData, src, replacementIdx);
		}
	}
	
	return dst - dstStart;
}

