// https://wiki.neogeodev.org/index.php?title=68k_instructions_timings

/*
	marker data[]
	
	
	data:
		non-marker-byte		-> that byte and the 3 that follow
		marker, 0			-> marker and the 3 that follow
		marker, ofst, len	-> copy

	ofst stored as VLI (variable length integers)
	ofst must be >=1 and is stored raw
	len stored as two bytes. first, the byte offset into the duff device to jump to, second the number of full iters to do
		this super-preprocessed format allows faster recompression
	
	VLI:
		0x00..0x7f -> that value
		0x80..0xff -> (this byte minus 0x80) as the high 7 bits and the next byte as the low 8 bits; represents 0x80 .. 0x7fff



	//todo - maybe use len == 0 to indicate that the screen contents didn't change over the given length
*/

// checkNloop: end-of-token loop test used after every decoded token.
// Compares the source read cursor (a1) with the end-of-source pointer (a2):
// branches to mainloop while a1 is below a2 (unsigned compare), otherwise
// branches to done. Both labels must be defined by the code that expands
// this macro.
.macro checkNloop
	cmpa.l		%a2, %a1					//flags = a1 - a2
	bcs			mainloop					//a1 < a2 (unsigned): more input to decode
	bra			done						//a1 >= a2: input consumed
.endm

// docopy: perform one back-reference copy of 32-bit words.
// Parameter ofstL is the operand size suffix (w or l) used for the offset math.
// On entry:
//   d1 = copy offset, counted in 32-bit words back from the write cursor
//   a0 = destination write cursor
//   a1 = source read cursor, pointing at the two length bytes
// Reads two bytes from the source: the byte offset at which to enter the
// unrolled copy below (into d2) and the dbra loop counter (into d3).
// On exit a0 and a1 have advanced; d1, d2, d3 and a3 are clobbered.
.macro docopy	ofstL		//for when length and offset both under 0xfffc
	moveq		#0, %d2
	moveq		#0, %d3
	move.b		(%a1)+, %d2					//get duff byte offset
	move.b		(%a1)+, %d3					//get duff loop count
	
	//a3 = a0 - 4 * offset : the address to copy from
	lsl.\ofstL	#2, %d1
	neg.\ofstL	%d1
	lea			(%a0, %d1.\ofstL), %a3
	
	//byte offset into the unrolled copy is in d2
	//number of further full passes over the unrolled copy is in d3
	jmp			9f(%pc, %d2.w)				//enter the unrolled copy part way in
	
9:
	.rept 128
		move.l	(%a3)+, (%a0)+
	.endr
	dbra		%d3, 9b						//d3 more full passes of 128 words, then fall through
	
	//we return here

.endm


.globl screenDecompressNew
.func

// screenDecompressNew(dst, src, srcLen)
// Decodes the marker-based stream at src into dst. The first source byte is
// the marker value; every following token is one of:
//   non-marker byte        -> that byte and the next 3 are copied to dst
//   marker, 0              -> the marker byte and the next 3 are copied to dst
//   marker, offset, len    -> back-reference copy done by the docopy macro
// Arguments are read from the stack: 4(sp) = dst, 8(sp) = src, 12(sp) = srcLen.
// Register roles:
//   a0 = dst write cursor              a1 = src read cursor
//   a2 = src + srcLen (end of input)   a3 = copy source used by docopy
//   d0 = marker (low byte)             d1 = back-reference offset
//   d2 = current byte / copy entry     d3 = copy loop count
// d3/a2-a3 are saved and restored here. d2 is modified and not restored
// (NOTE(review): presumably a scratch register in the ABI this is built for - confirm).
// A stream shorter than 2 bytes has no token after the marker and decodes
// to nothing. End of input is tested only after a complete token, so a
// stream that ends in the middle of a token is still read past its end.
screenDecompressNew:

	move.l		4(%sp), %a0					//a0 is dst
	move.l		8(%sp), %a1					//a1 is src
	move.l		12(%sp), %d0				//d0 is srcLen
	
	movem.l		%d3/%a2-%a3, -(%a7)
	lea			(%a1, %d0.l), %a2			//get (src + srcLen) into a2

	//guard: the main loop checks for end of input only after decoding a token,
	//so without this test an empty or marker-only stream would decode one bogus
	//token from memory past the end of src
	moveq		#2, %d1
	cmp.l		%d1, %d0					//flags = srcLen - 2
	bcs			done						//srcLen < 2 (unsigned): nothing to decode

	move.b		(%a1)+, %d0					//d0 is marker
	
mainloop:
	move.b		(%a1)+, %d2					//get a byte
	cmp.b		%d2, %d0					//marker?
	beq.s		gotmarker
	
emitword:
	//literal: the byte in d2 plus the 3 source bytes that follow it
	move.b		%d2, (%a0)+
	move.b		(%a1)+, (%a0)+
	move.b		(%a1)+, (%a0)+
	move.b		(%a1)+, (%a0)+
	checkNloop

gotmarker:
	moveq		#0, %d1
	move.b		(%a1)+, %d1					//get next byte (first byte of offset)
	beq.s		emitword					//zero means emit marker and 3 bytes (d2 still holds the marker)

isrepeat:
	bpl			ofst_is_one_byte			//flags still from the move.b: top bit clear means a one-byte offset

ofst_is_two_bytes:
	andi.b		#0x7f, %d1					//drop the flag bit, leaving the high 7 bits of the offset
	lsl.w		#8, %d1
	move.b		(%a1)+, %d1					//get next byte (second/last byte of offset)
	docopy		l
	checkNloop

ofst_is_one_byte:
	docopy		w
	checkNloop
	
	nop										//never reached: the checkNloop above always branches

done:

	movem.l   (%a7)+, %d3/%a2-%a3
	rts

.endfunc

// Build-time switch used by the msioTpc* routines below: when defined they set
// the low three bits (0x07) of the byte at offset 3 of the chip register block
// before issuing a command; when not defined they clear those bits instead.
// NOTE(review): going by the name, these bits presumably enable the timeout
// logic of the controller - confirm against the chip documentation.
#define _USE_HW_TIMEOUT_KEEPING


.globl msioTpcW_20MHz
.func

// recv_sled: 256 unrolled word copies from (a0) to (a1)+, followed by rts.
// msioTpcR_20MHz enters this run part way in, so that only the number of
// words it needs are copied before the rts.
recv_sled:
	.rept 256
		move.w  (%a0), (%a1)+
	.endr
	rts



// msioTpcW_20MHz: issue a command and write its data words to the chip.
// Stack arguments: 4(sp) = chip register base, 8(sp) = tpc (byte),
// 10(sp) = nBytes (word), 12(sp) = data to send, 16(sp) = status output byte.
// The command word written to offset 0 of the chip is (tpc << 12) + nBytes
// (low 16 bits). Data words go to offset 4. (nBytes - 1) / 2 + 1 words are
// sent, i.e. nBytes rounded up to a whole word.
// Transfers of up to 4 words queue all data first and then issue the command;
// longer transfers queue 4 words, issue the command, then stream the rest.
// Ends by branching to sta_wait20 (defined in msioTpcR_20MHz), which produces
// the return value in d0 and performs the rts.
// Modifies d0-d2/a0-a1 without saving them.
// NOTE(review): nBytes == 0 wraps the word count and takes the long path with
// 0x8000 words - callers presumably never pass 0; confirm.
msioTpcW_20MHz:		//UInt8 (struct Mb86189 *chip, UInt8 tpc, Int16 nBytes, const UInt16 *valOutP, UInt8 *staOutP)
	movea.l 4(%sp), %a0
	
	//somehow sometimes this bit gets lost
	ori.b   #0x20, 3(%a0)
	
	moveq   #0, %d0
	move.b  8(%sp), %d0			//d0 = tpc
	move.w  10(%sp), %d1		//d1 = nBytes
	swap    %d0					//d0 = tpc << 16
	#ifdef _USE_HW_TIMEOUT_KEEPING
		ori.b   #0x07, 3(%a0)
	#else
		andi.b  #~0x07, 3(%a0)
	#endif
	lsr.l   #4, %d0				//d0 = tpc << 12
	add.w   %d1, %d0			//low word of d0 = command word
	subi.w  #1, %d1
	lsr.w   #1, %d1				//d1 = number of words to send, minus one
	movea.l 12(%sp), %a1
	moveq   #3, %d2
	cmp.w   %d2, %d1
	bls.s   short_write20		//4 words or fewer: take the short path

long_write20:
	//queue the first 4 words, then issue the command
	move.w  (%a1)+, 4(%a0)
	move.w  (%a1)+, 4(%a0)
	move.w  (%a1)+, 4(%a0)
	move.w  (%a1)+, 4(%a0)
	move.w  %d0, 0(%a0)
	subq    #4, %d1				//4 words already sent
	addq.l  #4, %a0				//a0 now points at the data register (offset 4)

long_write_loop20:
//on a 20MHz cpu, testing for (chipBase->MSCSL & 0x01) is not needed - we are too slow for that
//in fact this loop is too slow for a 20mhz device with a 20mhz MS clock. we'd need to cycle every 16 cy to keep up
//and the move itself takes 12 cy
//on 33mhz devices, instead, memory stick runs at 16mhz and we need to delay more
	move.w  (%a1)+, (%a0)
	dbra    %d1, long_write_loop20
	lea     -4(%a0), %a0		//restore a0 to the chip register base
	bra.s   sta_wait20
		
short_write20:
	//queue all d1 + 1 words, then issue the command
	move.w  (%a1)+, 4(%a0)
	dbra    %d1, short_write20
	move.w  %d0, 0(%a0)
	bra.s   sta_wait20
	

.endfunc


.globl msioTpcR_20MHz
.func
// msioTpcR_20MHz: issue a command and read its data words from the chip.
// Stack arguments: 4(sp) = chip register base, 8(sp) = tpc (byte),
// 10(sp) = nBytes (word), 12(sp) = receive buffer, 16(sp) = status output byte.
// The command word written to offset 0 of the chip is (tpc << 12) + nBytes
// (low 16 bits). Data words are read from offset 4 by entering recv_sled
// part way in, so that nBytes rounded up to a whole word is transferred.
// Returns in d0: 0 when bit 7 of the byte at offset 2 was seen set, otherwise
// (1 | byte at offset 6) after the poll count runs out. In both cases the byte
// at offset 6 (with bit 0 forced to 1 on timeout) is stored through 16(sp).
// Modifies d0-d2/a0-a1 without saving them.
// NOTE(review): recv_sled holds 256 word moves, so nBytes above 512 would make
// the computed entry address land outside it - callers presumably never ask
// for more; confirm.
msioTpcR_20MHz:		//UInt8 (struct Mb86189 *chip, UInt8 tpc, Int16 nBytes, UInt16 *valOutP, UInt8 *staOutP)

	movea.l 4(%sp), %a0
	
	//somehow sometimes this bit gets lost
	ori.b   #0x20, 3(%a0)
	
	moveq   #0, %d0
	move.b  8(%sp), %d0			//d0 = tpc
	move.w  10(%sp), %d1		//d1 = nBytes
	swap    %d0					//d0 = tpc << 16
	#ifdef _USE_HW_TIMEOUT_KEEPING
		ori.b   #0x07, 3(%a0)
	#else
		andi.b  #~0x07, 3(%a0)
	#endif
	lsr.l   #4, %d0				//d0 = tpc << 12
	add.w   %d1, %d0			//low word of d0 = command word
	move.w  %d0, 0(%a0)			//issue the command
	moveq   #127, %d0			//poll budget for wait_drq20

	movea.l 12(%sp), %a1
	addi.w  #1, %d1
	andi.b  #0xfe, %d1			//d1 = nBytes rounded up to an even byte count
	move.l	#512, %d2
	sub.w	%d1, %d2			//d2 = byte offset into recv_sled that leaves d1 / 2 moves to run
	pea		recv_done(%pc)		//where the rts at the end of recv_sled will return to
	pea		recv_sled(%pc)
	add.l	%d2, (%sp)			//top of stack = entry point inside recv_sled
	
wait_drq20:
	//poll bit 3 of the byte at offset 2, at most 128 times; stop early once it reads 0
	btst    #3, 2(%a0)
	dbeq    %d0, wait_drq20

	addq	#4, %a0				//a0 now points at the data register (offset 4)
	rts		//call recv
recv_done:
	lea		-4(%a0), %a0		//restore a0 to the chip register base

sta_wait20:
	//shared tail, also reached from msioTpcW_20MHz; sp is back at its entry value here
	movea.l 16(%sp), %a1
	move.w  #0x0800, %d0		//poll budget for sta_loop20

sta_loop20:
	btst    #7, 2(%a0)
	bne.s   sta_ready20			//bit 7 set: done
	dbra    %d0, sta_loop20

sta_timeout20:
	//poll budget used up: return nonzero and store the same value as status
	moveq   #1, %d0
	or.b    6(%a0), %d0
	move.b  %d0, (%a1)
	rts

sta_ready20:
	move.b  6(%a0), (%a1)
	moveq   #0, %d0
	rts

.endfunc



.globl msioTpcW_33MHz
.func
// msioTpcW_33MHz: issue a command and write its data words to the chip.
// Same sequence as msioTpcW_20MHz, except that the streaming loop is padded
// with four nops per word.
// Stack arguments: 4(sp) = chip register base, 8(sp) = tpc (byte),
// 10(sp) = nBytes (word), 12(sp) = data to send, 16(sp) = status output byte.
// The command word written to offset 0 of the chip is (tpc << 12) + nBytes
// (low 16 bits). Data words go to offset 4. (nBytes - 1) / 2 + 1 words are sent.
// Ends by branching to sta_wait33 (defined in msioTpcR_33MHz), which produces
// the return value in d0 and performs the rts.
// Modifies d0-d2/a0-a1 without saving them.
// NOTE(review): nBytes == 0 wraps the word count and takes the long path with
// 0x8000 words - callers presumably never pass 0; confirm.
msioTpcW_33MHz:		//UInt8 (struct Mb86189 *chip, UInt8 tpc, Int16 nBytes, const UInt16 *valOutP, UInt8 *staOutP)
	movea.l 4(%sp), %a0
	
	//somehow sometimes this bit gets lost
	ori.b   #0x20, 3(%a0)
	
	moveq   #0, %d0
	move.b  8(%sp), %d0			//d0 = tpc
	move.w  10(%sp), %d1		//d1 = nBytes
	swap    %d0					//d0 = tpc << 16
	lsr.l   #4, %d0				//d0 = tpc << 12
	add.w   %d1, %d0			//low word of d0 = command word
	#ifdef _USE_HW_TIMEOUT_KEEPING
		ori.b   #0x07, 3(%a0)
	#else
		andi.b  #~0x07, 3(%a0)
	#endif
	subi.w  #1, %d1
	lsr.w   #1, %d1				//d1 = number of words to send, minus one
	movea.l 12(%sp), %a1
	moveq   #3, %d2
	cmp.w   %d2, %d1
	bls.s   short_write33		//4 words or fewer: take the short path

long_write33:
	//queue the first 4 words, then issue the command
	move.w  (%a1)+, 4(%a0)
	move.w  (%a1)+, 4(%a0)
	move.w  (%a1)+, 4(%a0)
	move.w  (%a1)+, 4(%a0)
	move.w  %d0, 0(%a0)
	subq    #4, %d1				//4 words already sent
	addq.l  #4, %a0				//a0 now points at the data register (offset 4)

long_write_loop33:
	//NOTE(review): the nops are presumably delay padding so the writes do not outrun the chip - confirm
	nop
	nop
	nop
	nop
	move.w  (%a1)+, (%a0)
	dbra    %d1, long_write_loop33
	lea     -4(%a0), %a0		//restore a0 to the chip register base
	bra.s   sta_wait33
		
short_write33:
	//queue all d1 + 1 words, then issue the command
	move.w  (%a1)+, 4(%a0)
	dbra    %d1, short_write33
	move.w  %d0, 0(%a0)
	bra.s   sta_wait33
	

.endfunc

.globl msioTpcR_33MHz
.func
// msioTpcR_33MHz: issue a command and read its data words from the chip.
// Stack arguments: 4(sp) = chip register base, 8(sp) = tpc (byte),
// 10(sp) = nBytes (word), 12(sp) = receive buffer, 16(sp) = status output byte.
// The command word written to offset 0 of the chip is (tpc << 12) + nBytes
// (low 16 bits). (nBytes - 1) / 2 + 1 words are then read from offset 4 in a
// loop padded with five nops per word.
// Returns in d0: 0 when bit 7 of the byte at offset 2 was seen set, otherwise
// (1 | byte at offset 6) after the poll count runs out. In both cases the byte
// at offset 6 (with bit 0 forced to 1 on timeout) is stored through 16(sp).
// Modifies d0-d1/a0-a1 without saving them.
// NOTE(review): nBytes == 0 wraps the word count to 0x8000 words - callers
// presumably never pass 0; confirm.
msioTpcR_33MHz:		//UInt8 (struct Mb86189 *chip, UInt8 tpc, Int16 nBytes, UInt16 *valOutP, UInt8 *staOutP)

	movea.l 4(%sp), %a0
	
	//somehow sometimes this bit gets lost
	ori.b   #0x20, 3(%a0)
	
	moveq   #0, %d0
	move.b  8(%sp), %d0			//d0 = tpc
	move.w  10(%sp), %d1		//d1 = nBytes
	swap    %d0					//d0 = tpc << 16
	#ifdef _USE_HW_TIMEOUT_KEEPING
		ori.b   #0x07, 3(%a0)
	#else
		andi.b  #~0x07, 3(%a0)
	#endif
	lsr.l   #4, %d0				//d0 = tpc << 12
	add.w   %d1, %d0			//low word of d0 = command word
	move.w  %d0, 0(%a0)			//issue the command
	moveq   #127, %d0			//poll budget for wait_drq33

wait_drq33:
	//poll bit 3 of the byte at offset 2, at most 128 times; stop early once it reads 0
	btst    #3, 2(%a0)
	dbeq    %d0, wait_drq33

do_rx33:
	movea.l 12(%sp), %a1
	subi.w  #1, %d1
	lsr.w   #1, %d1				//d1 = number of words to read, minus one
	addq    #4, %a0				//a0 now points at the data register (offset 4)

rx_loop33:
	//NOTE(review): the nops are presumably delay padding so the reads do not outrun the chip - confirm
	nop
	nop
	nop
	nop
	nop
	move.w  (%a0), (%a1)+
	dbra    %d1, rx_loop33
	lea     -4(%a0), %a0		//restore a0 to the chip register base

sta_wait33:
	//shared tail, also reached from msioTpcW_33MHz
	movea.l 16(%sp), %a1
	move.w  #0x0800, %d0		//poll budget for sta_loop33

sta_loop33:
	btst    #7, 2(%a0)
	bne.s   sta_ready33			//bit 7 set: done
	dbra    %d0, sta_loop33

sta_timeout33:
	//poll budget used up: return nonzero and store the same value as status
	moveq   #1, %d0
	or.b    6(%a0), %d0
	move.b  %d0, (%a1)
	rts

sta_ready33:
	move.b  6(%a0), (%a1)
	moveq   #0, %d0
	rts

.endfunc






