#include "halDisplay.h"
#include "zodiac.h"
#include "printf.h"
#include <string.h>
#include "boot.h"
#include <Rect.h>
#include "kal.h"

//fake video memory accounting: these numbers are only ever reported to clients (impl_TwGfxGetInfo, impl_TwGfxGetMemoryUsage)
#define TW_GFX_VIDEO_RAM_SZ					0x800000	//reported as such, unused otherwise
#define TW_GFX_VIDEO_RAM_USED_PERCENT		10			//what percent we claim is used

//largest surface dimensions impl_TwGfxAllocSurface will accept
#define MAX_SURFACE_W						8191		//as per docs
#define MAX_SURFACE_H						8191

//error codes returned by the impl_TwGfx* entry points
#define TW_GFX_ERROR_NULL_PTR				0x7101
#define TW_GFX_ERROR_BAD_VERSION			0x7102
#define TW_GFX_ERROR_INVALID_HANDLE			0x710a
#define TW_GFX_ERROR_INVALID_PIX_FMT		0x710b
#define TW_GFX_ERROR_INVALID_LOCATION		0x710c
#define TW_GFX_ERROR_INVALID_SIZE			0x710d
#define TW_GFX_ERROR_INVALID_COUNT			0x7110
#define TW_GFX_ERROR_INVALID_COORDS			0x7112
#define TW_GFX_ERROR_SFC_NOT_LOCKED			0x7114
#define TW_GFX_ERROR_SFC_IS_BUSY			0x7115

//values kept in the "magix" field of live clients/surfaces; checked by twGfxPrvVerifyHandle/twGfxPrvVerifySurface and zeroed on destruction
#define TW_GFX_CLIENT_MAGIX					0xff07295a	//one more than theirs
#define TW_GFX_SURFACE_MAGIX				0xff051963	//one more than theirs

//the only surface locations accepted by impl_TwGfxAllocSurface and impl_TwGfxGetMemoryUsage
#define TW_GFX_LOCATION_VRAM_AND_DRAM		0x0001
#define TW_GFX_LOCATION_VRAM				0x0101

//bits for the "flags" parameter of the read/write surface calls (those calls ignore flags in the code visible here)
#define TW_GFX_FLAG_SYNC2					0x01		//SYNC is just a lack of SYNC2 flag
#define TW_GFX_FLAG_ASYNC					0x80

//pixel format identifiers. Surfaces are always RGB565_LE; impl_TwGfxMaskBlendBlt requires a 4BPP mask
#define TW_GFX_PIXEL_FMT_1BPP				0
#define TW_GFX_PIXEL_FMT_RGB565_LE			1
#define TW_GFX_PIXEL_FMT_RGB565_BE			2
#define TW_GFX_PIXEL_FMT_2BPP				3
#define TW_GFX_PIXEL_FMT_4BPP				4
#define TW_GFX_PIXEL_FMT_8BPP				5

//signature of a routine that blits a bitmap into 16-bit destination pixels.
//NOTE(review): no user of this type is visible in this part of the file; going by the parameter names it
//presumably takes a source with a byte stride, a sub-byte start offset and a colour lookup table - confirm at the use site
typedef void (*BmpBlitF)(uint16_t *dstP, uint32_t dstPixelStride, const void *srcP, uint32_t srcByteStride, uint32_t w, uint32_t h, uint32_t srcSubByteOfst, const uint16_t *clut);
//forward declaration: struct TwSurface points back at its owning client
struct TwGfxClient;

//display/VRAM description handed to clients by impl_TwGfxGetInfo. layout is as per docs!
struct TwGfxInfoType {
	int32_t size;				//caller sets this; must == sizeof(struct TwGfxInfoType)
	uint32_t dispW;				//filled from HALDisplayGetAttributes(hwrDispHorizontal)
	uint32_t dispH;				//filled from HALDisplayGetAttributes(hwrDispVertical)
	uint32_t dispStride;		//filled from HALDisplayGetAttributes(hwrDispStride)
	int32_t dispPixelFormat;	//a TW_GFX_PIXEL_FMT_* value
	int32_t freeGfxMem;			//claimed free video memory
	int32_t totalGfxMem;		//claimed total video memory
};

//surface description used by impl_TwGfxAllocSurface (input) and impl_TwGfxGetSurfaceInfo (output). layout is as per docs!
struct TwGfxSurfaceInfoType {
	int32_t size;			//caller sets this; must == sizeof(struct TwGfxSurfaceInfoType)
	int32_t width;
	int32_t height;
	int32_t stride;			//written back by impl_TwGfxAllocSurface
	int32_t location;		//a TW_GFX_LOCATION_* value
	int32_t pixelFormat;	//a TW_GFX_PIXEL_FMT_* value
};

//rectangle given as origin + extent. layout is as per docs!
struct TwGfxRectType {
	int32_t x;
	int32_t y;
	int32_t w;
	int32_t h;
};

//horizontal run of pixels: starts at (x, y) and is w pixels wide. layout is as per docs
struct TwGfxSpanType {
	int32_t x;
	int32_t y;
	int32_t w;
};

//single pixel coordinate. layout is as per docs
struct TwGfxPointType {
	int32_t x;
	int32_t y;
};

//client-supplied bitmap (used as the 4bpp mask in impl_TwGfxMaskBlendBlt). layout is as per docs
struct TwGfxBitmapType {
	int32_t size;			//caller sets this; must == sizeof(struct TwGfxBitmapType)
	int32_t width;
	int32_t height;
	int32_t stride;			//used as a byte stride by impl_TwGfxMaskBlendBlt
	int32_t pixelFormat;	//a TW_GFX_PIXEL_FMT_* value
	void *data;				//pixel data
	uint16_t *pal;			//RGB565LE already
};

//our private representation of a surface handle
struct TwSurface {
	
	uint32_t magix;				//TW_GFX_SURFACE_MAGIX while the surface is valid
	struct TwSurface *next;		//list linkage (per-client surface list, or mDispSurfaces for palm display surfaces)
	struct TwSurface *prev;
	struct TwGfxClient *client;	//owner
	
	//cached geometry
	uint32_t w;					//width
	uint32_t h;					//height
	uint32_t s;					//stride (used as a byte count throughout this file)
	void *bits;					//pixel data
	
	struct TwGfxRectType clip;	//as provided by user. might be beyond the surface. must be intersected with surface boundaries before being used
	
	uint16_t lockCt;			//number of outstanding impl_TwGfxLockSurface calls
	bool dirty;					//only valid if locked
	bool haveDramBacking;		//true if allocated as TW_GFX_LOCATION_VRAM_AND_DRAM
	
	//backing objects for allocated surfaces (display surfaces leave these NULL)
	struct BitmapType *bmp;
	WinHandle win;
};

//our private representation of a client handle (what impl_TwGfxOpen hands out)
struct TwGfxClient {
	uint32_t magix;					//TW_GFX_CLIENT_MAGIX while the client is valid
	struct TwGfxClient *next;		//linkage in the mClients list
	struct TwGfxClient *prev;
	struct TwSurface *surfaces;		//head of the list of surfaces allocated by this client
	struct TwSurface *palmDisp;		//NULL until impl_TwGfxGetPalmDisplaySurface is called, then points at palmDispStruct
	struct TwSurface *disp;			//NULL until impl_TwGfxGetDisplaySurface is called, then points at dispStruct
	
	//not inited till needed
	struct TwSurface palmDispStruct;	//each client's copy is linked into the mDispSurfaces list so we can update them all
	struct TwSurface dispStruct;
};


//user display area, as last set by impl_TwSetUserDispAreaBounds. starts out as 480x320 at the origin
static struct RectangleType mDispRect = {.topLeft = {.x = 0, .y = 0,}, .extent = {.x = 480, .y = 320,}, };
//head of the list of all open clients
static struct TwGfxClient *mClients;
//head of the list of every client's palm display surface.
//NOTE(review): unlike its neighbours this is not static - confirm whether another file references it
struct TwSurface *mDispSurfaces;
static bool mLandscape = true;		//hardware (us) is expected to do rotation, not software
//KAL mutex handle created in zodTwGfxInit; taken around changes to the client and surface lists
static uint32_t mTwGfxLock;

//convert a packed 24-bit colour (8 bits per channel, top byte ignored) into RGB565 by keeping the
//top 5/6/5 bits of each channel. The channel positions assume the compiler hands out bitfields
//starting at the least significant bit
static uint16_t twGfxPrvPackedColorToRGB565(uint32_t color)
{
	//bitfields rather than shifts and masks: this mess produces the nice set of UBFX/BFI instrs we want
	
	union {
		struct {		//carefully tuned to do what we want
			uint32_t blueDropped	:3;
			uint32_t blue			:5;
			uint32_t greenDropped	:2;
			uint32_t green			:6;
			uint32_t redDropped		:3;
			uint32_t red			:5;
			uint32_t ignored		:8;
		};
		uint32_t raw;
	} packed;
	
	union {
		struct {
			uint16_t blue			: 5;
			uint16_t green			: 6;
			uint16_t red			: 5;
		};
		uint16_t raw;
	} rgb565;
	
	packed.raw = color;
	
	//the three fields cover all 16 bits, so every bit of the result is defined
	rgb565.raw = 0;
	rgb565.blue = packed.blue;
	rgb565.green = packed.green;
	rgb565.red = packed.red;
	
	return rgb565.raw;
}

//store the intersection of rects a and b into rectOutP. If they do not overlap, the result has
//zero width and/or height. safe for a or b or both to be same as rectOutP
static void twGfxPrvIntersectRects(struct TwGfxRectType *rectOutP, const struct TwGfxRectType *a, const struct TwGfxRectType *b)
{
	//read everything we need before writing anything, since the output may alias an input
	const int32_t aLeft = a->x, aTop = a->y, aRight = a->x + a->w, aBottom = a->y + a->h;
	const int32_t bLeft = b->x, bTop = b->y, bRight = b->x + b->w, bBottom = b->y + b->h;
	
	//intersection edges: innermost of each pair
	const int32_t left = (aLeft > bLeft) ? aLeft : bLeft;
	const int32_t top = (aTop > bTop) ? aTop : bTop;
	const int32_t right = (aRight < bRight) ? aRight : bRight;
	const int32_t bottom = (aBottom < bBottom) ? aBottom : bBottom;
	
	rectOutP->x = left;
	rectOutP->y = top;
	rectOutP->w = (right > left) ? right - left : 0;
	rectOutP->h = (bottom > top) ? bottom - top : 0;
}

//safe for rectIn to be same as rectOutP
static void twGfxPrvBoundRectToSurface(struct TwGfxRectType *rectOutP, const struct TwSurface *sur, const struct TwGfxRectType *rectIn)
{
	struct TwGfxRectType surBounds = {.w = sur->w, .h = sur->h, };
	
	twGfxPrvIntersectRects(rectOutP, rectIn, &surBounds);
}

//check that "client" looks like a live client handle: non-NULL and carrying the client magic.
//returns errNone if so, else TW_GFX_ERROR_INVALID_HANDLE
static Err twGfxPrvVerifyHandle(const struct TwGfxClient *client)
{
	const bool looksValid = client && client->magix == TW_GFX_CLIENT_MAGIX;
	
	return looksValid ? errNone : TW_GFX_ERROR_INVALID_HANDLE;
}

//check that "sur" looks like a live surface handle: non-NULL and carrying the surface magic.
//returns errNone if so, else TW_GFX_ERROR_INVALID_HANDLE
static Err twGfxPrvVerifySurface(const struct TwSurface *sur)
{
	const bool looksValid = sur && sur->magix == TW_GFX_SURFACE_MAGIX;
	
	return looksValid ? errNone : TW_GFX_ERROR_INVALID_HANDLE;
}

//verify a client handle and then a surface handle. The client is checked first, so its error wins
static Err twGfxPrvVerifyHandleAndSurface(const struct TwGfxClient *client, const struct TwSurface *sur)
{
	Err status = twGfxPrvVerifyHandle(client);
	
	if (status == errNone)
		status = twGfxPrvVerifySurface(sur);
	
	return status;
}

//fill *infoP with the display geometry (queried from the HAL) and our made-up video memory figures.
//infoP->size must have been set by the caller to sizeof(struct TwGfxInfoType).
//returns errNone, a handle/NULL/version error, or whatever HALDisplayGetAttributes failed with
Err DALEXPORT impl_TwGfxGetInfo(struct TwGfxClient *client, struct TwGfxInfoType *infoP)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifyHandle(client);
	if (status != errNone)
		return status;
	
	if (!infoP)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (infoP->size != sizeof(*infoP))
		return TW_GFX_ERROR_BAD_VERSION;
	
	//query width, height, stride in that order, stopping at the first failure
	status = HALDisplayGetAttributes(hwrDispHorizontal, &infoP->dispW);
	if (status == errNone)
		status = HALDisplayGetAttributes(hwrDispVertical, &infoP->dispH);
	if (status == errNone)
		status = HALDisplayGetAttributes(hwrDispStride, &infoP->dispStride);
	if (status != errNone)
		return status;
	
	infoP->dispPixelFormat = TW_GFX_PIXEL_FMT_RGB565_LE;
	
	//memory figures are fiction: a fixed share of a fixed-size VRAM is claimed to be in use
	infoP->totalGfxMem = TW_GFX_VIDEO_RAM_SZ;
	infoP->freeGfxMem = TW_GFX_VIDEO_RAM_SZ * (100 - TW_GFX_VIDEO_RAM_USED_PERCENT) / 100;	//VRAM is 90% free
	
	return errNone;
}

//remove "sur" from the doubly linked list whose head pointer is *listHeadP.
//sur's own next/prev fields are left as they were
static void twGfxPrvSurfaceUnlink(struct TwSurface *sur, struct TwSurface **listHeadP)
{
	struct TwSurface *following = sur->next;
	struct TwSurface *preceding = sur->prev;
	
	if (following)
		following->prev = preceding;
	
	if (preceding)
		preceding->next = following;
	else	//no predecessor: we were the head
		*listHeadP = following;
}

static void twGfxPrvSurfaceDestroyLocked(struct TwGfxClient *client, struct TwSurface *sur)
{
	twGfxPrvSurfaceUnlink(sur, &client->surfaces);
	
	sur->bits = NULL;
	
	WinDeleteWindow(sur->win, false);
	sur->win = NULL;
	
	BmpDelete(sur->bmp);
	sur->bmp = NULL;
	
	sur->magix = 0;
	MemChunkFree(sur);
}

//create a new RGB565_LE surface of the size given in *descrP and hand it back through *surP.
//The surface is backed by a double-density 16-bit bitmap plus a bitmap window, and is added to the
//client's surface list. descrP->stride is written back with the bitmap's stride.
//returns errNone, a validation error (TW_GFX_ERROR_*), sysErrNoFreeRAM, or the bitmap/window creation error
Err DALEXPORT impl_TwGfxAllocSurface(struct TwGfxClient *client, struct TwSurface **surP, struct TwGfxSurfaceInfoType *descrP)
{
	struct TwSurface *newSur = NULL;
	struct BitmapType *backing = NULL;
	WinHandle offscreen = NULL;
	uint16_t rowBytes;
	Err e;
	
	logt("%s\n", __func__);
	
	e = twGfxPrvVerifyHandle(client);
	if (e != errNone)
		return e;
	
	if (!surP || !descrP)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (descrP->size != sizeof(*descrP))
		return TW_GFX_ERROR_BAD_VERSION;
	
	switch (descrP->location) {
		case TW_GFX_LOCATION_VRAM:
		case TW_GFX_LOCATION_VRAM_AND_DRAM:
			break;
		
		default:
			return TW_GFX_ERROR_INVALID_LOCATION;
	}
	
	if (descrP->pixelFormat != TW_GFX_PIXEL_FMT_RGB565_LE)
		return TW_GFX_ERROR_INVALID_PIX_FMT;
	
	if (descrP->width <= 0 || descrP->width > MAX_SURFACE_W || descrP->height <= 0 || descrP->height > MAX_SURFACE_H)
		return TW_GFX_ERROR_INVALID_SIZE;
	
	logt(" -> %u x %u\n", descrP->width, descrP->height);
	
	//from here on every failure defaults to "out of memory" unless the callee says otherwise
	e = sysErrNoFreeRAM;
	newSur = MemChunkNew(0, sizeof(*newSur), 0x200);
	if (!newSur)
		goto out_err_sur;
	memset(newSur, 0, sizeof(*newSur));
	
	backing = BmpCreateVersion3(descrP->width, descrP->height, 16, NULL, kDensityDouble, kTransparencyNone, pixelFormat565LE, &e);
	if (!backing) {
		if (e == errNone)
			e = sysErrNoFreeRAM;
		goto out_err_bmp;
	}
	
	offscreen = WinCreateBitmapWindow(backing, &e);
	if (!offscreen) {
		if (e == errNone)
			e = sysErrNoFreeRAM;
		goto out_err_win;
	}
	
	BmpGetDimensions(backing, NULL, NULL, &rowBytes);
	
	//fill in the surface. the clip starts out covering all of it
	newSur->magix = TW_GFX_SURFACE_MAGIX;
	newSur->client = client;
	newSur->bmp = backing;
	newSur->win = offscreen;
	newSur->bits = BmpGetBits(backing);
	newSur->w = descrP->width;
	newSur->h = descrP->height;
	newSur->s = rowBytes;
	newSur->clip.w = descrP->width;
	newSur->clip.h = descrP->height;
	newSur->haveDramBacking = descrP->location == TW_GFX_LOCATION_VRAM_AND_DRAM;
	
	descrP->stride = rowBytes;
	
	//push onto the front of the client's surface list
	KALMutexReserve(mTwGfxLock, -1);
	
	newSur->next = client->surfaces;
	if (newSur->next)
		newSur->next->prev = newSur;
	client->surfaces = newSur;
	
	KALMutexRelease(mTwGfxLock);
	
	logt(" -> %u x %u, stride %u\n", descrP->width, descrP->height, descrP->stride);
	
	*surP = newSur;
	return errNone;

	//failure: undo in reverse order of creation
out_err_win:
	BmpDelete(backing);
	
out_err_bmp:
	MemChunkFree(newSur);
	
out_err_sur:
	loge("%s returning 0x%04x\n", __func__, e);
	return e;
}

//destroy a surface previously created with impl_TwGfxAllocSurface.
//The client's two display surfaces cannot be freed: that is reported via fatal() and,
//should fatal() return, TW_GFX_ERROR_INVALID_HANDLE
Err DALEXPORT impl_TwGfxFreeSurface(struct TwGfxClient *client, struct TwSurface *sur)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifyHandleAndSurface(client, sur);
	if (status != errNone)
		return status;
	
	if (client->palmDisp == sur || client->disp == sur) {
		
		fatal("Cannot free disaply surface\n");
		return TW_GFX_ERROR_INVALID_HANDLE;
	}
	
	//the destroy helper edits the client's surface list, so do it under the lock
	KALMutexReserve(mTwGfxLock, -1);
	twGfxPrvSurfaceDestroyLocked(client, sur);
	KALMutexRelease(mTwGfxLock);
	
	return errNone;
}

//set the surface's clip rectangle. A NULL clipP resets the clip to the whole surface.
//The rectangle is stored exactly as given (it may extend past the surface); users of the clip
//bound it to the surface before drawing
Err DALEXPORT impl_TwGfxSetClip(struct TwSurface *sur, const struct TwGfxRectType *clipP)
{
	Err status;
	
	logt("%s(0x%08x, ...)\n", __func__, sur);
	
	status = twGfxPrvVerifySurface(sur);
	if (status != errNone)
		return status;
	
	if (!clipP) {
		
		const struct TwGfxRectType everything = {.x = 0, .y = 0, .w = sur->w, .h = sur->h, };
		
		sur->clip = everything;
	}
	else
		sur->clip = *clipP;
	
	logt(" -> set clip to (%d %d) + (%d %d)\n", sur->clip.x, sur->clip.y, sur->clip.w, sur->clip.h);
	
	return errNone;
}

//copy the surface's clip rectangle (exactly as last stored) into *clipP
Err DALEXPORT impl_TwGfxGetClip(struct TwSurface *sur, struct TwGfxRectType *clipP)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifySurface(sur);
	if (status == errNone) {
		
		if (clipP)
			*clipP = sur->clip;
		else
			status = TW_GFX_ERROR_NULL_PTR;
	}
	
	return status;
}

//copy the entire surface (height * stride bytes) out to dst. "flags" is ignored
Err DALEXPORT impl_TwGfxReadSurface(struct TwSurface *sur, void *dst, uint8_t flags)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifySurface(sur);
	if (status == errNone) {
		
		if (dst)
			memcpy(dst, sur->bits, sur->h * sur->s);
		else
			status = TW_GFX_ERROR_NULL_PTR;
	}
	
	return status;
}

//copy a rectangular region of the surface out to dst, one row at a time.
// sur       - surface to read from
// bounds    - region to read; it is bounded to the surface first. Must not be NULL
// dst       - destination buffer; rows are written dstStride bytes apart
// dstStride - distance in bytes between destination rows
// flags     - ignored
//returns errNone, TW_GFX_ERROR_INVALID_HANDLE, TW_GFX_ERROR_NULL_PTR (dst or bounds is NULL),
//or TW_GFX_ERROR_INVALID_COORDS if nothing of the region lies on the surface
Err DALEXPORT impl_TwGfxReadSurfaceRegion(struct TwSurface *sur, const struct TwGfxRectType *bounds, uint8_t *dst, int32_t dstStride, uint8_t flags)
{
	struct TwGfxRectType rect;
	uint8_t *src;
	int32_t i;
	Err e;
	
	logt("%s\n", __func__);
	
	e = twGfxPrvVerifySurface(sur);
	if (e != errNone)
		return e;
	
	//bounds gets dereferenced below, so it needs the same NULL check as dst
	if (!dst || !bounds)
		return TW_GFX_ERROR_NULL_PTR;
	
	twGfxPrvBoundRectToSurface(&rect, sur, bounds);
	
	if (rect.w <= 0 || rect.h <= 0)
		return TW_GFX_ERROR_INVALID_COORDS;
	
	//first byte of the region: rows are sur->s bytes apart, pixels are 16 bits each
	src = sur->bits;
	src += rect.y * sur->s;
	src += rect.x * sizeof(uint16_t);
	
	for (i = 0; i < rect.h; i++, src += sur->s, dst += dstStride)
		memcpy(dst, src, rect.w * sizeof(uint16_t));
	
	return errNone;
}

//copy pixel rows from src into a rectangular region of the surface.
// sur       - surface to write to
// bounds    - region to write; it is bounded to the surface first. Must not be NULL
// src       - source buffer; rows are read srcStride bytes apart
// srcStride - distance in bytes between source rows
// flags     - ignored
//returns errNone, TW_GFX_ERROR_INVALID_HANDLE, TW_GFX_ERROR_NULL_PTR (src or bounds is NULL),
//or TW_GFX_ERROR_INVALID_COORDS if nothing of the region lies on the surface
Err DALEXPORT impl_TwGfxWriteSurfaceRegion(struct TwSurface *sur, const struct TwGfxRectType *bounds, const uint8_t *src, int32_t srcStride, uint8_t flags)
{
	struct TwGfxRectType rect;
	uint8_t *dst;
	int32_t i;
	Err e;
	
	logt("%s\n", __func__);
	
	e = twGfxPrvVerifySurface(sur);
	if (e != errNone)
		return e;
	
	//bounds gets dereferenced below, so it needs the same NULL check as src
	if (!src || !bounds)
		return TW_GFX_ERROR_NULL_PTR;
	
	twGfxPrvBoundRectToSurface(&rect, sur, bounds);
	
	if (rect.w <= 0 || rect.h <= 0)
		return TW_GFX_ERROR_INVALID_COORDS;
	
	//first byte of the region: rows are sur->s bytes apart, pixels are 16 bits each
	dst = sur->bits;
	dst += rect.y * sur->s;
	dst += rect.x * sizeof(uint16_t);
	
	for (i = 0; i < rect.h; i++, dst += sur->s, src += srcStride)
		memcpy(dst, src, rect.w * sizeof(uint16_t));
	
	return errNone;
}

//overwrite the entire surface (height * stride bytes) with data from src. "flags" is ignored
Err DALEXPORT impl_TwGfxWriteSurface(struct TwSurface *sur, const void *src, uint8_t flags)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifySurface(sur);
	if (status == errNone) {
		
		if (src)
			memcpy(sur->bits, src, sur->h * sur->s);
		else
			status = TW_GFX_ERROR_NULL_PTR;
	}
	
	return status;
}

//give the caller direct access to the surface's pixels via *bitsAddrP. Locks nest: each call
//bumps the lock count, and the first lock of a series resets the dirty flag
Err DALEXPORT impl_TwGfxLockSurface(struct TwSurface *sur, void **bitsAddrP)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifySurface(sur);
	if (status != errNone)
		return status;
	
	if (!bitsAddrP)
		return TW_GFX_ERROR_NULL_PTR;
	
	//going from unlocked to locked: start out clean
	if (sur->lockCt++ == 0)
		sur->dirty = false;
	
	*bitsAddrP = sur->bits;
	
	return errNone;
}

//undo one impl_TwGfxLockSurface call.
// sur     - surface to unlock
// updated - true if the caller modified the pixels while holding this lock; this marks the surface dirty
//returns errNone, TW_GFX_ERROR_INVALID_HANDLE, or TW_GFX_ERROR_SFC_NOT_LOCKED if the surface has no outstanding locks
Err DALEXPORT impl_TwGfxUnlockSurface(struct TwSurface *sur, bool updated)
{
	Err e;
	
	logt("%s\n", __func__);
	
	e = twGfxPrvVerifySurface(sur);
	if (e != errNone)
		return e;
	
	if (!sur->lockCt)
		return TW_GFX_ERROR_SFC_NOT_LOCKED;
	
	if (updated)
		sur->dirty = true;
	
	sur->lockCt--;
	
	if (!sur->lockCt) {
		
		//the format has two %s conversions, so it needs the function name as well as the state
		logt("%s: unlocked surface is %s\n", __func__, sur->dirty ? "dirty" : "clean");
		//todo: if we need to do something about the bits, do so now if "dirty"
	}
	
	return errNone;
}

//report whether the surface has pending operations. Nothing here is asynchronous (yet), so any
//valid surface is always ready; were it busy we would return TW_GFX_ERROR_SFC_IS_BUSY
Err DALEXPORT impl_TwGfxIsSurfaceReady(struct TwSurface *sur)
{
	logt("%s\n", __func__);
	
	//handle validity is the only thing that can go wrong
	return twGfxPrvVerifySurface(sur);
}

//describe a surface: cached width/height/stride, location, and pixel format (always RGB565_LE).
//infoP->size must have been set by the caller to sizeof(struct TwGfxSurfaceInfoType)
Err DALEXPORT impl_TwGfxGetSurfaceInfo(struct TwSurface *sur, struct TwGfxSurfaceInfoType *infoP)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifySurface(sur);
	if (status != errNone)
		return status;
	
	if (!infoP)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (infoP->size != sizeof(*infoP))
		return TW_GFX_ERROR_BAD_VERSION;
	
	if (sur->haveDramBacking)
		infoP->location = TW_GFX_LOCATION_VRAM_AND_DRAM;
	else
		infoP->location = TW_GFX_LOCATION_VRAM;
	
	infoP->pixelFormat = TW_GFX_PIXEL_FMT_RGB565_LE;
	infoP->stride = sur->s;
	infoP->height = sur->h;
	infoP->width = sur->w;
	
	return errNone;
}

//close a client: take its palm display surface (if any) off the global list, take the client off
//the client list, destroy every surface it still owns, invalidate the handle and free it.
//returns errNone, or TW_GFX_ERROR_INVALID_HANDLE if "client" is not a live handle
Err DALEXPORT impl_TwGfxClose(struct TwGfxClient *client)
{
	Err e;
	
	logt("%s\n", __func__);
	
	e = twGfxPrvVerifyHandle(client);
	if (e != errNone)
		return e;
	
	//NOTE(review): twPrvIfaceGfxReset calls us with mTwGfxLock already held, so this relies on
	//the KAL mutex allowing a nested reserve by its owner - confirm
	KALMutexReserve(mTwGfxLock, -1);
	
	if (client->palmDisp)
		twGfxPrvSurfaceUnlink(client->palmDisp, &mDispSurfaces);
	
	if (client->next)
		client->next->prev = client->prev;
	if (client->prev)
		client->prev->next = client->next;
	else
		mClients = client->next;
	
	//the destroy helper is a "...Locked" function, so call it before the lock is released,
	//just as impl_TwGfxFreeSurface does
	while (client->surfaces)
		twGfxPrvSurfaceDestroyLocked(client, client->surfaces);
	
	//stale handles must fail verification from here on
	client->magix = 0;
	
	KALMutexRelease(mTwGfxLock);
	
	MemChunkFree(client);
	
	return errNone;
}

//create a new client and return its handle through *clientP. If infoP is given it is also filled
//in as by impl_TwGfxGetInfo; should that fail, the new client is closed again, *clientP is set
//to NULL and the error is returned
Err DALEXPORT impl_TwGfxOpen(struct TwGfxClient **clientP, struct TwGfxInfoType *infoP)
{
	struct TwGfxClient *newClient;
	Err status = errNone;
	
	logt("%s\n", __func__);
	
	if (!clientP)
		return TW_GFX_ERROR_NULL_PTR;
	
	newClient = MemChunkNew(0, sizeof(*newClient), 0x200);
	if (!newClient)
		return sysErrNoFreeRAM;
	
	memset(newClient, 0, sizeof(*newClient));
	newClient->magix = TW_GFX_CLIENT_MAGIX;
	
	//push onto the front of the client list
	KALMutexReserve(mTwGfxLock, -1);
	
	newClient->next = mClients;
	if (mClients)
		mClients->prev = newClient;
	mClients = newClient;
	
	KALMutexRelease(mTwGfxLock);
	
	//the info query is optional, but if asked for it must succeed
	if (infoP && (status = impl_TwGfxGetInfo(newClient, infoP)) != errNone) {
		
		(void)impl_TwGfxClose(newClient);
		newClient = NULL;
	}
	
	*clientP = newClient;
	return status;
}

//copy the current user display area into *rectP (if given). returns true if landscape
bool DALEXPORT impl_TwGetUserDispAreaBounds(struct RectangleType *rectP)
{
	logt("%s\n", __func__);
	
	if (rectP != NULL)
		*rectP = mDispRect;
	
	return mLandscape;
}

//re-point a palm display surface at the current screen bitmap: its pixels start at the top left
//of the user display area and its size is that area's extent. Uses the surface's existing stride.
//going by the name, the caller is expected to hold mTwGfxLock
static void twGfxPrvUpdateDispSurfaceLocked(struct TwSurface *sur)
{
	uint16_t *screenBits = BmpGetBits((BitmapPtr)halScreenGetCurBitmap());
	
	//skip down to the area's first row, then across to its first column
	sur->bits = screenBits + mDispRect.topLeft.y * sur->s / sizeof(*screenBits) + mDispRect.topLeft.x;
	sur->h = mDispRect.extent.y;
	sur->w = mDispRect.extent.x;
}

//refresh every palm display surface on the mDispSurfaces list, under the lock
static void twGfxPrvUpdatePalmDisplaySurfaces(void)
{
	struct TwSurface *cur;
	
	KALMutexReserve(mTwGfxLock, -1);
	
	cur = mDispSurfaces;
	while (cur) {
		
		twGfxPrvUpdateDispSurfaceLocked(cur);
		cur = cur->next;
	}
	
	KALMutexRelease(mTwGfxLock);
}

//screen lock state change notification. Whatever the new state, all palm display surfaces
//get re-pointed at the current screen bitmap
void zodScreenLockNotif(bool locked)
{
	(void)locked;	//the new state itself is not needed
	
	twGfxPrvUpdatePalmDisplaySurfaces();
}

//set the user display area. Orientation follows the area's shape: wider than tall means landscape,
//taller than wide means portrait, and a square area leaves the orientation unchanged. All palm
//display surfaces are then refreshed. A NULL rectP is logged and otherwise ignored. Always returns errNone
Err DALEXPORT impl_TwSetUserDispAreaBounds(const struct RectangleType *rectP)
{
	if (!rectP) {
		
		logt("%s(NULL)\n", __func__);
		return errNone;
	}
	
	logt("%s( {(%d,%d) _ (%d %d)} )\n", __func__, rectP->topLeft.x, rectP->topLeft.y, rectP->extent.x, rectP->extent.y);
	
	mDispRect = *rectP;
	
	if (rectP->extent.x != rectP->extent.y)
		mLandscape = rectP->extent.x > rectP->extent.y;
	
	twGfxPrvUpdatePalmDisplaySurfaces();
	
	return errNone;
}

//close every client that is still open, one list head at a time.
//NOTE(review): impl_TwGfxClose reserves mTwGfxLock itself, so holding it here relies on the
//KAL mutex allowing a nested reserve by its owner - confirm
static void twPrvIfaceGfxReset(void)
{
	logt("%s\n", __func__);
	
	KALMutexReserve(mTwGfxLock, -1);
	
	for (;;) {
		
		struct TwGfxClient *head = mClients;
		
		if (!head)
			break;
		
		impl_TwGfxClose(head);
	}
	
	KALMutexRelease(mTwGfxLock);
}

//hand out the address of our reset routine as an untyped pointer
void* zodPrvGetGfxResetIface(void)
{
	return (void*)twPrvIfaceGfxReset;
}

//one-time setup: create the mutex that guards our client and surface lists. failure is fatal
void zodTwGfxInit(void)
{
	if (KALMutexCreate(&mTwGfxLock, CREATE_4CC('_','T','w','G')) == errNone)
		return;
	
	fatal("Failed to init TwGfx mtx\n");
}

//report how much graphics memory is in use at the given location. The answer is the same
//made-up figure (a fixed share of the fixed VRAM size) for both valid locations
Err DALEXPORT impl_TwGfxGetMemoryUsage(struct TwGfxClient *client, int32_t location, int32_t *bytesUsedP)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifyHandle(client);
	if (status != errNone)
		return status;
	
	if (!bytesUsedP)
		return TW_GFX_ERROR_NULL_PTR;
	
	switch (location) {
		case TW_GFX_LOCATION_VRAM:
		case TW_GFX_LOCATION_VRAM_AND_DRAM:
			*bytesUsedP = TW_GFX_VIDEO_RAM_SZ * TW_GFX_VIDEO_RAM_USED_PERCENT / 100;
			return errNone;
		
		default:
			return TW_GFX_ERROR_INVALID_LOCATION;
	}
}

//report whether the display is in vertical blanking. We always claim that it is
Err DALEXPORT impl_TwGfxInVBlank(struct TwGfxClient *client, Boolean *inVblank)
{
	Err status;
	
	logt("%s\n", __func__);
	
	status = twGfxPrvVerifyHandle(client);
	if (status == errNone) {
		
		if (inVblank)
			*inVblank = true;	//why not?
		else
			status = TW_GFX_ERROR_NULL_PTR;
	}
	
	return status;
}

//wait for vertical blanking. Since we claim to always be in it, this returns right away
//for any valid client
Err DALEXPORT impl_TwGfxWaitForVBlank(struct TwGfxClient *client)
{
	logt("%s\n", __func__);
	
	//we're there, teehee - so handle validity is the only possible error
	return twGfxPrvVerifyHandle(client);
}

//get the client's "palm display" surface: a surface over the user display area of the current
//screen bitmap. It is created on first use, stored inside the client, and linked into
//mDispSurfaces so it can be refreshed when the screen or the display area changes.
// client - client handle
// surP   - receives the surface, or NULL if it could not be set up
//returns errNone, TW_GFX_ERROR_INVALID_HANDLE, TW_GFX_ERROR_NULL_PTR, or the HALDisplayGetAttributes error
Err DALEXPORT impl_TwGfxGetPalmDisplaySurface(struct TwGfxClient *client, struct TwSurface **surP)
{
	Err e = errNone;
	
	logt("%s\n", __func__);
	
	e = twGfxPrvVerifyHandle(client);
	if (e != errNone)
		return e;
	
	if (!surP)
		return TW_GFX_ERROR_NULL_PTR;
	
	KALMutexReserve(mTwGfxLock, -1);

	if (!client->palmDisp) {
		
		struct TwSurface *sur = &client->palmDispStruct;
		uint32_t dispW = 0, dispH = 0;
		
		//start from a clean, not-yet-valid struct
		memset(sur, 0, sizeof(*sur));
		sur->client = client;
		
		e = HALDisplayGetAttributes(hwrDispHorizontal, &dispW);
		
		if (e == errNone)
			e = HALDisplayGetAttributes(hwrDispVertical, &dispH);
		
		if (e == errNone)
			e = HALDisplayGetAttributes(hwrDispStride, &sur->s);
		
		//publish the surface only once it is fully set up. On failure nothing is cached, so a later
		//call starts over, and impl_TwGfxClose never tries to unlink a surface that was never linked
		if (e == errNone) {
			
			sur->clip.w = dispW;
			sur->clip.h = dispH;
			twGfxPrvUpdateDispSurfaceLocked(sur);
			sur->magix = TW_GFX_SURFACE_MAGIX;
			
			//link it in
			sur->next = mDispSurfaces;
			if (sur->next)
				sur->next->prev = sur;
			mDispSurfaces = sur;
			
			client->palmDisp = sur;
		}
	}
	
	KALMutexRelease(mTwGfxLock);
	
	logt(" -> returning surface 0x%08x\n", client->palmDisp);
	
	*surP = client->palmDisp;
	return e;
}

//get the client's "display" surface: a surface over the entire current screen bitmap, sized and
//strided as the HAL reports. It is created on first use and stored inside the client.
// client - client handle
// surP   - receives the surface, or NULL if it could not be set up
//returns errNone, TW_GFX_ERROR_INVALID_HANDLE, TW_GFX_ERROR_NULL_PTR, or the HALDisplayGetAttributes error
Err DALEXPORT impl_TwGfxGetDisplaySurface(struct TwGfxClient *client, struct TwSurface **surP)
{
	Err e = errNone;
	
	logt("%s\n", __func__);
	
	e = twGfxPrvVerifyHandle(client);
	if (e != errNone)
		return e;
	
	if (!surP)
		return TW_GFX_ERROR_NULL_PTR;
	
	KALMutexReserve(mTwGfxLock, -1);

	if (!client->disp) {
		
		struct TwSurface *sur = &client->dispStruct;
		
		//start from a clean, not-yet-valid struct
		memset(sur, 0, sizeof(*sur));
		sur->client = client;
		
		e = HALDisplayGetAttributes(hwrDispHorizontal, &sur->w);
		
		if (e == errNone)
			e = HALDisplayGetAttributes(hwrDispVertical, &sur->h);
		
		if (e == errNone)
			e = HALDisplayGetAttributes(hwrDispStride, &sur->s);
		
		//publish the surface only once it is fully set up. On failure nothing is cached, so a
		//later call starts over instead of returning a half-initialised surface as valid
		if (e == errNone) {
			
			sur->bits = BmpGetBits((BitmapPtr)halScreenGetCurBitmap());
			sur->clip.w = sur->w;
			sur->clip.h = sur->h;
			sur->magix = TW_GFX_SURFACE_MAGIX;
			
			client->disp = sur;
		}
	}
	
	KALMutexRelease(mTwGfxLock);
	
	logt(" -> returning surface 0x%08x\n", client->disp);
	
	*surP = client->disp;
	return e;
}

//fill a rectangle with a solid colour. The rectangle is first limited to the surface and to the
//surface's clip; if nothing is left that is still a success. "color" is a packed 24-bit colour
Err DALEXPORT impl_TwGfxFillRect(struct TwSurface *sur, const struct TwGfxRectType *rectP, uint32_t color)
{
	struct TwGfxRectType effective;
	uint16_t *rowStart;
	int32_t row;
	Err e;
	
	logt("%s(0x%08x, ...)\n", __func__, sur);
	
	e = twGfxPrvVerifySurface(sur);
	if (e != errNone)
		return e;
	
	if (!rectP)
		return TW_GFX_ERROR_NULL_PTR;
	
	twGfxPrvBoundRectToSurface(&effective, sur, rectP);
	twGfxPrvIntersectRects(&effective, &effective, &sur->clip);
	
	logt(" -> rect is (%d %d) + (%d %d)\n", rectP->x, rectP->y, rectP->w, rectP->h);
	logt(" -> clipped is (%d %d) + (%d %d)\n", effective.x, effective.y, effective.w, effective.h);
	
	if (!(effective.w > 0 && effective.h > 0))
		return errNone;
	
	//two copies of the 16-bit colour side by side, so one 32-bit store paints two pixels
	color = twGfxPrvPackedColorToRGB565(color);
	color |= color << 16;
	
	rowStart = sur->bits;
	rowStart += effective.y * sur->s / sizeof(*rowStart);
	rowStart += effective.x;
	
	for (row = 0; row < effective.h; row++) {
		
		int32_t pixelsLeft = effective.w;
		uint16_t *px = rowStart;
		uint32_t *pair;
		
		//one lone pixel first if needed, to get to 32-bit alignment
		if (((uintptr_t)px) & 3) {
			*px++ = color;
			pixelsLeft--;
		}
		
		//bulk: eight pixels (four 32-bit stores) per round
		pair = (uint32_t*)px;
		for (; pixelsLeft >= 8; pixelsLeft -= 8, pair += 4) {
			
			pair[0] = color;
			pair[1] = color;
			pair[2] = color;
			pair[3] = color;
		}
		
		//leftovers, one pixel at a time
		px = (uint16_t*)pair;
		for (; pixelsLeft > 0; pixelsLeft--)
			*px++ = color;
		
		rowStart += sur->s / sizeof(*rowStart);
	}
	
	return errNone;
}

//draw a one pixel wide vertical line h pixels tall starting at (x, y), limited to *clipP.
//"color" is already RGB565 (only its low 16 bits get stored). clipP must lie within the surface
static void twGfxPrvDrawVerticalLine(struct TwSurface *sur, int32_t x, int32_t y, int32_t h, uint32_t color, const struct TwGfxRectType *clipP)
{
	int32_t top, bottom, rowsLeft;
	uint16_t *px;
	
	//entirely left or right of the clip?
	if (x < clipP->x || x >= clipP->x + clipP->w)
		return;
	
	//trim both ends to the clip
	top = (y < clipP->y) ? clipP->y : y;
	bottom = y + h;
	if (bottom > clipP->y + clipP->h)
		bottom = clipP->y + clipP->h;
	
	if (top >= bottom)
		return;
	
	px = sur->bits;
	px += top * sur->s / sizeof(*px);
	px += x;
	
	for (rowsLeft = bottom - top; rowsLeft > 0; rowsLeft--) {
		*px = color;
		px += sur->s / sizeof(*px);
	}
}

//draw a one pixel tall horizontal line w pixels wide starting at (x, y), limited to *clipP.
//"color" must hold the RGB565 value in both of its 16-bit halves, since whole 32-bit words get
//stored where possible. clipP must lie within the surface
static void twGfxPrvDrawHorizLine(struct TwSurface *sur, int32_t x, int32_t y, int32_t w, uint32_t color, const struct TwGfxRectType *clipP)
{
	int32_t left, right, pixelsLeft;
	uint32_t *pair;
	uint16_t *px;
	
	//entirely above or below the clip?
	if (y < clipP->y || y >= clipP->y + clipP->h)
		return;
	
	//trim both ends to the clip
	left = (x < clipP->x) ? clipP->x : x;
	right = x + w;
	if (right > clipP->x + clipP->w)
		right = clipP->x + clipP->w;
	
	if (left >= right)
		return;
	
	pixelsLeft = right - left;
	
	px = sur->bits;
	px += y * sur->s / sizeof(*px);
	px += left;
	
	//one lone pixel first if needed, to get to 32-bit alignment
	if (((uintptr_t)px) & 3) {
		*px++ = color;
		pixelsLeft--;
	}
	
	//bulk: eight pixels (four 32-bit stores) per round
	pair = (uint32_t*)px;
	for (; pixelsLeft >= 8; pixelsLeft -= 8, pair += 4) {
		pair[0] = color;
		pair[1] = color;
		pair[2] = color;
		pair[3] = color;
	}
	
	//leftovers, one pixel at a time
	px = (uint16_t*)pair;
	for (; pixelsLeft > 0; pixelsLeft--)
		*px++ = color;
}

//draw nSpans horizontal spans in one colour, each limited to the surface's clip.
//"color" is a packed 24-bit colour
Err DALEXPORT impl_TwGfxDrawSpans(struct TwSurface *sur, const struct TwGfxSpanType* spans, int32_t nSpans, uint32_t color)
{
	struct TwGfxRectType effective;
	int32_t idx;
	Err e;
	
	logt("%s(0x%08x, ...)\n", __func__, sur);
	
	e = twGfxPrvVerifySurface(sur);
	if (e != errNone)
		return e;
	
	if (!spans)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (nSpans <= 0)
		return TW_GFX_ERROR_INVALID_COUNT;
	
	//the line drawer wants the 16-bit colour in both halves of the word
	color = twGfxPrvPackedColorToRGB565(color);
	color |= color << 16;
	
	//the stored clip may stick out past the surface; trim it once up front
	twGfxPrvBoundRectToSurface(&effective, sur, &sur->clip);
	
	for (idx = 0; idx < nSpans; idx++)
		twGfxPrvDrawHorizLine(sur, spans[idx].x, spans[idx].y, spans[idx].w, color, &effective);
	
	return errNone;
}

//draw the one pixel wide outline of a rectangle as two vertical and two horizontal lines, each
//limited to the surface's clip. "color" is a packed 24-bit colour
Err DALEXPORT impl_TwGfxDrawRect(struct TwSurface *sur, const struct TwGfxRectType *rectP, uint32_t color)
{
	struct TwGfxRectType effective;
	int32_t left, top, lastCol, lastRow;
	Err e;
	
	logt("%s(0x%08x, ...)\n", __func__, sur);
	
	e = twGfxPrvVerifySurface(sur);
	if (e != errNone)
		return e;
	
	if (!rectP)
		return TW_GFX_ERROR_NULL_PTR;
	
	//the line drawers want the 16-bit colour in both halves of the word
	color = twGfxPrvPackedColorToRGB565(color);
	color |= color << 16;
	
	//the stored clip may stick out past the surface; trim it once up front
	twGfxPrvBoundRectToSurface(&effective, sur, &sur->clip);
	
	left = rectP->x;
	top = rectP->y;
	lastCol = rectP->x + rectP->w - 1;
	lastRow = rectP->y + rectP->h - 1;
	
	//left and right edges
	twGfxPrvDrawVerticalLine(sur, left, top, rectP->h, color, &effective);
	twGfxPrvDrawVerticalLine(sur, lastCol, top, rectP->h, color, &effective);
	
	//top and bottom edges
	twGfxPrvDrawHorizLine(sur, left, top, rectP->w, color, &effective);
	twGfxPrvDrawHorizLine(sur, left, lastRow, rectP->w, color, &effective);
	
	return errNone;
}

static Err twGfxPrvDrawPoints(struct TwSurface *sur, const struct TwGfxPointType *points, int32_t nPoints, const uint32_t *colors, bool perPointColor)
{
	struct TwGfxRectType effective;
	uint32_t color = 0;
	uint16_t *dst;
	int32_t x, y;
	Err e;
	
	e = twGfxPrvVerifySurface(sur);
	if (e != errNone)
		return e;
	
	if (!points || !colors)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (nPoints <= 0)
		return TW_GFX_ERROR_INVALID_COUNT;
	
	if (!perPointColor)
		color = twGfxPrvPackedColorToRGB565(*colors);
	
	twGfxPrvBoundRectToSurface(&effective, sur, &sur->clip);
	
	dst = sur->bits;
	
	while (nPoints--) {
		
		x = points->x;
		y = points->y;
		points++;
		
		if (x >= effective.x && x - effective.x < effective.w && y >= effective.y && y - effective.y < effective.h) {
			
			if (perPointColor)
				color = twGfxPrvPackedColorToRGB565(*colors);
			
			dst[y * sur->s / sizeof(*dst) + x] = color;
		}
		colors++;
	}
	
	return errNone;
}

//plot nPoints pixels, all in the same packed 24-bit colour
Err DALEXPORT impl_TwGfxDrawPoints(struct TwSurface *sur, const struct TwGfxPointType *points, int32_t nPoints, uint32_t color)
{
	const bool perPointColor = false;	//one colour for all points
	
	logt("%s(0x%08x, ...)\n", __func__, sur);
	
	return twGfxPrvDrawPoints(sur, points, nPoints, &color, perPointColor);
}

//plot nPoints pixels, each in its own packed 24-bit colour (colors[i] goes with points[i])
Err DALEXPORT impl_TwGfxDrawColorPoints(struct TwSurface *sur, const struct TwGfxPointType *points, int32_t nPoints, const uint32_t *colors)
{
	const bool perPointColor = true;	//every point brings its own colour
	
	logt("%s(0x%08x, ...)\n", __func__, sur);
	
	return twGfxPrvDrawPoints(sur, points, nPoints, colors, perPointColor);
}

//blend two RGB565 pixels using a 4-bit weight: mask 15 yields src, mask 0 yields dst, and values
//in between mix the channels proportionally.
//xxx: this could use a multiplicative inverse approach too
static inline uint16_t twGfxPrvBlend16(uint16_t src, uint16_t dst, uint16_t mask)
{
	const int srcWeight = mask, dstWeight = 15 - mask;
	
	//widen each channel to its top-aligned 8-bit form
	const int sR = (src >> 8) & 0xf8, sG = (src >> 3) & 0xfc, sB = (src & 0x1f) << 3;
	const int dR = (dst >> 8) & 0xf8, dG = (dst >> 3) & 0xfc, dB = (dst & 0x1f) << 3;
	
	//weighted sums, each still scaled up by 15
	uint16_t mixR = (uint16_t)(sR * srcWeight + dR * dstWeight);
	uint16_t mixG = (uint16_t)(sG * srcWeight + dG * dstWeight);
	uint16_t mixB = (uint16_t)(sB * srcWeight + dB * dstWeight);
	
	//div 15, but faster
	mixR = (uint16_t)((4369 * (mixR + 1)) >> 16);
	mixG = (uint16_t)((4369 * (mixG + 1)) >> 16);
	mixB = (uint16_t)((4369 * (mixB + 1)) >> 16);
	
	//repack as RGB565
	return ((mixR & 0xf8) << 8) | ((mixG & 0xfc) << 3) | (mixB >> 3);
}

//blend two RGB565 pixels using an 8-bit weight: mask 255 yields src, mask 0 yields dst, and values
//in between mix the channels proportionally
static inline uint16_t twGfxPrvBlend256(uint16_t src, uint16_t dst, uint16_t mask)
{
	const int srcWeight = mask, dstWeight = 255 - mask;
	
	//widen each channel to its top-aligned 8-bit form
	const int sR = (src >> 8) & 0xf8, sG = (src >> 3) & 0xfc, sB = (src & 0x1f) << 3;
	const int dR = (dst >> 8) & 0xf8, dG = (dst >> 3) & 0xfc, dB = (dst & 0x1f) << 3;
	
	//weighted sums, each still scaled up by 255
	uint16_t mixR = (uint16_t)(sR * srcWeight + dR * dstWeight);
	uint16_t mixG = (uint16_t)(sG * srcWeight + dG * dstWeight);
	uint16_t mixB = (uint16_t)(sB * srcWeight + dB * dstWeight);
	
	//div 255, but faster
	mixR = (uint16_t)((257 * (mixR + 1)) >> 16);
	mixG = (uint16_t)((257 * (mixG + 1)) >> 16);
	mixB = (uint16_t)((257 * (mixB + 1)) >> 16);
	
	//repack as RGB565
	return ((mixR & 0xf8) << 8) | ((mixG & 0xfc) << 3) | (mixB >> 3);
}

//blend two RGB565 pixels using a separate 8-bit weight for each channel: a weight of 255 takes
//that channel from src, 0 takes it from dst
static inline uint16_t twGfxPrvBlend256complex(uint16_t src, uint16_t dst, uint16_t maskR, uint16_t maskG, uint16_t maskB)
{
	//widen each channel to its top-aligned 8-bit form
	const int sR = (src >> 8) & 0xf8, sG = (src >> 3) & 0xfc, sB = (src & 0x1f) << 3;
	const int dR = (dst >> 8) & 0xf8, dG = (dst >> 3) & 0xfc, dB = (dst & 0x1f) << 3;
	
	//weighted sums, each still scaled up by 255
	uint16_t mixR = (uint16_t)(sR * maskR + dR * (255 - maskR));
	uint16_t mixG = (uint16_t)(sG * maskG + dG * (255 - maskG));
	uint16_t mixB = (uint16_t)(sB * maskB + dB * (255 - maskB));
	
	//div 255, but faster
	mixR = (uint16_t)((257 * (mixR + 1)) >> 16);
	mixG = (uint16_t)((257 * (mixG + 1)) >> 16);
	mixB = (uint16_t)((257 * (mixB + 1)) >> 16);
	
	//repack as RGB565
	return ((mixR & 0xf8) << 8) | ((mixG & 0xfc) << 3) | (mixB >> 3);
}

//blend a w x h block of source pixels onto the destination, weighting each pixel by a 4-bit mask.
//The mask packs two pixels per byte, the earlier pixel in the high nibble. If maskStartsOdd, each
//row's first pixel uses the low nibble of the row's first mask byte.
//we are guaranteed to not get zero-sized requests
static void twGfxPrvMaskBlendBlit(uint16_t *dstP, uint32_t dstPixelStride, const uint16_t *srcP, uint32_t srcPixelStride, const uint8_t *maskP, uint32_t maskByteStride, uint32_t w, uint32_t h, bool maskStartsOdd)
{
	for (; h != 0; h--) {
		
		const uint8_t *maskByte = maskP;
		const uint16_t *srcPx = srcP;
		uint16_t *dstPx = dstP;
		uint32_t pixelsLeft = w;
		
		//leading pixel that sits in the second half of a mask byte
		if (maskStartsOdd) {
			
			dstPx[0] = twGfxPrvBlend16(srcPx[0], dstPx[0], maskByte[0] & 0x0f);
			maskByte++;
			srcPx++;
			dstPx++;
			pixelsLeft--;
		}
		
		//whole mask bytes: two pixels each
		for (; pixelsLeft >= 2; pixelsLeft -= 2, maskByte++, srcPx += 2, dstPx += 2) {
			
			const uint_fast8_t nibbles = maskByte[0];
			
			dstPx[0] = twGfxPrvBlend16(srcPx[0], dstPx[0], nibbles >> 4);
			dstPx[1] = twGfxPrvBlend16(srcPx[1], dstPx[1], nibbles & 0x0f);
		}
		
		//trailing pixel that only uses the first half of a mask byte
		if (pixelsLeft)
			dstPx[0] = twGfxPrvBlend16(srcPx[0], dstPx[0], maskByte[0] >> 4);
		
		maskP += maskByteStride;
		srcP += srcPixelStride;
		dstP += dstPixelStride;
	}
}

static Err twGfxPrvBlitVerifyAndCalculate(struct TwSurface *dstSur, const struct TwGfxPointType *dstPt, bool verifySrcSur, const struct TwSurface *srcSur, const struct TwGfxRectType *srcRect, struct TwGfxRectType *dstRectP, struct TwGfxRectType *realSrcRectP, uint16_t **dstP, const uint16_t **srcP)
{
	Err e;
	
	e = twGfxPrvVerifySurface(dstSur);
	if (e != errNone)
		return e;
	
	if (verifySrcSur) {
		e = twGfxPrvVerifySurface(srcSur);
		if (e != errNone)
			return e;
	}
	
	if (!dstPt || !srcRect)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (verifySrcSur) {
	
		if (srcRect->x < 0 || srcRect->y < 0 || srcRect->w <= 0 || srcRect->h <= 0 || srcRect->x + srcRect->w > (int32_t)srcSur->w || srcRect->y + srcRect->h > (int32_t)srcSur->h)
			return TW_GFX_ERROR_INVALID_COORDS;
	}
	
	if (dstPt->x >= (int32_t)dstSur->w || dstPt->y >= (int32_t)dstSur->h)
		return TW_GFX_ERROR_INVALID_COORDS;
	
	//calc current effective dst clip & limit our request to it
	dstRectP->x = dstPt->x;
	dstRectP->y = dstPt->y;
	dstRectP->w = srcRect->w;
	dstRectP->h = srcRect->h;
	twGfxPrvIntersectRects(dstRectP, dstRectP, &dstSur->clip);
	twGfxPrvBoundRectToSurface(dstRectP, dstSur, dstRectP);
	
	//calc the real src rect
	realSrcRectP->x = srcRect->x + dstRectP->x - dstPt->x;
	realSrcRectP->y = srcRect->y + dstRectP->y - dstPt->y;
	realSrcRectP->w = dstRectP->w;
	realSrcRectP->h = dstRectP->h;
	
	//get pointers
	if (verifySrcSur) {
		*srcP = srcSur->bits;
		*srcP += realSrcRectP->y * srcSur->s / sizeof(**srcP);
		*srcP += realSrcRectP->x;
	}
	
	*dstP = dstSur->bits;
	*dstP += dstRectP->y * dstSur->s / sizeof(**dstP);
	*dstP += dstRectP->x;
	
	//nice
	return errNone;
}

//blend a block of srcSur onto dstSur at dstPt, weighting every pixel by a 4bpp mask bitmap.
//The mask must be a TW_GFX_PIXEL_FMT_4BPP bitmap at least as large as srcRect, and it is indexed
//with the same coordinates as the source surface.
//NOTE(review): the size check compares only srcRect's width/height against the mask, not x + w / y + h
Err DALEXPORT impl_TwGfxMaskBlendBlt(struct TwSurface *dstSur, const struct TwGfxPointType *dstPt, const struct TwSurface *srcSur, const struct TwGfxRectType *srcRect, const struct TwGfxBitmapType *maskBmp)
{
	struct TwGfxRectType dstArea, srcArea;
	const uint16_t *srcPixels;
	const uint8_t *maskBytes;
	uint16_t *dstPixels;
	Err status;
	
	logt("%s(0x%08x, ..., 0x%08x, ...)\n", __func__, dstSur, srcSur);
	
	status = twGfxPrvBlitVerifyAndCalculate(dstSur, dstPt, true, srcSur, srcRect, &dstArea, &srcArea, &dstPixels, &srcPixels);
	if (status != errNone)
		return status;
	
	if (!maskBmp)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (maskBmp->size != sizeof(*maskBmp))
		return TW_GFX_ERROR_BAD_VERSION;
	
	if (maskBmp->pixelFormat != TW_GFX_PIXEL_FMT_4BPP)
		return TW_GFX_ERROR_INVALID_PIX_FMT;
	
	if (maskBmp->width < srcRect->w || maskBmp->height < srcRect->h)
		return TW_GFX_ERROR_INVALID_SIZE;
	
	//everything clipped away? then we are done
	if (dstArea.w <= 0 || dstArea.h <= 0)
		return errNone;
	
	//first mask byte: down to the row, then across at two pixels per byte (4bpp, remember?)
	maskBytes = (const uint8_t*)maskBmp->data + srcArea.y * maskBmp->stride / sizeof(*maskBytes) + srcArea.x / 2;
	
	logt(" -> src rect is (%d %d) + (%d %d)\n", srcArea.x, srcArea.y, srcArea.w, srcArea.h);
	logt(" -> dst rect is (%d %d) + (%d %d)\n", dstArea.x, dstArea.y, dstArea.w, dstArea.h);
	
	//an odd starting column means the first pixel sits in the second half of its mask byte
	twGfxPrvMaskBlendBlit(dstPixels, dstSur->s / sizeof(*dstPixels), srcPixels, srcSur->s / sizeof(*srcPixels), maskBytes, maskBmp->stride, srcArea.w, srcArea.h, srcArea.x & 1);
	
	return errNone;
}

//Copy a w x h block of 16-bit pixels from srcP to dstP.
// dstPixelStride / srcPixelStride - distance between the starts of consecutive rows, in pixels
//Source and destination may overlap (e.g. both inside the same surface): rows are moved with
// memmove() and the row order is picked so that no source row is overwritten before it is read.
static void twGfxPrvBlit(uint16_t *dstP, uint32_t dstPixelStride, const uint16_t *srcP, uint32_t srcPixelStride, uint32_t w, uint32_t h)
{
	const size_t rowBytes = w * sizeof(*dstP);
	uint32_t row;
	
	if ((uintptr_t)dstP > (uintptr_t)srcP) {
		
		//destination is above source in memory: go bottom-up so a downward move within one buffer is safe
		for (row = h; row--;)
			memmove(dstP + (size_t)row * dstPixelStride, srcP + (size_t)row * srcPixelStride, rowBytes);
	}
	else {
		
		//otherwise top-down is the safe order
		for (row = 0; row < h; row++)
			memmove(dstP + (size_t)row * dstPixelStride, srcP + (size_t)row * srcPixelStride, rowBytes);
	}
}

//Plain copy of a rectangle of srcSur into dstSur at dstPt.
//Returns whatever twGfxPrvBlitVerifyAndCalculate() reports on failure, errNone otherwise
// (including the case where the clipped destination rectangle is empty and nothing is drawn).
Err DALEXPORT impl_TwGfxBitBlt(struct TwSurface *dstSur, const struct TwGfxPointType *dstPt, const struct TwSurface *srcSur, const struct TwGfxRectType *srcRect)
{
	struct TwGfxRectType clipDst, clipSrc;
	const uint16_t *srcPix;
	uint16_t *dstPix;
	Err err;
	
	logt("%s(0x%08x, ..., 0x%08x, ...)\n", __func__, dstSur, srcSur);
	
	//validate, clip, and get pointers to the first pixel on each side
	err = twGfxPrvBlitVerifyAndCalculate(dstSur, dstPt, true, srcSur, srcRect, &clipDst, &clipSrc, &dstPix, &srcPix);
	if (err != errNone)
		return err;
	
	//only draw if clipping left us something
	if (clipDst.w > 0 && clipDst.h > 0) {
		
		logt(" -> src rect is (%d %d) + (%d %d)\n", clipSrc.x, clipSrc.y, clipSrc.w, clipSrc.h);
		logt(" -> dst rect is (%d %d) + (%d %d)\n", clipDst.x, clipDst.y, clipDst.w, clipDst.h);
		
		//surface strides are in bytes, the worker wants pixels
		twGfxPrvBlit(dstPix, dstSur->s / sizeof(*dstPix), srcPix, srcSur->s / sizeof(*srcPix), clipSrc.w, clipSrc.h);
	}
	
	return errNone;
}

//Copy a w x h block of 16-bit pixels, leaving the destination untouched wherever the
// source pixel equals transparentColor. Strides are in pixels.
static void twGfxPrvTransparentBlit(uint16_t *dstP, uint32_t dstPixelStride, const uint16_t *srcP, uint32_t srcPixelStride, uint32_t w, uint32_t h, uint32_t transparentColor)
{
	uint32_t row, col;
	
	for (row = 0; row < h; row++) {
		
		for (col = 0; col < w; col++) {
			
			uint32_t pix = srcP[col];
			
			//transparent pixels are simply skipped
			if (pix == transparentColor)
				continue;
			
			dstP[col] = pix;
		}
		
		srcP += srcPixelStride;
		dstP += dstPixelStride;
	}
}

//Copy a rectangle of srcSur into dstSur at dstPt, skipping source pixels that match transparentColor.
// transparentColor is converted with twGfxPrvPackedColorToRGB565() before being compared to pixels.
//Returns whatever twGfxPrvBlitVerifyAndCalculate() reports on failure, errNone otherwise
// (including the case where the clipped destination rectangle is empty and nothing is drawn).
Err DALEXPORT impl_TwGfxTransparentBlt(struct TwSurface *dstSur, const struct TwGfxPointType *dstPt, const struct TwSurface *srcSur, const struct TwGfxRectType *srcRect, uint32_t transparentColor)
{
	struct TwGfxRectType clipDst, clipSrc;
	const uint16_t *srcPix;
	uint16_t *dstPix;
	Err err;
	
	logt("%s(0x%08x, ..., 0x%08x, ...)\n", __func__, dstSur, srcSur);
	
	//validate, clip, and get pointers to the first pixel on each side
	err = twGfxPrvBlitVerifyAndCalculate(dstSur, dstPt, true, srcSur, srcRect, &clipDst, &clipSrc, &dstPix, &srcPix);
	if (err != errNone)
		return err;
	
	//only draw if clipping left us something
	if (clipDst.w > 0 && clipDst.h > 0) {
		
		logt(" -> src rect is (%d %d) + (%d %d)\n", clipSrc.x, clipSrc.y, clipSrc.w, clipSrc.h);
		logt(" -> dst rect is (%d %d) + (%d %d)\n", clipDst.x, clipDst.y, clipDst.w, clipDst.h);
		
		//surface strides are in bytes, the worker wants pixels
		twGfxPrvTransparentBlit(dstPix, dstSur->s / sizeof(*dstPix), srcPix, srcSur->s / sizeof(*srcPix), clipSrc.w, clipSrc.h, twGfxPrvPackedColorToRGB565(transparentColor));
	}
	
	return errNone;
}

//Blend a w x h block of source pixels into the destination, one alpha value for the whole pixel.
// Every destination pixel is replaced by twGfxPrvBlend256(source pixel, destination pixel, alpha).
// Strides are in pixels.
static void twGfxPrvBlendBlitSimple(uint16_t *dstP, uint32_t dstPixelStride, const uint16_t *srcP, uint32_t srcPixelStride, uint32_t w, uint32_t h, uint32_t alpha)
{
	uint32_t row, col;
	
	for (row = 0; row < h; row++) {
		
		for (col = 0; col < w; col++)
			dstP[col] = twGfxPrvBlend256(srcP[col], dstP[col], alpha);
		
		srcP += srcPixelStride;
		dstP += dstPixelStride;
	}
}


//Blend a w x h block of source pixels into the destination, with separate alpha values per channel.
// Every destination pixel is replaced by twGfxPrvBlend256complex(source pixel, destination pixel, alphaR, alphaG, alphaB).
// Strides are in pixels.
static void twGfxPrvBlendBlitComplex(uint16_t *dstP, uint32_t dstPixelStride, const uint16_t *srcP, uint32_t srcPixelStride, uint32_t w, uint32_t h, uint32_t alphaR, uint32_t alphaG, uint32_t alphaB)
{
	uint32_t row, col;
	
	for (row = 0; row < h; row++) {
		
		for (col = 0; col < w; col++)
			dstP[col] = twGfxPrvBlend256complex(srcP[col], dstP[col], alphaR, alphaG, alphaB);
		
		srcP += srcPixelStride;
		dstP += dstPixelStride;
	}
}

//Alpha-blend a rectangle of srcSur onto dstSur at dstPt.
// srcAlpha carries three 8-bit alpha values: bits 16..23 for red, 8..15 for green, 0..7 for blue.
// When all three are equal the single-alpha blender is used, else the per-channel one.
//Returns whatever twGfxPrvBlitVerifyAndCalculate() reports on failure, errNone otherwise
// (including the case where the clipped destination rectangle is empty and nothing is drawn).
Err DALEXPORT impl_TwGfxBlendBlt(struct TwSurface *dstSur, const struct TwGfxPointType *dstPt, const struct TwSurface *srcSur, const struct TwGfxRectType *srcRect, uint32_t srcAlpha)
{
	struct TwGfxRectType clipDst, clipSrc;
	uint8_t aR, aG, aB;
	const uint16_t *srcPix;
	uint16_t *dstPix;
	Err err;
	
	logt("%s(0x%08x, ..., 0x%08x, ...)\n", __func__, dstSur, srcSur);
	
	//validate, clip, and get pointers to the first pixel on each side
	err = twGfxPrvBlitVerifyAndCalculate(dstSur, dstPt, true, srcSur, srcRect, &clipDst, &clipSrc, &dstPix, &srcPix);
	if (err != errNone)
		return err;
	
	//only draw if clipping left us something
	if (clipDst.w > 0 && clipDst.h > 0) {
		
		logt(" -> src rect is (%d %d) + (%d %d)\n", clipSrc.x, clipSrc.y, clipSrc.w, clipSrc.h);
		logt(" -> dst rect is (%d %d) + (%d %d)\n", clipDst.x, clipDst.y, clipDst.w, clipDst.h);
		
		//storing into uint8_t keeps just the low 8 bits of each shifted value
		aB = srcAlpha;
		aG = srcAlpha >> 8;
		aR = srcAlpha >> 16;
		
		if (aR != aG || aR != aB)
			twGfxPrvBlendBlitComplex(dstPix, dstSur->s / sizeof(*dstPix), srcPix, srcSur->s / sizeof(*srcPix), clipSrc.w, clipSrc.h, aR, aG, aB);
		else
			twGfxPrvBlendBlitSimple(dstPix, dstSur->s / sizeof(*dstPix), srcPix, srcSur->s / sizeof(*srcPix), clipSrc.w, clipSrc.h, aR);
	}
	
	return errNone;
}

//Expand a 1bpp bitmap into 16-bit pixels via a 2-entry lookup table.
// dst, dstPixelStride - destination and its row pitch in pixels
// srcP, srcByteStride - first source byte and the source row pitch in bytes
// w, h                - size in pixels
// srcSubByteOfst      - how many pixels of the first byte of every row to skip (0..7)
// clut                - clut[0] / clut[1] are the output values for clear / set bits
//Pixels are stored most significant bit first.
static void twGfxPrvBlitBmp1bpp(uint16_t *dst, uint32_t dstPixelStride, const void *srcP, uint32_t srcByteStride, uint32_t w, uint32_t h, uint32_t srcSubByteOfst, const uint16_t *clut)
{
	uint32_t i, pixLeftInByte, dstPixelStrideMinusWidth = dstPixelStride - w;
	const uint8_t *srcRow = srcP, *src;
	uint_fast8_t val;
	
	while (h--) {
		
		src = srcRow;
		i = w;
		
		//leading partial byte. the offset applies to EVERY row, so it must not be modified here
		if (srcSubByteOfst) {
			
			val = *src++ << srcSubByteOfst;
			for (pixLeftInByte = 8 - srcSubByteOfst; i && pixLeftInByte; pixLeftInByte--) {
				
				//uint_fast8_t may be wider than 8 bits, so mask off whatever got shifted above bit 7
				*dst++ = clut[(val >> 7) & 1];
				val <<= 1;
				i--;
			}
		}
		
		//whole bytes
		while (i >= 8) {
			
			uint_fast8_t b = *src++;
			
			*dst++ = clut[(b >> 7) & 1];
			*dst++ = clut[(b >> 6) & 1];
			*dst++ = clut[(b >> 5) & 1];
			*dst++ = clut[(b >> 4) & 1];
			*dst++ = clut[(b >> 3) & 1];
			*dst++ = clut[(b >> 2) & 1];
			*dst++ = clut[(b >> 1) & 1];
			*dst++ = clut[b & 1];
			i -= 8;
		}
		
		//trailing partial byte
		if (i) {
			val = *src;
			while (i) {
				
				*dst++ = clut[(val >> 7) & 1];
				val <<= 1;
				i--;
			}
		}
		
		srcRow += srcByteStride;
		dst += dstPixelStrideMinusWidth;
	}
}

//Expand a 2bpp bitmap into 16-bit pixels via a 4-entry lookup table.
// dst, dstPixelStride - destination and its row pitch in pixels
// srcP, srcByteStride - first source byte and the source row pitch in bytes
// w, h                - size in pixels
// srcSubByteOfst      - how many pixels of the first byte of every row to skip (0..3)
// clut                - output value for each of the four possible pixel values
//Pixels are stored most significant bits first.
static void twGfxPrvBlitBmp2bpp(uint16_t *dst, uint32_t dstPixelStride, const void *srcP, uint32_t srcByteStride, uint32_t w, uint32_t h, uint32_t srcSubByteOfst, const uint16_t *clut)
{
	uint32_t i, pixLeftInByte, dstPixelStrideMinusWidth = dstPixelStride - w;
	const uint8_t *srcRow = srcP, *src;
	uint_fast8_t val;
	
	while (h--) {
		
		src = srcRow;
		i = w;
		
		//leading partial byte. the offset applies to EVERY row, so it must not be modified here
		if (srcSubByteOfst) {
			
			val = *src++ << (2 * srcSubByteOfst);
			for (pixLeftInByte = 4 - srcSubByteOfst; i && pixLeftInByte; pixLeftInByte--) {
				
				//uint_fast8_t may be wider than 8 bits, so mask off whatever got shifted above bit 7
				*dst++ = clut[(val >> 6) & 3];
				val <<= 2;
				i--;
			}
		}
		
		//whole bytes
		while (i >= 4) {
			
			uint_fast8_t b = *src++;
			
			*dst++ = clut[(b >> 6) & 3];
			*dst++ = clut[(b >> 4) & 3];
			*dst++ = clut[(b >> 2) & 3];
			*dst++ = clut[b & 3];
			i -= 4;
		}
		
		//trailing partial byte
		if (i) {
			val = *src;
			while (i) {
				
				*dst++ = clut[(val >> 6) & 3];
				val <<= 2;
				i--;
			}
		}
		
		srcRow += srcByteStride;
		dst += dstPixelStrideMinusWidth;
	}
}

//Expand a 4bpp bitmap into 16-bit pixels via a 16-entry lookup table.
// dst, dstPixelStride - destination and its row pitch in pixels
// srcP, srcByteStride - first source byte and the source row pitch in bytes
// w, h                - size in pixels
// srcSubByteOfst      - nonzero if every row starts at the second (low-nibble) pixel of its first byte
// clut                - output value for each of the sixteen possible pixel values
//High nibble is the left pixel, low nibble the right one.
static void twGfxPrvBlitBmp4bpp(uint16_t *dst, uint32_t dstPixelStride, const void *srcP, uint32_t srcByteStride, uint32_t w, uint32_t h, uint32_t srcSubByteOfst, const uint16_t *clut)
{
	uint32_t i, dstPixelStrideMinusWidth = dstPixelStride - w;
	const uint8_t *srcRow = srcP, *src;
	
	while (h--) {
		
		src = srcRow;
		i = w;
		
		//leading half byte (checking i keeps a zero width from underflowing the counter)
		if (srcSubByteOfst && i) {
			
			*dst++ = clut[*src++ & 0x0f];
			i--;
		}
		
		//whole bytes
		while (i >= 2) {
			
			uint_fast8_t val = *src++;
			
			*dst++ = clut[val >> 4];
			*dst++ = clut[val & 0x0f];
			i -= 2;
		}
		
		//trailing half byte. dst must advance here too, or the row-end adjustment below
		// leaves every following row one pixel short
		if (i)
			*dst++ = clut[*src >> 4];
		
		srcRow += srcByteStride;
		dst += dstPixelStrideMinusWidth;
	}
}

//Expand an 8bpp bitmap into 16-bit pixels: each source byte indexes clut.
// dstPixelStride is in pixels, srcByteStride in bytes. srcSubByteOfst is not used by this format.
static void twGfxPrvBlitBmp8bpp(uint16_t *dst, uint32_t dstPixelStride, const void *srcP, uint32_t srcByteStride, uint32_t w, uint32_t h, uint32_t srcSubByteOfst, const uint16_t *clut)
{
	//what to skip at the end of each row to land on the start of the next
	const uint32_t srcRowGap = srcByteStride - w, dstRowGap = dstPixelStride - w;
	const uint8_t *in = srcP;
	uint32_t rowsLeft, colsLeft;
	
	for (rowsLeft = h; rowsLeft; rowsLeft--) {
		
		for (colsLeft = w; colsLeft; colsLeft--)
			*dst++ = clut[*in++];
		
		in += srcRowGap;
		dst += dstRowGap;
	}
}

//Copy a bitmap whose 16-bit pixels need no conversion: each row is copied byte for byte.
// dstPixelStride is in pixels, srcByteStride in bytes. srcSubByteOfst and clut are not used by this format.
static void twGfxPrvBlitBmpRGB565LE(uint16_t *dstP, uint32_t dstPixelStride, const void *srcP, uint32_t srcByteStride, uint32_t w, uint32_t h, uint32_t srcSubByteOfst, const uint16_t *clut)
{
	const uint8_t *srcRow = srcP;
	uint32_t row;
	
	for (row = 0; row < h; row++) {
		
		memcpy(dstP, srcRow, w * sizeof(*dstP));
		
		dstP += dstPixelStride;
		srcRow += srcByteStride;
	}
}

//Copy a bitmap of 16-bit pixels, swapping the two bytes of every pixel on the way.
// dstPixelStride is in pixels, srcByteStride in bytes. srcSubByteOfst and clut are not used by this format.
static void twGfxPrvBlitBmpRGB565BE(uint16_t *dst, uint32_t dstPixelStride, const void *srcP, uint32_t srcByteStride, uint32_t w, uint32_t h, uint32_t srcSubByteOfst, const uint16_t *clut)
{
	//what to skip (in pixels) at the end of each row to land on the start of the next
	const uint32_t srcRowGap = srcByteStride / sizeof(uint16_t) - w, dstRowGap = dstPixelStride - w;
	const uint16_t *in = srcP;
	uint32_t rowsLeft, colsLeft;
	
	for (rowsLeft = h; rowsLeft; rowsLeft--) {
		
		for (colsLeft = w; colsLeft; colsLeft--)
			*dst++ = __builtin_bswap16(*in++);
		
		in += srcRowGap;
		dst += dstRowGap;
	}
}

//Draw a bitmap (any of the TW_GFX_PIXEL_FMT_* formats) onto dstSur with its top left at dstPt.
// dstSur, dstPt - destination surface and where in it to draw
// bmp           - bitmap to draw; bmp->size must equal sizeof(*bmp)
//Returns TW_GFX_ERROR_NULL_PTR for a NULL bmp, TW_GFX_ERROR_BAD_VERSION for a wrong bmp->size,
// TW_GFX_ERROR_INVALID_PIX_FMT for an unknown pixel format, whatever error
// twGfxPrvBlitVerifyAndCalculate() reports, or errNone (also when clipping leaves nothing to draw).
Err DALEXPORT impl_TwGfxDrawBitmap(struct TwSurface *dstSur, const struct TwGfxPointType *dstPt, const struct TwGfxBitmapType *bmp)
{
	struct TwGfxRectType srcRect = {}, dstRect, realSrcRect;
	uint32_t bppShiftR = 0, bppShiftL = 0, pixOfstMask;
	BmpBlitF blitter = NULL;
	const uint8_t *src;
	uint16_t *dst;
	Err e;
	
	logt("%s(0x%08x, ...)\n", __func__, dstSur);
	
	if (!bmp)
		return TW_GFX_ERROR_NULL_PTR;
	
	if (bmp->size != sizeof(*bmp))
		return TW_GFX_ERROR_BAD_VERSION;
	
	//per format: the row worker, how to turn an x coordinate into a byte offset
	// ((x << bppShiftL) >> bppShiftR), and which bits of x select a pixel inside that byte
	switch (bmp->pixelFormat) {
		case TW_GFX_PIXEL_FMT_1BPP:
			blitter = twGfxPrvBlitBmp1bpp;
			bppShiftR = 3;
			pixOfstMask = 7;
			break;
		
		case TW_GFX_PIXEL_FMT_RGB565_LE:
			blitter = twGfxPrvBlitBmpRGB565LE;
			bppShiftL = 1;
			pixOfstMask = 0;
			break;
		
		case TW_GFX_PIXEL_FMT_RGB565_BE:
			blitter = twGfxPrvBlitBmpRGB565BE;
			bppShiftL = 1;
			pixOfstMask = 0;
			break;
		
		case TW_GFX_PIXEL_FMT_2BPP:
			blitter = twGfxPrvBlitBmp2bpp;
			bppShiftR = 2;
			pixOfstMask = 3;
			break;
		
		case TW_GFX_PIXEL_FMT_4BPP:
			blitter = twGfxPrvBlitBmp4bpp;
			bppShiftR = 1;
			pixOfstMask = 1;
			break;
		
		case TW_GFX_PIXEL_FMT_8BPP:
			blitter = twGfxPrvBlitBmp8bpp;
			pixOfstMask = 0;
			break;
		
		default:
			return TW_GFX_ERROR_INVALID_PIX_FMT;
	}
	
	//the whole bitmap is the source rect
	srcRect.w = bmp->width;
	srcRect.h = bmp->height;
	
	//no source surface here, so no source pointer is requested
	e = twGfxPrvBlitVerifyAndCalculate(dstSur, dstPt, false, NULL, &srcRect, &dstRect, &realSrcRect, &dst, NULL);
	if (e != errNone)
		return e;
	
	//mayhaps we have nothing left to do? (the blitters take unsigned sizes, so a non-positive
	// clipped width or height must never reach them)
	if (dstRect.w <= 0 || dstRect.h <= 0)
		return errNone;
	
	//first source byte of the clipped area
	src = bmp->data;
	src += realSrcRect.y * bmp->stride;
	src += (realSrcRect.x << bppShiftL) >> bppShiftR;
	
	logt(" -> src rect is (%d %d) + (%d %d)\n", realSrcRect.x, realSrcRect.y, realSrcRect.w, realSrcRect.h);
	logt(" -> dst rect is (%d %d) + (%d %d)\n", dstRect.x, dstRect.y, dstRect.w, dstRect.h);
	
	blitter(dst, dstSur->s / sizeof(*dst), src, bmp->stride, realSrcRect.w, realSrcRect.h, realSrcRect.x & pixOfstMask, bmp->pal);
	
	return errNone;
}
