From 1dd4e7bd6552c077457fe187fa86ae68d9d523c1 Mon Sep 17 00:00:00 2001
From: John Tsiombikas <nuclear@member.fsf.org>
Date: Sun, 8 Oct 2023 03:24:24 +0300
Subject: [PATCH] 3dgfx code, untested

---
 Makefile               |    5 +-
 src/3dgfx/3dgfx.c      |  895 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/3dgfx/3dgfx.h      |  140 ++++++++
 src/3dgfx/mesh.c       |  601 ++++++++++++++++++++++++++++++++
 src/3dgfx/mesh.h       |   60 ++++
 src/3dgfx/polyclip.c   |  330 ++++++++++++++++++
 src/3dgfx/polyclip.h   |   38 ++
 src/3dgfx/polyfill.c   |  333 ++++++++++++++++++
 src/3dgfx/polyfill.h   |   71 ++++
 src/3dgfx/polytmpl.h   |  229 +++++++++++++
 src/cgmath/cgmath.h    |  280 +++++++++++++++
 src/cgmath/cgmmat.inl  |  667 ++++++++++++++++++++++++++++++++++++
 src/cgmath/cgmmisc.inl |  211 ++++++++++++
 src/cgmath/cgmquat.inl |  159 +++++++++
 src/cgmath/cgmray.inl  |   39 +++
 src/cgmath/cgmvec3.inl |  211 ++++++++++++
 src/cgmath/cgmvec4.inl |  168 +++++++++
 src/colormgr.c         |   31 ++
 src/colormgr.h         |    9 +
 src/dynarr.c           |  141 ++++++++
 src/dynarr.h           |   80 +++++
 src/game.c             |   18 +-
 src/gfxutil.c          |  178 ++++++++++
 src/gfxutil.h          |   17 +
 src/libc/math.h        |    1 +
 src/libc/stdint.h      |    6 +
 src/libc/string.c      |   11 +
 src/libc/string.h      |    2 +
 src/rbtree.c           |  518 ++++++++++++++++++++++++++++
 src/rbtree.h           |   79 +++++
 src/util.c             |   48 +++
 src/util.h             |  112 ++++++
 src/vga.h              |   15 +
 src/vga_s.asm          |   51 +++
 34 files changed, 5751 insertions(+), 3 deletions(-)
 create mode 100644 src/3dgfx/3dgfx.c
 create mode 100644 src/3dgfx/3dgfx.h
 create mode 100644 src/3dgfx/mesh.c
 create mode 100644 src/3dgfx/mesh.h
 create mode 100644 src/3dgfx/polyclip.c
 create mode 100644 src/3dgfx/polyclip.h
 create mode 100644 src/3dgfx/polyfill.c
 create mode 100644 src/3dgfx/polyfill.h
 create mode 100644 src/3dgfx/polytmpl.h
 create mode 100644 src/cgmath/cgmath.h
 create mode 100644 src/cgmath/cgmmat.inl
 create mode 100644 src/cgmath/cgmmisc.inl
 create mode 100644 src/cgmath/cgmquat.inl
 create mode 100644 src/cgmath/cgmray.inl
 create mode 100644 src/cgmath/cgmvec3.inl
 create mode 100644 src/cgmath/cgmvec4.inl
 create mode 100644 src/colormgr.c
 create mode 100644 src/colormgr.h
 create mode 100644 src/dynarr.c
 create mode 100644 src/dynarr.h
 create mode 100644 src/gfxutil.c
 create mode 100644 src/gfxutil.h
 create mode 100644 src/libc/stdint.h
 create mode 100644 src/rbtree.c
 create mode 100644 src/rbtree.h
 create mode 100644 src/util.c
 create mode 100644 src/util.h
 create mode 100644 src/vga.h
 create mode 100644 src/vga_s.asm

diff --git a/Makefile b/Makefile
index 9a8942f..ec659d8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
-csrc = $(wildcard src/*.c) $(wildcard src/kern/*.c) $(wildcard src/libc/*.c)
+csrc = $(wildcard src/*.c) $(wildcard src/kern/*.c) $(wildcard src/libc/*.c) \
+	   $(wildcard src/3dgfx/*.c)
 ssrc = $(wildcard src/*.asm) $(wildcard src/kern/*.asm) $(wildcard src/libc/*.asm)
 obj = $(csrc:.c=.o) $(ssrc:.asm=.o)
 dep = $(csrc:.c=.d)
@@ -13,7 +14,7 @@ AS = nasm
 ASFLAGS = -Isrc/ -Isrc/kern/
 CFLAGS = -m32 -march=i386 $(warn) $(opt) $(dbg) -fno-pic -ffreestanding \
 		 -fno-stack-protector -mpreferred-stack-boundary=2 -nostdinc -ffast-math \
-		 -fno-asynchronous-unwind-tables $(inc) $(def) -MMD
+		 -fno-asynchronous-unwind-tables -fno-strict-aliasing $(inc) $(def) -MMD
 LDFLAGS = -m elf_i386 -nostdlib -T com32.ld -Map game.map
 
 $(bin): $(obj)
diff --git a/src/3dgfx/3dgfx.c b/src/3dgfx/3dgfx.c
new file mode 100644
index 0000000..6df2bbe
--- /dev/null
+++ b/src/3dgfx/3dgfx.c
@@ -0,0 +1,895 @@
+#ifndef BUILD_OPENGL
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <alloca.h>
+#include "3dgfx.h"
+#include "polyfill.h"
+#include "polyclip.h"
+#include "inttypes.h"
+#include "gfxutil.h"
+#include "util.h"
+#include "colormgr.h"
+
+
+#undef CORRECT_NORMAL_MATRIX
+#ifdef CORRECT_NORMAL_MATRIX
+#include <cgmath/cgmath.h>
+#endif
+
+#define ENABLE_ZBUFFER
+
+#define STACK_SIZE	16
+typedef float g3d_matrix[16];
+
+#define MAX_LIGHTS		4
+
+#define IMM_VBUF_SIZE	256
+
+#define NORMALIZE(v)	/* scale the 3 floats at v to unit length, in place */ \
+	do { \
+		float len = sqrt((v)[0] * (v)[0] + (v)[1] * (v)[1] + (v)[2] * (v)[2]); \
+		if(len != 0.0) {	/* zero-length vectors are left untouched */ \
+			float s = 1.0 / len; \
+			(v)[0] *= s; \
+			(v)[1] *= s; \
+			(v)[2] *= s; \
+		} \
+	} while(0)
+
+enum {LT_POS, LT_DIR};	/* light types: positional or directional */
+struct light {
+	int type;	/* LT_POS or LT_DIR */
+	float x, y, z;	/* position, or normalized direction; handed to helpers as a float array via &x */
+	float energy;	/* intensity factor multiplied into each lighting term by shade() */
+};
+
+struct material {
+	float kd, ks;	/* diffuse and specular factors used by shade() */
+	float shin;	/* exponent passed to pow() for the specular term */
+};
+
+struct g3d_state {
+	unsigned int opt;	/* G3D_* option bits (g3d_enable/g3d_disable/g3d_setopt) */
+	int frontface;	/* value given to g3d_front_face */
+	int polymode;	/* base fill mode passed to polyfill() */
+
+	g3d_matrix mat[G3D_NUM_MATRICES][STACK_SIZE];	/* one matrix stack per matrix mode */
+	int mtop[G3D_NUM_MATRICES];	/* index of the current top of each stack */
+	int mmode;	/* stack selected by g3d_matrix_mode */
+
+	g3d_matrix norm_mat;	/* matrix applied to normals and light directions */
+
+	float ambient;
+	struct light lt[MAX_LIGHTS];
+	struct material mtl;
+
+	int width, height;
+	g3d_pixel *pixels;
+
+	int vport[4];	/* x, y, width, height (see g3d_viewport) */
+
+	g3d_pixel clear_color;
+	uint32_t clear_depth;	/* clamped to [0, 0xffffff] by g3d_clear_depth */
+
+	/* immediate mode */
+	int imm_prim;
+	int imm_numv, imm_pcount;	/* vertices buffered / vertices still missing from the current primitive */
+	struct g3d_vertex imm_curv;	/* attributes copied into every new g3d_vertex */
+	struct g3d_vertex imm_vbuf[IMM_VBUF_SIZE];
+};
+
+static void calc_grad(struct g3d_vertex *v);
+
+static void imm_flush(void);
+static __inline void xform4_vec3(const float *mat, float *vec);
+static __inline void xform3_vec3(const float *mat, float *vec);
+static void shade(struct g3d_vertex *v);
+
+static struct g3d_state *st;
+static const float idmat[] = {
+	1, 0, 0, 0,
+	0, 1, 0, 0,
+	0, 0, 1, 0,
+	0, 0, 0, 1
+};
+
+int g3d_init(void)
+{
+	st = calloc(1, sizeof *st);	/* zero-filled rendering context */
+	if(st == 0) {
+		fprintf(stderr, "failed to allocate G3D context\n");
+		return -1;
+	}
+	g3d_reset();	/* install the default state */
+	return 0;
+}
+
+void g3d_destroy(void)
+{
+	free(st);	/* the rendering context allocated by g3d_init */
+#ifdef ENABLE_ZBUFFER
+	free(pfill_zbuf);	/* depth buffer allocated by g3d_framebuffer */
+#endif
+}
+
+void g3d_reset(void)
+{
+	int i;
+
+	/* The depth buffer is deliberately left alone: it is sized and tracked
+	 * by g3d_framebuffer (not by *st), and freeing it here would leave
+	 * pfill_zbuf dangling for g3d_clear and for the free() in g3d_destroy. */
+	memset(st, 0, sizeof *st);
+
+	st->opt = G3D_CLIP_FRUSTUM;
+	st->polymode = POLYFILL_FLAT;
+
+	for(i=0; i<G3D_NUM_MATRICES; i++) {	/* note: leaves the matrix mode on the last stack */
+		g3d_matrix_mode(i);
+		g3d_load_identity();
+	}
+
+	for(i=0; i<MAX_LIGHTS; i++) {
+		g3d_light_dir(i, 0, 0, 1);
+		g3d_light_energy(i, 1);
+	}
+	g3d_light_ambient(0.1);
+
+	g3d_mtl_diffuse(1);
+
+	st->clear_depth = 0xffffff;
+}
+
+void g3d_framebuffer(int width, int height, void *pixels)
+{
+	static int max_height;
+
+#ifdef ENABLE_ZBUFFER
+	static int max_npixels;
+	int npixels = width * height;
+	if(!pfill_zbuf || npixels > max_npixels) {	/* z-buffer missing or too small */
+		free(pfill_zbuf);
+		if(!(pfill_zbuf = malloc(npixels * sizeof *pfill_zbuf))) {
+			fprintf(stderr, "failed to allocate z-buffer\n");
+		}
+		max_npixels = pfill_zbuf ? npixels : 0;	/* nothing is allocated after a failure */
+	}
+#endif
+	if(height > max_height) {	/* polyfill is only told about the height when it grows */
+		polyfill_fbheight(height);
+		max_height = height;
+	}
+
+	st->width = width;
+	st->height = height;
+
+	pfill_fb.pixels = pixels;
+	pfill_fb.width = width;
+	pfill_fb.height = height;
+
+	g3d_viewport(0, 0, width, height);	/* viewport defaults to the whole framebuffer */
+}
+
+/* point rendering at another pixel buffer; size, z-buffer and viewport are left untouched */
+void g3d_framebuffer_addr(void *pixels)
+{
+	pfill_fb.pixels = pixels;
+}
+
+void g3d_viewport(int x, int y, int w, int h)
+{
+	/* vport holds {x, y, width, height}; read by the viewport transform */
+	int *vp = st->vport;
+	vp[0] = x; vp[1] = y;
+	vp[2] = w; vp[3] = h;
+}
+
+void g3d_clear_color(unsigned char r, unsigned char g, unsigned char b)
+{
+	st->clear_color = find_color(r, g, b);	/* store the pixel value find_color() yields for this RGB triplet */
+}
+
+void g3d_clear_depth(float z)
+{
+	int depth = (int)(z * (float)0xffffff);	/* scale to the 24-bit depth range */
+	if(depth > 0xffffff) depth = 0xffffff;
+	else if(depth < 0) depth = 0;
+	st->clear_depth = depth;
+}
+
+void g3d_clear(unsigned int mask)
+{
+	if(mask & G3D_COLOR_BUFFER_BIT) {	/* fill the whole framebuffer with the clear color */
+		memset16(pfill_fb.pixels, st->clear_color, pfill_fb.width * pfill_fb.height);
+	}
+	if(mask & G3D_DEPTH_BUFFER_BIT) {	/* count is the z-buffer size in bytes / 2; presumably memset16 counts 16-bit units - confirm */
+		memset16(pfill_zbuf, st->clear_depth, pfill_fb.width * pfill_fb.height * sizeof *pfill_zbuf / 2);
+	}
+}
+
+void g3d_enable(unsigned int opt)
+{
+	st->opt |= opt;	/* opt may combine several G3D_* option bits */
+}
+
+void g3d_disable(unsigned int opt)
+{
+	st->opt &= ~opt;	/* clear every option bit present in opt */
+}
+
+void g3d_setopt(unsigned int opt, unsigned int mask)
+{
+	st->opt = (st->opt & ~mask) | (opt & mask);	/* only the bits selected by mask change */
+}
+
+unsigned int g3d_getopt(unsigned int mask)
+{
+	return st->opt & mask;	/* the enabled subset of the bits in mask */
+}
+
+void g3d_front_face(unsigned int order)
+{
+	st->frontface = order;	/* in this file only read by the culling code, which is currently under #if 0 */
+}
+
+void g3d_polygon_mode(int pmode)
+{
+	st->polymode = pmode;	/* becomes the base fill mode handed to polyfill() */
+}
+
+int g3d_get_polygon_mode(void)
+{
+	return st->polymode;	/* last value set by g3d_polygon_mode or g3d_reset */
+}
+
+void g3d_matrix_mode(int mmode)
+{
+	st->mmode = mmode;	/* not range-checked here; used as an index below G3D_NUM_MATRICES */
+}
+
+void g3d_load_identity(void)
+{
+	float *dest = st->mat[st->mmode][st->mtop[st->mmode]];	/* top of the active stack */
+	memcpy(dest, idmat, sizeof idmat);
+}
+
+void g3d_load_matrix(const float *m)
+{
+	float *dest = st->mat[st->mmode][st->mtop[st->mmode]];	/* top of the active stack */
+	memcpy(dest, m, sizeof(g3d_matrix));	/* m must provide 16 floats */
+}
+
+#define M(i,j)	(((i) << 2) + (j))
+void g3d_mult_matrix(const float *m2)
+{
+	int c, r;
+	float lhs[16];
+	float *out = st->mat[st->mmode][st->mtop[st->mmode]];
+
+	memcpy(lhs, out, sizeof lhs);	/* the product overwrites the top of the active stack */
+
+	for(c=0; c<4; c++) {
+		for(r=0; r<4; r++) {
+			out[M(c,r)] = lhs[M(0,r)] * m2[M(c,0)] +
+				lhs[M(1,r)] * m2[M(c,1)] +
+				lhs[M(2,r)] * m2[M(c,2)] +
+				lhs[M(3,r)] * m2[M(c,3)];
+		}
+	}
+}
+
+void g3d_push_matrix(void)
+{
+	int top = st->mtop[st->mmode];
+	if(top >= STACK_SIZE - 1) {	/* slot top + 1 must exist: valid slots are 0 .. STACK_SIZE - 1 */
+		fprintf(stderr, "g3d_push_matrix overflow\n");
+		return;
+	}
+	memcpy(st->mat[st->mmode][top + 1], st->mat[st->mmode][top], 16 * sizeof(float));	/* duplicate the current top */
+	st->mtop[st->mmode] = top + 1;
+}
+
+void g3d_pop_matrix(void)
+{
+	if(st->mtop[st->mmode] > 0) {
+		st->mtop[st->mmode]--;	/* the matrix below becomes the active one */
+	} else {
+		fprintf(stderr, "g3d_pop_matrix underflow\n");
+	}
+}
+
+void g3d_translate(float x, float y, float z)
+{
+	float xlate[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
+	xlate[14] = z;	/* elements 12..14 carry the translation */
+	xlate[13] = y;
+	xlate[12] = x;
+	g3d_mult_matrix(xlate);
+}
+
+void g3d_rotate(float deg, float x, float y, float z)
+{
+	float rot[16] = {0};
+	/* (x, y, z) is used exactly as given; presumably callers pass a unit-length axis */
+	float rad = M_PI * deg / 180.0f;
+	float sn = sin(rad);
+	float cs = cos(rad);
+	float omc = 1.0f - cs;
+	float xx = x * x;
+	float yy = y * y;
+	float zz = z * z;
+
+	rot[0] = xx + (1.0f - xx) * cs;
+	rot[1] = x * y * omc + z * sn;
+	rot[2] = x * z * omc - y * sn;
+	rot[4] = x * y * omc - z * sn;
+	rot[5] = yy + (1.0 - yy) * cs;
+	rot[6] = y * z * omc + x * sn;
+	rot[8] = x * z * omc + y * sn;
+	rot[9] = y * z * omc - x * sn;
+	rot[10] = zz + (1.0 - zz) * cs;
+	rot[15] = 1.0f;
+
+	g3d_mult_matrix(rot);
+}
+
+void g3d_scale(float x, float y, float z)
+{
+	float sm[16] = {0};	/* becomes diag(x, y, z, 1) */
+	sm[15] = 1.0f;
+	sm[10] = z;
+	sm[5] = y;
+	sm[0] = x;
+	g3d_mult_matrix(sm);
+}
+
+void g3d_ortho(float left, float right, float bottom, float top, float znear, float zfar)
+{
+	float om[16] = {0};
+
+	float xrange = right - left;
+	float yrange = top - bottom;
+	float zrange = zfar - znear;
+
+	/* scale on the diagonal, then the translation in elements 12..14 */
+	om[0] = 2.0 / xrange;
+	om[5] = 2.0 / yrange;
+	om[10] = -2.0 / zrange;
+	om[15] = 1.0f;
+	om[12] = -(right + left) / xrange;
+	om[13] = -(top + bottom) / yrange;
+	om[14] = -(zfar + znear) / zrange;
+	g3d_mult_matrix(om);
+}
+
+void g3d_frustum(float left, float right, float bottom, float top, float nr, float fr)
+{
+	float fm[16] = {0};
+
+	float xrange = right - left;
+	float yrange = top - bottom;
+	float zrange = fr - nr;
+
+	/* x/y scale by the near distance */
+	fm[0] = 2.0 * nr / xrange;
+	fm[5] = 2.0 * nr / yrange;
+
+	/* elements 8..11: off-centre shift, depth scale, and the -1 that makes
+	 * the transformed w equal to -z */
+	fm[8] = (right + left) / xrange;
+	fm[9] = (top + bottom) / yrange;
+	fm[10] = -(fr + nr) / zrange;
+	fm[11] = -1.0f;
+
+	/* depth offset */
+	fm[14] = -2.0 * fr * nr / zrange;
+	g3d_mult_matrix(fm);
+}
+
+void g3d_perspective(float vfov_deg, float aspect, float znear, float zfar)
+{
+	float pm[16] = {0};
+
+	float fov_rad = M_PI * vfov_deg / 180.0f;
+	float cot = 1.0f / tan(fov_rad * 0.5f);	/* cotangent of half the vertical field of view */
+	float neg_depth = znear - zfar;
+
+	pm[0] = cot / aspect;
+	pm[5] = cot;
+	pm[10] = (znear + zfar) / neg_depth;
+	pm[11] = -1.0f;
+	pm[14] = 2.0f * znear * zfar / neg_depth;
+
+	g3d_mult_matrix(pm);
+}
+
+const float *g3d_get_matrix(int which, float *m)
+{
+	const float *cur = st->mat[which][st->mtop[which]];	/* top of the requested stack */
+
+	if(m != 0) {
+		memcpy(m, cur, sizeof(g3d_matrix));	/* optional copy-out of all 16 floats */
+	}
+	return cur;
+}
+
+void g3d_light_pos(int idx, float x, float y, float z)
+{
+	/* 4-float scratch: xform4_vec3 also stores w in vec[3], which would land on lt[idx].energy */
+	float pos[4];
+	pos[0] = x; pos[1] = y; pos[2] = z;
+	xform4_vec3(st->mat[G3D_MODELVIEW][st->mtop[G3D_MODELVIEW]], pos);	/* to view space */
+	st->lt[idx].type = LT_POS;
+	st->lt[idx].x = pos[0];
+	st->lt[idx].y = pos[1];
+	st->lt[idx].z = pos[2];
+}
+
+void g3d_light_dir(int idx, float x, float y, float z)
+{
+	int mvtop = st->mtop[G3D_MODELVIEW];
+
+	st->lt[idx].type = LT_DIR;
+	st->lt[idx].x = x;
+	st->lt[idx].y = y;
+	st->lt[idx].z = z;
+
+	/* calc the normal matrix (this overwrites the shared st->norm_mat) */
+#ifdef CORRECT_NORMAL_MATRIX
+	memcpy(st->norm_mat, st->mat[G3D_MODELVIEW][mvtop], 16 * sizeof(float));
+	cgm_minverse(st->norm_mat);
+	cgm_mtranspose(st->norm_mat);
+#else
+	memcpy(st->norm_mat, st->mat[G3D_MODELVIEW][mvtop], 16 * sizeof(float));
+	st->norm_mat[12] = st->norm_mat[13] = st->norm_mat[14] = 0.0f;
+#endif
+	/* 3x3 transform only: a direction takes no translation, and the 4x4
+	 * helper would also store w in vec[3], i.e. on top of lt[idx].energy */
+	xform3_vec3(st->norm_mat, &st->lt[idx].x);
+	NORMALIZE(&st->lt[idx].x);
+}
+
+void g3d_light_energy(int idx, float val)
+{
+	st->lt[idx].energy = val;	/* idx is not range-checked; must be below MAX_LIGHTS */
+}
+
+void g3d_light_ambient(float val)
+{
+	st->ambient = val;	/* base term of shade(), multiplied by the material's kd */
+}
+
+void g3d_mtl_diffuse(float diffuse)
+{
+	st->mtl.kd = diffuse;	/* scales the ambient and N.L terms in shade() */
+}
+
+void g3d_mtl_specular(float spec)
+{
+	st->mtl.ks = spec;	/* scales the specular term; only used while G3D_SPECULAR is enabled */
+}
+
+void g3d_mtl_shininess(float shin)
+{
+	st->mtl.shin = shin;	/* exponent passed to pow() for the specular term */
+}
+
+static inline int calc_shift(unsigned int x)
+{
+	/* index of the highest set bit (log2 for powers of two); -1 when x is 0 */
+	int bit;
+	for(bit = -1; x != 0; x >>= 1) {
+		bit++;
+	}
+	return bit;
+}
+
+static inline int calc_mask(unsigned int x)
+{
+	return (int)(x - 1u);	/* all ones below x when x is a power of two */
+}
+
+void g3d_set_texture(int xsz, int ysz, void *pixels)
+{
+	/* a shift/mask pair is derived from each texture dimension */
+	pfill_tex.width = xsz;
+	pfill_tex.xshift = calc_shift(xsz);
+	pfill_tex.xmask = calc_mask(xsz);
+	pfill_tex.height = ysz;
+	pfill_tex.yshift = calc_shift(ysz);
+	pfill_tex.ymask = calc_mask(ysz);
+	pfill_tex.pixels = pixels;
+}
+
+void g3d_draw(int prim, const struct g3d_vertex *varr, int varr_size)
+{
+	g3d_draw_indexed(prim, varr, varr_size, 0, 0);	/* null index array: vertices are consumed in order */
+}
+
+#define NEED_NORMALS	(st->opt & (G3D_LIGHTING | G3D_TEXTURE_GEN))
+
+void g3d_draw_indexed(int prim, const struct g3d_vertex *varr, int varr_size,
+		const uint16_t *iarr, int iarr_size)
+{
+	int i, j, vnum, nfaces, fill_mode, num_tri;
+	struct pvertex pv[16], *pvtri;	/* fixed-point copies handed to the rasterizer */
+	struct g3d_vertex v[16], *vtri;	/* working copy of the current primitive */
+	int mvtop = st->mtop[G3D_MODELVIEW];
+	int ptop = st->mtop[G3D_PROJECTION];
+	struct g3d_vertex *tmpv;
+
+	tmpv = alloca(prim * 6 * sizeof *tmpv);	/* scratch input copy for clip_frustum */
+
+	/* calc the normal matrix */
+	if(NEED_NORMALS) {
+#ifdef CORRECT_NORMAL_MATRIX
+		memcpy(st->norm_mat, st->mat[G3D_MODELVIEW][mvtop], 16 * sizeof(float));
+		cgm_minverse(st->norm_mat);
+		cgm_mtranspose(st->norm_mat);
+#else
+		memcpy(st->norm_mat, st->mat[G3D_MODELVIEW][mvtop], 16 * sizeof(float));
+		st->norm_mat[12] = st->norm_mat[13] = st->norm_mat[14] = 0.0f;
+#endif
+	}
+
+	nfaces = (iarr ? iarr_size : varr_size) / prim;	/* prim is also the vertex count per primitive */
+
+	for(j=0; j<nfaces; j++) {
+		vnum = prim;	/* reset vnum for each iteration */
+
+		for(i=0; i<vnum; i++) {
+			v[i] = iarr ? varr[*iarr++] : *varr++;
+
+			xform4_vec3(st->mat[G3D_MODELVIEW][mvtop], &v[i].x);
+
+			if(NEED_NORMALS) {
+				xform3_vec3(st->norm_mat, &v[i].nx);
+				if(st->opt & G3D_LIGHTING) {
+					shade(v + i);
+				}
+				if(st->opt & G3D_TEXTURE_GEN) {	/* texcoords derived from the transformed normal */
+					v[i].u = v[i].nx * 0.5 + 0.5;
+					v[i].v = 0.5 - v[i].ny * 0.5;
+				}
+			}
+			if(st->opt & G3D_TEXTURE_MAT) {	/* texture matrix applied to (u, v, 0, 1) */
+				float *mat = st->mat[G3D_TEXTURE][st->mtop[G3D_TEXTURE]];
+				float x = mat[0] * v[i].u + mat[4] * v[i].v + mat[12];
+				float y = mat[1] * v[i].u + mat[5] * v[i].v + mat[13];
+				float w = mat[3] * v[i].u + mat[7] * v[i].v + mat[15];
+				v[i].u = x / w;
+				v[i].v = y / w;
+			}
+			xform4_vec3(st->mat[G3D_PROJECTION][ptop], &v[i].x);
+		}
+
+		/* clipping */
+		if(st->opt & G3D_CLIP_FRUSTUM) {
+			for(i=0; i<6; i++) {	/* one pass per frustum plane */
+				memcpy(tmpv, v, vnum * sizeof *v);
+
+				if(clip_frustum(v, &vnum, tmpv, vnum, i) < 0) {
+					/* polygon completely outside of view volume. discard */
+					vnum = 0;
+					break;
+				}
+			}
+
+			if(!vnum) continue;
+		}
+
+		for(i=0; i<vnum; i++) {
+			if(v[i].w != 0.0f) {	/* perspective divide */
+				v[i].x /= v[i].w;
+				v[i].y /= v[i].w;
+#ifdef ENABLE_ZBUFFER
+				if(st->opt & G3D_DEPTH_TEST) {
+					v[i].z /= v[i].w;
+				}
+#endif
+			}
+
+			/* viewport transformation */
+			v[i].x = (v[i].x * 0.5f + 0.5f) * (float)st->vport[2] + st->vport[0];
+			v[i].y = (0.5f - v[i].y * 0.5f) * (float)st->vport[3] + st->vport[1];
+
+			/* convert pos to 24.8 fixed point */
+			pv[i].x = cround64(v[i].x * 256.0f);
+			pv[i].y = cround64(v[i].y * 256.0f);
+#ifdef ENABLE_ZBUFFER
+			if(st->opt & G3D_DEPTH_TEST) {
+				/* after div/w z is in [-1, 1], remap it to [0, 0xffffff] */
+				pv[i].z = cround64(v[i].z * 8388607.5f + 8388607.5f);
+			}
+#endif
+			/* convert tex coords to 16.16 fixed point */
+			pv[i].u = cround64(v[i].u * 65536.0f);
+			pv[i].v = cround64(v[i].v * 65536.0f);
+			/* pass the color through as is */
+			pv[i].l = v[i].l;
+		}
+
+		/* backface culling */
+#if 0	/* TODO fix culling */
+		if(vnum > 2 && st->opt & G3D_CULL_FACE) {
+			int32_t ax = pv[1].x - pv[0].x;
+			int32_t ay = pv[1].y - pv[0].y;
+			int32_t bx = pv[2].x - pv[0].x;
+			int32_t by = pv[2].y - pv[0].y;
+			int32_t cross_z = (ax >> 4) * (by >> 4) - (ay >> 4) * (bx >> 4);
+			int sign = (cross_z >> 31) & 1;
+
+			if(!(sign ^ st->frontface)) {
+				continue;	/* back-facing */
+			}
+		}
+#endif
+
+		switch(vnum) {
+		case 1:	/* points: drawing is currently disabled (code below is commented out) */
+			/*
+			if(st->opt & (G3D_ALPHA_BLEND | G3D_ADD_BLEND)) {
+				int r, g, b, inv_alpha;
+				g3d_pixel *dest = pfill_fb.pixels + (pv[0].y >> 8) * st->width + (pv[0].x >> 8);
+				if(st->opt & G3D_ALPHA_BLEND) {
+					inv_alpha = 255 - pv[0].a;
+					r = ((int)pv[0].r * pv[0].a + G3D_UNPACK_R(*dest) * inv_alpha) >> 8;
+					g = ((int)pv[0].g * pv[0].a + G3D_UNPACK_G(*dest) * inv_alpha) >> 8;
+					b = ((int)pv[0].b * pv[0].a + G3D_UNPACK_B(*dest) * inv_alpha) >> 8;
+				} else {
+					r = (int)pv[0].r + G3D_UNPACK_R(*dest);
+					g = (int)pv[0].g + G3D_UNPACK_G(*dest);
+					b = (int)pv[0].b + G3D_UNPACK_B(*dest);
+					if(r > 255) r = 255;
+					if(g > 255) g = 255;
+					if(b > 255) b = 255;
+				}
+				*dest++ = G3D_PACK_RGB(r, g, b);
+			} else {
+				g3d_pixel *dest = pfill_fb.pixels + (pv[0].y >> 8) * st->width + (pv[0].x >> 8);
+				*dest = G3D_PACK_RGB(pv[0].r, pv[0].g, pv[0].b);
+			}
+			*/
+			break;
+
+		case 2:	/* lines: drawn with the first vertex's color, at integer pixel positions */
+			{
+				g3d_pixel col = pv[0].l;
+				draw_line(pv[0].x >> 8, pv[0].y >> 8, pv[1].x >> 8, pv[1].y >> 8, col);
+			}
+			break;
+
+		default:
+			fill_mode = st->polymode;
+			if(st->opt & G3D_TEXTURE_2D) {
+				fill_mode |= POLYFILL_TEX_BIT;
+			}
+			/*
+			if(st->opt & G3D_ALPHA_BLEND) {
+				fill_mode |= POLYFILL_ALPHA_BIT;
+			} else if(st->opt & G3D_ADD_BLEND) {
+				fill_mode |= POLYFILL_ADD_BIT;
+			}
+			*/
+#ifdef ENABLE_ZBUFFER
+			if(st->opt & G3D_DEPTH_TEST) {
+				fill_mode |= POLYFILL_ZBUF_BIT;
+			}
+#endif
+			num_tri = vnum - 2;
+			vtri = v;
+			pvtri = pv;
+			for(;;) {	/* triangle fan: (0,1,2), (0,2,3), ... */
+				calc_grad(vtri);
+				polyfill(fill_mode, pvtri);
+				if(--num_tri == 0) break;
+				vtri[1] = vtri[0];	/* carry vertex 0 forward so the next window starts with it */
+				pvtri[1] = pvtri[0];
+				vtri++;
+				pvtri++;
+			}
+		}
+	}
+}
+
+#define ATTR_DELTAS(attr) \
+	float d##attr##02 = v[0].attr - v[2].attr; \
+	float d##attr##12 = v[1].attr - v[2].attr
+
+#define DFDX(attr)	\
+	(dx ? (d##attr##12 * dy02 - d##attr##02 * dy12) / dx : 0)
+#define DFDY(attr)	\
+	(dy ? (d##attr##12 * dx02 - d##attr##02 * dx12) / dy : 0)
+
+static void calc_grad(struct g3d_vertex *v)
+{
+	/*float dx01 = v[0].x - v[1].x;*/
+	float dx02 = v[0].x - v[2].x;
+	float dx12 = v[1].x - v[2].x;
+	/*float dy01 = v[0].y - v[1].y;*/
+	float dy02 = v[0].y - v[2].y;
+	float dy12 = v[1].y - v[2].y;
+
+	float dx = dx12 * dy02 - dx02 * dy12;	/* common denominator of the x gradients */
+	float dy = dx02 * dy12 - dx12 * dy02;	/* equals -dx; zero for a degenerate triangle */
+
+	if(st->polymode == POLYFILL_GOURAUD) {	/* results go to the shared pgrad, scaled to fixed point */
+		ATTR_DELTAS(l);
+		pgrad.dldx = cround64(DFDX(l) * 4096.0f);
+		pgrad.dldy = cround64(DFDY(l) * 4096.0f);
+		if(st->opt & G3D_ALPHA_BLEND) {
+			ATTR_DELTAS(a);
+			pgrad.dadx = cround64(DFDX(a) * 4096.0f);
+			pgrad.dady = cround64(DFDY(a) * 4096.0f);
+		}
+	}
+
+	if(st->opt & G3D_DEPTH_TEST) {	/* same scale as the z remap in g3d_draw_indexed */
+		ATTR_DELTAS(z);
+		pgrad.dzdx = cround64(DFDX(z) * 8388607.5f);
+		pgrad.dzdy = cround64(DFDY(z) * 8388607.5f);
+	}
+
+	if(st->opt & G3D_TEXTURE_2D) {	/* 65536 matches the 16.16 texcoord conversion */
+		ATTR_DELTAS(u);
+		ATTR_DELTAS(v);
+		pgrad.dudx = cround64(DFDX(u) * 65536.0f);
+		pgrad.dudy = cround64(DFDY(u) * 65536.0f);
+		pgrad.dvdx = cround64(DFDX(v) * 65536.0f);
+		pgrad.dvdy = cround64(DFDY(v) * 65536.0f);
+	}
+}
+
+void g3d_begin(int prim)
+{
+	st->imm_numv = 0;	/* start with an empty vertex buffer */
+	st->imm_pcount = prim;	/* vertices still missing from the current primitive */
+	st->imm_prim = prim;
+}
+
+void g3d_end(void)
+{
+	imm_flush();	/* draw whatever is still buffered */
+}
+
+static void imm_flush(void)
+{
+	int count = st->imm_numv;
+	st->imm_numv = 0;	/* the buffer is marked empty before drawing */
+	g3d_draw(st->imm_prim, st->imm_vbuf, count);
+}
+
+void g3d_vertex(float x, float y, float z)
+{
+	struct g3d_vertex *slot = &st->imm_vbuf[st->imm_numv];
+	st->imm_numv++;
+	*slot = st->imm_curv;	/* inherit the current normal, color and texcoords */
+	slot->x = x;
+	slot->y = y;
+	slot->z = z;
+	slot->w = 1.0f;
+
+	if(--st->imm_pcount != 0) return;	/* primitive still incomplete */
+	if(IMM_VBUF_SIZE - st->imm_prim <= st->imm_numv) {	/* no room left for another whole primitive */
+		imm_flush();
+	}
+	st->imm_pcount = st->imm_prim;
+}
+
+void g3d_normal(float x, float y, float z)
+{
+	st->imm_curv.nx = x;	/* latched in imm_curv; copied into each following g3d_vertex */
+	st->imm_curv.ny = y;
+	st->imm_curv.nz = z;
+}
+
+#define CLAMP(x, a, b)	((x) < (a) ? (a) : ((x) > (b) ? (b) : (x)))
+#define MIN(a, b)		((a) < (b) ? (a) : (b))
+
+void g3d_color1b(unsigned char lum)
+{
+	st->imm_curv.a = 255;	/* alpha set to its maximum */
+	st->imm_curv.l = MIN(lum, 255);
+}
+
+void g3d_color2b(unsigned char lum, unsigned char a)
+{
+	st->imm_curv.a = MIN(a, 255);	/* luminance and alpha for the following vertices */
+	st->imm_curv.l = MIN(lum, 255);
+}
+
+void g3d_color1f(float lum)
+{
+	int level = lum * 255.0f;	/* truncated, then clamped to 0..255 */
+	st->imm_curv.a = 255;
+	st->imm_curv.l = CLAMP(level, 0, 255);
+}
+
+void g3d_color2f(float lum, float a)
+{
+	int alpha = a * 255.0f;	/* both are truncated, then clamped to 0..255 */
+	int level = lum * 255.0f;
+	st->imm_curv.a = CLAMP(alpha, 0, 255);
+	st->imm_curv.l = CLAMP(level, 0, 255);
+}
+
+void g3d_texcoord(float u, float v)
+{
+	st->imm_curv.u = u;	/* latched in imm_curv; copied into each following g3d_vertex */
+	st->imm_curv.v = v;
+}
+
+static __inline void xform4_vec3(const float *mat, float *vec)
+{
+	/* reads x,y,z from vec (w taken as 1) and writes x,y,z,w: vec needs 4 floats */
+	const float vx = vec[0], vy = vec[1], vz = vec[2];
+
+	vec[0] = mat[0] * vx + mat[4] * vy + mat[8] * vz + mat[12];
+	vec[1] = mat[1] * vx + mat[5] * vy + mat[9] * vz + mat[13];
+	vec[2] = mat[2] * vx + mat[6] * vy + mat[10] * vz + mat[14];
+	vec[3] = mat[3] * vx + mat[7] * vy + mat[11] * vz + mat[15];
+}
+
+static __inline void xform3_vec3(const float *mat, float *vec)
+{
+	/* upper-left 3x3 only: elements 12..14 (translation) are ignored */
+	const float vx = vec[0], vy = vec[1], vz = vec[2];
+	vec[0] = mat[0] * vx + mat[4] * vy + mat[8] * vz;
+	vec[1] = mat[1] * vx + mat[5] * vy + mat[9] * vz;
+	vec[2] = mat[2] * vx + mat[6] * vy + mat[10] * vz;
+}
+
+static void shade(struct g3d_vertex *v)
+{
+	int i, level;
+	/* ambient term; every enabled light adds diffuse (and specular) on top */
+	float total = st->ambient * st->mtl.kd;
+
+	for(i=0; i<MAX_LIGHTS; i++) {
+		const struct light *lt = st->lt + i;
+		float l[3];
+		float diff, spec;
+
+		if((st->opt & (G3D_LIGHT0 << i)) == 0) {
+			continue;	/* light i is switched off */
+		}
+
+		l[0] = lt->x;
+		l[1] = lt->y;
+		l[2] = lt->z;
+
+		/* positional light: direction from the vertex towards the light */
+		if(lt->type != LT_DIR) {
+			l[0] -= v->x;
+			l[1] -= v->y;
+			l[2] -= v->z;
+			NORMALIZE(l);
+		}
+
+		diff = v->nx * l[0] + v->ny * l[1] + v->nz * l[2];
+		if(diff < 0.0f) {
+			diff = 0.0f;
+		}
+		total += st->mtl.kd * lt->energy * diff;
+
+		if(!(st->opt & G3D_SPECULAR)) {
+			continue;
+		}
+		l[2] += 1.0f;	/* half-vector between l and +Z once normalized */
+		NORMALIZE(l);
+		spec = v->nx * l[0] + v->ny * l[1] + v->nz * l[2];
+		if(spec < 0.0f) {
+			spec = 0.0f;
+		}
+		spec = pow(spec, st->mtl.shin);
+		total += st->mtl.ks * lt->energy * spec;
+	}
+
+	level = cround64(total * 255.0);
+	v->l = level > 255 ? 255 : level;	/* only the upper bound is clamped */
+}
+
+#endif	/* !def BUILD_OPENGL */
diff --git a/src/3dgfx/3dgfx.h b/src/3dgfx/3dgfx.h
new file mode 100644
index 0000000..7bed815
--- /dev/null
+++ b/src/3dgfx/3dgfx.h
@@ -0,0 +1,140 @@
+#ifndef THREEDGFX_H_
+#define THREEDGFX_H_
+
+#include <inttypes.h>
+
+typedef unsigned char g3d_pixel;
+
+
+struct g3d_vertex {
+	float x, y, z, w;	/* position; kept contiguous, transformed in place as a float[4] */
+	float nx, ny, nz;	/* normal; kept contiguous, transformed in place as a float[3] */
+	float u, v;	/* texture coordinates */
+	unsigned char l, a;	/* luminance and alpha, 0..255 */
+};
+
+enum {	/* primitive types: each value is also the number of vertices per primitive */
+	G3D_POINTS = 1,
+	G3D_LINES = 2,
+	G3D_TRIANGLES = 3,
+	G3D_QUADS = 4
+};
+
+/* g3d_enable/g3d_disable bits */
+enum {
+	G3D_CULL_FACE	= 0x000001,
+	G3D_DEPTH_TEST	= 0x000002,
+	G3D_LIGHTING	= 0x000004,
+	G3D_LIGHT0		= 0x000008,
+	G3D_LIGHT1		= 0x000010,
+	G3D_LIGHT2		= 0x000020,
+	G3D_LIGHT3		= 0x000040,
+	G3D_TEXTURE_2D	= 0x000080,
+	G3D_ALPHA_BLEND	= 0x000100,
+	G3D_TEXTURE_GEN	= 0x000200,
+	G3D_CLIP_FRUSTUM = 0x000800,/* when disabled, don't clip against the frustum */
+	G3D_CLIP_PLANE0 = 0x001000,	/* user-defined 3D clipping planes XXX not impl. */
+	G3D_CLIP_PLANE1 = 0x002000,
+	G3D_CLIP_PLANE2 = 0x004000,
+	G3D_CLIP_PLANE3 = 0x008000,
+
+	G3D_TEXTURE_MAT	= 0x010000,
+	G3D_SPECULAR	= 0x020000,
+
+	G3D_ADD_BLEND	= 0x040000,
+
+	G3D_ALL = 0x7fffffff
+};
+
+/* arg to g3d_front_face */
+enum { G3D_CCW, G3D_CW };
+
+/* arg to g3d_polygon_mode */
+enum {
+	G3D_WIRE,
+	G3D_FLAT,
+	G3D_GOURAUD
+};
+
+/* matrix stacks */
+enum {
+	G3D_MODELVIEW,
+	G3D_PROJECTION,
+	G3D_TEXTURE,
+
+	G3D_NUM_MATRICES
+};
+
+/* clear bits */
+enum {
+	G3D_COLOR_BUFFER_BIT = 1,
+	G3D_DEPTH_BUFFER_BIT = 2
+};
+
+int g3d_init(void);
+void g3d_destroy(void);
+void g3d_reset(void);
+
+void g3d_framebuffer(int width, int height, void *pixels);
+void g3d_framebuffer_addr(void *pixels);
+void g3d_viewport(int x, int y, int w, int h);
+
+void g3d_clear_color(unsigned char r, unsigned char g, unsigned char b);
+void g3d_clear_depth(float z);
+void g3d_clear(unsigned int mask);
+
+void g3d_enable(unsigned int opt);
+void g3d_disable(unsigned int opt);
+void g3d_setopt(unsigned int opt, unsigned int mask);
+unsigned int g3d_getopt(unsigned int mask);
+
+void g3d_front_face(unsigned int order);
+void g3d_polygon_mode(int pmode);
+int g3d_get_polygon_mode(void);
+
+void g3d_matrix_mode(int mmode);
+
+void g3d_load_identity(void);
+void g3d_load_matrix(const float *m);
+void g3d_mult_matrix(const float *m);
+void g3d_push_matrix(void);
+void g3d_pop_matrix(void);
+
+void g3d_translate(float x, float y, float z);
+void g3d_rotate(float angle, float x, float y, float z);
+void g3d_scale(float x, float y, float z);
+void g3d_ortho(float left, float right, float bottom, float top, float znear, float zfar);
+void g3d_frustum(float left, float right, float bottom, float top, float znear, float zfar);
+void g3d_perspective(float vfov, float aspect, float znear, float zfar);
+
+/* returns pointer to the *internal* matrix, and if argument m is not null,
+ * also copies the internal matrix there. */
+const float *g3d_get_matrix(int which, float *m);
+
+void g3d_light_pos(int idx, float x, float y, float z);
+void g3d_light_dir(int idx, float x, float y, float z);
+void g3d_light_energy(int idx, float val);
+
+void g3d_light_ambient(float val);
+
+void g3d_mtl_diffuse(float diffuse);
+void g3d_mtl_specular(float spec);
+void g3d_mtl_shininess(float shin);
+
+void g3d_set_texture(int xsz, int ysz, void *pixels);
+
+void g3d_draw(int prim, const struct g3d_vertex *varr, int varr_size);
+void g3d_draw_indexed(int prim, const struct g3d_vertex *varr, int varr_size,
+		const uint16_t *iarr, int iarr_size);
+
+void g3d_begin(int prim);
+void g3d_end(void);
+void g3d_vertex(float x, float y, float z);
+void g3d_normal(float x, float y, float z);
+void g3d_color1b(unsigned char lum);
+void g3d_color2b(unsigned char lum, unsigned char a);
+void g3d_color1f(float lum);
+void g3d_color2f(float lum, float a);
+void g3d_texcoord(float u, float v);
+
+#endif	/* THREEDGFX_H_ */
diff --git a/src/3dgfx/mesh.c b/src/3dgfx/mesh.c
new file mode 100644
index 0000000..54d3539
--- /dev/null
+++ b/src/3dgfx/mesh.c
@@ -0,0 +1,601 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "mesh.h"
+#include "3dgfx.h"
+
+void init_g3dmtl(struct g3d_material *mtl)
+{
+	mtl->shin = 60.0f;	/* default specular exponent */
+	mtl->sr = mtl->sg = mtl->sb = 0.0f;	/* no specular reflection */
+	mtl->r = mtl->g = mtl->b = mtl->a = 1.0f;	/* opaque white */
+	mtl->name = 0;	/* unnamed until the caller assigns one */
+	/*mtl->texmap = mtl->envmap = 0;*/
+}
+
+int init_mesh(struct g3d_mesh *mesh, int prim, int num_verts, int num_idx)
+{
+	/* returns 0 on success, -1 if an allocation fails. every field is reset up
+	 * front, so the mesh is safe to pass to destroy_mesh even after a failure */
+	mesh->name = 0;
+	mesh->mtl = 0;
+	mesh->prim = prim;
+	mesh->varr = 0;
+	mesh->iarr = 0;
+	mesh->vcount = mesh->icount = 0;
+
+	if(num_verts > 0 && !(mesh->varr = malloc(num_verts * sizeof *mesh->varr))) {
+		return -1;
+	}
+	/* NOTE: room for num_idx * prim indices is reserved, icount is num_idx */
+	if(num_idx > 0 && !(mesh->iarr = malloc(num_idx * prim * sizeof *mesh->iarr))) {
+		free(mesh->varr);
+		mesh->varr = 0;	/* don't leave a dangling pointer behind */
+		return -1;
+	}
+	mesh->vcount = num_verts;
+	mesh->icount = num_idx;
+	return 0;
+}
+
+void free_mesh(struct g3d_mesh *mesh)
+{
+	destroy_mesh(mesh);	/* releases the name, vertex and index buffers (null-safe) */
+	free(mesh);	/* then the mesh structure itself */
+}
+
+void destroy_mesh(struct g3d_mesh *mesh)
+{
+	/* frees the buffers owned by the mesh; the struct itself is not freed */
+	if(!mesh) return;
+	free(mesh->iarr);
+	free(mesh->varr);
+	free(mesh->name);
+}
+
+int copy_mesh(struct g3d_mesh *dest, struct g3d_mesh *src)
+{
+	/* deep-copies prim and the vertex/index arrays (not name or mtl). arrays
+	 * missing from src become null in dest. returns 0, or -1 if out of memory */
+	dest->varr = 0;
+	dest->iarr = 0;
+	if(src->varr && !(dest->varr = malloc(src->vcount * sizeof *src->varr))) {
+		return -1;
+	}
+	if(src->iarr && !(dest->iarr = malloc(src->icount * sizeof *src->iarr))) {
+		free(dest->varr);
+		dest->varr = 0;
+		return -1;
+	}
+	if(src->varr) memcpy(dest->varr, src->varr, src->vcount * sizeof *src->varr);
+	if(src->iarr) memcpy(dest->iarr, src->iarr, src->icount * sizeof *src->iarr);
+	dest->prim = src->prim;
+	dest->vcount = src->vcount;
+	dest->icount = src->icount;
+	return 0;
+}
+
+static struct {
+	int prim;	/* vertices (or indices) per face */
+	struct g3d_vertex *varr;	/* vertex array that the indices refer to */
+	const float *xform;	/* matrix used to compute the sort depth */
+} zsort_cls;	/* context for the qsort comparators, filled in by zsort_mesh */
+
+static int zsort_cmp(const void *aptr, const void *bptr)
+{
+	int i;
+	float za = 0.0f;
+	float zb = 0.0f;
+	const float *m = zsort_cls.xform;
+	const struct g3d_vertex *va = (const struct g3d_vertex*)aptr;
+	const struct g3d_vertex *vb = (const struct g3d_vertex*)bptr;
+
+	/* sum the transformed Z of every vertex of each face (prim verts per face) */
+	for(i=0; i<zsort_cls.prim; i++) {
+		za += m[2] * va->x + m[6] * va->y + m[10] * va->z + m[14];
+		zb += m[2] * vb->x + m[6] * vb->y + m[10] * vb->z + m[14];
+		++va;
+		++vb;
+	}
+	/* compare directly instead of type-punning the float difference to int */
+	return (za > zb) - (za < zb);
+}
+
+static int zsort_indexed_cmp(const void *aptr, const void *bptr)
+{
+	int i;
+	float za = 0.0f;
+	float zb = 0.0f;
+	const uint16_t *a = (const uint16_t*)aptr;
+	const uint16_t *b = (const uint16_t*)bptr;
+
+	const float *m = zsort_cls.xform;
+
+	for(i=0; i<zsort_cls.prim; i++) {
+		const struct g3d_vertex *va = zsort_cls.varr + a[i];
+		const struct g3d_vertex *vb = zsort_cls.varr + b[i];
+
+		za += m[2] * va->x + m[6] * va->y + m[10] * va->z + m[14];
+		zb += m[2] * vb->x + m[6] * vb->y + m[10] * vb->z + m[14];
+	}
+
+	/* compare directly instead of type-punning the float difference to int */
+	return (za > zb) - (za < zb);
+}
+
+
+void zsort_mesh(struct g3d_mesh *m)
+{
+	/* qsort has no user-data argument, so the context goes through zsort_cls */
+	zsort_cls.prim = m->prim;
+	zsort_cls.varr = m->varr;
+	zsort_cls.xform = g3d_get_matrix(G3D_MODELVIEW, 0);
+	if(!m->iarr) {
+		/* non-indexed: every sorted element is a run of prim whole vertices */
+		qsort(m->varr, m->vcount / m->prim, m->prim * sizeof *m->varr, zsort_cmp);
+	} else {
+		/* indexed: every sorted element is a run of prim indices */
+		qsort(m->iarr, m->icount / m->prim, m->prim * sizeof *m->iarr, zsort_indexed_cmp);
+	}
+}
+
+
+void draw_mesh(struct g3d_mesh *mesh)
+{
+	struct g3d_material *mtl;
+
+	if((mtl = mesh->mtl)) {	/* material is optional */
+		g3d_mtl_diffuse(mtl->r);	/* only r and sr are passed on, g/b/a are unused here */
+		g3d_mtl_specular(mtl->sr);
+		g3d_mtl_shininess(mtl->shin);
+
+		/*if(mtl->texmap) {
+			g3d_enable(G3D_TEXTURE_2D);
+			g3d_set_texture(mtl->texmap->width, mtl->texmap->height, mtl->texmap->pixels);
+		}*/
+	}
+
+	if(mesh->iarr) {	/* indexed meshes are drawn through their index array */
+		g3d_draw_indexed(mesh->prim, mesh->varr, mesh->vcount, mesh->iarr, mesh->icount);
+	} else {
+		g3d_draw(mesh->prim, mesh->varr, mesh->vcount);
+	}
+
+	if(mtl) {	/* currently a no-op: the texture teardown below is disabled */
+		/*if(mtl->texmap) {
+			g3d_disable(G3D_TEXTURE_2D);
+		}*/
+	}
+}
+
+void apply_mesh_xform(struct g3d_mesh *mesh, const float *xform)
+{
+	int i;
+	float px, py, pz, nx, ny, nz;
+	struct g3d_vertex *v;
+
+	/* xform is indexed as a column-major 4x4 matrix; positions get the full
+	 * affine transform, normals only the upper-left 3x3 (no translation) */
+	for(i=0, v=mesh->varr; i<mesh->vcount; i++, v++) {
+		px = v->x; py = v->y; pz = v->z;
+		nx = v->nx; ny = v->ny; nz = v->nz;
+		v->x = xform[0] * px + xform[4] * py + xform[8] * pz + xform[12];
+		v->y = xform[1] * px + xform[5] * py + xform[9] * pz + xform[13];
+		v->z = xform[2] * px + xform[6] * py + xform[10] * pz + xform[14];
+		v->nx = xform[0] * nx + xform[4] * ny + xform[8] * nz;
+		v->ny = xform[1] * nx + xform[5] * ny + xform[9] * nz;
+		v->nz = xform[2] * nx + xform[6] * ny + xform[10] * nz;
+	}
+}
+
+int append_mesh(struct g3d_mesh *ma, struct g3d_mesh *mb)
+{
+	int i, new_vcount, new_icount;
+	void *tmp;
+	uint16_t *iptr;
+
+	if(ma->prim != mb->prim) {
+		fprintf(stderr, "append_mesh failed, primitive mismatch\n");
+		return -1;
+	}
+
+	/* if only one of the two is indexed, index the other one to match */
+	if(ma->iarr || mb->iarr) {
+		if(!ma->iarr) {
+			if(indexify_mesh(ma) == -1) {
+				return -1;
+			}
+		} else if(!mb->iarr) {
+			if(indexify_mesh(mb) == -1) {
+				return -1;
+			}
+		}
+	}
+	/* grow the vertex buffer first: ma's counts only change once nothing can fail */
+	new_vcount = ma->vcount + mb->vcount;
+	if(!(tmp = realloc(ma->varr, new_vcount * sizeof *ma->varr))) {
+		fprintf(stderr, "append_mesh: failed to allocate combined vertex buffer (%d verts)\n", new_vcount);
+		return -1;
+	}
+	ma->varr = tmp;
+	if(ma->iarr) {
+		new_icount = ma->icount + mb->icount;
+		if(!(iptr = realloc(ma->iarr, new_icount * sizeof *iptr))) {
+			fprintf(stderr, "append_mesh: failed to allocate combined index buffer (%d indices)\n", new_icount);
+			return -1;
+		}
+		ma->iarr = iptr;
+		for(i=0; i<mb->icount; i++) {
+			iptr[ma->icount + i] = mb->iarr[i] + ma->vcount;	/* rebase past ma's vertices */
+		}
+		ma->icount = new_icount;
+	}
+	memcpy(ma->varr + ma->vcount, mb->varr, mb->vcount * sizeof *ma->varr);
+	ma->vcount = new_vcount;
+	return 0;
+}
+
+#define FEQ(a, b)	((a) - (b) < 1e-5 && (b) - (a) < 1e-5)
+static int cmp_vertex(struct g3d_vertex *a, struct g3d_vertex *b)
+{
+	/* returns 0 when the vertices match (floats within 1e-5, l/a exactly) and
+	 * -1 otherwise; this is an equality test, not an ordering */
+	int same_pos = FEQ(a->x, b->x) && FEQ(a->y, b->y) &&
+		FEQ(a->z, b->z) && FEQ(a->w, b->w);
+	int same_norm = FEQ(a->nx, b->nx) && FEQ(a->ny, b->ny) && FEQ(a->nz, b->nz);
+	int same_uv = FEQ(a->u, b->u) && FEQ(a->v, b->v);
+	int same_col = a->l == b->l && a->a == b->a;
+
+	return (same_pos && same_norm && same_uv && same_col) ? 0 : -1;
+}
+
+static int find_existing(struct g3d_vertex *v, struct g3d_vertex *varr, int vcount)
+{
+	int idx;
+	/* linear scan: index of the first entry cmp_vertex calls equal, or -1 */
+	for(idx=0; idx<vcount; idx++) {
+		if(cmp_vertex(v, &varr[idx]) != 0) continue;
+		return idx;
+	}
+	return -1;
+}
+
+int indexify_mesh(struct g3d_mesh *mesh)
+{
+	int i, j, nfaces, max_icount, idx;
+	int out_vcount = 0;
+	struct g3d_vertex *vin, *vout;
+	uint16_t *iout;
+
+	if(mesh->iarr) {
+		fprintf(stderr, "indexify_mesh failed: already indexed\n");
+		return -1;
+	}
+
+	nfaces = mesh->vcount / mesh->prim;
+	max_icount = mesh->vcount;
+
+	if(!(mesh->iarr = malloc(max_icount * sizeof *mesh->iarr))) {
+		fprintf(stderr, "indexify_mesh failed to allocate index buffer of %d indices\n", max_icount);
+		return -1;
+	}
+	/* vin scans every input vertex, vout trails it writing the unique ones */
+	vin = vout = mesh->varr;
+	iout = mesh->iarr;
+
+	for(i=0; i<nfaces; i++) {
+		for(j=0; j<mesh->prim; j++) {
+			if((idx = find_existing(vin, mesh->varr, out_vcount)) >= 0) {
+				*iout++ = idx;
+			} else {
+				*iout++ = out_vcount++;
+				*vout++ = *vin;	/* always advance, even when vin == vout */
+			}
+			++vin;
+		}
+	}
+	/* publish the new counts, otherwise the index array would never be used */
+	mesh->icount = (int)(iout - mesh->iarr);
+	mesh->vcount = out_vcount;
+	/* XXX also shrink buffers? I'll just leave them to max size for now */
+	return 0;
+}
+
+void normalize_mesh_normals(struct g3d_mesh *mesh)
+{
+	int i;
+
+	/* rescale every normal to unit length; zero-length normals stay as they are */
+	for(i=0; i<mesh->vcount; i++) {
+		struct g3d_vertex *v = mesh->varr + i;
+		float mag = sqrt(v->nx * v->nx + v->ny * v->ny + v->nz * v->nz);
+		float s = (mag == 0.0f) ? 1.0f : 1.0f / mag;
+		v->nx *= s;
+		v->ny *= s;
+		v->nz *= s;
+	}
+}
+
+
+void calc_mesh_centroid(struct g3d_mesh *mesh, float *cent)
+{
+	int i;
+	/* average of the vertex positions; an empty mesh gives (0, 0, 0), not NaN */
+	float s = mesh->vcount > 0 ? 1.0f / (float)mesh->vcount : 0.0f;
+	cent[0] = cent[1] = cent[2] = 0.0f;
+	for(i=0; i<mesh->vcount; i++) {
+		cent[0] += mesh->varr[i].x;
+		cent[1] += mesh->varr[i].y;
+		cent[2] += mesh->varr[i].z;
+	}
+	cent[0] *= s;
+	cent[1] *= s;
+	cent[2] *= s;
+}
+
+static void sphvec(float *res, float theta, float phi, float rad)
+{
+	theta = -theta;
+	res[0] = sin(theta) * sin(phi) * rad;	/* rad was previously ignored, while */
+	res[1] = cos(phi) * rad;	/* the caller divides by rad to get normals */
+	res[2] = cos(theta) * sin(phi) * rad;	/* phi = 0 gives (0, rad, 0) */
+}
+
+int gen_sphere_mesh(struct g3d_mesh *mesh, float rad, int usub, int vsub)
+{
+	int i, j;
+	int nfaces, uverts, vverts;
+	struct g3d_vertex *vptr;
+	uint16_t *iptr;
+
+	init_mesh(mesh, G3D_QUADS, 0, 0);
+
+	if(usub < 4) usub = 4;
+	if(vsub < 2) vsub = 2;
+
+	uverts = usub + 1;
+	vverts = vsub + 1;
+	mesh->vcount = uverts * vverts;
+	nfaces = usub * vsub;
+	mesh->icount = nfaces * 4;
+
+	if(!(mesh->varr = malloc(mesh->vcount * sizeof *mesh->varr))) {
+		fprintf(stderr, "gen_sphere_mesh: failed to allocate vertex buffer (%d vertices)\n", mesh->vcount);
+		return -1;
+	}
+	if(!(mesh->iarr = malloc(mesh->icount * sizeof *mesh->iarr))) {
+		fprintf(stderr, "gen_sphere_mesh: failed to allocate index buffer (%d indices)\n", mesh->icount);
+		free(mesh->varr);	/* don't leak the vertex buffer on failure */
+		mesh->varr = 0;
+		return -1;
+	}
+	vptr = mesh->varr;
+	iptr = mesh->iarr;
+
+	for(i=0; i<uverts; i++) {
+		float u = (float)i / (float)(uverts - 1);
+		float theta = u * 2.0 * M_PI;
+		for(j=0; j<vverts; j++) {
+			float v = (float)j / (float)(vverts - 1);
+			float phi = v * M_PI;
+			int chess = (i & 1) == (j & 1);
+
+			sphvec(&vptr->x, theta, phi, rad);
+			vptr->w = 1.0f;
+			vptr->nx = vptr->x / rad;
+			vptr->ny = vptr->y / rad;
+			vptr->nz = vptr->z / rad;
+			vptr->u = u;
+			vptr->v = v;
+			vptr->l = chess ? 255 : 64;
+			vptr->a = 255;	/* was left uninitialized; opaque like gen_plane_mesh */
+			++vptr;
+
+			if(i < usub && j < vsub) {
+				int idx = i * vverts + j;
+				*iptr++ = idx;
+				*iptr++ = idx + 1;
+				*iptr++ = idx + vverts + 1;
+				*iptr++ = idx + vverts;
+			}
+		}
+	}
+	return 0;
+}
+
+int gen_plane_mesh(struct g3d_mesh *m, float width, float height, int usub, int vsub)
+{
+	int i, j;
+	int nfaces, nverts, nidx, uverts, vverts;
+	float x, y, u, v, du, dv;
+	struct g3d_vertex *vptr;
+	uint16_t *iptr;
+
+	init_mesh(m, G3D_QUADS, 0, 0);	/* resets m; the buffers are allocated below */
+
+	if(usub < 1) usub = 1;	/* at least one quad in each direction */
+	if(vsub < 1) vsub = 1;
+
+	nfaces = usub * vsub;
+	uverts = usub + 1;
+	vverts = vsub + 1;
+	du = 1.0f / (float)usub;	/* texture coordinate step per column */
+	dv = 1.0f / (float)vsub;	/* texture coordinate step per row */
+
+	nverts = uverts * vverts;
+	nidx = nfaces * 4;	/* 4 indices per quad */
+
+	if(!(m->varr = malloc(nverts * sizeof *m->varr))) {
+		fprintf(stderr, "gen_plane_mesh: failed to allocate vertex buffer (%d vertices)\n", nverts);
+		return -1;
+	}
+	if(!(m->iarr = malloc(nidx * sizeof *m->iarr))) {
+		fprintf(stderr, "gen_plane_mesh: failed to allocate index buffer (%d indices)\n", nidx);
+		free(m->varr);
+		m->varr = 0;
+		return -1;
+	}
+
+	m->vcount = nverts;
+	m->icount = nidx;
+
+	vptr = m->varr;
+	iptr = m->iarr;
+
+	v = 0.0f;
+	for(i=0; i<vverts; i++) {	/* rows of vertices, bottom to top */
+		y = (v - 0.5) * height;	/* the plane is centered on the origin */
+		u = 0.0f;
+
+		for(j=0; j<uverts; j++) {
+			x = (u - 0.5) * width;
+
+			vptr->x = x;
+			vptr->y = y;
+			vptr->z = 0.0f;	/* the plane lies in the XY plane ... */
+			vptr->w = 1.0f;
+			vptr->nx = 0.0f;
+			vptr->ny = 0.0f;
+			vptr->nz = 1.0f;	/* ... with its normal along +Z */
+			vptr->u = u;
+			vptr->v = v;
+			vptr->l = vptr->a = 255;
+			++vptr;
+
+			u += du;
+		}
+		v += dv;
+	}
+
+	for(i=0; i<vsub; i++) {
+		for(j=0; j<usub; j++) {
+			int idx = i * uverts + j;	/* first corner of quad (i, j) */
+			*iptr++ = idx;
+			*iptr++ = idx + 1;
+			*iptr++ = idx + uverts + 1;
+			*iptr++ = idx + uverts;
+		}
+	}
+	return 0;
+}
+
+int gen_cube_mesh(struct g3d_mesh *mesh, float sz, int sub)
+{
+	int i, res = -1;
+	float offs = sz;	/* keeps the sign of sz; the faces use its magnitude */
+	struct g3d_mesh *m;
+	struct g3d_mesh tmpmesh;
+	static float rotface[][4] = {
+		{0, 0, 1, 0},
+		{90, 0, 1, 0},
+		{180, 0, 1, 0},
+		{270, 0, 1, 0},
+		{90, 1, 0, 0},
+		{-90, 1, 0, 0}
+	};
+
+	sz = fabs(sz);
+
+	g3d_matrix_mode(G3D_MODELVIEW);
+	g3d_push_matrix();
+
+	for(i=0; i<6; i++) {
+		m = i > 0 ? &tmpmesh : mesh;
+		if(gen_plane_mesh(m, sz, sz, sub, sub) == -1)
+			goto done;
+		g3d_load_identity();
+		g3d_rotate(rotface[i][0], rotface[i][1], rotface[i][2], rotface[i][3]);
+		g3d_translate(0, 0, offs / 2.0f);
+		apply_mesh_xform(m, g3d_get_matrix(G3D_MODELVIEW, 0));
+		if(i > 0) {
+			int err = append_mesh(mesh, m);
+			destroy_mesh(m);	/* the temporary face is no longer needed */
+			if(err == -1) goto done;
+		}
+	}
+	res = 0;
+done:
+	g3d_pop_matrix();	/* balance the push on the error paths too */
+	return res;
+}
+
+static void torusvec(float *res, float theta, float phi, float mr, float rr)
+{
+	float rx, ry, rz;
+	theta = -theta;
+
+	rx = -cos(phi) * rr + mr;	/* point on a circle of radius rr, offset by mr */
+	ry = sin(phi) * rr;
+	rz = 0.0f;
+
+	res[0] = rx * sin(theta) + rz * cos(theta);	/* spin it around the Y axis */
+	res[1] = ry;
+	res[2] = -rx * cos(theta) + rz * sin(theta);
+}
+
+int gen_torus_mesh(struct g3d_mesh *mesh, float rad, float ringrad, int usub, int vsub)
+{
+	int i, j;
+	int nfaces, uverts, vverts;
+	struct g3d_vertex *vptr;
+	uint16_t *iptr;
+
+	init_mesh(mesh, G3D_QUADS, 0, 0);
+
+	if(usub < 4) usub = 4;
+	if(vsub < 2) vsub = 2;
+
+	uverts = usub + 1;
+	vverts = vsub + 1;
+	mesh->vcount = uverts * vverts;
+	nfaces = usub * vsub;
+	mesh->icount = nfaces * 4;
+
+	printf("generating torus with %d faces (%d vertices)\n", nfaces, mesh->vcount);
+
+	if(!(mesh->varr = malloc(mesh->vcount * sizeof *mesh->varr))) {
+		return -1;
+	}
+	if(!(mesh->iarr = malloc(mesh->icount * sizeof *mesh->iarr))) {
+		free(mesh->varr);	/* don't leak the vertex buffer on failure */
+		mesh->varr = 0;
+		return -1;
+	}
+	vptr = mesh->varr;
+	iptr = mesh->iarr;
+
+	for(i=0; i<uverts; i++) {
+		float u = (float)i / (float)(uverts - 1);
+		float theta = u * 2.0 * M_PI;
+		float rcent[3];
+		torusvec(rcent, theta, 0, rad, 0);	/* center of the ring cross-section */
+
+		for(j=0; j<vverts; j++) {
+			float v = (float)j / (float)(vverts - 1);
+			float phi = v * 2.0 * M_PI;
+			int chess = (i & 1) == (j & 1);
+
+			torusvec(&vptr->x, theta, phi, rad, ringrad);
+			vptr->w = 1.0f;
+			vptr->nx = (vptr->x - rcent[0]) / ringrad;
+			vptr->ny = (vptr->y - rcent[1]) / ringrad;
+			vptr->nz = (vptr->z - rcent[2]) / ringrad;
+			vptr->u = u;
+			vptr->v = v;
+			vptr->l = chess ? 255 : 64;
+			vptr->a = 255;	/* was left uninitialized; opaque like gen_plane_mesh */
+			++vptr;
+
+			if(i < usub && j < vsub) {
+				int idx = i * vverts + j;
+				*iptr++ = idx;
+				*iptr++ = idx + 1;
+				*iptr++ = idx + vverts + 1;
+				*iptr++ = idx + vverts;
+			}
+		}
+	}
+	return 0;
+}
+
diff --git a/src/3dgfx/mesh.h b/src/3dgfx/mesh.h
new file mode 100644
index 0000000..3891588
--- /dev/null
+++ b/src/3dgfx/mesh.h
@@ -0,0 +1,60 @@
+#ifndef MESH_H_
+#define MESH_H_
+
+#include "3dgfx.h"
+/*#include "image.h"*/
+#include "inttypes.h"
+
+struct g3d_material {
+	float r, g, b, a;	/* diffuse color; draw_mesh currently only uses r */
+	float sr, sg, sb, shin;	/* specular color (draw_mesh uses sr) and shininess */
+
+	/*struct image *texmap, *envmap;*/
+	char *name;	/* may be null */
+};
+
+struct g3d_mesh {
+	int prim;	/* primitive type; mesh.c also uses it as the vertex count per face */
+	struct g3d_vertex *varr;	/* vertex array, freed by destroy_mesh */
+	uint16_t *iarr;	/* index array, null for non-indexed meshes; freed by destroy_mesh */
+	int vcount, icount;	/* number of elements in varr and iarr */
+	char *name;	/* freed by destroy_mesh */
+
+	struct g3d_material *mtl;	/* optional; not freed by destroy_mesh */
+};
+
+void init_g3dmtl(struct g3d_material *mtl);
+
+int init_mesh(struct g3d_mesh *mesh, int prim, int num_verts, int num_idx);
+
+void free_mesh(struct g3d_mesh *mesh);
+void destroy_mesh(struct g3d_mesh *mesh);
+
+int copy_mesh(struct g3d_mesh *dest, struct g3d_mesh *src);
+
+/* takes pointer to a dynamic array (dynarr_*) and populates it */
+#define load_meshes(mesharr, fname) load_meshes_impl(&(mesharr), fname)
+int load_meshes_impl(struct g3d_mesh **mesh, const char *fname);
+/* TODO: idx -1 -> merge all meshes into one? */
+int load_mesh(struct g3d_mesh *mesh, const char *fname, int idx);
+int load_named_mesh(struct g3d_mesh *mesh, const char *fname, const char *mname);
+int save_mesh(struct g3d_mesh *mesh, const char *fname);
+struct g3d_mesh *find_mesh(struct g3d_mesh *meshes, const char *mname);
+
+void zsort_mesh(struct g3d_mesh *mesh);
+void draw_mesh(struct g3d_mesh *mesh);
+
+void apply_mesh_xform(struct g3d_mesh *mesh, const float *xform);
+int append_mesh(struct g3d_mesh *ma, struct g3d_mesh *mb);
+int indexify_mesh(struct g3d_mesh *mesh);
+
+void normalize_mesh_normals(struct g3d_mesh *mesh);
+
+void calc_mesh_centroid(struct g3d_mesh *mesh, float *cent);
+
+int gen_sphere_mesh(struct g3d_mesh *mesh, float rad, int usub, int vsub);
+int gen_plane_mesh(struct g3d_mesh *mesh, float width, float height, int usub, int vsub);
+int gen_cube_mesh(struct g3d_mesh *mesh, float sz, int sub);
+int gen_torus_mesh(struct g3d_mesh *mesh, float rad, float ringrad, int usub, int vsub);
+
+#endif	/* MESH_H_ */
diff --git a/src/3dgfx/polyclip.c b/src/3dgfx/polyclip.c
new file mode 100644
index 0000000..af0164a
--- /dev/null
+++ b/src/3dgfx/polyclip.c
@@ -0,0 +1,330 @@
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#include "polyclip.h"
+
+struct ray {
+	float origin[3];	/* start point of the edge being clipped */
+	float dir[3];	/* end point minus start point (not normalized) */
+};
+
+static int clip_edge(struct g3d_vertex *poly, int *vnumptr,
+		const struct g3d_vertex *v0, const struct g3d_vertex *v1,
+		const struct cplane *plane);
+static int check_clip_edge(const struct g3d_vertex *v0,
+		const struct g3d_vertex *v1, const struct cplane *plane);
+static int clip_edge_frustum(struct g3d_vertex *poly, int *vnumptr,
+		const struct g3d_vertex *v0, const struct g3d_vertex *v1, int fplane);
+static float distance_signed(float *pos, const struct cplane *plane);
+static int intersect(const struct ray *ray, const struct cplane *plane, float *t);
+static int inside_frustum_plane(const struct g3d_vertex *v, int fplane);
+
+
+int clip_poly(struct g3d_vertex *vout, int *voutnum,
+		const struct g3d_vertex *vin, int vnum, struct cplane *plane)
+{
+	int i, nclipped = 0;
+
+	*voutnum = 0;
+
+	/* run every edge (vin[i] -> vin[i+1], wrapping around) through the plane */
+	for(i=0; i<vnum; i++) {
+		const struct g3d_vertex *v0 = vin + i;
+		const struct g3d_vertex *v1 = (i + 1 >= vnum) ? vin : v0 + 1;
+
+		if(clip_edge(vout, voutnum, v0, v1, plane) == 0) {
+			++nclipped;
+		}
+	}
+
+	if(*voutnum <= 0) {
+		/* nothing was emitted: the polygon is entirely outside */
+		assert(nclipped == 0);
+		return -1;
+	}
+	return nclipped > 0 ? 0 : 1;
+}
+
+int check_clip_poly(const struct g3d_vertex *v, int vnum, struct cplane *plane)
+{
+	int i, last = 0;
+	int nclipped = 0;
+
+	/* classify every edge (v[i] -> v[i+1], wrapping around) against the plane */
+	for(i=0; i<vnum; i++) {
+		int next = (i + 1 >= vnum) ? 0 : i + 1;
+
+		last = check_clip_edge(v + i, v + next, plane);
+		if(last == 0) ++nclipped;
+	}
+	/* no straddling edge: the class of the last edge stands for the polygon */
+	return nclipped ? 0 : last;
+}
+
+int clip_frustum(struct g3d_vertex *vout, int *voutnum,
+		const struct g3d_vertex *vin, int vnum, int fplane)
+{
+	int i, nclipped = 0;
+
+	if(vnum == 1) {
+		/* special case: point clipping. this only classifies the point, vout
+		 * and *voutnum are not written on this path */
+		return inside_frustum_plane(vin, fplane) ? 1 : -1;
+	}
+
+	*voutnum = 0;
+
+	/* clip every edge (vin[i] -> vin[i+1], wrapping around) against fplane */
+	for(i=0; i<vnum; i++) {
+		const struct g3d_vertex *v0 = vin + i;
+		const struct g3d_vertex *v1 = (i + 1 >= vnum) ? vin : v0 + 1;
+
+		if(clip_edge_frustum(vout, voutnum, v0, v1, fplane) == 0) {
+			++nclipped;
+		}
+	}
+
+	if(*voutnum <= 0) {
+		assert(nclipped == 0);
+		return -1;
+	}
+	return nclipped > 0 ? 0 : 1;
+}
+
+#define LERP_VATTR(res, v0, v1, t) \
+	do { \
+		(res)->nx = (v0)->nx + ((v1)->nx - (v0)->nx) * (t); \
+		(res)->ny = (v0)->ny + ((v1)->ny - (v0)->ny) * (t); \
+		(res)->nz = (v0)->nz + ((v1)->nz - (v0)->nz) * (t); \
+		(res)->u = (v0)->u + ((v1)->u - (v0)->u) * (t); \
+		(res)->v = (v0)->v + ((v1)->v - (v0)->v) * (t); \
+		(res)->l = (v0)->l + ((v1)->l - (v0)->l) * (t); \
+		(res)->a = (v0)->a + ((v1)->a - (v0)->a) * (t); \
+	} while(0)	/* lerps normal, texcoords, l and a; the position is the caller's job */
+
+
+/* returns:
+ *  1 -> both inside
+ *  0 -> straddling and clipped
+ * -1 -> both outside
+ *
+ *  also returns the size of the polygon through vnumptr
+ */
+static int clip_edge(struct g3d_vertex *poly, int *vnumptr,
+		const struct g3d_vertex *v0, const struct g3d_vertex *v1,
+		const struct cplane *plane)
+{
+	float pos0[3], pos1[3];
+	float d0, d1, t;
+	struct ray ray;
+	int i, vnum = *vnumptr;
+
+	pos0[0] = v0->x; pos0[1] = v0->y; pos0[2] = v0->z;
+	pos1[0] = v1->x; pos1[1] = v1->y; pos1[2] = v1->z;
+
+	d0 = distance_signed(pos0, plane);	/* >= 0 counts as inside */
+	d1 = distance_signed(pos1, plane);
+
+	for(i=0; i<3; i++) {	/* the edge as a ray from v0 towards v1 */
+		ray.origin[i] = pos0[i];
+		ray.dir[i] = pos1[i] - pos0[i];
+	}
+
+	if(d0 >= 0.0) {
+		/* start inside */
+		if(d1 >= 0.0) {
+			/* all inside */
+			poly[vnum++] = *v1;	/* append v1 */
+			*vnumptr = vnum;
+			return 1;
+		} else {
+			/* going out */
+			struct g3d_vertex *vptr = poly + vnum;
+
+			intersect(&ray, plane, &t);	/* t stays 0 if the edge is parallel */
+
+			vptr->x = ray.origin[0] + ray.dir[0] * t;
+			vptr->y = ray.origin[1] + ray.dir[1] * t;
+			vptr->z = ray.origin[2] + ray.dir[2] * t;
+			vptr->w = 1.0f;	/* this clipper works on 3D positions, w is reset */
+
+			LERP_VATTR(vptr, v0, v1, t);
+			vnum++;	/* append new vertex on the intersection point */
+		}
+	} else {
+		/* start outside */
+		if(d1 >= 0) {
+			/* going in */
+			struct g3d_vertex *vptr = poly + vnum;
+
+			intersect(&ray, plane, &t);	/* t stays 0 if the edge is parallel */
+
+			vptr->x = ray.origin[0] + ray.dir[0] * t;
+			vptr->y = ray.origin[1] + ray.dir[1] * t;
+			vptr->z = ray.origin[2] + ray.dir[2] * t;
+			vptr->w = 1.0f;
+
+			LERP_VATTR(vptr, v0, v1, t);
+			vnum++;	/* append new vertex on the intersection point */
+
+			/* then append v1 ... */
+			poly[vnum++] = *v1;
+		} else {
+			/* all outside */
+			return -1;	/* nothing appended, *vnumptr is left unchanged */
+		}
+	}
+
+	*vnumptr = vnum;
+	return 0;
+}
+
+/* same as above, but only checks for clipping and classifies the edge */
+static int check_clip_edge(const struct g3d_vertex *v0,
+		const struct g3d_vertex *v1, const struct cplane *plane)
+{
+	float pos0[3], pos1[3];
+	float d0, d1;
+
+	pos0[0] = v0->x; pos0[1] = v0->y; pos0[2] = v0->z;
+	pos1[0] = v1->x; pos1[1] = v1->y; pos1[2] = v1->z;
+
+	d0 = distance_signed(pos0, plane);
+	d1 = distance_signed(pos1, plane);
+	/* like clip_edge, a vertex exactly on the plane (d == 0) counts as inside */
+	if(d0 >= 0.0f && d1 >= 0.0f) {
+		return 1;
+	}
+	if(d0 < 0.0f && d1 < 0.0f) {
+		return -1;
+	}
+	return 0;
+}
+
+static float distance_signed(float *pos, const struct cplane *plane)
+{
+	/* (pos - plane point) dot normal: positive on the side the normal faces */
+	float ox = pos[0] - plane->x, oy = pos[1] - plane->y;
+	float oz = pos[2] - plane->z;
+	return ox * plane->nx + oy * plane->ny + oz * plane->nz;
+}
+
+static int intersect(const struct ray *ray, const struct cplane *plane, float *t)
+{
+	float to_plane[3], ndotdir;
+
+	/* a ray (nearly) parallel to the plane has no crossing: report t = 0 */
+	ndotdir = plane->nx * ray->dir[0] + plane->ny * ray->dir[1] + plane->nz * ray->dir[2];
+	if(fabs(ndotdir) < 1e-6) {
+		*t = 0.0f;
+		return 0;
+	}
+	/* vector from the ray origin to the plane's reference point */
+	to_plane[0] = plane->x - ray->origin[0];
+	to_plane[1] = plane->y - ray->origin[1];
+	to_plane[2] = plane->z - ray->origin[2];
+	*t = (plane->nx * to_plane[0] + plane->ny * to_plane[1] + plane->nz * to_plane[2]) / ndotdir;
+	return 1;
+}
+
+/* homogeneous frustum clipper helpers */
+
+static int inside_frustum_plane(const struct g3d_vertex *v, int fplane)
+{
+	switch(fplane) {	/* inside means -w <= coordinate <= w; on the plane counts as inside */
+	case CLIP_LEFT:
+		return v->x >= -v->w;
+	case CLIP_RIGHT:
+		return v->x <= v->w;
+	case CLIP_BOTTOM:
+		return v->y >= -v->w;
+	case CLIP_TOP:
+		return v->y <= v->w;
+	case CLIP_NEAR:
+		return v->z >= -v->w;
+	case CLIP_FAR:
+		return v->z <= v->w;
+	}
+	assert(0);	/* unknown plane id */
+	return 0;
+}
+
+static float intersect_frustum(const struct g3d_vertex *a, const struct g3d_vertex *b, int fplane)
+{
+	switch(fplane) {	/* returns t such that a + (b - a) * t lies on the plane */
+	case CLIP_LEFT:
+		return (-a->w - a->x) / (b->x - a->x + b->w - a->w);	/* solves x = -w */
+	case CLIP_RIGHT:
+		return (a->w - a->x) / (b->x - a->x - b->w + a->w);	/* solves x = w */
+	case CLIP_BOTTOM:
+		return (-a->w - a->y) / (b->y - a->y + b->w - a->w);
+	case CLIP_TOP:
+		return (a->w - a->y) / (b->y - a->y - b->w + a->w);
+	case CLIP_NEAR:
+		return (-a->w - a->z) / (b->z - a->z + b->w - a->w);
+	case CLIP_FAR:
+		return (a->w - a->z) / (b->z - a->z - b->w + a->w);
+	}
+
+	assert(0);	/* unknown plane id */
+	return 0;
+}
+
+static int clip_edge_frustum(struct g3d_vertex *poly, int *vnumptr,
+		const struct g3d_vertex *v0, const struct g3d_vertex *v1, int fplane)
+{
+	int vnum = *vnumptr;
+	int in0, in1;
+	float t;
+
+	in0 = inside_frustum_plane(v0, fplane);
+	in1 = inside_frustum_plane(v1, fplane);
+
+	if(in0) {
+		/* start inside */
+		if(in1) {
+			/* all inside */
+			poly[vnum++] = *v1;	/* append v1 */
+			*vnumptr = vnum;
+			return 1;
+		} else {
+			/* going out */
+			struct g3d_vertex *vptr = poly + vnum;
+
+			t = intersect_frustum(v0, v1, fplane);
+
+			vptr->x = v0->x + (v1->x - v0->x) * t;
+			vptr->y = v0->y + (v1->y - v0->y) * t;
+			vptr->z = v0->z + (v1->z - v0->z) * t;
+			vptr->w = v0->w + (v1->w - v0->w) * t;	/* homogeneous: w is interpolated too */
+
+			LERP_VATTR(vptr, v0, v1, t);
+			++vnum;	/* append new vertex on the intersection point */
+		}
+	} else {
+		/* start outside */
+		if(in1) {
+			/* going in */
+			struct g3d_vertex *vptr = poly + vnum;
+
+			t = intersect_frustum(v0, v1, fplane);
+
+			vptr->x = v0->x + (v1->x - v0->x) * t;
+			vptr->y = v0->y + (v1->y - v0->y) * t;
+			vptr->z = v0->z + (v1->z - v0->z) * t;
+			vptr->w = v0->w + (v1->w - v0->w) * t;
+
+			LERP_VATTR(vptr, v0, v1, t);
+			++vnum;	/* append new vertex on the intersection point */
+
+			/* then append v1 ... */
+			poly[vnum++] = *v1;
+		} else {
+			/* all outside */
+			return -1;	/* nothing appended, *vnumptr is left unchanged */
+		}
+	}
+
+	*vnumptr = vnum;
+	return 0;	/* straddling: 0, as opposed to 1 (inside) and -1 (outside) above */
+}
diff --git a/src/3dgfx/polyclip.h b/src/3dgfx/polyclip.h
new file mode 100644
index 0000000..adee29d
--- /dev/null
+++ b/src/3dgfx/polyclip.h
@@ -0,0 +1,38 @@
+#ifndef POLYCLIP_H_
+#define POLYCLIP_H_
+
+#include "3dgfx.h"
+
+struct cplane {
+	float x, y, z;	/* a point on the plane */
+	float nx, ny, nz;	/* plane normal; the side it points to is kept by clip_poly */
+};
+
+enum {
+	CLIP_LEFT, CLIP_RIGHT,
+	CLIP_BOTTOM, CLIP_TOP,
+	CLIP_NEAR, CLIP_FAR
+};
+
+/* Generic polygon clipper
+ * returns:
+ *  1 -> fully inside, not clipped
+ *  0 -> straddling the plane and clipped
+ * -1 -> fully outside, not clipped
+ * in all cases, vertices are copied to vout, and the vertex count is written
+ * to wherever voutnum is pointing
+ */
+int clip_poly(struct g3d_vertex *vout, int *voutnum,
+		const struct g3d_vertex *vin, int vnum, struct cplane *plane);
+
+/* only checks if the polygon would be clipped by the plane, and classifies it
+ * as inside/outside/straddling, without actually producing a clipped polygon.
+ * return values are the same as clip_poly.
+ */
+int check_clip_poly(const struct g3d_vertex *v, int vnum, struct cplane *plane);
+
+/* Special-case frustum clipper (might be slightly faster) */
+int clip_frustum(struct g3d_vertex *vout, int *voutnum,
+		const struct g3d_vertex *vin, int vnum, int fplane);
+
+#endif	/* POLYCLIP_H_ */
diff --git a/src/3dgfx/polyfill.c b/src/3dgfx/polyfill.c
new file mode 100644
index 0000000..d195ff0
--- /dev/null
+++ b/src/3dgfx/polyfill.c
@@ -0,0 +1,333 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "polyfill.h"
+#include "gfxutil.h"
+#include "util.h"
+#include "colormgr.h"
+
+/*#define DEBUG_OVERDRAW	G3D_PACK_RGB(10, 10, 10)*/
+
+#define FILL_POLY_BITS	0x03
+
+/*void polyfill_tex_flat_new(struct pvertex *varr);*/
+
+/* mode bits: 00-wire 01-flat 10-gouraud 11-reserved
+ *     bit 2: texture
+ *     bit 3: unused (entries 8-15 are empty), bit 4: zbuffering
+ */
+void (*fillfunc[])(struct pvertex*) = {
+	polyfill_wire,	/* 0 */
+	polyfill_flat,
+	polyfill_gouraud,
+	0,
+	polyfill_tex_wire,	/* 4: textured */
+	polyfill_tex_flat,
+	polyfill_tex_gouraud,
+	0,
+	0, 0, 0, 0, 0, 0, 0, 0,	/* 8-15: unused */
+	polyfill_wire,	/* 16: z-buffered (wireframe has no zbuf variant) */
+	polyfill_flat_zbuf,
+	polyfill_gouraud_zbuf,
+	0,
+	polyfill_tex_wire,	/* 20: textured + z-buffered */
+	polyfill_tex_flat_zbuf,
+	polyfill_tex_gouraud_zbuf,
+	0,
+	0, 0, 0, 0, 0, 0, 0, 0	/* 24-31: unused */
+};
+
+struct pimage pfill_fb, pfill_tex;
+uint32_t *pfill_zbuf;
+struct pgradient pgrad;
+
+#define EDGEPAD	8
+static struct pvertex *edgebuf, *left, *right;
+static int edgebuf_size;
+static int fbheight;
+
+/*
+#define CHECKEDGE(x) \
+	do { \
+		assert(x >= 0); \
+		assert(x < fbheight); \
+	} while(0)
+*/
+#define CHECKEDGE(x)
+
+
+void polyfill_fbheight(int height)
+{
+	int newsz = (height * 2 + EDGEPAD * 3) * sizeof *edgebuf;	/* two edge tables + 3 pads */
+
+	if(newsz > edgebuf_size) {	/* the buffer only ever grows */
+		free(edgebuf);
+		if(!(edgebuf = malloc(newsz))) {
+			fprintf(stderr, "failed to allocate edge table buffer (%d bytes)\n", newsz);
+			abort();
+		}
+		edgebuf_size = newsz;
+
+		left = edgebuf + EDGEPAD;	/* layout: pad | left | pad | right | pad */
+		right = edgebuf + height + EDGEPAD * 2;
+
+#ifndef NDEBUG
+		memset(edgebuf, 0xaa, EDGEPAD * sizeof *edgebuf);	/* mark the three pads */
+		memset(edgebuf + height + EDGEPAD, 0xaa, EDGEPAD * sizeof *edgebuf);
+		memset(edgebuf + height * 2 + EDGEPAD * 2, 0xaa, EDGEPAD * sizeof *edgebuf);
+#endif
+	}
+
+	fbheight = height;
+}
+
+void polyfill(int mode, struct pvertex *verts)
+{
+#ifndef NDEBUG
+	/* also reject modes outside the table, not just the unimplemented slots */
+	if(mode < 0 || mode >= (int)(sizeof fillfunc / sizeof *fillfunc) || !fillfunc[mode]) {
+		fprintf(stderr, "polyfill mode %d not implemented\n", mode);
+		abort();
+	}
+#endif
+	fillfunc[mode](verts);
+}
+
+void polyfill_wire(struct pvertex *verts)
+{
+	int i, x0, y0, x1, y1;
+	int color = find_color(verts->l, verts->l, verts->l);
+
+	/* outline of the triangle: edges 0-1, 1-2 and 2-0, in the color of the
+	 * first vertex. clip_line is handed pointers to the endpoints and may move
+	 * them, so each edge reloads both endpoints from the vertices, instead of
+	 * reusing coordinates left over from clipping the previous edge */
+	for(i=0; i<3; i++) {
+		const struct pvertex *v0 = verts + i;
+		const struct pvertex *v1 = verts + (i + 1) % 3;
+
+		x0 = v0->x >> 8;	/* 24.8 fixed point -> integer pixels */
+		y0 = v0->y >> 8;
+		x1 = v1->x >> 8;
+		y1 = v1->y >> 8;
+		if(clip_line(&x0, &y0, &x1, &y1, 0, 0, pfill_fb.width, pfill_fb.height)) {
+			draw_line(x0, y0, x1, y1, color);
+		}
+	}
+}
+
+void polyfill_tex_wire(struct pvertex *verts)
+{
+	polyfill_wire(verts);	/* TODO: textured variant, plain wireframe for now */
+}
+
+void polyfill_alpha_wire(struct pvertex *verts)
+{
+	polyfill_wire(verts);	/* TODO: alpha variant, plain wireframe for now */
+}
+
+void polyfill_alpha_tex_wire(struct pvertex *verts)
+{
+	polyfill_wire(verts);	/* TODO: alpha + texture variant, plain wireframe for now */
+}
+
+void polyfill_add_wire(struct pvertex *verts)
+{
+	polyfill_wire(verts);	/* TODO: additive variant, plain wireframe for now */
+}
+
+void polyfill_add_tex_wire(struct pvertex *verts)
+{
+	polyfill_wire(verts);	/* TODO: additive + texture variant, plain wireframe for now */
+}
+
+#define VNEXT(p)	(((p) == varr + 2) ? varr : (p) + 1)
+#define VPREV(p)	((p) == varr ? varr + 2 : (p) - 1)
+#define VSUCC(p, side)	((side) == 0 ? VNEXT(p) : VPREV(p))
+
+/* extra bits of precision to use when interpolating colors.
+ * try tweaking this if you notice strange quantization artifacts.
+ */
+#define COLOR_SHIFT	12
+
+
+#define POLYFILL polyfill_flat
+#undef GOURAUD
+#undef TEXMAP
+#undef ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+#define POLYFILL polyfill_gouraud
+#define GOURAUD
+#undef TEXMAP
+#undef ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+#define POLYFILL polyfill_tex_flat
+#undef GOURAUD
+#define TEXMAP
+#undef ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+#define POLYFILL polyfill_tex_gouraud
+#define GOURAUD
+#define TEXMAP
+#undef ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+/* ---- zbuffer variants ----- */
+
+#define POLYFILL polyfill_flat_zbuf
+#undef GOURAUD
+#undef TEXMAP
+#define ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+#define POLYFILL polyfill_gouraud_zbuf
+#define GOURAUD
+#undef TEXMAP
+#define ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+#define POLYFILL polyfill_tex_flat_zbuf
+#undef GOURAUD
+#define TEXMAP
+#define ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+#define POLYFILL polyfill_tex_gouraud_zbuf
+#define GOURAUD
+#define TEXMAP
+#define ZBUF
+#include "polytmpl.h"
+#undef POLYFILL
+
+#if 0
+void polyfill_tex_flat_new(struct pvertex *varr)
+{
+	int i, line, top, bot;
+	struct pvertex *v, *vn, *tab;
+	int32_t x, y0, y1, dx, dy, slope, fx, fy;
+	int start, len;
+	g3d_pixel *fbptr, *pptr, color;
+	int32_t tu, tv, du, dv, uslope, vslope;
+	int tx, ty;
+	g3d_pixel texel;
+
+	top = pfill_fb.height;
+	bot = 0;
+
+	for(i=0; i<3; i++) {
+		/* scan the edge between the current and next vertex */
+		v = varr + i;
+		vn = VNEXT(v);
+
+		if(vn->y == v->y) continue;	/* XXX ??? */
+
+		if(vn->y >= v->y) {
+			/* inrementing Y: left side */
+			tab = left;
+		} else {
+			/* decrementing Y: right side, flip vertices to trace bottom->up */
+			tab = right;
+			v = vn;
+			vn = varr + i;
+		}
+
+		/* calculate edge slope */
+		dx = vn->x - v->x;
+		dy = vn->y - v->y;
+		slope = (dx << 8) / dy;
+
+		tu = v->u;
+		tv = v->v;
+		du = vn->u - tu;
+		dv = vn->v - tv;
+		uslope = (du << 8) / dy;
+		vslope = (dv << 8) / dy;
+
+		y0 = (v->y + 0x100) & 0xffffff00;	/* start from the next scanline */
+		fy = y0 - v->y;						/* fractional part before the next scanline */
+		fx = (fy * slope) >> 8;				/* X adjust for the step to the next scanline */
+		x = v->x + fx;						/* adjust X */
+		y1 = vn->y & 0xffffff00;			/* last scanline of the edge <= vn->y */
+
+		/* also adjust other interpolated attributes */
+		tu += (fy * uslope) >> 8;
+		tv += (fy * vslope) >> 8;
+
+		line = y0 >> 8;
+		if(line < top) top = line;
+		if((y1 >> 8) > bot) bot = y1 >> 8;
+
+		if(line > 0) tab += line;
+
+		while(line <= (y1 >> 8) && line < pfill_fb.height) {
+			if(line >= 0) {
+				int val = x < 0 ? 0 : x >> 8;
+				tab->x = val < pfill_fb.width ? val : pfill_fb.width - 1;
+				tab->u = tu;
+				tab->v = tv;
+				tab++;
+			}
+			x += slope;
+			tu += uslope;
+			tv += vslope;
+			line++;
+		}
+	}
+
+	if(top < 0) top = 0;
+	if(bot >= pfill_fb.height) bot = pfill_fb.height - 1;
+
+	fbptr = pfill_fb.pixels + top * pfill_fb.width;
+	for(i=top; i<=bot; i++) {
+		start = left[i].x;
+		len = right[i].x - start;
+		/* XXX we probably need more precision in left/right.x */
+
+		dx = len == 0 ? 256 : (len << 8);
+
+		tu = left[i].u;
+		tv = left[i].v;
+
+		pptr = fbptr + start;
+		while(len-- > 0) {
+			int cr, cg, cb;
+
+			tx = (tu >> (16 - pfill_tex.xshift)) & pfill_tex.xmask;
+			ty = (tv >> (16 - pfill_tex.yshift)) & pfill_tex.ymask;
+			texel = pfill_tex.pixels[(ty << pfill_tex.xshift) + tx];
+
+			tu += pgrad.dudx;
+			tv += pgrad.dvdx;
+
+			cr = varr[0].r;
+			cg = varr[0].g;
+			cb = varr[0].b;
+
+			/* This is not correct, should be /255, but it's much faster
+			 * to shift by 8 (/256), and won't make a huge difference
+			 */
+			cr = (cr * G3D_UNPACK_R(texel)) >> 8;
+			cg = (cg * G3D_UNPACK_G(texel)) >> 8;
+			cb = (cb * G3D_UNPACK_B(texel)) >> 8;
+
+			if(cr >= 255) cr = 255;
+			if(cg >= 255) cg = 255;
+			if(cb >= 255) cb = 255;
+			color = G3D_PACK_RGB(cr, cg, cb);
+			*pptr++ = color;
+		}
+		fbptr += pfill_fb.width;
+	}
+}
+#endif
diff --git a/src/3dgfx/polyfill.h b/src/3dgfx/polyfill.h
new file mode 100644
index 0000000..aa7bf60
--- /dev/null
+++ b/src/3dgfx/polyfill.h
@@ -0,0 +1,71 @@
+#ifndef POLYFILL_H_
+#define POLYFILL_H_
+
+#include "inttypes.h"
+#include "3dgfx.h"
+
+#define POLYFILL_MODE_MASK	0x03	/* selects the base mode: WIRE, FLAT or GOURAUD */
+#define POLYFILL_TEX_BIT	0x04	/* equals POLYFILL_TEX_WIRE - POLYFILL_WIRE */
+#define POLYFILL_ZBUF_BIT	0x08	/* NOTE(review): the *_ZBUF modes below start at 16 (0x10), not 0x08; confirm which value the fill dispatch expects */
+
+enum {
+	POLYFILL_WIRE			= 0,
+	POLYFILL_FLAT,
+	POLYFILL_GOURAUD,
+
+	POLYFILL_TEX_WIRE		= 4,
+	POLYFILL_TEX_FLAT,
+	POLYFILL_TEX_GOURAUD,
+
+	POLYFILL_WIRE_ZBUF			= 16,
+	POLYFILL_FLAT_ZBUF,
+	POLYFILL_GOURAUD_ZBUF,
+
+	POLYFILL_TEX_WIRE_ZBUF		= 20,
+	POLYFILL_TEX_FLAT_ZBUF,
+	POLYFILL_TEX_GOURAUD_ZBUF
+};
+
+/* projected vertices for the rasterizer */
+struct pvertex {
+	int32_t x, y; /* screen position, 24.8 fixed point */
+	int32_t u, v; /* texture coordinates, 16.16 fixed point */
+	int32_t l, a;  /* int 0-255; l is the shading intensity, a is presumably alpha (TODO confirm) */
+	int32_t z;	/* depth value compared against the Z buffer, 0-(2^24-1) */
+};
+
+struct pgradient {	/* attribute gradients; the d*dx members are the per-pixel steps added while filling a span */
+	int32_t dudx, dudy, dvdx, dvdy;	/* texture coordinate (u, v) gradients */
+	int32_t dldx, dldy, dadx, dady;	/* gradients of the pvertex l and a attributes */
+	int32_t dzdx, dzdy;	/* depth gradients */
+};
+
+struct pimage {	/* pixel buffer descriptor; used for both the framebuffer (pfill_fb) and the texture (pfill_tex) */
+	g3d_pixel *pixels;
+	int width, height;
+
+	int xshift, yshift;	/* texels are addressed as (ty << xshift) + tx, so a texture row spans 1 << xshift pixels */
+	unsigned int xmask, ymask;	/* masks applied to the integer texel coordinates before the lookup */
+};
+
+extern struct pimage pfill_fb;
+extern struct pimage pfill_tex;
+extern uint32_t *pfill_zbuf;
+extern struct pgradient pgrad;
+
+void polyfill_fbheight(int height);
+
+void polyfill(int mode, struct pvertex *verts);
+
+void polyfill_wire(struct pvertex *verts);
+void polyfill_flat(struct pvertex *verts);
+void polyfill_gouraud(struct pvertex *verts);
+void polyfill_tex_wire(struct pvertex *verts);
+void polyfill_tex_flat(struct pvertex *verts);
+void polyfill_tex_gouraud(struct pvertex *verts);
+void polyfill_flat_zbuf(struct pvertex *verts);
+void polyfill_gouraud_zbuf(struct pvertex *verts);
+void polyfill_tex_flat_zbuf(struct pvertex *verts);
+void polyfill_tex_gouraud_zbuf(struct pvertex *verts);
+
+#endif	/* POLYFILL_H_ */
diff --git a/src/3dgfx/polytmpl.h b/src/3dgfx/polytmpl.h
new file mode 100644
index 0000000..2eb711f
--- /dev/null
+++ b/src/3dgfx/polytmpl.h
@@ -0,0 +1,229 @@
+#ifdef _MSC_VER
+#pragma warning (disable: 4101)
+#endif
+
+#if !defined(GOURAUD) && !defined(TEXMAP) && !defined(ZBUF)
+#define NOLERP
+#endif
+
+void POLYFILL(struct pvertex *varr)	/* scan-convert the triangle varr[0..2]; POLYFILL, GOURAUD, TEXMAP and ZBUF are set by the including file */
+{
+	int i, line, top, bot;
+	struct pvertex *v, *vn, *tab;
+	int32_t x, y0, y1, dx, dy, slope, fx, fy;
+	int start, len;
+	g3d_pixel *fbptr, *pptr, color;
+#ifdef GOURAUD
+	int32_t lum, dl, lumslope;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+	int32_t tu, tv, du, dv, uslope, vslope;
+	int tx, ty;
+	g3d_pixel texel;
+#endif
+#ifdef ZBUF
+	int32_t z, dz, zslope;
+	uint32_t *zptr;
+#endif
+
+#if !defined(GOURAUD)
+	/* for flat shading we already know the intensity */
+	color = find_color(varr[0].l, varr[0].l, varr[0].l);
+#endif
+
+	top = pfill_fb.height;
+	bot = 0;
+
+	for(i=0; i<3; i++) {
+		/* scan the edge between the current and next vertex */
+		v = varr + i;
+		vn = VNEXT(v);
+
+		if(vn->y == v->y) continue;	/* horizontal edge: nothing to scan, and dy would be 0 below */
+
+		if(vn->y >= v->y) {
+			/* incrementing Y: left side */
+			tab = left;
+		} else {
+			/* decrementing Y: right side, flip vertices to trace bottom->up */
+			tab = right;
+			v = vn;
+			vn = varr + i;
+		}
+
+		/* calculate edge slope */
+		dx = vn->x - v->x;
+		dy = vn->y - v->y;
+		slope = (dx << 8) / dy;
+
+#ifdef GOURAUD
+		lum = v->l << COLOR_SHIFT;
+		dl = (vn->l << COLOR_SHIFT) - lum;
+		lumslope = (dl << 8) / dy;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+		tu = v->u;
+		tv = v->v;
+		du = vn->u - tu;
+		dv = vn->v - tv;
+		uslope = (du << 8) / dy;
+		vslope = (dv << 8) / dy;
+#endif	/* TEXMAP */
+#ifdef ZBUF
+		z = v->z;
+		dz = vn->z - z;
+		zslope = (dz << 8) / dy;
+#endif	/* ZBUF */
+
+		y0 = (v->y + 0x100) & 0xffffff00;	/* start from the next scanline */
+		fy = y0 - v->y;						/* fractional part before the next scanline */
+		fx = (fy * slope) >> 8;				/* X adjust for the step to the next scanline */
+		x = v->x + fx;						/* adjust X */
+		y1 = vn->y & 0xffffff00;			/* last scanline of the edge <= vn->y */
+
+		/* also adjust other interpolated attributes */
+#ifdef GOURAUD
+		lum += (fy * lumslope) >> 8;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+#ifdef FLTUV
+		tu += uslope * (fy / 256.0f);
+		tv += vslope * (fy / 256.0f);
+#else
+		tu += (fy * uslope) >> 8;
+		tv += (fy * vslope) >> 8;
+#endif
+#endif	/* TEXMAP */
+#ifdef ZBUF
+		z += (fy * zslope) >> 8;
+#endif
+
+		line = y0 >> 8;
+		if(line < top) top = line;
+		if((y1 >> 8) > bot) bot = y1 >> 8;
+
+		if(line > 0) tab += line;	/* edges starting above the screen begin writing at table entry 0 */
+
+		while(line <= (y1 >> 8) && line < pfill_fb.height) {
+			if(line >= 0) {
+				int val = x < 0 ? 0 : x >> 8;	/* clamp X to the framebuffer width */
+				tab->x = val < pfill_fb.width ? val : pfill_fb.width - 1;
+#ifdef GOURAUD
+				tab->l = lum;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+				tab->u = tu;
+				tab->v = tv;
+#endif	/* TEXMAP */
+#ifdef ZBUF
+				tab->z = z;
+#endif
+				tab++;
+			}
+			x += slope;
+#ifdef GOURAUD
+			lum += lumslope;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+			tu += uslope;
+			tv += vslope;
+#endif	/* TEXMAP */
+#ifdef ZBUF
+			z += zslope;
+#endif	/* ZBUF */
+			line++;
+		}
+	}
+
+	if(top < 0) top = 0;
+	if(bot >= pfill_fb.height) bot = pfill_fb.height - 1;
+
+	fbptr = pfill_fb.pixels + top * pfill_fb.width;
+	for(i=top; i<=bot; i++) {
+		start = left[i].x;
+		len = right[i].x - start;
+		/* XXX we probably need more precision in left/right.x */
+
+#ifndef NOLERP
+		dx = len == 0 ? 256 : (len << 8);
+#endif
+
+#ifdef GOURAUD
+		lum = left[i].l;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+		tu = left[i].u;
+		tv = left[i].v;
+#endif	/* TEXMAP */
+#ifdef ZBUF
+		z = left[i].z;
+		zptr = pfill_zbuf + i * pfill_fb.width + start;
+#endif	/* ZBUF */
+
+		pptr = fbptr + start;
+		while(len-- > 0) {
+#if defined(GOURAUD) || defined(TEXMAP)
+			int inten;
+#endif
+#ifdef ZBUF
+			uint32_t cz = z;
+			z += pgrad.dzdx;
+
+			if(cz <= *zptr) {
+				*zptr++ = cz;
+			} else {
+				/* ZFAIL: advance all attributes and continue */
+#ifdef GOURAUD
+				lum += pgrad.dldx;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+				tu += pgrad.dudx;
+				tv += pgrad.dvdx;
+#endif	/* TEXMAP */
+				/* skip pixel */
+				pptr++;
+				zptr++;
+				continue;
+			}
+#endif	/* ZBUF */
+
+#ifdef GOURAUD
+			/* we upped the intensity precision while interpolating the
+			 * edges, now drop the extra bits before using it
+			 */
+			inten = lum < 0 ? 0 : (lum >> COLOR_SHIFT);
+			lum += pgrad.dldx;
+#endif	/* GOURAUD */
+#ifdef TEXMAP
+			tx = (tu >> (16 - pfill_tex.xshift)) & pfill_tex.xmask;
+			ty = (tv >> (16 - pfill_tex.yshift)) & pfill_tex.ymask;
+			texel = pfill_tex.pixels[(ty << pfill_tex.xshift) + tx];
+
+			tu += pgrad.dudx;
+			tv += pgrad.dvdx;
+
+#ifndef GOURAUD
+			/* flat textured: inten is not interpolated, take it from the first vertex */
+			inten = varr[0].l;
+#endif	/* !GOURAUD */
+			/* shade_color() (defined elsewhere) yields the shaded texel. This is
+			 * the final pixel color for textured modes, so the untextured
+			 * find_color() lookup below must not replace it */
+			color = shade_color(texel, inten);
+#endif	/* TEXMAP */
+
+#ifdef DEBUG_OVERDRAW
+			*pptr++ += DEBUG_OVERDRAW;
+#else
+#if defined(GOURAUD) && !defined(TEXMAP)
+			if(inten >= 255) inten = 255;
+			color = find_color(inten, inten, inten);
+#endif
+			*pptr++ = color;
+#endif
+		}
+		fbptr += pfill_fb.width;
+	}
+}
+
+#undef NOLERP
diff --git a/src/cgmath/cgmath.h b/src/cgmath/cgmath.h
new file mode 100644
index 0000000..9f99361
--- /dev/null
+++ b/src/cgmath/cgmath.h
@@ -0,0 +1,280 @@
+/* gph-cmath - C graphics math library
+ * Copyright (C) 2018-2023 John Tsiombikas <nuclear@member.fsf.org>
+ *
+ * This program is free software. Feel free to use, modify, and/or redistribute
+ * it under the terms of the MIT/X11 license. See LICENSE for details.
+ * If you intend to redistribute parts of the code without the LICENSE file
+ * replace this paragraph with the full contents of the LICENSE file.
+ *
+ * Function prefixes signify the data type of their operand(s):
+ * - cgm_v... functions are operations on cgm_vec3 vectors
+ * - cgm_w... functions are operations on cgm_vec4 vectors
+ * - cgm_q... functions are operations on cgm_quat quaternions (w + xi + yj + zk)
+ * - cgm_m... functions are operations on 4x4 matrices (stored as linear 16 float arrays)
+ * - cgm_r... functions are operations on cgm_ray rays
+ *
+ * NOTE: *ALL* matrix arguments are pointers to 16 floats. Even the functions
+ * which operate on 3x3 matrices, actually use the upper 3x3 of a 4x4 matrix,
+ * and still expect an array of 16 floats.
+ *
+ * NOTE: matrices are treated by all operations as column-major, to match OpenGL
+ * conventions, so everything is pretty much transposed.
+*/
+#ifndef CGMATH_H_
+#define CGMATH_H_
+
+#include <math.h>
+#include <string.h>
+
+#define CGM_PI	3.141592653589793
+
+typedef struct {
+	float x, y;
+} cgm_vec2;	/* 2-component float vector */
+
+typedef struct {
+	float x, y, z;
+} cgm_vec3;	/* 3-component float vector, operated on by the cgm_v... functions */
+
+typedef struct {
+	float x, y, z, w;
+} cgm_vec4, cgm_quat;	/* one layout serves as 4-component vector and as quaternion (w + xi + yj + zk) */
+
+typedef struct {
+	cgm_vec3 origin, dir;
+} cgm_ray;	/* ray given by an origin and a direction vector */
+
+typedef enum cgm_euler_mode {	/* values index the axis-order table in cgm_mrotation_euler(); keep both in the same order */
+	CGM_EULER_XYZ,
+	CGM_EULER_XZY,
+	CGM_EULER_YXZ,
+	CGM_EULER_YZX,
+	CGM_EULER_ZXY,
+	CGM_EULER_ZYX,
+	CGM_EULER_ZXZ,
+	CGM_EULER_ZYZ,
+	CGM_EULER_YXY,
+	CGM_EULER_YZY,
+	CGM_EULER_XYX,
+	CGM_EULER_XZX
+} cgm_euler_mode;
+
+#ifdef __cplusplus
+#define CGM_INLINE inline
+
+extern "C" {
+#else
+
+#if (__STDC_VERSION__ >= 199901) || defined(__GNUC__)
+#define CGM_INLINE inline
+#else
+#define CGM_INLINE __inline
+#endif
+
+#endif
+
+/* --- operations on cgm_vec3 --- */
+static CGM_INLINE void cgm_vcons(cgm_vec3 *v, float x, float y, float z);
+static CGM_INLINE cgm_vec3 cgm_vvec(float x, float y, float z);
+
+static CGM_INLINE void cgm_vadd(cgm_vec3 *a, const cgm_vec3 *b);
+static CGM_INLINE void cgm_vadd_scaled(cgm_vec3 *a, const cgm_vec3 *b, float s); /* a+b*s */
+static CGM_INLINE void cgm_vsub(cgm_vec3 *a, const cgm_vec3 *b);
+static CGM_INLINE void cgm_vsub_scaled(cgm_vec3 *a, const cgm_vec3 *b, float s); /* a-b*s */
+static CGM_INLINE void cgm_vmul(cgm_vec3 *a, const cgm_vec3 *b);
+static CGM_INLINE void cgm_vscale(cgm_vec3 *v, float s);
+static CGM_INLINE void cgm_vmul_m4v3(cgm_vec3 *v, const float *m);	/* m4x4 * v */
+static CGM_INLINE void cgm_vmul_v3m4(cgm_vec3 *v, const float *m);	/* v * m4x4 */
+static CGM_INLINE void cgm_vmul_m3v3(cgm_vec3 *v, const float *m);	/* m3x3 * v (m still 16 floats) */
+static CGM_INLINE void cgm_vmul_v3m3(cgm_vec3 *v, const float *m);	/* v * m3x3 (m still 16 floats) */
+
+static CGM_INLINE float cgm_vdot(const cgm_vec3 *a, const cgm_vec3 *b);
+static CGM_INLINE void cgm_vcross(cgm_vec3 *res, const cgm_vec3 *a, const cgm_vec3 *b);
+static CGM_INLINE float cgm_vlength(const cgm_vec3 *v);
+static CGM_INLINE float cgm_vlength_sq(const cgm_vec3 *v);
+static CGM_INLINE float cgm_vdist(const cgm_vec3 *a, const cgm_vec3 *b);
+static CGM_INLINE float cgm_vdist_sq(const cgm_vec3 *a, const cgm_vec3 *b);
+static CGM_INLINE void cgm_vnormalize(cgm_vec3 *v);
+
+static CGM_INLINE void cgm_vreflect(cgm_vec3 *v, const cgm_vec3 *n);
+static CGM_INLINE void cgm_vrefract(cgm_vec3 *v, const cgm_vec3 *n, float ior);
+
+static CGM_INLINE void cgm_vrotate_quat(cgm_vec3 *v, const cgm_quat *q);
+static CGM_INLINE void cgm_vrotate_axis(cgm_vec3 *v, int axis, float angle);
+static CGM_INLINE void cgm_vrotate(cgm_vec3 *v, float angle, float x, float y, float z);
+static CGM_INLINE void cgm_vrotate_euler(cgm_vec3 *v, float a, float b, float c, enum cgm_euler_mode mode);
+
+static CGM_INLINE void cgm_vlerp(cgm_vec3 *res, const cgm_vec3 *a, const cgm_vec3 *b, float t);
+
+#define cgm_velem(vptr, idx)	((&(vptr)->x)[idx])
+
+/* --- operations on cgm_vec4 --- */
+static CGM_INLINE void cgm_wcons(cgm_vec4 *v, float x, float y, float z, float w);
+static CGM_INLINE cgm_vec4 cgm_wvec(float x, float y, float z, float w);
+
+static CGM_INLINE void cgm_wadd(cgm_vec4 *a, const cgm_vec4 *b);
+static CGM_INLINE void cgm_wsub(cgm_vec4 *a, const cgm_vec4 *b);
+static CGM_INLINE void cgm_wmul(cgm_vec4 *a, const cgm_vec4 *b);
+static CGM_INLINE void cgm_wscale(cgm_vec4 *v, float s);
+
+static CGM_INLINE void cgm_wmul_m4v4(cgm_vec4 *v, const float *m);
+static CGM_INLINE void cgm_wmul_v4m4(cgm_vec4 *v, const float *m);
+static CGM_INLINE void cgm_wmul_m34v4(cgm_vec4 *v, const float *m);	/* doesn't affect w */
+static CGM_INLINE void cgm_wmul_v4m43(cgm_vec4 *v, const float *m);	/* doesn't affect w */
+static CGM_INLINE void cgm_wmul_m3v4(cgm_vec4 *v, const float *m); /* (m still 16 floats) */
+static CGM_INLINE void cgm_wmul_v4m3(cgm_vec4 *v, const float *m); /* (m still 16 floats) */
+
+static CGM_INLINE float cgm_wdot(const cgm_vec4 *a, const cgm_vec4 *b);
+
+static CGM_INLINE float cgm_wlength(const cgm_vec4 *v);
+static CGM_INLINE float cgm_wlength_sq(const cgm_vec4 *v);
+static CGM_INLINE float cgm_wdist(const cgm_vec4 *a, const cgm_vec4 *b);
+static CGM_INLINE float cgm_wdist_sq(const cgm_vec4 *a, const cgm_vec4 *b);
+static CGM_INLINE void cgm_wnormalize(cgm_vec4 *v);
+
+static CGM_INLINE void cgm_wlerp(cgm_vec4 *res, const cgm_vec4 *a, const cgm_vec4 *b, float t);
+
+#define cgm_welem(vptr, idx)	((&(vptr)->x)[idx])
+
+/* --- operations on quaternions --- */
+static CGM_INLINE void cgm_qcons(cgm_quat *q, float x, float y, float z, float w);
+
+static CGM_INLINE void cgm_qneg(cgm_quat *q);
+static CGM_INLINE void cgm_qadd(cgm_quat *a, const cgm_quat *b);
+static CGM_INLINE void cgm_qsub(cgm_quat *a, const cgm_quat *b);
+static CGM_INLINE void cgm_qmul(cgm_quat *a, const cgm_quat *b);
+
+static CGM_INLINE float cgm_qlength(const cgm_quat *q);
+static CGM_INLINE float cgm_qlength_sq(const cgm_quat *q);
+static CGM_INLINE void cgm_qnormalize(cgm_quat *q);
+static CGM_INLINE void cgm_qconjugate(cgm_quat *q);
+static CGM_INLINE void cgm_qinvert(cgm_quat *q);
+
+static CGM_INLINE void cgm_qrotation(cgm_quat *q, float angle, float x, float y, float z);
+static CGM_INLINE void cgm_qrotate(cgm_quat *q, float angle, float x, float y, float z);
+
+static CGM_INLINE void cgm_qslerp(cgm_quat *res, const cgm_quat *a, const cgm_quat *b, float t);
+static CGM_INLINE void cgm_qlerp(cgm_quat *res, const cgm_quat *a, const cgm_quat *b, float t);
+
+#define cgm_qelem(qptr, idx)	((&(qptr)->x)[idx])
+
+/* --- operations on matrices --- */
+static CGM_INLINE void cgm_mcopy(float *dest, const float *src);
+static CGM_INLINE void cgm_mzero(float *m);
+static CGM_INLINE void cgm_midentity(float *m);
+
+static CGM_INLINE void cgm_mmul(float *a, const float *b);
+static CGM_INLINE void cgm_mpremul(float *a, const float *b);
+
+static CGM_INLINE void cgm_msubmatrix(float *m, int row, int col);
+static CGM_INLINE void cgm_mupper3(float *m);
+static CGM_INLINE float cgm_msubdet(const float *m, int row, int col);
+static CGM_INLINE float cgm_mcofactor(const float *m, int row, int col);
+static CGM_INLINE float cgm_mdet(const float *m);
+static CGM_INLINE void cgm_mtranspose(float *m);
+static CGM_INLINE void cgm_mcofmatrix(float *m);
+static CGM_INLINE int cgm_minverse(float *m);	/* returns 0 on success, -1 for singular */
+
+static CGM_INLINE void cgm_mtranslation(float *m, float x, float y, float z);
+static CGM_INLINE void cgm_mscaling(float *m, float sx, float sy, float sz);
+static CGM_INLINE void cgm_mrotation_x(float *m, float angle);
+static CGM_INLINE void cgm_mrotation_y(float *m, float angle);
+static CGM_INLINE void cgm_mrotation_z(float *m, float angle);
+static CGM_INLINE void cgm_mrotation_axis(float *m, int idx, float angle);
+static CGM_INLINE void cgm_mrotation(float *m, float angle, float x, float y, float z);
+static CGM_INLINE void cgm_mrotation_euler(float *m, float a, float b, float c, int mode);
+static CGM_INLINE void cgm_mrotation_quat(float *m, const cgm_quat *q);
+
+static CGM_INLINE void cgm_mtranslate(float *m, float x, float y, float z);
+static CGM_INLINE void cgm_mscale(float *m, float sx, float sy, float sz);
+static CGM_INLINE void cgm_mrotate_x(float *m, float angle);
+static CGM_INLINE void cgm_mrotate_y(float *m, float angle);
+static CGM_INLINE void cgm_mrotate_z(float *m, float angle);
+static CGM_INLINE void cgm_mrotate_axis(float *m, int idx, float angle);
+static CGM_INLINE void cgm_mrotate(float *m, float angle, float x, float y, float z);
+static CGM_INLINE void cgm_mrotate_euler(float *m, float a, float b, float c, int mode);
+static CGM_INLINE void cgm_mrotate_quat(float *m, const cgm_quat *q);
+
+static CGM_INLINE void cgm_mpretranslate(float *m, float x, float y, float z);
+static CGM_INLINE void cgm_mprescale(float *m, float sx, float sy, float sz);
+static CGM_INLINE void cgm_mprerotate_x(float *m, float angle);
+static CGM_INLINE void cgm_mprerotate_y(float *m, float angle);
+static CGM_INLINE void cgm_mprerotate_z(float *m, float angle);
+static CGM_INLINE void cgm_mprerotate_axis(float *m, int idx, float angle);
+static CGM_INLINE void cgm_mprerotate(float *m, float angle, float x, float y, float z);
+static CGM_INLINE void cgm_mprerotate_euler(float *m, float a, float b, float c, int mode);
+static CGM_INLINE void cgm_mprerotate_quat(float *m, const cgm_quat *q);
+
+static CGM_INLINE void cgm_mget_translation(const float *m, cgm_vec3 *res);
+static CGM_INLINE void cgm_mget_rotation(const float *m, cgm_quat *res);
+static CGM_INLINE void cgm_mget_scaling(const float *m, cgm_vec3 *res);
+static CGM_INLINE void cgm_mget_frustum_plane(const float *m, int p, cgm_vec4 *res);
+
+static CGM_INLINE void cgm_normalize_plane(cgm_vec4 *p);
+
+static CGM_INLINE void cgm_mlookat(float *m, const cgm_vec3 *pos, const cgm_vec3 *targ,
+		const cgm_vec3 *up);
+static CGM_INLINE void cgm_minv_lookat(float *m, const cgm_vec3 *pos, const cgm_vec3 *targ,
+		const cgm_vec3 *up);
+static CGM_INLINE void cgm_mortho(float *m, float left, float right, float bot, float top,
+		float znear, float zfar);
+static CGM_INLINE void cgm_mfrustum(float *m, float left, float right, float bot, float top,
+		float znear, float zfar);
+static CGM_INLINE void cgm_mperspective(float *m, float vfov, float aspect, float znear, float zfar);
+
+static CGM_INLINE void cgm_mmirror(float *m, float a, float b, float c, float d);
+
+/* --- operations on rays --- */
+static CGM_INLINE void cgm_rcons(cgm_ray *r, float x, float y, float z, float dx, float dy, float dz);
+
+static CGM_INLINE void cgm_rmul_mr(cgm_ray *ray, const float *m);	/* m4x4 * ray */
+static CGM_INLINE void cgm_rmul_rm(cgm_ray *ray, const float *m);	/* ray * m4x4 */
+
+static CGM_INLINE void cgm_rreflect(cgm_ray *ray, const cgm_vec3 *n);
+static CGM_INLINE void cgm_rrefract(cgm_ray *ray, const cgm_vec3 *n, float ior);
+
+/* --- miscellaneous utility functions --- */
+static CGM_INLINE float cgm_deg_to_rad(float deg);
+static CGM_INLINE float cgm_rad_to_deg(float rad);
+
+static CGM_INLINE float cgm_smoothstep(float a, float b, float x);
+static CGM_INLINE float cgm_lerp(float a, float b, float t);
+static CGM_INLINE float cgm_logerp(float a, float b, float t);
+static CGM_INLINE float cgm_bezier(float a, float b, float c, float d, float t);
+static CGM_INLINE float cgm_bspline(float a, float b, float c, float d, float t);
+static CGM_INLINE float cgm_spline(float a, float b, float c, float d, float t);
+
+static CGM_INLINE void cgm_discrand(cgm_vec3 *v, float rad);
+static CGM_INLINE void cgm_sphrand(cgm_vec3 *v, float rad);
+
+static CGM_INLINE void cgm_unproject(cgm_vec3 *res, const cgm_vec3 *norm_scrpos,
+		const float *inv_viewproj);
+static CGM_INLINE void cgm_glu_unproject(float winx, float winy, float winz,
+		const float *view, const float *proj, const int *vp,
+		float *objx, float *objy, float *objz);
+
+static CGM_INLINE void cgm_pick_ray(cgm_ray *ray, float nx, float ny,
+		const float *viewmat, const float *projmat);
+
+static CGM_INLINE void cgm_raypos(cgm_vec3 *p, const cgm_ray *ray, float t);
+
+/* calculate barycentric coordinates of point pt in triangle (a, b, c) */
+static CGM_INLINE void cgm_bary(cgm_vec3 *bary, const cgm_vec3 *a,
+		const cgm_vec3 *b, const cgm_vec3 *c, const cgm_vec3 *pt);
+
+/* convert between unit vectors and spherical coordinates */
+static CGM_INLINE void cgm_uvec_to_sph(float *theta, float *phi, const cgm_vec3 *v);
+static CGM_INLINE void cgm_sph_to_uvec(cgm_vec3 *v, float theta, float phi);
+
+#include "cgmvec3.inl"
+#include "cgmvec4.inl"
+#include "cgmquat.inl"
+#include "cgmmat.inl"
+#include "cgmray.inl"
+#include "cgmmisc.inl"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* CGMATH_H_ */
diff --git a/src/cgmath/cgmmat.inl b/src/cgmath/cgmmat.inl
new file mode 100644
index 0000000..f8168b4
--- /dev/null
+++ b/src/cgmath/cgmmat.inl
@@ -0,0 +1,667 @@
+/* gph-cmath - C graphics math library
+ * Copyright (C) 2018-2023 John Tsiombikas <nuclear@member.fsf.org>
+ *
+ * This program is free software. Feel free to use, modify, and/or redistribute
+ * it under the terms of the MIT/X11 license. See LICENSE for details.
+ * If you intend to redistribute parts of the code without the LICENSE file
+ * replace this paragraph with the full contents of the LICENSE file.
+ */
+static CGM_INLINE void cgm_mcopy(float *dest, const float *src)	/* copy the 16 floats of src into dest */
+{
+	memcpy(dest, src, 16 * sizeof(float));	/* memcpy: dest and src must not overlap */
+}
+
+static CGM_INLINE void cgm_mzero(float *m)	/* set all 16 elements of m to zero */
+{
+	static const float z[16] = {0};	/* read-only source; the explicit initializer keeps the const legal in C++ too */
+	cgm_mcopy(m, z);
+}
+
+static CGM_INLINE void cgm_midentity(float *m)	/* overwrite m with the 4x4 identity matrix */
+{
+	static const float id[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};	/* only ever read, hence const */
+	cgm_mcopy(m, id);
+}
+
+static CGM_INLINE void cgm_mmul(float *a, const float *b)
+{
+	/* a = a * b, treating both arrays as 4 rows of 4 consecutive floats */
+	float prod[16];
+	int row, col;
+
+	for(row=0; row<4; row++) {
+		const float *ar = a + row * 4;
+		for(col=0; col<4; col++) {
+			prod[row * 4 + col] = ar[0] * b[col] + ar[1] * b[4 + col] +
+				ar[2] * b[8 + col] + ar[3] * b[12 + col];
+		}
+	}
+	/* a is only overwritten once every element has been computed */
+	cgm_mcopy(a, prod);
+}
+
+static CGM_INLINE void cgm_mpremul(float *a, const float *b)
+{
+	/* a = b * a, treating both arrays as 4 rows of 4 consecutive floats */
+	float prod[16];
+	int row, col;
+
+	for(row=0; row<4; row++) {
+		const float *br = b + row * 4;
+		for(col=0; col<4; col++) {
+			prod[row * 4 + col] = br[0] * a[col] + br[1] * a[4 + col] +
+				br[2] * a[8 + col] + br[3] * a[12 + col];
+		}
+	}
+	/* a is only overwritten once every element has been computed */
+	cgm_mcopy(a, prod);
+}
+
+static CGM_INLINE void cgm_msubmatrix(float *m, int row, int col)	/* drop one row and one column of m, packing the remaining 3x3 into the upper-left */
+{
+	float orig[16];
+	int i, j, subi, subj;
+
+	cgm_mcopy(orig, m);	/* read from a copy, m is rewritten in place */
+
+	subi = 0;
+	for(i=0; i<4; i++) {
+		if(i == row) continue;
+
+		subj = 0;
+		for(j=0; j<4; j++) {
+			if(j == col) continue;
+
+			m[subi * 4 + subj++] = orig[i * 4 + j];
+		}
+		subi++;
+	}
+
+	cgm_mupper3(m);	/* reset the fourth row and column to their identity values */
+}
+
+static CGM_INLINE void cgm_mupper3(float *m)	/* keep the upper-left 3x3 of m, set the fourth row and column to identity values */
+{
+	m[3] = m[7] = m[11] = m[12] = m[13] = m[14] = 0.0f;
+	m[15] = 1.0f;
+}
+
+static CGM_INLINE float cgm_msubdet(const float *m, int row, int col)	/* determinant of the 3x3 left after removing row and col; m is not modified */
+{
+	float tmp[16];
+	float subdet00, subdet01, subdet02;
+
+	cgm_mcopy(tmp, m);
+	cgm_msubmatrix(tmp, row, col);	/* the 3x3 now sits in tmp[0..2], tmp[4..6], tmp[8..10] */
+
+	subdet00 = tmp[5] * tmp[10] - tmp[6] * tmp[9];
+	subdet01 = tmp[4] * tmp[10] - tmp[6] * tmp[8];
+	subdet02 = tmp[4] * tmp[9] - tmp[5] * tmp[8];
+
+	return tmp[0] * subdet00 - tmp[1] * subdet01 + tmp[2] * subdet02;	/* expansion along tmp[0..2] */
+}
+
+static CGM_INLINE float cgm_mcofactor(const float *m, int row, int col)
+{
+	float minor = cgm_msubdet(m, row, col);	/* determinant with row and col removed */
+	return ((row + col) & 1) != 0 ? -minor : minor;	/* negate when row + col is odd */
+}
+
+static CGM_INLINE float cgm_mdet(const float *m)	/* determinant of the 4x4 matrix m, expanded along m[0..3] */
+{
+	return m[0] * cgm_msubdet(m, 0, 0) - m[1] * cgm_msubdet(m, 0, 1) +
+		m[2] * cgm_msubdet(m, 0, 2) - m[3] * cgm_msubdet(m, 0, 3);
+}
+
+static CGM_INLINE void cgm_mtranspose(float *m)
+{
+	int row, col;	/* transpose in place by swapping each pair across the diagonal */
+	for(row=1; row<4; row++) {	/* row 0 has nothing below the diagonal */
+		for(col=0; col<row; col++) {
+			float *lower = m + row * 4 + col;
+			float *upper = m + col * 4 + row;
+			float saved = *lower;
+			*lower = *upper;
+			*upper = saved;
+		}
+	}
+}
+
+static CGM_INLINE void cgm_mcofmatrix(float *m)
+{
+	/* replace every element of m with its cofactor */
+	float src[16];
+	int idx;
+
+	/* cofactors must come from the unmodified matrix, so work
+	 * from a snapshot while m is being overwritten */
+	cgm_mcopy(src, m);
+	for(idx=0; idx<16; idx++) {
+		m[idx] = cgm_mcofactor(src, idx / 4, idx % 4);
+	}
+}
+
+static CGM_INLINE int cgm_minverse(float *m)	/* invert m in place; returns 0 on success, -1 (m untouched) when the determinant is exactly 0 */
+{
+	int i, j;
+	float tmp[16];
+	float inv_det;
+	float det = cgm_mdet(m);
+	if(det == 0.0f) return -1;
+	inv_det = 1.0f / det;
+
+	cgm_mcopy(tmp, m);	/* cofactors are taken from the original values */
+
+	for(i=0; i<4; i++) {
+		for(j=0; j<4; j++) {
+			m[i * 4 + j] = cgm_mcofactor(tmp, j, i) * inv_det;	/* transposed */
+		}
+	}
+	return 0;
+}
+
+static CGM_INLINE void cgm_mtranslation(float *m, float x, float y, float z)	/* overwrite m with a translation matrix */
+{
+	cgm_midentity(m);
+	m[12] = x;	/* the translation lives in elements 12..14 */
+	m[13] = y;
+	m[14] = z;
+}
+
+static CGM_INLINE void cgm_mscaling(float *m, float sx, float sy, float sz)	/* overwrite m with a scaling matrix */
+{
+	cgm_mzero(m);
+	m[0] = sx;	/* scale factors go on the diagonal */
+	m[5] = sy;
+	m[10] = sz;
+	m[15] = 1.0f;
+}
+
+static CGM_INLINE void cgm_mrotation_x(float *m, float angle)	/* overwrite m with a rotation about the X axis; angle goes straight to sin()/cos() */
+{
+	float sa = sin(angle);
+	float ca = cos(angle);
+
+	cgm_midentity(m);
+	m[5] = ca;
+	m[6] = sa;
+	m[9] = -sa;
+	m[10] = ca;
+}
+
+static CGM_INLINE void cgm_mrotation_y(float *m, float angle)	/* overwrite m with a rotation about the Y axis; angle goes straight to sin()/cos() */
+{
+	float sa = sin(angle);
+	float ca = cos(angle);
+
+	cgm_midentity(m);
+	m[0] = ca;
+	m[2] = -sa;
+	m[8] = sa;
+	m[10] = ca;
+}
+
+static CGM_INLINE void cgm_mrotation_z(float *m, float angle)	/* overwrite m with a rotation about the Z axis; angle goes straight to sin()/cos() */
+{
+	float sa = sin(angle);
+	float ca = cos(angle);
+
+	cgm_midentity(m);
+	m[0] = ca;
+	m[1] = sa;
+	m[4] = -sa;
+	m[5] = ca;
+}
+
+static CGM_INLINE void cgm_mrotation_axis(float *m, int idx, float angle)
+{
+	/* idx picks the principal axis: 0 = X, 1 = Y, 2 = Z.
+	 * Any other value is ignored and m is left as it was.
+	 */
+	if(idx == 0) {
+		cgm_mrotation_x(m, angle);
+	} else if(idx == 1) {
+		cgm_mrotation_y(m, angle);
+	} else if(idx == 2) {
+		cgm_mrotation_z(m, angle);
+	}
+	/* no final else: unknown axis indices are a no-op */
+}
+
+static CGM_INLINE void cgm_mrotation(float *m, float angle, float x, float y, float z)	/* overwrite m with a rotation by angle about the axis (x, y, z) */
+{
+	float sa = sin(angle);
+	float ca = cos(angle);
+	float invca = 1.0f - ca;
+	float xsq = x * x;	/* the axis is used as given, nothing here normalizes it; presumably callers pass a unit vector */
+	float ysq = y * y;
+	float zsq = z * z;
+
+	cgm_mzero(m);
+	m[15] = 1.0f;
+
+	m[0] = xsq + (1.0f - xsq) * ca;
+	m[4] = x * y * invca - z * sa;
+	m[8] = x * z * invca + y * sa;
+
+	m[1] = x * y * invca + z * sa;
+	m[5] = ysq + (1.0f - ysq) * ca;
+	m[9] = y * z * invca - x * sa;
+
+	m[2] = x * z * invca - y * sa;
+	m[6] = y * z * invca + x * sa;
+	m[10] = zsq + (1.0f - zsq) * ca;
+}
+
+static CGM_INLINE void cgm_mrotation_euler(float *m, float a, float b, float c, int mode)	/* overwrite m with the rotation for three euler angles; mode is a cgm_euler_mode value */
+{
+	/* this array must match the cgm_euler_mode enum */
+	static const int axis[][3] = {
+		{0, 1, 2}, {0, 2, 1},
+		{1, 0, 2}, {1, 2, 0},
+		{2, 0, 1}, {2, 1, 0},
+		{2, 0, 2}, {2, 1, 2},
+		{1, 0, 1}, {1, 2, 1},
+		{0, 1, 0}, {0, 2, 0}
+	};
+
+	float ma[16], mb[16];
+	cgm_mrotation_axis(ma, axis[mode][0], a);	/* mode is not range-checked before indexing the table */
+	cgm_mrotation_axis(mb, axis[mode][1], b);
+	cgm_mrotation_axis(m, axis[mode][2], c);
+	cgm_mmul(m, mb);	/* m starts as the c rotation, then is multiplied by the b and a rotations in turn */
+	cgm_mmul(m, ma);
+}
+
+static CGM_INLINE void cgm_mrotation_quat(float *m, const cgm_quat *q)	/* overwrite all 16 elements of m with the rotation matrix of q */
+{
+	float xsq2 = 2.0f * q->x * q->x;	/* q is used as given, nothing here normalizes it; presumably a unit quaternion */
+	float ysq2 = 2.0f * q->y * q->y;
+	float zsq2 = 2.0f * q->z * q->z;
+	float sx = 1.0f - ysq2 - zsq2;
+	float sy = 1.0f - xsq2 - zsq2;
+	float sz = 1.0f - xsq2 - ysq2;
+
+	m[3] = m[7] = m[11] = m[12] = m[13] = m[14] = 0.0f;
+	m[15] = 1.0f;
+
+	m[0] = sx;
+	m[1] = 2.0f * q->x * q->y + 2.0f * q->w * q->z;
+	m[2] = 2.0f * q->z * q->x - 2.0f * q->w * q->y;
+	m[4] = 2.0f * q->x * q->y - 2.0f * q->w * q->z;
+	m[5] = sy;
+	m[6] = 2.0f * q->y * q->z + 2.0f * q->w * q->x;
+	m[8] = 2.0f * q->z * q->x + 2.0f * q->w * q->y;
+	m[9] = 2.0f * q->y * q->z - 2.0f * q->w * q->x;
+	m[10] = sz;
+}
+
+static CGM_INLINE void cgm_mtranslate(float *m, float x, float y, float z)	/* combine m with a translation via cgm_mmul */
+{
+	float tm[16];	/* temporary translation matrix */
+	cgm_mtranslation(tm, x, y, z);
+	cgm_mmul(m, tm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mscale(float *m, float sx, float sy, float sz)	/* combine m with a scaling via cgm_mmul */
+{
+	float sm[16];	/* temporary scaling matrix */
+	cgm_mscaling(sm, sx, sy, sz);
+	cgm_mmul(m, sm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mrotate_x(float *m, float angle)	/* combine m with an X-axis rotation via cgm_mmul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_x(rm, angle);
+	cgm_mmul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mrotate_y(float *m, float angle)	/* combine m with a Y-axis rotation via cgm_mmul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_y(rm, angle);
+	cgm_mmul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mrotate_z(float *m, float angle)	/* combine m with a Z-axis rotation via cgm_mmul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_z(rm, angle);
+	cgm_mmul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mrotate_axis(float *m, int idx, float angle)	/* combine m with a principal-axis rotation (idx 0/1/2 = X/Y/Z) via cgm_mmul */
+{
+	float rm[16];	/* NOTE(review): cgm_mrotation_axis leaves rm unwritten for idx outside 0..2, so it would be used uninitialized */
+	cgm_mrotation_axis(rm, idx, angle);
+	cgm_mmul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mrotate(float *m, float angle, float x, float y, float z)	/* combine m with an axis-angle rotation via cgm_mmul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation(rm, angle, x, y, z);
+	cgm_mmul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mrotate_euler(float *m, float a, float b, float c, int mode)	/* combine m with an euler-angle rotation via cgm_mmul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_euler(rm, a, b, c, mode);
+	cgm_mmul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mrotate_quat(float *m, const cgm_quat *q)	/* combine m with the rotation of quaternion q via cgm_mmul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_quat(rm, q);
+	cgm_mmul(m, rm);	/* the product is stored back into m */
+}
+
+
+static CGM_INLINE void cgm_mpretranslate(float *m, float x, float y, float z)	/* combine m with a translation via cgm_mpremul */
+{
+	float tm[16];	/* temporary translation matrix */
+	cgm_mtranslation(tm, x, y, z);
+	cgm_mpremul(m, tm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprescale(float *m, float sx, float sy, float sz)	/* combine m with a scaling via cgm_mpremul */
+{
+	float sm[16];	/* temporary scaling matrix */
+	cgm_mscaling(sm, sx, sy, sz);
+	cgm_mpremul(m, sm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprerotate_x(float *m, float angle)	/* combine m with an X-axis rotation via cgm_mpremul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_x(rm, angle);
+	cgm_mpremul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprerotate_y(float *m, float angle)	/* combine m with a Y-axis rotation via cgm_mpremul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_y(rm, angle);
+	cgm_mpremul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprerotate_z(float *m, float angle)	/* combine m with a Z-axis rotation via cgm_mpremul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_z(rm, angle);
+	cgm_mpremul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprerotate_axis(float *m, int idx, float angle)	/* combine m with a principal-axis rotation (idx 0/1/2 = X/Y/Z) via cgm_mpremul */
+{
+	float rm[16];	/* NOTE(review): cgm_mrotation_axis leaves rm unwritten for idx outside 0..2, so it would be used uninitialized */
+	cgm_mrotation_axis(rm, idx, angle);
+	cgm_mpremul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprerotate(float *m, float angle, float x, float y, float z)	/* combine m with an axis-angle rotation via cgm_mpremul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation(rm, angle, x, y, z);
+	cgm_mpremul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprerotate_euler(float *m, float a, float b, float c, int mode)	/* combine m with an euler-angle rotation via cgm_mpremul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_euler(rm, a, b, c, mode);
+	cgm_mpremul(m, rm);	/* the product is stored back into m */
+}
+
+static CGM_INLINE void cgm_mprerotate_quat(float *m, const cgm_quat *q)	/* combine m with the rotation of quaternion q via cgm_mpremul */
+{
+	float rm[16];	/* temporary rotation matrix */
+	cgm_mrotation_quat(rm, q);
+	cgm_mpremul(m, rm);	/* the product is stored back into m */
+}
+
+
+static CGM_INLINE void cgm_mget_translation(const float *m, cgm_vec3 *res)	/* read the translation part of m into res */
+{
+	res->x = m[12];	/* same elements that cgm_mtranslation writes */
+	res->y = m[13];
+	res->z = m[14];
+}
+
+/* Extract the rotation in the upper 3x3 of m as a quaternion (the inverse of
+ * cgm_mrotation_quat). Algorithm from Ken Shoemake's 1987 SIGGRAPH course notes,
+ * "Quaternion Calculus and Fast Animation"; adapted from:
+ * http://www.geometrictools.com/LibMathematics/Algebra/Wm5Quaternion.inl */
+static CGM_INLINE void cgm_mget_rotation(const float *m, cgm_quat *res)
+{
+	static const int next[3] = {1, 2, 0};
+	float quat[4];	/* w, x, y, z; lets the branch below address components by index */
+	int i, j, k;
+
+	float trace = m[0] + m[5] + m[10];
+	float root;
+
+	if(trace > 0.0f) {
+		/* |w| > 1/2 */
+		root = sqrt(trace + 1.0f);	/* 2w */
+		res->w = 0.5f * root;
+		root = 0.5f / root;	/* 1 / 4w */
+		res->x = (m[6] - m[9]) * root;
+		res->y = (m[8] - m[2]) * root;
+		res->z = (m[1] - m[4]) * root;
+	} else {
+		/* |w| <= 1/2 */
+		i = 0;	/* i = index of the largest diagonal element, i.e. the largest of x, y, z */
+		if(m[5] > m[0]) {
+			i = 1;
+		}
+		if(m[10] > m[i * 4 + i]) {
+			i = 2;
+		}
+		j = next[i];
+		k = next[j];
+
+		root = sqrt(m[i * 4 + i] - m[j * 4 + j] - m[k * 4 + k] + 1.0f);	/* 2 * quat[i + 1] */
+		quat[i + 1] = 0.5f * root;
+		root = 0.5f / root;
+		quat[0] = (m[j * 4 + k] - m[k * 4 + j]) * root;	/* antisymmetric pair carries w, e.g. m[6] - m[9] = 4wx */
+		quat[j + 1] = (m[i * 4 + j] + m[j * 4 + i]) * root;	/* symmetric pairs carry the products, e.g. m[1] + m[4] = 4xy */
+		quat[k + 1] = (m[i * 4 + k] + m[k * 4 + i]) * root;
+		res->w = quat[0];
+		res->x = quat[1];
+		res->y = quat[2];
+		res->z = quat[3];
+	}
+}
+
+static CGM_INLINE void cgm_mget_scaling(const float *m, cgm_vec3 *res)	/* per-axis scale: lengths of the 3-vectors at elements (0,4,8), (1,5,9), (2,6,10) */
+{
+	res->x = sqrt(m[0] * m[0] + m[4] * m[4] + m[8] * m[8]);
+	res->y = sqrt(m[1] * m[1] + m[5] * m[5] + m[9] * m[9]);
+	res->z = sqrt(m[2] * m[2] + m[6] * m[6] + m[10] * m[10]);
+}
+
+static CGM_INLINE void cgm_mget_frustum_plane(const float *m, int p, cgm_vec4 *res)	/* plane p (0-5) of m as (x, y, z, w); presumably left/right/bottom/top/near/far - TODO confirm */
+{
+	switch(p) {
+	case 0:
+		res->x = m[3] + m[0];
+		res->y = m[7] + m[4];
+		res->z = m[11] + m[8];
+		res->w = m[15] + m[12];
+		break;
+
+	case 1:
+		res->x = m[3] - m[0];
+		res->y = m[7] - m[4];
+		res->z = m[11] - m[8];
+		res->w = m[15] - m[12];
+		break;
+
+	case 2:
+		res->x = m[3] + m[1];
+		res->y = m[7] + m[5];
+		res->z = m[11] + m[9];
+		res->w = m[15] + m[13];
+		break;
+
+	case 3:
+		res->x = m[3] - m[1];
+		res->y = m[7] - m[5];
+		res->z = m[11] - m[9];
+		res->w = m[15] - m[13];
+		break;
+
+	case 4:
+		res->x = m[3] + m[2];
+		res->y = m[7] + m[6];
+		res->z = m[11] + m[10];
+		res->w = m[15] + m[14];
+		break;
+
+	case 5:
+		res->x = m[3] - m[2];
+		res->y = m[7] - m[6];
+		res->z = m[11] - m[10];
+		res->w = m[15] - m[14];
+		break;
+
+	default:	/* any other p leaves res unmodified */
+		break;
+	}
+}
+
+static CGM_INLINE void cgm_normalize_plane(cgm_vec4 *p)	/* scale all four components of p by 1 / length of its (x, y, z) part */
+{
+	float len = cgm_vlength((cgm_vec3*)p);	/* reads the leading x, y, z of the vec4 through a vec3 pointer */
+	if(len != 0.0f) {	/* a zero-length normal is left unchanged */
+		float s = 1.0f / len;
+		p->x *= s;
+		p->y *= s;
+		p->z *= s;
+		p->w *= s;
+	}
+}
+
+static CGM_INLINE void cgm_mlookat(float *m, const cgm_vec3 *pos, const cgm_vec3 *targ,
+		const cgm_vec3 *up)	/* overwrite m with a frame at pos whose -Z axis points towards targ */
+{
+	float trans[16];
+	cgm_vec3 dir = *targ, right, vup;
+
+	cgm_vsub(&dir, pos);	/* dir = normalized direction from pos to targ */
+	cgm_vnormalize(&dir);
+	cgm_vcross(&right, &dir, up);
+	cgm_vnormalize(&right);
+	cgm_vcross(&vup, &right, &dir);	/* recompute up from right and dir */
+	cgm_vnormalize(&vup);
+
+	cgm_midentity(m);
+	m[0] = right.x;	/* basis vectors stored in elements 0..2, 4..6, 8..10 */
+	m[1] = right.y;
+	m[2] = right.z;
+	m[4] = vup.x;
+	m[5] = vup.y;
+	m[6] = vup.z;
+	m[8] = -dir.x;
+	m[9] = -dir.y;
+	m[10] = -dir.z;
+
+	cgm_mtranslation(trans, pos->x, pos->y, pos->z);
+	cgm_mmul(m, trans);
+}
+
+static CGM_INLINE void cgm_minv_lookat(float *m, const cgm_vec3 *pos, const cgm_vec3 *targ,
+		const cgm_vec3 *up)	/* overwrite m with translation by -pos combined with the transposed look-at basis */
+{
+	float rot[16];
+	cgm_vec3 dir = *targ, right, vup;
+
+	cgm_vsub(&dir, pos);	/* dir = normalized direction from pos to targ */
+	cgm_vnormalize(&dir);
+	cgm_vcross(&right, &dir, up);
+	cgm_vnormalize(&right);
+	cgm_vcross(&vup, &right, &dir);	/* recompute up from right and dir */
+	cgm_vnormalize(&vup);
+
+	cgm_midentity(rot);
+	rot[0] = right.x;	/* same basis as cgm_mlookat, stored transposed */
+	rot[4] = right.y;
+	rot[8] = right.z;
+	rot[1] = vup.x;
+	rot[5] = vup.y;
+	rot[9] = vup.z;
+	rot[2] = -dir.x;
+	rot[6] = -dir.y;
+	rot[10] = -dir.z;
+
+	cgm_mtranslation(m, -pos->x, -pos->y, -pos->z);
+	cgm_mmul(m, rot);
+}
+
+static CGM_INLINE void cgm_mortho(float *m, float left, float right, float bot, float top,
+		float znear, float zfar)	/* overwrite m with an orthographic projection of the given box */
+{
+	float dx = right - left;	/* no check for an empty range: dx, dy or dz == 0 divides by zero below */
+	float dy = top - bot;
+	float dz = zfar - znear;
+
+	cgm_midentity(m);
+	m[0] = 2.0f / dx;
+	m[5] = 2.0f / dy;
+	m[10] = -2.0f / dz;
+	m[12] = -(right + left) / dx;
+	m[13] = -(top + bot) / dy;
+	m[14] = -(zfar + znear) / dz;
+}
+
+static CGM_INLINE void cgm_mfrustum(float *m, float left, float right, float bot, float top,
+		float znear, float zfar)	/* overwrite m with a perspective projection for the given frustum */
+{
+	float dx = right - left;	/* no check for an empty range: dx, dy or dz == 0 divides by zero below */
+	float dy = top - bot;
+	float dz = zfar - znear;
+
+	cgm_mzero(m);
+	m[0] = 2.0f * znear / dx;
+	m[5] = 2.0f * znear / dy;
+	m[8] = (right + left) / dx;
+	m[9] = (top + bot) / dy;
+	m[10] = -(zfar + znear) / dz;
+	m[14] = -2.0f * zfar * znear / dz;
+	m[11] = -1.0f;
+}
+
+static CGM_INLINE void cgm_mperspective(float *m, float vfov, float aspect, float znear, float zfar)	/* overwrite m with a perspective projection; vfov is passed to tan(), i.e. in radians */
+{
+	float s = 1.0f / (float)tan(vfov / 2.0f);
+	float range = znear - zfar;	/* no check for znear == zfar or aspect == 0 (division by zero below) */
+
+	cgm_mzero(m);
+	m[0] = s / aspect;
+	m[5] = s;
+	m[10] = (znear + zfar) / range;
+	m[14] = 2.0f * znear * zfar / range;
+	m[11] = -1.0f;
+}
+
+static CGM_INLINE void cgm_mmirror(float *m, float a, float b, float c, float d)	/* overwrite all 16 elements of m with a reflection built from plane coefficients (a, b, c, d) */
+{
+	m[0] = 1.0f - 2.0f * a * a;	/* (a, b, c) is used as given; presumably expected to be a unit normal - TODO confirm */
+	m[5] = 1.0f - 2.0f * b * b;
+	m[10] = 1.0f - 2.0f * c * c;
+	m[15] = 1.0f;
+
+	m[1] = m[4] = -2.0f * a * b;
+	m[2] = m[8] = -2.0f * a * c;
+	m[6] = m[9] = -2.0f * b * c;
+
+	m[12] = -2.0f * a * d;
+	m[13] = -2.0f * b * d;
+	m[14] = -2.0f * c * d;
+
+	m[3] = m[7] = m[11] = 0.0f;
+}
diff --git a/src/cgmath/cgmmisc.inl b/src/cgmath/cgmmisc.inl
new file mode 100644
index 0000000..a3cdfb5
--- /dev/null
+++ b/src/cgmath/cgmmisc.inl
@@ -0,0 +1,211 @@
+/* gph-cmath - C graphics math library
+ * Copyright (C) 2018-2023 John Tsiombikas <nuclear@member.fsf.org>
+ *
+ * This program is free software. Feel free to use, modify, and/or redistribute
+ * it under the terms of the MIT/X11 license. See LICENSE for details.
+ * If you intend to redistribute parts of the code without the LICENSE file
+ * replace this paragraph with the full contents of the LICENSE file.
+ */
+#include <stdlib.h>
+
+static CGM_INLINE float cgm_deg_to_rad(float deg)
+{
+	return M_PI * deg / 180.0f;
+}
+
+static CGM_INLINE float cgm_rad_to_deg(float rad)
+{
+	return 180.0f * rad / M_PI;
+}
+
+static CGM_INLINE float cgm_smoothstep(float a, float b, float x)
+{
+	if(x < a) return 0.0f;
+	if(x >= b) return 1.0f;
+
+	x = (x - a) / (b - a);
+	return x * x * (3.0f - 2.0f * x);
+}
+
+static CGM_INLINE float cgm_lerp(float a, float b, float t)
+{
+	return a + (b - a) * t;
+}
+
+static CGM_INLINE float cgm_logerp(float a, float b, float t)
+{
+	if(a == 0.0f) return 0.0f;
+	return a * pow(b / a, t);
+}
+
+static CGM_INLINE float cgm_bezier(float a, float b, float c, float d, float t)
+{
+	float omt, omt3, t3, f;
+	t3 = t * t * t;
+	omt = 1.0f - t;
+	omt3 = omt * omt * omt;
+	f = 3.0f * t * omt;
+
+	return (a * omt3) + (b * f * omt) + (c * f * t) + (d * t3);
+}
+
+static CGM_INLINE float cgm_bspline(float a, float b, float c, float d, float t)
+{
+	static const float mat[] = {
+		-1, 3, -3, 1,
+		3, -6, 0, 4,
+		-3, 3, 3, 1,
+		1, 0, 0, 0
+	};
+	cgm_vec4 tmp, qfact;
+	float tsq = t * t;
+
+	cgm_wcons(&qfact, tsq * t, tsq, t, 1.0f);
+	cgm_wcons(&tmp, a, b, c, d);
+	cgm_wmul_m4v4(&tmp, mat);
+	cgm_wscale(&tmp, 1.0f / 6.0f);
+	return cgm_wdot(&tmp, &qfact);
+}
+
+/* Catmull-Rom spline: yields b at t=0 and c at t=1; a and d are the outer control points. */
+static CGM_INLINE float cgm_spline(float a, float b, float c, float d, float t)
+{
+	static const float mat[] = {
+		-1, 2, -1, 0,
+		3, -5, 0, 2,
+		-3, 4, 1, 0,
+		1, -1, 0, 0
+	};
+	cgm_vec4 tmp, qfact;
+	float tsq = t * t;
+	cgm_wcons(&qfact, tsq * t, tsq, t, 1.0f);	/* powers of t: t^3, t^2, t, 1 */
+	cgm_wcons(&tmp, a, b, c, d);
+	cgm_wmul_m4v4(&tmp, mat);	/* polynomial coefficients, not yet scaled */
+	cgm_wscale(&tmp, 0.5f);		/* this basis needs 1/2; the old 1/6 gave b/3 at t=0 */
+	return cgm_wdot(&tmp, &qfact);
+}
+
+static CGM_INLINE void cgm_discrand(cgm_vec3 *pt, float rad)
+{
+	float theta = 2.0f * M_PI * (float)rand() / RAND_MAX;
+	float r = sqrt((float)rand() / RAND_MAX) * rad;
+	pt->x = cos(theta) * r;
+	pt->y = sin(theta) * r;
+	pt->z = 0.0f;
+}
+
+static CGM_INLINE void cgm_sphrand(cgm_vec3 *pt, float rad)
+{
+	float u, v, theta, phi;
+
+	u = (float)rand() / RAND_MAX;
+	v = (float)rand() / RAND_MAX;
+
+	theta = 2.0f * M_PI * u;
+	phi = acos(2.0f * v - 1.0f);
+
+	pt->x = cos(theta) * sin(phi) * rad;
+	pt->y = sin(theta) * sin(phi) * rad;
+	pt->z = cos(phi) * rad;
+}
+
+static CGM_INLINE void cgm_unproject(cgm_vec3 *res, const cgm_vec3 *norm_scrpos,
+		const float *inv_viewproj)
+{
+	cgm_vec4 pos;
+
+	pos.x = 2.0f * norm_scrpos->x - 1.0f;
+	pos.y = 2.0f * norm_scrpos->y - 1.0f;
+	pos.z = 2.0f * norm_scrpos->z - 1.0f;
+	pos.w = 1.0f;
+
+	cgm_wmul_m4v4(&pos, inv_viewproj);
+
+	res->x = pos.x / pos.w;
+	res->y = pos.y / pos.w;
+	res->z = pos.z / pos.w;
+}
+
+/* gluUnProject-style helper: maps window coordinates (winx, winy, winz) back through
+ * the inverse of the combined view/proj matrix. vp holds 4 ints: x, y, width, height. */
+static CGM_INLINE void cgm_glu_unproject(float winx, float winy, float winz,
+		const float *view, const float *proj, const int *vp,
+		float *objx, float *objy, float *objz)
+{
+	cgm_vec3 npos, res;
+	float inv_pv[16];
+
+	cgm_mcopy(inv_pv, view);
+	cgm_mmul(inv_pv, proj);
+	cgm_minverse(inv_pv);
+	/* normalize against the viewport; the height is vp[3] (vp[4] read past the array) */
+	npos.x = (winx - vp[0]) / vp[2];
+	npos.y = (winy - vp[1]) / vp[3];
+	npos.z = winz;
+	cgm_unproject(&res, &npos, inv_pv);
+
+	*objx = res.x;
+	*objy = res.y;
+	*objz = res.z;
+}
+
+static CGM_INLINE void cgm_pick_ray(cgm_ray *ray, float nx, float ny,
+		const float *viewmat, const float *projmat)
+{
+	cgm_vec3 npos, farpt;
+	float inv_pv[16];
+
+	cgm_mcopy(inv_pv, viewmat);
+	cgm_mmul(inv_pv, projmat);
+	cgm_minverse(inv_pv);
+
+	cgm_vcons(&npos, nx, ny, 0.0f);
+	cgm_unproject(&ray->origin, &npos, inv_pv);
+	npos.z = 1.0f;
+	cgm_unproject(&farpt, &npos, inv_pv);
+
+	ray->dir.x = farpt.x - ray->origin.x;
+	ray->dir.y = farpt.y - ray->origin.y;
+	ray->dir.z = farpt.z - ray->origin.z;
+}
+
+static CGM_INLINE void cgm_raypos(cgm_vec3 *p, const cgm_ray *ray, float t)
+{
+	p->x = ray->origin.x + ray->dir.x * t;
+	p->y = ray->origin.y + ray->dir.y * t;
+	p->z = ray->origin.z + ray->dir.z * t;
+}
+
+static CGM_INLINE void cgm_bary(cgm_vec3 *bary, const cgm_vec3 *a,
+		const cgm_vec3 *b, const cgm_vec3 *c, const cgm_vec3 *pt)
+{
+	float d00, d01, d11, d20, d21, denom;
+	cgm_vec3 v0 = *b, v1 = *c, v2 = *pt;
+
+	cgm_vsub(&v0, a);
+	cgm_vsub(&v1, a);
+	cgm_vsub(&v2, a);
+
+	d00 = cgm_vdot(&v0, &v0);
+	d01 = cgm_vdot(&v0, &v1);
+	d11 = cgm_vdot(&v1, &v1);
+	d20 = cgm_vdot(&v2, &v0);
+	d21 = cgm_vdot(&v2, &v1);
+	denom = d00 * d11 - d01 * d01;
+
+	bary->y = (d11 * d20 - d01 * d21) / denom;
+	bary->z = (d00 * d21 - d01 * d20) / denom;
+	bary->x = 1.0f - bary->y - bary->z;
+}
+
+static CGM_INLINE void cgm_uvec_to_sph(float *theta, float *phi, const cgm_vec3 *v)
+{
+	*theta = atan2(v->z, v->x);
+	*phi = acos(v->y);
+}
+
+static CGM_INLINE void cgm_sph_to_uvec(cgm_vec3 *v, float theta, float phi)
+{	/* NOTE(review): not the inverse of cgm_uvec_to_sph (theta = atan2(z, x), phi = acos(y)) - confirm the intended convention */
+	v->x = sin(theta) * cos(phi);
+	v->y = sin(phi);	/* phi is measured from the xz plane here: phi = 0 gives y = 0 */
+	v->z = cos(theta) * cos(phi);
+}
diff --git a/src/cgmath/cgmquat.inl b/src/cgmath/cgmquat.inl
new file mode 100644
index 0000000..72b18a3
--- /dev/null
+++ b/src/cgmath/cgmquat.inl
@@ -0,0 +1,159 @@
+/* gph-cmath - C graphics math library
+ * Copyright (C) 2018-2023 John Tsiombikas <nuclear@member.fsf.org>
+ *
+ * This program is free software. Feel free to use, modify, and/or redistribute
+ * it under the terms of the MIT/X11 license. See LICENSE for details.
+ * If you intend to redistribute parts of the code without the LICENSE file
+ * replace this paragraph with the full contents of the LICENSE file.
+ */
+static CGM_INLINE void cgm_qcons(cgm_quat *q, float x, float y, float z, float w)
+{
+	q->x = x;
+	q->y = y;
+	q->z = z;
+	q->w = w;
+}
+
+
+static CGM_INLINE void cgm_qneg(cgm_quat *q)
+{
+	q->x = -q->x;
+	q->y = -q->y;
+	q->z = -q->z;
+	q->w = -q->w;
+}
+
+static CGM_INLINE void cgm_qadd(cgm_quat *a, const cgm_quat *b)
+{
+	a->x += b->x;
+	a->y += b->y;
+	a->z += b->z;
+	a->w += b->w;
+}
+
+static CGM_INLINE void cgm_qsub(cgm_quat *a, const cgm_quat *b)
+{
+	a->x -= b->x;
+	a->y -= b->y;
+	a->z -= b->z;
+	a->w -= b->w;
+}
+
+/* In-place quaternion product a = a * b; every input is read before a is overwritten. */
+static CGM_INLINE void cgm_qmul(cgm_quat *a, const cgm_quat *b)
+{
+	float vdot = a->x * b->x + a->y * b->y + a->z * b->z;
+	/* cross product of the two vector parts, written out per component */
+	float cx = a->y * b->z - a->z * b->y;
+	float cy = a->z * b->x - a->x * b->z;
+	float cz = a->x * b->y - a->y * b->x;
+	float nx = a->w * b->x + b->w * a->x + cx;
+	float ny = a->w * b->y + b->w * a->y + cy;
+	float nz = a->w * b->z + b->w * a->z + cz;
+	a->w = a->w * b->w - vdot;
+	a->x = nx;
+	a->y = ny;
+	a->z = nz;
+}
+
+static CGM_INLINE float cgm_qlength(const cgm_quat *q)
+{
+	return sqrt(q->x * q->x + q->y * q->y + q->z * q->z + q->w * q->w);
+}
+
+static CGM_INLINE float cgm_qlength_sq(const cgm_quat *q)
+{
+	return q->x * q->x + q->y * q->y + q->z * q->z + q->w * q->w;
+}
+
+static CGM_INLINE void cgm_qnormalize(cgm_quat *q)
+{
+	float len = cgm_qlength(q);
+	if(len != 0.0f) {
+		float s = 1.0f / len;
+		q->x *= s;
+		q->y *= s;
+		q->z *= s;
+		q->w *= s;
+	}
+}
+
+static CGM_INLINE void cgm_qconjugate(cgm_quat *q)
+{
+	q->x = -q->x;
+	q->y = -q->y;
+	q->z = -q->z;
+}
+
+static CGM_INLINE void cgm_qinvert(cgm_quat *q)
+{
+	float len_sq = cgm_qlength_sq(q);
+	cgm_qconjugate(q);
+	if(len_sq != 0.0f) {
+		float s = 1.0f / len_sq;
+		q->x *= s;
+		q->y *= s;
+		q->z *= s;
+		q->w *= s;
+	}
+}
+
+static CGM_INLINE void cgm_qrotation(cgm_quat *q, float angle, float x, float y, float z)
+{
+	float hangle = angle * 0.5f;
+	float sin_ha = sin(hangle);
+	q->w = cos(hangle);
+	q->x = x * sin_ha;
+	q->y = y * sin_ha;
+	q->z = z * sin_ha;
+}
+
+static CGM_INLINE void cgm_qrotate(cgm_quat *q, float angle, float x, float y, float z)
+{
+	cgm_quat qrot;
+	cgm_qrotation(&qrot, angle, x, y, z);
+	cgm_qmul(q, &qrot);
+}
+
+/* Spherical interpolation between quat1 (t=0, possibly sign-flipped) and q2 (t=1), into res. */
+static CGM_INLINE void cgm_qslerp(cgm_quat *res, const cgm_quat *quat1, const cgm_quat *q2, float t)
+{
+	float angle, dot, a, b, sin_angle;
+	cgm_quat q1 = *quat1;
+
+	dot = quat1->x * q2->x + quat1->y * q2->y + quat1->z * q2->z + quat1->w * q2->w;
+	if(dot < 0.0f) {
+		/* make sure we interpolate across the shortest arc */
+		cgm_qneg(&q1);
+		dot = -dot;
+	}
+
+	/* clamp dot to [-1, 1] in order to avoid domain errors in acos due to
+	 * floating point imprecisions
+	 */
+	if(dot < -1.0f) dot = -1.0f;
+	if(dot > 1.0f) dot = 1.0f;
+	angle = acos(dot);
+	sin_angle = sin(angle);
+	if(sin_angle == 0.0f) {
+		/* use linear interpolation to avoid div/zero; the weights have to be
+		 * (1 - t, t): the previous a = 1 produced q1 + t * q2 instead */
+		a = 1.0f - t;
+		b = t;
+	} else {
+		a = sin((1.0f - t) * angle) / sin_angle;
+		b = sin(t * angle) / sin_angle;
+	}
+	res->x = q1.x * a + q2->x * b;
+	res->y = q1.y * a + q2->y * b;
+	res->z = q1.z * a + q2->z * b;
+	res->w = q1.w * a + q2->w * b;
+}
+
+static CGM_INLINE void cgm_qlerp(cgm_quat *res, const cgm_quat *a, const cgm_quat *b, float t)
+{
+	res->x = a->x + (b->x - a->x) * t;
+	res->y = a->y + (b->y - a->y) * t;
+	res->z = a->z + (b->z - a->z) * t;
+	res->w = a->w + (b->w - a->w) * t;
+}
diff --git a/src/cgmath/cgmray.inl b/src/cgmath/cgmray.inl
new file mode 100644
index 0000000..b9a7dfa
--- /dev/null
+++ b/src/cgmath/cgmray.inl
@@ -0,0 +1,39 @@
+/* gph-cmath - C graphics math library
+ * Copyright (C) 2018-2023 John Tsiombikas <nuclear@member.fsf.org>
+ *
+ * This program is free software. Feel free to use, modify, and/or redistribute
+ * it under the terms of the MIT/X11 license. See LICENSE for details.
+ * If you intend to redistribute parts of the code without the LICENSE file
+ * replace this paragraph with the full contents of the LICENSE file.
+ */
+static CGM_INLINE void cgm_rcons(cgm_ray *r, float x, float y, float z, float dx, float dy, float dz)
+{
+	r->origin.x = x;
+	r->origin.y = y;
+	r->origin.z = z;
+	r->dir.x = dx;
+	r->dir.y = dy;
+	r->dir.z = dz;
+}
+
+static CGM_INLINE void cgm_rmul_mr(cgm_ray *ray, const float *m)
+{
+	cgm_vmul_m4v3(&ray->origin, m);
+	cgm_vmul_m3v3(&ray->dir, m);
+}
+
+static CGM_INLINE void cgm_rmul_rm(cgm_ray *ray, const float *m)
+{
+	cgm_vmul_v3m4(&ray->origin, m);
+	cgm_vmul_v3m3(&ray->dir, m);
+}
+
+static CGM_INLINE void cgm_rreflect(cgm_ray *ray, const cgm_vec3 *n)
+{
+	cgm_vreflect(&ray->dir, n);
+}
+
+static CGM_INLINE void cgm_rrefract(cgm_ray *ray, const cgm_vec3 *n, float ior)
+{
+	cgm_vrefract(&ray->dir, n, ior);
+}
diff --git a/src/cgmath/cgmvec3.inl b/src/cgmath/cgmvec3.inl
new file mode 100644
index 0000000..874d0fc
--- /dev/null
+++ b/src/cgmath/cgmvec3.inl
@@ -0,0 +1,211 @@
+/* gph-cmath - C graphics math library
+ * Copyright (C) 2018-2023 John Tsiombikas <nuclear@member.fsf.org>
+ *
+ * This program is free software. Feel free to use, modify, and/or redistribute
+ * it under the terms of the MIT/X11 license. See LICENSE for details.
+ * If you intend to redistribute parts of the code without the LICENSE file
+ * replace this paragraph with the full contents of the LICENSE file.
+ */
+static CGM_INLINE void cgm_vcons(cgm_vec3 *v, float x, float y, float z)
+{
+	v->x = x;
+	v->y = y;
+	v->z = z;
+}
+
+static CGM_INLINE cgm_vec3 cgm_vvec(float x, float y, float z)
+{
+	cgm_vec3 v;
+	v.x = x;
+	v.y = y;
+	v.z = z;
+	return v;
+}
+
+static CGM_INLINE void cgm_vadd(cgm_vec3 *a, const cgm_vec3 *b)
+{
+	a->x += b->x;
+	a->y += b->y;
+	a->z += b->z;
+}
+
+static CGM_INLINE void cgm_vadd_scaled(cgm_vec3 *a, const cgm_vec3 *b, float s)
+{
+	a->x += b->x * s;
+	a->y += b->y * s;
+	a->z += b->z * s;
+}
+
+static CGM_INLINE void cgm_vsub(cgm_vec3 *a, const cgm_vec3 *b)
+{
+	a->x -= b->x;
+	a->y -= b->y;
+	a->z -= b->z;
+}
+
+static CGM_INLINE void cgm_vsub_scaled(cgm_vec3 *a, const cgm_vec3 *b, float s)
+{
+	a->x -= b->x * s;
+	a->y -= b->y * s;
+	a->z -= b->z * s;
+}
+
+static CGM_INLINE void cgm_vmul(cgm_vec3 *a, const cgm_vec3 *b)
+{
+	a->x *= b->x;
+	a->y *= b->y;
+	a->z *= b->z;
+}
+
+static CGM_INLINE void cgm_vscale(cgm_vec3 *v, float s)
+{
+	v->x *= s;
+	v->y *= s;
+	v->z *= s;
+}
+
+static CGM_INLINE void cgm_vmul_m4v3(cgm_vec3 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[4] + v->z * m[8] + m[12];
+	float y = v->x * m[1] + v->y * m[5] + v->z * m[9] + m[13];
+	v->z = v->x * m[2] + v->y * m[6] + v->z * m[10] + m[14];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE void cgm_vmul_v3m4(cgm_vec3 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[1] + v->z * m[2] + m[3];
+	float y = v->x * m[4] + v->y * m[5] + v->z * m[6] + m[7];
+	v->z = v->x * m[8] + v->y * m[9] + v->z * m[10] + m[11];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE void cgm_vmul_m3v3(cgm_vec3 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[4] + v->z * m[8];
+	float y = v->x * m[1] + v->y * m[5] + v->z * m[9];
+	v->z = v->x * m[2] + v->y * m[6] + v->z * m[10];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE void cgm_vmul_v3m3(cgm_vec3 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[1] + v->z * m[2];
+	float y = v->x * m[4] + v->y * m[5] + v->z * m[6];
+	v->z = v->x * m[8] + v->y * m[9] + v->z * m[10];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE float cgm_vdot(const cgm_vec3 *a, const cgm_vec3 *b)
+{
+	return a->x * b->x + a->y * b->y + a->z * b->z;
+}
+
+static CGM_INLINE void cgm_vcross(cgm_vec3 *res, const cgm_vec3 *a, const cgm_vec3 *b)
+{
+	res->x = a->y * b->z - a->z * b->y;
+	res->y = a->z * b->x - a->x * b->z;
+	res->z = a->x * b->y - a->y * b->x;
+}
+
+static CGM_INLINE float cgm_vlength(const cgm_vec3 *v)
+{
+	return sqrt(v->x * v->x + v->y * v->y + v->z * v->z);
+}
+
+static CGM_INLINE float cgm_vlength_sq(const cgm_vec3 *v)
+{
+	return v->x * v->x + v->y * v->y + v->z * v->z;
+}
+
+static CGM_INLINE float cgm_vdist(const cgm_vec3 *a, const cgm_vec3 *b)
+{
+	float dx = a->x - b->x;
+	float dy = a->y - b->y;
+	float dz = a->z - b->z;
+	return sqrt(dx * dx + dy * dy + dz * dz);
+}
+
+static CGM_INLINE float cgm_vdist_sq(const cgm_vec3 *a, const cgm_vec3 *b)
+{
+	float dx = a->x - b->x;
+	float dy = a->y - b->y;
+	float dz = a->z - b->z;
+	return dx * dx + dy * dy + dz * dz;
+}
+
+static CGM_INLINE void cgm_vnormalize(cgm_vec3 *v)
+{
+	float len = cgm_vlength(v);
+	if(len != 0.0f) {
+		float s = 1.0f / len;
+		v->x *= s;
+		v->y *= s;
+		v->z *= s;
+	}
+}
+
+static CGM_INLINE void cgm_vreflect(cgm_vec3 *v, const cgm_vec3 *n)
+{
+	float ndotv2 = cgm_vdot(v, n) * 2.0f;
+	v->x -= n->x * ndotv2;
+	v->y -= n->y * ndotv2;
+	v->z -= n->z * ndotv2;
+}
+
+static CGM_INLINE void cgm_vrefract(cgm_vec3 *v, const cgm_vec3 *n, float ior)
+{
+	float ndotv = cgm_vdot(v, n);
+	float k = 1.0f - ior * ior * (1.0f - ndotv * ndotv);
+	if(k < 0.0f) {
+		cgm_vreflect(v, n);	/* TIR */
+	} else {
+		float sqrt_k = sqrt(k);
+		v->x = ior * v->x - (ior * ndotv + sqrt_k) * n->x;
+		v->y = ior * v->y - (ior * ndotv + sqrt_k) * n->y;
+		v->z = ior * v->z - (ior * ndotv + sqrt_k) * n->z;
+	}
+}
+
+static CGM_INLINE void cgm_vrotate_quat(cgm_vec3 *v, const cgm_quat *q)
+{
+	cgm_quat vq, inv_q = *q, tmp_q = *q;
+
+	cgm_qcons(&vq, v->x, v->y, v->z, 0.0f);
+	cgm_qinvert(&inv_q);
+	cgm_qmul(&tmp_q, &vq);
+	cgm_qmul(&tmp_q, &inv_q);
+	cgm_vcons(v, tmp_q.x, tmp_q.y, tmp_q.z);
+}
+
+static CGM_INLINE void cgm_vrotate_axis(cgm_vec3 *v, int axis, float angle)
+{
+	float m[16];
+	cgm_mrotation_axis(m, axis, angle);
+	cgm_vmul_m3v3(v, m);
+}
+
+static CGM_INLINE void cgm_vrotate(cgm_vec3 *v, float angle, float x, float y, float z)
+{
+	float m[16];
+	cgm_mrotation(m, angle, x, y, z);
+	cgm_vmul_m3v3(v, m);
+}
+
+static CGM_INLINE void cgm_vrotate_euler(cgm_vec3 *v, float a, float b, float c, enum cgm_euler_mode mode)
+{
+	float m[16];
+	cgm_mrotation_euler(m, a, b, c, mode);
+	cgm_vmul_m3v3(v, m);
+}
+
+static CGM_INLINE void cgm_vlerp(cgm_vec3 *res, const cgm_vec3 *a, const cgm_vec3 *b, float t)
+{
+	res->x = a->x + (b->x - a->x) * t;
+	res->y = a->y + (b->y - a->y) * t;
+	res->z = a->z + (b->z - a->z) * t;
+}
diff --git a/src/cgmath/cgmvec4.inl b/src/cgmath/cgmvec4.inl
new file mode 100644
index 0000000..1d66496
--- /dev/null
+++ b/src/cgmath/cgmvec4.inl
@@ -0,0 +1,168 @@
+/* gph-cmath - C graphics math library
+ * Copyright (C) 2018-2023 John Tsiombikas <nuclear@member.fsf.org>
+ *
+ * This program is free software. Feel free to use, modify, and/or redistribute
+ * it under the terms of the MIT/X11 license. See LICENSE for details.
+ * If you intend to redistribute parts of the code without the LICENSE file
+ * replace this paragraph with the full contents of the LICENSE file.
+ */
+static CGM_INLINE void cgm_wcons(cgm_vec4 *v, float x, float y, float z, float w)
+{
+	v->x = x;
+	v->y = y;
+	v->z = z;
+	v->w = w;
+}
+
+static CGM_INLINE cgm_vec4 cgm_wvec(float x, float y, float z, float w)
+{
+	cgm_vec4 v;
+	v.x = x;
+	v.y = y;
+	v.z = z;
+	v.w = w;
+	return v;
+}
+
+static CGM_INLINE void cgm_wadd(cgm_vec4 *a, const cgm_vec4 *b)
+{
+	a->x += b->x;
+	a->y += b->y;
+	a->z += b->z;
+	a->w += b->w;
+}
+
+static CGM_INLINE void cgm_wsub(cgm_vec4 *a, const cgm_vec4 *b)
+{
+	a->x -= b->x;
+	a->y -= b->y;
+	a->z -= b->z;
+	a->w -= b->w;
+}
+
+static CGM_INLINE void cgm_wmul(cgm_vec4 *a, const cgm_vec4 *b)
+{
+	a->x *= b->x;
+	a->y *= b->y;
+	a->z *= b->z;
+	a->w *= b->w;
+}
+
+static CGM_INLINE void cgm_wscale(cgm_vec4 *v, float s)
+{
+	v->x *= s;
+	v->y *= s;
+	v->z *= s;
+	v->w *= s;
+}
+
+static CGM_INLINE void cgm_wmul_m4v4(cgm_vec4 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[4] + v->z * m[8] + v->w * m[12];
+	float y = v->x * m[1] + v->y * m[5] + v->z * m[9] + v->w * m[13];
+	float z = v->x * m[2] + v->y * m[6] + v->z * m[10] + v->w * m[14];
+	v->w = v->x * m[3] + v->y * m[7] + v->z * m[11] + v->w * m[15];
+	v->x = x;
+	v->y = y;
+	v->z = z;
+}
+
+static CGM_INLINE void cgm_wmul_v4m4(cgm_vec4 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[1] + v->z * m[2] + v->w * m[3];
+	float y = v->x * m[4] + v->y * m[5] + v->z * m[6] + v->w * m[7];
+	float z = v->x * m[8] + v->y * m[9] + v->z * m[10] + v->w * m[11];
+	v->w = v->x * m[12] + v->y * m[13] + v->z * m[14] + v->w * m[15];
+	v->x = x;
+	v->y = y;
+	v->z = z;
+}
+
+static CGM_INLINE void cgm_wmul_m34v4(cgm_vec4 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[4] + v->z * m[8] + v->w * m[12];
+	float y = v->x * m[1] + v->y * m[5] + v->z * m[9] + v->w * m[13];
+	v->z = v->x * m[2] + v->y * m[6] + v->z * m[10] + v->w * m[14];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE void cgm_wmul_v4m43(cgm_vec4 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[1] + v->z * m[2] + v->w * m[3];
+	float y = v->x * m[4] + v->y * m[5] + v->z * m[6] + v->w * m[7];
+	v->z = v->x * m[8] + v->y * m[9] + v->z * m[10] + v->w * m[11];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE void cgm_wmul_m3v4(cgm_vec4 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[4] + v->z * m[8];
+	float y = v->x * m[1] + v->y * m[5] + v->z * m[9];
+	v->z = v->x * m[2] + v->y * m[6] + v->z * m[10];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE void cgm_wmul_v4m3(cgm_vec4 *v, const float *m)
+{
+	float x = v->x * m[0] + v->y * m[1] + v->z * m[2];
+	float y = v->x * m[4] + v->y * m[5] + v->z * m[6];
+	v->z = v->x * m[8] + v->y * m[9] + v->z * m[10];
+	v->x = x;
+	v->y = y;
+}
+
+static CGM_INLINE float cgm_wdot(const cgm_vec4 *a, const cgm_vec4 *b)
+{
+	return a->x * b->x + a->y * b->y + a->z * b->z + a->w * b->w;
+}
+
+static CGM_INLINE float cgm_wlength(const cgm_vec4 *v)
+{
+	return sqrt(v->x * v->x + v->y * v->y + v->z * v->z + v->w * v->w);
+}
+
+static CGM_INLINE float cgm_wlength_sq(const cgm_vec4 *v)
+{
+	return v->x * v->x + v->y * v->y + v->z * v->z + v->w * v->w;
+}
+
+static CGM_INLINE float cgm_wdist(const cgm_vec4 *a, const cgm_vec4 *b)
+{
+	float dx = a->x - b->x;
+	float dy = a->y - b->y;
+	float dz = a->z - b->z;
+	float dw = a->w - b->w;
+	return sqrt(dx * dx + dy * dy + dz * dz + dw * dw);
+}
+
+static CGM_INLINE float cgm_wdist_sq(const cgm_vec4 *a, const cgm_vec4 *b)
+{
+	float dx = a->x - b->x;
+	float dy = a->y - b->y;
+	float dz = a->z - b->z;
+	float dw = a->w - b->w;
+	return dx * dx + dy * dy + dz * dz + dw * dw;
+}
+
+static CGM_INLINE void cgm_wnormalize(cgm_vec4 *v)
+{
+	float len = cgm_wlength(v);
+	if(len != 0.0f) {
+		float s = 1.0f / len;
+		v->x *= s;
+		v->y *= s;
+		v->z *= s;
+		v->w *= s;
+	}
+}
+
+static CGM_INLINE void cgm_wlerp(cgm_vec4 *res, const cgm_vec4 *a, const cgm_vec4 *b, float t)
+{
+	res->x = a->x + (b->x - a->x) * t;
+	res->y = a->y + (b->y - a->y) * t;
+	res->z = a->z + (b->z - a->z) * t;
+	res->w = a->w + (b->w - a->w) * t;
+}
diff --git a/src/colormgr.c b/src/colormgr.c
new file mode 100644
index 0000000..8ab61c9
--- /dev/null
+++ b/src/colormgr.c
@@ -0,0 +1,31 @@
+#include "colormgr.h"
+#include "vga.h"
+
+/* TODO bring in a proper color manager with shade LUTs later */
+
+/* Program palette entries 0-255 as a 3-3-2 RGB cube: index bits 7-5 select red,
+ * 4-2 green and 1-0 blue; each field is widened by repeating its bits downwards. */
+void init_colormgr(void)
+{
+	unsigned int idx;
+	for(idx=0; idx<256; idx++) {
+		unsigned int red = idx & 0xe0;
+		unsigned int green = (idx << 3) & 0xe0;
+		unsigned int blue = (idx << 5) & 0xc0;
+
+		red |= red >> 3;
+		green |= green >> 3;
+		blue |= (blue >> 2) | (blue >> 4);
+		vga_setpalent(idx, red, green, blue);
+	}
+}
+
+int find_color(int r, int g, int b)
+{	/* pack the top 3/3/2 bits of r/g/b into a single 3-3-2 palette index */
+	return ((b >> 6) & 0x03) | ((g >> 3) & 0x1c) | (r & 0xe0);
+}
+
+int shade_color(int col, int shade)
+{
+	return col;	/* TODO */
+}
diff --git a/src/colormgr.h b/src/colormgr.h
new file mode 100644
index 0000000..a3dd2bb
--- /dev/null
+++ b/src/colormgr.h
@@ -0,0 +1,9 @@
+#ifndef COLORMGR_H_
+#define COLORMGR_H_
+
+void init_colormgr(void);
+
+int find_color(int r, int g, int b);
+int shade_color(int col, int shade);	/* both 0-255 */
+
+#endif	/* COLORMGR_H_ */
diff --git a/src/dynarr.c b/src/dynarr.c
new file mode 100644
index 0000000..59bbf8c
--- /dev/null
+++ b/src/dynarr.c
@@ -0,0 +1,141 @@
+/* dynarr - dynamic resizable C array data structure
+ * author: John Tsiombikas <nuclear@member.fsf.org>
+ * license: public domain
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "dynarr.h"
+
+/* The array descriptor keeps auxiliary information needed to manipulate
+ * the dynamic array. It's allocated adjacent to the array buffer.
+ */
+struct arrdesc {
+	int nelem, szelem;
+	int max_elem;
+	int bufsz;	/* not including the descriptor */
+};
+
+#define DESC(x)		((struct arrdesc*)((char*)(x) - sizeof(struct arrdesc)))
+
+/* Allocate an array of elem elements, szelem bytes each; returns 0 if malloc fails. */
+void *dynarr_alloc(int elem, int szelem)
+{
+	int nbytes = elem * szelem;
+	struct arrdesc *hdr = malloc(nbytes + sizeof *hdr);
+
+	if(!hdr) return 0;
+	hdr->nelem = hdr->max_elem = elem;
+	hdr->szelem = szelem;
+	hdr->bufsz = nbytes;
+	return (char*)hdr + sizeof *hdr;	/* hand out the data area that follows the descriptor */
+}
+
+void dynarr_free(void *da)
+{
+	if(da) {
+		free(DESC(da));
+	}
+}
+
+void *dynarr_resize(void *da, int elem)
+{
+	int newsz;
+	void *tmp;
+	struct arrdesc *desc;
+
+	if(!da) return 0;
+	desc = DESC(da);
+
+	newsz = desc->szelem * elem;
+
+	if(!(tmp = realloc(desc, newsz + sizeof *desc))) {
+		return 0;
+	}
+	desc = tmp;
+
+	desc->nelem = desc->max_elem = elem;
+	desc->bufsz = newsz;
+	return (char*)desc + sizeof *desc;
+}
+
+int dynarr_empty(void *da)
+{
+	return DESC(da)->nelem ? 0 : 1;
+}
+
+int dynarr_size(void *da)
+{
+	return DESC(da)->nelem;
+}
+
+
+void *dynarr_clear(void *da)
+{
+	return dynarr_resize(da, 0);
+}
+
+/* stack semantics */
+/* Append one element, copying szelem bytes from item unless item is null. Returns the
+ * array, which may have moved; if growing fails, da is returned with nothing added. */
+void *dynarr_push(void *da, void *item)
+{
+	struct arrdesc *hdr = DESC(da);
+	int count = hdr->nelem;
+
+	if(count >= hdr->max_elem) {
+		/* full: double the capacity, or start with a single slot */
+		int newcap = hdr->max_elem ? hdr->max_elem * 2 : 1;
+		void *grown = dynarr_resize(da, newcap);
+
+		if(!grown) {
+			fprintf(stderr, "failed to resize\n");
+			return da;
+		}
+		da = grown;
+		hdr = DESC(da);
+		hdr->nelem = count;	/* dynarr_resize set nelem to the new capacity */
+	}
+
+	if(item) {
+		char *slot = (char*)da + hdr->nelem * hdr->szelem;
+		memcpy(slot, item, hdr->szelem);
+	}
+	hdr->nelem++;
+	return da;
+}
+
+/* Drop the last element. When the count is at most a third of the capacity the buffer
+ * is first halved, so the returned pointer may differ from da. */
+void *dynarr_pop(void *da)
+{
+	struct arrdesc *hdr = DESC(da);
+	int count = hdr->nelem;
+
+	if(count == 0) {
+		return da;	/* nothing to remove */
+	}
+
+	if(count <= hdr->max_elem / 3) {
+		/* reclaim space */
+		void *shrunk = dynarr_resize(da, hdr->max_elem / 2);
+
+		if(!shrunk) {
+			fprintf(stderr, "failed to resize\n");
+			return da;
+		}
+		da = shrunk;
+		hdr = DESC(da);
+		hdr->nelem = count;	/* dynarr_resize overwrote nelem with the new capacity */
+	}
+	hdr->nelem--;
+
+	return da;
+}
+
+void *dynarr_finalize(void *da)
+{
+	struct arrdesc *desc = DESC(da);	/* the descriptor sits immediately before the data */
+	memmove(desc, da, desc->bufsz);	/* slide the data down over the descriptor; the ranges can overlap, hence memmove */
+	return desc;	/* the address malloc/realloc returned, so plain free() works on it */
+}
diff --git a/src/dynarr.h b/src/dynarr.h
new file mode 100644
index 0000000..8690b5a
--- /dev/null
+++ b/src/dynarr.h
@@ -0,0 +1,80 @@
+/* dynarr - dynamic resizable C array data structure
+ * author: John Tsiombikas <nuclear@member.fsf.org>
+ * license: public domain
+ */
+#ifndef DYNARR_H_
+#define DYNARR_H_
+
+/* usage example:
+ * -------------
+ * int *arr = dynarr_alloc(0, sizeof *arr);
+ *
+ * int x = 10;
+ * arr = dynarr_push(arr, &x);
+ * x = 5;
+ * arr = dynarr_push(arr, &x);
+ * x = 42;
+ * arr = dynarr_push(arr, &x);
+ *
+ * for(i=0; i<dynarr_size(arr); i++) {
+ *     printf("%d\n", arr[i]);
+ *  }
+ *  dynarr_free(arr);
+ */
+
+void *dynarr_alloc(int elem, int szelem);
+void dynarr_free(void *da);
+void *dynarr_resize(void *da, int elem);
+
+/* dynarr_empty returns non-zero if the array is empty
+ * Complexity: O(1) */
+int dynarr_empty(void *da);
+/* dynarr_size returns the number of elements in the array
+ * Complexity: O(1) */
+int dynarr_size(void *da);
+
+void *dynarr_clear(void *da);
+
+/* stack semantics */
+void *dynarr_push(void *da, void *item);
+void *dynarr_pop(void *da);
+
+/* Finalize the array. No more resizing is possible after this call.
+ * Use free() instead of dynarr_free() to deallocate a finalized array.
+ * Returns pointer to the finalized array.
+ * dynarr_finalize can't fail.
+ * Complexity: O(n)
+ */
+void *dynarr_finalize(void *da);
+
+/* helper macros */
+#define DYNARR_RESIZE(da, n) \
+	do { (da) = dynarr_resize((da), (n)); } while(0)
+#define DYNARR_CLEAR(da) \
+	do { (da) = dynarr_clear(da); } while(0)
+#define DYNARR_PUSH(da, item) \
+	do { (da) = dynarr_push((da), (item)); } while(0)
+#define DYNARR_POP(da) \
+	do { (da) = dynarr_pop(da); } while(0)
+
+/* utility macros to push characters to a string. assumes and maintains
+ * the invariant that the last element is always a zero
+ */
+#define DYNARR_STRPUSH(da, c) \
+	do { \
+		char cnull = 0, ch = (char)(c); \
+		(da) = dynarr_pop(da); \
+		(da) = dynarr_push((da), &ch); \
+		(da) = dynarr_push((da), &cnull); \
+	} while(0)
+
+#define DYNARR_STRPOP(da) \
+	do { \
+		char cnull = 0; \
+		(da) = dynarr_pop(da); \
+		(da) = dynarr_pop(da); \
+		(da) = dynarr_push((da), &cnull); \
+	} while(0)
+
+
+#endif	/* DYNARR_H_ */
diff --git a/src/game.c b/src/game.c
index f04984c..4609b46 100644
--- a/src/game.c
+++ b/src/game.c
@@ -1,8 +1,10 @@
 #include <string.h>
 #include "game.h"
+#include "colormgr.h"
 
 int game_init(void)
 {
+	init_colormgr();
 	return 0;
 }
 
@@ -12,7 +14,21 @@ void game_shutdown(void)
 
 void game_draw(void)
 {
-	memset(framebuf, 2, 64000);
+	int i, j;
+	unsigned char *fbptr = framebuf;
+
+	for(i=0; i<200; i++) {
+		for(j=0; j<320; j++) {
+			int r, b;
+			int idx = i + (rand() & 0x1f) - 16;
+			if(idx < 0) idx = 0;
+			if(idx > 199) idx = 199;
+
+			r = 255 * idx / 199;
+			b = 255 - r;
+			*fbptr++ = find_color(r, 0, b);
+		}
+	}
 
 	game_swap_buffers();
 }
diff --git a/src/gfxutil.c b/src/gfxutil.c
new file mode 100644
index 0000000..a472192
--- /dev/null
+++ b/src/gfxutil.c
@@ -0,0 +1,178 @@
+#include <string.h>
+#include <assert.h>
+#include "game.h"
+#include "gfxutil.h"
+#include "3dgfx/3dgfx.h"
+
+enum {
+	IN		= 0,
+	LEFT	= 1,
+	RIGHT	= 2,
+	TOP		= 4,
+	BOTTOM	= 8
+};
+
+static int outcode(int x, int y, int xmin, int ymin, int xmax, int ymax)
+{
+	int code = 0;
+
+	if(x < xmin) {
+		code |= LEFT;
+	} else if(x > xmax) {
+		code |= RIGHT;
+	}
+	if(y < ymin) {
+		code |= TOP;
+	} else if(y > ymax) {
+		code |= BOTTOM;
+	}
+	return code;
+}
+
+#define FIXMUL(a, b)	(((a) * (b)) >> 8)
+#define FIXDIV(a, b)	(((a) * 256) / (b))	/* scaled by multiplying: (a) can be negative */
+
+#define LERP(a, b, t)	((a) + FIXMUL((b) - (a), (t)))
+/* Cohen-Sutherland clip of (x0,y0)-(x1,y1) to the inclusive rectangle; 0 = not visible, 1 = endpoints updated */
+int clip_line(int *x0, int *y0, int *x1, int *y1, int xmin, int ymin, int xmax, int ymax)
+{
+	int oc_out;
+
+	int oc0 = outcode(*x0, *y0, xmin, ymin, xmax, ymax);
+	int oc1 = outcode(*x1, *y1, xmin, ymin, xmax, ymax);
+
+	long fx0, fy0, fx1, fy1, fxmin, fymin, fxmax, fymax;
+
+	if(!(oc0 | oc1)) return 1;	/* both points are inside */
+	/* switch to .8 fixed point: widen to long first, and never left-shift a negative int */
+	fx0 = (long)*x0 * 256;
+	fy0 = (long)*y0 * 256;
+	fx1 = (long)*x1 * 256;
+	fy1 = (long)*y1 * 256;
+	fxmin = (long)xmin * 256;
+	fymin = (long)ymin * 256;
+	fxmax = (long)xmax * 256;
+	fymax = (long)ymax * 256;
+
+	for(;;) {
+		long x, y, t;
+
+		if(oc0 & oc1) return 0;		/* both have points with the same outbit, not visible */
+		if(!(oc0 | oc1)) break;		/* both points are inside */
+
+		oc_out = oc0 ? oc0 : oc1;
+
+		if(oc_out & TOP) {
+			t = FIXDIV(fymin - fy0, fy1 - fy0);
+			x = LERP(fx0, fx1, t);
+			y = fymin;
+		} else if(oc_out & BOTTOM) {
+			t = FIXDIV(fymax - fy0, fy1 - fy0);
+			x = LERP(fx0, fx1, t);
+			y = fymax;
+		} else if(oc_out & LEFT) {
+			t = FIXDIV(fxmin - fx0, fx1 - fx0);
+			x = fxmin;
+			y = LERP(fy0, fy1, t);
+		} else /*if(oc_out & RIGHT)*/ {
+			t = FIXDIV(fxmax - fx0, fx1 - fx0);
+			x = fxmax;
+			y = LERP(fy0, fy1, t);
+		}
+
+		if(oc_out == oc0) {
+			fx0 = x;
+			fy0 = y;
+			oc0 = outcode(fx0 >> 8, fy0 >> 8, xmin, ymin, xmax, ymax);
+		} else {
+			fx1 = x;
+			fy1 = y;
+			oc1 = outcode(fx1 >> 8, fy1 >> 8, xmin, ymin, xmax, ymax);
+		}
+	}
+
+	*x0 = fx0 >> 8;
+	*y0 = fy0 >> 8;
+	*x1 = fx1 >> 8;
+	*y1 = fy1 >> 8;
+	return 1;
+}
+
+void draw_line(int x0, int y0, int x1, int y1, unsigned char color)
+{
+	int i, dx, dy, x_inc, y_inc, error;
+	unsigned char *fb = framebuf;
+	/* Bresenham line into framebuf, both endpoints included; no bounds checks are made here */
+	fb += y0 * FB_WIDTH + x0;	/* address of the first pixel */
+
+	dx = x1 - x0;
+	dy = y1 - y0;
+
+	if(dx >= 0) {
+		x_inc = 1;
+	} else {
+		x_inc = -1;
+		dx = -dx;	/* keep the magnitude; the direction lives in x_inc */
+	}
+	if(dy >= 0) {
+		y_inc = FB_WIDTH;	/* one scanline down, expressed as a pointer step */
+	} else {
+		y_inc = -FB_WIDTH;
+		dy = -dy;
+	}
+
+	if(dx > dy) {
+		/* x is the major axis: step x every iteration, y when the error term is non-negative */
+		error = dy * 2 - dx;
+		for(i=0; i<=dx; i++) {
+			*fb = color;
+			if(error >= 0) {
+				error -= dx * 2;
+				fb += y_inc;
+			}
+			error += dy * 2;
+			fb += x_inc;
+		}
+	} else {
+		/* y is the major axis (or dx == dy): same scheme with the roles swapped */
+		error = dx * 2 - dy;
+		for(i=0; i<=dy; i++) {
+			*fb = color;
+			if(error >= 0) {
+				error -= dy * 2;
+				fb += x_inc;
+			}
+			error += dx * 2;
+			fb += y_inc;
+		}
+	}
+}
+
+void draw_billboard(float x, float y, float z, float size, int lum, int a)
+{
+	float m[16];
+	size *= 0.5f;
+
+	g3d_matrix_mode(G3D_MODELVIEW);
+	g3d_push_matrix();
+
+	g3d_translate(x, y, z);
+
+	g3d_get_matrix(G3D_MODELVIEW, m);
+	/* make the upper 3x3 part of the matrix identity */
+	m[0] = m[5] = m[10] = 1.0f;
+	m[1] = m[2] = m[3] = m[4] = m[6] = m[7] = m[8] = m[9] = 0.0f;
+	g3d_load_matrix(m);
+
+	g3d_begin(G3D_QUADS);
+	g3d_color2b(lum, a);
+	g3d_texcoord(0, 0);
+	g3d_vertex(-size, -size, 0);
+	g3d_texcoord(1, 0);
+	g3d_vertex(size, -size, 0);
+	g3d_texcoord(1, 1);
+	g3d_vertex(size, size, 0);
+	g3d_texcoord(0, 1);
+	g3d_vertex(-size, size, 0);
+	g3d_end();
+
+	g3d_pop_matrix();
+}
diff --git a/src/gfxutil.h b/src/gfxutil.h
new file mode 100644
index 0000000..563e10b
--- /dev/null
+++ b/src/gfxutil.h
@@ -0,0 +1,17 @@
+#ifndef GFXUTIL_H_
+#define GFXUTIL_H_
+
+#define PACK_RGB32(r, g, b) \
+	((((r) & 0xff) << 16) | (((g) & 0xff) << 8) | ((b) & 0xff) | 0xff000000)
+
+#define UNPACK_R32(c)	(((c) >> 16) & 0xff)
+#define UNPACK_G32(c)	(((c) >> 8) & 0xff)
+#define UNPACK_B32(c)	((c) & 0xff)
+
+
+int clip_line(int *x0, int *y0, int *x1, int *y1, int xmin, int ymin, int xmax, int ymax);
+void draw_line(int x0, int y0, int x1, int y1, unsigned char color);
+
+void draw_billboard(float x, float y, float z, float size, int lum, int a);
+
+#endif	/* GFXUTIL_H_ */
diff --git a/src/libc/math.h b/src/libc/math.h
index e3b0f23..2674a9b 100644
--- a/src/libc/math.h
+++ b/src/libc/math.h
@@ -30,6 +30,7 @@ along with this program.  If not, see <https://www.gnu.org/licenses/>.
 #define fmod(x, y)	__builtin_fmod(x, y)
 #define sqrt(x)		__builtin_sqrt(x)
 #define atan2(y, x)	__builtin_atan2(y, x)
+#define acos(x)		__builtin_acos(x)
 
 double pow(double x, double y);
 
diff --git a/src/libc/stdint.h b/src/libc/stdint.h
new file mode 100644
index 0000000..bf4b41f
--- /dev/null
+++ b/src/libc/stdint.h
@@ -0,0 +1,6 @@
+#ifndef STDINT_H_
+#define STDINT_H_
+
+#include "inttypes.h"
+
+#endif	/* STDINT_H_ */
diff --git a/src/libc/string.c b/src/libc/string.c
index 06d8f87..923d782 100644
--- a/src/libc/string.c
+++ b/src/libc/string.c
@@ -223,6 +223,17 @@ char *strncpy(char *dest, const char *src, int n)
 	return dest;
 }
 
+char *strdup(const char *s)
+{
+	/* duplicate s into a freshly malloc'ed buffer; returns 0 when allocation fails */
+	int nbytes = (int)strlen(s) + 1;	/* string plus terminator */
+	char *copy = malloc(nbytes);
+	if(!copy) {
+		return 0;
+	}
+	memcpy(copy, s, nbytes);
+	return copy;
+}
+
 /*
 static const char *errstr[] = {
 	"Success",
diff --git a/src/libc/string.h b/src/libc/string.h
index 85490b2..de72815 100644
--- a/src/libc/string.h
+++ b/src/libc/string.h
@@ -47,6 +47,8 @@ char *strcat(char *dest, const char *src);
 
 char *strncpy(char *dest, const char *src, int n);
 
+char *strdup(const char *s);
+
 char *strerror(int err);
 
 #endif	/* STRING_H_ */
diff --git a/src/rbtree.c b/src/rbtree.c
new file mode 100644
index 0000000..765e542
--- /dev/null
+++ b/src/rbtree.c
@@ -0,0 +1,518 @@
+/*
+rbtree - simple balanced binary search tree (red-black tree) library.
+Copyright (C) 2011-2014  John Tsiombikas <nuclear@member.fsf.org>
+
+rbtree is free software, feel free to use, modify, and redistribute it, under
+the terms of the 3-clause BSD license. See COPYING for details.
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include "rbtree.h"
+
+#define INT2PTR(x)	((void*)(intptr_t)(x))	/* carry an integer key inside a key pointer */
+#define PTR2INT(x)	((int)(intptr_t)(x))	/* recover the integer from such a pointer */
+
+struct rbtree {
+	struct rbnode *root;	/* top of the tree, null while empty */
+
+	rb_alloc_func_t alloc;	/* node allocator; rb_init installs malloc */
+	rb_free_func_t free;	/* node deallocator; rb_init installs free */
+
+	rb_cmp_func_t cmp;	/* key ordering function */
+	rb_del_func_t del;	/* optional hook called on a node before it is freed */
+	void *del_cls;	/* closure pointer handed to del */
+
+	struct rbnode *rstack, *iter;	/* rb_begin/rb_next iterator state: pending-node stack and cursor */
+};
+
+static int cmpaddr(const void *ap, const void *bp);
+static int cmpint(const void *ap, const void *bp);
+
+static int count_nodes(struct rbnode *node);
+static void del_tree(struct rbnode *node, void (*delfunc)(struct rbnode*, void*), void *cls);
+static struct rbnode *insert(struct rbtree *rb, struct rbnode *tree, void *key, void *data);
+static struct rbnode *delete(struct rbtree *rb, struct rbnode *tree, void *key);
+/*static struct rbnode *find(struct rbtree *rb, struct rbnode *node, void *key);*/
+static void traverse(struct rbnode *node, void (*func)(struct rbnode*, void*), void *cls);
+
+struct rbtree *rb_create(rb_cmp_func_t cmp_func)
+{
+	/* heap-allocate a tree and initialize it; returns 0 if either step fails */
+	struct rbtree *tree = malloc(sizeof *tree);
+	if(!tree) {
+		return 0;	/* out of memory */
+	}
+	if(rb_init(tree, cmp_func) == -1) {
+		free(tree);	/* don't leak the half-constructed tree */
+		return 0;
+	}
+	return tree;
+}
+
+void rb_free(struct rbtree *rb)	/* counterpart of rb_create: releases the nodes, then the tree object itself */
+{
+	rb_destroy(rb);
+	free(rb);
+}
+
+
+int rb_init(struct rbtree *rb, rb_cmp_func_t cmp_func)
+{
+	/* start from an all-zero tree, then install allocator and comparator; always returns 0 */
+	memset(rb, 0, sizeof *rb);
+	rb->alloc = malloc;
+	rb->free = free;
+	/* translate the RB_KEY_* pseudo-pointers into the built-in comparators */
+	if(cmp_func == RB_KEY_INT) {
+		rb->cmp = cmpint;
+	} else if(cmp_func == RB_KEY_STRING) {
+		rb->cmp = (rb_cmp_func_t)strcmp;
+	} else if(cmp_func) {
+		rb->cmp = cmp_func;
+	} else {
+		rb->cmp = cmpaddr;	/* null (RB_KEY_ADDR): order keys by address */
+	}
+	return 0;
+}
+
+void rb_destroy(struct rbtree *rb)	/* frees every node (running the delete hook, if set); rb itself is not freed */
+{
+	del_tree(rb->root, rb->del, rb->del_cls);	/* NOTE(review): rb->root is left dangling here, unlike rb_clear */
+}
+
+void rb_set_allocator(struct rbtree *rb, rb_alloc_func_t alloc, rb_free_func_t free)	/* replace the malloc/free pair installed by rb_init */
+{
+	rb->alloc = alloc;	/* used by insert() for new nodes */
+	rb->free = free;	/* used by delete()/del_min(); NOTE(review): del_tree still calls plain free() */
+}
+
+
+void rb_set_compare_func(struct rbtree *rb, rb_cmp_func_t func)	/* accepts RB_KEY_ADDR/INT/STRING or a real comparator, like rb_init */
+{
+	rb->cmp = !func ? cmpaddr : (func == RB_KEY_INT ? cmpint : (func == RB_KEY_STRING ? (rb_cmp_func_t)strcmp : func));
+}
+
+void rb_set_delete_func(struct rbtree *rb, rb_del_func_t func, void *cls)	/* install the hook that runs on a node before it is freed */
+{
+	rb->del = func;
+	rb->del_cls = cls;	/* handed back as the second argument of func */
+}
+
+
+void rb_clear(struct rbtree *rb)	/* remove all nodes; the tree stays usable afterwards */
+{
+	del_tree(rb->root, rb->del, rb->del_cls);
+	rb->root = 0;	/* mark the tree empty */
+}
+
+int rb_copy(struct rbtree *dest, struct rbtree *src)
+{
+	struct rbnode *cur;
+
+	/* shallow copy: dest is emptied, then each key/data pointer pair of src is re-inserted */
+	rb_clear(dest);
+	for(rb_begin(src); (cur = rb_next(src)); ) {
+		if(rb_insert(dest, cur->key, cur->data) == -1) {
+			return -1;	/* propagate an insertion failure */
+		}
+	}
+	return 0;
+}
+
+int rb_size(struct rbtree *rb)	/* number of nodes; counted by walking the whole tree on every call */
+{
+	return count_nodes(rb->root);
+}
+
+int rb_insert(struct rbtree *rb, void *key, void *data)	/* add key/data, or replace the entry of an equal key; always returns 0 */
+{
+	rb->root = insert(rb, rb->root, key, data);
+	rb->root->red = 0;	/* the root is always painted black */
+	return 0;
+}
+
+int rb_inserti(struct rbtree *rb, int key, void *data)
+{
+	/* integer-key flavour of rb_insert: the key value itself is carried
+	 * inside the key pointer, nothing is allocated for it. Always returns 0. */
+	return rb_insert(rb, INT2PTR(key), data);
+}
+
+
+int rb_delete(struct rbtree *rb, void *key)	/* remove the entry matching key; always returns 0 */
+{
+	if((rb->root = delete(rb, rb->root, key))) {
+		rb->root->red = 0;	/* keep the root black; skipped when the tree became empty */
+	}
+	return 0;
+}
+
+int rb_deletei(struct rbtree *rb, int key)
+{
+	/* integer-key flavour of rb_delete: the key value travels inside the
+	 * key pointer, exactly as rb_inserti stored it.
+	 * Always returns 0. */
+	return rb_delete(rb, INT2PTR(key));
+}
+
+
+struct rbnode *rb_find(struct rbtree *rb, void *key)
+{
+	struct rbnode *cur;
+	int order;
+
+	/* plain iterative search-tree descent guided by the tree's comparator */
+	for(cur = rb->root; cur; cur = order < 0 ? cur->left : cur->right) {
+		if((order = rb->cmp(key, cur->key)) == 0) {
+			return cur;
+		}
+	}
+	return 0;	/* key not present */
+}
+
+struct rbnode *rb_findi(struct rbtree *rb, int key)	/* rb_find for integer keys stored inside the key pointer */
+{
+	return rb_find(rb, INT2PTR(key));
+}
+
+
+void rb_foreach(struct rbtree *rb, void (*func)(struct rbnode*, void*), void *cls)	/* call func(node, cls) on every node, in key order */
+{
+	traverse(rb->root, func, cls);
+}
+
+
+struct rbnode *rb_root(struct rbtree *rb)	/* root node, or null for an empty tree */
+{
+	return rb->root;
+}
+
+void rb_begin(struct rbtree *rb)	/* reset the tree's built-in iterator; follow with rb_next calls */
+{
+	rb->rstack = 0;	/* empty traversal stack */
+	rb->iter = rb->root;	/* the walk starts at the root */
+}
+
+#define push(sp, x)		((x)->next = (sp), (sp) = (x))	/* intrusive stack linked through rbnode.next */
+#define pop(sp)			((sp) = (sp)->next)	/* drop the top element; sp must be non-null */
+#define top(sp)			(sp)	/* the stack pointer is the top element */
+
+struct rbnode *rb_next(struct rbtree *rb)
+{
+	struct rbnode *res;
+
+	/* descend along left links, stacking every node passed on the way */
+	while(rb->iter) {
+		push(rb->rstack, rb->iter);
+		rb->iter = rb->iter->left;
+	}
+	if(!rb->rstack) {
+		return 0;	/* stack drained: the traversal is finished */
+	}
+	/* the top of the stack is next in order; resume from its right subtree */
+	res = top(rb->rstack);
+	pop(rb->rstack);
+	rb->iter = res->right;
+	return res;
+}
+
+void *rb_node_key(struct rbnode *node)	/* key pointer of node; 0 for a null node */
+{
+	return node ? node->key : 0;
+}
+
+int rb_node_keyi(struct rbnode *node)	/* integer key of node; 0 for a null node (indistinguishable from key 0) */
+{
+	return node ? PTR2INT(node->key) : 0;
+}
+
+void *rb_node_data(struct rbnode *node)	/* data pointer of node; 0 for a null node */
+{
+	return node ? node->data : 0;
+}
+
+void rb_node_setdata(struct rbnode *node, void *data)	/* overwrite the data pointer; node must be non-null, the old value is not released */
+{
+	node->data = data;
+}
+
+static int cmpaddr(const void *ap, const void *bp)	/* default comparator: order keys by their address, yielding -1, 0 or 1 */
+{
+	return ap < bp ? -1 : (ap > bp ? 1 : 0);
+}
+
+static int cmpint(const void *ap, const void *bp)	/* RB_KEY_INT comparator: keys are ints packed into the pointers */
+{
+	return (PTR2INT(ap) > PTR2INT(bp)) - (PTR2INT(ap) < PTR2INT(bp));	/* -1/0/1 without the overflow of a - b */
+}
+
+
+/* ---- left-leaning 2-3 red-black implementation ---- */
+
+/* helper prototypes */
+static int is_red(struct rbnode *tree);
+static void color_flip(struct rbnode *tree);
+static struct rbnode *rot_left(struct rbnode *a);
+static struct rbnode *rot_right(struct rbnode *a);
+static struct rbnode *find_min(struct rbnode *tree);
+static struct rbnode *del_min(struct rbtree *rb, struct rbnode *tree);
+/*static struct rbnode *move_red_right(struct rbnode *tree);*/
+static struct rbnode *move_red_left(struct rbnode *tree);
+static struct rbnode *fix_up(struct rbnode *tree);
+
+static int count_nodes(struct rbnode *node)
+{
+	/* recursive size: this node plus both subtrees; an empty subtree counts 0 */
+	if(node) {
+		return 1 + count_nodes(node->left) + count_nodes(node->right);
+	}
+	return 0;
+}
+
+static void del_tree(struct rbnode *node, rb_del_func_t delfunc, void *cls)	/* post-order teardown of a whole subtree */
+{
+	if(!node)
+		return;
+
+	del_tree(node->left, delfunc, cls);
+	del_tree(node->right, delfunc, cls);
+
+	if(delfunc) {
+		delfunc(node, cls);	/* user hook runs while the node is still valid */
+	}
+	free(node);	/* NOTE(review): plain free(), not the rb->free set through rb_set_allocator */
+}
+
+static struct rbnode *insert(struct rbtree *rb, struct rbnode *tree, void *key, void *data)	/* recursive insert; returns the new root of this subtree */
+{
+	int cmp;
+
+	if(!tree) {
+		struct rbnode *node = rb->alloc(sizeof *node);	/* NOTE(review): result is not checked for null */
+		node->red = 1;	/* new nodes always start out red */
+		node->key = key;
+		node->data = data;
+		node->left = node->right = 0;	/* node->next is only set later, by the iterator */
+		return node;
+	}
+
+	cmp = rb->cmp(key, tree->key);
+
+	if(cmp < 0) {
+		tree->left = insert(rb, tree->left, key, data);
+	} else if(cmp > 0) {
+		tree->right = insert(rb, tree->right, key, data);
+	} else {
+		if(rb->del) {
+			/* The key passed in was allocated in a way that would be cleaned by the
+			 * user-supplied delete function. We can't just assign the data and ignore
+			 * key in this case, or we'll leak memory. But we also can't make a dummy
+			 * node and pass that to rb->del, because it might also expect to free data.
+			 * So we must instead delete the existing node's contents, and use the new ones.
+			 */
+			rb->del(tree, rb->del_cls);
+			tree->key = key;
+		}
+		tree->data = data;	/* without a delete hook the old key is kept and only data is replaced */
+	}
+
+	/* fix right-leaning reds */
+	if(is_red(tree->right)) {
+		tree = rot_left(tree);
+	}
+	/* fix two reds in a row */
+	if(is_red(tree->left) && is_red(tree->left->left)) {
+		tree = rot_right(tree);
+	}
+
+	/* if 4-node, split it by color inversion */
+	if(is_red(tree->left) && is_red(tree->right)) {
+		color_flip(tree);
+	}
+
+	return tree;
+}
+
+static struct rbnode *delete(struct rbtree *rb, struct rbnode *tree, void *key)
+{
+	int cmp;
+	/* Removes key from the subtree rooted at tree and returns the new subtree
+	 * root (0 if it became empty). A key that is not present removes nothing. */
+	if(!tree) {
+		return 0;
+	}
+	cmp = rb->cmp(key, tree->key);
+
+	if(cmp < 0) {
+		if(!tree->left) return fix_up(tree);	/* smaller key that is not in the tree */
+		if(!is_red(tree->left) && !is_red(tree->left->left)) {
+			tree = move_red_left(tree);
+		}
+		tree->left = delete(rb, tree->left, key);
+	} else {
+		/* need reds on the right */
+		if(is_red(tree->left)) {
+			tree = rot_right(tree);
+		}
+		/* found it at the bottom (XXX what certifies left is null?) */
+		if(cmp == 0 && !tree->right) {
+			if(rb->del) {
+				rb->del(tree, rb->del_cls);
+			}
+			rb->free(tree);
+			return 0;
+		}
+		if(!tree->right) return fix_up(tree);	/* larger key that is not in the tree */
+		if(!is_red(tree->right) && !is_red(tree->right->left)) {
+			tree = move_red_left(tree);
+		}
+		/* the steps above may have changed tree, so compare again: by value, not by pointer */
+		if(rb->cmp(key, tree->key) == 0) {
+			struct rbnode *rmin = find_min(tree->right);
+			if(rb->del) rb->del(tree, rb->del_cls);	/* release the removed entry before its slots are reused */
+			tree->key = rmin->key;
+			tree->data = rmin->data;
+			tree->right = del_min(rb, tree->right);
+		} else {
+			tree->right = delete(rb, tree->right, key);
+		}
+	}
+	return fix_up(tree);
+}
+
+/*static struct rbnode *find(struct rbtree *rb, struct rbnode *node, void *key)
+{
+	int cmp;
+
+	if(!node)
+		return 0;
+
+	if((cmp = rb->cmp(key, node->key)) == 0) {
+		return node;
+	}
+	return find(rb, cmp < 0 ? node->left : node->right, key);
+}*/
+
+static void traverse(struct rbnode *node, void (*func)(struct rbnode*, void*), void *cls)
+{
+	/* in-order walk: left subtree, the node itself, then the right subtree */
+	if(node) {
+		traverse(node->left, func, cls);
+		func(node, cls);
+		traverse(node->right, func, cls);
+	}
+}
+
+/* helpers */
+
+static int is_red(struct rbnode *tree)	/* null links count as black */
+{
+	return tree && tree->red;
+}
+
+static void color_flip(struct rbnode *tree)	/* invert the colour of tree and of both children (all three must exist) */
+{
+	tree->red = !tree->red;
+	tree->left->red = !tree->left->red;
+	tree->right->red = !tree->right->red;
+}
+
+static struct rbnode *rot_left(struct rbnode *a)	/* lift a's right child b above a; returns b, the new subtree root */
+{
+	struct rbnode *b = a->right;
+	a->right = b->left;	/* b's left subtree moves under a */
+	b->left = a;
+	b->red = a->red;	/* b takes over a's colour... */
+	a->red = 1;	/* ...and a hangs below it on a red link */
+	return b;
+}
+
+static struct rbnode *rot_right(struct rbnode *a)	/* lift a's left child b above a; returns b, the new subtree root */
+{
+	struct rbnode *b = a->left;
+	a->left = b->right;	/* b's right subtree moves under a */
+	b->right = a;
+	b->red = a->red;	/* b takes over a's colour... */
+	a->red = 1;	/* ...and a hangs below it on a red link */
+	return b;
+}
+
+static struct rbnode *find_min(struct rbnode *tree)
+{
+	/* the minimum is the leftmost node; an empty subtree yields null */
+	struct rbnode *cur = tree;
+
+	while(cur && cur->left) {
+		cur = cur->left;
+	}
+	return cur;
+}
+
+static struct rbnode *del_min(struct rbtree *rb, struct rbnode *tree)
+{
+	/* unlink the leftmost node of a non-empty subtree; returns the new subtree root */
+	if(!tree->left) {
+		/* tree itself is the minimum. delete() has already moved its key/data
+		 * into the node being replaced, so the user delete function must not
+		 * run on them; only the node itself is released. */
+		rb->free(tree);
+		return 0;
+	}
+	/* make sure we've got red (3/4-nodes) at the left side so we can delete at the bottom */
+	if(!is_red(tree->left) && !is_red(tree->left->left)) {
+		tree = move_red_left(tree);
+	}
+	tree->left = del_min(rb, tree->left);
+
+	/* fix right-reds, red-reds, and split 4-nodes on the way up */
+	return fix_up(tree);
+}
+
+#if 0
+/* push a red link on this node to the right */
+static struct rbnode *move_red_right(struct rbnode *tree)
+{
+	/* flipping it makes both children go red, so we have a red to the right */
+	color_flip(tree);
+
+	/* if after the flip we've got a red-red situation to the left, fix it */
+	if(is_red(tree->left->left)) {
+		tree = rot_right(tree);
+		color_flip(tree);
+	}
+	return tree;
+}
+#endif
+
+/* push a red link on this node to the left */
+static struct rbnode *move_red_left(struct rbnode *tree)	/* returns the (possibly new) subtree root; both children must be non-null */
+{
+	/* flipping it makes both children go red, so we have a red to the left */
+	color_flip(tree);
+
+	/* if after the flip we've got a red-red on the right-left, fix it */
+	if(is_red(tree->right->left)) {
+		tree->right = rot_right(tree->right);
+		tree = rot_left(tree);
+		color_flip(tree);	/* undo the first flip around the new root */
+	}
+	return tree;
+}
+
+static struct rbnode *fix_up(struct rbnode *tree)	/* repair the red-link rules at tree after a change below it; returns the new subtree root */
+{
+	/* fix right-leaning */
+	if(is_red(tree->right)) {
+		tree = rot_left(tree);
+	}
+	/* change invalid red-red pairs into a proper 4-node */
+	if(is_red(tree->left) && is_red(tree->left->left)) {
+		tree = rot_right(tree);
+	}
+	/* split 4-nodes */
+	if(is_red(tree->left) && is_red(tree->right)) {
+		color_flip(tree);
+	}
+	return tree;
+}
diff --git a/src/rbtree.h b/src/rbtree.h
new file mode 100644
index 0000000..dada0dc
--- /dev/null
+++ b/src/rbtree.h
@@ -0,0 +1,79 @@
+/*
+rbtree - simple balanced binary search tree (red-black tree) library.
+Copyright (C) 2011-2014  John Tsiombikas <nuclear@member.fsf.org>
+
+rbtree is free software, feel free to use, modify, and redistribute it, under
+the terms of the 3-clause BSD license. See COPYING for details.
+*/
+#ifndef RBTREE_H_
+#define RBTREE_H_
+
+struct rbtree;
+
+
+struct rbnode {
+	void *key, *data;	/* user pointers; the tree stores them as given */
+	int red;	/* non-zero when the node is red */
+	struct rbnode *left, *right;	/* children, null when absent */
+	struct rbnode *next;	/* for iterator stack */
+};
+
+
+typedef void *(*rb_alloc_func_t)(size_t);	/* malloc-compatible node allocator */
+typedef void (*rb_free_func_t)(void*);	/* free-compatible node deallocator */
+
+typedef int (*rb_cmp_func_t)(const void*, const void*);	/* returns <0, 0 or >0 for (search key, node key) */
+typedef void (*rb_del_func_t)(struct rbnode*, void*);	/* called as (node, closure) before a node is freed */
+
+#define RB_KEY_ADDR		(rb_cmp_func_t)(0)	/* pseudo-comparator: rb_init orders keys by address */
+#define RB_KEY_INT		(rb_cmp_func_t)(1)	/* pseudo-comparator: rb_init treats keys as packed ints */
+#define RB_KEY_STRING	(rb_cmp_func_t)(3)	/* pseudo-comparator: rb_init compares keys with strcmp */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rbtree *rb_create(rb_cmp_func_t cmp_func);
+void rb_free(struct rbtree *rb);
+
+int rb_init(struct rbtree *rb, rb_cmp_func_t cmp_func);
+void rb_destroy(struct rbtree *rb);
+
+void rb_set_allocator(struct rbtree *rb, rb_alloc_func_t alloc, rb_free_func_t free);
+void rb_set_compare_func(struct rbtree *rb, rb_cmp_func_t func);
+void rb_set_delete_func(struct rbtree *rb, rb_del_func_t func, void *cls);
+/* TODO add user deep copy function */
+
+void rb_clear(struct rbtree *rb);
+int rb_copy(struct rbtree *dest, struct rbtree *src);
+
+int rb_size(struct rbtree *rb);
+
+int rb_insert(struct rbtree *rb, void *key, void *data);
+int rb_inserti(struct rbtree *rb, int key, void *data);
+
+int rb_delete(struct rbtree *rb, void *key);
+int rb_deletei(struct rbtree *rb, int key);
+
+struct rbnode *rb_find(struct rbtree *rb, void *key);
+struct rbnode *rb_findi(struct rbtree *rb, int key);
+
+void rb_foreach(struct rbtree *rb, void (*func)(struct rbnode*, void*), void *cls);
+
+struct rbnode *rb_root(struct rbtree *rb);
+
+void rb_begin(struct rbtree *rb);
+struct rbnode *rb_next(struct rbtree *rb);
+
+void *rb_node_key(struct rbnode *node);
+int rb_node_keyi(struct rbnode *node);
+void *rb_node_data(struct rbnode *node);
+void rb_node_setdata(struct rbnode *node, void *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif	/* RBTREE_H_ */
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..9d9d449
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include "util.h"
+#include "panic.h"
+
+uint32_t perf_start_count, perf_interval_count;
+
+void *malloc_nf_impl(size_t sz, const char *file, int line)
+{
+	void *mem = malloc(sz);	/* on failure, report the caller's file:line through panic() */
+	if(!mem) {
+		panic("%s:%d failed to allocate %lu bytes\n", file, line, (unsigned long)sz);
+	}
+	return mem;
+}
+
+void *calloc_nf_impl(size_t num, size_t sz, const char *file, int line)
+{
+	void *mem = calloc(num, sz);	/* on failure, report the caller's file:line through panic() */
+	if(!mem) {
+		panic("%s:%d failed to allocate %lu bytes\n", file, line, (unsigned long)(num * sz));
+	}
+	return mem;
+}
+
+void *realloc_nf_impl(void *p, size_t sz, const char *file, int line)	/* realloc that panics with the caller's file:line on failure */
+{
+	if(!(p = realloc(p, sz))) {	/* NOTE(review): any null result panics, including one a realloc may return for sz == 0 */
+		panic("%s:%d failed to realloc %lu bytes\n", file, line, (unsigned long)sz);
+	}
+	return p;
+}
+
+char *strdup_nf_impl(const char *s, const char *file, int line)
+{
+	/* like strdup(), but an allocation failure goes to panic() with the caller's location */
+	int nbytes = (int)strlen(s) + 1;	/* string plus terminator */
+	char *copy = malloc(nbytes);
+
+	if(!copy) {
+		panic("%s:%d failed to duplicate string\n", file, line);
+	}
+	memcpy(copy, s, nbytes);
+	return copy;
+}
diff --git a/src/util.h b/src/util.h
new file mode 100644
index 0000000..0ec72f3
--- /dev/null
+++ b/src/util.h
@@ -0,0 +1,112 @@
+#ifndef UTIL_H_
+#define UTIL_H_
+
+#include <stdlib.h>
+#include "inttypes.h"
+
+/* fast conversion of double -> 32bit int
+ * for details see:
+ *  - http://chrishecker.com/images/f/fb/Gdmfp.pdf
+ *  - http://stereopsis.com/FPU.html#convert
+ */
+static inline int32_t cround64(double val)	/* round val to the nearest 32-bit integer via the magic-number trick */
+{
+	union { double d; int32_t i[2]; } u = { val + 6755399441055744.0 };	/* pun through a union, not a pointer cast */
+	return u.i[0];	/* first 32-bit word; on little-endian targets this holds the rounded integer */
+}
+
+static inline float rsqrt(float x)	/* fast approximation of 1/sqrt(x) for positive x */
+{
+	union { float f; int32_t i; } u;	/* type-pun through a union instead of pointer casts (strict aliasing) */
+	float xhalf = x * 0.5f;
+	u.f = x;
+	u.i = 0x5f3759df - (u.i >> 1);	/* initial guess computed on the bit pattern */
+	u.f = u.f * (1.5f - xhalf * u.f * u.f);	/* one Newton-Raphson refinement step */
+	return u.f;
+}
+
+extern uint32_t perf_start_count, perf_interval_count;
+
+#define memset16(dest, val, count) /* store count 16-bit copies of val starting at dest */ \
+	do { \
+		uint32_t dummy1, dummy2, dummy3; \
+		asm volatile ( \
+			"cld\n\t" \
+			"test $1, %%ecx\n\t" /* odd count: plain word stores */ \
+			"jz 0f\n\t" \
+			"rep stosw\n\t" \
+			"jmp 1f\n\t" \
+			"0:\n\t" /* even count: store two words at a time */ \
+			"shr $1, %%ecx\n\t" \
+			"push %%ax\n\t" \
+			"shl $16, %%eax\n\t" \
+			"pop %%ax\n\t" /* eax now holds val in both halves */ \
+			"rep stosl\n\t" \
+			"1:\n\t"\
+			: "=D"(dummy1), "=c"(dummy2), "=a"(dummy3) /* eax is rewritten above, so it must be an output too */ \
+			: "0"(dest), "2"((uint32_t)(uint16_t)(val)), "1"(count) \
+			: "flags", "memory"); \
+	} while(0)
+
+#ifdef USE_MMX
+#define memcpy64(dest, src, count) asm volatile ( \
+	"0:\n\t" \
+	"movq (%1), %%mm0\n\t" \
+	"movq %%mm0, (%0)\n\t" \
+	"add $8, %1\n\t" \
+	"add $8, %0\n\t" \
+	"dec %2\n\t" \
+	"jnz 0b\n\t" \
+	"emms\n\t" \
+	:: "r"(dest), "r"(src), "r"(count) \
+	: "%mm0")
+#else
+#define memcpy64(dest, src, count)	memcpy(dest, src, (count) << 3)
+#endif
+
+#ifndef NO_PENTIUM
+#define perf_start()  asm volatile ( /* save the low 32 bits of the timestamp counter in perf_start_count */ \
+	"xor %%eax, %%eax\n" \
+	"cpuid\n" \
+	"rdtsc\n" \
+	"mov %%eax, %0\n" \
+	: "=m"(perf_start_count) \
+	:: "%eax", "%ebx", "%ecx", "%edx")
+
+#define perf_end() asm volatile ( /* perf_interval_count = current low 32 timestamp bits - perf_start_count */ \
+	"xor %%eax, %%eax\n" \
+	"cpuid\n" \
+	"rdtsc\n" \
+	"sub %1, %%eax\n" \
+	"mov %%eax, %0\n" \
+	: "=m"(perf_interval_count) \
+	: "m"(perf_start_count) \
+	: "%eax", "%ebx", "%ecx", "%edx")
+#endif	/* !def NO_PENTIUM */
+
+#define debug_break() \
+	asm volatile("int $3")	/* raise the breakpoint interrupt */
+
+#define halt() \
+	asm volatile("hlt")	/* execute a single hlt instruction */
+
+unsigned int get_cs(void);
+#define get_cpl()	((int)(get_cs() & 3))
+
+void get_msr(uint32_t msr, uint32_t *low, uint32_t *high);
+void set_msr(uint32_t msr, uint32_t low, uint32_t high);
+
+
+/* Non-failing versions of malloc/calloc/realloc/strdup. They never return 0; they
+ * call panic() on failure. Use the macros, don't call the *_impl functions.
+ */
+#define malloc_nf(sz)	malloc_nf_impl(sz, __FILE__, __LINE__)
+void *malloc_nf_impl(size_t sz, const char *file, int line);
+#define calloc_nf(n, sz)	calloc_nf_impl(n, sz, __FILE__, __LINE__)
+void *calloc_nf_impl(size_t num, size_t sz, const char *file, int line);
+#define realloc_nf(p, sz)	realloc_nf_impl(p, sz, __FILE__, __LINE__)
+void *realloc_nf_impl(void *p, size_t sz, const char *file, int line);
+#define strdup_nf(s)	strdup_nf_impl(s, __FILE__, __LINE__)
+char *strdup_nf_impl(const char *s, const char *file, int line);
+
+#endif	/* UTIL_H_ */
diff --git a/src/vga.h b/src/vga.h
new file mode 100644
index 0000000..046e905
--- /dev/null
+++ b/src/vga.h
@@ -0,0 +1,15 @@
+#ifndef VGA_H_
+#define VGA_H_
+
+void vga_setpal(int startidx, int count, unsigned char *cmap);
+void vga_setpalent(int idx, int r, int g, int b);
+
+#define wait_vsync() /* block until the next vertical retrace starts (status port 0x3da, bit 3) */ \
+	asm volatile ( \
+		"0: in %%dx, %%al\n\t" /* the port does not fit an 8-bit immediate, so it is passed in dx */ \
+		"test $8, %%al\n\t" "jnz 0b\n\t" /* still inside a retrace: wait for it to end */ \
+		"0: in %%dx, %%al\n\t" \
+		"test $8, %%al\n\t" "jz 0b\n\t" /* wait for the next retrace to begin */ \
+		:: "d"(0x3da) : "eax", "flags")
+
+#endif	/* VGA_H_ */
diff --git a/src/vga_s.asm b/src/vga_s.asm
new file mode 100644
index 0000000..1d8cded
--- /dev/null
+++ b/src/vga_s.asm
@@ -0,0 +1,51 @@
+	section .text
+	bits 32
+
+	global vga_setpal	; void vga_setpal(int startidx, int count, unsigned char *cmap), stack arguments
+vga_setpal:
+	mov ecx, [esp + 8]	; ecx = count
+	test ecx, ecx
+	jz .done	; nothing to do for count == 0
+	push ebp
+	mov ebp, esp
+	push ebx	; ebx is restored before returning
+	mov dx, 3c8h
+	mov eax, [ebp + 8]	; startidx
+	out dx, al	; write the first palette index to port 3c8h
+	inc dx	; dx = 3c9h
+	mov ebx, [ebp + 16]	; ebx = cmap, 3 bytes per entry
+.loop:	mov al, [ebx]
+	shr al, 2	; scale 0-255 down to 0-63
+	out dx, al
+	mov al, [ebx + 1]
+	shr al, 2
+	out dx, al
+	mov al, [ebx + 2]
+	shr al, 2
+	out dx, al
+	add ebx, 3	; next entry
+	dec ecx
+	jnz .loop
+	pop ebx
+	pop ebp
+.done:	ret
+
+
+	global vga_setpalent	; void vga_setpalent(int idx, int r, int g, int b), stack arguments
+vga_setpalent:
+	mov dx, 3c8h
+	mov eax, [esp + 4]	; idx
+	out dx, al	; write the palette index to port 3c8h
+	inc dx	; dx = 3c9h
+	mov eax, [esp + 8]	; r
+	shr eax, 2	; scale 0-255 down to 0-63
+	out dx, al
+	mov eax, [esp + 12]	; g
+	shr eax, 2
+	out dx, al
+	mov eax, [esp + 16]	; b
+	shr eax, 2
+	out dx, al
+	ret
+
+; vi:set ts=8 sts=8 sw=8 ft=nasm:
-- 
1.7.10.4