done
authorJohn Tsiombikas <nuclear@mutantstargoat.com>
Sat, 20 Aug 2016 05:43:45 +0000 (08:43 +0300)
committerJohn Tsiombikas <nuclear@mutantstargoat.com>
Sat, 20 Aug 2016 05:43:45 +0000 (08:43 +0300)
Makefile
src/main.c
src/timer.c [new file with mode: 0644]
src/timer.h [new file with mode: 0644]
src/tpool.c [new file with mode: 0644]
src/tpool.h [new file with mode: 0644]
src/tunnel.c

index cf6bb9d..b1ba5c1 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@ src = $(wildcard src/*.c)
 obj = $(src:.c=.o)
 bin = fbgfx
 
-CFLAGS = -pedantic -Wall -g
-LDFLAGS = -limago -lm
+CFLAGS = -pedantic -Wall -g -O3
+LDFLAGS = -limago -lm -lpthread
 
 $(bin): $(obj)
        $(CC) -o $@ $(obj) $(LDFLAGS)
index 8752567..8881963 100644 (file)
@@ -4,6 +4,9 @@
 #include "fbgfx.h"
 #include "fbevents.h"
 #include "tunnel.h"
+#include "timer.h"
+
+unsigned long start_msec, time_msec, num_frames;
 
 static void keyboard(int key, int pressed, void *cls);
 static void mouse(int bn, int pressed, int x, int y, void *cls);
@@ -18,11 +21,14 @@ static int quit;
 int main(void)
 {
        fbgfx_save_video_mode();
-       if(!(vmem = fbgfx_set_video_mode(800, 600, 16))) {
+       fbgfx_get_video_mode(&xsz, &ysz, &depth);
+
+       if(!(vmem = fbgfx_set_video_mode(xsz, ysz, 16))) {
                return 1;
        }
        fbgfx_get_video_mode(&xsz, &ysz, &depth);
        if(depth != 16) {
+               fprintf(stderr, "failed to set color depth: 16bpp\n");
                goto end;
        }
        if(fbev_init() == -1) {
@@ -36,17 +42,25 @@ int main(void)
                goto end;
        }
 
+       start_msec = get_time_msec();
        for(;;) {
                fbev_update();
                if(quit) break;
 
+               time_msec = get_time_msec() - start_msec;
+
                draw_tunnel(vmem);
+               ++num_frames;
        }
 
+       time_msec = get_time_msec() - start_msec;
 end:
        destroy_tunnel();
        fbev_shutdown();
        fbgfx_restore_video_mode();
+       if(num_frames && time_msec) {
+               printf("\ravg framerate: %.1f\n", (float)num_frames / ((float)time_msec / 1000.0));
+       }
        return 0;
 }
 
@@ -58,7 +72,8 @@ static void keyboard(int key, int pressed, void *cls)
        case 27:
        case 'q':
        case 'Q':
-               exit(0);
+               quit = 1;
+               break;
        }
 }
 
diff --git a/src/timer.c b/src/timer.c
new file mode 100644 (file)
index 0000000..6c65815
--- /dev/null
@@ -0,0 +1,71 @@
+#include "timer.h"
+
+#if defined(__APPLE__) && !defined(__unix__)
+#define __unix__
+#endif
+
+#ifdef __unix__
+#include <time.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#ifdef CLOCK_MONOTONIC
+/* Returns the number of milliseconds elapsed since the first call, as
+ * measured by CLOCK_MONOTONIC. The first call stores the reference
+ * timestamp and returns 0.
+ */
+unsigned long get_time_msec(void)
+{
+       static struct timespec epoch;
+       struct timespec now;
+       time_t dsec;
+       long dnsec;
+
+       clock_gettime(CLOCK_MONOTONIC, &now);
+       if(epoch.tv_sec == 0 && epoch.tv_nsec == 0) {
+               /* an all-zero reference means nothing has been stored yet */
+               epoch = now;
+               return 0;
+       }
+
+       dsec = now.tv_sec - epoch.tv_sec;
+       dnsec = now.tv_nsec - epoch.tv_nsec;
+       return dsec * 1000 + dnsec / 1000000;
+}
+#else  /* no fancy POSIX clocks, fallback to good'ol gettimeofday */
+/* gettimeofday based variant: returns the number of milliseconds elapsed
+ * since the first call. The first call stores the reference timestamp and
+ * returns 0.
+ */
+unsigned long get_time_msec(void)
+{
+       static struct timeval epoch;
+       struct timeval now;
+       time_t dsec;
+       suseconds_t dusec;
+
+       gettimeofday(&now, 0);
+       if(epoch.tv_sec == 0 && epoch.tv_usec == 0) {
+               /* an all-zero reference means nothing has been stored yet */
+               epoch = now;
+               return 0;
+       }
+
+       dsec = now.tv_sec - epoch.tv_sec;
+       dusec = now.tv_usec - epoch.tv_usec;
+       return dsec * 1000 + dusec / 1000;
+}
+#endif /* !posix clock */
+
+/* Suspends the calling thread for roughly msec milliseconds.
+ * nanosleep is used instead of usleep, because usleep is allowed to fail
+ * with EINVAL for intervals of one second or more, and msec * 1000 could
+ * overflow its argument. As with usleep, the sleep may end early if a
+ * signal is delivered.
+ */
+void sleep_msec(unsigned long msec)
+{
+       struct timespec ts;
+
+       ts.tv_sec = msec / 1000;
+       ts.tv_nsec = (long)(msec % 1000) * 1000000L;
+       nanosleep(&ts, 0);
+}
+#endif
+
+#ifdef WIN32
+#include <windows.h>
+#pragma comment(lib, "winmm.lib")
+
+/* WIN32 variant: returns the value of timeGetTime() unchanged.
+ * NOTE(review): unlike the unix variants in this file, this is not
+ * relative to the first call, so callers should only use differences.
+ */
+unsigned long get_time_msec(void)
+{
+       return timeGetTime();
+}
+
+/* WIN32 variant: passes msec straight to Sleep(). */
+void sleep_msec(unsigned long msec)
+{
+       Sleep(msec);
+}
+#endif
+
+/* Returns the get_time_msec() reading converted to seconds. */
+double get_time_sec(void)
+{
+       /* divide in double precision: a float divisor would first round the
+        * millisecond count to single precision, losing resolution as the
+        * count grows */
+       return get_time_msec() / 1000.0;
+}
+
+/* Sleeps for sec seconds by delegating to sleep_msec.
+ * Does nothing unless sec is greater than zero.
+ */
+void sleep_sec(double sec)
+{
+       if(!(sec > 0.0f)) {
+               return;
+       }
+       sleep_msec(sec * 1000.0f);
+}
diff --git a/src/timer.h b/src/timer.h
new file mode 100644 (file)
index 0000000..8322b82
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef TIMER_H_
+#define TIMER_H_
+
+unsigned long get_time_msec(void);
+void sleep_msec(unsigned long msec);
+
+double get_time_sec(void);
+void sleep_sec(double sec);
+
+#endif /* TIMER_H_ */
diff --git a/src/tpool.c b/src/tpool.c
new file mode 100644 (file)
index 0000000..4c1309b
--- /dev/null
@@ -0,0 +1,311 @@
+/* worker thread pool based on POSIX threads
+ * author: John Tsiombikas <nuclear@member.fsf.org>
+ * This code is public domain.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include "tpool.h"
+
+/* a single job in the work queue (singly linked list node) */
+struct work_item {
+       void *data;     /* argument passed to both callbacks */
+       tpool_callback work, done;      /* work runs first, then done if non-null */
+       struct work_item *next; /* next queued job, or null at the tail */
+};
+
+/* state shared between the pool's owner and its worker threads */
+struct thread_pool {
+       pthread_t *threads;     /* array of num_threads worker handles */
+       int num_threads;
+
+       int qsize;      /* number of jobs currently in the queue */
+       struct work_item *workq, *workq_tail;   /* head and tail of the job list */
+       pthread_mutex_t workq_mutex;    /* held while touching the queue and counters */
+       pthread_cond_t workq_condvar;   /* broadcast to wake up sleeping workers */
+
+       int nactive;    /* number of active workers (not sleeping) */
+
+       pthread_cond_t done_condvar;    /* broadcast by a worker after each finished job */
+
+       int should_quit;        /* set by tpool_destroy to make the workers exit */
+       int in_batch;   /* non-zero between tpool_begin_batch and tpool_end_batch */
+};
+
+static void *thread_func(void *args);
+
+/* Creates a thread pool and starts its worker threads.
+ * num_threads: number of workers to spawn; if it is <= 0 the processor
+ *   count reported by tpool_num_processors is used (at least 1).
+ * Returns the new pool, or 0 if an allocation or thread creation failed,
+ * in which case everything set up so far is released again.
+ */
+struct thread_pool *tpool_create(int num_threads)
+{
+       int i;
+       struct thread_pool *tpool;
+
+       if(!(tpool = calloc(1, sizeof *tpool))) {
+               return 0;
+       }
+       pthread_mutex_init(&tpool->workq_mutex, 0);
+       pthread_cond_init(&tpool->workq_condvar, 0);
+       pthread_cond_init(&tpool->done_condvar, 0);
+
+       if(num_threads <= 0) {
+               num_threads = tpool_num_processors();
+               if(num_threads <= 0) {
+                       /* detection failed; a single worker is still usable */
+                       num_threads = 1;
+               }
+       }
+       tpool->num_threads = num_threads;
+
+       if(!(tpool->threads = calloc(num_threads, sizeof *tpool->threads))) {
+               goto fail;
+       }
+       for(i=0; i<num_threads; i++) {
+               /* pthread_create reports failure with a non-zero error number,
+                * it never returns -1 */
+               if(pthread_create(tpool->threads + i, 0, thread_func, tpool) != 0) {
+                       /* stop and join only the workers which did start */
+                       pthread_mutex_lock(&tpool->workq_mutex);
+                       tpool->should_quit = 1;
+                       pthread_cond_broadcast(&tpool->workq_condvar);
+                       pthread_mutex_unlock(&tpool->workq_mutex);
+
+                       while(--i >= 0) {
+                               pthread_join(tpool->threads[i], 0);
+                       }
+                       goto fail;
+               }
+       }
+       return tpool;
+
+fail:
+       free(tpool->threads);
+       pthread_mutex_destroy(&tpool->workq_mutex);
+       pthread_cond_destroy(&tpool->workq_condvar);
+       pthread_cond_destroy(&tpool->done_condvar);
+       free(tpool);
+       return 0;
+}
+
+/* Shuts down and frees a thread pool.
+ * Jobs still in the queue are discarded, jobs already running are allowed
+ * to finish, then all workers are joined and every resource owned by the
+ * pool, including the pool structure itself, is released. The pointer must
+ * not be used afterwards. Passing a null pointer is a no-op.
+ * Progress is printed to stdout while waiting for the workers.
+ */
+void tpool_destroy(struct thread_pool *tpool)
+{
+       int i;
+       if(!tpool) return;
+
+       tpool_clear(tpool);
+
+       /* raise the quit flag with the queue mutex held, otherwise a worker
+        * which has just tested the flag but is not waiting yet would miss
+        * the broadcast and never wake up */
+       pthread_mutex_lock(&tpool->workq_mutex);
+       tpool->should_quit = 1;
+       pthread_cond_broadcast(&tpool->workq_condvar);
+       pthread_mutex_unlock(&tpool->workq_mutex);
+
+       if(tpool->threads) {
+               printf("thread_pool: waiting for %d worker threads to stop ", tpool->num_threads);
+               fflush(stdout);
+
+               for(i=0; i<tpool->num_threads; i++) {
+                       pthread_join(tpool->threads[i], 0);
+                       putchar('.');
+                       fflush(stdout);
+               }
+               putchar('\n');
+               free(tpool->threads);
+       }
+
+       pthread_mutex_destroy(&tpool->workq_mutex);
+       pthread_cond_destroy(&tpool->workq_condvar);
+       pthread_cond_destroy(&tpool->done_condvar);
+
+       /* the structure was allocated by tpool_create and is opaque to
+        * callers, so it has to be released here */
+       free(tpool);
+}
+
+/* Starts a batch: while in_batch is set, tpool_enqueue skips its wake-up
+ * broadcast.
+ */
+void tpool_begin_batch(struct thread_pool *tpool)
+{
+       tpool->in_batch = 1;
+}
+
+/* Ends a batch: clears in_batch and broadcasts workq_condvar once, to wake
+ * the workers up for everything enqueued in the meantime.
+ */
+void tpool_end_batch(struct thread_pool *tpool)
+{
+       tpool->in_batch = 0;
+       pthread_cond_broadcast(&tpool->workq_condvar);
+}
+
+/* Appends a job to the tail of the work queue.
+ * data: argument handed to both callbacks.
+ * work_func: called by a worker to perform the job.
+ * done_func: called by the same worker right after work_func, may be null.
+ * Unless a batch is in progress, the workers are woken up immediately.
+ * Returns 0 on success, -1 if the job could not be allocated.
+ */
+int tpool_enqueue(struct thread_pool *tpool, void *data,
+               tpool_callback work_func, tpool_callback done_func)
+{
+       struct work_item *item = malloc(sizeof *item);
+
+       if(!item) {
+               return -1;
+       }
+       item->data = data;
+       item->work = work_func;
+       item->done = done_func;
+       item->next = 0;
+
+       pthread_mutex_lock(&tpool->workq_mutex);
+       if(!tpool->workq) {
+               /* empty queue: the new item is also the head */
+               tpool->workq = item;
+       } else {
+               tpool->workq_tail->next = item;
+       }
+       tpool->workq_tail = item;
+       tpool->qsize++;
+       pthread_mutex_unlock(&tpool->workq_mutex);
+
+       if(tpool->in_batch) {
+               /* tpool_end_batch will do the wake-up */
+               return 0;
+       }
+       pthread_cond_broadcast(&tpool->workq_condvar);
+       return 0;
+}
+
+/* Discards every job still waiting in the queue.
+ * Jobs already picked up by a worker are not affected.
+ */
+void tpool_clear(struct thread_pool *tpool)
+{
+       struct work_item *item, *next;
+
+       pthread_mutex_lock(&tpool->workq_mutex);
+       for(item = tpool->workq; item; item = next) {
+               next = item->next;
+               free(item);
+       }
+       tpool->workq = 0;
+       tpool->workq_tail = 0;
+       tpool->qsize = 0;
+       pthread_mutex_unlock(&tpool->workq_mutex);
+}
+
+/* Returns the number of jobs waiting in the queue, read under the queue
+ * mutex.
+ */
+int tpool_queued_jobs(struct thread_pool *tpool)
+{
+       int queued;
+
+       pthread_mutex_lock(&tpool->workq_mutex);
+       queued = tpool->qsize;
+       pthread_mutex_unlock(&tpool->workq_mutex);
+
+       return queued;
+}
+
+/* Returns the number of workers currently running a job, read under the
+ * queue mutex.
+ */
+int tpool_active_jobs(struct thread_pool *tpool)
+{
+       int active;
+
+       pthread_mutex_lock(&tpool->workq_mutex);
+       active = tpool->nactive;
+       pthread_mutex_unlock(&tpool->workq_mutex);
+
+       return active;
+}
+
+/* Returns the number of unfinished jobs: those still queued plus those
+ * being run by a worker. Both counters are read under the queue mutex.
+ */
+int tpool_pending_jobs(struct thread_pool *tpool)
+{
+       int pending;
+
+       pthread_mutex_lock(&tpool->workq_mutex);
+       pending = tpool->qsize;
+       pending += tpool->nactive;
+       pthread_mutex_unlock(&tpool->workq_mutex);
+
+       return pending;
+}
+
+/* Blocks until the queue is empty and no worker is running a job. */
+void tpool_wait(struct thread_pool *tpool)
+{
+       pthread_mutex_lock(&tpool->workq_mutex);
+       for(;;) {
+               if(tpool->nactive == 0 && tpool->qsize == 0) {
+                       break;
+               }
+               /* woken up by a worker every time it finishes a job */
+               pthread_cond_wait(&tpool->done_condvar, &tpool->workq_mutex);
+       }
+       pthread_mutex_unlock(&tpool->workq_mutex);
+}
+
+/* Blocks until the number of pending jobs (queued plus running) has dropped
+ * to pending_target or below. This is the function declared in tpool.h.
+ */
+void tpool_wait_pending(struct thread_pool *tpool, int pending_target)
+{
+       pthread_mutex_lock(&tpool->workq_mutex);
+       while(tpool->qsize + tpool->nactive > pending_target) {
+               pthread_cond_wait(&tpool->done_condvar, &tpool->workq_mutex);
+       }
+       pthread_mutex_unlock(&tpool->workq_mutex);
+}
+
+/* Blocks until the number of pending jobs is lower than it was at the time
+ * of the call. Returns immediately if nothing is pending.
+ */
+void tpool_wait_one(struct thread_pool *tpool)
+{
+       int cur_pending;
+       pthread_mutex_lock(&tpool->workq_mutex);
+       /* the snapshot is taken under the same lock as the wait, so a job
+        * cannot complete unnoticed in between */
+       cur_pending = tpool->qsize + tpool->nactive;
+       if(cur_pending) {
+               while(tpool->qsize + tpool->nactive >= cur_pending) {
+                       pthread_cond_wait(&tpool->done_condvar, &tpool->workq_mutex);
+               }
+       }
+       pthread_mutex_unlock(&tpool->workq_mutex);
+}
+
+/* Waits for all pending jobs to complete, for up to timeout milliseconds.
+ * A negative timeout is treated as zero.
+ * Returns the number of milliseconds actually spent in the call.
+ */
+long tpool_timedwait(struct thread_pool *tpool, long timeout)
+{
+       struct timespec tout_ts;
+       struct timeval tv0, tv;
+       long nsec;
+
+       gettimeofday(&tv0, 0);
+
+       if(timeout < 0) {
+               timeout = 0;
+       }
+
+       /* absolute deadline; carry whole seconds out of the nanosecond part,
+        * because pthread_cond_timedwait rejects tv_nsec >= 1000000000 */
+       nsec = tv0.tv_usec * 1000L + (timeout % 1000) * 1000000L;
+       tout_ts.tv_sec = tv0.tv_sec + timeout / 1000 + nsec / 1000000000L;
+       tout_ts.tv_nsec = nsec % 1000000000L;
+
+       pthread_mutex_lock(&tpool->workq_mutex);
+       while(tpool->nactive || tpool->qsize) {
+               /* stop on ETIMEDOUT, and also on any other error, which would
+                * otherwise turn this loop into a busy spin */
+               if(pthread_cond_timedwait(&tpool->done_condvar,
+                                       &tpool->workq_mutex, &tout_ts) != 0) {
+                       break;
+               }
+       }
+       pthread_mutex_unlock(&tpool->workq_mutex);
+
+       gettimeofday(&tv, 0);
+       return (tv.tv_sec - tv0.tv_sec) * 1000 + (tv.tv_usec - tv0.tv_usec) / 1000;
+}
+
+/* Entry point of every worker thread.
+ * args: the thread_pool this worker belongs to.
+ * Repeatedly takes the job at the head of the queue, runs its work callback
+ * and then its done callback (if any) with the queue mutex released, frees
+ * the job, and notifies done_condvar. Sleeps on workq_condvar while the
+ * queue is empty and exits once should_quit is set. Always returns 0.
+ */
+static void *thread_func(void *args)
+{
+       struct thread_pool *tpool = args;
+
+       pthread_mutex_lock(&tpool->workq_mutex);
+       for(;;) {
+               struct work_item *job;
+
+               /* test the queue before sleeping: jobs enqueued before this
+                * thread got here have already had their broadcast, and
+                * waiting unconditionally would miss them */
+               while(!tpool->should_quit && !tpool->workq) {
+                       pthread_cond_wait(&tpool->workq_condvar, &tpool->workq_mutex);
+               }
+               if(tpool->should_quit) {
+                       break;
+               }
+
+               /* grab the first job */
+               job = tpool->workq;
+               tpool->workq = job->next;
+               if(!tpool->workq) {
+                       tpool->workq_tail = 0;
+               }
+               ++tpool->nactive;
+               --tpool->qsize;
+               pthread_mutex_unlock(&tpool->workq_mutex);
+
+               /* do the job */
+               job->work(job->data);
+               if(job->done) {
+                       job->done(job->data);
+               }
+               /* the item was allocated by tpool_enqueue and is no longer
+                * reachable from the queue, so it must be released here */
+               free(job);
+
+               pthread_mutex_lock(&tpool->workq_mutex);
+               --tpool->nactive;
+               /* notify everyone interested that we're done with this job */
+               pthread_cond_broadcast(&tpool->done_condvar);
+       }
+       pthread_mutex_unlock(&tpool->workq_mutex);
+
+       return 0;
+}
+
+
+/* The following highly platform-specific code detects the number
+ * of processors available in the system. It's used by the thread pool
+ * to autodetect how many threads to spawn.
+ * Currently works on: Linux, BSD, Darwin, and Windows.
+ */
+
+#if defined(__APPLE__) && defined(__MACH__)
+# ifndef __unix__
+#  define __unix__     1
+# endif        /* unix */
+# ifndef __bsd__
+#  define __bsd__      1
+# endif        /* bsd */
+#endif /* apple */
+
+#if defined(unix) || defined(__unix__)
+#include <unistd.h>
+
+# ifdef __bsd__
+#  include <sys/sysctl.h>
+# endif
+#endif
+
+#if defined(WIN32) || defined(__WIN32__)
+#include <windows.h>
+#endif
+
+
+/* Returns the number of processors reported by the operating system.
+ * Always returns at least 1: if the platform is not recognised or the
+ * query fails, 1 is returned instead of an error value.
+ */
+int tpool_num_processors(void)
+{
+#if defined(unix) || defined(__unix__)
+# if defined(__bsd__)
+       /* BSD systems provide the number of processors through sysctl */
+       int num, mib[] = {CTL_HW, HW_NCPU};
+       size_t len = sizeof num;
+
+       if(sysctl(mib, 2, &num, &len, 0, 0) == -1) {
+               return 1;
+       }
+       return num > 0 ? num : 1;
+
+# elif defined(__sgi)
+       /* SGI IRIX flavour of the _SC_NPROC_ONLN sysconf */
+       long num = sysconf(_SC_NPROC_ONLN);
+       return num > 0 ? (int)num : 1;
+# else
+       /* Linux (and others?) have the _SC_NPROCESSORS_ONLN sysconf */
+       long num = sysconf(_SC_NPROCESSORS_ONLN);
+       return num > 0 ? (int)num : 1;
+# endif        /* bsd/sgi/other */
+
+#elif defined(WIN32) || defined(__WIN32__)
+       /* under windows we need to call GetSystemInfo */
+       SYSTEM_INFO info;
+       GetSystemInfo(&info);
+       return info.dwNumberOfProcessors > 0 ? (int)info.dwNumberOfProcessors : 1;
+#else
+       /* unknown platform: assume a single processor */
+       return 1;
+#endif
+}
diff --git a/src/tpool.h b/src/tpool.h
new file mode 100644 (file)
index 0000000..2964c9b
--- /dev/null
@@ -0,0 +1,61 @@
+/* worker thread pool based on POSIX threads
+ * author: John Tsiombikas <nuclear@member.fsf.org>
+ * This code is public domain.
+ */
+#ifndef THREADPOOL_H_
+#define THREADPOOL_H_
+
+struct thread_pool;
+
+/* type of the function accepted as work or completion callback */
+typedef void (*tpool_callback)(void*);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* if num_threads == 0, auto-detect how many threads to spawn */
+struct thread_pool *tpool_create(int num_threads);
+void tpool_destroy(struct thread_pool *tpool);
+
+/* if begin_batch is called before an enqueue, the worker threads will not be
+ * signalled to start working until end_batch is called.
+ */
+void tpool_begin_batch(struct thread_pool *tpool);
+void tpool_end_batch(struct thread_pool *tpool);
+
+/* if enqueue is called without calling begin_batch first, it will immediately
+ * wake up the worker threads to start working on the enqueued item
+ */
+int tpool_enqueue(struct thread_pool *tpool, void *data,
+               tpool_callback work_func, tpool_callback done_func);
+/* clear the work queue. does not cancel any currently running jobs */
+void tpool_clear(struct thread_pool *tpool);
+
+/* returns the number of queued work items */
+int tpool_queued_jobs(struct thread_pool *tpool);
+/* returns the number of active (working) threads */
+int tpool_active_jobs(struct thread_pool *tpool);
+/* returns the number of pending jobs, both in queue and active */
+int tpool_pending_jobs(struct thread_pool *tpool);
+
+/* wait for all pending jobs to be completed */
+void tpool_wait(struct thread_pool *tpool);
+/* wait until the pending jobs are down to the target specified
+ * for example, to wait until a single job has been completed:
+ *   tpool_wait_pending(tpool, tpool_pending_jobs(tpool) - 1);
+ * this interface is slightly awkward to avoid race conditions. */
+void tpool_wait_pending(struct thread_pool *tpool, int pending_target);
+/* wait for all pending jobs to be completed for up to "timeout" milliseconds */
+long tpool_timedwait(struct thread_pool *tpool, long timeout);
+
+/* returns the number of processors on the system.
+ * individual cores in multi-core processors are counted as processors.
+ */
+int tpool_num_processors(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* THREADPOOL_H_ */
index a348431..85ca7b4 100644 (file)
 #include <stdlib.h>
 #include <math.h>
 #include <imago2.h>
+#include "tpool.h"
 #include "tunnel.h"
 
-static int xsz, ysz;
+#define TEX_FNAME      "data/grid.png"
+#define TEX_USCALE     4
+#define TEX_VSCALE     2
+
+#define USCALE 2
+#define VSCALE 1
+
+extern unsigned long time_msec;
+
+static void draw_tunnel_range(unsigned short *pixels, int starty, int num_lines);
+static int count_bits(unsigned int x);
+static int count_zeros(unsigned int x);
+
+static int xsz, ysz, vxsz, vysz;
 static unsigned int *tunnel_map;
+static unsigned char *tunnel_fog;
+
+static int tex_xsz, tex_ysz;
+static unsigned int *tex_pixels;
+static int tex_xshift, tex_yshift;
+static unsigned int tex_xmask, tex_ymask;
+
+static struct thread_pool *tpool;
 
 
 int init_tunnel(int x, int y)
 {
-       int i, j;
+       int i, j, n;
        unsigned int *tmap;
+       unsigned char *fog;
+       float aspect = (float)x / (float)y;
 
        xsz = x;
        ysz = y;
+       vxsz = xsz / USCALE;
+       vysz = ysz / VSCALE;
 
-       printf("precalculating tunnel map...\n");
-
-       if(!(tunnel_map = malloc(xsz * ysz * sizeof *tunnel_map))) {
+       if(!(tunnel_map = malloc(vxsz * vysz * sizeof *tunnel_map))) {
                fprintf(stderr, "failed to allocate tunnel map\n");
                return -1;
        }
+       if(!(tunnel_fog = malloc(vxsz * vysz))) {
+               fprintf(stderr, "failed to allocate tunnel fog map\n");
+               return -1;
+       }
+
        tmap = tunnel_map;
+       fog = tunnel_fog;
 
-       for(i=0; i<ysz; i++) {
-               float y = 2.0 * (float)i / (float)ysz - 0.5;
-               for(j=0; j<xsz; j++) {
-                       float x = 2.0 * (float)j / (float)xsz - 0.5;
+       for(i=0; i<vysz; i++) {
+               float y = 2.0 * (float)i / (float)vysz - 1.0;
+               for(j=0; j<vxsz; j++) {
+                       float x = aspect * (2.0 * (float)j / (float)vxsz - 1.0);
                        float tu = atan2(y, x) / M_PI * 0.5 + 0.5;
-                       float tv = sqrt(x*x + y*y);
+                       float d = sqrt(x * x + y * y);
+                       float tv = d == 0.0 ? 0.0 : 1.0 / d;
+
+                       int tx = (int)(tu * 65535.0 * TEX_USCALE) & 0xffff;
+                       int ty = (int)(tv * 65535.0 * TEX_VSCALE) & 0xffff;
 
-                       int tx = (int)(tu * 65535.0) & 0xffff;
-                       int ty = (int)(tv * 65535.0) & 0xffff;
+                       int f = (int)(d * 95.0);
 
                        *tmap++ = (tx << 16) | ty;
+                       *fog++ = f > 255 ? 255 : f;
                }
        }
 
+       if(!(tex_pixels = img_load_pixels(TEX_FNAME, &tex_xsz, &tex_ysz, IMG_FMT_RGBA32))) {
+               fprintf(stderr, "failed to load image " TEX_FNAME "\n");
+               return -1;
+       }
+       if((count_bits(tex_xsz) | count_bits(tex_ysz)) != 1) {
+               fprintf(stderr, "non-pow2 image (%dx%d)\n", tex_xsz, tex_ysz);
+               return -1;
+       }
+
+       n = count_zeros(tex_xsz);
+       for(i=0; i<n; i++) {
+               tex_xmask |= 1 << i;
+       }
+       tex_xshift = n;
+
+       n = count_zeros(tex_ysz);
+       for(i=0; i<n; i++) {
+               tex_ymask |= 1 << i;
+       }
+       tex_yshift = n;
+
+       if(!(tpool = tpool_create(0))) {
+               fprintf(stderr, "failed to create thread pool\n");
+               return -1;
+       }
+
        return 0;
 }
 
+/* Shuts down the worker pool and releases everything init_tunnel set up:
+ * the tunnel lookup map, the fog map and the texture pixels.
+ */
 void destroy_tunnel(void)
 {
+       tpool_destroy(tpool);
        free(tunnel_map);
+       free(tunnel_fog);
+       /* the texture comes from img_load_pixels and has to be handed back
+        * to the image library */
+       img_free_pixels(tex_pixels);
+}
+
+#define NUM_WORK_ITEMS 32
+
+/* per-slice arguments handed to the worker threads by draw_tunnel */
+static struct work {
+       unsigned short *pixels; /* destination buffer given to draw_tunnel */
+       int starty, num_lines;  /* first line of the slice and its line count */
+} work[NUM_WORK_ITEMS];
+
+/* thread pool work callback: cls points to a struct work, whose slice is
+ * rendered with draw_tunnel_range
+ */
+static void work_func(void *cls)
+{
+       struct work *w = (struct work*)cls;
+       draw_tunnel_range(w->pixels, w->starty, w->num_lines);
 }
 
+/* Renders one frame of the tunnel into pixels.
+ * The vysz lines are split into NUM_WORK_ITEMS slices which are rendered in
+ * parallel by the thread pool; the call returns when all of them are done.
+ */
 void draw_tunnel(unsigned short *pixels)
 {
-       int i, j, r, g, b;
-       unsigned int *tmap = tunnel_map;
+       int i, num_lines = vysz / NUM_WORK_ITEMS;
+       for(i=0; i<NUM_WORK_ITEMS; i++) {
+               work[i].pixels = pixels;
+               work[i].starty = i * num_lines;
+               /* the division above truncates, so the last slice also takes
+                * the leftover lines at the bottom */
+               work[i].num_lines = i < NUM_WORK_ITEMS - 1 ? num_lines : vysz - i * num_lines;
+
+               if(tpool_enqueue(tpool, work + i, work_func, 0) == -1) {
+                       /* could not be queued: render this slice right here
+                        * rather than leaving it undrawn */
+                       work_func(work + i);
+               }
+       }
+       tpool_wait(tpool);
+}
+
+#define PACK_RGB16(r, g, b) \
+       (((((r) >> 3) & 0x1f) << 11) | ((((g) >> 2) & 0x3f) << 5) | ((b) & 0x1f))
+
+static void draw_tunnel_range(unsigned short *pixels, int starty, int num_lines)
+{
+       int i, j, k, r, g, b;
+       unsigned int *tmap = tunnel_map + starty * vxsz;
+       unsigned char *fog = tunnel_fog + starty * vxsz;
+
+       long toffs = time_msec / 4;
+       pixels += starty * xsz * VSCALE;
 
-       for(i=0; i<ysz; i++) {
-               for(j=0; j<xsz; j++) {
-                       unsigned int tx = (*tmap >> 16) & 0xffff;
-                       unsigned int ty = *tmap & 0xffff;
+       for(i=0; i<num_lines; i++) {
+               for(j=0; j<vxsz; j++) {
+                       unsigned short *ptr;
+                       unsigned int col;
+                       unsigned int tx = (((*tmap >> 16) & 0xffff) << tex_xshift) >> 16;
+                       unsigned int ty = ((*tmap & 0xffff) << tex_yshift) >> 16;
                        ++tmap;
 
-                       r = tx >> 8;
-                       g = ty >> 8;
+                       tx += toffs;
+                       ty += toffs << 1;
+
+                       tx &= tex_xmask;
+                       ty &= tex_ymask;
 
-                       *pixels++ = ((((r >> 3) & 0x1f) << 11) |
-                                       (((g >> 2) & 0x3f) << 5));/* |
-                                       ((b >> 3) & 0x1f));*/
+                       col = tex_pixels[(ty << tex_xshift) + tx];
+                       r = col & 0xff;
+                       g = (col >> 8) & 0xff;
+                       b = (col >> 16) & 0xff;
+
+                       r = (r * *fog) >> 8;
+                       g = (g * *fog) >> 8;
+                       b = (b * *fog) >> 8;
+                       ++fog;
+
+                       col = ((((r >> 3) & 0x1f) << 11) | (((g >> 2) & 0x3f) << 5) | ((b >> 3) & 0x1f));
+
+                       ptr = pixels;
+                       for(k=0; k<VSCALE; k++) {
+                               switch(USCALE) {
+                               case 4:
+                                       ptr[3] = col;
+                               case 3:
+                                       ptr[2] = col;
+                               case 2:
+                                       ptr[1] = col;
+                               case 1:
+                                       *ptr = col;
+                               }
+                               ptr += xsz;
+                       }
+                       pixels += USCALE;
                }
+               pixels += xsz * (VSCALE - 1);
+       }
+}
+
+/* Returns how many of the low 32 bits of x are set. */
+static int count_bits(unsigned int x)
+{
+       int bit = 0, total = 0;
+
+       while(bit < 32) {
+               total += x & 1;
+               x >>= 1;
+               ++bit;
+       }
+       return total;
+}
+
+/* Returns the number of consecutive zero bits of x starting from bit 0,
+ * i.e. the position of the lowest set bit; returns 32 when x is 0.
+ */
+static int count_zeros(unsigned int x)
+{
+       int i, num = 0;
+       for(i=0; i<32; i++) {
+               if(x & 1) break;
+               ++num;
+               x >>= 1;
        }
+       return num;
 }