#include "life.h" /* * Fast C life code. * * Conway's rules: if there are exactly 2 living neighbors, and the cell * is alive, it stays alive. If there are exactly 3 living neighbors * the cell becomes alive. Other numbers of living neighbors will cause * the cell to die. * * This fast code uses parallel circuit evaluation. We exploit bit-parallelism * of the machine word to do 32 cells per series of operations. Instead of a * single circuit in one pass, we compute 3 circuits to distinguish among the * states of 0, 1, 2, 3, and too-many neighbors. This is a two bit counter and * a sticky overflow bit. * * The pixel serial version requires -3x(shift mask add) branch set/clr- * or -3x(shift mask or) add loadword or- for table lookup, or * about 11 operations per pixel. We can use up to approximately 330 * operations per cycle and still come out ahead. */ void life_xform(unsigned int *grid, unsigned int *grid2) { int words_per_row = (width + 2 + 31) / 32; int r, c; unsigned int *src, *dst; unsigned int row_prev_l, row_prev, row_prev_r; unsigned int row_cur_l, row_cur, row_cur_r; unsigned int row_next_l, row_next, row_next_r; unsigned int bit0, bit1, overflow, carry; for (c = 0; c < words_per_row; ++c) { src = grid + c + words_per_row; dst = grid2+ c + words_per_row; row_cur_l = 0; row_cur = 0; row_cur_r = 0; row_next = *src; row_next_l = (row_next >> 1) | (src[ 1] << 31); row_next_r = (row_next << 1) | (src[-1] >> 31); for (r = height; --r >= 0; ) { row_prev_l = row_cur_l; row_prev = row_cur; row_prev_r = row_cur_r; row_cur_l = row_next_l; row_cur = row_next; row_cur_r = row_next_r; src += words_per_row; row_next = *src; row_next_l = (row_next >> 1) | (src[ 1] << 31); row_next_r = (row_next << 1) | (src[-1] >> 31); bit0 = row_prev_l ^ row_prev; bit1 = row_prev_l & row_prev; carry = bit0 & row_prev_r; overflow = bit1 & carry; bit1 = bit1 ^ carry; bit0 = bit0 ^ row_prev_r; carry = bit0 & row_cur_l; overflow = overflow | (bit1 & carry); bit1 = bit1 ^ carry; bit0 = bit0 ^ row_cur_l; carry = bit0 & row_cur_r; overflow = overflow | (bit1 & carry); bit1 = bit1 ^ carry; bit0 = bit0 ^ row_cur_r; carry = bit0 & row_next_l; overflow = overflow | (bit1 & carry); bit1 = bit1 ^ carry; bit0 = bit0 ^ row_next_l; carry = bit0 & row_next; overflow = overflow | (bit1 & carry); bit1 = bit1 ^ carry; bit0 = bit0 ^ row_next; carry = bit0 & row_next_r; overflow = overflow | (bit1 & carry); bit1 = bit1 ^ carry; bit0 = bit0 ^ row_next_r; carry = ~overflow & bit0 & bit1; /* 3 */ overflow = ~overflow & ~bit0 & bit1; /* 2 */ carry = (overflow & row_cur) | carry; *dst = carry; dst += words_per_row; } } /* clear out any garbage at the beginning/end of the row */ dst = grid2; for (r = height + 2; --r >= 0; ) { *dst &= ~1; dst += words_per_row; } dst = grid2 + words_per_row - 1; bit0 = (width + 1) & 0x1f; bit1 = (1 << bit0) - 1; for (r = height + 2; --r >= 0; ) { *dst &= bit1; dst += words_per_row; } }