# # Fast assembly code. # grid in a0, grid2 in a1. .data # mask: this table gives the mask to apply on the last word of each row # of the grid. Simply use something like # "result &= mask[width&31];" .align 2 mask: .word 0x1 .word 0x3 .word 0x7 .word 0xf .word 0x1f .word 0x3f .word 0x7f .word 0xff .word 0x1ff .word 0x3ff .word 0x7ff .word 0xfff .word 0x1fff .word 0x3fff .word 0x7fff .word 0xffff .word 0x1ffff .word 0x3ffff .word 0x7ffff .word 0xfffff .word 0x1fffff .word 0x3fffff .word 0x7fffff .word 0xffffff .word 0x1ffffff .word 0x3ffffff .word 0x7ffffff .word 0xfffffff .word 0x1fffffff .word 0x3fffffff .word 0x7fffffff # keep them all but the last one .word 0 # width&31==31: the last word contains only one bit, the zeroed edge # fate: this table gives the fate of a cell: # given a 9-bit neighborhood pattern (8 bits for the neighbors, the cell # being at bit 4) (so the table has 512 entries), this table gives # back what bit should be put in the next generation. fate: .align 2 .byte 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0 .byte 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0 .byte 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0 .byte 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0 .byte 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0 .byte 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0 .byte 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0 .byte 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0 .byte 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0 .byte 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0 .byte 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .text .globl life_xform life_xform: subu $sp,$sp,40 sw $ra,40($sp) # better regs use might avoid saving these # (meager savings) sw $s0,36($sp) sw $s1,32($sp) sw $s2,28($sp) sw $s3,24($sp) sw $s7,20($sp) sw $a0,16($sp) sw $a1,12($sp) sw $a2,8($sp) sw $a3,4($sp) # Various constants needed in the loop lw $s7, width # s7 = row_bytes = (width + 2 + 31) >> 5 * 4; addu $s7, $s7, 33 srl $s7, $s7, 5 sll $s7, $s7, 2 # != srl $s7, $s7, 3 (of course) lw $t0, height # a3 = grid + row_words*(height+1) addu $t0, $t0, 1 # a3 holds the upper limit for the loop mul $a3, $t0, $s7 addu $a3, $a3, $a0 addu $a0, $a0, $s7 # loop start values addu $a1, $a1, $s7 # a0 = grid + row_words, # a1 = grid2 + row_words b life_xform_loop_test life_xform_loop: # In this loop: # a0 will go from grid + row_words # to grid + row_words*(height+1) - 1 (inclusive) # by 1 word at a time # a1 will start grid2 + row_words, # progressing along with a0. # Note that everything outside the unrolled # loop is not too optimized # load the needed words in s0, s1, s2 # (this could be shared with the previous iteration) addu $t0, $a0, $s7 # s0 = above word lw $s0, ($t0) lw $s1, ($a0) # s1 = center word subu $t0, $a0, $s7 # s2 = below word lw $s2, ($t0) li $t9, 0 # res = 0 (the word to be computed) # Unroll 30 times this (b=1..30): # neighbors = ( (above >> (b-1)) & 7 ) << 6 | # ( (center >> (b-1)) & 7 ) << 3 | # ( (below >> (b-1)) & 7 ) ; # res |= ((word) fate[neighbors]) << b; # I generated it with unroll.c # Could shave an instruction on each by replacing the # lbu $t3, fate(..) by an add/lbu $t3, (some reg) # b = 1 (generated) sll $t0, $s0, 6 and $t0, $t0, 0x1c0 # 0x1c0 == 7 << 6 sll $t1, $s1, 3 and $t1, $t1, 0x38 # 0x38 == 7 << 3 and $t2, $s2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 1 or $t9, $t9, $t3 # b = 2 (generated) sll $t0, $s0, 5 and $t0, $t0, 0x1c0 sll $t1, $s1, 2 and $t1, $t1, 0x38 srl $t2, $s2, 1 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 2 or $t9, $t9, $t3 # b = 3 (generated) sll $t0, $s0, 4 and $t0, $t0, 0x1c0 sll $t1, $s1, 1 and $t1, $t1, 0x38 srl $t2, $s2, 2 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 3 or $t9, $t9, $t3 # b = 4 (generated) sll $t0, $s0, 3 and $t0, $t0, 0x1c0 and $t1, $s1, 0x38 srl $t2, $s2, 3 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 4 or $t9, $t9, $t3 # b = 5 (generated) sll $t0, $s0, 2 and $t0, $t0, 0x1c0 srl $t1, $s1, 1 and $t1, $t1, 0x38 srl $t2, $s2, 4 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 5 or $t9, $t9, $t3 # b = 6 (generated) sll $t0, $s0, 1 and $t0, $t0, 0x1c0 srl $t1, $s1, 2 and $t1, $t1, 0x38 srl $t2, $s2, 5 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 6 or $t9, $t9, $t3 # b = 7 (generated) and $t0, $s0, 0x1c0 srl $t1, $s1, 3 and $t1, $t1, 0x38 srl $t2, $s2, 6 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 7 or $t9, $t9, $t3 # b = 8 (generated) srl $t0, $s0, 1 and $t0, $t0, 0x1c0 srl $t1, $s1, 4 and $t1, $t1, 0x38 srl $t2, $s2, 7 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 8 or $t9, $t9, $t3 # b = 9 (generated) srl $t0, $s0, 2 and $t0, $t0, 0x1c0 srl $t1, $s1, 5 and $t1, $t1, 0x38 srl $t2, $s2, 8 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 9 or $t9, $t9, $t3 # b = 10 (generated) srl $t0, $s0, 3 and $t0, $t0, 0x1c0 srl $t1, $s1, 6 and $t1, $t1, 0x38 srl $t2, $s2, 9 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 10 or $t9, $t9, $t3 # b = 11 (generated) srl $t0, $s0, 4 and $t0, $t0, 0x1c0 srl $t1, $s1, 7 and $t1, $t1, 0x38 srl $t2, $s2, 10 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 11 or $t9, $t9, $t3 # b = 12 (generated) srl $t0, $s0, 5 and $t0, $t0, 0x1c0 srl $t1, $s1, 8 and $t1, $t1, 0x38 srl $t2, $s2, 11 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 12 or $t9, $t9, $t3 # b = 13 (generated) srl $t0, $s0, 6 and $t0, $t0, 0x1c0 srl $t1, $s1, 9 and $t1, $t1, 0x38 srl $t2, $s2, 12 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 13 or $t9, $t9, $t3 # b = 14 (generated) srl $t0, $s0, 7 and $t0, $t0, 0x1c0 srl $t1, $s1, 10 and $t1, $t1, 0x38 srl $t2, $s2, 13 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 14 or $t9, $t9, $t3 # b = 15 (generated) srl $t0, $s0, 8 and $t0, $t0, 0x1c0 srl $t1, $s1, 11 and $t1, $t1, 0x38 srl $t2, $s2, 14 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 15 or $t9, $t9, $t3 # b = 16 (generated) srl $t0, $s0, 9 and $t0, $t0, 0x1c0 srl $t1, $s1, 12 and $t1, $t1, 0x38 srl $t2, $s2, 15 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 16 or $t9, $t9, $t3 # b = 17 (generated) srl $t0, $s0, 10 and $t0, $t0, 0x1c0 srl $t1, $s1, 13 and $t1, $t1, 0x38 srl $t2, $s2, 16 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 17 or $t9, $t9, $t3 # b = 18 (generated) srl $t0, $s0, 11 and $t0, $t0, 0x1c0 srl $t1, $s1, 14 and $t1, $t1, 0x38 srl $t2, $s2, 17 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 18 or $t9, $t9, $t3 # b = 19 (generated) srl $t0, $s0, 12 and $t0, $t0, 0x1c0 srl $t1, $s1, 15 and $t1, $t1, 0x38 srl $t2, $s2, 18 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 19 or $t9, $t9, $t3 # b = 20 (generated) srl $t0, $s0, 13 and $t0, $t0, 0x1c0 srl $t1, $s1, 16 and $t1, $t1, 0x38 srl $t2, $s2, 19 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 20 or $t9, $t9, $t3 # b = 21 (generated) srl $t0, $s0, 14 and $t0, $t0, 0x1c0 srl $t1, $s1, 17 and $t1, $t1, 0x38 srl $t2, $s2, 20 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 21 or $t9, $t9, $t3 # b = 22 (generated) srl $t0, $s0, 15 and $t0, $t0, 0x1c0 srl $t1, $s1, 18 and $t1, $t1, 0x38 srl $t2, $s2, 21 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 22 or $t9, $t9, $t3 # b = 23 (generated) srl $t0, $s0, 16 and $t0, $t0, 0x1c0 srl $t1, $s1, 19 and $t1, $t1, 0x38 srl $t2, $s2, 22 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 23 or $t9, $t9, $t3 # b = 24 (generated) srl $t0, $s0, 17 and $t0, $t0, 0x1c0 srl $t1, $s1, 20 and $t1, $t1, 0x38 srl $t2, $s2, 23 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 24 or $t9, $t9, $t3 # b = 25 (generated) srl $t0, $s0, 18 and $t0, $t0, 0x1c0 srl $t1, $s1, 21 and $t1, $t1, 0x38 srl $t2, $s2, 24 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 25 or $t9, $t9, $t3 # b = 26 (generated) srl $t0, $s0, 19 and $t0, $t0, 0x1c0 srl $t1, $s1, 22 and $t1, $t1, 0x38 srl $t2, $s2, 25 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 26 or $t9, $t9, $t3 # b = 27 (generated) srl $t0, $s0, 20 and $t0, $t0, 0x1c0 srl $t1, $s1, 23 and $t1, $t1, 0x38 srl $t2, $s2, 26 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 27 or $t9, $t9, $t3 # b = 28 (generated) srl $t0, $s0, 21 and $t0, $t0, 0x1c0 srl $t1, $s1, 24 and $t1, $t1, 0x38 srl $t2, $s2, 27 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 28 or $t9, $t9, $t3 # b = 29 (generated) srl $t0, $s0, 22 and $t0, $t0, 0x1c0 srl $t1, $s1, 25 and $t1, $t1, 0x38 srl $t2, $s2, 28 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 29 or $t9, $t9, $t3 # b = 30 (generated) srl $t0, $s0, 23 and $t0, $t0, 0x1c0 srl $t1, $s1, 26 and $t1, $t1, 0x38 srl $t2, $s2, 29 and $t2, $t2, 0x7 or $t0, $t0, $t1 or $t0, $t0, $t2 lbu $t3, fate($t0) sll $t3, $t3, 30 or $t9, $t9, $t3 # end of generated file # we need to make special cases for the first and last cells. # since they use other words. # first cell: # we compute the neighbor bit pattern addu $t0, $a0, $s7 lw $t1, -4($t0) # above left neighbor in bit 6 of t1 and $t1, $t1, 0x80000000 srl $t1, $t1, 25 lw $t2, -4($a0) # center left in bit 3 of t2 and $t2, $t2, 0x80000000 srl $t2, $t2, 28 subu $t0, $a0, $s7 lw $t3, -4($t0) # below left in bit 0 of t3 and $t3, $t3, 0x80000000 srl $t3, $t3, 31 and $t4, $s0, 3 # above neighbors in bits 7+8 of t4 sll $t4, $t4, 7 and $t5, $s1, 3 # center in bits 4+5 of t5 sll $t5, $t5, 4 and $t6, $s2, 3 # below in bits 1+2 of t6 sll $t6, $t6, 1 # we now or everything to get the bit pattern or $t1, $t1, $t2 or $t3, $t3, $t4 or $t5, $t5, $t6 or $t1, $t1, $t3 or $t0, $t1, $t5 # and we fetch the fate and put in bit 0 of t9 lbu $t3, fate($t0) or $t9, $t9, $t3 # last cell: same idea. addu $t0, $a0, $s7 lw $t1, 4($t0) # above right in bit 8 and $t1, $t1, 1 sll $t1, $t1, 8 lw $t2, 4($a0) # center right in bit 5 and $t2, $t2, 1 sll $t2, $t2, 5 subu $t0, $a0, $s7 lw $t3, 4($t0) # below right in bit 2 and $t3, $t3, 1 sll $t3, $t3, 2 and $t4, $s0, 0xc0000000 # 3<<30 srl $t4, $t4, 24 # above in bits 6+7 and $t5, $s1, 0xc0000000 srl $t5, $t5, 27 # center in bits 3+4 and $t6, $s2, 0xc0000000 srl $t6, $t6, 30 # below in bits 0+1 or $t1, $t1, $t2 # or everything or $t3, $t3, $t4 or $t5, $t5, $t6 or $t1, $t1, $t3 or $t0, $t1, $t5 lbu $t3, fate($t0) # put the bit in bit 31 of t9 sll $t3, $t3, 31 or $t9, $t9, $t3 # Now write the result to memory sw $t9, ($a1) # that's all for this word :-) addu $a0, $a0, 4 addu $a1, $a1, 4 life_xform_loop_test: bltu $a0, $a3, life_xform_loop # b life_xform_end # We now mask the edge rows # (not really necessary, but well ...) # in this loop, a1 will go from grid2 to grid2+row_words-1, # by 1 word # t2 will progress the same starting from # grid2+(height+1)*row_words lw $a1, 12($sp) # a1 = grid2 add $t1, $a1, $s7 # t1 = upper bound = grid2 + row_words lw $t2, height # t2 = grid2+(height+1)*row_words addu $t2, $t2, 1 mul $t2, $t2, $s7 addu $t2, $t2, $a1 b life_xform_mask_test life_xform_mask_loop: sw $zero, ($a1) sw $zero, ($t2) addu $a1, $a1, 4 addu $t2, $t2, 4 life_xform_mask_test: bltu $a1, $t1, life_xform_mask_loop # Let's mask the first and last columns # In this loop, a1 will go from grid2 # to grid2+(height+1)*row_words inclusive, by row_words # t3 will progress along, starting from grid2+row_words-1 # first compute the masks to be applied lw $t0, width # t0 = mask[((word)width)&31] and $t0, $t0, 31 sll $t0, $t0, 2 lw $t0, mask($t0) li $t2, 0xfffffffe lw $a1, 12($sp) # a1 = grid2 lw $t1, height # t1=grid2+(height+2)*row_words (upper bound) addu $t1, $t1, 2 mul $t1, $t1, $s7 addu $t1, $t1, $a1 addu $t3, $a1, $s7 # t3 = grid2+row_words-1 subu $t3, $t3, 4 b life_xform_col_test life_xform_mask_col_loop: lw $t6, ($a1) # grid2[r*row_words] &= 0xfffffffe and $t6, $t6, $t2 sw $t6, ($a1) lw $t6, ($t3) # grid2[r*row_words+row_words-1] &= m and $t6, $t6, $t0 sw $t6, ($t3) addu $t3, $t3, $s7 addu $a1, $a1, $s7 life_xform_col_test: bltu $a1, $t1, life_xform_mask_col_loop life_xform_end: lw $ra,40($sp) lw $s0,36($sp) lw $s1,32($sp) lw $s2,28($sp) lw $s3,24($sp) lw $s7,20($sp) lw $a0,16($sp) lw $a1,12($sp) lw $a2,8($sp) lw $a3,4($sp) addu $sp,$sp,40 jr $ra