/*
 * asm_3dnow.S - special (hopefully much faster) transformation functions for
 * K6-2 with 3DNow!
 *
 * by Jeff Epler
 *
 * This file is in the public domain.  Some functions are adapted from AMD's
 * publication "AMD-K6(R)-2 Processor Code Optimization".
 *
 * Some functions (those not yet 3dnow!ifed) are lifted directly from
 * asm_386.S.
 *
 * Done:
 *   asm_transform_points3_2d
 *   asm_transform_points3_2d_no_rot
 *   asm_transform_points3_3d
 *   asm_transform_points4_general
 *   asm_transform_points3_general         // Is there any more logic optimization here?
 *   asm_transform_points3_identity        // no need to change?
 *   asm_transform_points4_identity        // no need to change?
 *   asm_project_and_cliptest_general      // no need to change?
 *   asm_project_and_cliptest_identity     // no need to change?
 *   asm_project_and_cliptest_ortho        // no need to change?
 *   asm_project_and_cliptest_perspective  // no need to change?
 *   cliptest                              // No need to change?  No fp code
 *
 * Needs work:
 *   asm_transform_points4_2d
 *   asm_transform_points4_2d_no_rot
 *   asm_transform_points4_3d
 *   asm_transform_points4_ortho
 *   asm_transform_points4_perspective
 *   inverse_nofp
 *   gl_xform_normals_3fv                  // uses inverse_nofp
 */

/* TODO:
 * - Implement missing functions (all unimplemented transforms are mapped to
 *   points4_general currently)
 * - Put matbuf on the stack of functions which need it, so we're reentrant
 * - Make sure the instructions are optimally scheduled
 * - vbxform.c:viewport_map_vertices seems to be another good candidate for
 *   3dnow! optimization
 * - vbxform.c:shade_vertices, fog_vertices, transform_texcoords?
 * - shade.c ?
 */

/* Gains:

Using the simple test program "testasm.c", we see that transforms are
accurate and faster.
    gcc testasm.c src/asm_3dnow.s -DTHREE
to time 3dnow instructions on asm_transform_points3* functions,
    gcc testasm.c src/asm_386.s -DTHREE
to time 386 instructions on asm_transform_points3* functions,
    gcc testasm.c src/asm_3dnow.s
to time 3dnow instructions on asm_transform_points4* functions,
    gcc testasm.c src/asm_386.s
to time 386 instructions on asm_transform_points4* functions.

First, some sanity checks are performed for correctness, and then many
iterations are performed.  You can specify the transform type, the number of
vertices to transform at a time, and the number of times to run the
transformation function.

Since most of the 3dnow functions have large setup time compared to the 386
ones, large values of "n" will show greater gains.  The times below are on a
K6-2-266 (4*66) with 1M L2 cache, 100000 iterations.

Function          3dnow                   386                   ratio
                  n=1    n=10   n=100     n=1    n=10  n=100    n=10  n=100
points4_general   0.0297 0.0974 0.7788    0.6369 0.512 5.023    5.257 6.516
points3_general   0.0281 0.0955 0.7899    0.5809 0.466 4.567    4.880 5.782
points3_3d        0.0323 0.0941 0.7152    0.0446 0.329 3.229    3.496 4.514
points3_2d        0.0174 0.0449 0.3009    0.0310 0.172 1.587    3.831 5.274
points3_2d_norot  0.0154 0.0342 0.2064    0.0251 0.134 1.224    3.918 5.930

And here are times for a "simulated" P5-266, being a P5-133 on 50000
iterations:
                  386                     ratio
                  n=1    n=10   n=100     n=10  n=100
points3_3d        0.0255 0.395  2.167     4.198 3.030
points3_2d        0.0217 0.208  1.374     4.633 4.566
points3_2d_norot  0.0175 0.154  1.303     6.082 6.313

From a PII-300:
                  386                     ratio
                  n=1    n=10   n=100     n=10  n=100
points4_general   0.1829 0.945  2.227     9.702 2.914
points4_identity  0.0160 0.055  ????
points4_ortho     0.0554 0.396  0.871
points4_perspec   0.0965 0.395  0.931
points4_2d        0.1382 0.300  0.690
points4_2d_norot  0.2296 0.815  1.869

(Something must be wrong here, if the P5 seems slower on the x86 asm code!
Redo this until it is believable.  Is it the cache size on the non-MMX P5?)

So 3dnow! is never slower, and for n large, gains reach a factor of nearly 6x.
Of course, this doesn't translate into a full 6x speedup by 3dnow! since time
is spent in rasterizing, and outside of Mesa as well.

Just imagine, we're transforming about 14,000,000 vectors per second, or 19
cycles per vector.
*/

/* Alignment directive used before labels; pads with 0x90 (nop) bytes. */
#if !defined(ALIGN)
#define ALIGN .align 4, 0x90
#endif

/* Export a symbol. */
#if !defined(GLOBAL)
#define GLOBAL(sym) .globl sym
#endif

/*
 * Section selectors.  Each may be overridden from the command line.
 * Change RODATA to .data if your system doesn't have .rodata
 */
#if !defined(RODATA)
#if defined(FREEBSD) || defined(__EMX__)
#define RODATA .data
#else
#define RODATA .section .rodata
#endif
#endif

#if !defined(DATA)
#define DATA .data
#endif

#if !defined(TEXT)
#define TEXT .text
#endif

/* i-th 32-bit element of the source (%esi) / destination (%edi) vertex. */
#define S(i) i * 4(%esi)
#define D(i) i * 4(%edi)

/*
 * Matrix element addressing relative to %edx.  Both macros address
 * group * 16 + elem * 4; they differ only in argument order.  N() is used
 * on the transposed copy of the matrix, M() on the matrix as passed in.
 */
#define N(elem, group) group * 16 + elem * 4(%edx)
/* Not like for intel, we transpose first */
#define M(group, elem) group * 16 + elem * 4(%edx)

#define Vertex_Stride 16

/*
 * Hand-encoded 3DNow! register-to-register instruction:
 * 0x0f 0x0f, a ModRM byte (mod = 11, reg = dst, r/m = src), opcode suffix.
 */
#define TDN(opc, src, dst) \
        .byte 0x0f, 0x0f, 0xc0 | (dst<<3) | src, opc

/* Operands are MMX register numbers: PFxxx(src, dst). */
#define PFMUL(src, dst) TDN(0xb4, src, dst)
#define PFADD(src, dst) TDN(0x9e, src, dst)
#define PFACC(src, dst) TDN(0xae, src, dst)
#define FEMMS .byte 0x0f, 0x0e

        DATA

// For debugging, returns four floats and we can examine them outside.
// barbaric, ain't it?
#define OUT(a,b)                \
        movq    a, D(0);        \
        movq    b, D(2);        \
        FEMMS;                  \
        popl    %edi;           \
        popl    %esi;           \
        ret

/*
 * asm_dump_mmx_registers - debugging aid: store %mm0..%mm7 into mmxbuf
 * (8 bytes each, in register order).  Takes no arguments, changes no
 * registers.  This is code, so it is assembled into the text section
 * rather than the data section.
 */
        TEXT

GLOBAL(asm_dump_mmx_registers)
ALIGN
asm_dump_mmx_registers:
        movq    %mm0, mmxbuf
        movq    %mm1, mmxbuf+8
        movq    %mm2, mmxbuf+16
        movq    %mm3, mmxbuf+24
        movq    %mm4, mmxbuf+32
        movq    %mm5, mmxbuf+40
        movq    %mm6, mmxbuf+48
        movq    %mm7, mmxbuf+56
        ret

#define SEGFAULT movl 0, %eax

/*
 * Static scratch storage.  matbuf receives the transposed matrix and is read
 * with 8-byte movq loads, so keep it 8-byte aligned.  Because it is static,
 * the routines that use it are not reentrant (see the TODO list).
 */
        DATA

        .balign 8
matbuf: .space 64
mmxbuf: .space 64
One:    .float 1.0

        TEXT

/*
 * void asm_transform_points3_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * cdecl, arguments on the stack.  Saves/restores %esi, %edi; uses %eax,
 * %ecx, %edx and %mm0-%mm7; executes FEMMS before and after the MMX code.
 * Returns immediately when n == 0.
 *
 * For each of the n vertices: x, y and z results are full dot products of
 * (x, y, z, w) with the transposed matrix rows; the w result is m33 + m32.
 *
 * NOTE(review): S(3) is read and multiplied in as w, it is not forced to
 * 1.0 -- presumably callers store w = 1.0 for 3-component points; confirm.
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points3_3d)
ALIGN
_asm_transform_points3_3d:
#else
GLOBAL(asm_transform_points3_3d)
ALIGN
asm_transform_points3_3d:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    20(%esp), %edi          /* edi = old m; edx will = new m */
        movl    24(%esp), %esi          /* esi = s */

        testl   %ecx, %ecx
        leal    matbuf,%edx             /* lea leaves the flags alone */
        jnz     4f

        popl    %edi
        popl    %esi
        ret

4:
        // We're stuck transposing the matrix into a temporary area.  Sigh.
        movl    0(%edi), %eax
        nop
        movl    %eax, 0(%edx)
        movl    4(%edi), %eax
        movl    %eax, 16(%edx)
        movl    8(%edi), %eax
        movl    %eax, 32(%edx)
        movl    12(%edi), %eax
        movl    %eax, 48(%edx)

        movl    16(%edi), %eax
        movl    %eax, 4(%edx)
        movl    20(%edi), %eax
        movl    %eax, 20(%edx)
        movl    24(%edi), %eax
        movl    %eax, 36(%edx)
        movl    28(%edi), %eax
        movl    %eax, 52(%edx)

        movl    32(%edi), %eax
        movl    %eax, 8(%edx)
        movl    36(%edi), %eax
        movl    %eax, 24(%edx)
        movl    40(%edi), %eax
        movl    %eax, 40(%edx)
        movl    44(%edi), %eax
        movl    %eax, 56(%edx)

        movl    48(%edi), %eax
        movl    %eax, 12(%edx)
        movl    52(%edi), %eax
        movl    %eax, 28(%edx)
        movl    56(%edi), %eax
        movl    %eax, 44(%edx)
        movl    60(%edi), %eax
        movl    %eax, 60(%edx)

        movl    16(%esp), %edi          /* edi = d */

        FEMMS
        // These instructions pair, but take ages to load.  Best case is
        // probably about 7 cycles for the lot.
        movq    S(0), %mm6              // mm6 = y | x
        movq    S(2), %mm7              // mm7 = w | z
        movq    N(0,0), %mm0            // mm0 = m01 | m00
        movq    N(2,0), %mm1            // mm1 = m03 | m02

        decl    %ecx                    // ecx = vertices left after this one
        jz      3f                      // only one vertex: skip the loop

ALIGN
        // We may waste a decode cycle here, but otherwise the instructions
        // are aligned poorly.
1:
        // prefetch Vertex_Stride(%esi) // Prefetch next vertex
        // Vector decode, 2 cycles; but this primes the l1 cache for what
        // we need tomorrow.  Does this take the load unit, or what?

        movq    N(0,1), %mm2            // mm2 = m11 | m10
        PFMUL(6,0)                      // mm0 = y*m01 | x*m00
        // Pair, both issued in one cycle

        movq    N(2,1), %mm3            // mm3 = m13 | m12
        PFMUL(7,1)                      // mm1 = w*m03 | z*m02
        // Pair, both issued in one cycle

        movq    N(0,2), %mm4            // mm4 = m21 | m20
        PFMUL(6,2)                      // mm2 = y*m11 | x*m10
        // Pair, both issued in one cycle

        movq    N(2,2), %mm5            // mm5 = m23 | m22
        PFMUL(7,3)                      // mm3 = w*m13 | z*m12
        // Pair, both issued in one cycle

        PFADD(1,0)                      // mm0 = w*m03 + y*m01 | z*m02 + x*m00
        PFMUL(6,4)                      // mm4 = y*m21 | x*m20
        // Pair, because pfadd and pfmul can execute in opposing pipes

        PFADD(3,2)                      // mm2 = w*m13 + y*m11 | z*m12 + x*m10
        PFMUL(7,5)                      // mm5 = w*m23 | z*m22
        // "The %mm3 operand is forwarded from the 3Dnow! multiplier output"
        // just in time

        leal    S(4), %esi              // advance to the next source vertex
ALIGN
        // Paired with a nop, this keeps the instructions aligned nicely
        // We lose half a cycle to decode, but the next PFMUL would have been
        // badly aligned and long/vector decoded anyway

        // Final sum of XResult and YResult
        PFACC(2,0)                      // mm0 = YRes | XRes
        movq    N(2,3),%mm1             // mm1 = m33 | m32; we want WResult=1,
                                        // the PFACC below yields m32 + m33
        // Operand %mm5 is forwarded from the load unit, and %mm7 is forwarded
        // from the multiplier

        // First sum of ZResult
        PFADD(5,4)                      // mm4 = w*m23 + y*m21 | z*m22 + x*m20
        movq    %mm0, D(0)              // Store XResult and YResult

        // Final sum of ZResult and WResult
        PFACC(1,4)                      // mm4 = WRes | ZRes
        movq    S(0), %mm6              // Begin to load next vertex
        // Paired

        movq    S(2), %mm7              // Finish load of next vertex
        movq    %mm4, D(2)              // Store ZResult and WResult
        // Paired

        movq    N(0,0), %mm0            // mm0 = m01 | m00
        decl    %ecx
        // Paired

        movq    N(2,0), %mm1            // mm1 = m03 | m02
        leal    D(4), %edi              // advance destination (flags intact)
        // Paired

        jnz     1b                      // Does jnz wait for the result of decl?

ALIGN
        // We burn some time here, but we want our instructions aligned.
3:
        // We have only one vertex left, don't prefetch, don't reload matrix,
        // don't try to load next vertex.  Saves a bit of time, and doesn't
        // segfault this way!

        movq    N(0,1), %mm2            // mm2 = m11 | m10
        PFMUL(6,0)                      // mm0 = y*m01 | x*m00

        movq    N(2,1), %mm3            // mm3 = m13 | m12
        PFMUL(7,1)                      // mm1 = w*m03 | z*m02

        movq    N(0,2), %mm4            // mm4 = m21 | m20
        PFMUL(6,2)                      // mm2 = y*m11 | x*m10

        movq    N(2,2), %mm5            // mm5 = m23 | m22
        PFMUL(7,3)                      // mm3 = w*m13 | z*m12

        PFADD(1,0)                      // mm0 = w*m03 + y*m01 | z*m02 + x*m00
        PFMUL(6,4)                      // mm4 = y*m21 | x*m20

        PFADD(3,2)                      // mm2 = w*m13 + y*m11 | z*m12 + x*m10
        PFMUL(7,5)                      // mm5 = w*m23 | z*m22

        leal    S(4), %esi
ALIGN

        // Final sum of XResult and YResult
        PFACC(2,0)                      // mm0 = YRes | XRes
        movq    N(2,3),%mm1             // mm1 = m33 | m32 (see loop above)

        // First sum of ZResult
        PFADD(5,4)                      // mm4 = w*m23 + y*m21 | z*m22 + x*m20
        movq    %mm0, D(0)              // Store XResult and YResult

        // Final sum of ZResult and WResult
        PFACC(1,4)                      // mm4 = WRes | ZRes

        movq    %mm4, D(2)              // Store ZResult and WResult

        popl    %edi
        FEMMS
        popl    %esi
        ret

/*
 * void asm_transform_points3_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * Copies x, y, z of each of the n source vertices to d and stores the bit
 * pattern 0x3f800000 (1.0f) as w.  Integer code only; saves/restores %esi,
 * %edi, %ebx, %ebp; uses %eax, %ecx, %edx.  n == 0 is a no-op.
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points3_identity)
ALIGN
_asm_transform_points3_identity:
#else
GLOBAL(asm_transform_points3_identity)
ALIGN
asm_transform_points3_identity:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %esi          /* esi = s */

        pushl   %ebx
        pushl   %ebp

        testl   %ecx, %ecx
        jz      2f

        movl    $0x3f800000, %ebp       /* ebp = 1.0f, stored as w */

ALIGN
1:
        movl    S(0), %eax
        movl    S(1), %edx
        movl    S(2), %ebx
        leal    S(4), %esi

        movl    %eax, D(0)
        movl    %edx, D(1)
        movl    %ebx, D(2)
        movl    %ebp, D(3)

        decl    %ecx
        leal    D(4), %edi              /* lea keeps the flags from decl */

        jnz     1b

2:
        popl    %ebp
        popl    %ebx
        popl    %edi
        popl    %esi
        ret

/*
 * void asm_transform_points3_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 *
 * Per vertex: d.x = m[0]*x + m[12], d.y = m[5]*y + m[13]; z and w are
 * copied from the source unchanged.  Saves/restores %esi, %edi; uses %ecx,
 * %edx, %mm0, %mm1, %mm3, %mm4; FEMMS before and after.  n == 0 returns
 * immediately.
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points3_2d_no_rot)
ALIGN
_asm_transform_points3_2d_no_rot:
#else
GLOBAL(asm_transform_points3_2d_no_rot)
ALIGN
asm_transform_points3_2d_no_rot:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %edx          /* edx = m */
        movl    24(%esp), %esi          /* esi = s */

        test    %ecx, %ecx
        jnz     1f

        popl    %edi
        popl    %esi
        ret

1:
        FEMMS
        decl    %ecx                    // flags consumed by the jz below;
                                        // nothing in between touches them
ALIGN
        //0 16 32 48
        movd    0(%edx), %mm0           // mm0 = 0   | m00
        movd    20(%edx), %mm4          // mm4 = 0   | m11
        movq    48(%edx), %mm1          // mm1 = m13 | m03
        PFACC(4,0)                      // mm0 = m11 | m00
        movq    S(0), %mm3              // mm3 = y | x
        jz      3f                      // single vertex: go to the tail

ALIGN
2:
        PFMUL(0,3)                      // mm3 = m11*y | m00*x
        movq    S(2), %mm4              // mm4 = w | z
        PFADD(1,3)                      // mm3 = m11*y + m13 | m00*x + m03
        movq    %mm4, D(2)              // And store it
        movq    %mm3, D(0)              // And store it
        movq    S(4), %mm3              // Load half next vertex
        decl    %ecx                    // Loop
        leal    S(4), %esi              // Advance pointers
        leal    D(4), %edi
        jnz     2b

ALIGN
3:
        // Last vertex: same arithmetic, no look-ahead load.
        PFMUL(0,3)                      // mm3 = m11*y | m00*x
        movq    S(2), %mm4              // mm4 = w | z
        PFADD(1,3)                      // mm3 = m11*y + m13 | m00*x + m03
        movq    %mm4, D(2)              // And store it
        movq    %mm3, D(0)              // And store it
        popl    %edi
        popl    %esi
        FEMMS
        ret

/*
 * void asm_transform_points3_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * Per vertex: d.x = m[0]*x + m[4]*y + m[12], d.y = m[1]*x + m[5]*y + m[13];
 * z and w are copied from the source unchanged.  Saves/restores %esi, %edi;
 * uses %ecx, %edx, %mm0-%mm4, %mm6, %mm7; FEMMS before and after.
 * n == 0 returns immediately.
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points3_2d)
ALIGN
_asm_transform_points3_2d:
#else
GLOBAL(asm_transform_points3_2d)
ALIGN
asm_transform_points3_2d:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %edx          /* edx = m */
        movl    24(%esp), %esi          /* esi = s */

        test    %ecx, %ecx
        jnz     1f

        popl    %edi
        popl    %esi
        ret

1:
        FEMMS
        decl    %ecx                    // flags consumed by the jz below
ALIGN
        //0 16 32 48
        movd    0(%edx), %mm0           // mm0 = 0   | m00
        movd    16(%edx), %mm4          // mm4 = 0   | m01
        movd    4(%edx), %mm1           // mm1 = 0   | m10
        movd    20(%edx), %mm3          // mm3 = 0   | m11
        movq    48(%edx), %mm2          // mm2 = m13 | m03
        PFACC(4,0)                      // mm0 = m01 | m00
        PFACC(3,1)                      // mm1 = m11 | m10
        movq    S(0), %mm3              // mm3 = y | x
        jz      3f
        movq    %mm0, %mm6              // keep a copy: the loop destroys mm0
        movq    %mm1, %mm7

ALIGN
        // mm0 = m01 | m00
        // mm1 = m11 | m10
        // mm2 = m13 | m03
        // mm3 = y | x
        // mm4 = w | z
2:
        PFMUL(3,0)                      // mm0 = m01*y | m00*x
        movq    S(2), %mm4
        PFMUL(1,3)                      // mm3 = m11*y | m10*x
        movq    %mm4, D(2)              // And store it
        PFACC(3,0)                      // mm0 = m10*x + m11*y | m00*x + m01*y
        movq    S(4), %mm3              // Load half next vertex
        PFADD(2,0)                      // mm0 = m10*x + m11*y + m13 | ...
        movq    %mm0, D(0)              // And store it
        movq    %mm6, %mm0              // Reload matrix (mm1, mm2 intact)
        decl    %ecx                    // Loop
        leal    S(4), %esi              // Advance pointers
        leal    D(4), %edi
        jnz     2b

ALIGN
3:
        // Last vertex: same arithmetic, no look-ahead load or reload.
        PFMUL(3,0)                      // mm0 = m01*y | m00*x
        movq    S(2), %mm4
        PFMUL(1,3)                      // mm3 = m11*y | m10*x
        movq    %mm4, D(2)              // And store it
        PFACC(3,0)                      // mm0 = m10*x + m11*y | m00*x + m01*y
        PFADD(2,0)                      // mm0 = m10*x + m11*y + m13 | ...
        movq    %mm0, D(0)              // And store it
        popl    %edi
        popl    %esi
        FEMMS
        ret

/*
 * The following entry points all share one implementation: each label falls
 * through to asm_transform_points4_general below.
 */

/*
 * void asm_transform_points3_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points3_general)
ALIGN
_asm_transform_points3_general:
#else
GLOBAL(asm_transform_points3_general)
ALIGN
asm_transform_points3_general:
#endif

/*
 * void asm_transform_points4_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points4_3d)
ALIGN
_asm_transform_points4_3d:
#else
GLOBAL(asm_transform_points4_3d)
ALIGN
asm_transform_points4_3d:
#endif

/*
 * void asm_transform_points4_perspective( GLuint n, GLfloat d[][4],
 *                                         GLfloat m[16], GLfloat s[][4] );
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points4_perspective)
ALIGN
_asm_transform_points4_perspective:
#else
GLOBAL(asm_transform_points4_perspective)
ALIGN
asm_transform_points4_perspective:
#endif

/*
 * void asm_transform_points4_ortho( GLuint n, GLfloat d[][4],
 *                                   GLfloat m[16], GLfloat s[][4] );
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points4_ortho)
ALIGN
_asm_transform_points4_ortho:
#else
GLOBAL(asm_transform_points4_ortho)
ALIGN
asm_transform_points4_ortho:
#endif

/*
 * void asm_transform_points4_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points4_2d_no_rot)
ALIGN
_asm_transform_points4_2d_no_rot:
#else
GLOBAL(asm_transform_points4_2d_no_rot)
ALIGN
asm_transform_points4_2d_no_rot:
#endif

/*
 * void asm_transform_points4_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points4_2d)
ALIGN
_asm_transform_points4_2d:
#else
GLOBAL(asm_transform_points4_2d)
ALIGN
asm_transform_points4_2d:
#endif

/*
 * void asm_transform_points4_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 *
 * Full 4x4 transform of n four-component vertices.  The matrix is first
 * transposed into the static matbuf so that every movq fetches two factors
 * for the same result component.  Saves/restores %esi, %edi; uses %eax,
 * %ecx, %edx, %mm0-%mm7; FEMMS before and after.  n == 0 returns
 * immediately.  Not reentrant (static matbuf).
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points4_general)
ALIGN
_asm_transform_points4_general:
#else
GLOBAL(asm_transform_points4_general)
ALIGN
asm_transform_points4_general:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    20(%esp), %edi          /* edi = old m; edx will = new m */
        movl    24(%esp), %esi          /* esi = s */

        testl   %ecx, %ecx
        leal    matbuf, %edx            // Fix to use buffer on stack
        jnz     4f

        popl    %edi
        popl    %esi
        ret

4:
        // We're stuck transposing the matrix into a temporary area.  Sigh.
        movl    0(%edi), %eax           // D I O E F
        movl    %eax, 0(%edx)           //   D I O E F
        movl    4(%edi), %eax           //     D I O E F
        movl    %eax, 16(%edx)          //       D I O E F
        movl    8(%edi), %eax           // etc
        movl    %eax, 32(%edx)
        movl    12(%edi), %eax
        movl    %eax, 48(%edx)

        movl    16(%edi), %eax
        movl    %eax, 4(%edx)
        movl    20(%edi), %eax
        movl    %eax, 20(%edx)
        movl    24(%edi), %eax
        movl    %eax, 36(%edx)
        movl    28(%edi), %eax
        movl    %eax, 52(%edx)

        movl    32(%edi), %eax
        movl    %eax, 8(%edx)
        movl    36(%edi), %eax
        movl    %eax, 24(%edx)
        movl    40(%edi), %eax
        movl    %eax, 40(%edx)
        movl    44(%edi), %eax
        movl    %eax, 56(%edx)

        movl    48(%edi), %eax
        movl    %eax, 12(%edx)
        movl    52(%edi), %eax
        movl    %eax, 28(%edx)
        movl    56(%edi), %eax
        movl    %eax, 44(%edx)
        movl    60(%edi), %eax
        movl    %eax, 60(%edx)

        movl    16(%esp), %edi          /* edi = d */

        FEMMS
        // These instructions pair, but take ages to load.  Best case is
        // probably about 7 cycles for the lot.
        movq    S(0), %mm6              // mm6 = y | x
        movq    S(2), %mm7              // mm7 = w | z
        movq    N(0,0), %mm0            // mm0 = m01 | m00
        movq    N(2,0), %mm1            // mm1 = m03 | m02

        decl    %ecx                    // ecx = vertices left after this one
        jz      3f

ALIGN
        // We may waste a decode cycle here, but otherwise the instructions
        // are aligned poorly.
1:
        // prefetchw Vertex_Stride(%esi) // Prefetch next vertex
        // Vector decode, 2 cycles; but this primes the l1 cache for what
        // we need tomorrow.  Does this take the load unit, or what?

        movq    N(0,1), %mm2            // mm2 = m11 | m10
        PFMUL(6,0)                      // mm0 = y*m01 | x*m00
        // Pair, both issued in one cycle

        movq    N(2,1), %mm3            // mm3 = m13 | m12
        PFMUL(7,1)                      // mm1 = w*m03 | z*m02
        // Pair, both issued in one cycle

        movq    N(0,2), %mm4            // mm4 = m21 | m20
        PFMUL(6,2)                      // mm2 = y*m11 | x*m10
        // Pair, both issued in one cycle

        movq    N(2,2), %mm5            // mm5 = m23 | m22
        PFMUL(7,3)                      // mm3 = w*m13 | z*m12
        // Pair, both issued in one cycle

        PFADD(1,0)                      // mm0 = w*m03 + y*m01 | z*m02 + x*m00
        PFMUL(6,4)                      // mm4 = y*m21 | x*m20
        // Pair, because pfadd and pfmul can execute in opposing pipes

        PFADD(3,2)                      // mm2 = w*m13 + y*m11 | z*m12 + x*m10
        // "The %mm3 operand is forwarded from the 3Dnow! multiplier output"
        // just in time
        movq    N(0,3), %mm1            // mm1 = m31 | m30
        // Pairs with above pfadd

        PFMUL(7,5)                      // mm5 = w*m23 | z*m22
        // Operand %mm5 is forwarded from the load unit, and %mm7 is forwarded
        // from the multiplier
        movq    N(2,3), %mm3            // mm3 = m33 | m32
        // Pairs with above pfmul.  We've now loaded the last of the matrix.

        // Final sum of XResult and YResult
        PFACC(2,0)                      // mm0 = YRes | XRes
        PFMUL(6,1)                      // mm1 = y*m31 | x*m30
        // Paired

        // First sum of ZResult
        PFADD(5,4)                      // mm4 = w*m23 + y*m21 | z*m22 + x*m20
        PFMUL(7,3)                      // mm3 = w*m33 | z*m32
        // Paired

        // First sum of WResult
        PFADD(3,1)                      // mm1 = w*m33 + y*m31 | z*m32 + x*m30
        movq    S(4), %mm6              // Begin to load next vertex
        // Paired

        movq    S(6), %mm7              // Finish load of next vertex
        movq    %mm0, D(0)              // Store XResult and YResult
        // Paired, and execute at the same time since load and store units
        // are not the same

        // Final sum of ZResult and WResult
        PFACC(1,4)                      // mm4 = WRes | ZRes
        movq    N(0,0), %mm0            // mm0 = m01 | m00
        // Paired?  pfacc goes in meu, and movq in load.

        movq    N(2,0), %mm1            // mm1 = m03 | m02
        movq    %mm4, D(2)              // Store ZResult and WResult
        // Paired, and execute at the same time since load and store units
        // are not the same

        decl    %ecx
        leal    D(4), %edi
        // Paired?  leal goes in load, and decl goes in an alu

        leal    S(4), %esi
        // Paired with a nop, this keeps the instructions aligned nicely
        // We lose half a cycle to decode, but the next PFMUL would have been
        // badly aligned and long/vector decoded anyway

        jnz     1b                      // Does jnz wait for the result of decl?

ALIGN
        // We burn some time here, but we want our instructions aligned.
3:
        // Last vertex: same arithmetic, no look-ahead loads.
        movq    N(0,1), %mm2            // mm2 = m11 | m10
        PFMUL(6,0)                      // mm0 = y*m01 | x*m00

        movq    N(2,1), %mm3            // mm3 = m13 | m12
        PFMUL(7,1)                      // mm1 = w*m03 | z*m02

        movq    N(0,2), %mm4            // mm4 = m21 | m20
        PFMUL(6,2)                      // mm2 = y*m11 | x*m10

        movq    N(2,2), %mm5            // mm5 = m23 | m22
        PFMUL(7,3)                      // mm3 = w*m13 | z*m12

        PFADD(1,0)                      // mm0 = w*m03 + y*m01 | z*m02 + x*m00
        PFMUL(6,4)                      // mm4 = y*m21 | x*m20

        PFADD(3,2)                      // mm2 = w*m13 + y*m11 | z*m12 + x*m10
        movq    N(0,3), %mm1            // mm1 = m31 | m30

        PFMUL(7,5)                      // mm5 = w*m23 | z*m22
        movq    N(2,3), %mm3            // mm3 = m33 | m32
        // We've now loaded the last of the matrix.

        // Final sum of XResult and YResult
        PFACC(2,0)                      // mm0 = YRes | XRes
        PFMUL(6,1)                      // mm1 = y*m31 | x*m30

        // First sum of ZResult
        PFADD(5,4)                      // mm4 = w*m23 + y*m21 | z*m22 + x*m20
        PFMUL(7,3)                      // mm3 = w*m33 | z*m32

        // First sum of WResult
        PFADD(3,1)                      // mm1 = w*m33 + y*m31 | z*m32 + x*m30
        movq    %mm0, D(0)              // Store XResult and YResult

        // Final sum of ZResult and WResult
        PFACC(1,4)                      // mm4 = WRes | ZRes

        movq    %mm4, D(2)              // Store ZResult and WResult
        popl    %edi                    // Restore regs, since we wait on
                                        // the pfacc before we can movq
        popl    %esi
        FEMMS
        ret

/*
 * void asm_transform_points4_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * Copies n * 4 dwords from s to d with rep movsl (direction flag cleared
 * first).  Saves/restores %esi, %edi; uses %ecx.
 */
#ifdef FREEBSD
GLOBAL(_asm_transform_points4_identity)
ALIGN
_asm_transform_points4_identity:
#else
GLOBAL(asm_transform_points4_identity)
ALIGN
asm_transform_points4_identity:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %esi          /* esi = s */

        leal    (, %ecx, 4), %ecx       /* ecx = n * 4 dwords */

        cld
        rep; movsl

        popl    %edi
        popl    %esi
        ret

/*
 * Table for clip test.
 *
 *      bit6 = S(3) < 0
 *      bit5 = S(2) < 0
 *      bit4 = abs(S(2)) > abs(S(3))
 *      bit3 = S(1) < 0
 *      bit2 = abs(S(1)) > abs(S(3))
 *      bit1 = S(0) < 0
 *      bit0 = abs(S(0)) > abs(S(3))
 */

/* Vertex buffer clipping flags (from vb.h) */
#define CLIP_RIGHT_BIT          0x01
#define CLIP_LEFT_BIT           0x02
#define CLIP_TOP_BIT            0x04
#define CLIP_BOTTOM_BIT         0x08
#define CLIP_NEAR_BIT           0x10
#define CLIP_FAR_BIT            0x20
#define CLIP_USER_BIT           0x40
#define CLIP_ALL_BITS           0x3f

/* Each selector is all-ones when its bit of the table index is set, else 0. */
#define MAGN_X(i)       (~(((i) & 1) - 1))
#define SIGN_X(i)       (~((((i) >> 1) & 1) - 1))
#define MAGN_Y(i)       (~((((i) >> 2) & 1) - 1))
#define SIGN_Y(i)       (~((((i) >> 3) & 1) - 1))
#define MAGN_Z(i)       (~((((i) >> 4) & 1) - 1))
#define SIGN_Z(i)       (~((((i) >> 5) & 1) - 1))
#define SIGN_W(i)       (~((((i) >> 6) & 1) - 1))

#define CLIP_VALUE(i) \
        (CLIP_RIGHT_BIT \
         & ((~SIGN_X(i) & SIGN_W(i)) \
            | (~SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)) \
            | (SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)))) \
        | (CLIP_LEFT_BIT \
           & ((SIGN_X(i) & SIGN_W(i)) \
              | (~SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)) \
              | (SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)))) \
        | (CLIP_TOP_BIT \
           & ((~SIGN_Y(i) & SIGN_W(i)) \
              | (~SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)) \
              | (SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)))) \
        | (CLIP_BOTTOM_BIT \
           & ((SIGN_Y(i) & SIGN_W(i)) \
              | (~SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)) \
              | (SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)))) \
        | (CLIP_FAR_BIT \
           & ((~SIGN_Z(i) & SIGN_W(i)) \
              | (~SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i)) \
              | (SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)))) \
        | (CLIP_NEAR_BIT \
           & ((SIGN_Z(i) & SIGN_W(i)) \
              | (~SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)) \
              | (SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i))))

#define CLIP_VALUE8(i) \
        CLIP_VALUE(i + 0), CLIP_VALUE(i + 1), CLIP_VALUE(i + 2), CLIP_VALUE(i + 3), \
        CLIP_VALUE(i + 4), CLIP_VALUE(i + 5), CLIP_VALUE(i + 6), CLIP_VALUE(i + 7)

        RODATA

/* 128 entries, one per 7-bit index described above. */
clip_table:
        .byte CLIP_VALUE8(0x00)
        .byte CLIP_VALUE8(0x08)
        .byte CLIP_VALUE8(0x10)
        .byte CLIP_VALUE8(0x18)
        .byte CLIP_VALUE8(0x20)
        .byte CLIP_VALUE8(0x28)
        .byte CLIP_VALUE8(0x30)
        .byte CLIP_VALUE8(0x38)
        .byte CLIP_VALUE8(0x40)
        .byte CLIP_VALUE8(0x48)
        .byte CLIP_VALUE8(0x50)
        .byte CLIP_VALUE8(0x58)
        .byte CLIP_VALUE8(0x60)
        .byte CLIP_VALUE8(0x68)
        .byte CLIP_VALUE8(0x70)
        .byte CLIP_VALUE8(0x78)

        TEXT

/*
 * cliptest - internal helper (register calling convention, not exported)
 *
 * inputs:
 *      ecx = # points
 *      esi = points
 *      edi = clipmask[]
 *
 * inputs/outputs:
 *      al = ormask
 *      ah = andmask
 *
 * For every point the 7-bit table index is built from the raw float bit
 * patterns with integer add/adc/cmp only; the table byte is OR-ed into
 * clipmask[i] and %al and AND-ed into %ah.  Advances %esi and %edi, uses
 * %ecx and %edx, saves/restores %ebx and %ebp.  ecx == 0 returns at once.
 */
cliptest:
        testl   %ecx, %ecx
        jz      2f

        pushl   %ebp
        pushl   %ebx

#if defined(__ELF__) && defined(__PIC__)
        /* store pointer to clip_table on stack */
        call    3f
        addl    $_GLOBAL_OFFSET_TABLE_, %ebx
        movl    clip_table@GOT(%ebx), %ebx
        pushl   %ebx
        jmp     1f

3:
        /* store eip in ebx */
        movl    (%esp), %ebx
        ret
#endif

ALIGN
1:
        movl    S(3), %ebp
        movl    S(2), %ebx
        xorl    %edx, %edx              /* edx accumulates the table index */
        addl    %ebp, %ebp              /* %ebp = abs(S(3))*2 ; carry = sign of S(3) */
        adcl    %edx, %edx
        addl    %ebx, %ebx              /* %ebx = abs(S(2))*2 ; carry = sign of S(2) */
        adcl    %edx, %edx
        cmpl    %ebx, %ebp              /* carry = abs(S(2))*2 > abs(S(3))*2 */
        adcl    %edx, %edx

        movl    S(1), %ebx
        addl    %ebx, %ebx              /* %ebx = abs(S(1))*2 ; carry = sign of S(1) */
        adcl    %edx, %edx
        cmpl    %ebx, %ebp              /* carry = abs(S(1))*2 > abs(S(3))*2 */
        adcl    %edx, %edx

        movl    S(0), %ebx
        addl    %ebx, %ebx              /* %ebx = abs(S(0))*2 ; carry = sign of S(0) */
        adcl    %edx, %edx
        cmpl    %ebx, %ebp              /* carry = abs(S(0))*2 > abs(S(3))*2 */
        adcl    %edx, %edx

#if defined(__ELF__) && defined(__PIC__)
        movl    (%esp), %ebp            /* ebp = &clip_table (pushed above) */
        leal    S(4), %esi
        movb    (%edi), %bl
        movb    (%ebp, %edx, 1), %dl
#else
        leal    S(4), %esi
        movb    (%edi), %bl
        movb    clip_table(%edx), %dl
#endif

        orb     %dl, %bl
        orb     %dl, %al
        andb    %dl, %ah
        movb    %bl, (%edi)
        incl    %edi
        decl    %ecx
        jnz     1b

#if defined(__ELF__) && defined(__PIC__)
        addl    $4, %esp                /* drop the saved clip_table pointer */
#endif
        popl    %ebx
        popl    %ebp
2:
        ret

/*
 * void asm_project_and_cliptest_general( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                        GLfloat s[][4], GLubyte clipmask[],
 *                                        GLubyte *ormask, GLubyte *andmask );
 *
 * Transforms s into d with asm_transform_points4_general, then runs
 * cliptest over d, updating clipmask[], *ormask and *andmask.
 */
#ifdef FREEBSD
GLOBAL(_asm_project_and_cliptest_general)
ALIGN
_asm_project_and_cliptest_general:
#else
GLOBAL(asm_project_and_cliptest_general)
ALIGN
asm_project_and_cliptest_general:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %edx          /* edx = m */
        movl    24(%esp), %esi          /* esi = s */

        pushl   %esi
        pushl   %edx
        pushl   %edi
        pushl   %ecx
#ifdef FREEBSD
        call    _asm_transform_points4_general
#else
        call    asm_transform_points4_general
#endif
        addl    $16, %esp

        movl    32(%esp), %edi          /* ormask */
        movl    36(%esp), %esi          /* andmask */
        movb    (%edi), %al
        movb    (%esi), %ah

        movl    12(%esp), %ecx          /* ecx = n */
        movl    28(%esp), %edi          /* edi = clipmask */
        movl    16(%esp), %esi          /* esi = d */

        call    cliptest

        movl    32(%esp), %edi          /* ormask */
        movl    36(%esp), %esi          /* andmask */
        movb    %al, (%edi)
        movb    %ah, (%esi)

        popl    %edi
        popl    %esi
        ret

/*
 * void asm_project_and_cliptest_identity( GLuint n, GLfloat d[][4],
 *                                         GLfloat s[][4], GLubyte clipmask[],
 *                                         GLubyte *ormask, GLubyte *andmask );
 *
 * Copies s to d with asm_transform_points4_identity, then runs cliptest
 * over d.  Note there is no matrix argument, so the stack offsets of the
 * trailing arguments are 4 less than in the other variants.
 */
#ifdef FREEBSD
GLOBAL(_asm_project_and_cliptest_identity)
ALIGN
_asm_project_and_cliptest_identity:
#else
GLOBAL(asm_project_and_cliptest_identity)
ALIGN
asm_project_and_cliptest_identity:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %esi          /* esi = s */

        pushl   %esi
        pushl   %edi
        pushl   %ecx
#ifdef FREEBSD
        call    _asm_transform_points4_identity
#else
        call    asm_transform_points4_identity
#endif
        addl    $12, %esp

        movl    28(%esp), %edi          /* ormask */
        movl    32(%esp), %esi          /* andmask */
        movb    (%edi), %al
        movb    (%esi), %ah

        movl    12(%esp), %ecx          /* ecx = n */
        movl    24(%esp), %edi          /* edi = clipmask */
        movl    16(%esp), %esi          /* esi = d */

        call    cliptest

        movl    28(%esp), %edi          /* ormask */
        movl    32(%esp), %esi          /* andmask */
        movb    %al, (%edi)
        movb    %ah, (%esi)

        popl    %edi
        popl    %esi
        ret

/*
 * void asm_project_and_cliptest_ortho( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                      GLfloat s[][4], GLubyte clipmask[],
 *                                      GLubyte *ormask, GLubyte *andmask );
 *
 * Transforms with asm_transform_points4_ortho, then runs cliptest over d.
 */
#ifdef FREEBSD
GLOBAL(_asm_project_and_cliptest_ortho)
ALIGN
_asm_project_and_cliptest_ortho:
#else
GLOBAL(asm_project_and_cliptest_ortho)
ALIGN
asm_project_and_cliptest_ortho:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %edx          /* edx = m */
        movl    24(%esp), %esi          /* esi = s */

        pushl   %esi
        pushl   %edx
        pushl   %edi
        pushl   %ecx
#ifdef FREEBSD
        call    _asm_transform_points4_ortho
#else
        call    asm_transform_points4_ortho
#endif
        addl    $16, %esp

        movl    32(%esp), %edi          /* ormask */
        movl    36(%esp), %esi          /* andmask */
        movb    (%edi), %al
        movb    (%esi), %ah

        movl    12(%esp), %ecx          /* ecx = n */
        movl    28(%esp), %edi          /* edi = clipmask */
        movl    16(%esp), %esi          /* esi = d */

        call    cliptest

        movl    32(%esp), %edi          /* ormask */
        movl    36(%esp), %esi          /* andmask */
        movb    %al, (%edi)
        movb    %ah, (%esi)

        popl    %edi
        popl    %esi
        ret

/*
 * void asm_project_and_cliptest_perspective( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                            GLfloat s[][4], GLubyte clipmask[],
 *                                            GLubyte *ormask, GLubyte *andmask );
 *
 * Transforms with asm_transform_points4_perspective, then runs cliptest
 * over d.
 */
#ifdef FREEBSD
GLOBAL(_asm_project_and_cliptest_perspective)
ALIGN
_asm_project_and_cliptest_perspective:
#else
GLOBAL(asm_project_and_cliptest_perspective)
ALIGN
asm_project_and_cliptest_perspective:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %edx          /* edx = m */
        movl    24(%esp), %esi          /* esi = s */

        pushl   %esi
        pushl   %edx
        pushl   %edi
        pushl   %ecx
#ifdef FREEBSD
        call    _asm_transform_points4_perspective
#else
        call    asm_transform_points4_perspective
#endif
        addl    $16, %esp

        movl    32(%esp), %edi          /* ormask */
        movl    36(%esp), %esi          /* andmask */
        movb    (%edi), %al
        movb    (%esi), %ah

        movl    12(%esp), %ecx          /* ecx = n */
        movl    28(%esp), %edi          /* edi = clipmask */
        movl    16(%esp), %esi          /* esi = d */

        call    cliptest

        movl    32(%esp), %edi          /* ormask */
        movl    36(%esp), %esi          /* andmask */
        movb    %al, (%edi)
        movb    %ah, (%esi)

        popl    %edi
        popl    %esi
        ret

/*
 * unsigned int inverse_nofp( float f );
 *
 * Calculate the inverse of a float without using the FPU.
 * This function returns a float in eax, so its return
 * type should be 'int' when called from C (and converted
 * to float with pointer/union abuse).
 *
 * Uses %eax, %ecx, %edx.  The divisor always has bit 23 set, so the divl
 * cannot fault.  There is no special handling of a zero, denormal,
 * infinite or NaN exponent field.
 */
ALIGN
inverse_nofp:
        /* get mantissa in ecx */
        movl    4(%esp), %ecx
        andl    $0x7fffff, %ecx

        /* set implicit integer */
        orl     $0x800000, %ecx

        /* div 0x10000:0x00000000 by mantissa */
        xorl    %eax, %eax
        movl    $0x10000, %edx
        divl    %ecx

        /* round result */
        shrl    $1, %eax
        adcl    $0, %eax

        /* get exponent in ecx */
        movl    $0x7f800000, %ecx
        movl    4(%esp), %edx
        andl    %edx, %ecx

        /* negate exponent and decrement it */
        movl    $253 << 23, %edx
        subl    %ecx, %edx

        /* if bit 24 is set, shift and adjust exponent */
        testl   $0x1000000, %eax
        jz      1f
        shrl    $1, %eax
        addl    $1 << 23, %edx

        /* combine mantissa and exponent, then set sign */
1:
        andl    $0x7fffff, %eax
        movl    4(%esp), %ecx
        orl     %edx, %eax
        andl    $0x80000000, %ecx
        orl     %ecx, %eax
        ret

/*
 * void gl_xform_normals_3fv( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                            GLfloat s[][4], GLboolean normalize,
 *                            GLboolean rescale );
 *
 * x87 code.  Each result is d[j] = s[0]*M(j,0) + s[1]*M(j,1) + s[2]*M(j,2).
 * With rescale != 0 every result is also multiplied by
 * 1 / sqrt(M(0,2)^2 + M(1,2)^2 + M(2,2)^2).  With normalize != 0 a second
 * pass over d scales every normal by inverse_nofp(length).
 * Saves/restores %esi, %edi; n == 0 is a no-op.
 *
 * NOTE(review): the prototype above says [][4], but the code advances both
 * pointers by 3 floats per normal -- confirm against the C declaration.
 */
#ifdef FREEBSD
GLOBAL(_gl_xform_normals_3fv)
ALIGN
_gl_xform_normals_3fv:
#else
GLOBAL(gl_xform_normals_3fv)
ALIGN
gl_xform_normals_3fv:
#endif
        pushl   %esi
        pushl   %edi

        movl    12(%esp), %ecx          /* ecx = n */
        movl    16(%esp), %edi          /* edi = d */
        movl    20(%esp), %edx          /* edx = m */
        movl    24(%esp), %esi          /* esi = s */

        testl   %ecx, %ecx
        jz      2f

        /*
         * Check if rescale is needed
         */
        cmpl    $0, 32(%esp)
        jz      3f

        /*
         * Transform and rescale
         */
        flds    M(0, 2)
        fmuls   M(0, 2)
        flds    M(1, 2)
        fmuls   M(1, 2)
        flds    M(2, 2)
        fmuls   M(2, 2)

        fxch
        faddp   %st(2)
        faddp   %st(1)

        fsqrt
        fld1
        fdivp   %st, %st(1)             /* leaves 1/len: gas assembles AT&T
                                           fdivp as the reversed divide */

1:
        flds    S(0)
        fmuls   M(0, 0)
        flds    S(0)
        fmuls   M(1, 0)
        flds    S(0)
        fmuls   M(2, 0)

        flds    S(1)
        fmuls   M(0, 1)
        flds    S(1)
        fmuls   M(1, 1)
        flds    S(1)
        fmuls   M(2, 1)

        /*
         * st(5) = S(0) * M(0, 0)
         * st(4) = S(0) * M(1, 0)
         * st(3) = S(0) * M(2, 0)
         * st(2) = S(1) * M(0, 1)
         * st(1) = S(1) * M(1, 1)
         * st(0) = S(1) * M(2, 1)
         */

        fxch    %st(2)                  /* 2 1 0 3 4 5 */
        faddp   %st, %st(5)             /* 1 0 3 4 5 */
        faddp   %st, %st(3)             /* 0 3 4 5 */
        faddp   %st, %st(1)             /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1)
         * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1)
         * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1)
         */

        flds    S(2)
        fmuls   M(0, 2)
        flds    S(2)
        fmuls   M(1, 2)
        flds    S(2)
        fmuls   M(2, 2)

        /*
         * st(5) = S(0) * M(0, 0) + S(1) * M(0, 1)
         * st(4) = S(0) * M(1, 0) + S(1) * M(1, 1)
         * st(3) = S(0) * M(2, 0) + S(1) * M(2, 1)
         * st(2) = S(2) * M(0, 2)
         * st(1) = S(2) * M(1, 2)
         * st(0) = S(2) * M(2, 2)
         */

        fxch    %st(2)                  /* 2 1 0 3 4 5 */
        faddp   %st, %st(5)             /* 1 0 3 4 5 */
        faddp   %st, %st(3)             /* 0 3 4 5 */
        faddp   %st, %st(1)             /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1) + S(2) * M(0, 2)
         * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1) + S(2) * M(1, 2)
         * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1) + S(2) * M(2, 2)
         * st(3) = rescale factor computed before the loop
         */

        fld     %st(3)                  /* copy of the rescale factor */
        fmul    %st, %st(1)
        fmul    %st, %st(2)
        fmulp   %st(3)

        fstps   D(2)
        fstps   D(1)
        fstps   D(0)

        leal    S(3), %esi
        decl    %ecx
        leal    D(3), %edi              /* lea keeps the flags from decl */

        jnz     1b

        fstp    %st(0)                  /* discard the rescale factor */
        jmp     4f

        /*
         * Transform (no rescale)
         */

ALIGN
3:
        flds    S(0)
        fmuls   M(0, 0)
        flds    S(0)
        fmuls   M(1, 0)
        flds    S(0)
        fmuls   M(2, 0)

        flds    S(1)
        fmuls   M(0, 1)
        flds    S(1)
        fmuls   M(1, 1)
        flds    S(1)
        fmuls   M(2, 1)

        /*
         * st(5) = S(0) * M(0, 0)
         * st(4) = S(0) * M(1, 0)
         * st(3) = S(0) * M(2, 0)
         * st(2) = S(1) * M(0, 1)
         * st(1) = S(1) * M(1, 1)
         * st(0) = S(1) * M(2, 1)
         */

        fxch    %st(2)                  /* 2 1 0 3 4 5 */
        faddp   %st, %st(5)             /* 1 0 3 4 5 */
        faddp   %st, %st(3)             /* 0 3 4 5 */
        faddp   %st, %st(1)             /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1)
         * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1)
         * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1)
         */

        flds    S(2)
        fmuls   M(0, 2)
        flds    S(2)
        fmuls   M(1, 2)
        flds    S(2)
        fmuls   M(2, 2)

        /*
         * st(5) = S(0) * M(0, 0) + S(1) * M(0, 1)
         * st(4) = S(0) * M(1, 0) + S(1) * M(1, 1)
         * st(3) = S(0) * M(2, 0) + S(1) * M(2, 1)
         * st(2) = S(2) * M(0, 2)
         * st(1) = S(2) * M(1, 2)
         * st(0) = S(2) * M(2, 2)
         */

        fxch    %st(2)                  /* 2 1 0 3 4 5 */
        faddp   %st, %st(5)             /* 1 0 3 4 5 */
        faddp   %st, %st(3)             /* 0 3 4 5 */
        faddp   %st, %st(1)             /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1) + S(2) * M(0, 2)
         * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1) + S(2) * M(1, 2)
         * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1) + S(2) * M(2, 2)
         */

        fxch    %st(2)                  /* 2 1 0 */
        fstps   D(0)                    /* 1 0 */
        fstps   D(1)                    /* 0 */
        fstps   D(2)                    /* */

        leal    S(3), %esi
        decl    %ecx
        leal    D(3), %edi

        jnz     3b

        /*
         * Skip normalize if it isn't needed
         */
4:
        cmpl    $0, 28(%esp)
        jz      2f

        /* Normalize required */

        movl    12(%esp), %esi          /* esi = n */
        movl    16(%esp), %edi          /* edi = d */

        subl    $4, %esp                /* temp var for 1.0 / len */

        /*
         * (%esp) = length of first normal
         */
        flds    D(0)
        fmuls   D(0)
        flds    D(1)
        fmuls   D(1)
        flds    D(2)
        fmuls   D(2)
        fxch    %st(2)
        faddp   %st(1)
        faddp   %st(1)
        fsqrt
        fstps   (%esp)

        jmp     3f                      /* enter the loop at its test */

ALIGN
1:
        /* %st(0) = length of next normal */
        flds    D(3)
        fmuls   D(3)
        flds    D(4)
        fmuls   D(4)
        flds    D(5)
        fmuls   D(5)
        fxch    %st(2)
        faddp   %st(1)
        faddp   %st(1)
        fsqrt

        /*
         * inverse the length of the current normal, which is
         * already at (%esp).  This should overlap the prev
         * fsqrt nicely.  After the call pushes its return address the
         * temp is at 4(%esp), which is where inverse_nofp reads f.
         */
        call    inverse_nofp
        movl    %eax, (%esp)

        /* multiply normal by 1/len */
        flds    D(0)
        fmuls   (%esp)
        flds    D(1)
        fmuls   (%esp)
        flds    D(2)
        fmuls   (%esp)
        fxch    %st(3)                  /* bring next length to the top */
        fstps   (%esp)                  /* store length of next normal */
        fstps   D(1)
        fstps   D(0)
        fstps   D(2)

        leal    D(3), %edi

3:
        decl    %esi
        jnz     1b

        /* finish up the last normal */
        call    inverse_nofp
        movl    %eax, (%esp)
        flds    D(0)
        fmuls   (%esp)
        flds    D(1)
        fmuls   (%esp)
        flds    D(2)
        fmuls   (%esp)
        fxch    %st(2)
        fstps   D(0)
        fstps   D(1)
        fstps   D(2)

        addl    $4, %esp                /* release the temp */

2:
        popl    %edi
        popl    %esi
        ret

/* end */