546 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Haxe
		
	
	
	
	
	
		
		
			
		
	
	
			546 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Haxe
		
	
	
	
	
	
|  | package kha.audio2.ogg.tools; | ||
|  | import haxe.ds.Vector; | ||
|  | 
 | ||
|  | /** | ||
|  |  * modified discrete cosine transform | ||
|  |  * @author shohei909 | ||
|  |  */ | ||
class Mdct {
    /**
     * Inverse MDCT computed through `buffer`.
     *
     * The first `n >> 1` entries of `buffer` are read as input; on return
     * `buffer[0 ... n)` has been overwritten with the result. A scratch vector
     * of `n >> 1` floats is allocated on every call.
     *
     * The algorithm follows "The use of multirate filter banks for coding of
     * high quality digital audio" (the original author's comments point to an
     * older, less optimised implementation for notes about bugs in that paper;
     * that implementation is not part of this class).
     *
     * @param buffer      spectral input / output samples, at least `n` long
     * @param n           transform size; assumed to be a power of two large
     *                    enough for the unrolled loops below (not validated
     *                    here - TODO confirm the minimum with the caller)
     * @param a           coefficient table used by the first pass, step 2 and step 3
     * @param b           coefficient table used by step 8
     * @param c           coefficient table used by step 7
     * @param bitReverse  source indices into `buffer` used by steps 4-6;
     *                    `n >> 3` entries are read
     */
    static public inline function inverseTransform(buffer:Vector<Float>, n:Int, a:Vector<Float>, b:Vector<Float>, c:Vector<Float>, bitReverse:Vector<Int>)
    {
        var half = n >> 1;
        var quarter = n >> 2;
        var eighth = n >> 3;

        // Scratch space. Every slot is written by the first pass before any
        // later step reads it.
        var work = new Vector<Float>(half);

        // Merged pass: copy and reflect the spectral data, plus step 0.
        //
        // Original author's note: the items added together during this step
        // turn out to be added to themselves, so a 'times 2' is missing here.
        // That propagates linearly to the end, where the numbers are 1/2 too
        // small and need to be compensated for (the compensation is not
        // visibly done in this function; presumably it lives in the tables or
        // in the caller - confirm).
        {
            var dst = half - 2;
            var tw = 0;
            var src = 0;

            // Even-indexed inputs, walking forwards.
            while (src != half) {
                var x0 = buffer[src];
                var x2 = buffer[src + 2];
                work[dst + 1] = x0 * a[tw] - x2 * a[tw + 1];
                work[dst] = x0 * a[tw + 1] + x2 * a[tw];
                dst -= 2;
                tw += 2;
                src += 4;
            }

            // Odd-indexed inputs, walking backwards and negated.
            src = half - 3;
            while (dst >= 0) {
                var negHi = -buffer[src + 2];
                var negLo = -buffer[src];
                work[dst + 1] = negHi * a[tw] - negLo * a[tw + 1];
                work[dst] = negHi * a[tw + 1] + negLo * a[tw];
                dst -= 2;
                tw += 2;
                src -= 4;
            }
        }

        // Step 2 (paper output is w): reads `work`, writes `buffer`.
        // The read and write positions are always equal, so one index pair
        // serves both vectors.
        {
            var tw = half - 8;
            var upper = quarter;
            var lower = 0;

            while (tw >= 0) {
                var dOdd:Float = work[upper + 1] - work[lower + 1];
                var dEven:Float = work[upper] - work[lower];
                buffer[upper + 1] = work[upper + 1] + work[lower + 1];
                buffer[upper] = work[upper] + work[lower];
                buffer[lower + 1] = dOdd * a[tw + 4] - dEven * a[tw + 5];
                buffer[lower] = dEven * a[tw + 4] + dOdd * a[tw + 5];

                dOdd = work[upper + 3] - work[lower + 3];
                dEven = work[upper + 2] - work[lower + 2];
                buffer[upper + 3] = work[upper + 3] + work[lower + 3];
                buffer[upper + 2] = work[upper + 2] + work[lower + 2];
                buffer[lower + 3] = dOdd * a[tw] - dEven * a[tw + 1];
                buffer[lower + 2] = dEven * a[tw] + dOdd * a[tw + 1];

                tw -= 8;
                upper += 4;
                lower += 4;
            }
        }

        // Step 3, performed in place on `buffer`.
        // Original author's note: ilog is off-by-one from normal definitions.
        var ld = MathTools.ilog(n) - 1;

        // Iteration 0 of step 3: two spans, a quarter of n apart.
        for (span in 0...2) {
            step3Iter0Loop(n >> 4, buffer, half - 1 - quarter * span, -(n >> 3), a);
        }

        // Iteration 1 of step 3: four spans, an eighth of n apart.
        for (span in 0...4) {
            step3InnerRLoop(n >> 5, buffer, half - 1 - eighth * span, -(n >> 4), a, 16);
        }

        // The remaining iterations exist in two loop nestings ("r" innermost
        // and "s" innermost); the switch happens halfway, at `split`.
        var split = (ld - 3) >> 1;

        for (l in 2...split) {
            var span = n >> (l + 2);
            var halfSpan = span >> 1;
            var groups = 1 << (l + 1);
            for (g in 0...groups) {
                step3InnerRLoop(n >> (l + 4), buffer, half - 1 - span * g, -halfSpan, a, 1 << (l + 3));
            }
        }

        for (l in split...(ld - 6)) {
            var span = n >> (l + 2);
            var stride = 1 << (l + 3);
            var halfSpan = span >> 1;
            var rounds = n >> (l + 6);
            var groups = 1 << (l + 1);
            var twBase = 0;
            var top = half - 1;
            for (round in 0...rounds) {
                step3InnerSLoop(groups, buffer, top, -halfSpan, a, twBase, stride, span);
                twBase += stride * 4;
                top -= 8;
            }
        }

        // Iterations ld-6, ld-5 and ld-4 interleaved. The original author
        // reports the gain comes from dropping the needless flops whose
        // constants are all 1 and 0.
        step3InnerSLoopLd654(n >> 5, buffer, half - 1, a, n);

        // Steps 4, 5 and 6: bit-reverse gather from `buffer` into `work`.
        // Cannot be done in place because of step 5.
        {
            var rev = 0;
            var low = quarter - 4;
            var high = half - 4;

            while (low >= 0) {
                var pick = bitReverse[rev];
                work[high + 3] = buffer[pick];
                work[high + 2] = buffer[pick + 1];
                work[low + 3] = buffer[pick + 2];
                work[low + 2] = buffer[pick + 3];

                pick = bitReverse[rev + 1];
                work[high + 1] = buffer[pick];
                work[high] = buffer[pick + 1];
                work[low + 1] = buffer[pick + 2];
                work[low] = buffer[pick + 3];

                low -= 4;
                high -= 4;
                rev += 2;
            }
        }

        // Step 7, in place on `work`: the two ends move towards each other.
        {
            var ci = 0;
            var front = 0;
            var back = half - 4;

            while (front < back) {
                step7Pair(work, front, back + 2, c, ci);
                step7Pair(work, front + 2, back, c, ci + 2);
                ci += 4;
                front += 4;
                back -= 4;
            }
        }

        // Step 8 + decode: pairs are produced from `work` and pushed straight
        // into the four output quarters of `buffer`, so no further pass is
        // needed. The `b` table is walked with the same index as `work`.
        {
            var src = half - 8;
            var q0 = 0;        // first quarter, filled forwards
            var q1 = half - 4; // second quarter, filled backwards (negated)
            var q2 = half;     // third quarter, filled forwards
            var q3 = n - 4;    // fourth quarter, filled backwards

            while (src >= 0) {
                for (lane in 0...4) {
                    var at = src + 6 - 2 * lane;
                    var first = work[at] * b[at + 1] - work[at + 1] * b[at];
                    var second = -work[at] * b[at] - work[at + 1] * b[at + 1];

                    buffer[q0 + lane] = first;
                    buffer[q1 + 3 - lane] = -first;
                    buffer[q2 + lane] = second;
                    buffer[q3 + 3 - lane] = second;
                }

                src -= 8;
                q0 += 4;
                q2 += 4;
                q1 -= 4;
                q3 -= 4;
            }
        }
    }

    /**
     * One butterfly on the value pairs `e[hi], e[hi - 1]` and `e[lo], e[lo - 1]`:
     * the upper pair receives the sums, the lower pair receives the differences
     * rotated by the coefficients `a[at]` and `a[at + 1]`.
     * The coefficients are read at the moment the lower pair is stored.
     */
    static inline function rotatePairFromTable(e:Vector<Float>, hi:Int, lo:Int, a:Vector<Float>, at:Int)
    {
        var dEven = e[hi] - e[lo];
        var dOdd = e[hi - 1] - e[lo - 1];
        e[hi] += e[lo];
        e[hi - 1] += e[lo - 1];
        e[lo] = dEven * a[at] - dOdd * a[at + 1];
        e[lo - 1] = dOdd * a[at] + dEven * a[at + 1];
    }

    /**
     * Same butterfly as `rotatePairFromTable`, but with the two coefficients
     * already loaded by the caller.
     */
    static inline function rotatePair(e:Vector<Float>, hi:Int, lo:Int, re:Float, im:Float)
    {
        var dEven = e[hi] - e[lo];
        var dOdd = e[hi - 1] - e[lo - 1];
        e[hi] = e[hi] + e[lo];
        e[hi - 1] = e[hi - 1] + e[lo - 1];
        e[lo] = dEven * re - dOdd * im;
        e[lo - 1] = dOdd * re + dEven * im;
    }

    /**
     * Half of one step 7 iteration: combines `v[lo], v[lo + 1]` with
     * `v[hi], v[hi + 1]` in place using `c[ci]` and `c[ci + 1]`.
     */
    static inline function step7Pair(v:Vector<Float>, lo:Int, hi:Int, c:Vector<Float>, ci:Int)
    {
        var dif = v[lo] - v[hi];
        var sum = v[lo + 1] + v[hi + 1];

        var r0 = c[ci + 1] * dif + c[ci] * sum;
        var r1 = c[ci + 1] * sum - c[ci] * dif;

        var s0 = v[lo] + v[hi];
        var s1 = v[lo + 1] - v[hi + 1];

        v[lo] = s0 + r0;
        v[lo + 1] = s1 + r1;
        v[hi] = s0 - r0;
        v[hi + 1] = r1 - s1;
    }

    /**
     * Iteration 0 of step 3. Runs `n >> 2` rounds of four butterflies each,
     * starting at `e[i_off]` / `e[i_off + k_off]`, moving down by 8 per round
     * and advancing through `a` by 8 per butterfly. This is exactly
     * `step3InnerRLoop` with a coefficient stride of 8.
     */
    static inline function step3Iter0Loop(n:Int, e:Vector<Float>, i_off:Int, k_off:Int, a:Vector<Float>)
    {
        step3InnerRLoop(n, e, i_off, k_off, a, 8);
    }

    /**
     * Step 3 with the "r" loop innermost. Runs `lim >> 2` rounds of four
     * butterflies each between `e[d0 ...]` and `e[d0 + k_off ...]` (walking
     * downwards), reading a new coefficient pair from `a` every `k1` entries,
     * starting at `a[0]`.
     */
    static inline function step3InnerRLoop(lim:Int, e:Vector<Float>, d0:Int, k_off:Int, a:Vector<Float>, k1:Int) {
        var tw = 0;
        var upper = d0;
        var lower = d0 + k_off;

        for (round in 0...(lim >> 2)) {
            rotatePairFromTable(e, upper, lower, a, tw);
            tw += k1;
            rotatePairFromTable(e, upper - 2, lower - 2, a, tw);
            tw += k1;
            rotatePairFromTable(e, upper - 4, lower - 4, a, tw);
            tw += k1;
            rotatePairFromTable(e, upper - 6, lower - 6, a, tw);
            tw += k1;

            upper -= 8;
            lower -= 8;
        }
    }

    /**
     * Step 3 with the "s" loop innermost. Four coefficient pairs are loaded
     * once from `a` (at `aOffset0`, spaced `aOffset1` apart) and reused for
     * `n` rounds of four butterflies; each round starts `k0` entries lower
     * than the previous one.
     */
    static inline function step3InnerSLoop(n:Int, e:Vector<Float>, i_off:Int, k_off:Int, a:Vector<Float>, aOffset0:Int, aOffset1:Int, k0:Int)
    {
        var re0 = a[aOffset0];
        var im0 = a[aOffset0 + 1];
        var re1 = a[aOffset0 + aOffset1];
        var im1 = a[aOffset0 + aOffset1 + 1];
        var re2 = a[aOffset0 + aOffset1 * 2];
        var im2 = a[aOffset0 + aOffset1 * 2 + 1];
        var re3 = a[aOffset0 + aOffset1 * 3];
        var im3 = a[aOffset0 + aOffset1 * 3 + 1];

        var upper = i_off;
        var lower = i_off + k_off;

        for (round in 0...n) {
            rotatePair(e, upper, lower, re0, im0);
            rotatePair(e, upper - 2, lower - 2, re1, im1);
            rotatePair(e, upper - 4, lower - 4, re2, im2);
            rotatePair(e, upper - 6, lower - 6, re3, im3);

            upper -= k0;
            lower -= k0;
        }
    }

    /**
     * Multiplication-free combination of the eight values
     * `e[zOffset] ... e[zOffset - 7]`, in place. Writing z0..z7 for the values
     * at `zOffset - 0 ... zOffset - 7`, the stored results are listed next to
     * each assignment.
     */
    static inline function iter54(e:Vector<Float>, zOffset:Int)
    {
        // Each slot is read exactly once before it is overwritten, so all
        // eight inputs can be loaded up front.
        var z0 = e[zOffset];
        var z1 = e[zOffset - 1];
        var z2 = e[zOffset - 2];
        var z3 = e[zOffset - 3];
        var z4 = e[zOffset - 4];
        var z5 = e[zOffset - 5];
        var z6 = e[zOffset - 6];
        var z7 = e[zOffset - 7];

        var sum04 = z0 + z4;
        var dif04 = z0 - z4;
        var sum26 = z2 + z6;
        var dif26 = z2 - z6;
        var sum15 = z1 + z5;
        var dif15 = z1 - z5;
        var sum37 = z3 + z7;
        var dif37 = z3 - z7;

        e[zOffset] = sum04 + sum26;     // z0 + z4 + z2 + z6
        e[zOffset - 1] = sum15 + sum37; // z1 + z5 + z3 + z7
        e[zOffset - 2] = sum04 - sum26; // z0 + z4 - z2 - z6
        e[zOffset - 3] = sum15 - sum37; // z1 + z5 - z3 - z7
        e[zOffset - 4] = dif04 + dif37; // z0 - z4 + z3 - z7
        e[zOffset - 5] = dif15 - dif26; // z1 - z5 - z2 + z6
        e[zOffset - 6] = dif04 - dif37; // z0 - z4 - z3 + z7
        e[zOffset - 7] = dif15 + dif26; // z1 - z5 + z2 - z6
    }

    /**
     * Last three step 3 iterations combined. Processes groups of 16 values
     * downwards from `e[i_off]` for as long as the group start stays above
     * `i_off - 16 * n`. Within a group the upper eight values receive the
     * sums and the lower eight the (partly scaled or swapped) differences;
     * both halves are then finished with `iter54`.
     * The only coefficient used is `a[baseN >> 3]`.
     */
    static inline function step3InnerSLoopLd654(n:Int, e:Vector<Float>, i_off:Int, a:Vector<Float>, baseN:Int)
    {
        var scale = a[baseN >> 3];
        var top = i_off;
        var stop = i_off - 16 * n;

        while (top > stop) {
            var mid = top - 8;

            // Lanes 0 and 1: plain sum and difference.
            var hi = e[top];
            var lo = e[mid];
            e[mid] = hi - lo;
            e[top] = hi + lo;

            hi = e[top - 1];
            lo = e[mid - 1];
            e[mid - 1] = hi - lo;
            e[top - 1] = hi + lo;

            // Lanes 2 and 3: differences are mixed and scaled.
            hi = e[top - 2];
            lo = e[mid - 2];
            var dEven = hi - lo;
            e[top - 2] = hi + lo;

            hi = e[top - 3];
            lo = e[mid - 3];
            var dOdd = hi - lo;
            e[top - 3] = hi + lo;

            e[mid - 2] = (dEven + dOdd) * scale;
            e[mid - 3] = (dOdd - dEven) * scale;

            // Lanes 4 and 5: differences are swapped; the even one is taken
            // in reverse order so that no unary negation is needed.
            hi = e[top - 4];
            lo = e[mid - 4];
            dEven = lo - hi;
            e[top - 4] = hi + lo;

            hi = e[top - 5];
            lo = e[mid - 5];
            dOdd = hi - lo;
            e[top - 5] = hi + lo;

            e[mid - 4] = dOdd;
            e[mid - 5] = dEven;

            // Lanes 6 and 7: reversed even difference again, then mixed and scaled.
            hi = e[top - 6];
            lo = e[mid - 6];
            dEven = lo - hi;
            e[top - 6] = hi + lo;

            hi = e[top - 7];
            lo = e[mid - 7];
            dOdd = hi - lo;
            e[top - 7] = hi + lo;

            e[mid - 6] = (dEven + dOdd) * scale;
            e[mid - 7] = (dEven - dOdd) * scale;

            iter54(e, top);
            iter54(e, mid);
            top -= 16;
        }
    }
}