Computing Reciprocals of Fixed-Point Numbers by the Newton-Raphson Method — Division by Multiplication
* =========================================================================== *
* *
* Compute reciprocals of a Q.15 vector by Newton-Raphson method *
* y[i] = ym[i] * 2^ye[i] = 1 / x[i], -1 <= x[i] < 1 and x[i] != 0 *
* where ym is the mantissa vector and ye is the exponent vector *
* *
* C prototype: *
* void DSP_vrecip16n(short* x, // Input, Q.15 vector *
* short* ym, // Output, Q.15 vector *
* short* ye, // Output, int16 vector *
* int N); // Input, vector length *
* *
* Restriction: *
* (N % 4) == 0 and N >= 24 *
* *
* Performance: *
* 55+2.5*N (software pipelining enabled by -O2, CCS5.1) *
* *
* Relative error: *
* (-2^-16, 2^-16) *
* *
* Algorithm: *
* Input: V, abs(x[i]) normalized to [.5, 1) *
* Initialization: U0 = (V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits *
* Iteration 1: U1 = U0*(1-V*U0), ~1/(4*V), 11.481 bits *
* Iteration 2: U2 = 8*U1*(.5-V*U1), ~1/(2*V), 22.585 bits *
* Output in the format of Fl16 = Q.15(mant)|Int16(expt) *
* *
* =========================================================================== *
.sect ".text: _DSP_vrecip16n"
.global _DSP_vrecip16n
_DSP_vrecip16n: .cproc A_X, B_YM, A_YE, B_n
.no_mdep
.rega A_x0, A_x1, A_rr, A_nx0, A_nx1, A_nx10, A_ny10, A_v10, A_vc10
.rega A_vs1:A_vs0, A_vs10, A_u10, A_vu0, A_vu1, A_vu10, A_u0, A_u1
.rega A_y0, A_y1, A_y32:A_y10, A_vp10, A_x32:A_x10
.rega A_ss, A_cc, A_mm, A_ww, A_w, A_rnd
.regb B_x2, B_x3, B_rr, B_nx2, B_nx3, B_nx32, B_y32, B_v32, B_vc32
.regb B_vs3:B_vs2, B_vs32, B_u32, B_vu2, B_vu3, B_vu32, B_u2, B_u3
.regb B_y2, B_y3, B_ny32:B_ny10, B_vp32
.regb B_ss, B_cc, B_mm, B_ww, B_w, B_rnd, B_nn, B_X
.reg B_i, C10, C32, Cl10, Cl32
ADD 4, A_X, B_X
MVKL 0xB67BB67B, B_mm ; Q15(1.4256)
MVKH 0xB67BB67B, B_mm ; Q15(1.4256)
MV B_mm, A_mm
MVKL 0x60006000, A_cc
MVKH 0x60006000, A_cc
MV A_cc, B_cc
MVKL 0xFFF1FFF1, B_rr
MVKH 0xFFF1FFF1, B_rr
MV B_rr, A_rr
MVKL 0x80008000, A_ww
MVKH 0x80008000, A_ww
MV A_ww, B_ww
SHL A_ww, 15, A_w ; 0x40000000
SHL B_ww, 15, B_w ; 0x40000000
ROTL A_w, 17, A_rnd ; 0x00008000
ROTL B_w, 17, B_rnd ; 0x00008000
MVK 15, A_ss
MVK 15, B_ss
SHR B_n, 2, B_i
SUB B_i, 2, B_i
LOOP_vrecip: .trip 16
LDH *A_X++, A_x0
LDH *A_X++[3], A_x1
LDH *B_X++, B_x2
LDH *B_X++[3], B_x3
NORM A_x0, A_nx0
NORM A_x1, A_nx1
NORM B_x2, B_nx2
NORM B_x3, B_nx3
PACK2 A_nx1, A_nx0, A_nx10
PACK2 B_nx3, B_nx2, B_nx32
ADD2 A_rr, A_nx10, A_ny10
ADD2 B_rr, B_nx32, B_ny32
SSHVL A_x0, A_nx0, A_x0
SSHVL A_x1, A_nx1, A_x1
SSHVL B_x2, B_nx2, B_x2
SSHVL B_x3, B_nx3, B_x3
PACKH2 A_x1, A_x0, A_v10
PACKH2 B_x3, B_x2, B_v32
; Initial value, U0=(V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits
ABS2 A_v10, A_vp10
ABS2 B_v32, B_vp32
SUB2 A_vp10, A_cc, A_vc10
SUB2 B_vp32, B_cc, B_vc32
SUB2 A_mm, A_vp10, A_u10
SUB2 B_mm, B_vp32, B_u32
SMPY2 A_vc10, A_vc10, A_vs1:A_vs0
SMPY2 B_vc32, B_vc32, B_vs3:B_vs2
PACKH2 A_vs1, A_vs0, A_vs10
PACKH2 B_vs3, B_vs2, B_vs32
ADD2 A_vs10, A_u10, A_u10
ADD2 B_vs32, B_u32, B_u32
SHR2 A_v10, A_ss, C10
SHR2 B_v32, B_ss, C32
XOR C10, A_u10, A_u10
XOR C32, B_u32, B_u32
; Iteration 1, U1=U0*(1-V*U0), ~1/(4*V), 11.481 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
PACKH2 A_vu1, A_vu0, A_vu10
PACKH2 B_vu3, B_vu2, B_vu32
SUB2 A_ww, A_vu10, A_vu10
SUB2 B_ww, B_vu32, B_vu32
SMPY2 A_u10, A_vu10, A_u1:A_u0
SMPY2 B_u32, B_vu32, B_u3:B_u2
PACKH2 A_u1, A_u0, A_u10
PACKH2 B_u3, B_u2, B_u32
; Iteration 2, U2=8*U1*(1/2-V*U1), ~1/(2*V), 22.585 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
SUB A_w, A_vu0, A_vu0
SUB A_w, A_vu1, A_vu1
SUB B_w, B_vu2, B_vu2
SUB B_w, B_vu3, B_vu3
MPYHIR A_u0, A_vu0, A_u0
MPYHIR A_u1, A_vu1, A_u1
MPYHIR B_u2, B_vu2, B_u2
MPYHIR B_u3, B_vu3, B_u3
SSHL A_u0, 3, A_u0
SSHL A_u1, 3, A_u1
SSHL B_u2, 3, B_u2
SSHL B_u3, 3, B_u3
; Fl16 = Q15(mant)|int16(expt)
SADD A_u0, A_rnd, A_y0
SADD A_u1, A_rnd, A_y1
SADD B_u2, B_rnd, B_y2
SADD B_u3, B_rnd, B_y3
PACKH2 A_y1, A_y0, A_y10
PACKH2 B_y3, B_y2, B_y32
MV B_y32, A_y32
MV A_ny10, B_ny10
STDW A_y32:A_y10, *B_YM++
STDW B_ny32:B_ny10, *A_YE++
BDEC LOOP_vrecip, B_i
.endproc
Computing Square Root of A Vector of Fixed-Point Numbers
* =========================================================================== *
* *
* Compute square root of a Q.15 vector by 4th order polynomial fitting *
* y[i] = sqrt(x[i]), 0 <= x[i] < 1; *
* *
* C prototype: *
* void DSP_vsqrt_q15(short* x, short* y, int N); *
* *
* Performance: *
* O(4*N) or 4 cycles per number (software pipelining enabled by -O2) *
* *
* Error: *
* (-2^-15, 2^-15) *
* *
* =========================================================================== *
; chebfun, min max error
; 0xFFF1024A = Q31(-0.0005)
; 0x0046E0AB = Q31(0.0022)
; 0xFE764EE1 = Q31(-0.0120)
; 0x1277E288 = Q31(0.1443)
; 0x6ED9EBA1 = Q31(0.8660)
; polyfit, min sqr error
; 0xFFF1203D = Q31(-0.0005)
; 0x004581E7 = Q31(0.0021)
; 0xFE7645AF = Q31(-0.0120)
; 0x1278CF97 = Q31(0.1443)
; 0x6ED9E687 = Q31(0.8660)
SQRT_C4 .set 0xFFF1203D
SQRT_C3 .set 0x004581E7
SQRT_C2 .set 0xFE7645AF
SQRT_C1 .set 0x1278CF97
SQRT_C0 .set 0x6ED9E687
SQRT_S .set 0x5A82799A ; Q31(0.7071)
.sect ".text: _DSP_vsqrt_q15"
.global _DSP_vsqrt_q15
_DSP_vsqrt_q15: .cproc A_X, B_Y, A_n
.no_mdep
.rega A_C4, A_C3, A_C2, A_C1, A_C0, A_xx0, A_rnd, A_y0
.rega A_y0c4, A_y0c3, A_y0c2, A_y0c1, A_x0c3, A_x0c2, A_x0c1, A_x0c0
.rega A_S, A_e0, A_x0x, A_xm, A_y0l, A_y0h, A_y0s
.regb B_C4, B_C3, B_C2, B_C1, B_C0, B_xx1, B_rnd, B_y1, B_y10
.regb B_y1c4, B_y1c3, B_y1c2, B_y1c1, B_x1c3, B_x1c2, B_x1c1, B_x1c0
.regb B_S, B_e1, B_x1x, B_xm, B_y1l, B_y1h, B_y1s, B_X
.reg B_i, C0, C1
ADD 2, A_X, B_X
MVK 0x1, A_rnd
SHL A_rnd, 15, A_rnd
MV A_rnd, B_rnd
MVKL SQRT_C4, A_C4
MVKH SQRT_C4, A_C4
MVKL SQRT_C3, B_C3
MVKH SQRT_C3, B_C3
MV A_C4, B_C4
MV B_C3, A_C3
MVKL SQRT_C2, A_C2
MVKH SQRT_C2, A_C2
MVKL SQRT_C1, B_C1
MVKH SQRT_C1, B_C1
MV A_C2, B_C2
MV B_C1, A_C1
MVKL SQRT_C0, A_C0
MVKH SQRT_C0, A_C0
MVKL SQRT_S, B_S
MVKH SQRT_S, B_S
MV B_S, A_S
MV A_C0, B_C0
MVKL 0x6000, A_xm
SHL A_xm, 16, A_xm
MV A_xm, B_xm
SHR A_n, 1, B_i
SUB B_i, 2, B_i
LOOP_vsqrt: .trip 8
LDH.D1T1 *A_X++[2], A_xx0
LDH.D2T2 *B_X++[2], B_xx1
NORM.L1 A_xx0, A_e0
NORM.L2 B_xx1, B_e1
SSHVL.M1 A_xx0, A_e0, A_x0x
SSHVL.M2 B_xx1, B_e1, B_x1x
SUB.D1 A_e0, 16, A_e0
SUB.D2 B_e1, 16, B_e1
AND.D1 0x1, A_e0, C0
AND.D2 0x1, B_e1, C1
SHR.S1 A_e0, 1, A_e0
SHR.S2 B_e1, 1, B_e1
SUB.D1 A_x0x, A_xm, A_x0x
SUB.D2 B_x1x, B_xm, B_x1x
SHL.S1 A_x0x, 2, A_x0x
SHL.S2 B_x1x, 2, B_x1x
MPYHIR.M1 A_x0x, A_C4, A_y0c4
MPYHIR.M2 B_x1x, B_C4, B_y1c4
SADD.L1 A_y0c4, A_C3, A_x0c3
SADD.L2 B_y1c4, B_C3, B_x1c3
MPYHIR.M1 A_x0x, A_x0c3, A_y0c3
MPYHIR.M2 B_x1x, B_x1c3, B_y1c3
SADD.L1 A_y0c3, A_C2, A_x0c2
SADD.L2 B_y1c3, B_C2, B_x1c2
MPYHIR.M1 A_x0x, A_x0c2, A_y0c2
MPYHIR.M2 B_x1x, B_x1c2, B_y1c2
SADD.L1 A_y0c2, A_C1, A_x0c1
SADD.L2 B_y1c2, B_C1, B_x1c1
MPYHIR.M1 A_x0x, A_x0c1, A_y0c1
MPYHIR.M2 B_x1x, B_x1c1, B_y1c1
SADD.L1 A_y0c1, A_C0, A_x0c0
SADD.L2 B_y1c1, B_C0, B_x1c0
; A_S = B_S = 0x5A82799A ~= 0x5A820000 + 0x00008000
[C0] MPYHIR.M1 A_S, A_x0c0, A_y0h
[C1] MPYHIR.M2 B_S, B_x1c0, B_y1h
[C0] SHR A_x0c0, 16, A_y0l
[C1] SHR B_x1c0, 16, B_y1l
[C0] SADD.L1 A_y0h, A_y0l, A_x0c0
[C1] SADD.L2 B_y1h, B_y1l, B_x1c0
SHR.S1 A_x0c0, A_e0, A_y0s
SHR.S2 B_x1c0, B_e1, B_y1s
SADD.L1 A_y0s, A_rnd, A_y0
SADD.L2 B_y1s, B_rnd, B_y1
PACKH2.S2X B_y1, A_y0, B_y10
STW.D2T2 B_y10, *B_Y++
BDEC LOOP_vsqrt, B_i
.endproc
Computing Reciprocals of Fixed-Point Numbers by the Newton-Raphson Method — Division by Multiplication
* =========================================================================== *
* *
* Compute reciprocals of a Q.15 vector by Newton-Raphson method *
* y[i] = ym[i] * 2^ye[i] = 1 / x[i], -1 <= x[i] < 1 and x[i] != 0 *
* where ym is the mantissa vector and ye is the exponent vector *
* *
* C prototype: *
* void DSP_vrecip16n(short* x, // Input, Q.15 vector *
* short* ym, // Output, Q.15 vector *
* short* ye, // Output, int16 vector *
* int N); // Input, vector length *
* *
* Restriction: *
* (N % 4) == 0 and N >= 24 *
* *
* Performance: *
* 55+2.5*N (software pipelining enabled by -O2, CCS5.1) *
* *
* Relative error: *
* (-2^-16, 2^-16) *
* *
* Algorithm: *
* Input: V, abs(x[i]) normalized to [.5, 1) *
* Initialization: U0 = (V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits *
* Iteration 1: U1 = U0*(1-V*U0), ~1/(4*V), 11.481 bits *
* Iteration 2: U2 = 8*U1*(.5-V*U1), ~1/(2*V), 22.585 bits *
* Output in the format of Fl16 = Q.15(mant)|Int16(expt) *
* *
* =========================================================================== *
.sect ".text: _DSP_vrecip16n"
.global _DSP_vrecip16n
_DSP_vrecip16n: .cproc A_X, B_YM, A_YE, B_n
.no_mdep
.rega A_x0, A_x1, A_rr, A_nx0, A_nx1, A_nx10, A_ny10, A_v10, A_vc10
.rega A_vs1:A_vs0, A_vs10, A_u10, A_vu0, A_vu1, A_vu10, A_u0, A_u1
.rega A_y0, A_y1, A_y32:A_y10, A_vp10, A_x32:A_x10
.rega A_ss, A_cc, A_mm, A_ww, A_w, A_rnd
.regb B_x2, B_x3, B_rr, B_nx2, B_nx3, B_nx32, B_y32, B_v32, B_vc32
.regb B_vs3:B_vs2, B_vs32, B_u32, B_vu2, B_vu3, B_vu32, B_u2, B_u3
.regb B_y2, B_y3, B_ny32:B_ny10, B_vp32
.regb B_ss, B_cc, B_mm, B_ww, B_w, B_rnd, B_nn, B_X
.reg B_i, C10, C32, Cl10, Cl32
ADD 4, A_X, B_X
MVKL 0xB67BB67B, B_mm ; Q15(1.4256)
MVKH 0xB67BB67B, B_mm ; Q15(1.4256)
MV B_mm, A_mm
MVKL 0x60006000, A_cc
MVKH 0x60006000, A_cc
MV A_cc, B_cc
MVKL 0xFFF1FFF1, B_rr
MVKH 0xFFF1FFF1, B_rr
MV B_rr, A_rr
MVKL 0x80008000, A_ww
MVKH 0x80008000, A_ww
MV A_ww, B_ww
SHL A_ww, 15, A_w ; 0x40000000
SHL B_ww, 15, B_w ; 0x40000000
ROTL A_w, 17, A_rnd ; 0x00008000
ROTL B_w, 17, B_rnd ; 0x00008000
MVK 15, A_ss
MVK 15, B_ss
SHR B_n, 2, B_i
SUB B_i, 2, B_i
LOOP_vrecip: .trip 16
LDH *A_X++, A_x0
LDH *A_X++[3], A_x1
LDH *B_X++, B_x2
LDH *B_X++[3], B_x3
NORM A_x0, A_nx0
NORM A_x1, A_nx1
NORM B_x2, B_nx2
NORM B_x3, B_nx3
PACK2 A_nx1, A_nx0, A_nx10
PACK2 B_nx3, B_nx2, B_nx32
ADD2 A_rr, A_nx10, A_ny10
ADD2 B_rr, B_nx32, B_ny32
SSHVL A_x0, A_nx0, A_x0
SSHVL A_x1, A_nx1, A_x1
SSHVL B_x2, B_nx2, B_x2
SSHVL B_x3, B_nx3, B_x3
PACKH2 A_x1, A_x0, A_v10
PACKH2 B_x3, B_x2, B_v32
; Initial value, U0=(V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits
ABS2 A_v10, A_vp10
ABS2 B_v32, B_vp32
SUB2 A_vp10, A_cc, A_vc10
SUB2 B_vp32, B_cc, B_vc32
SUB2 A_mm, A_vp10, A_u10
SUB2 B_mm, B_vp32, B_u32
SMPY2 A_vc10, A_vc10, A_vs1:A_vs0
SMPY2 B_vc32, B_vc32, B_vs3:B_vs2
PACKH2 A_vs1, A_vs0, A_vs10
PACKH2 B_vs3, B_vs2, B_vs32
ADD2 A_vs10, A_u10, A_u10
ADD2 B_vs32, B_u32, B_u32
SHR2 A_v10, A_ss, C10
SHR2 B_v32, B_ss, C32
XOR C10, A_u10, A_u10
XOR C32, B_u32, B_u32
; Iteration 1, U1=U0*(1-V*U0), ~1/(4*V), 11.481 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
PACKH2 A_vu1, A_vu0, A_vu10
PACKH2 B_vu3, B_vu2, B_vu32
SUB2 A_ww, A_vu10, A_vu10
SUB2 B_ww, B_vu32, B_vu32
SMPY2 A_u10, A_vu10, A_u1:A_u0
SMPY2 B_u32, B_vu32, B_u3:B_u2
PACKH2 A_u1, A_u0, A_u10
PACKH2 B_u3, B_u2, B_u32
; Iteration 2, U2=8*U1*(1/2-V*U1), ~1/(2*V), 22.585 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
SUB A_w, A_vu0, A_vu0
SUB A_w, A_vu1, A_vu1
SUB B_w, B_vu2, B_vu2
SUB B_w, B_vu3, B_vu3
MPYHIR A_u0, A_vu0, A_u0
MPYHIR A_u1, A_vu1, A_u1
MPYHIR B_u2, B_vu2, B_u2
MPYHIR B_u3, B_vu3, B_u3
SSHL A_u0, 3, A_u0
SSHL A_u1, 3, A_u1
SSHL B_u2, 3, B_u2
SSHL B_u3, 3, B_u3
; Fl16 = Q15(mant)|int16(expt)
SADD A_u0, A_rnd, A_y0
SADD A_u1, A_rnd, A_y1
SADD B_u2, B_rnd, B_y2
SADD B_u3, B_rnd, B_y3
PACKH2 A_y1, A_y0, A_y10
PACKH2 B_y3, B_y2, B_y32
MV B_y32, A_y32
MV A_ny10, B_ny10
STDW A_y32:A_y10, *B_YM++
STDW B_ny32:B_ny10, *A_YE++
BDEC LOOP_vrecip, B_i
.endproc
Computing Square Root of A Vector of Fixed-Point Numbers
* =========================================================================== *
* *
* Compute square root of a Q.15 vector by 4th order polynomial fitting *
* y[i] = sqrt(x[i]), 0 <= x[i] < 1; *
* *
* C prototype: *
* void DSP_vsqrt_q15(short* x, short* y, int N); *
* *
* Performance: *
* O(4*N) or 4 cycles per number (software pipelining enabled by -O2) *
* *
* Error: *
* (-2^-15, 2^-15) *
* *
* =========================================================================== *
; chebfun, min max error
; 0xFFF1024A = Q31(-0.0005)
; 0x0046E0AB = Q31(0.0022)
; 0xFE764EE1 = Q31(-0.0120)
; 0x1277E288 = Q31(0.1443)
; 0x6ED9EBA1 = Q31(0.8660)
; polyfit, min sqr error
; 0xFFF1203D = Q31(-0.0005)
; 0x004581E7 = Q31(0.0021)
; 0xFE7645AF = Q31(-0.0120)
; 0x1278CF97 = Q31(0.1443)
; 0x6ED9E687 = Q31(0.8660)
SQRT_C4 .set 0xFFF1203D
SQRT_C3 .set 0x004581E7
SQRT_C2 .set 0xFE7645AF
SQRT_C1 .set 0x1278CF97
SQRT_C0 .set 0x6ED9E687
SQRT_S .set 0x5A82799A ; Q31(0.7071)
.sect ".text: _DSP_vsqrt_q15"
.global _DSP_vsqrt_q15
_DSP_vsqrt_q15: .cproc A_X, B_Y, A_n
.no_mdep
.rega A_C4, A_C3, A_C2, A_C1, A_C0, A_xx0, A_rnd, A_y0
.rega A_y0c4, A_y0c3, A_y0c2, A_y0c1, A_x0c3, A_x0c2, A_x0c1, A_x0c0
.rega A_S, A_e0, A_x0x, A_xm, A_y0l, A_y0h, A_y0s
.regb B_C4, B_C3, B_C2, B_C1, B_C0, B_xx1, B_rnd, B_y1, B_y10
.regb B_y1c4, B_y1c3, B_y1c2, B_y1c1, B_x1c3, B_x1c2, B_x1c1, B_x1c0
.regb B_S, B_e1, B_x1x, B_xm, B_y1l, B_y1h, B_y1s, B_X
.reg B_i, C0, C1
ADD 2, A_X, B_X
MVK 0x1, A_rnd
SHL A_rnd, 15, A_rnd
MV A_rnd, B_rnd
MVKL SQRT_C4, A_C4
MVKH SQRT_C4, A_C4
MVKL SQRT_C3, B_C3
MVKH SQRT_C3, B_C3
MV A_C4, B_C4
MV B_C3, A_C3
MVKL SQRT_C2, A_C2
MVKH SQRT_C2, A_C2
MVKL SQRT_C1, B_C1
MVKH SQRT_C1, B_C1
MV A_C2, B_C2
MV B_C1, A_C1
MVKL SQRT_C0, A_C0
MVKH SQRT_C0, A_C0
MVKL SQRT_S, B_S
MVKH SQRT_S, B_S
MV B_S, A_S
MV A_C0, B_C0
MVKL 0x6000, A_xm
SHL A_xm, 16, A_xm
MV A_xm, B_xm
SHR A_n, 1, B_i
SUB B_i, 2, B_i
LOOP_vsqrt: .trip 8
LDH.D1T1 *A_X++[2], A_xx0
LDH.D2T2 *B_X++[2], B_xx1
NORM.L1 A_xx0, A_e0
NORM.L2 B_xx1, B_e1
SSHVL.M1 A_xx0, A_e0, A_x0x
SSHVL.M2 B_xx1, B_e1, B_x1x
SUB.D1 A_e0, 16, A_e0
SUB.D2 B_e1, 16, B_e1
AND.D1 0x1, A_e0, C0
AND.D2 0x1, B_e1, C1
SHR.S1 A_e0, 1, A_e0
SHR.S2 B_e1, 1, B_e1
SUB.D1 A_x0x, A_xm, A_x0x
SUB.D2 B_x1x, B_xm, B_x1x
SHL.S1 A_x0x, 2, A_x0x
SHL.S2 B_x1x, 2, B_x1x
MPYHIR.M1 A_x0x, A_C4, A_y0c4
MPYHIR.M2 B_x1x, B_C4, B_y1c4
SADD.L1 A_y0c4, A_C3, A_x0c3
SADD.L2 B_y1c4, B_C3, B_x1c3
MPYHIR.M1 A_x0x, A_x0c3, A_y0c3
MPYHIR.M2 B_x1x, B_x1c3, B_y1c3
SADD.L1 A_y0c3, A_C2, A_x0c2
SADD.L2 B_y1c3, B_C2, B_x1c2
MPYHIR.M1 A_x0x, A_x0c2, A_y0c2
MPYHIR.M2 B_x1x, B_x1c2, B_y1c2
SADD.L1 A_y0c2, A_C1, A_x0c1
SADD.L2 B_y1c2, B_C1, B_x1c1
MPYHIR.M1 A_x0x, A_x0c1, A_y0c1
MPYHIR.M2 B_x1x, B_x1c1, B_y1c1
SADD.L1 A_y0c1, A_C0, A_x0c0
SADD.L2 B_y1c1, B_C0, B_x1c0
; A_S = B_S = 0x5A82799A ~= 0x5A820000 + 0x00008000
[C0] MPYHIR.M1 A_S, A_x0c0, A_y0h
[C1] MPYHIR.M2 B_S, B_x1c0, B_y1h
[C0] SHR A_x0c0, 16, A_y0l
[C1] SHR B_x1c0, 16, B_y1l
[C0] SADD.L1 A_y0h, A_y0l, A_x0c0
[C1] SADD.L2 B_y1h, B_y1l, B_x1c0
SHR.S1 A_x0c0, A_e0, A_y0s
SHR.S2 B_x1c0, B_e1, B_y1s
SADD.L1 A_y0s, A_rnd, A_y0
SADD.L2 B_y1s, B_rnd, B_y1
PACKH2.S2X B_y1, A_y0, B_y10
STW.D2T2 B_y10, *B_Y++
BDEC LOOP_vsqrt, B_i
.endproc