Computing Reciprocals of Fixed-Point Numbers by the Newton-Raphson Method — Division by Multiplication

August 11, 20115 comments Coded in ASM for the TI C64x
``````* =========================================================================== *
*                                                                             *
*  Compute reciprocals of a Q.15 vector by Newton-Raphson method              *
*      y[i] = ym[i] * 2^ye[i] = 1 / x[i], -1 <= x[i] < 1 and x[i] != 0        *
*      where ym is the mantissa vector and ye is the exponent vector          *
*                                                                             *
*  C prototype:                                                               *
*       void DSP_vrecip16n(short* x,    // Input, Q.15 vector                 *
*                          short* ym,   // Output, Q.15 vector                *
*                          short* ye,   // Output, int16 vector               *
*                          int    N);   // Input, vector length               *
*                                                                             *
*  Restriction:                                                               *
*       (N % 4) == 0 and N >= 24                                              *
*                                                                             *
*  Performance:                                                               *
*       55+2.5*N (software pipelining enabled by -O2, CCS5.1)                 *
*                                                                             *
*  Relative error:                                                            *
*       (-2^-16, 2^-16)                                                       *
*                                                                             *
*  Algorithm:                                                                 *
*       Input: V, abs(x[i]) normalized to [.5, 1)                             *
*       Initialization: U0 = (V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits      *
*       Iteration 1:    U1 = U0*(1-V*U0),          ~1/(4*V), 11.481 bits      *
*       Iteration 2:    U2 = 8*U1*(.5-V*U1),       ~1/(2*V), 22.585 bits      *
*       Output in the format of Fl16 = Q.15(mant)|Int16(expt)                 *
*                                                                             *
* =========================================================================== *

.sect ".text: _DSP_vrecip16n"
.global _DSP_vrecip16n

_DSP_vrecip16n: .cproc  A_X, B_YM, A_YE, B_n

.no_mdep
.rega   A_x0, A_x1, A_rr, A_nx0, A_nx1, A_nx10, A_ny10, A_v10, A_vc10
.rega   A_vs1:A_vs0, A_vs10, A_u10, A_vu0, A_vu1, A_vu10, A_u0, A_u1
.rega   A_y0, A_y1, A_y32:A_y10, A_vp10, A_x32:A_x10
.rega   A_ss, A_cc, A_mm, A_ww, A_w, A_rnd
.regb   B_x2, B_x3, B_rr, B_nx2, B_nx3, B_nx32, B_y32, B_v32, B_vc32
.regb   B_vs3:B_vs2, B_vs32, B_u32, B_vu2, B_vu3, B_vu32, B_u2, B_u3
.regb   B_y2, B_y3, B_ny32:B_ny10, B_vp32
.regb   B_ss, B_cc, B_mm, B_ww, B_w, B_rnd, B_nn, B_X
.reg    B_i, C10, C32, Cl10, Cl32

MVKL        0xB67BB67B,     B_mm                    ; Q15(1.4256)
MVKH        0xB67BB67B,     B_mm                    ; Q15(1.4256)
MV          B_mm,           A_mm
MVKL        0x60006000,     A_cc
MVKH        0x60006000,     A_cc
MV          A_cc,           B_cc
MVKL        0xFFF1FFF1,     B_rr
MVKH        0xFFF1FFF1,     B_rr
MV          B_rr,           A_rr
MVKL        0x80008000,     A_ww
MVKH        0x80008000,     A_ww
MV          A_ww,           B_ww
SHL         A_ww,           15,             A_w     ; 0x40000000
SHL         B_ww,           15,             B_w     ; 0x40000000
ROTL        A_w,            17,             A_rnd   ; 0x00008000
ROTL        B_w,            17,             B_rnd   ; 0x00008000
MVK         15,             A_ss
MVK         15,             B_ss

SHR         B_n,            2,              B_i
SUB         B_i,            2,              B_i

LOOP_vrecip: .trip 16
LDH         *A_X++,         A_x0
LDH         *A_X++[3],      A_x1
LDH         *B_X++,         B_x2
LDH         *B_X++[3],      B_x3

NORM        A_x0,           A_nx0
NORM        A_x1,           A_nx1
NORM        B_x2,           B_nx2
NORM        B_x3,           B_nx3
PACK2       A_nx1,          A_nx0,          A_nx10
PACK2       B_nx3,          B_nx2,          B_nx32

SSHVL       A_x0,           A_nx0,          A_x0
SSHVL       A_x1,           A_nx1,          A_x1
SSHVL       B_x2,           B_nx2,          B_x2
SSHVL       B_x3,           B_nx3,          B_x3
PACKH2      A_x1,           A_x0,           A_v10
PACKH2      B_x3,           B_x2,           B_v32

; Initial value, U0=(V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits
ABS2        A_v10,          A_vp10
ABS2        B_v32,          B_vp32
SUB2        A_vp10,         A_cc,           A_vc10
SUB2        B_vp32,         B_cc,           B_vc32
SUB2        A_mm,           A_vp10,         A_u10
SUB2        B_mm,           B_vp32,         B_u32
SMPY2       A_vc10,         A_vc10,         A_vs1:A_vs0
SMPY2       B_vc32,         B_vc32,         B_vs3:B_vs2
PACKH2      A_vs1,          A_vs0,          A_vs10
PACKH2      B_vs3,          B_vs2,          B_vs32
SHR2        A_v10,          A_ss,           C10
SHR2        B_v32,          B_ss,           C32
XOR         C10,            A_u10,          A_u10
XOR         C32,            B_u32,          B_u32

; Iteration 1, U1=U0*(1-V*U0), ~1/(4*V), 11.481 bits
SMPY2       A_v10,          A_u10,          A_vu1:A_vu0
SMPY2       B_v32,          B_u32,          B_vu3:B_vu2
PACKH2      A_vu1,          A_vu0,          A_vu10
PACKH2      B_vu3,          B_vu2,          B_vu32
SUB2        A_ww,           A_vu10,         A_vu10
SUB2        B_ww,           B_vu32,         B_vu32
SMPY2       A_u10,          A_vu10,         A_u1:A_u0
SMPY2       B_u32,          B_vu32,         B_u3:B_u2
PACKH2      A_u1,           A_u0,           A_u10
PACKH2      B_u3,           B_u2,           B_u32

; Iteration 2, U2=8*U1*(1/2-V*U1), ~1/(2*V), 22.585 bits
SMPY2       A_v10,          A_u10,          A_vu1:A_vu0
SMPY2       B_v32,          B_u32,          B_vu3:B_vu2
SUB         A_w,            A_vu0,          A_vu0
SUB         A_w,            A_vu1,          A_vu1
SUB         B_w,            B_vu2,          B_vu2
SUB         B_w,            B_vu3,          B_vu3
MPYHIR      A_u0,           A_vu0,          A_u0
MPYHIR      A_u1,           A_vu1,          A_u1
MPYHIR      B_u2,           B_vu2,          B_u2
MPYHIR      B_u3,           B_vu3,          B_u3
SSHL        A_u0,           3,              A_u0
SSHL        A_u1,           3,              A_u1
SSHL        B_u2,           3,              B_u2
SSHL        B_u3,           3,              B_u3

; Fl16 = Q15(mant)|int16(expt)
PACKH2      A_y1,           A_y0,           A_y10
PACKH2      B_y3,           B_y2,           B_y32

MV          B_y32,          A_y32
MV          A_ny10,         B_ny10

STDW        A_y32:A_y10,    *B_YM++
STDW        B_ny32:B_ny10,  *A_YE++

BDEC        LOOP_vrecip,    B_i

.endproc``````

Computing Square Root of A Vector of Fixed-Point Numbers

August 9, 2011 Coded in ASM for the TI C64x
``````* =========================================================================== *
*                                                                             *
*  Compute square root of a Q.15 vector by 4th order polynomial fitting       *
*      y[i] = sqrt(x[i]), 0 <= x[i] < 1;                                      *
*                                                                             *
*  C prototype:                                                               *
*      void DSP_vsqrt_q15(short* x, short* y, int N);                         *
*                                                                             *
*  Performance:                                                               *
*      O(4*N) or 4 cycles per number (software pipelining enabled by -O2)     *
*                                                                             *
*  Error:                                                                     *
*      (-2^-15, 2^-15)                                                        *
*                                                                             *
* =========================================================================== *

; chebfun, min max error
; 0xFFF1024A = Q31(-0.0005)
; 0x0046E0AB = Q31(0.0022)
; 0xFE764EE1 = Q31(-0.0120)
; 0x1277E288 = Q31(0.1443)
; 0x6ED9EBA1 = Q31(0.8660)

; polyfit, min sqr error
; 0xFFF1203D = Q31(-0.0005)
; 0x004581E7 = Q31(0.0021)
; 0xFE7645AF = Q31(-0.0120)
; 0x1278CF97 = Q31(0.1443)
; 0x6ED9E687 = Q31(0.8660)

SQRT_C4 .set 0xFFF1203D
SQRT_C3 .set 0x004581E7
SQRT_C2 .set 0xFE7645AF
SQRT_C1 .set 0x1278CF97
SQRT_C0 .set 0x6ED9E687
SQRT_S  .set 0x5A82799A ;  Q31(0.7071)

.sect ".text: _DSP_vsqrt_q15"
.global _DSP_vsqrt_q15

_DSP_vsqrt_q15: .cproc  A_X, B_Y, A_n

.no_mdep
.rega A_C4, A_C3, A_C2, A_C1, A_C0, A_xx0, A_rnd, A_y0
.rega A_y0c4, A_y0c3, A_y0c2, A_y0c1, A_x0c3, A_x0c2, A_x0c1, A_x0c0
.rega A_S, A_e0, A_x0x, A_xm, A_y0l, A_y0h, A_y0s
.regb B_C4, B_C3, B_C2, B_C1, B_C0, B_xx1, B_rnd, B_y1, B_y10
.regb B_y1c4, B_y1c3, B_y1c2, B_y1c1, B_x1c3, B_x1c2, B_x1c1, B_x1c0
.regb B_S, B_e1, B_x1x, B_xm, B_y1l, B_y1h, B_y1s, B_X
.reg  B_i, C0, C1

MVK         0x1,            A_rnd
SHL         A_rnd,          15,             A_rnd
MV          A_rnd,          B_rnd

MVKL        SQRT_C4,        A_C4
MVKH        SQRT_C4,        A_C4
MVKL        SQRT_C3,        B_C3
MVKH        SQRT_C3,        B_C3

MV          A_C4,           B_C4
MV          B_C3,           A_C3
MVKL        SQRT_C2,        A_C2
MVKH        SQRT_C2,        A_C2
MVKL        SQRT_C1,        B_C1
MVKH        SQRT_C1,        B_C1

MV          A_C2,           B_C2
MV          B_C1,           A_C1
MVKL        SQRT_C0,        A_C0
MVKH        SQRT_C0,        A_C0
MVKL        SQRT_S,         B_S
MVKH        SQRT_S,         B_S

MV          B_S,            A_S
MV          A_C0,           B_C0

MVKL        0x6000,         A_xm
SHL         A_xm,           16,             A_xm
MV          A_xm,           B_xm

SHR         A_n,            1,              B_i
SUB         B_i,            2,              B_i

LOOP_vsqrt: .trip 8
LDH.D1T1    *A_X++[2],      A_xx0
LDH.D2T2    *B_X++[2],      B_xx1

NORM.L1     A_xx0,          A_e0
NORM.L2     B_xx1,          B_e1
SSHVL.M1    A_xx0,          A_e0,           A_x0x
SSHVL.M2    B_xx1,          B_e1,           B_x1x

SUB.D1      A_e0,           16,             A_e0
SUB.D2      B_e1,           16,             B_e1
AND.D1      0x1,            A_e0,           C0
AND.D2      0x1,            B_e1,           C1
SHR.S1      A_e0,           1,              A_e0
SHR.S2      B_e1,           1,              B_e1

SUB.D1      A_x0x,          A_xm,           A_x0x
SUB.D2      B_x1x,          B_xm,           B_x1x
SHL.S1      A_x0x,          2,              A_x0x
SHL.S2      B_x1x,          2,              B_x1x

MPYHIR.M1   A_x0x,          A_C4,           A_y0c4
MPYHIR.M2   B_x1x,          B_C4,           B_y1c4

MPYHIR.M1   A_x0x,          A_x0c3,         A_y0c3
MPYHIR.M2   B_x1x,          B_x1c3,         B_y1c3

MPYHIR.M1   A_x0x,          A_x0c2,         A_y0c2
MPYHIR.M2   B_x1x,          B_x1c2,         B_y1c2

MPYHIR.M1   A_x0x,          A_x0c1,         A_y0c1
MPYHIR.M2   B_x1x,          B_x1c1,         B_y1c1

; A_S = B_S = 0x5A82799A ~= 0x5A820000 + 0x00008000
[C0]   MPYHIR.M1   A_S,            A_x0c0,         A_y0h
[C1]   MPYHIR.M2   B_S,            B_x1c0,         B_y1h
[C0]   SHR         A_x0c0,         16,             A_y0l
[C1]   SHR         B_x1c0,         16,             B_y1l

SHR.S1      A_x0c0,         A_e0,           A_y0s
SHR.S2      B_x1c0,         B_e1,           B_y1s

PACKH2.S2X  B_y1,           A_y0,           B_y10

STW.D2T2    B_y10,          *B_Y++

BDEC        LOOP_vsqrt,     B_i

.endproc``````

vectors_intr.asm - External Interruption Configuration

February 2, 2011 Coded in ASM for the TI C67x
``````*Vectors_intr.asm Vector file for interrupt INT11
.global _vectors     ;global symbols
.global _c_int00
.global _vector1
.global _vector2
.global _vector3
.global _c_int04     ; símbolo para EXT_INT4
.global _vector5
.global _vector6
.global _vector7
.global _vector8
.global _vector9
.global _vector10
.global _c_int11        ;for INT11
.global _vector12
.global _vector13
.global _vector14
.global _vector15

VEC_ENTRY .macro addr      ;macro for ISR
STW   B0,*--B15
B     B0
LDW   *B15++,B0
NOP   2
NOP
NOP
.endm

_vec_nmi:
B    NRP
NOP  5

_vec_dummy:
B    IRP
NOP  5

.sect ".vecs"        ;aligned IST section
.align 1024
_vectors:
_vector0:   VEC_ENTRY _c_int00     ;RESET
_vector1:   VEC_ENTRY _vec_nmi   ;NMI
_vector2:   VEC_ENTRY _vec_dummy    ;RSVD
_vector3:   VEC_ENTRY _vec_dummy
_vector4:   VEC_ENTRY _c_int04    ;INT04 Externa
_vector5:   VEC_ENTRY _vec_dummy
_vector6:   VEC_ENTRY _vec_dummy
_vector7:   VEC_ENTRY _vec_dummy
_vector8:   VEC_ENTRY _vec_dummy
_vector9:   VEC_ENTRY _vec_dummy
_vector10:  VEC_ENTRY _vec_dummy
_vector12:  VEC_ENTRY _vec_dummy
_vector13:  VEC_ENTRY _vec_dummy
_vector14:  VEC_ENTRY _vec_dummy
_vector15:  VEC_ENTRY _vec_dummy``````

Fixed-point IIR Filter Code Generator for ARM

November 30, 2010 Coded in ASM for the ARM
``````/*
* fixedp lowpass_iir(fixedp *w, fixedp x);
*
* Fixed point IIR filtering routine for ARM. Computes output y for
* input x. The output will have the same fracbits as the input.
*  w: caller-allocated array for state storage. Should be length 2*L.
*  x: sample to filter
*
* Required data:
*   LENGTH: number of sections
*   .sos: sos matrix
*   SOS_FRACBITS: sos fracbits
*   .gain: scale values array
*   G_FRACBITS: scale values fracbits
*
* Register usage:
*   r0: address of internal state array (w)
*   r1: x
*   r2: address of SOS array
*   r3: address of gain array
*   r4: w0
*   r5: w1
*   r6: y
*   r7: long multiply lo word
*   r8: long multiply hi word
*   r9:  B1
*   r10: B2
*   r11: A1
*   r12: A2
*   r14: loop counter
*/

.set LENGTH,  3
.set SOS_FRACBITS,  30
.set G_FRACBITS,  31

.section .rodata
.align 4

.sos:
.word 0x49be7eaf, 0x40000000, 0xc4c9d93a, 0x251f228c
.word 0x1d81c8a5, 0x40000000, 0xdaa0b600, 0x37cef3c1
.word 0x40000000, 0x00000000, 0xd87b730c, 0x00000000

.gain:

.word 0x06b1dbb5, 0x42fe27a0, 0x613d5d5a

.text
.arm

.global	lowpass_iir
.func   lowpass_iir
lowpass_iir:
push {r4-r11, lr}

mov  r14, #0								/* i = 0 */

cmp  r14, #LENGTH
bge  .endloop

.loop:
/* load all the SOS coefficients we need into r8-r12 and increment the SOS pointer */
ldmia r2!, {r9-r12}							/* B1, B2, A1, A2 */

/* x = gain[i]*x */
ldr    r6, [r3], #4							/* load current element of gain array into r6 and increment by 4 */
smull  r7, r8, r1, r6						/* 64-bit multiply: r5:r4 = x*gain[i]; */
mov    r1, r7, lsr#G_FRACBITS				/* shift lo word to the right by G_FRACBITS */
orr    r1, r1, r8, lsl#(32 - G_FRACBITS)		/* shift hi word to the right by G_FRACBITS  and OR with lo word*/

/* load w0 and w1 into r4, r5, but do NOT increment */
ldm    r0, {r4-r5}

/* y(r6) = x(r1) + w[W0](r4)*/

/* w0(r4) = .sos[B1](r9)*x(r1) - .sos[A1](r11)*y(r6) + w[W1](r5); */
rsb    r11, r11, #0							/* .sos[A1] = -sos[A1] */
smull  r7, r8, r9, r1						/* r8:r7 = sos[B1]*x */
smlal  r7, r8, r11, r6						/* r8:r7 += -sos[A1]*y */
mov    r4, r7, lsr#SOS_FRACBITS				/* shift lo word to the right by SOS_FRACBITS */
orr    r4, r4, r8, lsl#(32 - SOS_FRACBITS)	/* shift hi word to the right by SOS_FRACBITS and OR with lo word*/

/* w2 = sos[B2]*x(r1) - .sos[A2](r12)*y(r6); */
rsb    r12, r12, #0							/* .sos[A2] = -sos[A2] */
smull  r7, r8, r10, r1                      /* r8:r7 = sos[B2](r10)*x(r1) */
smlal  r7, r8, r12, r6						/* r8:r7 += -sos[A2](r12)*y(r6) */
mov    r5, r7, lsr#SOS_FRACBITS				/* shift lo word to the right by SOS_FRACBITS */
orr    r5, r5, r8, lsl#(32 - SOS_FRACBITS)	/* shift hi word to the right by SOS_FRACBITS and OR with lo word*/

/* need to store w0, w1 back to memory and increment */
stmia r0!, {r4-r5}

/* x = y */
mov    r1, r6

/* increment pointer and branch to top of loop */
cmp r14, #LENGTH
blt   .loop

.endloop:
/* set return val, restore stack, and return */
mov r0, r6
pop {r4-r11, lr}
bx  lr

.endfunc
.end``````

Fixed-point FIR Filter Code Generator for ARM Assembly

November 30, 2010 Coded in ASM for the ARM
``````/* **********************************************************************
*
* Fixed Point Filtering Library
*
* **********************************************************************
*
* lowpass_fir.S
*
* Jordan Rhee
* rhee.jordan@gmail.com
*
* IEEE UCSD
* http://ieee.ucsd.edu
*
* Generated with IEEE UCSD Fixed Pointer Filter Code Generator
* http://ieee.ucsd.edu/projects/qfilt.php
*
* **********************************************************************/

/*
* fixedp lowpass_fir(fixedp *w, fixedp x);
*
* Fixed point FIR filtering routine for ARM. Computes output y for
* input x. The output will have the same fracbits as the input.
*  w: caller-allocated array for state storage. Should be length LENGTH+1.
*  x: sample to filter
*
* Required data:
*   LENGTH: number of coefficients
*   .h: coefficient array
*   H_FRACBITS: fracbits of coefficients
*
*   r0: address of internal state array. w[LENGTH] contains
*       index of head of circular buffer.
*   r1: x
*   r2: address of coefficient array (h)
*   r3: j: index of current state value
*   r4: i: index of current coefficient
*   r5: h[i]: current filter coefficient
*   r6: w[j]: current state value
*   r7: long multiply lo word
*   r8: long multiply hi word
*/

.set LENGTH,  20
.set H_FRACBITS,  30

.section .rodata
.align 4

.h:

.word 0xffc5ef57, 0xfeb3416c, 0xfdf673b8, 0xffc7fb45
.word 0x02b1826b, 0x0123c987, 0xfb542f40, 0xfc248828
.word 0x0ab1bf40, 0x1b3f7457, 0x1b3f7457, 0x0ab1bf40
.word 0xfc248828, 0xfb542f40, 0x0123c987, 0x02b1826b
.word 0xffc7fb45, 0xfdf673b8, 0xfeb3416c, 0xffc5ef57

.text
.arm

.global	lowpass_fir
.func   lowpass_fir
lowpass_fir:
push {r4-r8}

/* w(r0)[j(w[N])] = x */
ldr  r3, [r0, #(4*LENGTH)]		/* load value of j */
str  r1, [r0, r3, lsl #2]			/* store x into w[j] */

/* y = 0; */
mov r7, #0
mov r8, #0

ldr r2, =.h

/* i = 0 */
mov r4, #0
cmp r4, #LENGTH
bge .endloop

.loop:
/* y += h[i] * w[j] */
ldr    r5, [r2, r4, lsl #2]					/* r5 = h[i] */
ldr    r6, [r0, r3, lsl #2]					/* r6 = w[j] */
smlal  r7, r8, r5, r6						/* r8:r7 += h[i] * w[j] */

subs r3, r3, #1								/* j-- */
movmi r3, #(LENGTH - 1)						/* if j == -1, then j = N-1 */

add   r4, r4, #1							/* i++ */
cmp r4, #LENGTH								/* is i less than N */
blt .loop

.endloop:
add r3, r3, #1								/* increment j and store back to memory */
cmp r3, #LENGTH
moveq r3, #0
str r3, [r0, #(4*LENGTH)]					/* save new value of j */

mov    r0, r7, lsr #H_FRACBITS				/* shift lo word to the right by H_FRACBITS */
orr    r0, r0, r8, lsl #(32 - H_FRACBITS)	/* shift hi word to the right by H_FRACBITS and OR with lo word*/

pop {r4-r8}
bx  lr

.endfunc
.end``````