Code Snippets Submitted by Lippold Haken
Random Number Generator
// Random number generator that sounds pleasant in audio algorithms.
// The argument and return value is a 30-bit (positive nonzero) integer.
// This is Duttaro's 30-bit pseudorandom number generator, p.124 J.AES, vol 50 no 3 March 2002;
// I found this 30-bit pseudorandom number generator sounds far superior to the 31-bit
// pseudorandom number generator presented in the same article.
// To make result a signed fixed-point value, do not shift left into the sign bit;
// instead, subtract 0x20000000 then multiply by a scale factor.
#define NextRand(rr) (((((rr >> 16) ^ (rr >> 15) ^ (rr >> 1) ^ rr) & 1) << 29) | (rr >> 1))
Fast SIMD sine and cosine
// Fast sine and cosine for SHARC ADSP-21364, callable from VisualDSP++ C.
// Simultaneously compute sine and cosine of a given angle.
// Polynomial approximation developed by Flemming Pedersen of CERN.
// This implementation by Lippold Haken of Haken Audio, March 2010, October 2012.
//
// C prototype for this function:
// typedef struct { float sin, cos; } SinCos;
// void sinCos( float fRadians, SinCos *result );
//
// 23 cycles to execute this function:
// 18 cycles for the computations
// 5 cycles for compiler environment overhead.
#include <def21364.h>
// Data table for sinCos function.
.section/dm/DOUBLE32 seg_dmda;
sinCosData:
.global sinCosData;
.type sinCosData,STT_OBJECT;
.var =
// Integer 3 for "mod 4" operation.
0x00000003, 0x00000003,
// 1.0 for neutralizing final cos multiply.
0x3F800000, 0x3F800000,
// Coefficients for sin and cos polynomials.
0xBB96BD89, // SQ7 for sin polynomial -4.6002309092153379e-003
0xBCA75707, // CQ6 for cos polynomial -2.0427240364907607e-002
0x3DA32F1D, // SQ5 for sin polynomial 7.9679708649230657e-002
0x3E81D8BD, // CQ4 for cos polynomial 2.5360671639164339e-001
0xBF255DDD, // SQ3 for sin polynomial -6.4596348437163809e-001
0xBF9DE9D1, // CQ2 for cos polynomial -1.2336979844380824e+000
0x3FC90FDB, // SQ1 for sin polynomial 1.5707963267948966e+000
0x3F800000, // CQ0 for cos polynomial 1.0000000000000000e+000
// Sign adjustment table, indexed by integer quadrant.
0x3F800000, 0x3F800000, // 1.0, 1.0
0x3F800000, 0xBF800000, // 1.0,-1.0
0xBF800000, 0xBF800000, // -1.0,-1.0
0xBF800000, 0x3F800000; // -1.0, 1.0
sinCosData.end:
.section/pm/DOUBLE32 seg_pmco;
_sinCos:
// Registers in VisualDSP environment:
// M5,M13 = 0
// M6,M14 = 1
// M7,M15 = -1
// F4 = first function argument (input angle)
// R8 = pointer to address for storing sin,cos result
// R0,R1,R2,R4,R8,R12,I4,I12,I13,M4 need not be preserved
// I6,I7 = stack and frame pointers
// C calling routine has: "CJUMP (DB); DM(I7,M7)=R2; DM(I7,M7)=PC;"
// CJUMP does these operations: "R2=I6, I6=I7"
// Before exiting must do RFRAME: "I7=I6, I6=DM(0,I6)"
// Preamble.
SF4 = F4;
BIT SET MODE1 SIMD;
// Pointer to constants.
I4 = sinCosData;
// Compute integer quadrant, fractional quadrant, fractional quadrant squared.
// Quadrant "q" = fRadians * 2./Pi
// Quadrant values (0. .. 4.) correspond to (0. .. 2Pi)
R1 = 0x3F22F983; // F1 = 2./Pi
F0 = F1 * F4, I12 = DM(M7,I6); // I12 is execution return address
// Integer quadrant = round(q) mod 4 (0..3)
// Since MODE1 TRUNC bit is zero, FIX implements round-to-nearest.
R1 = FIX F0, R2 = DM(I4,2);
R2 = R1 AND R2, F12 = DM(I4,2);
// Fractional quadrant "Xq" = q - round(q) (-.5 .. .5)
// Fractional quadrant values (-.5 .. .5) correspond to (-Pi/4..Pi/4)
F1 = FLOAT R1, F4 = DM(I4,2);
F1 = F0 - F1, M4 = R2;
// Set SZ if integer quadrant is even.
R2 = LSHIFT R2 BY 31;
// Fractional quadrant squared "Xq2" = Xq * Xq
// Set SF1 = 1.0 to avoid final Xq multiply for cosine polynomial.
F2 = F1 * F1, F12 <-> SF1;
// Registers involved in sin and cos computation:
// F1 = fractional quadrant Xq
// SF1 = 1.0
// F2,SF2 = fractional quadrant squared Xq2
// F4 = SQ7
// SF4 = CQ6
// M4 = integer quadrant
// SZ = indicates integer quadrant is even
// DM(I4) = pairs of cosine and sine coefficients
// Compute sine polynomial (PEx) and cosine polynomial (PEy).
// Xq2*SQ7 Xq2*CQ6
F0 = F2 * F4, F4 = DM(I4,2);
// SQ5+Xq2*SQ7 CQ4+Xq2*CQ6
F0 = F0 + F4;
// Xq2(SQ5+Xq2*SQ7) Xq2(CQ4+Xq2*CQ6)
F0 = F0 * F2, F4 = DM(I4,2);
// SQ3+Xq2(SQ5+Xq2*SQ7) CQ2+Xq2(CQ4+Xq2*CQ6)
F0 = F0 + F4, F4 = DM(I4,2);
// Xq2(SQ3+Xq2(SQ5+Xq2*SQ7)) Xq2(CQ2+Xq2(CQ4+Xq2*CQ6))
F0 = F0 * F2, MODIFY(I4,M4);
// SQ1+Xq2(SQ3+Xq2(SQ5+Xq2*SQ7))) CQ0+Xq2(CQ2+Xq2(CQ4+Xq2*CQ6)))
F0 = F0 + F4, F4 = DM(M4,I4);
// Xq(SQ1+Xq2(SQ3+Xq2(SQ5+Xq2*SQ7)))) no change on PEy
F0 = F0 * F1, I4 = R8; // I4 is data return address
// Registers:
// F0 = sine result so far
// SF0 = cosine result so far
// F4,SF4 = +/-1 for inverting sign for quadrant
// SZ = set for integer quadrant even
// Invert and swap sin/cos according to integer quadrant:
// quadrant F0 SF0
// 0 = 00 SIN COS
// 1 = 01 COS -SIN swap results, invert sine
// 2 = 10 -SIN -COS invert signs
// 3 = 11 -COS SIN swap results, invert cosine
IF NOT SZ F0 <-> SF0; // swap sin,cos if odd quadrant
RFRAME; // stack frame management before exit
JUMP (M14,I12) (DB), F0 = F0 * F4; // negate sin/cos if needed
DM(M5,I4) = F0; // store sin at I4, cos (SF0) at I4+1
BIT CLR MODE1 SIMD; // end SIMD during JUMP (DB) stall
._sinCos.end:
.global _sinCos;
.type _sinCos,STT_FUNC;
Fast power-of-10 approximation, for 32-bit floats
// Fast power-of-10 approximation, with RMS error of 1.77%.
// This approximation developed by Nicol Schraudolph (Neural Computation vol 11, 1999).
// Adapted for 32-bit floats by Lippold Haken of Haken Audio, April 2010.
// Set float variable's bits to integer expression.
// f=b^f is approximated by
// (int)f = f*0x00800000*log(b)/log(2) + 0x3F800000-60801*8
// f=10^f is approximated by
// (int)f = f*27866352.6 + 1064866808.0
inline void Pow10(float *f) { *(int *)f = *f * 27866352.6 + 1064866808.0; };
Inverse Sqrt and Fourth Root approximations
// Fast InvSqrt approxumation, with an error of less than 4%.
// This approximation is attributed to Greg Walsh.
inline void InvSqrt(float *x) { *(int *)x = 0x5f3759df - (*(int *)x >> 1); }
inline float SqrtSqrt(float x) { InvSqrt(&x); InvSqrt(&x); return x; }
Biquad bandpass filter bank
// Efficient implementation of biquad bandpass filter bank.
// This is the core computation for musical physical models ("modal models").
//
// This code takes 1.5 instructions to compute each floating-point bandpass filter;
// for a SHARC running at 3ns instruction cycle this is 4.5ns / biquad.
//
// Written by Lippold Haken of Haken Audio, 2010, using ADSP-21364 and VisualDSP++ C.
//
// dmXmod - pointer to DM data memory
// pmXmod - pointer to PM data memory
// bfbXComputePairs - optimized computation of a bank of bandpass filters
// bfbXCoef - coefficient computation for the bandpass filters
// CFDSP hardware-specific definitions.
#include "cfdsp21364.h"
// === Start of definitions for .h file
// The following structure definitions are shared by this code and by the calling code,
// so they should be moved to a common .h file.
// Data in dm.
typedef struct
{
#define bfb4FilterSections 64 // number of modes (number of bandpass filter sections)
float bfb_Y[2][bfb4FilterSections]; // save state: y[n-1] and y[n-2]
} DmX_bfb;
// Data in pm.
typedef struct
{
// The bfb_C[] contains 3 coefficients for each biquad, with a pair of biquad's coefficients
// interleaved so that biquads can be processed in pairs by the optimized SIMD assembly loop.
float bfb_C[ 3 * bfb4FilterSections ]; // bfb filter coefficients, see comment above
} PmX_bfb;
// Pointers to data, and a samples counter.
DmX_bfb *dmXmod;
PmX_bfb *pmXmod;
int sampInFrame; // sample counter
#define sr 48000 // sample rate in Hz
#define Pi (3.14159265)
#define ABS(x) ( ((x)>(0)) ? (x) : -(x) )
void sinCos( float fRadians, float *fSin, float *fCos ); // see Lippold Haken's sinCos code snippet
// === End of definitions for .h file
// Parameter arrays for Biquad Filter Bank (modal physical modelling) synthesis.
//
// If P is the biquad pair number, and R=0/R=1 distinguishes the two biquads within the pair:
// x[n] coefficient for biquad k is at pmXmod->bfb_C[ 6*P + R + 0 ]; this is also -x[n-2] coefficient.
// y[n-1] coefficient for biquad k is at pmXmod->bfb_C[ 6*P + R + 2 ]; this is scaled by 0.5 to avoid overflow
// y[n-2] coefficient for biquad k is at pmXmod->bfb_C[ 6*P + R + 4 ]
// The first index to dmXmod->bfb_Y[] is based on lsbs of sample counter.
// The second index to dmXmod->bfb_Y[] is 2P+R.
// Note: R=0 always for even voices, R=1 always for odd voices.
#define bfb_Ynm1(P) &(dmXmod->bfb_Y[sampInFrame & 1][2*(P)]) // start in y[n-1][] array
#define bfb_Ynm2(P) &(dmXmod->bfb_Y[1-(sampInFrame & 1)][2*(P)]) // start in y[n-2][] array
void bfbXComputePairs(int biquadP, int pairs,
float inDiff0, // x[n]-x[n-2] for even biquads
float inDiff1, // x[n]-x[n-2] for odd biquads
float *fSum0, // output for even biquad sum
float *fSum1) // output for odd biquad sum
// Sum the output of a sequence of at least 2 biquad pairs.
// For each of the sequence of pairs, the first (even) biquad in each pair has a common input "input0",
// and the second (odd) biquad of each pair has a (possibly different) common input "input1".
{
// Assmebly-coded bfb filter loop using floating-point math.
// The core loop takes 3 instructions for 2 biquads, or 4.5ns per biquad.
// We use the even-numbered biquads for one voice, the odd-numbered biquads for a second voice.
asm(
"#include <def21364.h> \n"
"BIT SET MODE1 SIMD; \n"
// For each biquad k: (k=6*biquadP for R=0)
// y[k](n) = C0[k] * (x[k](n) - x[k](n-2))
// + C1[k] * 2 * y[k](n-1)
// + C2[k] * y[k](n-2)
//
// Register usage for biquad k (R==0):
// F0 = C0[k], C1[k], and C2[k]
// F4 = input0-old_input0 aka x[k](n)-x[k](n-2)
// F6 = y[k](n-1), y[k](n-2)
// F8 = scratch for computing y[k](n)
// F10 = sum of even biquad's y[](n) in each biquad pair
// F12 = scratch for computing y[k](n)
// F13 = final value of y[k-2](n)
// DM(I4) = C0[k], C1[k], and C2[k]
// PM(I10) = read y[k](n-2) write to y[k-2](n)
// PM(I12) = read y[k](n-1)
// Register usage for biquad k+1 (R==1):
// SF0 = C0[k+1], C1[k+1], and C2[k+1]
// SF4 = input1-old_input1 aka x[k+1](n)-x[k+1](n-2)
// SF6 = y[k+1](n-1), y[k+1](n-2)
// SF8 = scratch for computing y[k+1](n)
// SF10 = sum of odd biquad's y[](n) in each biquad pair
// SF12 = scratch for computing y[k+1](n)
// SF13 = final value of y[k-1](n)
// DM(I4+1) = C0[k+1], C1[k+1], and C2[k+1]
// PM(I10+1)= read y[k+1](n-2) write y[k-1](n)
// PM(I12+1)= read y[k+1](n-1)
// Register usage for even and odd biquads:
// M4=M12=2
// M10=+4
// M11=-2
//
// The three instruction loop below computes two biquads simultaneously, for k and k+1,
// in each SIMD loop execution. The comments are written just for biquad k;
// the biquad k+1 code and comments are implied by SIMD mode.
// Set pointer increments.
" M4=2;M10=4;M11=-2;M12=2; \n"
// Loop priming, with k=0 (and SIMD k=1).
// Initialize F10=SF10=0 fetch C0[k]
" R10=R10-R10, F0=DM(I4,M4); \n"
// C0[k]*(x(n)-x(n-2)) fetch C1[k] fetch y[k](n-1)
" F8=F0*F4, F0=DM(I4,M4), F6=PM(I12,M12); \n"
// C1[k] * y[k](n-1) fetch C2[k] fetch y[k](n-2)
" F12=F0*F6, F0=DM(I4,M4), F6=PM(I10,M12); \n"
// C2[k] * y[k](n-2) biquad k sum fetch C0[k+2]
" F12=F0*F6, F8=F8+F12, F0=DM(I4,M4); \n"
// Main loop, for k = 2..K-2 by 2 (and SIMD for k = 1..K-1 by 2).
"DO (PC, endx) UNTIL LCE; \n"
// C0[k]*(x(n)-x(n-2)) F13 is y[k-2](n) fetch C1[k] fetch y[k](n-1)
" F8=F0*F4, F13=F8+F12, F0=DM(I4,M4), F6=PM(I12,M12); \n"
// C1[k] * y[k](n-1) output sum fetch C2[k] fetch y[k](n-2)
" F12=F0*F6, F10=F10+F13, F0=DM(I4,M4), F6=PM(I10,M11); \n"
// C2[k] * y[k](n-2) biquad k sum fetch C0[k+2] save y[k-2](n)
"endx: F12=F0*F6, F8=F8+F12, F0=DM(I4,M4), PM(I10,M10)=F13; \n"
// F13 is y[k-2](n)
" F13=F8+F12, modify(I10,M11); \n"
// output sum save y[k-2](n)
" F10=F10+F13, PM(I10,M10)=F13; \n"
"BIT CLR MODE1 SIMD; \n" // disable SIMD
" NOP; \n" // wait for SIMD disable
// Output register list, each element: "=rN" (varName)
: "=R10" (*fSum0), // F10 has floating sum of even & odd biquads
"=S10" (*fSum1) // SF10 has floating sum of even & odd biquads
// Input register list, each element: "rN" (varName)
: "lcntr" (pairs-1),
"R4" (inDiff0), // x[n]-x[n-2] for even biquads
"S4" (inDiff1), // x[n]-x[n-2] for odd biquads
"I4" (&pmXmod->bfb_C[6 * biquadP]), // coefficients for all biquads
"I10" (bfb_Ynm2(biquadP)), // y[n-2] for all biquads, updated to y[n]
"I12" (bfb_Ynm1(biquadP)) // y[n-1] for all biquads
// Clobbered register list:
// We must avoid "do not use" regs, Compiler and Library Manual p.1-245.
// We try to use mostly scratch regs, Compiler and Library Manual p.1-248.
: "r0", "r4", "r6", "r8", "r10", "r12", "r13",
"i4", "i10", "i12",
"m4", "m10", "m11", "m12" );
}
void bfbXCoef(int biquadP, int biquadR, float freq, float amp, float bw)
// Compute coefficients for a biquad filter section of the floating-point Biquad Filter Bank routine.
{
int startingIndex = 6 * biquadP + biquadR; // 6 * biquad pair number (+1 if second in pair)
float *pC = &pmXmod->bfb_C[ startingIndex ]; // index biquad filter coefficients array
float *y0 = &dmXmod->bfb_Y[0][2*biquadP+biquadR]; // filter memory for biquad
float *y1 = &dmXmod->bfb_Y[1][2*biquadP+biquadR]; // filter memory for biquad
// Is center frequency is below aliasing-cutoff limit?
if (freq != 0.0 && freq < 0.5 * sr)
{
// Compute sin and cos of the mode frequency.
float phase = freq * 2. * Pi / sr;
float fSin, fCos;
sinCos(phase, &fSin, &fCos); // efficient sine and cosine; see Lippold Haken's DSPrelated code snippet.
// These formulas are adapted from Robert Bristow-Johnson BLT biquad web posting.
// I use the BPF with "constant skirt gain, peak gain = Q", with amplitude scaling of input coefficients.
// Compute intermediate parameters, alpha and beta.
// alpha = sin(w0)/(2*Q)
// beta = 1/(1+alpha)
// y[n] = (beta * sin(w0)/2 ) * (x[n] - x[n-2])
// + (beta * 2*cos(w0) ) * y[n-1]
// + (beta * (alpha - 1) ) * y[n-2]
// In addition, the input x-coefficients are scaled by the amplitude of the biquad --
// I use **twice** the amplitude value.
float alpha = fSin * bw / 2.0; // this is sin/(2*q)
float beta = 1.0 / (1.0 + alpha);
pC[0] = amp * beta * fSin; // x[n] coefficient, and -x[n-2] coefficient
pC[2] = beta * 2. * fCos; // y[n-1] coefficient
pC[4] = beta * (alpha - 1.0); // y[n-2] coefficient
}
else
{
// Frequency of bfb filter is beyond aliasing frequency,
// or if we are about to start a new note.
// Zero bfb filter coefficients and zero out the filter memory.
pC[0] = pC[2] = pC[4] = 0;
*y0 = *y1 = 0;
}
}
Random Number Generator
// Random number generator that sounds pleasant in audio algorithms.
// The argument and return value is a 30-bit (positive nonzero) integer.
// This is Duttaro's 30-bit pseudorandom number generator, p.124 J.AES, vol 50 no 3 March 2002;
// I found this 30-bit pseudorandom number generator sounds far superior to the 31-bit
// pseudorandom number generator presented in the same article.
// To make result a signed fixed-point value, do not shift left into the sign bit;
// instead, subtract 0x20000000 then multiply by a scale factor.
#define NextRand(rr) (((((rr >> 16) ^ (rr >> 15) ^ (rr >> 1) ^ rr) & 1) << 29) | (rr >> 1))
Biquad bandpass filter bank
// Efficient implementation of biquad bandpass filter bank.
// This is the core computation for musical physical models ("modal models").
//
// This code takes 1.5 instructions to compute each floating-point bandpass filter;
// for a SHARC running at 3ns instruction cycle this is 4.5ns / biquad.
//
// Written by Lippold Haken of Haken Audio, 2010, using ADSP-21364 and VisualDSP++ C.
//
// dmXmod - pointer to DM data memory
// pmXmod - pointer to PM data memory
// bfbXComputePairs - optimized computation of a bank of bandpass filters
// bfbXCoef - coefficient computation for the bandpass filters
// CFDSP hardware-specific definitions.
#include "cfdsp21364.h"
// === Start of definitions for .h file
// The following structure definitions are shared by this code and by the calling code,
// so they should be moved to a common .h file.
// Data in dm.
typedef struct
{
#define bfb4FilterSections 64 // number of modes (number of bandpass filter sections)
float bfb_Y[2][bfb4FilterSections]; // save state: y[n-1] and y[n-2]
} DmX_bfb;
// Data in pm.
typedef struct
{
// The bfb_C[] contains 3 coefficients for each biquad, with a pair of biquad's coefficients
// interleaved so that biquads can be processed in pairs by the optimized SIMD assembly loop.
float bfb_C[ 3 * bfb4FilterSections ]; // bfb filter coefficients, see comment above
} PmX_bfb;
// Pointers to data, and a samples counter.
DmX_bfb *dmXmod;
PmX_bfb *pmXmod;
int sampInFrame; // sample counter
#define sr 48000 // sample rate in Hz
#define Pi (3.14159265)
#define ABS(x) ( ((x)>(0)) ? (x) : -(x) )
void sinCos( float fRadians, float *fSin, float *fCos ); // see Lippold Haken's sinCos code snippet
// === End of definitions for .h file
// Parameter arrays for Biquad Filter Bank (modal physical modelling) synthesis.
//
// If P is the biquad pair number, and R=0/R=1 distinguishes the two biquads within the pair:
// x[n] coefficient for biquad k is at pmXmod->bfb_C[ 6*P + R + 0 ]; this is also -x[n-2] coefficient.
// y[n-1] coefficient for biquad k is at pmXmod->bfb_C[ 6*P + R + 2 ]; this is scaled by 0.5 to avoid overflow
// y[n-2] coefficient for biquad k is at pmXmod->bfb_C[ 6*P + R + 4 ]
// The first index to dmXmod->bfb_Y[] is based on lsbs of sample counter.
// The second index to dmXmod->bfb_Y[] is 2P+R.
// Note: R=0 always for even voices, R=1 always for odd voices.
#define bfb_Ynm1(P) &(dmXmod->bfb_Y[sampInFrame & 1][2*(P)]) // start in y[n-1][] array
#define bfb_Ynm2(P) &(dmXmod->bfb_Y[1-(sampInFrame & 1)][2*(P)]) // start in y[n-2][] array
void bfbXComputePairs(int biquadP, int pairs,
float inDiff0, // x[n]-x[n-2] for even biquads
float inDiff1, // x[n]-x[n-2] for odd biquads
float *fSum0, // output for even biquad sum
float *fSum1) // output for odd biquad sum
// Sum the output of a sequence of at least 2 biquad pairs.
// For each of the sequence of pairs, the first (even) biquad in each pair has a common input "input0",
// and the second (odd) biquad of each pair has a (possibly different) common input "input1".
{
// Assmebly-coded bfb filter loop using floating-point math.
// The core loop takes 3 instructions for 2 biquads, or 4.5ns per biquad.
// We use the even-numbered biquads for one voice, the odd-numbered biquads for a second voice.
asm(
"#include <def21364.h> \n"
"BIT SET MODE1 SIMD; \n"
// For each biquad k: (k=6*biquadP for R=0)
// y[k](n) = C0[k] * (x[k](n) - x[k](n-2))
// + C1[k] * 2 * y[k](n-1)
// + C2[k] * y[k](n-2)
//
// Register usage for biquad k (R==0):
// F0 = C0[k], C1[k], and C2[k]
// F4 = input0-old_input0 aka x[k](n)-x[k](n-2)
// F6 = y[k](n-1), y[k](n-2)
// F8 = scratch for computing y[k](n)
// F10 = sum of even biquad's y[](n) in each biquad pair
// F12 = scratch for computing y[k](n)
// F13 = final value of y[k-2](n)
// DM(I4) = C0[k], C1[k], and C2[k]
// PM(I10) = read y[k](n-2) write to y[k-2](n)
// PM(I12) = read y[k](n-1)
// Register usage for biquad k+1 (R==1):
// SF0 = C0[k+1], C1[k+1], and C2[k+1]
// SF4 = input1-old_input1 aka x[k+1](n)-x[k+1](n-2)
// SF6 = y[k+1](n-1), y[k+1](n-2)
// SF8 = scratch for computing y[k+1](n)
// SF10 = sum of odd biquad's y[](n) in each biquad pair
// SF12 = scratch for computing y[k+1](n)
// SF13 = final value of y[k-1](n)
// DM(I4+1) = C0[k+1], C1[k+1], and C2[k+1]
// PM(I10+1)= read y[k+1](n-2) write y[k-1](n)
// PM(I12+1)= read y[k+1](n-1)
// Register usage for even and odd biquads:
// M4=M12=2
// M10=+4
// M11=-2
//
// The three instruction loop below computes two biquads simultaneously, for k and k+1,
// in each SIMD loop execution. The comments are written just for biquad k;
// the biquad k+1 code and comments are implied by SIMD mode.
// Set pointer increments.
" M4=2;M10=4;M11=-2;M12=2; \n"
// Loop priming, with k=0 (and SIMD k=1).
// Initialize F10=SF10=0 fetch C0[k]
" R10=R10-R10, F0=DM(I4,M4); \n"
// C0[k]*(x(n)-x(n-2)) fetch C1[k] fetch y[k](n-1)
" F8=F0*F4, F0=DM(I4,M4), F6=PM(I12,M12); \n"
// C1[k] * y[k](n-1) fetch C2[k] fetch y[k](n-2)
" F12=F0*F6, F0=DM(I4,M4), F6=PM(I10,M12); \n"
// C2[k] * y[k](n-2) biquad k sum fetch C0[k+2]
" F12=F0*F6, F8=F8+F12, F0=DM(I4,M4); \n"
// Main loop, for k = 2..K-2 by 2 (and SIMD for k = 1..K-1 by 2).
"DO (PC, endx) UNTIL LCE; \n"
// C0[k]*(x(n)-x(n-2)) F13 is y[k-2](n) fetch C1[k] fetch y[k](n-1)
" F8=F0*F4, F13=F8+F12, F0=DM(I4,M4), F6=PM(I12,M12); \n"
// C1[k] * y[k](n-1) output sum fetch C2[k] fetch y[k](n-2)
" F12=F0*F6, F10=F10+F13, F0=DM(I4,M4), F6=PM(I10,M11); \n"
// C2[k] * y[k](n-2) biquad k sum fetch C0[k+2] save y[k-2](n)
"endx: F12=F0*F6, F8=F8+F12, F0=DM(I4,M4), PM(I10,M10)=F13; \n"
// F13 is y[k-2](n)
" F13=F8+F12, modify(I10,M11); \n"
// output sum save y[k-2](n)
" F10=F10+F13, PM(I10,M10)=F13; \n"
"BIT CLR MODE1 SIMD; \n" // disable SIMD
" NOP; \n" // wait for SIMD disable
// Output register list, each element: "=rN" (varName)
: "=R10" (*fSum0), // F10 has floating sum of even & odd biquads
"=S10" (*fSum1) // SF10 has floating sum of even & odd biquads
// Input register list, each element: "rN" (varName)
: "lcntr" (pairs-1),
"R4" (inDiff0), // x[n]-x[n-2] for even biquads
"S4" (inDiff1), // x[n]-x[n-2] for odd biquads
"I4" (&pmXmod->bfb_C[6 * biquadP]), // coefficients for all biquads
"I10" (bfb_Ynm2(biquadP)), // y[n-2] for all biquads, updated to y[n]
"I12" (bfb_Ynm1(biquadP)) // y[n-1] for all biquads
// Clobbered register list:
// We must avoid "do not use" regs, Compiler and Library Manual p.1-245.
// We try to use mostly scratch regs, Compiler and Library Manual p.1-248.
: "r0", "r4", "r6", "r8", "r10", "r12", "r13",
"i4", "i10", "i12",
"m4", "m10", "m11", "m12" );
}
void bfbXCoef(int biquadP, int biquadR, float freq, float amp, float bw)
// Compute coefficients for a biquad filter section of the floating-point Biquad Filter Bank routine.
{
int startingIndex = 6 * biquadP + biquadR; // 6 * biquad pair number (+1 if second in pair)
float *pC = &pmXmod->bfb_C[ startingIndex ]; // index biquad filter coefficients array
float *y0 = &dmXmod->bfb_Y[0][2*biquadP+biquadR]; // filter memory for biquad
float *y1 = &dmXmod->bfb_Y[1][2*biquadP+biquadR]; // filter memory for biquad
// Is center frequency is below aliasing-cutoff limit?
if (freq != 0.0 && freq < 0.5 * sr)
{
// Compute sin and cos of the mode frequency.
float phase = freq * 2. * Pi / sr;
float fSin, fCos;
sinCos(phase, &fSin, &fCos); // efficient sine and cosine; see Lippold Haken's DSPrelated code snippet.
// These formulas are adapted from Robert Bristow-Johnson BLT biquad web posting.
// I use the BPF with "constant skirt gain, peak gain = Q", with amplitude scaling of input coefficients.
// Compute intermediate parameters, alpha and beta.
// alpha = sin(w0)/(2*Q)
// beta = 1/(1+alpha)
// y[n] = (beta * sin(w0)/2 ) * (x[n] - x[n-2])
// + (beta * 2*cos(w0) ) * y[n-1]
// + (beta * (alpha - 1) ) * y[n-2]
// In addition, the input x-coefficients are scaled by the amplitude of the biquad --
// I use **twice** the amplitude value.
float alpha = fSin * bw / 2.0; // this is sin/(2*q)
float beta = 1.0 / (1.0 + alpha);
pC[0] = amp * beta * fSin; // x[n] coefficient, and -x[n-2] coefficient
pC[2] = beta * 2. * fCos; // y[n-1] coefficient
pC[4] = beta * (alpha - 1.0); // y[n-2] coefficient
}
else
{
// Frequency of bfb filter is beyond aliasing frequency,
// or if we are about to start a new note.
// Zero bfb filter coefficients and zero out the filter memory.
pC[0] = pC[2] = pC[4] = 0;
*y0 = *y1 = 0;
}
}
Inverse Sqrt and Fourth Root approximations
// Fast InvSqrt approxumation, with an error of less than 4%.
// This approximation is attributed to Greg Walsh.
inline void InvSqrt(float *x) { *(int *)x = 0x5f3759df - (*(int *)x >> 1); }
inline float SqrtSqrt(float x) { InvSqrt(&x); InvSqrt(&x); return x; }
Fast power-of-10 approximation, for 32-bit floats
// Fast power-of-10 approximation, with RMS error of 1.77%.
// This approximation developed by Nicol Schraudolph (Neural Computation vol 11, 1999).
// Adapted for 32-bit floats by Lippold Haken of Haken Audio, April 2010.
// Set float variable's bits to integer expression.
// f=b^f is approximated by
// (int)f = f*0x00800000*log(b)/log(2) + 0x3F800000-60801*8
// f=10^f is approximated by
// (int)f = f*27866352.6 + 1064866808.0
inline void Pow10(float *f) { *(int *)f = *f * 27866352.6 + 1064866808.0; };
Fast SIMD sine and cosine
// Fast sine and cosine for SHARC ADSP-21364, callable from VisualDSP++ C.
// Simultaneously compute sine and cosine of a given angle.
// Polynomial approximation developed by Flemming Pedersen of CERN.
// This implementation by Lippold Haken of Haken Audio, March 2010, October 2012.
//
// C prototype for this function:
// typedef struct { float sin, cos; } SinCos;
// void sinCos( float fRadians, SinCos *result );
//
// 23 cycles to execute this function:
// 18 cycles for the computations
// 5 cycles for compiler environment overhead.
#include <def21364.h>
// Data table for sinCos function.
.section/dm/DOUBLE32 seg_dmda;
sinCosData:
.global sinCosData;
.type sinCosData,STT_OBJECT;
.var =
// Integer 3 for "mod 4" operation.
0x00000003, 0x00000003,
// 1.0 for neutralizing final cos multiply.
0x3F800000, 0x3F800000,
// Coefficients for sin and cos polynomials.
0xBB96BD89, // SQ7 for sin polynomial -4.6002309092153379e-003
0xBCA75707, // CQ6 for cos polynomial -2.0427240364907607e-002
0x3DA32F1D, // SQ5 for sin polynomial 7.9679708649230657e-002
0x3E81D8BD, // CQ4 for cos polynomial 2.5360671639164339e-001
0xBF255DDD, // SQ3 for sin polynomial -6.4596348437163809e-001
0xBF9DE9D1, // CQ2 for cos polynomial -1.2336979844380824e+000
0x3FC90FDB, // SQ1 for sin polynomial 1.5707963267948966e+000
0x3F800000, // CQ0 for cos polynomial 1.0000000000000000e+000
// Sign adjustment table, indexed by integer quadrant.
0x3F800000, 0x3F800000, // 1.0, 1.0
0x3F800000, 0xBF800000, // 1.0,-1.0
0xBF800000, 0xBF800000, // -1.0,-1.0
0xBF800000, 0x3F800000; // -1.0, 1.0
sinCosData.end:
.section/pm/DOUBLE32 seg_pmco;
_sinCos:
// Registers in VisualDSP environment:
// M5,M13 = 0
// M6,M14 = 1
// M7,M15 = -1
// F4 = first function argument (input angle)
// R8 = pointer to address for storing sin,cos result
// R0,R1,R2,R4,R8,R12,I4,I12,I13,M4 need not be preserved
// I6,I7 = stack and frame pointers
// C calling routine has: "CJUMP (DB); DM(I7,M7)=R2; DM(I7,M7)=PC;"
// CJUMP does these operations: "R2=I6, I6=I7"
// Before exiting must do RFRAME: "I7=I6, I6=DM(0,I6)"
// Preamble.
SF4 = F4;
BIT SET MODE1 SIMD;
// Pointer to constants.
I4 = sinCosData;
// Compute integer quadrant, fractional quadrant, fractional quadrant squared.
// Quadrant "q" = fRadians * 2./Pi
// Quadrant values (0. .. 4.) correspond to (0. .. 2Pi)
R1 = 0x3F22F983; // F1 = 2./Pi
F0 = F1 * F4, I12 = DM(M7,I6); // I12 is execution return address
// Integer quadrant = round(q) mod 4 (0..3)
// Since MODE1 TRUNC bit is zero, FIX implements round-to-nearest.
R1 = FIX F0, R2 = DM(I4,2);
R2 = R1 AND R2, F12 = DM(I4,2);
// Fractional quadrant "Xq" = q - round(q) (-.5 .. .5)
// Fractional quadrant values (-.5 .. .5) correspond to (-Pi/4..Pi/4)
F1 = FLOAT R1, F4 = DM(I4,2);
F1 = F0 - F1, M4 = R2;
// Set SZ if integer quadrant is even.
R2 = LSHIFT R2 BY 31;
// Fractional quadrant squared "Xq2" = Xq * Xq
// Set SF1 = 1.0 to avoid final Xq multiply for cosine polynomial.
F2 = F1 * F1, F12 <-> SF1;
// Registers involved in sin and cos computation:
// F1 = fractional quadrant Xq
// SF1 = 1.0
// F2,SF2 = fractional quadrant squared Xq2
// F4 = SQ7
// SF4 = CQ6
// M4 = integer quadrant
// SZ = indicates integer quadrant is even
// DM(I4) = pairs of cosine and sine coefficients
// Compute sine polynomial (PEx) and cosine polynomial (PEy).
// Xq2*SQ7 Xq2*CQ6
F0 = F2 * F4, F4 = DM(I4,2);
// SQ5+Xq2*SQ7 CQ4+Xq2*CQ6
F0 = F0 + F4;
// Xq2(SQ5+Xq2*SQ7) Xq2(CQ4+Xq2*CQ6)
F0 = F0 * F2, F4 = DM(I4,2);
// SQ3+Xq2(SQ5+Xq2*SQ7) CQ2+Xq2(CQ4+Xq2*CQ6)
F0 = F0 + F4, F4 = DM(I4,2);
// Xq2(SQ3+Xq2(SQ5+Xq2*SQ7)) Xq2(CQ2+Xq2(CQ4+Xq2*CQ6))
F0 = F0 * F2, MODIFY(I4,M4);
// SQ1+Xq2(SQ3+Xq2(SQ5+Xq2*SQ7))) CQ0+Xq2(CQ2+Xq2(CQ4+Xq2*CQ6)))
F0 = F0 + F4, F4 = DM(M4,I4);
// Xq(SQ1+Xq2(SQ3+Xq2(SQ5+Xq2*SQ7)))) no change on PEy
F0 = F0 * F1, I4 = R8; // I4 is data return address
// Registers:
// F0 = sine result so far
// SF0 = cosine result so far
// F4,SF4 = +/-1 for inverting sign for quadrant
// SZ = set for integer quadrant even
// Invert and swap sin/cos according to integer quadrant:
// quadrant F0 SF0
// 0 = 00 SIN COS
// 1 = 01 COS -SIN swap results, invert sine
// 2 = 10 -SIN -COS invert signs
// 3 = 11 -COS SIN swap results, invert cosine
IF NOT SZ F0 <-> SF0; // swap sin,cos if odd quadrant
RFRAME; // stack frame management before exit
JUMP (M14,I12) (DB), F0 = F0 * F4; // negate sin/cos if needed
DM(M5,I4) = F0; // store sin at I4, cos (SF0) at I4+1
BIT CLR MODE1 SIMD; // end SIMD during JUMP (DB) stall
._sinCos.end:
.global _sinCos;
.type _sinCos,STT_FUNC;