*****************************************************************
* Application: 	DTMF codec
* File:		goertzel.asm
* Description:	Goertzel algorithm implemented in asm
*		
* History:
* Date		Who		Comment
* 07-03-97	Gunter Schmer	creation
*
*****************************************************************


*************************************************************************
* Function: 	void goertzel(	short data[],	--> A4 arg1
*				short energy[],	--> B4 arg2
*				int coef01,	--> A6 arg3
*				int coef23,	--> B6 arg4
*				short N      );	--> A8 arg5
*						    B3 retaddr
*
* Description:	This routine computes 4 Goertzel Difference Eqns
*		The execution is done in parallel and takes
*		4 cycles per iteration (1 cycle per frequency per
*		iteration). The number of iterations is typically N=102.
*		From the filter states the Energy template is
*		computed. Number format is Q15.
*
*		Recursive Goertzel DFEs
* 		v0(n) = 2*c0*v0(n-1) - v0(n-2) + x(n)
* 		v1(n) = 2*c1*v1(n-1) - v1(n-2) + x(n)
* 		v2(n) = 2*c2*v2(n-1) - v2(n-2) + x(n)
* 		v3(n) = 2*c3*v3(n-1) - v3(n-2) + x(n)
*
*		Energy template:
*		E0 = v0(N-1)*v0(N-1)+v0(N-2)*v0(N-2)-2*c0*v0(N-1)*v0(N-2)
*		E1 = v1(N-1)*v1(N-1)+v1(N-2)*v1(N-2)-2*c1*v1(N-1)*v1(N-2)
*		E2 = v2(N-1)*v2(N-1)+v2(N-2)*v2(N-2)-2*c2*v2(N-1)*v2(N-2)
*		E3 = v3(N-1)*v3(N-1)+v3(N-2)*v3(N-2)-2*c3*v3(N-1)*v3(N-2)
*
* Pseudo C-code:
*
*	void goertzel(	short data[],
*			short energy[],
*			int coef01,
*			int coef23,
*			int N      )
*	{
*		int n;
*		int v0n0,v1n0,v2n0,v3n0;  /* vx(n)   */
*		int v0n1,v1n1,v2n1,v3n1;  /* vx(n-1) */
*		int v0n2,v1n2,v2n2,v3n2;  /* vx(n-2) */
*
*		/* Goertzel DFEs */
*		for(n=0;n<N;n++)	{
* 			v0n0 = 2*c0*v0n1 - v0n2 + data[n];
*			v0n2 = v0n1;
*			v0n1 = v0n0;
* 			v1n0 = 2*c1*v1n1 - v1n2 + data[n];
*			v1n2 = v1n1;
*			v1n1 = v1n0;
* 			v2n0 = 2*c2*v2n1 - v2n2 + data[n];
*			v2n2 = v2n1;
*			v2n1 = v2n0;
* 			v3n0 = 2*c3*v3n1 - v3n2 + data[n];
*			v3n2 = v3n1;
*			v3n1 = v3n0;
*		}
*
*		/* Energy template */
*		energy[0] = v0n1*v0n1+v0n2*v0n2-2*c0*v0n1*v0n2;
*		energy[1] = v1n1*v1n1+v1n2*v1n2-2*c1*v1n1*v1n2;
*		energy[2] = v2n1*v2n1+v2n2*v2n2-2*c2*v2n1*v2n2;
*		energy[3] = v3n1*v3n1+v3n2*v3n2-2*c3*v3n1*v3n2;
*	}
*
* Regs used:	A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13
*		B0,B1,B2,B3,B4,B5,B6,B7,B8,B9,B10
*
* Regs modified:A0,A1,A2,A3,A4,A5,A6,A7,A8,A9
*		B0,B1,B2,B4,B5,B6,B7,B8,B9
*
* Execution:	4N+34 cycles
*
*************************************************************************
		.global	_goertzel
_goertzel:
;
; Entry point, callable from C as
;   void goertzel(short data[], short energy[], int coef01, int coef23, short N)
; with data in A4, energy in B4, coef01 in A6, coef23 in B6, N in A8 and the
; return address in B3.
;
; coef01 packs c0 in its low halfword and c1 in its high halfword (the loop
; reads them with MPY / MPYHL on A6); coef23 packs c2 / c3 the same way.
;
; Each energy[k] is written as the absolute value of (Ek >> 15), truncated
; to 16 bits by STH.
;
; NOTE:	the loop pre-loads samples, so LDH is issued N+2 times: data[N] and
;	data[N+1] are read although their values never reach the result.
; NOTE:	N must be >= 1. The loop counter starts at N-1 and the loop only
;	stops once it reaches zero, so N <= 0 is not handled.
;
; The code is hand-scheduled: instructions joined by || issue together, and
; the results of MPY, LDH and B are consumed a fixed number of execute
; packets later. Do not insert, delete or reorder instructions without
; re-deriving the schedule.
;

;------ Prologue: save A10-A13 and B10 (restored before return) ----------
; The frame is 6 words: 5 save slots plus 1 pad word, so that B15 (SP)
; stays 8-byte aligned while this routine owns the frame. Slot [5] is the
; unused pad word.
	STW	.D2	A13,*B15--[6]	;save A13 at old SP, then SP -= 6 words
	STW	.D2	A10,*+B15[1]	;save A10
	STW	.D2	B10,*+B15[2]	;save B10
	STW	.D2	A11,*+B15[3]	;save A11
	STW	.D2	A12,*+B15[4]	;save A12
	
	MV	.S2x	A4,B8		;arg1	-->pxn
	MV	.S1x	B6,A13		;arg4	-->coef23 (c2 low half, c3 high half)

	SUB	.L2x	A8,1,B0		;arg5-1	-->loop count
||	MVK	.S1	0,A1		;v0(n-1)=0

	MVK	.S1	0,A2		;v0(n-2)=0

	MVK	.S1	0,A4		;v1(n-1)=0 (arg1 was already copied to B8)

	MVK	.S1	0,A5		;v1(n-2)=0

;
;------ Register Allocation for Goertzel Loop ------------------------
;	A0 : prod01	A3 : prod11	A7 : prod21	A10: prod31
;	A0 : prod02	A3 : prod12	A7 : prod22	A10: prod32
;	A1 : v0n1	A4 : v1n1	A8 : v2n1	A11: v3n1
;	A2 : v0n2	A5 : v1n2	A9 : v2n2	A12: v3n2
;	B1 : sum01	B2 : sum11	B5 : sum21	B6 : sum31
;	A6 : c0		A6 : c1		A13: c2		A13: c3
;
;	B0 : cnt
;	B3 : retaddr
;	B4 : pEner
;	B7 : xn
;	B8 : pxn
;	B14: DP
;	B15: SP
;

;------ Goertzel Loop-------------------------------------------------
; Loop prologue: finish zeroing the filter states while the first sample
; load and the first products are started.
	MVK	.S1	0,A8		;v2(n-1)=0
||	LDH	.D2	*B8++,B7	;load x(0)

	; Redundant repeat of the previous MVK. It is kept because it
	; occupies one execute packet of the schedule; x(0) must have
	; arrived in B7 by the time the first SUB below reads it.
	MVK	.S1	0,A8		;v2(n-1)=0

	MVK	.S1	0,A9		;v2(n-2)=0

	MVK	.S1	0,A11		;v3(n-1)=0
||	MPY	.M1	A6,A1,A0	;prd01=c0*v0(n-1)

	MVK	.S1	0,A12		;v3(n-2)=0
||	MPYHL	.M1	A6,A4,A3	;prd11=c1*v1(n-1)
||	LDH	.D2	*B8++,B7	;load x(1)
|| [B0]	B	.S2	loop		;branch to loop (skipped when N==1)

	MPY	.M1	A13,A8,A7	;prd21=c2*v2(n-1)
||	SHR	.S1	A0,14,A0	;prd02=prd01>>14 (=2*prd01 in Q15)
||	SUB	.L2x	B7,A2,B1	;sum01=x(n)-v0(n-2)

; Loop kernel: 4 execute packets per sample, one state update per packet.
; Each packet finishes the update of one filter (ADD + MV) and starts the
; product of another.
loop:
	MPYHL	.M1	A13,A11,A10	;prd31=c3*v3(n-1)
||	SHR	.S1	A3,14,A3	;prd12=prd11>>14 (=2*prd11 in Q15)
||	SUB	.L2x	B7,A5,B2	;sum11=x(n)-v1(n-2)
||	ADD	.L1x	A0,B1,A1	;v0_new(n-1)=prd02+sum01
||	MV	.D1	A1,A2		;v0(n-1)-->v0(n-2)

	MPY	.M1	A6,A1,A0	;prd01=c0*v0(n-1)
||	SHR	.S1	A7,14,A7	;prd22=prd21>>14 (=2*prd21 in Q15)
||	SUB	.L2x	B7,A9,B5	;sum21=x(n)-v2(n-2)
||	ADD	.L1x	A3,B2,A4	;v1_new(n-1)=prd12+sum11
||	MV	.D1	A4,A5		;v1(n-1)-->v1(n-2)
|| [B0]	SUB	.S2	B0,1,B0		;cnt=cnt-1 (stops at zero)

	MPYHL	.M1	A6,A4,A3	;prd11=c1*v1(n-1)
||	SHR	.S1	A10,14,A10	;prd32=prd31>>14 (=2*prd31 in Q15)
||	SUB	.L2x	B7,A12,B6	;sum31=x(n)-v3(n-2)
||	ADD	.L1x	A7,B5,A8	;v2_new(n-1)=prd22+sum21
||	MV	.D1	A8,A9		;v2(n-1)-->v2(n-2)
||	LDH	.D2	*B8++,B7	;load a later x(n) ahead of its use
|| [B0]	B	.S2	loop		;branch to loop while cnt!=0

	MPY	.M1	A13,A8,A7	;prd21=c2*v2(n-1)
||	SHR	.S1	A0,14,A0	;prd02=prd01>>14 (=2*prd01 in Q15)
||	SUB	.L2x	B7,A2,B1	;sum01=x(n)-v0(n-2)
||	ADD	.L1x	A10,B6,A11	;v3_new(n-1)=prd32+sum31
||	MV	.D1	A11,A12		;v3(n-1)-->v3(n-2)
	
;
;------ Register Allocation for Energy Template Computation ------------
;	A1 : v0n1	A4 : v1n1	A8 : v2n1	A11: v3n1
;	B0 : v0n2	B1 : v1n2	B5 : v2n2	B6 : v3n2
;	A6 : c0		A6 : c1		A13: c2		A13: c3
;	v0n1 : prd01	v1n1 : prd11	v2n1 : prd21	v3n1 : prd31
;	v0n2 : prd02	v1n2 : prd12	v2n2 : prd22	v3n2 : prd32
;	B7 : prd03	B8 : prd13	B9 : prd23	B10: prd33
;	A0 : prd04	A2 : prd14	A3 : prd24	A5 : prd34
;
;	(The vXn1 / vXn2 registers are overwritten in place by their own
;	squares; A6 is re-used as the constant 0 once c0 and c1 have been
;	read for the last time; A2 is re-used as the "negative" flag.)
;
;	B3 : retaddr
;	B4 : pEner
;	B14: DP
;	B15: SP
;

;--------- Energy template computation ---------------------------------
; For each filter k: Ek = vk1*vk1 + vk2*vk2 - 2*ck*(vk1*vk2).
; The cross term is formed as ck * (high half of vk1*vk2), then << 2.
; Ek is shifted right by 15 and its absolute value is stored to *pEner++.
; The four computations are interleaved, staggered by two packets each.

	MV	.L2x	A2,B0		;move v0(n-2) to B
	MV	.L2x	A5,B1		;move v1(n-2) to B
	MV	.L2x	A9,B5		;move v2(n-2) to B

	MPY	.M2x	A1,B0,B7	;prd03=v0(n-1)*v0(n-2)

	MPY	.M1	A1,A1,A1	;prd01=v0(n-1)*v0(n-1)
||	MPY	.M2	B0,B0,B0	;prd02=v0(n-2)*v0(n-2)
||	MV	.L2x	A12,B6		;move v3(n-2) to B

	MPYLH	.M1x	A6,B7,A0	;prd04=c0*(prd03 high half)
||	MPY	.M2x	A4,B1,B8	;prd13=v1(n-1)*v1(n-2)

	MPY	.M1	A4,A4,A4	;prd11=v1(n-1)*v1(n-1)
||	MPY	.M2	B1,B1,B1	;prd12=v1(n-2)*v1(n-2)
||	ADD	.L2x	A1,B0,B0	;prd02=prd01+prd02

	MPYH	.M1x	A6,B8,A2	;prd14=c1*(prd13 high half)
||	MPY	.M2x	A8,B5,B9	;prd23=v2(n-1)*v2(n-2)
||	SHL	.S1	A0,2,A0		;prd04=prd04<<2

	MPY	.M1	A8,A8,A8	;prd21=v2(n-1)*v2(n-1)
||	MPY	.M2	B5,B5,B5	;prd22=v2(n-2)*v2(n-2)
||	ADD	.L2x	A4,B1,B1	;prd12=prd11+prd12
||	SUB	.L1x	B0,A0,A1	;prd01=prd02-prd04	(= E0)
||	MVK	.S1	0,A6		;init compare value zero (c0,c1 no longer needed)

	MPYLH	.M1x	A13,B9,A3	;prd24=c2*(prd23 high half)
||	MPY	.M2x	A11,B6,B10	;prd33=v3(n-1)*v3(n-2)
||	SHL	.S1	A2,2,A2		;prd14=prd14<<2

	MPY	.M1	A11,A11,A11	;prd31=v3(n-1)*v3(n-1)
||	MPY	.M2	B6,B6,B6	;prd32=v3(n-2)*v3(n-2)
||	ADD	.L2x	A8,B5,B5	;prd22=prd21+prd22
||	SUB	.L1x	B1,A2,A4	;prd11=prd12-prd14	(= E1)
||	SHR	.S1	A1,15,A1	;prd01 in Q15

	MPYH	.M1x	A13,B10,A5	;prd34=c3*(prd33 high half)
||	SHL	.S1	A3,2,A3		;prd24=prd24<<2
||	CMPLT	.L1	A1,A6,A2	;(prd01<0) ??

	ADD	.L2x	A11,B6,B6	;prd32=prd31+prd32
||	SUB	.L1x	B5,A3,A8	;prd21=prd22-prd24	(= E2)
||	SHR	.S1	A4,15,A4	;prd11 in Q15
||[!A2]	STH	.D2	A1,*B4++	;if(prd01>=0) prd01-->Energy template

	SHL	.S1	A5,2,A5		;prd34=prd34<<2
||[A2]	NEG	.L1	A1,A1		;if(prd01<0) prd01=-prd01

	SUB	.L1x	B6,A5,A11	;prd31=prd32-prd34	(= E3)
||	SHR	.S1	A8,15,A8	;prd21 in Q15
||[A2]	STH	.D2	A1,*B4++	;if(prd01<0) -prd01-->Energy template

	SHR	.S1	A11,15,A11	;prd31 in Q15
||	CMPLT	.L1	A4,A6,A2	;(prd11<0) ??

; From here on exactly one of each [!A2] / [A2] store pair executes, so
; pEner advances by one halfword per energy value.
  [!A2]	STH	.D2	A4,*B4++	;if(prd11>=0) prd11-->Energy template
||[A2]	NEG	.L1	A4,A4		;if(prd11<0) prd11=-prd11

  [A2]	STH	.D2	A4,*B4++	;if(prd11<0) -prd11-->Energy template
||	CMPLT	.L1	A8,A6,A2	;(prd21<0) ?? (the store still sees the old A2)

  [!A2]	STH	.D2	A8,*B4++	;if(prd21>=0) prd21-->Energy template
||[A2]	NEG	.L1	A8,A8		;if(prd21<0) prd21=-prd21

  [A2]	STH	.D2	A8,*B4++	;if(prd21<0) -prd21-->Energy template
||	CMPLT	.L1	A11,A6,A2	;(prd31<0) ?? (the store still sees the old A2)

  [!A2]	STH	.D2	A11,*B4++	;if(prd31>=0) prd31-->Energy template
||[A2]	NEG	.L1	A11,A11		;if(prd31<0) prd31=-prd31

  [A2]	STH	.D2	A11,*B4++	;if(prd31<0) -prd31-->Energy template

;------ Epilogue: restore saved registers, release the frame, return -----
	LDW	.D2	*+B15[4],A12	;restore A12
	LDW	.D2	*+B15[3],A11	;restore A11
	LDW	.D2	*+B15[2],B10	;restore B10

	; The return branch is issued early; the final LDW and the NOP 4
	; that follow it execute before control leaves this routine.
	B	.S2	B3		;return
||	LDW	.D2	*+B15[1],A10	;restore A10
	LDW	.D2	*++B15[6],A13	;SP += 6 words (frame size), then restore A13
	NOP		4

