*================================================================================
*
*       TEXAS INSTRUMENTS, INC.         
*
*       DOT PRODUCT
*
*       Revision Date:  04/07/97
*			07/29/97	Gunter Schmer
*					support 40-bit accumulate
*					result 40-bit long
*       
*       USAGE   This routine is C Callable and can be called as:
*               
*               long dotprod40(short a[], short b[], int N)
*
*               a[] --- first vector array 
*               b[] --- second vector array
*               N   --- number of elements of vector
*
*               If routine is not to be used as a C callable function then
*               you need to initialize values for all of the values passed
*               as these are assumed to be in registers as defined by the 
*               calling convention of the compiler, (refer to the C compiler
*               reference guide).
*
*       C CODE
*               This is the C equivalent of the assembly code.  Note that
*               the assembly code is hand optimized and restrictions may
*               apply.
*
*               long dotprod40(short a[],short b[], int N)
*               {
*               long sum;
*               int i;
*
*               sum = 0;
*               for(i=0; i<N; i++){
*                       sum += (a[i] * b[i]);
*                       }
*               return(sum);
*               }
*
*
*       DESCRIPTION
*
*               This routine takes two vectors and calculates their dot
*               (inner) product.  The inputs are 16-bit numbers, and the
*               result is a 40-bit number.
*               
*       TECHNIQUES
*
*               1.  Load words are used to load two 16-bit values at a time
*               2.  The loop is unrolled once
*       
*       ASSUMPTIONS
*
*               1.  N is an even number greater than 2 
*               2.  Vectors a and b should be aligned on word boundaries
*               
*       MEMORY NOTE
*
*               Vectors a and b should be aligned on opposite word
*               boundaries to avoid memory hits.
*
*       ARGUMENTS PASSED
*
*               a[]  ->  A4
*               b[]  ->  B4
*               N    ->  A6
*
*
*       CYCLES
*
*               N/2 + 15
*
*================================================================================
        .global _dotprod40
        .text

*--------------------------------------------------------------------------------
* long dotprod40(short a[], short b[], int N)
*
*   In:    A4 = a[], B4 = b[], A6 = N
*   Out:   A5:A4 = 40-bit sum (low 32 bits in A4, top 8 bits in A5[7:0],
*          A5[31:8] cleared by the final EXTU)
*   Uses:  A0,B0   packed pairs of 16-bit inputs fetched by LDW
*          A1      loop counter and branch predicate (counts down by 2)
*          A7,B7   products of the low / high halfwords
*          A3:A2   Sum1, 40-bit accumulator for the low-halfword products
*          B3:B2   Sum2, 40-bit accumulator for the high-halfword products
*          B8      copy of the return address while B3 serves as accumulator
*          A4,B4   are post-incremented as the vectors are read
*
*   Schedule (relies on the C6000 delay slots: LDW 4, MPY 1, B 5):
*     - five prologue packets each start one LDW pair and one branch to LOOP,
*       so the first loaded words and the first branch both land on LOOP;
*     - LOOP is a single execute packet; it runs N/2 + 2 times.  The two extra
*       passes only drain the MPY -> ADD latency, their products are never
*       accumulated;
*     - because the loads run 5 words ahead of the multiplies, several words
*       beyond the end of a[] and b[] are read and discarded.
*
*   N must be even and non-negative: the counter is stepped by 2 and the
*   predicates only test for "not zero", so an odd or negative N never hits 0
*   cleanly.  N == 0 is safe and returns 0 (see the first SUB below).
*--------------------------------------------------------------------------------
_dotprod40:
        ZERO    .L1     A3              ; Sum1 high word = 0
||      ZERO    .L2     B3              ; Sum2 high word = 0 (B3 becomes an accumulator)
||      MV      .S2     B3,B8           ; keep return address in B8 (reads B3 before the ZERO lands)

        LDW     .D1     *A4++,A0        ; start load of a[0] & a[1]
||      LDW     .D2     *B4++,B0        ; start load of b[0] & b[1]
||      B       .S2     LOOP            ; unconditional: first re-entry into LOOP
||      ZERO    .L1     A2              ; Sum1 low word = 0
||      ZERO    .L2     B2              ; Sum2 low word = 0
||      MPY     .M1     A7,0,A7         ; A7 = 0 using the multiplier (.L1/.S1 are taken)
||      MPY     .M2     B7,0,B7         ; B7 = 0 using the multiplier (.L2/.S2 are taken)
||      MV      .S1     A6,A1           ; A1 = N

* The decrement below is predicated like all the others.  Left unconditional,
* N == 0 would turn the counter into -2, every later [A1] branch would be
* taken and the loop would run away through memory.  For N != 0 the predicate
* is true here, so nothing changes.
        LDW     .D1     *A4++,A0        ; start load of a[2] & a[3]
||      LDW     .D2     *B4++,B0        ; start load of b[2] & b[3]
||[A1]  B       .S2     LOOP            ; another LOOP pass while elements remain
||[A1]  SUB     .L1     A1,2,A1         ; two elements accounted for

        LDW     .D1     *A4++,A0        ; start load of a[4] & a[5]
||      LDW     .D2     *B4++,B0        ; start load of b[4] & b[5]
||[A1]  B       .S2     LOOP            ; another LOOP pass while elements remain
||[A1]  SUB     .S1     A1,2,A1         ; two elements accounted for

        LDW     .D1     *A4++,A0        ; start load of a[6] & a[7]
||      LDW     .D2     *B4++,B0        ; start load of b[6] & b[7]
||[A1]  B       .S2     LOOP            ; another LOOP pass while elements remain
||[A1]  SUB     .S1     A1,2,A1         ; two elements accounted for

        LDW     .D1     *A4++,A0        ; start load of a[8] & a[9]
||      LDW     .D2     *B4++,B0        ; start load of b[8] & b[9]
||[A1]  B       .S2     LOOP            ; another LOOP pass while elements remain
||[A1]  SUB     .S1     A1,2,A1         ; two elements accounted for

* Steady state: every stage of the pipeline is active in this one packet.
LOOP:

        LDW     .D1     *A4++,A0        ; fetch a[] pair, 5 words ahead of the multiply
||      LDW     .D2     *B4++,B0        ; fetch b[] pair, 5 words ahead of the multiply
||      MPY     .M1X    A0,B0,A7        ; low halfwords:  a[i]   * b[i]
||      MPYH    .M2X    A0,B0,B7        ; high halfwords: a[i+1] * b[i+1]
||      ADD     .L1     A7,A3:A2,A3:A2  ; Sum1 += low-halfword product (40-bit)
||      ADD     .L2     B7,B3:B2,B3:B2  ; Sum2 += high-halfword product (40-bit)
||[A1]  SUB     .S1     A1,2,A1         ; two elements accounted for
||[A1]  B       .S2     LOOP            ; another LOOP pass while elements remain

* Combine the two 40-bit partial sums into A5:A4.
        ADDU    .L1X    A3:A2,B2,A5:A4  ; A5:A4 = Sum1 + low word of Sum2 (carry goes to A5)
        ADD     .L1X    B3,A5,A5        ; add in the high word of Sum2
        EXTU    .S1     A5,24,24,A5     ; keep only result bits 39..32 in A5
||      MV      .S2     B8,B3           ; return address back into B3

        B       .S2     B3              ; return
        NOP     5                       ; fill the branch delay slots


