#include "pitsTT6lib.h"

// Number of doubles processed per pass of the unrolled kernel loops; each
// kernel advances its loop index by this amount.
#define UNROLL_A 8
// BREAKb/BREAKc/BREAKd and PARAM are not referenced by the kernels in this
// part of the file.
// NOTE(review): presumably consumed by one of the included headers or by
// other code -- confirm before removing.
#define BREAKb 4
#define BREAKc 4
#define BREAKd 4
#define PARAM ((16 << 24) | (32 << 16) | 64)

// Shift amounts used to pack the three fields (named size, count and stride)
// of the control word passed as the second argument of vec_dstt().
#define AV_SIZE 24
#define AV_COUNT 16
#define AV_STRIDE 0

#include "trace.h"

#if TRACE

// Begin a trace region labelled `name`.
//
// The label is copied into a local 16-byte buffer and that copy is handed to
// startTrace(). The copy is bounded: labels longer than 15 characters are
// truncated instead of overflowing the buffer, and the buffer is always
// NUL-terminated.
void start_trace(char *name)
{
	char buf[16];

	strncpy(buf, name, sizeof buf - 1);
	buf[sizeof buf - 1] = '\0';	// strncpy does not terminate on truncation
	startTrace(buf);
}

// End tracing by forwarding to stopTrace(). This is the TRACE-enabled
// counterpart of the empty stop_trace() macro defined in the #else branch.
void stop_trace(void)
{
	stopTrace();
}

#else

// TRACE is off: both trace hooks expand to nothing, so the calls made by the
// kernels reduce to empty statements.
#define start_trace(name)
#define stop_trace()

#endif

// Copy kernel: c[i] = a[i] for 0 <= i < N.
//
// b and scalar are not used by this kernel.
//
// The main loop is hand-unrolled and moves UNROLL_A doubles (64 bytes) per
// pass; the displacements in the asm are hard-coded for UNROLL_A == 8. Any
// leftover N % UNROLL_A elements are copied by a plain C loop afterwards, so
// the asm never reads or writes past the end of the arrays.
//
// NOTE(review): the two dcbz instructions address byte offsets 0 and 32 of
// each 64-byte output chunk. That presumably relies on 32-byte cache blocks
// and on c being 32-byte aligned -- confirm for the target CPU.
void do_copy(double *a, double *b, double *c, double scalar, int N)
{
	int i;
	int main_end = N - N % UNROLL_A;	// elements covered by whole unrolled passes
	register double *ai, *ci;
	register int k8 = 8;
	register int k40 = 40;

	start_trace("copy");

	// Start data stream 0 at the head of a.
	vec_dstt((__vector float *) &a[0],
		 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);

	// Bias both pointers backwards by 8 bytes (one double) so that the last
	// access of each pass can use the update forms (lfdu/stfdu) to advance
	// them by 64 bytes.
	ai = &a[-1];
	ci = &c[-1];

	for (i = 0; i < main_end; i += UNROLL_A) {
		register double t0, t1, t2, t3;

		asm {
				dcbz	k8, ci
				dcbz	k40, ci

				lfd		t0, 8(ai)
				lfd		t1, 16(ai)
				lfd		t2, 24(ai)
				lfd		t3, 32(ai)
				stfd	t0, 8(ci)
				stfd	t1, 16(ci)
				stfd	t2, 24(ci)
				stfd	t3, 32(ci)

				lfd		t0, 40(ai)
				lfd		t1, 48(ai)
				lfd		t2, 56(ai)
				lfdu	t3, 64(ai)
				stfd	t0, 40(ci)
				stfd	t1, 48(ci)
				stfd	t2, 56(ci)
				stfdu   t3, 64(ci)
		}

		// On the last pass of every 256-element group, restart stream 0
		// at a[i+128].
		if ((i & 255) == 256-UNROLL_A)
			vec_dstt((__vector float *) &a[i+128],
				 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);
	}

	// Scalar tail for the N % UNROLL_A elements the unrolled loop skipped.
	for (; i < N; i++)
		c[i] = a[i];

	vec_dssall();

	stop_trace();
}

// Scale kernel: c[i] = a[i] * scalar for 0 <= i < N.
//
// b is not used by this kernel.
//
// The main loop is hand-unrolled and handles UNROLL_A doubles (64 bytes) per
// pass; the displacements in the asm are hard-coded for UNROLL_A == 8. Any
// leftover N % UNROLL_A elements are processed by a plain C loop afterwards,
// so the asm never reads or writes past the end of the arrays.
//
// NOTE(review): the two dcbz instructions address byte offsets 0 and 32 of
// each 64-byte output chunk. That presumably relies on 32-byte cache blocks
// and on c being 32-byte aligned -- confirm for the target CPU.
void do_scale(double *a, double *b, double *c, register double scalar, int N)
{
	int i;
	int main_end = N - N % UNROLL_A;	// elements covered by whole unrolled passes
	register double *ai, *ci;
	register int k8 = 8;
	register int k40 = 40;

	start_trace("scale");

	// Start data stream 0 at the head of a.
	vec_dstt((__vector float *) &a[0],
		 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);

	// Bias both pointers backwards by 8 bytes (one double) so that the last
	// access of each pass can use the update forms (lfdu/stfdu) to advance
	// them by 64 bytes.
	ai = &a[-1];
	ci = &c[-1];

	for (i = 0; i < main_end; i += UNROLL_A) {
		register double t0, t1, t2, t3;

		asm {
				dcbz	k8, ci
				dcbz	k40, ci

				lfd		t0, 8(ai)
				lfd		t1, 16(ai)
				fmul	t0, t0, scalar
				lfd		t2, 24(ai)
				lfd		t3, 32(ai)
				fmul	t1, t1, scalar
				stfd	t0, 8(ci)
				fmul	t2, t2, scalar
				stfd	t1, 16(ci)
				fmul	t3, t3, scalar
				stfd	t2, 24(ci)
				stfd	t3, 32(ci)


				lfd		t0, 40(ai)
				lfd		t1, 48(ai)
				fmul	t0, t0, scalar
				lfd		t2, 56(ai)
				lfdu	t3, 64(ai)
				fmul	t1, t1, scalar
				stfd	t0, 40(ci)
				fmul	t2, t2, scalar
				stfd	t1, 48(ci)
				fmul	t3, t3, scalar
				stfd	t2, 56(ci)
				stfdu   t3, 64(ci)
		}

		// On the last pass of every 256-element group, restart stream 0
		// at a[i+128].
		if ((i & 255) == 256-UNROLL_A)
			vec_dstt((__vector float *) &a[i+128],
				 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);
	}

	// Scalar tail for the N % UNROLL_A elements the unrolled loop skipped.
	for (; i < N; i++)
		c[i] = a[i] * scalar;

	vec_dssall();

	stop_trace();
}

// Add kernel: c[i] = a[i] + b[i] for 0 <= i < N.
//
// scalar is not used by this kernel.
//
// The main loop is hand-unrolled and handles UNROLL_A doubles (64 bytes) per
// pass; the displacements in the asm are hard-coded for UNROLL_A == 8. Any
// leftover N % UNROLL_A elements are processed by a plain C loop afterwards,
// so the asm never reads or writes past the end of the arrays.
//
// NOTE(review): the two dcbz instructions address byte offsets 0 and 32 of
// each 64-byte output chunk. That presumably relies on 32-byte cache blocks
// and on c being 32-byte aligned -- confirm for the target CPU.
void do_add(double *a, double *b, double *c, double scalar, int N)
{
	int i;
	int main_end = N - N % UNROLL_A;	// elements covered by whole unrolled passes
	register double *ai, *bi, *ci;
	register int k8 = 8;
	register int k40 = 40;

	start_trace("add");

	// Start data streams 0 and 1 at the heads of a and b respectively.
	vec_dstt((__vector float *) &a[0],
		 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);
	vec_dstt((__vector float *) &b[0],
		 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 1);

	// Bias all three pointers backwards by 8 bytes (one double) so that the
	// last access of each pass can use the update forms (lfdu/stfdu) to
	// advance them by 64 bytes.
	ai = &a[-1];
	bi = &b[-1];
	ci = &c[-1];

	for (i = 0; i < main_end; i += UNROLL_A) {
		register double t0, t1, t2, t3, u0, u1, u2, u3;

		asm {
				dcbz	k8, ci
				dcbz	k40, ci

				lfd		t0, 8(ai)
				lfd		t1, 16(ai)
				lfd		u0, 8(bi)
				lfd		u1, 16(bi)
				fadd	t0, t0, u0
				lfd		t2, 24(ai)
				lfd		t3, 32(ai)
				fadd	t1, t1, u1
				lfd		u2, 24(bi)
				lfd		u3, 32(bi)
				stfd	t0, 8(ci)
				fadd	t2, t2, u2
				stfd	t1, 16(ci)
				fadd	t3, t3, u3
				stfd	t2, 24(ci)
				stfd	t3, 32(ci)

				lfd		t0, 40(ai)
				lfd		t1, 48(ai)
				lfd		u0, 40(bi)
				lfd		u1, 48(bi)
				fadd	t0, t0, u0
				lfd		t2, 56(ai)
				lfdu	t3, 64(ai)
				fadd	t1, t1, u1
				lfd		u2, 56(bi)
				lfdu	u3, 64(bi)
				stfd	t0, 40(ci)
				fadd	t2, t2, u2
				stfd	t1, 48(ci)
				fadd	t3, t3, u3
				stfd	t2, 56(ci)
				stfdu   t3, 64(ci)
		}

		// On the last pass of every 256-element group, restart both
		// streams at element i+128 of their arrays.
		if ((i & 255) == 256-UNROLL_A) {
			vec_dstt((__vector float *) &a[i+128],
				 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);
			vec_dstt((__vector float *) &b[i+128],
				 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 1);
		}
	}

	// Scalar tail for the N % UNROLL_A elements the unrolled loop skipped.
	for (; i < N; i++)
		c[i] = a[i] + b[i];

	vec_dssall();

	stop_trace();
}

// Triad kernel: c[i] = scalar * a[i] + b[i] for 0 <= i < N.
//
// The main loop is hand-unrolled and handles UNROLL_A doubles (64 bytes) per
// pass; the displacements in the asm are hard-coded for UNROLL_A == 8. Any
// leftover N % UNROLL_A elements are processed by a plain C loop afterwards,
// so the asm never reads or writes past the end of the arrays. The asm uses
// fmadd; whether the C tail expression is fused the same way is up to the
// compiler, so tail elements may differ in the last bit of rounding.
//
// NOTE(review): the two dcbz instructions address byte offsets 0 and 32 of
// each 64-byte output chunk. That presumably relies on 32-byte cache blocks
// and on c being 32-byte aligned -- confirm for the target CPU.
void do_triad(double *a, double *b, double *c, register double scalar, int N)
{
	int i;
	int main_end = N - N % UNROLL_A;	// elements covered by whole unrolled passes
	register double *ai, *bi, *ci;
	register int k8 = 8;
	register int k40 = 40;

	start_trace("triad");

	// Start data streams 0 and 1 at the heads of a and b respectively.
	vec_dstt((__vector float *) &a[0],
		 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);
	vec_dstt((__vector float *) &b[0],
		 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 1);

	// Bias all three pointers backwards by 8 bytes (one double) so that the
	// last access of each pass can use the update forms (lfdu/stfdu) to
	// advance them by 64 bytes.
	ai = &a[-1];
	bi = &b[-1];
	ci = &c[-1];

	for (i = 0; i < main_end; i += UNROLL_A) {
		register double t0, t1, t2, t3, u0, u1, u2, u3;

		asm {
				dcbz	k8, ci
				dcbz	k40, ci

				lfd		t0, 8(ai)
				lfd		t1, 16(ai)
				lfd		u0, 8(bi)
				lfd		u1, 16(bi)
				fmadd	t0, scalar, t0, u0
				lfd		t2, 24(ai)
				lfd		t3, 32(ai)
				fmadd	t1, scalar, t1, u1
				lfd		u2, 24(bi)
				lfd		u3, 32(bi)
				stfd	t0, 8(ci)
				fmadd	t2, scalar, t2, u2
				stfd	t1, 16(ci)
				fmadd	t3, scalar, t3, u3
				stfd	t2, 24(ci)
				stfd	t3, 32(ci)

				lfd		t0, 40(ai)
				lfd		t1, 48(ai)
				lfd		u0, 40(bi)
				lfd		u1, 48(bi)
				fmadd	t0, scalar, t0, u0
				lfd		t2, 56(ai)
				lfdu	t3, 64(ai)
				fmadd	t1, scalar, t1, u1
				lfd		u2, 56(bi)
				lfdu	u3, 64(bi)
				stfd	t0, 40(ci)
				fmadd	t2, scalar, t2, u2
				stfd	t1, 48(ci)
				fmadd	t3, scalar, t3, u3
				stfd	t2, 56(ci)
				stfdu   t3, 64(ci)
		}

		// On the last pass of every 256-element group, restart both
		// streams at element i+128 of their arrays.
		if ((i & 255) == 256-UNROLL_A) {
			vec_dstt((__vector float *) &a[i+128],
				 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 0);
			vec_dstt((__vector float *) &b[i+128],
				 (2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE), 1);
		}
	}

	// Scalar tail for the N % UNROLL_A elements the unrolled loop skipped.
	for (; i < N; i++)
		c[i] = scalar * a[i] + b[i];

	vec_dssall();

	stop_trace();
}

