Copyright 2000, 2001 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.






The IA-64 ISA is strange and the Itanium pipeline is bizarre!  How
come everybody else is moving from static to dynamic OOO pipelines,
while Intel moves in the opposite direction?  I think compiler writers
failed badly to generate reasonable code for the static RISC pipelines
of the late 1990s, and they surely won't do any better with the IA-64
ISA and its Itanium implementation!

The IA-64 ISA packs instructions three at a time into 128-bit bundles.
Programmers/compilers need to put explicit breaks `;;' between
instructions that have WAW or RAW dependencies.  Such breaks can be
anywhere in a bundle, or between bundles.

The Itanium implementation can under ideal conditions execute two
bundles per cycle.  But to reach that rate for integer code, one needs
to sprinkle the code with `nop.f' instructions, one per bundle.  With
good scheduling, one can thus reach a peak execution rate of 4 useful
instructions/cycle (2 bundles of 3 slots each, less one `nop.f' per
bundle).

Taken cloop branches seem to insert a bubble into the pipeline most of
the time.

Loads to the fp registers bypass the L1 cache and thus get extremely
long latencies, 9 cycles.  It is faster to load data via the integer
registers and then use setf.sig to move it to an fp register.


================================================================
mpn_addmul_1:

.Loop:
  { .mfi	getf.sig	r16 = f48
		xma.l		f48 = f32, f97, f40
	   (p6)	cmp.leu
} { .mfi	getf.sig	r24 = f56
		xma.hu		f56 = f32, f97, f40
	   (p7)	cmp.ltu
		;;
} { .mib	ldfp8		f32, f33 = [r33], 8
	   (p6)	add 1
		nop.b
} { .mib	stf8
	   (p7)	add
		nop.b
		;;
} { .mfi	getf.sig	r17 = f49
		xma.l		f49 = f33, f97, f41
	   (p6)	cmp.leu
} { .mfi	getf.sig	r25 = f57
		xma.hu		f57 = f33, f97, f41
	   (p7)	cmp.ltu
		;;
} { .mib	ldfp8		f40, f41 = [r32], 8
	   (p6)	add 1
		nop.b
} { .mib	stf8
	   (p7)	add
		br.cloop	.Loop
		;;
} 

2 limbs/20 instructions
	   20 insn/max 6 insn/cycle:		3.3 cycles/2limb
	   8 memops/max 2 memops/cycle:		4.0 cycles/2limb
	   8 intops/max 2 intops/cycle:		4.0 cycles/2limb
	   4 fpops/max 2 fpops/cycle:		2.0 cycles/2limb

================================================================
mpn_submul_1:

Like mpn_addmul_1, but with s2limb complemented.  When s2limb is
complemented, the low product limb becomes the complement of the true
product.  This should allow us to use the accumulation of xma.

================================================================
mpn_mul_1:

  { .mfi	getf.sig	r14 = f34
		xma.l		f34 = f32, f97, f0
	   (p6) cmp.leu
} { .mfi	getf.sig	r15 = f35
		xma.hu		f35 = f32, f97, f0
	   (p7) cmp.ltu
		;;
} { .mfi	ldf8		f32 = [r33], 8
		nop.f
	   (p6) add 1
} { .mfi	stf8
		nop.f
	   (p7) add
		;;
}

1 limb/10 instructions
	   10 insn/max 6 insn/cycle:		1.67 cycles/limb
	   4 memops/max 2 memops/cycle:		2.0 cycles/limb
	   4 intops/max 2 intops/cycle:		2.0 cycles/limb
	   2 fpops/max 2 fpops/cycle:		1.0 cycles/limb

================================================================
mpn_mul_8

/* First load the 8 values from v */
	ldfp8		v0, v1 = [r35], 16;;
	ldfp8		v2, v3 = [r35], 16;;
	ldfp8		v4, v5 = [r35], 16;;
	ldfp8		v6, v7 = [r35], 16;;

/* In the inner loop, get a new U limb and store a result limb. */
	mov		lc = un
Loop:	ldf8		u0 = [r33], 8
	xma.l		lp0 = v0, u0, hp0
	xma.hu		hp0 = v0, u0, hp0
	xma.l		lp1 = v1, u0, hp1
	xma.hu		hp1 = v1, u0, hp1
	xma.l		lp2 = v2, u0, hp2
	xma.hu		hp2 = v2, u0, hp2
	xma.l		lp3 = v3, u0, hp3
	xma.hu		hp3 = v3, u0, hp3
	xma.l		lp4 = v4, u0, hp4
	xma.hu		hp4 = v4, u0, hp4
	xma.l		lp5 = v5, u0, hp5
	xma.hu		hp5 = v5, u0, hp5
	xma.l		lp6 = v6, u0, hp6
	xma.hu		hp6 = v6, u0, hp6
	xma.l		lp7 = v7, u0, hp7
	xma.hu		hp7 = v7, u0, hp7
	getf.sig	l0 = lp0
	getf.sig	l1 = lp1
	getf.sig	l2 = lp2
	getf.sig	l3 = lp3
	getf.sig	l4 = lp4
	getf.sig	l5 = lp5
	getf.sig	l6 = lp6
	getf.sig	l7 = lp7
	add+add+cmp+cmp	l0, l0, h0
	add+add+cmp+cmp	l1, l1, h1
	add+add+cmp+cmp	l2, l2, h2
	add+add+cmp+cmp	l3, l3, h3
	add+add+cmp+cmp	l4, l4, h4
	add+add+cmp+cmp	l5, l5, h5
	add+add+cmp+cmp	l6, l6, h6
	add+add+cmp+cmp	l7, l7, h7
	st8		[r32] = xx, 8
	br.cloop Loop
================================================================

mpn_lshift
mpn_rshift
	shrp	r1=r2,r3,imm			// shrp takes only an immediate count,
						// so we will need 63 loops, one per
						// shift count.  Cute.
