mirror of
				https://github.com/godotengine/godot.git
				synced 2025-10-31 13:41:03 +00:00 
			
		
		
		
	
		
			
	
	
		
			2202 lines
		
	
	
	
		
			37 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
		
		
			
		
	
	
			2202 lines
		
	
	
	
		
			37 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
|   | .rdata | ||
|  | .asciiz	"mips3.s, Version 1.1" | ||
|  | .asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
 | ||
|  | 
 | ||
|  | /* | ||
|  |  * ==================================================================== | ||
|  |  * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 | ||
|  |  * project. | ||
|  |  * | ||
|  |  * Rights for redistribution and usage in source and binary forms are | ||
|  |  * granted according to the OpenSSL license. Warranty of any kind is | ||
|  |  * disclaimed. | ||
|  |  * ==================================================================== | ||
|  |  */ | ||
|  | 
 | ||
|  | /* | ||
|  |  * This is my modest contribution to the OpenSSL project (see | ||
|  |  * http://www.openssl.org/ for more information about it) and is | ||
|  |  * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c | ||
|  |  * module. For updates see http://fy.chalmers.se/~appro/hpe/. | ||
|  |  * | ||
|  |  * The module is designed to work with either of the "new" MIPS ABI(5), | ||
|  |  * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under | ||
|  |  * IRIX 5.x not only because it doesn't support new ABIs but also | ||
|  |  * because 5.x kernels put R4x00 CPU into 32-bit mode and all those | ||
|  |  * 64-bit instructions (daddu, dmultu, etc.) found below gonna only | ||
|  |  * cause illegal instruction exception:-( | ||
|  |  * | ||
|  |  * In addition the code depends on preprocessor flags set up by MIPSpro | ||
|  |  * compiler driver (either as or cc) and therefore (probably?) can't be | ||
|  |  * compiled by the GNU assembler. GNU C driver manages fine though... | ||
|  |  * I mean as long as -mmips-as is specified or is the default option, | ||
|  |  * because then it simply invokes /usr/bin/as which in turn takes | ||
|  |  * perfect care of the preprocessor definitions. Another neat feature | ||
|  |  * offered by the MIPSpro assembler is an optimization pass. This gave | ||
|  |  * me the opportunity to have the code looking more regular as all those | ||
|  |  * architecture dependent instruction rescheduling details were left to | ||
|  |  * the assembler. Cool, huh? | ||
|  |  * | ||
|  |  * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' | ||
|  |  * goes way over 3 times faster! | ||
|  |  * | ||
|  |  *					<appro@fy.chalmers.se>
 | ||
|  |  */ | ||
|  | #include <asm.h> | ||
|  | #include <regdef.h> | ||
|  | 
 | ||
/*
 * MOVNZ(cond,dst,src): copy src into dst when cond is non-zero and
 * leave dst untouched otherwise.
 *
 * With _MIPS_ISA >= 4 this is the native movn instruction. Otherwise
 * it is emulated with a branch-likely that targets the instruction
 * right after its own delay slot: the move placed in the delay slot is
 * executed only when the branch is taken (cond != 0) and is annulled
 * when it is not.
 */
|  | #if _MIPS_ISA>=4 | ||
|  | #define	MOVNZ(cond,dst,src)	\ | ||
|  | 	movn	dst,src,cond | ||
|  | #else | ||
|  | #define	MOVNZ(cond,dst,src)	\ | ||
|  | 	.set	noreorder;	\
 | ||
|  | 	bnezl	cond,.+8;	\
 | ||
|  | 	move	dst,src;	\
 | ||
|  | 	.set	reorder
 | ||
|  | #endif | ||
|  | 
 | ||
|  | .text | ||
|  | 
 | ||
/* AT is used explicitly as a scratch register by the routines below,
 * so the assembler is told not to use it on its own. */
|  | .set	noat
 | ||
/* Default mode: the assembler schedules instructions and fills branch
 * delay slots. Routines switch to noreorder only around delay slots
 * that are placed by hand. */
|  | .set	reorder
 | ||
|  | 
 | ||
/* MINUS4 is loaded with -4 by each routine that uses it and serves as
 * a mask that rounds a word count down to a multiple of four. */
|  | #define	MINUS4	v1 | ||
|  | 
 | ||
|  | .align	5
 | ||
/*
 * bn_mul_add_words: multiply the 64-bit words at a1 by the scalar in
 * a3 and add the products, with carry propagation, into the words at
 * a0.
 *
 * In:   a0 = accumulator words (read and written in place)
 *       a1 = source words
 *       a2 = word count; a non-positive count returns 0 at once
 *       a3 = 64-bit multiplier
 * Out:  v0 = carry word left over after the last element
 * Uses: t0-t3, ta0-ta3, v1 (as MINUS4), AT, HI/LO; a0-a2 are modified.
 *
 * Four words are processed per loop iteration; up to three leftover
 * words are handled by the unrolled tail.
 */
|  | LEAF(bn_mul_add_words) | ||
|  | 	.set	noreorder
 | ||
/* The load in the delay slot is performed only when the branch-likely
 * is taken, so nothing is read from a1 when the count is <= 0. */
|  | 	bgtzl	a2,.L_bn_mul_add_words_proceed | ||
|  | 	ld	t0,0(a1) | ||
|  | 	jr	ra | ||
|  | 	move	v0,zero | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
|  | .L_bn_mul_add_words_proceed: | ||
/* ta0 = count with the two low bits cleared; zero means fewer than
 * four words, so only the tail runs. v0 is the running carry. */
|  | 	li	MINUS4,-4 | ||
|  | 	and	ta0,a2,MINUS4 | ||
|  | 	move	v0,zero | ||
|  | 	beqz	ta0,.L_bn_mul_add_words_tail | ||
|  | 
 | ||
|  | .L_bn_mul_add_words_loop: | ||
/* Word 0: HI:LO = t0*a3. t1 = old word + carry-in + LO; the new carry
 * v0 = HI plus the carry-outs of the two additions. The same pattern
 * is repeated for the other three words. */
|  | 	dmultu	t0,a3 | ||
|  | 	ld	t1,0(a0) | ||
|  | 	ld	t2,8(a1) | ||
|  | 	ld	t3,8(a0) | ||
|  | 	ld	ta0,16(a1) | ||
|  | 	ld	ta1,16(a0) | ||
|  | 	daddu	t1,v0 | ||
|  | 	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit | ||
|  | 				 * values", but it seems to work fine | ||
|  | 				 * even on 64-bit registers. */ | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	t1,AT | ||
|  | 	daddu	v0,t0 | ||
|  | 	sltu	AT,t1,AT | ||
|  | 	sd	t1,0(a0) | ||
|  | 	daddu	v0,AT | ||
|  | 
 | ||
/* Word 1. */
|  | 	dmultu	t2,a3 | ||
|  | 	ld	ta2,24(a1) | ||
|  | 	ld	ta3,24(a0) | ||
|  | 	daddu	t3,v0 | ||
|  | 	sltu	v0,t3,v0 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t2 | ||
|  | 	daddu	t3,AT | ||
|  | 	daddu	v0,t2 | ||
|  | 	sltu	AT,t3,AT | ||
|  | 	sd	t3,8(a0) | ||
|  | 	daddu	v0,AT | ||
|  | 
 | ||
/* Word 2. The count and both pointers are advanced here, which is why
 * the remaining stores use negative offsets from a0. */
|  | 	dmultu	ta0,a3 | ||
|  | 	subu	a2,4 | ||
|  | 	PTR_ADD	a0,32 | ||
|  | 	PTR_ADD	a1,32 | ||
|  | 	daddu	ta1,v0 | ||
|  | 	sltu	v0,ta1,v0 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	ta0 | ||
|  | 	daddu	ta1,AT | ||
|  | 	daddu	v0,ta0 | ||
|  | 	sltu	AT,ta1,AT | ||
|  | 	sd	ta1,-16(a0) | ||
|  | 	daddu	v0,AT | ||
|  | 
 | ||
|  | 
 | ||
/* Word 3. ta0 is recomputed as the remaining count rounded down to a
 * multiple of four to decide whether another full iteration follows. */
|  | 	dmultu	ta2,a3 | ||
|  | 	and	ta0,a2,MINUS4 | ||
|  | 	daddu	ta3,v0 | ||
|  | 	sltu	v0,ta3,v0 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	ta2 | ||
|  | 	daddu	ta3,AT | ||
|  | 	daddu	v0,ta2 | ||
|  | 	sltu	AT,ta3,AT | ||
|  | 	sd	ta3,-8(a0) | ||
|  | 	daddu	v0,AT | ||
|  | 	.set	noreorder
 | ||
/* Both branch-likely delay slots preload the next source word into
 * t0, and only when the respective branch is taken. */
|  | 	bgtzl	ta0,.L_bn_mul_add_words_loop | ||
|  | 	ld	t0,0(a1) | ||
|  | 
 | ||
|  | 	bnezl	a2,.L_bn_mul_add_words_tail | ||
|  | 	ld	t0,0(a1) | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
/* v0 already holds the final carry at this point. */
|  | .L_bn_mul_add_words_return: | ||
|  | 	jr	ra | ||
|  | 
 | ||
/* Tail: one to three remaining words. t0 was preloaded with the first
 * of them; the pointers are not advanced, fixed offsets are used. */
|  | .L_bn_mul_add_words_tail: | ||
|  | 	dmultu	t0,a3 | ||
|  | 	ld	t1,0(a0) | ||
|  | 	subu	a2,1 | ||
|  | 	daddu	t1,v0 | ||
|  | 	sltu	v0,t1,v0 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	t1,AT | ||
|  | 	daddu	v0,t0 | ||
|  | 	sltu	AT,t1,AT | ||
|  | 	sd	t1,0(a0) | ||
|  | 	daddu	v0,AT | ||
|  | 	beqz	a2,.L_bn_mul_add_words_return | ||
|  | 
 | ||
|  | 	ld	t0,8(a1) | ||
|  | 	dmultu	t0,a3 | ||
|  | 	ld	t1,8(a0) | ||
|  | 	subu	a2,1 | ||
|  | 	daddu	t1,v0 | ||
|  | 	sltu	v0,t1,v0 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	t1,AT | ||
|  | 	daddu	v0,t0 | ||
|  | 	sltu	AT,t1,AT | ||
|  | 	sd	t1,8(a0) | ||
|  | 	daddu	v0,AT | ||
|  | 	beqz	a2,.L_bn_mul_add_words_return | ||
|  | 
 | ||
/* Third and last possible tail word; no count check is needed. */
|  | 	ld	t0,16(a1) | ||
|  | 	dmultu	t0,a3 | ||
|  | 	ld	t1,16(a0) | ||
|  | 	daddu	t1,v0 | ||
|  | 	sltu	v0,t1,v0 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	t1,AT | ||
|  | 	daddu	v0,t0 | ||
|  | 	sltu	AT,t1,AT | ||
|  | 	sd	t1,16(a0) | ||
|  | 	daddu	v0,AT | ||
|  | 	jr	ra | ||
|  | END(bn_mul_add_words) | ||
|  | 
 | ||
|  | .align	5
 | ||
/*
 * bn_mul_words: multiply the 64-bit words at a1 by the scalar in a3
 * and store the low product words, with carry propagation, at a0.
 *
 * In:   a0 = destination words (written only)
 *       a1 = source words
 *       a2 = word count; a non-positive count returns 0 at once
 *       a3 = 64-bit multiplier
 * Out:  v0 = carry word left over after the last element
 * Uses: t0-t3, ta0-ta3, v1 (as MINUS4), AT, HI/LO; a0-a2 are modified.
 *
 * Four words are processed per loop iteration; up to three leftover
 * words are handled by the unrolled tail.
 */
|  | LEAF(bn_mul_words) | ||
|  | 	.set	noreorder
 | ||
/* The load in the delay slot is performed only when the branch-likely
 * is taken, so nothing is read from a1 when the count is <= 0. */
|  | 	bgtzl	a2,.L_bn_mul_words_proceed | ||
|  | 	ld	t0,0(a1) | ||
|  | 	jr	ra | ||
|  | 	move	v0,zero | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
|  | .L_bn_mul_words_proceed: | ||
/* ta0 = count with the two low bits cleared; zero means fewer than
 * four words, so only the tail runs. v0 is the running carry. */
|  | 	li	MINUS4,-4 | ||
|  | 	and	ta0,a2,MINUS4 | ||
|  | 	move	v0,zero | ||
|  | 	beqz	ta0,.L_bn_mul_words_tail | ||
|  | 
 | ||
|  | .L_bn_mul_words_loop: | ||
/* Word 0: HI:LO = t0*a3. The stored word is carry-in + LO; the new
 * carry v0 = HI + carry-out of that addition. The same pattern is
 * repeated for the other three words. */
|  | 	dmultu	t0,a3 | ||
|  | 	ld	t2,8(a1) | ||
|  | 	ld	ta0,16(a1) | ||
|  | 	ld	ta2,24(a1) | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	v0,AT | ||
|  | 	sltu	t1,v0,AT | ||
|  | 	sd	v0,0(a0) | ||
|  | 	daddu	v0,t1,t0 | ||
|  | 
 | ||
/* Word 1. The count and both pointers are advanced here, which is why
 * the remaining stores use negative offsets from a0. */
|  | 	dmultu	t2,a3 | ||
|  | 	subu	a2,4 | ||
|  | 	PTR_ADD	a0,32 | ||
|  | 	PTR_ADD	a1,32 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t2 | ||
|  | 	daddu	v0,AT | ||
|  | 	sltu	t3,v0,AT | ||
|  | 	sd	v0,-24(a0) | ||
|  | 	daddu	v0,t3,t2 | ||
|  | 
 | ||
/* Word 2. */
|  | 	dmultu	ta0,a3 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	ta0 | ||
|  | 	daddu	v0,AT | ||
|  | 	sltu	ta1,v0,AT | ||
|  | 	sd	v0,-16(a0) | ||
|  | 	daddu	v0,ta1,ta0 | ||
|  | 
 | ||
|  | 
 | ||
/* Word 3. ta0 is recomputed as the remaining count rounded down to a
 * multiple of four to decide whether another full iteration follows. */
|  | 	dmultu	ta2,a3 | ||
|  | 	and	ta0,a2,MINUS4 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	ta2 | ||
|  | 	daddu	v0,AT | ||
|  | 	sltu	ta3,v0,AT | ||
|  | 	sd	v0,-8(a0) | ||
|  | 	daddu	v0,ta3,ta2 | ||
|  | 	.set	noreorder
 | ||
/* Both branch-likely delay slots preload the next source word into
 * t0, and only when the respective branch is taken. */
|  | 	bgtzl	ta0,.L_bn_mul_words_loop | ||
|  | 	ld	t0,0(a1) | ||
|  | 
 | ||
|  | 	bnezl	a2,.L_bn_mul_words_tail | ||
|  | 	ld	t0,0(a1) | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
/* v0 already holds the final carry at this point. */
|  | .L_bn_mul_words_return: | ||
|  | 	jr	ra | ||
|  | 
 | ||
/* Tail: one to three remaining words. t0 was preloaded with the first
 * of them; the pointers are not advanced, fixed offsets are used. */
|  | .L_bn_mul_words_tail: | ||
|  | 	dmultu	t0,a3 | ||
|  | 	subu	a2,1 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	v0,AT | ||
|  | 	sltu	t1,v0,AT | ||
|  | 	sd	v0,0(a0) | ||
|  | 	daddu	v0,t1,t0 | ||
|  | 	beqz	a2,.L_bn_mul_words_return | ||
|  | 
 | ||
|  | 	ld	t0,8(a1) | ||
|  | 	dmultu	t0,a3 | ||
|  | 	subu	a2,1 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	v0,AT | ||
|  | 	sltu	t1,v0,AT | ||
|  | 	sd	v0,8(a0) | ||
|  | 	daddu	v0,t1,t0 | ||
|  | 	beqz	a2,.L_bn_mul_words_return | ||
|  | 
 | ||
/* Third and last possible tail word; no count check is needed. */
|  | 	ld	t0,16(a1) | ||
|  | 	dmultu	t0,a3 | ||
|  | 	mflo	AT | ||
|  | 	mfhi	t0 | ||
|  | 	daddu	v0,AT | ||
|  | 	sltu	t1,v0,AT | ||
|  | 	sd	v0,16(a0) | ||
|  | 	daddu	v0,t1,t0 | ||
|  | 	jr	ra | ||
|  | END(bn_mul_words) | ||
|  | 
 | ||
|  | .align	5
 | ||
/*
 * bn_sqr_words: square each 64-bit word at a1 and store the 128-bit
 * result as two consecutive words (low, then high) at a0. Squares are
 * independent; nothing is carried between elements.
 *
 * In:   a0 = destination, two words written per input word
 *       a1 = source words
 *       a2 = word count; a non-positive count returns at once
 * Out:  v0 = 0
 * Uses: t0-t3, ta0-ta3, v1 (as MINUS4), HI/LO; a0-a2 are modified.
 *
 * Four words are processed per loop iteration; up to three leftover
 * words are handled by the unrolled tail.
 */
|  | LEAF(bn_sqr_words) | ||
|  | 	.set	noreorder
 | ||
/* The load in the delay slot is performed only when the branch-likely
 * is taken, so nothing is read from a1 when the count is <= 0. */
|  | 	bgtzl	a2,.L_bn_sqr_words_proceed | ||
|  | 	ld	t0,0(a1) | ||
|  | 	jr	ra | ||
|  | 	move	v0,zero | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
|  | .L_bn_sqr_words_proceed: | ||
/* ta0 = count with the two low bits cleared; zero means fewer than
 * four words. v0 is cleared here and never changed afterwards, so
 * every exit path returns 0. */
|  | 	li	MINUS4,-4 | ||
|  | 	and	ta0,a2,MINUS4 | ||
|  | 	move	v0,zero | ||
|  | 	beqz	ta0,.L_bn_sqr_words_tail | ||
|  | 
 | ||
|  | .L_bn_sqr_words_loop: | ||
/* Word 0: low half to r[0], high half to r[1]. */
|  | 	dmultu	t0,t0 | ||
|  | 	ld	t2,8(a1) | ||
|  | 	ld	ta0,16(a1) | ||
|  | 	ld	ta2,24(a1) | ||
|  | 	mflo	t1 | ||
|  | 	mfhi	t0 | ||
|  | 	sd	t1,0(a0) | ||
|  | 	sd	t0,8(a0) | ||
|  | 
 | ||
/* Word 1. The destination advances by 64 bytes and the source by 32,
 * so the remaining stores use negative offsets from a0. */
|  | 	dmultu	t2,t2 | ||
|  | 	subu	a2,4 | ||
|  | 	PTR_ADD	a0,64 | ||
|  | 	PTR_ADD	a1,32 | ||
|  | 	mflo	t3 | ||
|  | 	mfhi	t2 | ||
|  | 	sd	t3,-48(a0) | ||
|  | 	sd	t2,-40(a0) | ||
|  | 
 | ||
/* Word 2. */
|  | 	dmultu	ta0,ta0 | ||
|  | 	mflo	ta1 | ||
|  | 	mfhi	ta0 | ||
|  | 	sd	ta1,-32(a0) | ||
|  | 	sd	ta0,-24(a0) | ||
|  | 
 | ||
|  | 
 | ||
/* Word 3. ta0 is recomputed as the remaining count rounded down to a
 * multiple of four to decide whether another full iteration follows. */
|  | 	dmultu	ta2,ta2 | ||
|  | 	and	ta0,a2,MINUS4 | ||
|  | 	mflo	ta3 | ||
|  | 	mfhi	ta2 | ||
|  | 	sd	ta3,-16(a0) | ||
|  | 	sd	ta2,-8(a0) | ||
|  | 
 | ||
|  | 	.set	noreorder
 | ||
/* Both branch-likely delay slots preload the next source word into
 * t0, and only when the respective branch is taken. */
|  | 	bgtzl	ta0,.L_bn_sqr_words_loop | ||
|  | 	ld	t0,0(a1) | ||
|  | 
 | ||
|  | 	bnezl	a2,.L_bn_sqr_words_tail | ||
|  | 	ld	t0,0(a1) | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
|  | .L_bn_sqr_words_return: | ||
|  | 	move	v0,zero | ||
|  | 	jr	ra | ||
|  | 
 | ||
/* Tail: one to three remaining words. t0 was preloaded with the first
 * of them; the pointers are not advanced, fixed offsets are used. */
|  | .L_bn_sqr_words_tail: | ||
|  | 	dmultu	t0,t0 | ||
|  | 	subu	a2,1 | ||
|  | 	mflo	t1 | ||
|  | 	mfhi	t0 | ||
|  | 	sd	t1,0(a0) | ||
|  | 	sd	t0,8(a0) | ||
|  | 	beqz	a2,.L_bn_sqr_words_return | ||
|  | 
 | ||
|  | 	ld	t0,8(a1) | ||
|  | 	dmultu	t0,t0 | ||
|  | 	subu	a2,1 | ||
|  | 	mflo	t1 | ||
|  | 	mfhi	t0 | ||
|  | 	sd	t1,16(a0) | ||
|  | 	sd	t0,24(a0) | ||
|  | 	beqz	a2,.L_bn_sqr_words_return | ||
|  | 
 | ||
/* Third and last possible tail word. v0 is still zero from the
 * initialisation at .L_bn_sqr_words_proceed. */
|  | 	ld	t0,16(a1) | ||
|  | 	dmultu	t0,t0 | ||
|  | 	mflo	t1 | ||
|  | 	mfhi	t0 | ||
|  | 	sd	t1,32(a0) | ||
|  | 	sd	t0,40(a0) | ||
|  | 	jr	ra | ||
|  | END(bn_sqr_words) | ||
|  | 
 | ||
|  | .align	5
 | ||
/*
 * bn_add_words: word-wise addition with carry propagation; the words
 * at a0 receive the sum of the words at a1 and a2.
 *
 * In:   a0 = destination words
 *       a1 = first operand words
 *       a2 = second operand words
 *       a3 = word count; a non-positive count returns 0 at once
 * Out:  v0 = carry out of the last word
 * Uses: t0-t3, ta0-ta3, t8, t9, v1 (as MINUS4), AT; a0-a3 are modified.
 *
 * Four words are processed per loop iteration; up to three leftover
 * words are handled by the unrolled tail.
 */
|  | LEAF(bn_add_words) | ||
|  | 	.set	noreorder
 | ||
/* The load in the delay slot is performed only when the branch-likely
 * is taken, so nothing is read from a1 when the count is <= 0. */
|  | 	bgtzl	a3,.L_bn_add_words_proceed | ||
|  | 	ld	t0,0(a1) | ||
|  | 	jr	ra | ||
|  | 	move	v0,zero | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
|  | .L_bn_add_words_proceed: | ||
/* AT = count with the two low bits cleared; zero means fewer than
 * four words, so only the tail runs. v0 is the running carry. */
|  | 	li	MINUS4,-4 | ||
|  | 	and	AT,a3,MINUS4 | ||
|  | 	move	v0,zero | ||
|  | 	beqz	AT,.L_bn_add_words_tail | ||
|  | 
 | ||
|  | .L_bn_add_words_loop: | ||
/* Loads for all four words are issued up front, interleaved with the
 * count and pointer updates; loads and stores placed after a PTR_ADD
 * use negative offsets. AT is recomputed for the loop test below. */
|  | 	ld	ta0,0(a2) | ||
|  | 	subu	a3,4 | ||
|  | 	ld	t1,8(a1) | ||
|  | 	and	AT,a3,MINUS4 | ||
|  | 	ld	t2,16(a1) | ||
|  | 	PTR_ADD	a2,32 | ||
|  | 	ld	t3,24(a1) | ||
|  | 	PTR_ADD	a0,32 | ||
|  | 	ld	ta1,-24(a2) | ||
|  | 	PTR_ADD	a1,32 | ||
|  | 	ld	ta2,-16(a2) | ||
|  | 	ld	ta3,-8(a2) | ||
/* Word 0: t8 = carry out of a+b, v0 = carry out of adding the
 * carry-in; their sum is the carry into the next word. The same
 * pattern is repeated for the other three words. */
|  | 	daddu	ta0,t0 | ||
|  | 	sltu	t8,ta0,t0 | ||
|  | 	daddu	t0,ta0,v0 | ||
|  | 	sltu	v0,t0,ta0 | ||
|  | 	sd	t0,-32(a0) | ||
|  | 	daddu	v0,t8 | ||
|  | 
 | ||
/* Word 1. */
|  | 	daddu	ta1,t1 | ||
|  | 	sltu	t9,ta1,t1 | ||
|  | 	daddu	t1,ta1,v0 | ||
|  | 	sltu	v0,t1,ta1 | ||
|  | 	sd	t1,-24(a0) | ||
|  | 	daddu	v0,t9 | ||
|  | 
 | ||
/* Word 2. */
|  | 	daddu	ta2,t2 | ||
|  | 	sltu	t8,ta2,t2 | ||
|  | 	daddu	t2,ta2,v0 | ||
|  | 	sltu	v0,t2,ta2 | ||
|  | 	sd	t2,-16(a0) | ||
|  | 	daddu	v0,t8 | ||
|  | 	 | ||
/* Word 3. */
|  | 	daddu	ta3,t3 | ||
|  | 	sltu	t9,ta3,t3 | ||
|  | 	daddu	t3,ta3,v0 | ||
|  | 	sltu	v0,t3,ta3 | ||
|  | 	sd	t3,-8(a0) | ||
|  | 	daddu	v0,t9 | ||
|  | 	 | ||
|  | 	.set	noreorder
 | ||
/* Both branch-likely delay slots preload the next word from a1 into
 * t0, and only when the respective branch is taken. */
|  | 	bgtzl	AT,.L_bn_add_words_loop | ||
|  | 	ld	t0,0(a1) | ||
|  | 
 | ||
|  | 	bnezl	a3,.L_bn_add_words_tail | ||
|  | 	ld	t0,0(a1) | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
/* v0 already holds the final carry at this point. */
|  | .L_bn_add_words_return: | ||
|  | 	jr	ra | ||
|  | 
 | ||
/* Tail: one to three remaining words. t0 was preloaded with the first
 * word from a1; the pointers are not advanced, fixed offsets are
 * used. */
|  | .L_bn_add_words_tail: | ||
|  | 	ld	ta0,0(a2) | ||
|  | 	daddu	ta0,t0 | ||
|  | 	subu	a3,1 | ||
|  | 	sltu	t8,ta0,t0 | ||
|  | 	daddu	t0,ta0,v0 | ||
|  | 	sltu	v0,t0,ta0 | ||
|  | 	sd	t0,0(a0) | ||
|  | 	daddu	v0,t8 | ||
|  | 	beqz	a3,.L_bn_add_words_return | ||
|  | 
 | ||
|  | 	ld	t1,8(a1) | ||
|  | 	ld	ta1,8(a2) | ||
|  | 	daddu	ta1,t1 | ||
|  | 	subu	a3,1 | ||
|  | 	sltu	t9,ta1,t1 | ||
|  | 	daddu	t1,ta1,v0 | ||
|  | 	sltu	v0,t1,ta1 | ||
|  | 	sd	t1,8(a0) | ||
|  | 	daddu	v0,t9 | ||
|  | 	beqz	a3,.L_bn_add_words_return | ||
|  | 
 | ||
/* Third and last possible tail word; no count check is needed. */
|  | 	ld	t2,16(a1) | ||
|  | 	ld	ta2,16(a2) | ||
|  | 	daddu	ta2,t2 | ||
|  | 	sltu	t8,ta2,t2 | ||
|  | 	daddu	t2,ta2,v0 | ||
|  | 	sltu	v0,t2,ta2 | ||
|  | 	sd	t2,16(a0) | ||
|  | 	daddu	v0,t8 | ||
|  | 	jr	ra | ||
|  | END(bn_add_words) | ||
|  | 
 | ||
|  | .align	5
 | ||
/*
 * bn_sub_words: word-wise subtraction with borrow propagation; the
 * words at a0 receive (words at a1) - (words at a2).
 *
 * In:   a0 = destination words
 *       a1 = minuend words
 *       a2 = subtrahend words
 *       a3 = word count; a non-positive count returns 0 at once
 * Out:  v0 = borrow out of the last word
 * Uses: t0-t3, ta0-ta3, t8, t9, v1 (as MINUS4), AT; a0-a3 are modified.
 *
 * Borrow rule per word: t8/t9 = (a < b). When a - b is non-zero the
 * borrow out is exactly that comparison; when a == b the incoming
 * borrow passes through unchanged, which is what MOVNZ(diff,v0,t)
 * implements by overwriting v0 only for a non-zero difference.
 *
 * Four words are processed per loop iteration; up to three leftover
 * words are handled by the unrolled tail.
 */
|  | LEAF(bn_sub_words) | ||
|  | 	.set	noreorder
 | ||
/* The load in the delay slot is performed only when the branch-likely
 * is taken, so nothing is read from a1 when the count is <= 0. */
|  | 	bgtzl	a3,.L_bn_sub_words_proceed | ||
|  | 	ld	t0,0(a1) | ||
|  | 	jr	ra | ||
|  | 	move	v0,zero | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
|  | .L_bn_sub_words_proceed: | ||
/* AT = count with the two low bits cleared; zero means fewer than
 * four words, so only the tail runs. v0 is the running borrow. */
|  | 	li	MINUS4,-4 | ||
|  | 	and	AT,a3,MINUS4 | ||
|  | 	move	v0,zero | ||
|  | 	beqz	AT,.L_bn_sub_words_tail | ||
|  | 
 | ||
|  | .L_bn_sub_words_loop: | ||
/* Loads for all four words are issued up front, interleaved with the
 * count and pointer updates; loads and stores placed after a PTR_ADD
 * use negative offsets. AT is recomputed for the loop test below. */
|  | 	ld	ta0,0(a2) | ||
|  | 	subu	a3,4 | ||
|  | 	ld	t1,8(a1) | ||
|  | 	and	AT,a3,MINUS4 | ||
|  | 	ld	t2,16(a1) | ||
|  | 	PTR_ADD	a2,32 | ||
|  | 	ld	t3,24(a1) | ||
|  | 	PTR_ADD	a0,32 | ||
|  | 	ld	ta1,-24(a2) | ||
|  | 	PTR_ADD	a1,32 | ||
|  | 	ld	ta2,-16(a2) | ||
|  | 	ld	ta3,-8(a2) | ||
/* Word 0: the stored result is (a - b) - borrow-in; v0 is then
 * updated according to the borrow rule in the header. */
|  | 	sltu	t8,t0,ta0 | ||
|  | 	dsubu	t0,ta0 | ||
|  | 	dsubu	ta0,t0,v0 | ||
|  | 	sd	ta0,-32(a0) | ||
|  | 	MOVNZ	(t0,v0,t8) | ||
|  | 
 | ||
/* Word 1. */
|  | 	sltu	t9,t1,ta1 | ||
|  | 	dsubu	t1,ta1 | ||
|  | 	dsubu	ta1,t1,v0 | ||
|  | 	sd	ta1,-24(a0) | ||
|  | 	MOVNZ	(t1,v0,t9) | ||
|  | 
 | ||
|  | 
 | ||
/* Word 2. */
|  | 	sltu	t8,t2,ta2 | ||
|  | 	dsubu	t2,ta2 | ||
|  | 	dsubu	ta2,t2,v0 | ||
|  | 	sd	ta2,-16(a0) | ||
|  | 	MOVNZ	(t2,v0,t8) | ||
|  | 
 | ||
/* Word 3. */
|  | 	sltu	t9,t3,ta3 | ||
|  | 	dsubu	t3,ta3 | ||
|  | 	dsubu	ta3,t3,v0 | ||
|  | 	sd	ta3,-8(a0) | ||
|  | 	MOVNZ	(t3,v0,t9) | ||
|  | 
 | ||
|  | 	.set	noreorder
 | ||
/* Both branch-likely delay slots preload the next word from a1 into
 * t0, and only when the respective branch is taken. */
|  | 	bgtzl	AT,.L_bn_sub_words_loop | ||
|  | 	ld	t0,0(a1) | ||
|  | 
 | ||
|  | 	bnezl	a3,.L_bn_sub_words_tail | ||
|  | 	ld	t0,0(a1) | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
/* v0 already holds the final borrow at this point. */
|  | .L_bn_sub_words_return: | ||
|  | 	jr	ra | ||
|  | 
 | ||
/* Tail: one to three remaining words. t0 was preloaded with the first
 * word from a1; the pointers are not advanced, fixed offsets are
 * used. */
|  | .L_bn_sub_words_tail: | ||
|  | 	ld	ta0,0(a2) | ||
|  | 	subu	a3,1 | ||
|  | 	sltu	t8,t0,ta0 | ||
|  | 	dsubu	t0,ta0 | ||
|  | 	dsubu	ta0,t0,v0 | ||
|  | 	MOVNZ	(t0,v0,t8) | ||
|  | 	sd	ta0,0(a0) | ||
|  | 	beqz	a3,.L_bn_sub_words_return | ||
|  | 
 | ||
|  | 	ld	t1,8(a1) | ||
|  | 	subu	a3,1 | ||
|  | 	ld	ta1,8(a2) | ||
|  | 	sltu	t9,t1,ta1 | ||
|  | 	dsubu	t1,ta1 | ||
|  | 	dsubu	ta1,t1,v0 | ||
|  | 	MOVNZ	(t1,v0,t9) | ||
|  | 	sd	ta1,8(a0) | ||
|  | 	beqz	a3,.L_bn_sub_words_return | ||
|  | 
 | ||
/* Third and last possible tail word; no count check is needed. */
|  | 	ld	t2,16(a1) | ||
|  | 	ld	ta2,16(a2) | ||
|  | 	sltu	t8,t2,ta2 | ||
|  | 	dsubu	t2,ta2 | ||
|  | 	dsubu	ta2,t2,v0 | ||
|  | 	MOVNZ	(t2,v0,t8) | ||
|  | 	sd	ta2,16(a0) | ||
|  | 	jr	ra | ||
|  | END(bn_sub_words) | ||
|  | 
 | ||
|  | #undef	MINUS4 | ||
|  | 
 | ||
|  | .align 5
 | ||
/*
 * bn_div_3_words: quotient-word estimate built from three dividend
 * words and two divisor words.
 *
 * In:   a0 = pointer m; the words m[0], m[-1] and m[-2] are read
 *       a1 = divisor word multiplied by the estimate (kept in ta2)
 *       a2 = divisor word handed to bn_div_words
 * Out:  v0 = all ones when m[0] == a2; otherwise
 *            q = (m[0]:m[-1]) / a2 as returned by bn_div_words,
 *            decremented while q*a1 exceeds (remainder:m[-2]), with
 *            the remainder growing by a2 on every decrement
 * Uses: everything bn_div_words uses, plus a3, ta2, ta3.
 *
 * The routine has no stack frame: the pointer, a1 and the return
 * address are parked in a3, ta2 and ta3 across the call, relying on
 * bn_div_words leaving those registers alone, preserving a2 and
 * returning the remainder in v1.
 */
|  | LEAF(bn_div_3_words) | ||
|  | 	.set	reorder
 | ||
|  | 	move	a3,a0		/* we know that bn_div_words doesn't | ||
|  | 				 * touch a3, ta2, ta3 and preserves a2 | ||
|  | 				 * so that we can save two arguments | ||
|  | 				 * and return address in registers | ||
|  | 				 * instead of stack:-) | ||
|  | 				 */ | ||
/* Set up the bn_div_words arguments: a0 = m[0], a1 = m[-1]; a2 is
 * passed through. When m[0] == a2 the result is all ones and no
 * division is performed. */
|  | 	ld	a0,(a3) | ||
|  | 	move	ta2,a1 | ||
|  | 	ld	a1,-8(a3) | ||
|  | 	bne	a0,a2,.L_bn_div_3_words_proceed | ||
|  | 	li	v0,-1 | ||
|  | 	jr	ra | ||
|  | .L_bn_div_3_words_proceed: | ||
|  | 	move	ta3,ra | ||
|  | 	bal	bn_div_words | ||
|  | 	move	ra,ta3 | ||
/* t1:t0 = q * ta2, t2 = m[-2], v1 = remainder from bn_div_words.
 * t8 set means the high product word is already below the remainder,
 * so q needs no correction. */
|  | 	dmultu	ta2,v0 | ||
|  | 	ld	t2,-16(a3) | ||
|  | 	move	ta0,zero | ||
|  | 	mfhi	t1 | ||
|  | 	mflo	t0 | ||
|  | 	sltu	t8,t1,v1 | ||
|  | .L_bn_div_3_words_inner_loop: | ||
/* AT = (t1 == v1 && t2 >= t0), i.e. the product does not exceed
 * (remainder:m[-2]) and q is final. The values for the next round are
 * prepared regardless: remainder += a2, product -= ta2, and t8 also
 * becomes set when the remainder addition wraps around. */
|  | 	bnez	t8,.L_bn_div_3_words_inner_loop_done | ||
|  | 	sgeu	AT,t2,t0 | ||
|  | 	seq	t9,t1,v1 | ||
|  | 	and	AT,t9 | ||
|  | 	sltu	t3,t0,ta2 | ||
|  | 	daddu	v1,a2 | ||
|  | 	dsubu	t1,t3 | ||
|  | 	dsubu	t0,ta2 | ||
|  | 	sltu	t8,t1,v1 | ||
|  | 	sltu	ta0,v1,a2 | ||
|  | 	or	t8,ta0 | ||
|  | 	.set	noreorder
 | ||
/* The decrement of q sits in the branch-likely delay slot, so it is
 * executed only when the loop is actually repeated. */
|  | 	beqzl	AT,.L_bn_div_3_words_inner_loop | ||
|  | 	dsubu	v0,1 | ||
|  | 	.set	reorder
 | ||
|  | .L_bn_div_3_words_inner_loop_done: | ||
|  | 	jr	ra | ||
|  | END(bn_div_3_words) | ||
|  | 
 | ||
|  | .align	5
 | ||
/*
 * bn_div_words: divide the 128-bit value a0:a1 by the 64-bit value in
 * a2.
 *
 * In:   a0 = high dividend word
 *       a1 = low dividend word
 *       a2 = divisor
 * Out:  v0 = quotient, or -1 when the divisor is zero
 *       v1 = remainder
 *       a2 = divisor (shifted for normalisation, then shifted back)
 * Uses: a0, a1, t0-t3, t8, t9, ta0 (QT), ta1 (HH), AT, HI/LO.
 *       a3, ta2 and ta3 are never touched; bn_div_3_words relies on
 *       that.
 *
 * Method: the divisor is shifted left until its top bit is set and
 * the dividend is shifted by the same amount (t9). The quotient is
 * then produced in two 32-bit halves. For each half an estimate is
 * taken from a 64-by-32 division by the top half of the divisor and
 * corrected downwards while estimate*divisor exceeds the current
 * partial dividend.
 */
|  | LEAF(bn_div_words) | ||
|  | 	.set	noreorder
 | ||
/* Zero divisor: return -1. The delay slot clears v1 only when the
 * branch is taken, i.e. for a non-zero divisor. */
|  | 	bnezl	a2,.L_bn_div_words_proceed | ||
|  | 	move	v1,zero | ||
|  | 	jr	ra | ||
|  | 	li	v0,-1		/* I'd rather signal div-by-zero | ||
|  | 				 * which can be done with 'break 7' */ | ||
|  | 
 | ||
|  | .L_bn_div_words_proceed: | ||
/* Normalise: t9 starts at 0 (copied from v1 in the delay slot). If
 * the top bit of a2 is already set, go straight to the body.
 * Otherwise the dsll/bgtz pair shifts a2 left one bit at a time; the
 * addu in the delay slot counts every shift in t9. */
|  | 	bltz	a2,.L_bn_div_words_body | ||
|  | 	move	t9,v1 | ||
|  | 	dsll	a2,1 | ||
|  | 	bgtz	a2,.-4 | ||
|  | 	addu	t9,1 | ||
|  | 
 | ||
|  | 	.set	reorder
 | ||
/* t2 = the top t9 bits of a0, which would be shifted out below;
 * AT = the top t9 bits of a1, which move into a0. */
|  | 	negu	t1,t9 | ||
|  | 	li	t2,-1 | ||
|  | 	dsll	t2,t1 | ||
|  | 	and	t2,a0 | ||
|  | 	dsrl	AT,a1,t1 | ||
|  | 	.set	noreorder
 | ||
/* The break in the delay slot is executed only when the branch-likely
 * is taken, i.e. only when bits would be lost. */
|  | 	bnezl	t2,.+8 | ||
|  | 	break	6		/* signal overflow */ | ||
|  | 	.set	reorder
 | ||
|  | 	dsll	a0,t9 | ||
|  | 	dsll	a1,t9 | ||
|  | 	or	a0,AT | ||
|  | 
 | ||
/* QT = current 32-bit quotient estimate, HH = top half of the current
 * partial dividend, DH = top half of the normalised divisor. */
|  | #define	QT	ta0 | ||
|  | #define	HH	ta1 | ||
|  | #define	DH	v1 | ||
|  | .L_bn_div_words_body: | ||
/* If a0 >= a2, subtract a2 once; the subtraction sits in a
 * branch-likely delay slot and is annulled otherwise. */
|  | 	dsrl	DH,a2,32 | ||
|  | 	sgeu	AT,a0,a2 | ||
|  | 	.set	noreorder
 | ||
|  | 	bnezl	AT,.+8 | ||
|  | 	dsubu	a0,a2 | ||
|  | 	.set	reorder
 | ||
|  | 
 | ||
/* First half: estimate QT = a0 / DH, or 0xffffffff when the top
 * halves are equal and the division is skipped. */
|  | 	li	QT,-1 | ||
|  | 	dsrl	HH,a0,32 | ||
|  | 	dsrl	QT,32	/* q=0xffffffff */ | ||
|  | 	beq	DH,HH,.L_bn_div_words_skip_div1 | ||
|  | 	ddivu	zero,a0,DH | ||
|  | 	mflo	QT | ||
|  | .L_bn_div_words_skip_div1: | ||
/* t1:t0 = a2 * QT; t3 = low half of a0 joined with high half of a1. */
|  | 	dmultu	a2,QT | ||
|  | 	dsll	t3,a0,32 | ||
|  | 	dsrl	AT,a1,32 | ||
|  | 	or	t3,AT | ||
|  | 	mflo	t0 | ||
|  | 	mfhi	t1 | ||
|  | .L_bn_div_words_inner_loop1: | ||
/* AT = (t1:t0 > HH:t3). While that holds, subtract a2 from t1:t0 and
 * decrement QT. v0 carries the borrow of the low-word subtraction. */
|  | 	sltu	t2,t3,t0 | ||
|  | 	seq	t8,HH,t1 | ||
|  | 	sltu	AT,HH,t1 | ||
|  | 	and	t2,t8 | ||
|  | 	sltu	v0,t0,a2 | ||
|  | 	or	AT,t2 | ||
|  | 	.set	noreorder
 | ||
/* The borrow subtraction in the beqz delay slot also runs on loop
 * exit; t1 is not read again before being reloaded. */
|  | 	beqz	AT,.L_bn_div_words_inner_loop1_done | ||
|  | 	dsubu	t1,v0 | ||
|  | 	dsubu	t0,a2 | ||
|  | 	b	.L_bn_div_words_inner_loop1 | ||
|  | 	dsubu	QT,1 | ||
|  | 	.set	reorder
 | ||
|  | .L_bn_div_words_inner_loop1_done: | ||
|  | 
 | ||
/* New partial dividend in a0, next 32 dividend bits moved to the top
 * of a1, first quotient half placed in the upper half of v0. */
|  | 	dsll	a1,32 | ||
|  | 	dsubu	a0,t3,t0 | ||
|  | 	dsll	v0,QT,32 | ||
|  | 
 | ||
/* Second half: same estimate-and-correct sequence. */
|  | 	li	QT,-1 | ||
|  | 	dsrl	HH,a0,32 | ||
|  | 	dsrl	QT,32	/* q=0xffffffff */ | ||
|  | 	beq	DH,HH,.L_bn_div_words_skip_div2 | ||
|  | 	ddivu	zero,a0,DH | ||
|  | 	mflo	QT | ||
|  | .L_bn_div_words_skip_div2: | ||
|  | #undef	DH | ||
|  | 	dmultu	a2,QT | ||
|  | 	dsll	t3,a0,32 | ||
|  | 	dsrl	AT,a1,32 | ||
|  | 	or	t3,AT | ||
|  | 	mflo	t0 | ||
|  | 	mfhi	t1 | ||
|  | .L_bn_div_words_inner_loop2: | ||
/* Same correction loop as above; v1 (no longer needed as DH) holds
 * the borrow because v0 now contains the upper quotient half. */
|  | 	sltu	t2,t3,t0 | ||
|  | 	seq	t8,HH,t1 | ||
|  | 	sltu	AT,HH,t1 | ||
|  | 	and	t2,t8 | ||
|  | 	sltu	v1,t0,a2 | ||
|  | 	or	AT,t2 | ||
|  | 	.set	noreorder
 | ||
|  | 	beqz	AT,.L_bn_div_words_inner_loop2_done | ||
|  | 	dsubu	t1,v1 | ||
|  | 	dsubu	t0,a2 | ||
|  | 	b	.L_bn_div_words_inner_loop2 | ||
|  | 	dsubu	QT,1 | ||
|  | 	.set	reorder
 | ||
|  | .L_bn_div_words_inner_loop2_done:	 | ||
|  | #undef	HH | ||
|  | 
 | ||
/* Combine the quotient halves and undo the normalisation shift on the
 * remainder and on a2. */
|  | 	dsubu	a0,t3,t0 | ||
|  | 	or	v0,QT | ||
|  | 	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */ | ||
|  | 	dsrl	a2,t9		/* restore a2 */ | ||
|  | 	jr	ra | ||
|  | #undef	QT | ||
|  | END(bn_div_words) | ||
|  | 
 | ||
/*
 * Register aliases used by bn_mul_comba8.
 *
 * a_0..a_7 and b_0..b_7 hold the eight words of each operand. Words
 * 0-3 live in temporaries; words 4-6 live in s0-s5, which
 * bn_mul_comba8 stores to its stack frame on entry; word 7 reuses the
 * operand pointer register itself.
 */
|  | #define	a_0	t0 | ||
|  | #define	a_1	t1 | ||
|  | #define	a_2	t2 | ||
|  | #define	a_3	t3 | ||
|  | #define	b_0	ta0 | ||
|  | #define	b_1	ta1 | ||
|  | #define	b_2	ta2 | ||
|  | #define	b_3	ta3 | ||
|  | 
 | ||
|  | #define	a_4	s0 | ||
|  | #define	a_5	s2 | ||
|  | #define	a_6	s4 | ||
|  | #define	a_7	a1	/* once we load a[7] we don't need a anymore */ | ||
|  | #define	b_4	s1 | ||
|  | #define	b_5	s3 | ||
|  | #define	b_6	s5 | ||
|  | #define	b_7	a2	/* once we load b[7] we don't need b anymore */ | ||
|  | 
 | ||
/* t_1/t_2 receive the LO/HI halves of each product. */
|  | #define	t_1	t8 | ||
|  | #define	t_2	t9 | ||
|  | 
 | ||
/* c_1..c_3 are the three accumulator words that the mul_add_c steps
 * rotate through. */
|  | #define	c_1	v0 | ||
|  | #define	c_2	v1 | ||
|  | #define	c_3	a3 | ||
|  | 
 | ||
/* Bytes reserved on the stack for s0-s5 (6 x 8).
 * NOTE(review): bn_mul_comba8 subtracts FRAME_SIZE from sp but its
 * .frame directive states 64 - confirm which value is intended. */
|  | #define	FRAME_SIZE	48 | ||
|  | 
 | ||
|  | .align	5
 | ||
|  | LEAF(bn_mul_comba8) | ||
|  | 	.set	noreorder
 | ||
|  | 	PTR_SUB	sp,FRAME_SIZE | ||
|  | 	.frame	sp,64,ra | ||
|  | 	.set	reorder
 | ||
|  | 	ld	a_0,0(a1)	/* If compiled with -mips3 option on | ||
|  | 				 * R5000 box assembler barks on this | ||
|  | 				 * line with "shouldn't have mult/div | ||
|  | 				 * as last instruction in bb (R10K | ||
|  | 				 * bug)" warning. If anybody out there | ||
|  | 				 * has a clue about how to circumvent | ||
|  | 				 * this do send me a note. | ||
|  | 				 *		<appro@fy.chalmers.se>
 | ||
|  | 				 */ | ||
|  | 	ld	b_0,0(a2) | ||
|  | 	ld	a_1,8(a1) | ||
|  | 	ld	a_2,16(a1) | ||
|  | 	ld	a_3,24(a1) | ||
|  | 	ld	b_1,8(a2) | ||
|  | 	ld	b_2,16(a2) | ||
|  | 	ld	b_3,24(a2) | ||
|  | 	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
 | ||
|  | 	sd	s0,0(sp) | ||
|  | 	sd	s1,8(sp) | ||
|  | 	sd	s2,16(sp) | ||
|  | 	sd	s3,24(sp) | ||
|  | 	sd	s4,32(sp) | ||
|  | 	sd	s5,40(sp) | ||
|  | 	mflo	c_1 | ||
|  | 	mfhi	c_2 | ||
|  | 
 | ||
|  | 	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
 | ||
|  | 	ld	a_4,32(a1) | ||
|  | 	ld	a_5,40(a1) | ||
|  | 	ld	a_6,48(a1) | ||
|  | 	ld	a_7,56(a1) | ||
|  | 	ld	b_4,32(a2) | ||
|  | 	ld	b_5,40(a2) | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	c_3,t_2,AT | ||
|  | 	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
 | ||
|  | 	ld	b_6,48(a2) | ||
|  | 	ld	b_7,56(a2) | ||
|  | 	sd	c_1,0(a0)	/* r[0]=c1; */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	c_1,c_3,t_2 | ||
|  | 	sd	c_2,8(a0)	/* r[1]=c2; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	c_2,c_1,t_2 | ||
|  | 	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	sd	c_3,16(a0)	/* r[2]=c3; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	c_3,c_2,t_2 | ||
|  | 	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	sd	c_1,24(a0)	/* r[3]=c1; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_4,b_0		/* mul_add_c(a[4],b[0],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	c_1,c_3,t_2 | ||
|  | 	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_0,b_4		/* mul_add_c(a[0],b[4],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	sd	c_2,32(a0)	/* r[4]=c2; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_0,b_5		/* mul_add_c(a[0],b[5],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	c_2,c_1,t_2 | ||
|  | 	dmultu	a_1,b_4		/* mul_add_c(a[1],b[4],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_4,b_1		/* mul_add_c(a[4],b[1],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_5,b_0		/* mul_add_c(a[5],b[0],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	sd	c_3,40(a0)	/* r[5]=c3; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_6,b_0		/* mul_add_c(a[6],b[0],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	c_3,c_2,t_2 | ||
|  | 	dmultu	a_5,b_1		/* mul_add_c(a[5],b[1],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_4,b_2		/* mul_add_c(a[4],b[2],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_2,b_4		/* mul_add_c(a[2],b[4],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_1,b_5		/* mul_add_c(a[1],b[5],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_0,b_6		/* mul_add_c(a[0],b[6],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	sd	c_1,48(a0)	/* r[6]=c1; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_0,b_7		/* mul_add_c(a[0],b[7],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	c_1,c_3,t_2 | ||
|  | 	dmultu	a_1,b_6		/* mul_add_c(a[1],b[6],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_2,b_5		/* mul_add_c(a[2],b[5],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_3,b_4		/* mul_add_c(a[3],b[4],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_4,b_3		/* mul_add_c(a[4],b[3],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_5,b_2		/* mul_add_c(a[5],b[2],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_6,b_1		/* mul_add_c(a[6],b[1],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_7,b_0		/* mul_add_c(a[7],b[0],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	sd	c_2,56(a0)	/* r[7]=c2; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_7,b_1		/* mul_add_c(a[7],b[1],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	c_2,c_1,t_2 | ||
|  | 	dmultu	a_6,b_2		/* mul_add_c(a[6],b[2],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_5,b_3		/* mul_add_c(a[5],b[3],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_4,b_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_3,b_5		/* mul_add_c(a[3],b[5],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_2,b_6		/* mul_add_c(a[2],b[6],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_1,b_7		/* mul_add_c(a[1],b[7],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	sd	c_3,64(a0)	/* r[8]=c3; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_2,b_7		/* mul_add_c(a[2],b[7],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	c_3,c_2,t_2 | ||
|  | 	dmultu	a_3,b_6		/* mul_add_c(a[3],b[6],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_4,b_5		/* mul_add_c(a[4],b[5],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_5,b_4		/* mul_add_c(a[5],b[4],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_6,b_3		/* mul_add_c(a[6],b[3],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_7,b_2		/* mul_add_c(a[7],b[2],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	sd	c_1,72(a0)	/* r[9]=c1; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_7,b_3		/* mul_add_c(a[7],b[3],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	c_1,c_3,t_2 | ||
|  | 	dmultu	a_6,b_4		/* mul_add_c(a[6],b[4],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_5,b_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_4,b_6		/* mul_add_c(a[4],b[6],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	dmultu	a_3,b_7		/* mul_add_c(a[3],b[7],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	sd	c_2,80(a0)	/* r[10]=c2; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_4,b_7		/* mul_add_c(a[4],b[7],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	c_2,c_1,t_2 | ||
|  | 	dmultu	a_5,b_6		/* mul_add_c(a[5],b[6],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_6,b_5		/* mul_add_c(a[6],b[5],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	dmultu	a_7,b_4		/* mul_add_c(a[7],b[4],c3,c1,c2); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sltu	AT,c_1,t_2 | ||
|  | 	daddu	c_2,AT | ||
|  | 	sd	c_3,88(a0)	/* r[11]=c3; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_7,b_5		/* mul_add_c(a[7],b[5],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	c_3,c_2,t_2 | ||
|  | 	dmultu	a_6,b_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	dmultu	a_5,b_7		/* mul_add_c(a[5],b[7],c1,c2,c3); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_1,t_1 | ||
|  | 	sltu	AT,c_1,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_2,t_2 | ||
|  | 	sltu	AT,c_2,t_2 | ||
|  | 	daddu	c_3,AT | ||
|  | 	sd	c_1,96(a0)	/* r[12]=c1; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_6,b_7		/* mul_add_c(a[6],b[7],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	c_1,c_3,t_2 | ||
|  | 	dmultu	a_7,b_6		/* mul_add_c(a[7],b[6],c2,c3,c1); */
 | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_2,t_1 | ||
|  | 	sltu	AT,c_2,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_3,t_2 | ||
|  | 	sltu	AT,c_3,t_2 | ||
|  | 	daddu	c_1,AT | ||
|  | 	sd	c_2,104(a0)	/* r[13]=c2; */
 | ||
|  | 
 | ||
|  | 	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
 | ||
|  | 	ld	s0,0(sp) | ||
|  | 	ld	s1,8(sp) | ||
|  | 	ld	s2,16(sp) | ||
|  | 	ld	s3,24(sp) | ||
|  | 	ld	s4,32(sp) | ||
|  | 	ld	s5,40(sp) | ||
|  | 	mflo	t_1 | ||
|  | 	mfhi	t_2 | ||
|  | 	daddu	c_3,t_1 | ||
|  | 	sltu	AT,c_3,t_1 | ||
|  | 	daddu	t_2,AT | ||
|  | 	daddu	c_1,t_2 | ||
|  | 	sd	c_3,112(a0)	/* r[14]=c3; */
 | ||
|  | 	sd	c_1,120(a0)	/* r[15]=c1; */
 | ||
|  | 
 | ||
|  | 	PTR_ADD	sp,FRAME_SIZE | ||
|  | 
 | ||
|  | 	jr	ra | ||
|  | END(bn_mul_comba8) | ||
|  | 
 | ||
.align	5
/*
 * bn_mul_comba4: 4x4-word comba multiplication.
 *
 *	a0 = r	destination, eight 64-bit words are stored at 0..56(a0)
 *	a1 = a	first operand, four 64-bit words are loaded from 0..24(a1)
 *	a2 = b	second operand, four 64-bit words are loaded from 0..24(a2)
 *
 * The product is formed one result column at a time. Three registers
 * (c_1, c_2, c_3) rotate through the roles "low / middle / high word of
 * the column accumulator": after a column is finished its low word is
 * stored and the remaining two words become the low and middle word of
 * the next column. Every partial product follows the same recipe:
 *
 *	low  += LO;	AT = carry out of that addition
 *	t_2   = HI + AT	(cannot wrap, HI of a 64x64 product is <= 2^64-2)
 *	mid  += t_2;	carry out of that addition goes into "high"
 *
 * The first product that can carry into a column's high word creates
 * that word by writing the sltu result directly into it instead of
 * adding. All operand loads are issued before the first store.
 */
LEAF(bn_mul_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	dmultu	a_0,b_0		/* column 0: a[0]*b[0] -> c1,c2 */
	ld	a_3,24(a1)	/* remaining loads overlap the multiply */
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	mfhi	c_2
	mflo	c_1
	sd	c_1,0(a0)	/* r[0] = c1 */

	/* column 1, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_0,b_1		/* a[0]*b[1] */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* middle word starts out as HI + carry */
	dmultu	a_1,b_0		/* a[1]*b[0] */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2	/* high word created from the carry */
	sd	c_2,8(a0)	/* r[1] = c2 */

	/* column 2, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_2,b_0		/* a[2]*b[0] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2	/* no carry-out is taken here: the carry-in
				 * from column 1 is at most 2^65-3, so adding
				 * one product keeps (c1:c3) below 2^128 */
	dmultu	a_1,b_1		/* a[1]*b[1] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2	/* high word created from the carry */
	dmultu	a_0,b_2		/* a[0]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,16(a0)	/* r[2] = c3 */

	/* column 3, accumulator (low,mid,high) = (c1,c2,c3) */
	dmultu	a_0,b_3		/* a[0]*b[3] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	c_3,c_2,t_2	/* high word created from the carry */
	dmultu	a_1,b_2		/* a[1]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_2,b_1		/* a[2]*b[1] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_3,b_0		/* a[3]*b[0] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	sd	c_1,24(a0)	/* r[3] = c1 */

	/* column 4, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_3,b_1		/* a[3]*b[1] */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2	/* high word created from the carry */
	dmultu	a_2,b_2		/* a[2]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	dmultu	a_1,b_3		/* a[1]*b[3] */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	sd	c_2,32(a0)	/* r[4] = c2 */

	/* column 5, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_2,b_3		/* a[2]*b[3] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2	/* high word created from the carry */
	dmultu	a_3,b_2		/* a[3]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,40(a0)	/* r[5] = c3 */

	/* column 6: last product, only two result words remain */
	dmultu	a_3,b_3		/* a[3]*b[3] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sd	c_1,48(a0)	/* r[6] = c1 */
	sd	c_2,56(a0)	/* r[7] = c2 */

	jr	ra
END(bn_mul_comba4)
|  | 
 | ||
/*
 * The squaring routines below have no b[] operand, so a[4]..a[7] are
 * re-homed in the registers that held b[0]..b[3].
 * NOTE(review): presumably this is what lets bn_sqr_comba8 be a frameless
 * LEAF routine (bn_mul_comba8 restores s0-s5 in its epilogue) - confirm
 * against the original a_4..a_7 definitions earlier in the file.
 */
#undef	a_4
#undef	a_5
#undef	a_6
#undef	a_7
#define	a_4	b_0
#define	a_5	b_1
#define	a_6	b_2
#define	a_7	b_3

.align	5
/*
 * bn_sqr_comba8: 8-word comba squaring.
 *
 *	a0 = r	destination, sixteen 64-bit words are stored at 0..120(a0)
 *	a1 = a	operand, eight 64-bit words are loaded from 0..56(a1)
 *	a2	is used as a scratch register
 *
 * c_1, c_2, c_3 rotate through the roles (low, mid, high) of the column
 * accumulator exactly as in the multiplication routines. Off-diagonal
 * products a[i]*a[j], i != j, are needed twice and are doubled in place:
 *
 *	high += msb(HI)			bit shifted out of the top
 *	t_2   = 2*HI + msb(LO)
 *	t_1   = 2*LO
 *	low  += t_1;	AT = carry
 *	t_2  += AT			<-- may wrap
 *	mid  += t_2;	carry goes into high
 *
 * Unlike the plain product, where HI <= 2^64-2, the doubled high word
 * t_2 can be 2^64-1, so adding the carry AT can wrap it to zero. That
 * wrap is a carry into "high" and must not be dropped; it is detected
 * with "sltu a2,t_2,AT" (true only when AT==1 and t_2 wrapped to 0).
 * Example that needs it in column 1: a[0]=2^63+1, a[1]=2^64-2.
 */
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	dmultu	a_0,a_0		/* column 0: a[0]^2 -> c1,c2 */
	ld	a_4,32(a1)	/* remaining loads overlap the multiply */
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0] = c1 */

	/* column 1, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_0,a_1		/* 2*a[0]*a[1] */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sltu	a2,c_3,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	sd	c_2,8(a0)	/* r[1] = c2 */

	/* column 2, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_2,a_0		/* 2*a[2]*a[0] */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* a[1]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2] = c3 */

	/* column 3, accumulator (low,mid,high) = (c1,c2,c3) */
	dmultu	a_0,a_3		/* 2*a[0]*a[3] */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* 2*a[1]*a[2] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3] = c1 */

	/* column 4, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_4,a_0		/* 2*a[4]*a[0] */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_1		/* 2*a[3]*a[1] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* a[2]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4] = c2 */

	/* column 5, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_0,a_5		/* 2*a[0]*a[5] */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_4		/* 2*a[1]*a[4] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,a_3		/* 2*a[2]*a[3] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5] = c3 */

	/* column 6, accumulator (low,mid,high) = (c1,c2,c3) */
	dmultu	a_6,a_0		/* 2*a[6]*a[0] */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,a_1		/* 2*a[5]*a[1] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_2		/* 2*a[4]*a[2] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_3		/* a[3]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)	/* r[6] = c1 */

	/* column 7, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_0,a_7		/* 2*a[0]*a[7] */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,a_6		/* 2*a[1]*a[6] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_5		/* 2*a[2]*a[5] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_4		/* 2*a[3]*a[4] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,56(a0)	/* r[7] = c2 */

	/* column 8, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_7,a_1		/* 2*a[7]*a[1] */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_6,a_2		/* 2*a[6]*a[2] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_3		/* 2*a[5]*a[3] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,a_4		/* a[4]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)	/* r[8] = c3 */

	/* column 9, accumulator (low,mid,high) = (c1,c2,c3) */
	dmultu	a_2,a_7		/* 2*a[2]*a[7] */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_6		/* 2*a[3]*a[6] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_5		/* 2*a[4]*a[5] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,72(a0)	/* r[9] = c1 */

	/* column 10, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_7,a_3		/* 2*a[7]*a[3] */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_6,a_4		/* 2*a[6]*a[4] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,a_5		/* a[5]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)	/* r[10] = c2 */

	/* column 11, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_4,a_7		/* 2*a[4]*a[7] */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_6		/* 2*a[5]*a[6] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,88(a0)	/* r[11] = c3 */

	/* column 12, accumulator (low,mid,high) = (c1,c2,c3) */
	dmultu	a_7,a_5		/* 2*a[7]*a[5] */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_6,a_6		/* a[6]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)	/* r[12] = c1 */

	/* column 13, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_6,a_7		/* 2*a[6]*a[7] */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)	/* r[13] = c2 */

	/* column 14: last product, only two result words remain */
	dmultu	a_7,a_7		/* a[7]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)	/* r[14] = c3 */
	sd	c_1,120(a0)	/* r[15] = c1 */

	jr	ra
END(bn_sqr_comba8)
|  | 
 | ||
.align	5
/*
 * bn_sqr_comba4: 4-word comba squaring.
 *
 *	a0 = r	destination, eight 64-bit words are stored at 0..56(a0)
 *	a1 = a	operand, four 64-bit words are loaded from 0..24(a1)
 *	a2	is used as a scratch register
 *
 * c_1, c_2, c_3 rotate through the roles (low, mid, high) of the column
 * accumulator. Off-diagonal products a[i]*a[j], i != j, are doubled in
 * place before being accumulated:
 *
 *	high += msb(HI)			bit shifted out of the top
 *	t_2   = 2*HI + msb(LO)
 *	t_1   = 2*LO
 *	low  += t_1;	AT = carry
 *	t_2  += AT			<-- may wrap
 *	mid  += t_2;	carry goes into high
 *
 * The doubled high word t_2 can be 2^64-1, so adding the carry AT can
 * wrap it to zero. That wrap is a carry into "high" and is picked up
 * with "sltu a2,t_2,AT" (true only when AT==1 and t_2 wrapped to 0).
 * Example that needs it in column 1: a[0]=2^63+1, a[1]=2^64-2.
 */
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	dmultu	a_0,a_0		/* column 0: a[0]^2 -> c1,c2 */
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0] = c1 */

	/* column 1, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_0,a_1		/* 2*a[0]*a[1] */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sltu	a2,c_3,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	sd	c_2,8(a0)	/* r[1] = c2 */

	/* column 2, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_2,a_0		/* 2*a[2]*a[0] */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* a[1]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2] = c3 */

	/* column 3, accumulator (low,mid,high) = (c1,c2,c3) */
	dmultu	a_0,a_3		/* 2*a[0]*a[3] */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* 2*a[1]*a[2] */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3] = c1 */

	/* column 4, accumulator (low,mid,high) = (c2,c3,c1) */
	dmultu	a_3,a_1		/* 2*a[3]*a[1] */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* a[2]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4] = c2 */

	/* column 5, accumulator (low,mid,high) = (c3,c1,c2) */
	dmultu	a_2,a_3		/* 2*a[2]*a[3] */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry out of t_2+AT */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5] = c3 */

	/* column 6: last product, only two result words remain */
	dmultu	a_3,a_3		/* a[3]^2 */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)	/* r[6] = c1 */
	sd	c_2,56(a0)	/* r[7] = c2 */

	jr	ra
END(bn_sqr_comba4)