//1 = movw wird unterstüzt, 0 = konvetionelles mov verwenden
.set useMOVW = 1
//1 = Geschwindigkeitsoptimierte Version (~600clk, ~1300byte)
//0 = Platzoptimierte Version (~1300 clk, ~150byte)
.set useSpeedOptimized = 1


wurzel32fast:
/* #######################################################################
Based on an Algorithm by Jim Ulery
	http://www.azillionmonkeys.com/qed/ulerysqroot.pdf

	Ported to ASM by Läubi, 2008

	Input r5:r4:r3:r2
	Output r1:r0

	! Achtung! Die Eingabe wird hierbei zerstört!
	! Wenn diese noch benötigt wird vorher sichern!

	C-Version:
  static unsigned julery_isqrt(unsigned long val) {
      unsigned long temp, g=0, b = 0x8000, bshft = 15;
      do {
          if ( val >= ( temp = (((g << 1) + b)<<bshft--) ) ) {
             g += b;
             val -= temp;
          }
      } while (b >>= 1);
      return g;
  }
####################################################################### */

//Macros
.macro WURZEL32_INNERLOOP
	lsl r6
	rol r7
	rol r8
	rol r9
.endmacro

// @0 = bshft
.macro WURZEL32_OUTERLOOP
		//temp = g
		
		.if useMOVW == 1
			movw r9:r8, r17:r16
			movw r7:r6, r1:r0
		.else
			clr r9
			clr r8
			mov r7, r1
			mov r6, r0
		.endif
	
		//g << 1
		lsl r6
		rol r7
		rol r8
		//rol r9 kann ausgelassen werden da g maximal 16 bit breit
		//und r9 = 0
		//+b
		add r6, XL
		adc r7, XH
		adc r8, r16
		adc r9, r16
		//man könnte auch das addieren und g<<1 hier noch reinziehen
		//spart aber effektiv nur ein paar takte
		.if @0 == 15
			//optimierung für shift um 15 nach links
			//wir "schieben" um 16 nach links und dann einen nach rechts
			mov r9, r8
			mov r8, r7
			mov r7, r6
			lsr r9
			ror r8
			ror r7
			clr r6
			//4*15-7 = 53 takte ersparnis
		.elif @0 == 14
			//optimierung für shift um 14 nach links
			//wir "schieben" um 16 nach links und dann zwei nach rechts
			mov r9, r8
			mov r8, r7
			mov r7, r6
			lsr r9
			ror r8
			ror r7
			lsr r9
			ror r8
			ror r7
			clr r6
			//4*14-10 = 46 takte ersparnis
		.elif @0 == 13
			//optimierung für shift um 13 nach links
			//wir "schieben" um 16 nach links und dann drei nach rechts
			mov r9, r8
			mov r8, r7
			mov r7, r6
			lsr r9
			ror r8
			ror r7
			lsr r9
			ror r8
			ror r7
			lsr r9
			ror r8
			ror r7
			clr r6
			//4*13-13 = 39 takte ersparnis
		.elif @0 == 12
			//optimierung  für shift um 12 nach links
			//wir schieben vier nach links und tauschen dann die bytes
			WURZEL32_INNERLOOP
			WURZEL32_INNERLOOP
			WURZEL32_INNERLOOP
			WURZEL32_INNERLOOP
			mov r9, r8
			mov r8, r7
			mov r7, r6
			clr r6
			//12*4 - 20 = 28 takt ersparnis
		.elif  @0 == 8
			//optimierung für shift nach links um 8 stellen
			mov r8, r7
			mov r7, r6
			clr r6
			//4*8-3 = 29 takte ersparnis
		.else
		// << bshft
			//unrolled inner loop --> Wir haben leider keine barell shifter
			//hier ist noch optimierungspotental, beispiele siehe oben
			.if @0 > 0
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 1
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 2
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 3
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 4
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 5
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 6
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 7
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 8
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 9
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 10
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 11
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 12
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 13
				WURZEL32_INNERLOOP
			.endif
			.if @0 > 14
				WURZEL32_INNERLOOP
			.endif
		.endif
		//val >= temp
		cp r2, r6
		cpc r3, r7
		cpc r4, r8
		cpc r5, r9
		brlo wurzel32_2f_wd
		//g + = b
		add r0, XL
		adc r1, XH
		//val -=temp
		sub r2, r6
		sbc r3, r7
		sbc r4, r8
		sbc r5, r9
	//b >> 1 != 0?
	wurzel32_2f_wd:
		lsr XH
		ror XL
.endmacro

//function
.if useSpeedOptimized == 1
	//Register sichern
	push XL
	push XH
	push r17
	push r16
	push r6
	push r7
	push r8
	push r9
	
	//X = b
	ldi XL,  LOW(0x8000)
	ldi XH, HIGH(0x8000)
	//r1:r0 = g = 0
	clr YL
	clr YH
	clr r16
	clr r17
		//unrooling outer loop
		WURZEL32_OUTERLOOP 15//, 0x8000	
		WURZEL32_OUTERLOOP 14//, (0x8000 >> 1)
		WURZEL32_OUTERLOOP 13//, (0x8000 >> 2)	
		WURZEL32_OUTERLOOP 12//, (0x8000 >> 3)	
		WURZEL32_OUTERLOOP 11//, (0x8000 >> 4)
		WURZEL32_OUTERLOOP 10//, (0x8000 >> 5)	
		WURZEL32_OUTERLOOP  9//, (0x8000 >> 6)	
		WURZEL32_OUTERLOOP	8//, (0x8000 >> 7)
		WURZEL32_OUTERLOOP	7//, (0x8000 >> 8)
		WURZEL32_OUTERLOOP	6//, (0x8000 >> 9)
		WURZEL32_OUTERLOOP	5//, (0x8000 >> 10)
		WURZEL32_OUTERLOOP	4//, (0x8000 >> 11)
		WURZEL32_OUTERLOOP	3//, (0x8000 >> 12)
		WURZEL32_OUTERLOOP	2//, (0x8000 >> 13)
		WURZEL32_OUTERLOOP	1//, (0x8000 >> 14)
		WURZEL32_OUTERLOOP	0//, (0x8000 >> 15)
	wurzel32_2f_wloop:
	//Register wieder herstellen
	
	pop r9
	pop r8
	pop r7
	pop r6
	pop r16
	pop r17
	pop XH
	pop XL
ret

.else
//Register sichern
	push XL
	push XH
	push r17
	push r16
	push r6
	push r7
	push r8
	push r9
	push r10
	//X = b
	ldi XL,  LOW(0x8000)
	ldi XH, HIGH(0x8000)
	//r1:r0 = g = 0
	clr r0
	clr r1
	//temp = bshft
	ldi r17, 15
	clr r16
	wurzel32_2_wloop:
		//temp = g
		clr r9
		clr r8
		mov r7, r1
		mov r6, r0
		//g << 1
		lsl r6
		rol r7
		rol r8
		//leztes rol kann ausgelassen werden da g maximal 16 bit breit
		//und r9 = 0
		//+b
		add r6, XL
		adc r7, XH
		adc r8, r16
		adc r9, r16
		// << bshft
		mov r10, r17
		wurzel32_2_wl:
			lsl r6
			rol r7
			rol r8
			rol r9
			dec r17
		brne wurzel32_2_wl
		//bshft--
		mov r17, r10
		dec r17
		//val >= temp
		cp r2, r6
		cpc r3, r7
		cpc r4, r8
		cpc r5, r9
		brlo wurzel32_2_wd
		//g + = b
		add r0, XL
		adc r1, XH
		//val -=temp
		sub r2, r6
		sbc r3, r7
		sbc r4, r8
		sbc r5, r9
	//b >> 1 != 0?
	wurzel32_2_wd:
		lsr XH
		ror XL
		cpi XL, 1
		cpc XH, r16
		brne wurzel32_2_wloop
	//Register wieder herstellen
	pop r10
	pop r9
	pop r8
	pop r7
	pop r6
	pop r16
	pop r17
	pop XH
	pop XL
	ret	
.endif