intsqrt.c
1 | /* intsqrt.c
| 2 | *
| 3 | * Quadratwurzelfunktionen fuer Integer-Variablen
| 4 | *
| 5 | * $Date: 2019-04-03 16:52:36 +0200 (Mi, 03 Apr 2019) $
| 6 | * $Revision: 600 $
| 7 | *
| 8 | * 10.01.2018 Nicolas */
| 9 |
| 10 | #include "intsqrt.h"
| 11 | #include "myassert.h"
| 12 | #if !NDEBUG
| 13 | #include <stdio.h>
| 14 | #endif
| 15 |
| 16 |
| 17 |
| 18 |
| 19 | /* Schnelle Quadratwurzel */
| 20 |
| 21 | /* https://stackoverflow.com/questions/1100090/looking-for-an-efficient-integer-square-root-algorithm-for-arm-thumb2 */
| 22 | /* Over the range 1->2^20, the maximum error is 11, and over the range 1->2^30, it's about 256. You could use larger tables and minimise this. It's worth mentioning that the error will always be negative - i.e. when it's wrong, the value will be LESS than the correct value.
| 23 |
| 24 | You might do well to follow this with a refining stage.
| 25 |
| 26 | The idea is simple enough: (ab)^0.5 = a^0.b * b^0.5.
| 27 |
| 28 | So, we take the input X = A*B where A=2^N and 1<=B<2 Then we have a lookuptable for sqrt(2^N), and a lookuptable for sqrt(1<=B<2). We store the lookuptable for sqrt(2^N) as integer, which might be a mistake (testing shows no ill effects), and we store the lookuptable for sqrt(1<=B<2) at 15bits of fixed-point.
| 29 |
| 30 | We know that 1<=sqrt(2^N)<65536, so that's 16bit, and we know that we can really only multiply 16bitx15bit on an ARM, without fear of reprisal, so that's what we do.
| 31 |
| 32 | In terms of implementation, the while(t) {cnt++;t>>=1;} is effectively a count-leading-bits instruction (CLB), so if your version of the chipset has that, you're winning! Also, the shift instruction would be easy to implement with a bidirectional shifter, if you have one? There's a Lg[N] algorithm for counting the highest set bit here.
| 33 |
| 34 | In terms of magic numbers, for changing table sizes, THE magic number for ftbl2 is 32, though note that 6 (Lg[32]+1) is used for the shifting. */
| 35 |
| 36 | const uint32_t ftbl[33]={0,1,1,2,2,4,5,8,11,16,22,32,45,64,90,128,181,256,362,512,724,1024,1448,2048,2896,4096,5792,8192,11585,16384,23170,32768,46340};
| 37 | const uint32_t ftbl2[32]={ 32768,33276,33776,34269,34755,35235,35708,36174,36635,37090,37540,37984,38423,38858,39287,39712,40132,40548,40960,41367,41771,42170,42566,42959,43347,43733,44115,44493,44869,45241,45611,45977};
| 38 |
| 39 | uint32_t usqrt32_approx(uint32_t val)
| 40 | {
| 41 | assert( val <= INT32_MAX );
| 42 | int32_t cnt=0;
| 43 | int32_t t=val;
| 44 |
| 45 | // Test Geschwindigkeitsunterschied builtin clz() vs. einfaches zaehlen
| 46 | #if 1
| 47 | if( val == 0 )
| 48 | return 0;
| 49 | else
| 50 | cnt = 32-__builtin_clzl(val);
| 51 |
| 52 | #else
| 53 | while (t)
| 54 | {
| 55 | cnt++;
| 56 | t>>=1;
| 57 | }
| 58 | #endif
| 59 |
| 60 |
| 61 |
| 62 |
| 63 | if (6>=cnt)
| 64 | t=(val<<(6-cnt));
| 65 | else
| 66 | t=(val>>(cnt-6));
| 67 |
| 68 | return (ftbl[cnt]*ftbl2[t&31])>>15;
| 69 | }
| 70 |
| 71 |
| 72 |
| 73 | /* Langsame, genaue Quadratwurzel
| 74 | * hierbei wird die Tatsache genutzt, dass der obengenannte schnelle Algorithmus immer
| 75 | * negativ vom exakten Ergebnis abweicht.
| 76 | * Fuer grosse Zahlen wird er recht langsam. */
| 77 | uint32_t usqrt32_naive(uint32_t val)
| 78 | {
| 79 | assert( val <= INT32_MAX );
| 80 | uint32_t root = usqrt32_approx(val);
| 81 | while( root * root <= val ) root++;
| 82 | return --root;
| 83 | }
| 84 |
| 85 |
| 86 |
| 87 |
| 88 | /* Bisektionsverfahren,
| 89 | * bietet (halbwegs) gleiche Rechenzeit ueber den gesamten Wertebereich (s.u. !) und ist
| 90 | * hoffentlich deutlich schneller als der naive Ansatz */
| 91 | uint32_t usqrt32_bisect(uint32_t val)
| 92 | {
| 93 | assert( val <= INT32_MAX );
| 94 |
| 95 | uint32_t root0, root1, root2;
| 96 | uint32_t sq0, sq1, sq2;
| 97 |
| 98 | // Start value: Lower boundary
| 99 | root0 = usqrt32_approx(val);
| 100 |
| 101 | // Start value: Upper boundary
| 102 | if( val < 1<<20 )
| 103 | // in range 0 ... 2^20, error of approximation is smaller or equal 16
| 104 | // bisection method takes 4 iterations
| 105 | root2 = root0 + 16;
| 106 | else
| 107 | // in range 2^20 ... 2^31, error of approximation is smaller or equal 508
| 108 | // bisection method takes 9 iterations
| 109 | root2 = root0 + 512;
| 110 |
| 111 |
| 112 | #if !NDEBUG
| 113 | sq0 = root0 * root0;
| 114 | sq2 = root2 * root2;
| 115 | if( (sq2 <= val) )
| 116 | {
| 117 | printf("\nval=%i, root0=%i, sq0 = %i, root2=%i, sq2 = %i\n", val, root0, sq0, root2, sq2);
| 118 | error("");
| 119 | }
| 120 | #endif
| 121 |
| 122 |
| 123 | do
| 124 | {
| 125 | root1 = (root0 + root2)/2;
| 126 | sq1 = root1 * root1;
| 127 |
| 128 | if( sq1 == val )
| 129 | {
| 130 | // Shortcut Exit
| 131 | return root1;
| 132 | }
| 133 | else if( sq1 < val )
| 134 | {
| 135 | root0 = root1;
| 136 | }
| 137 | else
| 138 | {
| 139 | root2 = root1;
| 140 | }
| 141 | }
| 142 | while( (root2 - root0) > 1 );
| 143 |
| 144 | return root0;
| 145 | }
|
|