intsqrt.c


1
/* intsqrt.c
2
 *
3
 * Quadratwurzelfunktionen fuer Integer-Variablen
4
 *
5
 * $Date: 2019-04-03 16:52:36 +0200 (Mi, 03 Apr 2019) $
6
 * $Revision: 600 $
7
 *
8
 * 10.01.2018 Nicolas */
9
10
#include "intsqrt.h"
11
#include "myassert.h"
12
#if !NDEBUG
13
    #include <stdio.h>
14
#endif
15
16
17
18
19
/* Schnelle Quadratwurzel */
20
21
/* https://stackoverflow.com/questions/1100090/looking-for-an-efficient-integer-square-root-algorithm-for-arm-thumb2 */
22
/* Over the range 1->2^20, the maximum error is 11, and over the range 1->2^30, it's about 256. You could use larger tables and minimise this. It's worth mentioning that the error will always be negative - i.e. when it's wrong, the value will be LESS than the correct value.
23
24
You might do well to follow this with a refining stage.
25
26
The idea is simple enough: (ab)^0.5 = a^0.b * b^0.5.
27
28
So, we take the input X = A*B where A=2^N and 1<=B<2 Then we have a lookuptable for sqrt(2^N), and a lookuptable for sqrt(1<=B<2). We store the lookuptable for sqrt(2^N) as integer, which might be a mistake (testing shows no ill effects), and we store the lookuptable for sqrt(1<=B<2) at 15bits of fixed-point.
29
30
We know that 1<=sqrt(2^N)<65536, so that's 16bit, and we know that we can really only multiply 16bitx15bit on an ARM, without fear of reprisal, so that's what we do.
31
32
In terms of implementation, the while(t) {cnt++;t>>=1;} is effectively a count-leading-bits instruction (CLB), so if your version of the chipset has that, you're winning! Also, the shift instruction would be easy to implement with a bidirectional shifter, if you have one? There's a Lg[N] algorithm for counting the highest set bit here.
33
34
In terms of magic numbers, for changing table sizes, THE magic number for ftbl2 is 32, though note that 6 (Lg[32]+1) is used for the shifting. */
35
36
const uint32_t ftbl[33]={0,1,1,2,2,4,5,8,11,16,22,32,45,64,90,128,181,256,362,512,724,1024,1448,2048,2896,4096,5792,8192,11585,16384,23170,32768,46340};
37
const uint32_t ftbl2[32]={ 32768,33276,33776,34269,34755,35235,35708,36174,36635,37090,37540,37984,38423,38858,39287,39712,40132,40548,40960,41367,41771,42170,42566,42959,43347,43733,44115,44493,44869,45241,45611,45977};
38
39
uint32_t usqrt32_approx(uint32_t val)
40
{
41
    assert( val <= INT32_MAX );
42
    int32_t cnt=0;
43
    int32_t t=val;
44
45
    // Test Geschwindigkeitsunterschied builtin clz() vs. einfaches zaehlen
46
    #if 1
47
        if( val == 0 )
48
            return 0;
49
        else
50
            cnt = 32-__builtin_clzl(val);
51
52
    #else
53
        while (t)
54
        {
55
            cnt++;
56
            t>>=1;
57
        }
58
    #endif
59
60
61
62
63
    if (6>=cnt)
64
        t=(val<<(6-cnt));
65
    else
66
        t=(val>>(cnt-6));
67
68
    return (ftbl[cnt]*ftbl2[t&31])>>15;
69
}
70
71
72
73
/* Langsame, genaue Quadratwurzel
74
 * hierbei wird die Tatsache genutzt, dass der obengenannte schnelle Algorithmus immer
75
 * negativ vom exakten Ergebnis abweicht.
76
 * Fuer grosse Zahlen wird er recht langsam. */
77
uint32_t usqrt32_naive(uint32_t val)
78
{
79
    assert( val <= INT32_MAX );
80
    uint32_t root = usqrt32_approx(val);
81
    while( root * root <= val ) root++;
82
    return --root;
83
}
84
85
86
87
88
/* Bisektionsverfahren,
89
 * bietet (halbwegs) gleiche Rechenzeit ueber den gesamten Wertebereich (s.u. !) und ist
90
 * hoffentlich deutlich schneller als der naive Ansatz */
91
uint32_t usqrt32_bisect(uint32_t val)
92
{
93
    assert( val <= INT32_MAX );
94
95
    uint32_t root0, root1, root2;
96
    uint32_t sq0, sq1, sq2;
97
98
    // Start value: Lower boundary
99
    root0 = usqrt32_approx(val);
100
101
    // Start value: Upper boundary
102
    if( val < 1<<20 )
103
        // in range 0 ... 2^20, error of approximation is smaller or equal 16
104
        // bisection method takes 4 iterations
105
        root2 = root0 + 16;
106
    else
107
        // in range 2^20 ... 2^31, error of approximation is smaller or equal 508
108
        // bisection method takes 9 iterations
109
        root2 = root0 + 512;
110
111
112
    #if !NDEBUG
113
        sq0 = root0 * root0;
114
        sq2 = root2 * root2;
115
        if( (sq2 <= val) )
116
        {
117
            printf("\nval=%i, root0=%i, sq0 = %i, root2=%i, sq2 = %i\n", val, root0, sq0, root2, sq2);
118
            error("");
119
        }
120
    #endif
121
122
123
    do
124
    {
125
        root1 = (root0 + root2)/2;
126
        sq1 = root1 * root1;
127
128
        if( sq1 == val )
129
        {
130
            // Shortcut Exit
131
            return root1;
132
        }
133
        else if( sq1 < val )
134
        {
135
            root0 = root1;
136
        }
137
        else
138
        {
139
            root2 = root1;
140
        }
141
    }
142
    while( (root2 - root0) > 1 );
143
144
    return root0;
145
}