source: git/ntl/doc/quad_float.txt @ 2cfffe

spielwiese
Last change on this file since 2cfffe was 2cfffe, checked in by Hans Schönemann <hannes@…>, 21 years ago
This commit was generated by cvs2svn to compensate for changes in r6316, which included commits to RCS files with non-trunk default branches. git-svn-id: file:///usr/local/Singular/svn/trunk@6317 2c84dea3-7e68-4137-9b89-c4e89433aadc
  • Property mode set to 100644
File size: 12.6 KB
Line 
1
2
3/**************************************************************************\
4
5MODULE: quad_float
6
7SUMMARY:
8
9The class quad_float is used to represent quadruple precision numbers.
10Thus, with standard IEEE floating point, you should get the equivalent
11of about 106 bits of precision (but actually just a bit less).
12
13The interface allows you to treat quad_floats as if they were
14"ordinary" floating point types.
15
16See below for more implementation details.
17
18
19\**************************************************************************/
20
21#include <NTL/ZZ.h>
22
23
24class quad_float {
25public:
26
27quad_float(); // = 0
28
29quad_float(const quad_float& a);  // copy constructor
30
31quad_float& operator=(const quad_float& a);  // assignment operator
32quad_float& operator=(double a);
33
34~quad_float();
35
36
37static void SetOutputPrecision(long p);
38// This sets the number of decimal digits to be output.  Default is
39// 10.
40
41
42static long OutputPrecision();
43// returns current output precision.
44
45
46};
47
48
49/**************************************************************************\
50
51                             Arithmetic Operations
52
53\**************************************************************************/
54
55
56
57
58quad_float operator +(const quad_float& x, const quad_float& y);
59quad_float operator -(const quad_float& x, const quad_float& y);
60quad_float operator *(const quad_float& x, const quad_float& y);
61quad_float operator /(const quad_float& x, const quad_float& y);
62
63
64// PROMOTIONS: operators +, -, *, / promote double to quad_float
65// on (x, y).
66
67quad_float operator -(const quad_float& x);
68
69quad_float& operator += (quad_float& x, const quad_float& y);
70quad_float& operator += (quad_float& x, double y);
71
72quad_float& operator -= (quad_float& x, const quad_float& y);
73quad_float& operator -= (quad_float& x, double y);
74
75quad_float& operator *= (quad_float& x, const quad_float& y);
76quad_float& operator *= (quad_float& x, double y);
77
78quad_float& operator /= (quad_float& x, const quad_float& y);
79quad_float& operator /= (quad_float& x, double y);
80
81quad_float& operator++(quad_float& a); // prefix
82void operator++(quad_float& a, int); // postfix
83
84quad_float& operator--(quad_float& a); // prefix
85void operator--(quad_float& a, int); // postfix
86
87
88
89/**************************************************************************\
90
91                                  Comparison
92
93\**************************************************************************/
94
95
96long operator> (const quad_float& x, const quad_float& y);
97long operator>=(const quad_float& x, const quad_float& y);
98long operator< (const quad_float& x, const quad_float& y);
99long operator<=(const quad_float& x, const quad_float& y);
100long operator==(const quad_float& x, const quad_float& y);
101long operator!=(const quad_float& x, const quad_float& y);
102
103long sign(const quad_float& x);  // sign of x, -1, 0, +1
104long compare(const quad_float& x, const quad_float& y); // sign of x - y
105
106// PROMOTIONS: operators >, ..., != and function compare
107// promote double to quad_float on (x, y).
108
109
110/**************************************************************************\
111
112                               Input/Output
113Input Syntax:
114
115<number>: [ "-" ] <unsigned-number>
116<unsigned-number>: <dotted-number> [ <e-part> ] | <e-part>
117<dotted-number>: <digits> | <digits> "." <digits> | "." <digits> | <digits> "."
118<digits>: <digit> <digits> | <digit>
119<digit>: "0" | ... | "9"
120<e-part>: ( "E" | "e" ) [ "+" | "-" ] <digits>
121
122Examples of valid input:
123
12417 1.5 0.5 .5  5.  -.5 e10 e-10 e+10 1.5e10 .5e10 .5E10
125
126Note that the number of decimal digits of precision that are used
127for output can be set to any number p >= 1 by calling
128the routine quad_float::SetOutputPrecision(p). 
129The default value of p is 10.
130The current value of p is returned by a call to quad_float::OutputPrecision().
131
132
133
134\**************************************************************************/
135
136
137istream& operator >> (istream& s, quad_float& x);
138ostream& operator << (ostream& s, const quad_float& x);
139
140
141/**************************************************************************\
142
143                                  Miscellaneous
144
145\**************************************************************************/
146
147
148
149quad_float sqrt(const quad_float& x);
150quad_float floor(const quad_float& x);
151quad_float ceil(const quad_float& x);
152quad_float trunc(const quad_float& x);
153quad_float fabs(const quad_float& x);
154quad_float exp(const quad_float& x);
155quad_float log(const quad_float& x);
156
157
158void power(quad_float& x, const quad_float& a, long e); // x = a^e
159quad_float power(const quad_float& a, long e);
160
161void power2(quad_float& x, long e); // x = 2^e
162quad_float power2_quad_float(long e);
163
164quad_float ldexp(const quad_float& x, long e);  // return x*2^e
165
166long IsFinite(quad_float *x); // checks if x is "finite"   
167                              // pointer is used for compatability with
168                              // IsFinite(double*)
169
170
171void random(quad_float& x);
172quad_float random_quad_float();
173// generate a random quad_float x with 0 <= x <= 1
174
175
176
177
178
179/***********************************************************************\
180
181IMPLEMENTATION DETAILS
182
183A quad_float x is represented as a pair of doubles, x.hi and x.lo,
184such that the number represented by x is x.hi + x.lo, where
185
186   |x.lo| <= 0.5*ulp(x.hi),  (*)
187
188and ulp(y) means "unit in the last place of y". 
189
190For the software to work correctly, IEEE Standard Arithmetic is sufficient. 
191That includes just about every modern computer; the only exception I'm
192aware of is Intel x86 platforms running Linux (but you can still
193use this platform--see below).
194
195Also sufficient is any platform that implements arithmetic with correct
196rounding, i.e., given double floating point numbers a and b, a op b
197is computed exactly and then rounded to the nearest double. 
198The tie-breaking rule is not important.
199
200This is a rather wierd representation;  although it gives one
201essentially twice the precision of an ordinary double, it is
202not really the equivalent of quadratic precision (despite the name).
203For example, the number 1 + 2^{-200} can be represented exactly as
204a quad_float.  Also, there is no real notion of "machine precision".
205
206THE INTEL x86 PROBLEM
207
208Although just about every modern processor implements the IEEE
209floating point standard, there is still can be problems
210on processors that support IEEE extended double precision.
211The only processor I know of that supports this is the x86/Pentium.
212
213While extended double precision may sound like a nice thing,
214it is not.  Normal double precision has 53 bits of precision.
215Extended has 64.  On x86s, the FP registers have 53 or 64 bits
216of precision---this can be set at run-time by modifying
217the cpu "control word" (something that can be done
218only in assembly code).
219However, doubles stored in memory always have only 53 bits.
220Compilers may move values between memory and registers
221whenever they want, which can effectively change the value
222of a floating point number even though at the C/C++ level,
223nothing has happened that should have changed the value.
224Is that sick, or what?
225
226This is a real headache, and if one is not just a bit careful,
227the quad_float code will break.  This breaking is not at all subtle,
228and the program QuadTest will catch the problem if it exists.
229
230You should not need to worry about any of this, because NTL automatically
231detects and works around these problems as best it can, as described below.
232It shouldn't make a mistake, but if it does, you will
233catch it in the QuadTest program.
234If things don't work quite right, you might try
235setting NTL_FIX_X86 or NTL_NO_FIX_X86 flags in ntl_config.h,
236but this should not be necessary.
237
238Here are the details about how NTL fixes the problem.
239
240The first and best way is to have the default setting of the control word
241be 53 bits.  However, you are at the mercy of your platform
242(compiler, OS, run-time libraries).  Windows does this,
243and so the problem simply does not arise here, and NTL neither
244detects nor fixes the problem.  Linux, however, does not do this,
245which really sucks.  Can we talk these Linux people into changing this?
246
247The second way to fix the problem is by having NTL
248fiddle with control word itself.  If you compile NTL using a GNU compiler
249on an x86, this should happen automatically.
250On the one hand, this is not a general, portable solution,
251since it will only work if you use a GNU compiler, or at least one that
252supports GNU 'asm' syntax. 
253On the other hand, almost everybody who compiles C++ on x86/Linux
254platforms uses GNU compilers (although are some commercial
255compilers out there that I don't know too much about).
256
257The third way to fix the problem is to 'force' all intermediate
258floating point results into memory.  This is not an 'ideal' fix,
259since it is not fully equivalent to 53-bit precision (because of
260double rounding), but it works (although to be honest, I've never seen
261a full proof of correctness in this case).
262NTL's quad_float code does this by storing intermediate results
263in local variables declared to be 'volatile'.
264This is the solution to the problem that NTL uses if it detects
265the problem and can't fix it using the GNU 'asm' hack mentioned above.
266This solution should work on any platform that faithfully
267implements 'volatile' according to the ANSI C standard.
268
269
270
271BACKGROUND INFO
272
273The code NTL uses algorithms designed by Knuth, Kahan, Dekker, and
274Linnainmaa.  The original transcription to C++ was done by Douglas
275Priest.  Enhancements and bug fixes were done by Keith Briggs
276(http://epidem13.plantsci.cam.ac.uk/~kbriggs).  The NTL version is a
277stripped down version of Briggs' code, with a couple of bug fixes and
278portability improvements.  Briggs has continued to develop his
279library;  see his web page above for the latest version and more information.
280
281Here is a brief annotated bibliography (compiled by Priest) of papers
282dealing with DP and similar techniques, arranged chronologically.
283
284Kahan, W., Further Remarks on Reducing Truncation Errors,
285  {\it Comm.\ ACM\/} {\bf 8} (1965), 40.
286
287M{\o}ller, O., Quasi Double Precision in Floating-Point Addition,
288  {\it BIT\/} {\bf 5} (1965), 37--50.
289
290  The two papers that first presented the idea of recovering the
291  roundoff of a sum.
292
293Dekker, T., A Floating-Point Technique for Extending the Available
294  Precision, {\it Numer.\ Math.} {\bf 18} (1971), 224--242.
295
296  The classic reference for DP algorithms for sum, product, quotient,
297  and square root.
298
299Pichat, M., Correction d'une Somme en Arithmetique \`a Virgule
300  Flottante, {\it Numer.\ Math.} {\bf 19} (1972), 400--406.
301
302  An iterative algorithm for computing a protracted sum to working
303  precision by repeatedly applying the sum-and-roundoff method.
304
305Linnainmaa, S., Analysis of Some Known Methods of Improving the Accuracy
306  of Floating-Point Sums, {\it BIT\/} {\bf 14} (1974), 167--202.
307
308  Comparison of Kahan and M{\o}ller algorithms with variations given
309  by Knuth.
310
311Bohlender, G., Floating-Point Computation of Functions with Maximum
312  Accuracy, {\it IEEE Trans.\ Comput.} {\bf C-26} (1977), 621--632.
313
314  Extended the analysis of Pichat's algorithm to compute a multi-word
315  representation of the exact sum of n working precision numbers.
316  This is the algorithm Kahan has called "distillation".
317
318Linnainmaa, S., Software for Doubled-Precision Floating-Point Computations,
319  {\it ACM Trans.\ Math.\ Soft.} {\bf 7} (1981), 272--283.
320
321  Generalized the hypotheses of Dekker and showed how to take advantage
322  of extended precision where available.
323
324Leuprecht, H., and W.~Oberaigner, Parallel Algorithms for the Rounding-Exact
325  Summation of Floating-Point Numbers, {\it Computing} {\bf 28} (1982), 89--104.
326
327  Variations of distillation appropriate for parallel and vector
328  architectures.
329
330Kahan, W., Paradoxes in Concepts of Accuracy, lecture notes from Joint
331  Seminar on Issues and Directions in Scientific Computation, Berkeley, 1989.
332
333  Gives the more accurate DP sum I've shown above, discusses some
334  examples.
335
336Priest, D., Algorithms for Arbitrary Precision Floating Point Arithmetic,
337  in P.~Kornerup and D.~Matula, Eds., {\it Proc.\ 10th Symposium on Com-
338  puter Arithmetic}, IEEE Computer Society Press, Los Alamitos, Calif., 1991.
339
340  Extends from DP to arbitrary precision; gives portable algorithms and
341  general proofs.
342
343Sorensen, D., and P.~Tang, On the Orthogonality of Eigenvectors Computed
344  by Divide-and-Conquer Techniques, {\it SIAM J.\ Num.\ Anal.} {\bf 28}
345  (1991), 1752--1775.
346
347  Uses some DP arithmetic to retain orthogonality of eigenvectors
348  computed by a parallel divide-and-conquer scheme.
349
350Priest, D., On Properties of Floating Point Arithmetics: Numerical Stability
351  and the Cost of Accurate Computations, Ph.D. dissertation, University
352  of California at Berkeley, 1992.
353
354  More examples, organizes proofs in terms of common properties of fp
355  addition/subtraction, gives other summation algorithms.
356
357\***********************************************************************/
358
Note: See TracBrowser for help on using the repository browser.