pitimes.c
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
#include <math.h> #include <stdio.h> #include <time.h> #define ITERS 10000000 #define TESTWITH(x) { \ diff = 0.0; \ time1 = clock(); \ for (i = 0; i < ITERS; ++i) \ diff += (x) - M_PI; \ time2 = clock(); \ printf("%s\t=> %e, time => %f\n", #x, diff, diffclock(time2, time1)); \ } static inline double diffclock(clock_t time1, clock_t time0) { return (double) (time1 - time0) / CLOCKS_PER_SEC; } int main() { int i; clock_t time1, time2; double diff; /* Warmup. The atan2 case catches GCC's atan folding (which would * optimise the ``4 * atan(1) - M_PI'' to a no-op), if -fno-builtin * is not used. */ TESTWITH(4 * atan(1)) TESTWITH(4 * atan2(1, 1)) #if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) extern double fldpi(); TESTWITH(fldpi()) #endif /* Actual tests start here. */ TESTWITH(atan2(0, -1)) TESTWITH(acos(-1)) TESTWITH(2 * asin(1)) TESTWITH(4 * atan2(1, 1)) TESTWITH(4 * atan(1)) return 0; }
fldpi.c
1 2 3 4 5 6 7
double fldpi() { double pi; asm("fldpi" : "=t" (pi)); return pi; }
build.sh
1 2 3 4 5 6 7 8 9 10
#!/bin/sh
gcc -O3 -Wall -c -m32 -o fldpi-32.o fldpi.c
gcc -O3 -Wall -c -m64 -o fldpi-64.o fldpi.c
gcc -O3 -Wall -ffast-math -m32 -o pitimes1-32 pitimes.c fldpi-32.o
gcc -O3 -Wall -m32 -o pitimes2-32 pitimes.c fldpi-32.o -lm
gcc -O3 -Wall -fno-builtin -m32 -o pitimes3-32 pitimes.c fldpi-32.o -lm
gcc -O3 -Wall -ffast-math -m64 -o pitimes1-64 pitimes.c fldpi-64.o -lm
gcc -O3 -Wall -m64 -o pitimes2-64 pitimes.c fldpi-64.o -lm
gcc -O3 -Wall -fno-builtin -m64 -o pitimes3-64 pitimes.c fldpi-64.o -lm
Refactorings
No refactoring yet !
evilteach.blogspot.com
August 31, 2008, August 31, 2008 21:53, permalink
The trig functions are limited in the accuracy they provide, that really puts an upper limit on the accuracy you can expect. I would advise going to the wikipedia page on pi. It provides an excellent overview of the topic, with some algorithms that should give better results.
Here is a small erlang program that demonstrates one of the techniques.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
% % A simple test program to calculate pi, using an easy, slow converging formula % -module(pi). -export([main/0]). % The formual we use % pi^2 = 6*(1/(1^2) + 1/(2^2) + 1/(3^2) ...) % When we reach the 0th term, we are done. pi_engine(0, Sum) -> io:format("~20w ~30.25f ~n", [0, Sum]), io:format("The value of Pi is about ~30.25f ~n", [math:sqrt(Sum * 6)]); % Calculate the Nth term pi_engine(N, Sum) when N > 0 -> %io:format("~20w ~30.25f ~n", [N, Sum]), pi_engine(N - 1, Sum + (1 / (N * N))); pi_engine(N, _Sum) when N < 0 -> io:format("A negative number of terms does not make sense~w ~n", [N]). % provide an interface into the engine, as we must pass the sum along in order to % be able to accumulate the running total pi(TermsToCalculate) -> InitialValue = 0.0, pi_engine(TermsToCalculate, InitialValue). main() -> io:format("main begins ~n", []), % Decide how many terms should be used for this run TermsToCalculate = 100000, % Perform the calculation, and display the result pi(TermsToCalculate), io:format("main ends ~n", []).
Chris Jester-Young
August 31, 2008, August 31, 2008 22:34, permalink
@evilteach: Thanks for that. My question was going for the fastest double-precision method applicable, however, your approach of going for more precision is nice too.
I noticed that your final answer has a sqrt in it. Do you have an arbitrary-precision sqrt function? Does Erlang provide one, or would you have to do an iterative approximation by hand?
I've taken the liberty of taking your code and marking it as Erlang, so that the syntax highlighting works better for it. Hope you don't mind! :-)
evilteach's code formatted as Erlang
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
% % A simple test program to calculate pi, using an easy, slow converging formula % -module(pi). -export([main/0]). % The formual we use % pi^2 = 6*(1/(1^2) + 1/(2^2) + 1/(3^2) ...) % When we reach the 0th term, we are done. pi_engine(0, Sum) -> io:format("~20w ~30.25f ~n", [0, Sum]), io:format("The value of Pi is about ~30.25f ~n", [math:sqrt(Sum * 6)]); % Calculate the Nth term pi_engine(N, Sum) when N > 0 -> %io:format("~20w ~30.25f ~n", [N, Sum]), pi_engine(N - 1, Sum + (1 / (N * N))); pi_engine(N, _Sum) when N < 0 -> io:format("A negative number of terms does not make sense~w ~n", [N]). % provide an interface into the engine, as we must pass the sum along in order to % be able to accumulate the running total pi(TermsToCalculate) -> InitialValue = 0.0, pi_engine(TermsToCalculate, InitialValue). main() -> io:format("main begins ~n", []), % Decide how many terms should be used for this run TermsToCalculate = 100000, % Perform the calculation, and display the result pi(TermsToCalculate), io:format("main ends ~n", []).
EvilTeach
September 1, 2008, September 01, 2008 12:45, permalink
Ya, off the top of my head, I don't know if sqrt would give good precision.
The purpose of the program was to demonstrate the technique.
However, if you are only need double precision, I kind of like this implementation.
1 2 3 4 5 6 7 8
double pi ( void ) { return 3.1415926535897932; }
Solutions welcome in any language. :-) I'm looking for the fastest way to obtain the value of pi, as a personal challenge. More specifically I'm using ways that don't involve using #defined constants like M_PI, or hardcoding the number in.
The program below tests the various ways I know of. The inline assembly version is, in theory, the fastest option, though clearly not portable; I've included it as a baseline to compare the other versions against. In my tests, with builtins, the 4 * atan(1) version is fastest on GCC 4.2, because it auto-folds the atan(1) into a constant. With -fno-builtin specified, the atan2(0, -1) version is fastest.
Apart from testing between various compiler flags (I've compared 32-bit against 64-bit too, because the optimisations are different), I've also tried switching the order of the tests around. The atan2(0, -1) version still comes out top every time, though.
I'm keen to hear what results you have, as well as improvements to the testing process. :-)
ETA: I noticed that the inline assembly was treated as a constant and hoisted out of the loop. Thus, I've included the fldpi in a separate file, so that it gets treated like a function call. As a bonus, now the frame setup times get counted too, just like all the other function calls.