1 | /* Create tuned thresholds for various algorithms. |
---|
2 | |
---|
3 | Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
---|
4 | |
---|
5 | This file is part of the GNU MP Library. |
---|
6 | |
---|
7 | The GNU MP Library is free software; you can redistribute it and/or modify |
---|
8 | it under the terms of the GNU Lesser General Public License as published by |
---|
9 | the Free Software Foundation; either version 2.1 of the License, or (at your |
---|
10 | option) any later version. |
---|
11 | |
---|
12 | The GNU MP Library is distributed in the hope that it will be useful, but |
---|
13 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
---|
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
---|
15 | License for more details. |
---|
16 | |
---|
17 | You should have received a copy of the GNU Lesser General Public License |
---|
18 | along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
---|
19 | the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
---|
20 | MA 02111-1307, USA. */ |
---|
21 | |
---|
22 | |
---|
23 | /* Usage: tune [-t] [-t] [-p precision] |
---|
24 | |
---|
25 | -t turns on some diagnostic traces, a second -t turns on more traces. |
---|
26 | |
---|
27 | The thresholds are determined as follows. A crossover may not be a |
---|
28 | single size but rather a range where it oscillates between method A or |
---|
29 | method B faster. If the threshold is set making B used where A is faster |
---|
30 | (or vice versa) that's bad. Badness is the percentage time lost and |
---|
31 | total badness is the sum of this over all sizes measured. The threshold |
---|
32 | is set to minimize total badness. |
---|
33 | |
---|
34 | Suppose, as sizes increase, method B becomes faster than method A. The |
---|
35 | effect of the rule is that, as you look at increasing sizes, isolated |
---|
36 | points where B is faster are ignored, but when it's consistently faster, |
---|
37 | or faster on balance, then the threshold is set there. The same result |
---|
38 | is obtained thinking in the other direction of A becoming faster at |
---|
39 | smaller sizes. |
---|
40 | |
---|
41 | In practice the thresholds tend to be chosen to bring on the next |
---|
42 | algorithm fairly quickly. |
---|
43 | |
---|
44 | This rule is attractive because it's got a basis in reason and is fairly |
---|
45 | easy to implement, but no work has been done to actually compare it in |
---|
46 | absolute terms to other possibilities. |
---|
47 | |
---|
48 | Sometimes running the program twice produces slightly different results. |
---|
49 | This is probably because there's so little separating algorithms near |
---|
50 | their crossover, and on that basis it should make little or no difference |
---|
51 | to the final speed of the relevant routines, but nothing has been done to |
---|
52 | check that carefully. |
---|
53 | |
---|
54 | Remarks: |
---|
55 | |
---|
56 | The code here isn't a vision of loveliness, mainly because it's subject |
---|
57 | to ongoing modifications according to new things wanting to be tuned and |
---|
58 | practical requirements of systems tested. |
---|
59 | |
---|
60 | The way parts of the library are recompiled to insinuate the tuning |
---|
61 | variables is a bit subtle, but unavoidable since of course the main |
---|
62 | library has fixed thresholds compiled-in but we want to vary them here. |
---|
63 | Most of the nonsense for this can be found in tune/Makefile.am and under |
---|
64 | TUNE_PROGRAM_BUILD in gmp-impl.h. |
---|
65 | |
---|
66 | The dirty hack which the "second_start_min" feature could perhaps be done |
---|
67 | more generally, so if say karatsuba is never better than toom3 then it |
---|
68 | can be detected and omitted. Currently we're hoping very hard that this |
---|
69 | doesn't arise in practice, and if it does then it indicates something |
---|
70 | badly sub-optimal in the karatsuba implementation. |
---|
71 | |
---|
72 | Limitations: |
---|
73 | |
---|
74 | The FFTs aren't subject to the same badness rule as the other thresholds, |
---|
75 | so each k is probably being brought on a touch early. This isn't likely |
---|
76 | to make a difference, and the simpler probing means fewer tests. |
---|
77 | |
---|
78 | */ |
---|
79 | |
---|
80 | #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ |
---|
81 | |
---|
82 | #include "config.h" |
---|
83 | |
---|
84 | #include <math.h> |
---|
85 | #include <stdio.h> |
---|
86 | #include <stdlib.h> |
---|
87 | #include <time.h> |
---|
88 | #if HAVE_UNISTD_H |
---|
89 | #include <unistd.h> |
---|
90 | #endif |
---|
91 | |
---|
92 | #include "gmp.h" |
---|
93 | #include "gmp-impl.h" |
---|
94 | #include "longlong.h" |
---|
95 | |
---|
96 | #include "tests.h" |
---|
97 | #include "speed.h" |
---|
98 | |
---|
99 | #if !HAVE_DECL_OPTARG |
---|
100 | extern char *optarg; |
---|
101 | extern int optind, opterr; |
---|
102 | #endif |
---|
103 | |
---|
104 | |
---|
105 | #define DEFAULT_MAX_SIZE 1000 /* limbs */ |
---|
106 | #define MAX_TABLE 5 /* entries */ |
---|
107 | |
---|
108 | #if WANT_FFT |
---|
109 | mp_size_t option_fft_max_size = 50000; /* limbs */ |
---|
110 | #else |
---|
111 | mp_size_t option_fft_max_size = 0; |
---|
112 | #endif |
---|
113 | int option_trace = 0; |
---|
114 | int option_fft_trace = 0; |
---|
115 | struct speed_params s; |
---|
116 | |
---|
117 | struct dat_t { |
---|
118 | mp_size_t size; |
---|
119 | double d; |
---|
120 | } *dat = NULL; |
---|
121 | int ndat = 0; |
---|
122 | int allocdat = 0; |
---|
123 | |
---|
124 | |
---|
125 | /* Each "_threshold" array must be 1 bigger than the number of thresholds |
---|
126 | being tuned in a set, because one() stores a value in the entry above |
---|
127 | the one it's determining. */ |
---|
128 | |
---|
129 | mp_size_t mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX }; |
---|
130 | mp_size_t sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX }; |
---|
131 | mp_size_t sb_preinv_threshold[2] = { MP_SIZE_T_MAX }; |
---|
132 | mp_size_t dc_threshold[2] = { MP_SIZE_T_MAX }; |
---|
133 | mp_size_t powm_threshold[2] = { MP_SIZE_T_MAX }; |
---|
134 | mp_size_t gcd_accel_threshold[2] = { MP_SIZE_T_MAX }; |
---|
135 | mp_size_t gcdext_threshold[2] = { MP_SIZE_T_MAX }; |
---|
136 | mp_size_t divexact_1_threshold[2] = { MP_SIZE_T_MAX }; |
---|
137 | mp_size_t divrem_1_norm_threshold[2] = { MP_SIZE_T_MAX }; |
---|
138 | mp_size_t divrem_1_unnorm_threshold[2] = { MP_SIZE_T_MAX }; |
---|
139 | mp_size_t divrem_2_threshold[2] = { MP_SIZE_T_MAX }; |
---|
140 | mp_size_t mod_1_norm_threshold[2] = { MP_SIZE_T_MAX }; |
---|
141 | mp_size_t mod_1_unnorm_threshold[2] = { MP_SIZE_T_MAX }; |
---|
142 | mp_size_t modexact_1_odd_threshold[2] = { MP_SIZE_T_MAX }; |
---|
143 | mp_size_t get_str_basecase_threshold[2] = { MP_SIZE_T_MAX }; |
---|
144 | mp_size_t get_str_precompute_threshold[2] = { MP_SIZE_T_MAX }; |
---|
145 | mp_size_t set_str_threshold[2] = { MP_SIZE_T_MAX }; |
---|
146 | |
---|
147 | mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; |
---|
148 | mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; |
---|
149 | |
---|
150 | #ifndef TUNE_SQR_KARATSUBA_MAX |
---|
151 | #define TUNE_SQR_KARATSUBA_MAX 0 /* meaning no limit */ |
---|
152 | #endif |
---|
153 | |
---|
154 | struct param_t { |
---|
155 | const char *name[MAX_TABLE]; |
---|
156 | speed_function_t function; |
---|
157 | speed_function_t function2; |
---|
158 | double step_factor; /* how much to step sizes (rounded down) */ |
---|
159 | double function_fudge; /* multiplier for "function" speeds */ |
---|
160 | int stop_since_change; |
---|
161 | double stop_factor; |
---|
162 | mp_size_t min_size[MAX_TABLE]; |
---|
163 | int min_is_always; |
---|
164 | int second_start_min; |
---|
165 | mp_size_t max_size[MAX_TABLE]; |
---|
166 | mp_size_t check_size; |
---|
167 | mp_size_t size_extra; |
---|
168 | |
---|
169 | #define DATA_HIGH_LT_R 1 |
---|
170 | #define DATA_HIGH_GE_R 2 |
---|
171 | int data_high; |
---|
172 | |
---|
173 | int noprint; |
---|
174 | }; |
---|
175 | |
---|
176 | #ifndef UDIV_PREINV_ALWAYS |
---|
177 | #define UDIV_PREINV_ALWAYS 0 |
---|
178 | #endif |
---|
179 | |
---|
180 | |
---|
181 | mp_limb_t |
---|
182 | randlimb_norm (void) |
---|
183 | { |
---|
184 | mp_limb_t n; |
---|
185 | mpn_random (&n, 1); |
---|
186 | n |= GMP_LIMB_HIGHBIT; |
---|
187 | return n; |
---|
188 | } |
---|
189 | |
---|
190 | #define MP_LIMB_T_HALFMASK ((CNST_LIMB(1) << (BITS_PER_MP_LIMB/2)) - 1) |
---|
191 | |
---|
192 | mp_limb_t |
---|
193 | randlimb_half (void) |
---|
194 | { |
---|
195 | mp_limb_t n; |
---|
196 | mpn_random (&n, 1); |
---|
197 | n &= MP_LIMB_T_HALFMASK; |
---|
198 | n += (n==0); |
---|
199 | return n; |
---|
200 | } |
---|
201 | |
---|
202 | |
---|
203 | /* Add an entry to the end of the dat[] array, reallocing to make it bigger |
---|
204 | if necessary. */ |
---|
205 | void |
---|
206 | add_dat (mp_size_t size, double d) |
---|
207 | { |
---|
208 | #define ALLOCDAT_STEP 500 |
---|
209 | |
---|
210 | ASSERT_ALWAYS (ndat <= allocdat); |
---|
211 | |
---|
212 | if (ndat == allocdat) |
---|
213 | { |
---|
214 | dat = (struct dat_t *) __gmp_allocate_or_reallocate |
---|
215 | (dat, allocdat * sizeof(dat[0]), |
---|
216 | (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); |
---|
217 | allocdat += ALLOCDAT_STEP; |
---|
218 | } |
---|
219 | |
---|
220 | dat[ndat].size = size; |
---|
221 | dat[ndat].d = d; |
---|
222 | ndat++; |
---|
223 | } |
---|
224 | |
---|
225 | |
---|
226 | /* Return the threshold size based on the data accumulated. */ |
---|
227 | mp_size_t |
---|
228 | analyze_dat (int i, int final) |
---|
229 | { |
---|
230 | double x, min_x; |
---|
231 | int j, min_j; |
---|
232 | |
---|
233 | /* If the threshold is set at dat[0].size, any positive values are bad. */ |
---|
234 | x = 0.0; |
---|
235 | for (j = 0; j < ndat; j++) |
---|
236 | if (dat[j].d > 0.0) |
---|
237 | x += dat[j].d; |
---|
238 | |
---|
239 | if (option_trace >= 2 && final) |
---|
240 | { |
---|
241 | printf ("\n"); |
---|
242 | printf ("x is the sum of the badness from setting thresh at given size\n"); |
---|
243 | printf (" (minimum x is sought)\n"); |
---|
244 | printf ("i=%d size=%ld first x=%.4f\n", i, dat[j].size, x); |
---|
245 | } |
---|
246 | |
---|
247 | min_x = x; |
---|
248 | min_j = 0; |
---|
249 | |
---|
250 | |
---|
251 | /* When stepping to the next dat[j].size, positive values are no longer |
---|
252 | bad (so subtracted), negative values become bad (so add the absolute |
---|
253 | value, meaning subtract). */ |
---|
254 | for (j = 0; j < ndat; x -= dat[j].d, j++) |
---|
255 | { |
---|
256 | if (option_trace >= 2 && final) |
---|
257 | printf ("i=%d size=%ld x=%.4f\n", i, dat[j].size, x); |
---|
258 | |
---|
259 | if (x < min_x) |
---|
260 | { |
---|
261 | min_x = x; |
---|
262 | min_j = j; |
---|
263 | } |
---|
264 | } |
---|
265 | |
---|
266 | return min_j; |
---|
267 | } |
---|
268 | |
---|
269 | |
---|
270 | /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */ |
---|
271 | |
---|
272 | mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize, |
---|
273 | mp_srcptr ap, mp_size_t size, |
---|
274 | mp_limb_t d)); |
---|
275 | mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d)); |
---|
276 | |
---|
277 | double |
---|
278 | speed_mpn_mod_1_tune (struct speed_params *s) |
---|
279 | { |
---|
280 | SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); |
---|
281 | } |
---|
282 | double |
---|
283 | speed_mpn_divrem_1_tune (struct speed_params *s) |
---|
284 | { |
---|
285 | SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); |
---|
286 | } |
---|
287 | |
---|
288 | |
---|
289 | double |
---|
290 | tuneup_measure (speed_function_t fun, |
---|
291 | const struct param_t *param, |
---|
292 | struct speed_params *s) |
---|
293 | { |
---|
294 | static struct param_t dummy; |
---|
295 | double t; |
---|
296 | TMP_DECL (marker); |
---|
297 | |
---|
298 | if (! param) |
---|
299 | param = &dummy; |
---|
300 | |
---|
301 | s->size += param->size_extra; |
---|
302 | |
---|
303 | TMP_MARK (marker); |
---|
304 | s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0); |
---|
305 | s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0); |
---|
306 | |
---|
307 | mpn_random (s->xp, s->size); |
---|
308 | mpn_random (s->yp, s->size); |
---|
309 | |
---|
310 | switch (param->data_high) { |
---|
311 | case DATA_HIGH_LT_R: |
---|
312 | s->xp[s->size-1] %= s->r; |
---|
313 | s->yp[s->size-1] %= s->r; |
---|
314 | break; |
---|
315 | case DATA_HIGH_GE_R: |
---|
316 | s->xp[s->size-1] |= s->r; |
---|
317 | s->yp[s->size-1] |= s->r; |
---|
318 | break; |
---|
319 | } |
---|
320 | |
---|
321 | t = speed_measure (fun, s); |
---|
322 | |
---|
323 | s->size -= param->size_extra; |
---|
324 | |
---|
325 | TMP_FREE (marker); |
---|
326 | return t; |
---|
327 | } |
---|
328 | |
---|
329 | |
---|
330 | #define PRINT_WIDTH 28 |
---|
331 | |
---|
332 | void |
---|
333 | print_define_start (const char *name) |
---|
334 | { |
---|
335 | printf ("#define %-*s ", PRINT_WIDTH, name); |
---|
336 | if (option_trace) |
---|
337 | printf ("...\n"); |
---|
338 | } |
---|
339 | |
---|
340 | void |
---|
341 | print_define_end_remark (const char *name, mp_size_t value, const char *remark) |
---|
342 | { |
---|
343 | if (option_trace) |
---|
344 | printf ("#define %-*s ", PRINT_WIDTH, name); |
---|
345 | |
---|
346 | if (value == MP_SIZE_T_MAX) |
---|
347 | printf ("MP_SIZE_T_MAX"); |
---|
348 | else |
---|
349 | printf ("%5ld", value); |
---|
350 | |
---|
351 | if (remark != NULL) |
---|
352 | printf (" /* %s */", remark); |
---|
353 | printf ("\n"); |
---|
354 | } |
---|
355 | |
---|
356 | void |
---|
357 | print_define_end (const char *name, mp_size_t value) |
---|
358 | { |
---|
359 | const char *remark; |
---|
360 | if (value == MP_SIZE_T_MAX) |
---|
361 | remark = "never"; |
---|
362 | else if (value == 0) |
---|
363 | remark = "always"; |
---|
364 | else |
---|
365 | remark = NULL; |
---|
366 | print_define_end_remark (name, value, remark); |
---|
367 | } |
---|
368 | |
---|
369 | void |
---|
370 | print_define (const char *name, mp_size_t value) |
---|
371 | { |
---|
372 | print_define_start (name); |
---|
373 | print_define_end (name, value); |
---|
374 | } |
---|
375 | |
---|
376 | void |
---|
377 | print_define_remark (const char *name, mp_size_t value, const char *remark) |
---|
378 | { |
---|
379 | print_define_start (name); |
---|
380 | print_define_end_remark (name, value, remark); |
---|
381 | } |
---|
382 | |
---|
383 | |
---|
384 | /* table[i+1] needs to be set to a sensible value when testing method i+1 |
---|
385 | because mpn_mul_n uses MUL_TOOM3_THRESHOLD to size the temporary |
---|
386 | workspace for mpn_kara_mul_n. */ |
---|
387 | |
---|
388 | void |
---|
389 | one (mp_size_t table[], size_t max_table, struct param_t *param) |
---|
390 | { |
---|
391 | mp_size_t table_save0 = 0; |
---|
392 | int since_positive, since_thresh_change; |
---|
393 | int thresh_idx, new_thresh_idx; |
---|
394 | int i; |
---|
395 | |
---|
396 | ASSERT_ALWAYS (max_table <= MAX_TABLE); |
---|
397 | |
---|
398 | #define DEFAULT(x,n) if (! (param->x)) param->x = (n); |
---|
399 | |
---|
400 | DEFAULT (function_fudge, 1.0); |
---|
401 | DEFAULT (function2, param->function); |
---|
402 | DEFAULT (step_factor, 0.01); /* small steps by default */ |
---|
403 | DEFAULT (stop_since_change, 80); |
---|
404 | DEFAULT (stop_factor, 1.2); |
---|
405 | for (i = 0; i < max_table; i++) |
---|
406 | DEFAULT (min_size[i], 10); |
---|
407 | for (i = 0; i < max_table; i++) |
---|
408 | DEFAULT (max_size[i], DEFAULT_MAX_SIZE); |
---|
409 | |
---|
410 | if (param->check_size != 0) |
---|
411 | { |
---|
412 | double t1, t2; |
---|
413 | s.size = param->check_size; |
---|
414 | |
---|
415 | table[0] = s.size+1; |
---|
416 | table[1] = param->max_size[0]; |
---|
417 | t1 = tuneup_measure (param->function, param, &s); |
---|
418 | |
---|
419 | table[0] = s.size; |
---|
420 | table[1] = s.size+1; |
---|
421 | t2 = tuneup_measure (param->function2, param, &s); |
---|
422 | if (t1 == -1.0 || t2 == -1.0) |
---|
423 | { |
---|
424 | printf ("Oops, can't run both functions at size %ld\n", s.size); |
---|
425 | abort (); |
---|
426 | } |
---|
427 | t1 *= param->function_fudge; |
---|
428 | |
---|
429 | /* ask that t2 is at least 4% below t1 */ |
---|
430 | if (t1 < t2*1.04) |
---|
431 | { |
---|
432 | if (option_trace) |
---|
433 | printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); |
---|
434 | table[0] = MP_SIZE_T_MAX; |
---|
435 | if (! param->noprint) |
---|
436 | print_define (param->name[0], table[0]); |
---|
437 | return; |
---|
438 | } |
---|
439 | |
---|
440 | if (option_trace >= 2) |
---|
441 | printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", |
---|
442 | s.size, t1, t2); |
---|
443 | } |
---|
444 | |
---|
445 | for (i = 0, s.size = 1; i < max_table && s.size < param->max_size[i]; i++) |
---|
446 | { |
---|
447 | if (i == 1 && param->second_start_min) |
---|
448 | s.size = 1; |
---|
449 | |
---|
450 | if (s.size < param->min_size[i]) |
---|
451 | s.size = param->min_size[i]; |
---|
452 | |
---|
453 | if (! (param->noprint || (i == 1 && param->second_start_min))) |
---|
454 | print_define_start (param->name[i]); |
---|
455 | |
---|
456 | ndat = 0; |
---|
457 | since_positive = 0; |
---|
458 | since_thresh_change = 0; |
---|
459 | thresh_idx = 0; |
---|
460 | |
---|
461 | if (option_trace >= 2) |
---|
462 | { |
---|
463 | printf (" algorithm-A algorithm-B ratio possible\n"); |
---|
464 | printf (" (seconds) (seconds) diff thresh\n"); |
---|
465 | } |
---|
466 | |
---|
467 | for (; |
---|
468 | s.size < param->max_size[i]; |
---|
469 | s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1)) |
---|
470 | { |
---|
471 | double ti, tiplus1, d; |
---|
472 | |
---|
473 | /* If there's a size limit and it's reached then it should still |
---|
474 | be sensible to analyze the data since we want the threshold put |
---|
475 | either at or near the limit. */ |
---|
476 | if (s.size >= param->max_size[i]) |
---|
477 | { |
---|
478 | if (option_trace) |
---|
479 | printf ("Reached maximum size (%ld) without otherwise stopping\n", |
---|
480 | param->max_size[i]); |
---|
481 | break; |
---|
482 | } |
---|
483 | |
---|
484 | /* |
---|
485 | FIXME: check minimum size requirements are met, possibly by just |
---|
486 | checking for the -1 returns from the speed functions. |
---|
487 | */ |
---|
488 | |
---|
489 | /* under this hack, don't let method 0 get used at s.size */ |
---|
490 | if (i == 1 && param->second_start_min) |
---|
491 | table[0] = MIN (s.size-1, table_save0); |
---|
492 | |
---|
493 | /* using method i at this size */ |
---|
494 | table[i] = s.size+1; |
---|
495 | table[i+1] = param->max_size[i]; |
---|
496 | ti = tuneup_measure (param->function, param, &s); |
---|
497 | if (ti == -1.0) |
---|
498 | abort (); |
---|
499 | ti *= param->function_fudge; |
---|
500 | |
---|
501 | /* using method i+1 at this size */ |
---|
502 | table[i] = s.size; |
---|
503 | table[i+1] = s.size+1; |
---|
504 | tiplus1 = tuneup_measure (param->function2, param, &s); |
---|
505 | if (tiplus1 == -1.0) |
---|
506 | abort (); |
---|
507 | |
---|
508 | /* Calculate the fraction by which the one or the other routine is |
---|
509 | slower. */ |
---|
510 | if (tiplus1 >= ti) |
---|
511 | d = (tiplus1 - ti) / tiplus1; /* negative */ |
---|
512 | else |
---|
513 | d = (tiplus1 - ti) / ti; /* positive */ |
---|
514 | |
---|
515 | add_dat (s.size, d); |
---|
516 | |
---|
517 | new_thresh_idx = analyze_dat (i, 0); |
---|
518 | |
---|
519 | |
---|
520 | if (option_trace >= 2) |
---|
521 | printf ("i=%d size=%ld %.9f %.9f % .4f %c %ld\n", |
---|
522 | i, s.size, ti, tiplus1, d, |
---|
523 | ti > tiplus1 ? '#' : ' ', |
---|
524 | dat[new_thresh_idx].size); |
---|
525 | |
---|
526 | /* Stop if the last time method i was faster was more than a |
---|
527 | certain number of measurements ago. */ |
---|
528 | #define STOP_SINCE_POSITIVE 200 |
---|
529 | if (d >= 0) |
---|
530 | since_positive = 0; |
---|
531 | else |
---|
532 | if (++since_positive > STOP_SINCE_POSITIVE) |
---|
533 | { |
---|
534 | if (option_trace >= 1) |
---|
535 | printf ("i=%d stopped due to since_positive (%d)\n", |
---|
536 | i, STOP_SINCE_POSITIVE); |
---|
537 | break; |
---|
538 | } |
---|
539 | |
---|
540 | /* Stop if method i has become slower by a certain factor. */ |
---|
541 | if (ti >= tiplus1 * param->stop_factor) |
---|
542 | { |
---|
543 | if (option_trace >= 1) |
---|
544 | printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n", |
---|
545 | i, param->stop_factor); |
---|
546 | break; |
---|
547 | } |
---|
548 | |
---|
549 | /* Stop if the threshold implied hasn't changed in a certain |
---|
550 | number of measurements. (It's this condition that ususally |
---|
551 | stops the loop.) */ |
---|
552 | if (thresh_idx != new_thresh_idx) |
---|
553 | since_thresh_change = 0, thresh_idx = new_thresh_idx; |
---|
554 | else |
---|
555 | if (++since_thresh_change > param->stop_since_change) |
---|
556 | { |
---|
557 | if (option_trace >= 1) |
---|
558 | printf ("i=%d stopped due to since_thresh_change (%d)\n", |
---|
559 | i, param->stop_since_change); |
---|
560 | break; |
---|
561 | } |
---|
562 | |
---|
563 | /* Stop if the threshold implied is more than a certain number of |
---|
564 | measurements ago. */ |
---|
565 | #define STOP_SINCE_AFTER 500 |
---|
566 | if (ndat - thresh_idx > STOP_SINCE_AFTER) |
---|
567 | { |
---|
568 | if (option_trace >= 1) |
---|
569 | printf ("i=%d stopped due to ndat - thresh_idx > amount (%d)\n", |
---|
570 | i, STOP_SINCE_AFTER); |
---|
571 | break; |
---|
572 | } |
---|
573 | } |
---|
574 | |
---|
575 | /* Stop when the size limit is reached before the end of the |
---|
576 | crossover, but only show this as an error for >= the default max |
---|
577 | size. FIXME: Maybe should make it a param choice whether this is |
---|
578 | an error. */ |
---|
579 | if (s.size >= param->max_size[i] |
---|
580 | && param->max_size[i] >= DEFAULT_MAX_SIZE) |
---|
581 | { |
---|
582 | fprintf (stderr, "%s\n", param->name[i]); |
---|
583 | fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n", |
---|
584 | i, dat[0].size, dat[ndat-1].size, ndat); |
---|
585 | fprintf (stderr, " max size reached before end of crossover\n"); |
---|
586 | break; |
---|
587 | } |
---|
588 | |
---|
589 | if (option_trace >= 1) |
---|
590 | printf ("i=%d sizes %ld to %ld total %d measurements\n", |
---|
591 | i, dat[0].size, dat[ndat-1].size, ndat); |
---|
592 | |
---|
593 | if (ndat == 0) |
---|
594 | break; |
---|
595 | |
---|
596 | table[i] = dat[analyze_dat (i, 1)].size; |
---|
597 | |
---|
598 | /* fudge here, let min_is_always apply only to i==0, that's what the |
---|
599 | sqr_n thresholds want */ |
---|
600 | if (i == 0 && param->min_is_always && table[i] == param->min_size[i]) |
---|
601 | table[i] = 0; |
---|
602 | |
---|
603 | /* under the second_start_min fudge, if the second threshold turns out |
---|
604 | to be lower than the first, then the second method is unwanted, we |
---|
605 | should go straight from algorithm 1 to algorithm 3. */ |
---|
606 | if (param->second_start_min) |
---|
607 | { |
---|
608 | if (i == 0) |
---|
609 | { |
---|
610 | table_save0 = table[0]; |
---|
611 | table[0] = 0; |
---|
612 | } |
---|
613 | else if (i == 1) |
---|
614 | { |
---|
615 | table[0] = table_save0; |
---|
616 | if (table[1] <= table[0]) |
---|
617 | { |
---|
618 | table[0] = table[1]; |
---|
619 | table[1] = 0; |
---|
620 | } |
---|
621 | } |
---|
622 | s.size = MAX (table[0], table[1]) + 1; |
---|
623 | } |
---|
624 | |
---|
625 | if (! (param->noprint || (i == 0 && param->second_start_min))) |
---|
626 | { |
---|
627 | if (i == 1 && param->second_start_min) |
---|
628 | { |
---|
629 | print_define_end (param->name[0], table[0]); |
---|
630 | print_define_start (param->name[1]); |
---|
631 | } |
---|
632 | |
---|
633 | print_define_end (param->name[i], table[i]); |
---|
634 | } |
---|
635 | |
---|
636 | /* Look for the next threshold starting from the current one. */ |
---|
637 | s.size = table[i]+1; |
---|
638 | |
---|
639 | /* Take a MAX of all to allow for second_start_min producing a 0. */ |
---|
640 | { |
---|
641 | int j; |
---|
642 | for (j = 0; j < i; j++) |
---|
643 | s.size = MAX (s.size, table[j]+1); |
---|
644 | } |
---|
645 | } |
---|
646 | } |
---|
647 | |
---|
648 | |
---|
649 | /* Special probing for the fft thresholds. The size restrictions on the |
---|
650 | FFTs mean the graph of time vs size has a step effect. See this for |
---|
651 | example using |
---|
652 | |
---|
653 | ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 |
---|
654 | gnuplot foo.gnuplot |
---|
655 | |
---|
656 | The current approach is to compare routines at the midpoint of relevant |
---|
657 | steps. Arguably a more sophisticated system of threshold data is wanted |
---|
658 | if this step effect remains. */ |
---|
659 | |
---|
660 | struct fft_param_t { |
---|
661 | const char *table_name; |
---|
662 | const char *threshold_name; |
---|
663 | const char *modf_threshold_name; |
---|
664 | mp_size_t *p_threshold; |
---|
665 | mp_size_t *p_modf_threshold; |
---|
666 | mp_size_t first_size; |
---|
667 | mp_size_t max_size; |
---|
668 | speed_function_t function; |
---|
669 | speed_function_t mul_function; |
---|
670 | mp_size_t sqr; |
---|
671 | }; |
---|
672 | |
---|
673 | |
---|
674 | /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with |
---|
675 | N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple |
---|
676 | of 2^(k-1) bits. */ |
---|
677 | |
---|
678 | mp_size_t |
---|
679 | fft_step_size (int k) |
---|
680 | { |
---|
681 | mp_size_t step; |
---|
682 | |
---|
683 | step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB; |
---|
684 | step *= (mp_size_t) 1 << k; |
---|
685 | |
---|
686 | if (step <= 0) |
---|
687 | { |
---|
688 | printf ("Can't handle k=%d\n", k); |
---|
689 | abort (); |
---|
690 | } |
---|
691 | |
---|
692 | return step; |
---|
693 | } |
---|
694 | |
---|
695 | mp_size_t |
---|
696 | fft_next_size (mp_size_t pl, int k) |
---|
697 | { |
---|
698 | mp_size_t m = fft_step_size (k); |
---|
699 | |
---|
700 | /* printf ("[k=%d %ld] %ld ->", k, m, pl); */ |
---|
701 | |
---|
702 | if (pl == 0 || (pl & (m-1)) != 0) |
---|
703 | pl = (pl | (m-1)) + 1; |
---|
704 | |
---|
705 | /* printf (" %ld\n", pl); */ |
---|
706 | return pl; |
---|
707 | } |
---|
708 | |
---|
709 | void |
---|
710 | fft (struct fft_param_t *p) |
---|
711 | { |
---|
712 | mp_size_t size; |
---|
713 | int i, k; |
---|
714 | |
---|
715 | for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++) |
---|
716 | mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX; |
---|
717 | |
---|
718 | *p->p_threshold = MP_SIZE_T_MAX; |
---|
719 | *p->p_modf_threshold = MP_SIZE_T_MAX; |
---|
720 | |
---|
721 | option_trace = MAX (option_trace, option_fft_trace); |
---|
722 | |
---|
723 | printf ("#define %s {", p->table_name); |
---|
724 | if (option_trace >= 2) |
---|
725 | printf ("\n"); |
---|
726 | |
---|
727 | k = FFT_FIRST_K; |
---|
728 | size = p->first_size; |
---|
729 | for (;;) |
---|
730 | { |
---|
731 | double tk, tk1; |
---|
732 | |
---|
733 | size = fft_next_size (size+1, k+1); |
---|
734 | |
---|
735 | if (size >= p->max_size) |
---|
736 | break; |
---|
737 | if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr])) |
---|
738 | break; |
---|
739 | |
---|
740 | /* compare k to k+1 in the middle of the current k+1 step */ |
---|
741 | s.size = size + fft_step_size (k+1) / 2; |
---|
742 | s.r = k; |
---|
743 | tk = tuneup_measure (p->function, NULL, &s); |
---|
744 | if (tk == -1.0) |
---|
745 | abort (); |
---|
746 | |
---|
747 | s.r = k+1; |
---|
748 | tk1 = tuneup_measure (p->function, NULL, &s); |
---|
749 | if (tk1 == -1.0) |
---|
750 | abort (); |
---|
751 | |
---|
752 | if (option_trace >= 2) |
---|
753 | printf ("at %ld size=%ld k=%d %.9f k=%d %.9f\n", |
---|
754 | size, s.size, k, tk, k+1, tk1); |
---|
755 | |
---|
756 | /* declare the k+1 threshold as soon as it's faster at its midpoint */ |
---|
757 | if (tk1 < tk) |
---|
758 | { |
---|
759 | mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size; |
---|
760 | printf (" %ld,", s.size); |
---|
761 | if (option_trace >= 2) printf ("\n"); |
---|
762 | k++; |
---|
763 | } |
---|
764 | } |
---|
765 | |
---|
766 | mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0; |
---|
767 | printf (" 0 }\n"); |
---|
768 | |
---|
769 | |
---|
770 | size = p->first_size; |
---|
771 | |
---|
772 | /* Declare an FFT faster than a plain toom3 etc multiplication found as |
---|
773 | soon as one faster measurement obtained. A multiplication in the |
---|
774 | middle of the FFT step is tested. */ |
---|
775 | for (;;) |
---|
776 | { |
---|
777 | int modf = (*p->p_modf_threshold == MP_SIZE_T_MAX); |
---|
778 | double tk, tm; |
---|
779 | |
---|
780 | /* k=7 should be the first FFT which can beat toom3 on a full |
---|
781 | multiply, so jump to that threshold and save some probing after the |
---|
782 | modf threshold is found. */ |
---|
783 | if (!modf && size < mpn_fft_table[p->sqr][2]) |
---|
784 | { |
---|
785 | size = mpn_fft_table[p->sqr][2]; |
---|
786 | if (option_trace >= 2) |
---|
787 | printf ("jump to size=%ld\n", size); |
---|
788 | } |
---|
789 | |
---|
790 | size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr)); |
---|
791 | k = mpn_fft_best_k (size, p->sqr); |
---|
792 | |
---|
793 | if (size >= p->max_size) |
---|
794 | break; |
---|
795 | |
---|
796 | s.size = size + fft_step_size (k) / 2; |
---|
797 | s.r = k; |
---|
798 | tk = tuneup_measure (p->function, NULL, &s); |
---|
799 | if (tk == -1.0) |
---|
800 | abort (); |
---|
801 | |
---|
802 | if (!modf) s.size /= 2; |
---|
803 | tm = tuneup_measure (p->mul_function, NULL, &s); |
---|
804 | if (tm == -1.0) |
---|
805 | abort (); |
---|
806 | |
---|
807 | if (option_trace >= 2) |
---|
808 | printf ("at %ld size=%ld k=%d %.9f size=%ld %s mul %.9f\n", |
---|
809 | size, |
---|
810 | size + fft_step_size (k) / 2, k, tk, |
---|
811 | s.size, modf ? "modf" : "full", tm); |
---|
812 | |
---|
813 | if (tk < tm) |
---|
814 | { |
---|
815 | if (modf) |
---|
816 | { |
---|
817 | *p->p_modf_threshold = s.size; |
---|
818 | print_define (p->modf_threshold_name, *p->p_modf_threshold); |
---|
819 | } |
---|
820 | else |
---|
821 | { |
---|
822 | *p->p_threshold = s.size; |
---|
823 | print_define (p->threshold_name, *p->p_threshold); |
---|
824 | break; |
---|
825 | } |
---|
826 | } |
---|
827 | } |
---|
828 | |
---|
829 | } |
---|
830 | |
---|
831 | |
---|
832 | |
---|
833 | /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, |
---|
834 | giving wrong results. */ |
---|
835 | void |
---|
836 | tune_mul (void) |
---|
837 | { |
---|
838 | static struct param_t param; |
---|
839 | param.name[0] = "MUL_KARATSUBA_THRESHOLD"; |
---|
840 | param.name[1] = "MUL_TOOM3_THRESHOLD"; |
---|
841 | param.function = speed_mpn_mul_n; |
---|
842 | param.min_size[0] = MAX (4, MPN_KARA_MUL_N_MINSIZE); |
---|
843 | param.max_size[0] = MUL_TOOM3_THRESHOLD_LIMIT-1; |
---|
844 | param.max_size[1] = MUL_TOOM3_THRESHOLD_LIMIT-1; |
---|
845 | one (mul_threshold, 2, ¶m); |
---|
846 | |
---|
847 | /* disabled until tuned */ |
---|
848 | MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; |
---|
849 | } |
---|
850 | |
---|
851 | |
---|
852 | /* Start the basecase from 3, since 1 is a special case, and if mul_basecase |
---|
853 | is faster only at size==2 then we don't want to bother with extra code |
---|
854 | just for that. Start karatsuba from 4 same as MUL above. */ |
---|
855 | void |
---|
856 | tune_sqr (void) |
---|
857 | { |
---|
858 | static struct param_t param; |
---|
859 | param.name[0] = "SQR_BASECASE_THRESHOLD"; |
---|
860 | param.name[1] = "SQR_KARATSUBA_THRESHOLD"; |
---|
861 | param.name[2] = "SQR_TOOM3_THRESHOLD"; |
---|
862 | param.function = speed_mpn_sqr_n; |
---|
863 | param.min_is_always = 1; |
---|
864 | param.second_start_min = 1; |
---|
865 | param.min_size[0] = 3; |
---|
866 | param.min_size[1] = MAX (4, MPN_KARA_SQR_N_MINSIZE); |
---|
867 | param.min_size[2] = MPN_TOOM3_SQR_N_MINSIZE; |
---|
868 | param.max_size[0] = TUNE_SQR_KARATSUBA_MAX; |
---|
869 | param.max_size[1] = TUNE_SQR_KARATSUBA_MAX; |
---|
870 | one (sqr_threshold, 3, ¶m); |
---|
871 | |
---|
872 | /* disabled until tuned */ |
---|
873 | SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; |
---|
874 | } |
---|
875 | |
---|
876 | |
---|
877 | void |
---|
878 | tune_sb_preinv (void) |
---|
879 | { |
---|
880 | static struct param_t param; |
---|
881 | |
---|
882 | if (UDIV_PREINV_ALWAYS) |
---|
883 | { |
---|
884 | print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always"); |
---|
885 | return; |
---|
886 | } |
---|
887 | |
---|
888 | param.check_size = 256; |
---|
889 | param.min_size[0] = 3; |
---|
890 | param.min_is_always = 1; |
---|
891 | param.size_extra = 3; |
---|
892 | param.stop_factor = 2.0; |
---|
893 | param.name[0] = "DIV_SB_PREINV_THRESHOLD"; |
---|
894 | param.function = speed_mpn_sb_divrem_m3; |
---|
895 | one (sb_preinv_threshold, 1, ¶m); |
---|
896 | } |
---|
897 | |
---|
898 | |
---|
899 | void |
---|
900 | tune_dc (void) |
---|
901 | { |
---|
902 | static struct param_t param; |
---|
903 | param.name[0] = "DIV_DC_THRESHOLD"; |
---|
904 | param.function = speed_mpn_dc_tdiv_qr; |
---|
905 | one (dc_threshold, 1, ¶m); |
---|
906 | } |
---|
907 | |
---|
908 | |
---|
909 | /* This is an indirect determination, based on a comparison between redc and |
---|
910 | mpz_mod. A fudge factor of 1.04 is applied to redc, to represent |
---|
911 | additional overheads it gets in mpz_powm. |
---|
912 | |
---|
913 | stop_factor is 1.1 to hopefully help cray vector systems, where otherwise |
---|
914 | currently it hits the 1000 limb limit with only a factor of about 1.18 |
---|
915 | (threshold should be around 650). */ |
---|
916 | |
---|
917 | void |
---|
918 | tune_powm (void) |
---|
919 | { |
---|
920 | static struct param_t param; |
---|
921 | param.name[0] = "POWM_THRESHOLD"; |
---|
922 | param.function = speed_redc; |
---|
923 | param.function2 = speed_mpz_mod; |
---|
924 | param.step_factor = 0.03; |
---|
925 | param.stop_factor = 1.1; |
---|
926 | param.function_fudge = 1.04; |
---|
927 | one (powm_threshold, 1, ¶m); |
---|
928 | } |
---|
929 | |
---|
930 | |
---|
931 | void |
---|
932 | tune_gcd_accel (void) |
---|
933 | { |
---|
934 | static struct param_t param; |
---|
935 | param.name[0] = "GCD_ACCEL_THRESHOLD"; |
---|
936 | param.function = speed_mpn_gcd; |
---|
937 | param.min_size[0] = 1; |
---|
938 | one (gcd_accel_threshold, 1, ¶m); |
---|
939 | } |
---|
940 | |
---|
941 | |
---|
942 | |
---|
943 | /* A comparison between the speed of a single limb step and a double limb |
---|
944 | step is made. On a 32-bit limb the ratio is about 2.2 single steps to |
---|
945 | equal a double step, or on a 64-bit limb about 2.09. (These were found |
---|
946 | from counting the steps on a 10000 limb gcdext. */ |
---|
947 | void |
---|
948 | tune_gcdext (void) |
---|
949 | { |
---|
950 | static struct param_t param; |
---|
951 | param.name[0] = "GCDEXT_THRESHOLD"; |
---|
952 | param.function = speed_mpn_gcdext_one_single; |
---|
953 | param.function2 = speed_mpn_gcdext_one_double; |
---|
954 | switch (BITS_PER_MP_LIMB) { |
---|
955 | case 32: param.function_fudge = 2.2; break; |
---|
956 | case 64: param.function_fudge = 2.09; break; |
---|
957 | default: |
---|
958 | printf ("Don't know GCDEXT_THERSHOLD factor for BITS_PER_MP_LIMB == %d\n", |
---|
959 | BITS_PER_MP_LIMB); |
---|
960 | abort (); |
---|
961 | } |
---|
962 | param.min_size[0] = 5; |
---|
963 | param.min_is_always = 1; |
---|
964 | param.max_size[0] = 300; |
---|
965 | param.check_size = 300; |
---|
966 | one (gcdext_threshold, 1, ¶m); |
---|
967 | } |
---|
968 | |
---|
969 | |
---|
970 | /* size_extra==1 reflects the fact that with high<divisor one division is |
---|
971 | always skipped. Forcing high<divisor while testing ensures consistency |
---|
972 | while stepping through sizes, ie. that size-1 divides will be done each |
---|
973 | time. |
---|
974 | |
---|
975 | min_size==2 and min_is_always are used so that if plain division is only |
---|
976 | better at size==1 then don't bother including that code just for that |
---|
977 | case, instead go with preinv always and get a size saving. */ |
---|
978 | |
---|
979 | #define DIV_1_PARAMS \ |
---|
980 | param.check_size = 256; \ |
---|
981 | param.min_size[0] = 2; \ |
---|
982 | param.min_is_always = 1; \ |
---|
983 | param.data_high = DATA_HIGH_LT_R; \ |
---|
984 | param.size_extra = 1; \ |
---|
985 | param.stop_factor = 2.0; |
---|
986 | |
---|
987 | |
---|
988 | double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s)); |
---|
989 | |
---|
990 | void |
---|
991 | tune_divrem_1 (void) |
---|
992 | { |
---|
993 | /* plain version by default */ |
---|
994 | tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1; |
---|
995 | |
---|
996 | #ifndef HAVE_NATIVE_mpn_divrem_1 |
---|
997 | #define HAVE_NATIVE_mpn_divrem_1 0 |
---|
998 | #endif |
---|
999 | |
---|
1000 | /* No support for tuning native assembler code, do that by hand and put |
---|
1001 | the results in the .asm file, there's no need for such thresholds to |
---|
1002 | appear in gmp-mparam.h. */ |
---|
1003 | if (HAVE_NATIVE_mpn_divrem_1) |
---|
1004 | return; |
---|
1005 | |
---|
1006 | if (UDIV_PREINV_ALWAYS) |
---|
1007 | { |
---|
1008 | print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always"); |
---|
1009 | print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L); |
---|
1010 | return; |
---|
1011 | } |
---|
1012 | |
---|
1013 | tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune; |
---|
1014 | |
---|
1015 | /* Tune for the integer part of mpn_divrem_1. This will very possibly be |
---|
1016 | a bit out for the fractional part, but that's too bad, the integer part |
---|
1017 | is more important. */ |
---|
1018 | { |
---|
1019 | static struct param_t param; |
---|
1020 | param.name[0] = "DIVREM_1_NORM_THRESHOLD"; |
---|
1021 | DIV_1_PARAMS; |
---|
1022 | s.r = randlimb_norm (); |
---|
1023 | param.function = speed_mpn_divrem_1_tune; |
---|
1024 | one (divrem_1_norm_threshold, 1, ¶m); |
---|
1025 | } |
---|
1026 | { |
---|
1027 | static struct param_t param; |
---|
1028 | param.name[0] = "DIVREM_1_UNNORM_THRESHOLD"; |
---|
1029 | DIV_1_PARAMS; |
---|
1030 | s.r = randlimb_half (); |
---|
1031 | param.function = speed_mpn_divrem_1_tune; |
---|
1032 | one (divrem_1_unnorm_threshold, 1, ¶m); |
---|
1033 | } |
---|
1034 | } |
---|
1035 | |
---|
1036 | |
---|
1037 | double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s)); |
---|
1038 | |
---|
1039 | void |
---|
1040 | tune_mod_1 (void) |
---|
1041 | { |
---|
1042 | /* plain version by default */ |
---|
1043 | tuned_speed_mpn_mod_1 = speed_mpn_mod_1; |
---|
1044 | |
---|
1045 | #ifndef HAVE_NATIVE_mpn_mod_1 |
---|
1046 | #define HAVE_NATIVE_mpn_mod_1 0 |
---|
1047 | #endif |
---|
1048 | |
---|
1049 | /* No support for tuning native assembler code, do that by hand and put |
---|
1050 | the results in the .asm file, there's no need for such thresholds to |
---|
1051 | appear in gmp-mparam.h. */ |
---|
1052 | if (HAVE_NATIVE_mpn_mod_1) |
---|
1053 | return; |
---|
1054 | |
---|
1055 | if (UDIV_PREINV_ALWAYS) |
---|
1056 | { |
---|
1057 | print_define ("MOD_1_NORM_THRESHOLD", 0L); |
---|
1058 | print_define ("MOD_1_UNNORM_THRESHOLD", 0L); |
---|
1059 | return; |
---|
1060 | } |
---|
1061 | |
---|
1062 | tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune; |
---|
1063 | |
---|
1064 | { |
---|
1065 | static struct param_t param; |
---|
1066 | param.name[0] = "MOD_1_NORM_THRESHOLD"; |
---|
1067 | DIV_1_PARAMS; |
---|
1068 | s.r = randlimb_norm (); |
---|
1069 | param.function = speed_mpn_mod_1_tune; |
---|
1070 | one (mod_1_norm_threshold, 1, ¶m); |
---|
1071 | } |
---|
1072 | { |
---|
1073 | static struct param_t param; |
---|
1074 | param.name[0] = "MOD_1_UNNORM_THRESHOLD"; |
---|
1075 | DIV_1_PARAMS; |
---|
1076 | s.r = randlimb_half (); |
---|
1077 | param.function = speed_mpn_mod_1_tune; |
---|
1078 | one (mod_1_unnorm_threshold, 1, ¶m); |
---|
1079 | } |
---|
1080 | } |
---|
1081 | |
---|
1082 | |
---|
1083 | /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would |
---|
1084 | imply that udiv_qrnnd_preinv is worth using, but it seems most |
---|
1085 | straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div |
---|
1086 | directly. */ |
---|
1087 | |
---|
1088 | void |
---|
1089 | tune_preinv_divrem_1 (void) |
---|
1090 | { |
---|
1091 | static struct param_t param; |
---|
1092 | speed_function_t divrem_1; |
---|
1093 | const char *divrem_1_name; |
---|
1094 | double t1, t2; |
---|
1095 | |
---|
1096 | #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 |
---|
1097 | #define HAVE_NATIVE_mpn_preinv_divrem_1 0 |
---|
1098 | #endif |
---|
1099 | |
---|
1100 | /* Any native version of mpn_preinv_divrem_1 is assumed to exist because |
---|
1101 | it's faster than mpn_divrem_1. */ |
---|
1102 | if (HAVE_NATIVE_mpn_preinv_divrem_1) |
---|
1103 | { |
---|
1104 | print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); |
---|
1105 | return; |
---|
1106 | } |
---|
1107 | |
---|
1108 | /* If udiv_qrnnd_preinv is the only division method then of course |
---|
1109 | mpn_preinv_divrem_1 should be used. */ |
---|
1110 | if (UDIV_PREINV_ALWAYS) |
---|
1111 | { |
---|
1112 | print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); |
---|
1113 | return; |
---|
1114 | } |
---|
1115 | |
---|
1116 | /* If we've got an assembler version of mpn_divrem_1, then compare against |
---|
1117 | that, not the mpn_divrem_1_div generic C. */ |
---|
1118 | if (HAVE_NATIVE_mpn_divrem_1) |
---|
1119 | { |
---|
1120 | divrem_1 = speed_mpn_divrem_1; |
---|
1121 | divrem_1_name = "mpn_divrem_1"; |
---|
1122 | } |
---|
1123 | else |
---|
1124 | { |
---|
1125 | divrem_1 = speed_mpn_divrem_1_div; |
---|
1126 | divrem_1_name = "mpn_divrem_1_div"; |
---|
1127 | } |
---|
1128 | |
---|
1129 | param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ |
---|
1130 | s.size = 200; /* generous but not too big */ |
---|
1131 | /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, |
---|
1132 | since in general that's probably most common, though in fact for a |
---|
1133 | 64-bit limb mp_bases[10].big_base is normalized. */ |
---|
1134 | s.r = urandom() & (MP_LIMB_T_MAX >> 4); |
---|
1135 | if (s.r == 0) s.r = 123; |
---|
1136 | |
---|
1137 | t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); |
---|
1138 | t2 = tuneup_measure (divrem_1, ¶m, &s); |
---|
1139 | if (t1 == -1.0 || t2 == -1.0) |
---|
1140 | { |
---|
1141 | printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", |
---|
1142 | divrem_1_name, s.size); |
---|
1143 | abort (); |
---|
1144 | } |
---|
1145 | if (option_trace >= 1) |
---|
1146 | printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", |
---|
1147 | s.size, t1, divrem_1_name, t2); |
---|
1148 | |
---|
1149 | print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); |
---|
1150 | } |
---|
1151 | |
---|
1152 | |
---|
1153 | /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply |
---|
1154 | that udiv_qrnnd_preinv is worth using, but it seems most straightforward |
---|
1155 | to compare mpn_preinv_mod_1 and mpn_mod_1_div directly. */ |
---|
1156 | |
---|
1157 | void |
---|
1158 | tune_preinv_mod_1 (void) |
---|
1159 | { |
---|
1160 | static struct param_t param; |
---|
1161 | speed_function_t mod_1; |
---|
1162 | const char *mod_1_name; |
---|
1163 | double t1, t2; |
---|
1164 | |
---|
1165 | #ifndef HAVE_NATIVE_mpn_preinv_mod_1 |
---|
1166 | #define HAVE_NATIVE_mpn_preinv_mod_1 0 |
---|
1167 | #endif |
---|
1168 | |
---|
1169 | /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's |
---|
1170 | faster than mpn_mod_1. */ |
---|
1171 | if (HAVE_NATIVE_mpn_preinv_mod_1) |
---|
1172 | { |
---|
1173 | print_define_remark ("USE_PREINV_MOD_1", 1, "native"); |
---|
1174 | return; |
---|
1175 | } |
---|
1176 | |
---|
1177 | /* If udiv_qrnnd_preinv is the only division method then of course |
---|
1178 | mpn_preinv_mod_1 should be used. */ |
---|
1179 | if (UDIV_PREINV_ALWAYS) |
---|
1180 | { |
---|
1181 | print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always"); |
---|
1182 | return; |
---|
1183 | } |
---|
1184 | |
---|
1185 | /* If we've got an assembler version of mpn_mod_1, then compare against |
---|
1186 | that, not the mpn_mod_1_div generic C. */ |
---|
1187 | if (HAVE_NATIVE_mpn_mod_1) |
---|
1188 | { |
---|
1189 | mod_1 = speed_mpn_mod_1; |
---|
1190 | mod_1_name = "mpn_mod_1"; |
---|
1191 | } |
---|
1192 | else |
---|
1193 | { |
---|
1194 | mod_1 = speed_mpn_mod_1_div; |
---|
1195 | mod_1_name = "mpn_mod_1_div"; |
---|
1196 | } |
---|
1197 | |
---|
1198 | param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */ |
---|
1199 | s.size = 200; /* generous but not too big */ |
---|
1200 | s.r = randlimb_norm(); /* divisor */ |
---|
1201 | |
---|
1202 | t1 = tuneup_measure (speed_mpn_preinv_mod_1, ¶m, &s); |
---|
1203 | t2 = tuneup_measure (mod_1, ¶m, &s); |
---|
1204 | if (t1 == -1.0 || t2 == -1.0) |
---|
1205 | { |
---|
1206 | printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n", |
---|
1207 | mod_1_name, s.size); |
---|
1208 | abort (); |
---|
1209 | } |
---|
1210 | if (option_trace >= 1) |
---|
1211 | printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n", |
---|
1212 | s.size, t1, mod_1_name, t2); |
---|
1213 | |
---|
1214 | print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL); |
---|
1215 | } |
---|
1216 | |
---|
1217 | |
---|
1218 | void |
---|
1219 | tune_divrem_2 (void) |
---|
1220 | { |
---|
1221 | static struct param_t param; |
---|
1222 | |
---|
1223 | #ifndef HAVE_NATIVE_mpn_divrem_2 |
---|
1224 | #define HAVE_NATIVE_mpn_divrem_2 0 |
---|
1225 | #endif |
---|
1226 | |
---|
1227 | /* No support for tuning native assembler code, do that by hand and put |
---|
1228 | the results in the .asm file, and there's no need for such thresholds |
---|
1229 | to appear in gmp-mparam.h. */ |
---|
1230 | if (HAVE_NATIVE_mpn_divrem_2) |
---|
1231 | return; |
---|
1232 | |
---|
1233 | if (UDIV_PREINV_ALWAYS) |
---|
1234 | { |
---|
1235 | print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); |
---|
1236 | return; |
---|
1237 | } |
---|
1238 | |
---|
1239 | /* Tune for the integer part of mpn_divrem_2. This will very possibly be |
---|
1240 | a bit out for the fractional part, but that's too bad, the integer part |
---|
1241 | is more important. |
---|
1242 | |
---|
1243 | min_size must be >=2 since nsize>=2 is required, but is set to 4 to save |
---|
1244 | code space if plain division is better only at size==2 or size==3. */ |
---|
1245 | param.name[0] = "DIVREM_2_THRESHOLD"; |
---|
1246 | param.check_size = 256; |
---|
1247 | param.min_size[0] = 4; |
---|
1248 | param.min_is_always = 1; |
---|
1249 | param.size_extra = 2; /* does qsize==nsize-2 divisions */ |
---|
1250 | param.stop_factor = 2.0; |
---|
1251 | |
---|
1252 | s.r = randlimb_norm (); |
---|
1253 | param.function = speed_mpn_divrem_2; |
---|
1254 | one (divrem_2_threshold, 1, ¶m); |
---|
1255 | } |
---|
1256 | |
---|
1257 | |
---|
1258 | /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so |
---|
1259 | tune for that. Its speed can differ on odd or even divisor, so take an |
---|
1260 | average threshold for the two. |
---|
1261 | |
---|
1262 | mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1 |
---|
1263 | might not vary that way, but don't test this since high<divisor isn't |
---|
1264 | expected to occur often with small divisors. */ |
---|
1265 | |
---|
1266 | void |
---|
1267 | tune_divexact_1 (void) |
---|
1268 | { |
---|
1269 | static struct param_t param; |
---|
1270 | mp_size_t thresh[2], average; |
---|
1271 | int low, i; |
---|
1272 | |
---|
1273 | ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL); |
---|
1274 | |
---|
1275 | param.name[0] = "DIVEXACT_1_THRESHOLD"; |
---|
1276 | param.data_high = DATA_HIGH_GE_R; |
---|
1277 | param.check_size = 256; |
---|
1278 | param.min_size[0] = 2; |
---|
1279 | param.stop_factor = 1.5; |
---|
1280 | param.function = tuned_speed_mpn_divrem_1; |
---|
1281 | param.function2 = speed_mpn_divexact_1; |
---|
1282 | param.noprint = 1; |
---|
1283 | |
---|
1284 | print_define_start (param.name[0]); |
---|
1285 | |
---|
1286 | for (low = 0; low <= 1; low++) |
---|
1287 | { |
---|
1288 | s.r = randlimb_half(); |
---|
1289 | if (low == 0) |
---|
1290 | s.r |= 1; |
---|
1291 | else |
---|
1292 | s.r &= ~CNST_LIMB(7); |
---|
1293 | |
---|
1294 | one (divexact_1_threshold, 1, ¶m); |
---|
1295 | if (option_trace) |
---|
1296 | printf ("low=%d thresh %ld\n", low, divexact_1_threshold[0]); |
---|
1297 | |
---|
1298 | if (divexact_1_threshold[0] == MP_SIZE_T_MAX) |
---|
1299 | { |
---|
1300 | average = MP_SIZE_T_MAX; |
---|
1301 | goto divexact_1_done; |
---|
1302 | } |
---|
1303 | |
---|
1304 | thresh[low] = divexact_1_threshold[0]; |
---|
1305 | } |
---|
1306 | |
---|
1307 | if (option_trace) |
---|
1308 | { |
---|
1309 | printf ("average of:"); |
---|
1310 | for (i = 0; i < numberof(thresh); i++) |
---|
1311 | printf (" %ld", thresh[i]); |
---|
1312 | printf ("\n"); |
---|
1313 | } |
---|
1314 | |
---|
1315 | average = 0; |
---|
1316 | for (i = 0; i < numberof(thresh); i++) |
---|
1317 | average += thresh[i]; |
---|
1318 | average /= numberof(thresh); |
---|
1319 | |
---|
1320 | /* If divexact turns out to be better as early as 3 limbs, then use it |
---|
1321 | always, so as to reduce code size and conditional jumps. */ |
---|
1322 | if (average <= 3) |
---|
1323 | average = 0; |
---|
1324 | |
---|
1325 | divexact_1_done: |
---|
1326 | print_define_end (param.name[0], average); |
---|
1327 | } |
---|
1328 | |
---|
1329 | |
---|
1330 | /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the |
---|
1331 | same as mpn_mod_1, but this might not be true of an assembler |
---|
1332 | implementation. The threshold used is an average based on data where a |
---|
1333 | divide can be skipped and where it can't. |
---|
1334 | |
---|
1335 | If modexact turns out to be better as early as 3 limbs, then use it |
---|
1336 | always, so as to reduce code size and conditional jumps. */ |
---|
1337 | |
---|
1338 | void |
---|
1339 | tune_modexact_1_odd (void) |
---|
1340 | { |
---|
1341 | static struct param_t param; |
---|
1342 | mp_size_t thresh_lt; |
---|
1343 | |
---|
1344 | ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL); |
---|
1345 | |
---|
1346 | param.name[0] = "MODEXACT_1_ODD_THRESHOLD"; |
---|
1347 | param.check_size = 256; |
---|
1348 | param.min_size[0] = 2; |
---|
1349 | param.stop_factor = 1.5; |
---|
1350 | param.function = tuned_speed_mpn_mod_1; |
---|
1351 | param.function2 = speed_mpn_modexact_1c_odd; |
---|
1352 | param.noprint = 1; |
---|
1353 | s.r = randlimb_half () | 1; |
---|
1354 | |
---|
1355 | print_define_start (param.name[0]); |
---|
1356 | |
---|
1357 | param.data_high = DATA_HIGH_LT_R; |
---|
1358 | one (modexact_1_odd_threshold, 1, ¶m); |
---|
1359 | if (option_trace) |
---|
1360 | printf ("lt thresh %ld\n", modexact_1_odd_threshold[0]); |
---|
1361 | |
---|
1362 | thresh_lt = modexact_1_odd_threshold[0]; |
---|
1363 | if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX) |
---|
1364 | { |
---|
1365 | param.data_high = DATA_HIGH_GE_R; |
---|
1366 | one (modexact_1_odd_threshold, 1, ¶m); |
---|
1367 | if (option_trace) |
---|
1368 | printf ("ge thresh %ld\n", modexact_1_odd_threshold[0]); |
---|
1369 | |
---|
1370 | if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX) |
---|
1371 | { |
---|
1372 | modexact_1_odd_threshold[0] |
---|
1373 | = (modexact_1_odd_threshold[0] + thresh_lt) / 2; |
---|
1374 | if (modexact_1_odd_threshold[0] <= 3) |
---|
1375 | modexact_1_odd_threshold[0] = 0; |
---|
1376 | } |
---|
1377 | } |
---|
1378 | |
---|
1379 | print_define_end (param.name[0], modexact_1_odd_threshold[0]); |
---|
1380 | } |
---|
1381 | |
---|
1382 | |
---|
1383 | void |
---|
1384 | tune_jacobi_base (void) |
---|
1385 | { |
---|
1386 | static struct param_t param; |
---|
1387 | double t1, t2, t3; |
---|
1388 | int method; |
---|
1389 | |
---|
1390 | s.size = BITS_PER_MP_LIMB * 3 / 4; |
---|
1391 | |
---|
1392 | t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s); |
---|
1393 | if (option_trace >= 1) |
---|
1394 | printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", s.size, t1); |
---|
1395 | |
---|
1396 | t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s); |
---|
1397 | if (option_trace >= 1) |
---|
1398 | printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", s.size, t2); |
---|
1399 | |
---|
1400 | t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s); |
---|
1401 | if (option_trace >= 1) |
---|
1402 | printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", s.size, t3); |
---|
1403 | |
---|
1404 | if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0) |
---|
1405 | { |
---|
1406 | printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n", |
---|
1407 | s.size); |
---|
1408 | abort (); |
---|
1409 | } |
---|
1410 | |
---|
1411 | if (t1 < t2 && t1 < t3) |
---|
1412 | method = 1; |
---|
1413 | else if (t2 < t3) |
---|
1414 | method = 2; |
---|
1415 | else |
---|
1416 | method = 3; |
---|
1417 | |
---|
1418 | print_define ("JACOBI_BASE_METHOD", method); |
---|
1419 | } |
---|
1420 | |
---|
1421 | |
---|
1422 | void |
---|
1423 | tune_get_str (void) |
---|
1424 | { |
---|
1425 | /* Tune for decimal, it being most common. Some rough testing suggests |
---|
1426 | other bases are different, but not by very much. */ |
---|
1427 | s.r = 10; |
---|
1428 | { |
---|
1429 | static struct param_t param; |
---|
1430 | get_str_precompute_threshold[0] = 0; |
---|
1431 | param.name[0] = "GET_STR_DC_THRESHOLD"; |
---|
1432 | param.function = speed_mpn_get_str; |
---|
1433 | param.min_size[0] = 2; |
---|
1434 | param.max_size[0] = GET_STR_THRESHOLD_LIMIT; |
---|
1435 | one (get_str_basecase_threshold, 1, ¶m); |
---|
1436 | } |
---|
1437 | { |
---|
1438 | static struct param_t param; |
---|
1439 | param.name[0] = "GET_STR_PRECOMPUTE_THRESHOLD"; |
---|
1440 | param.function = speed_mpn_get_str; |
---|
1441 | param.min_size[0] = get_str_basecase_threshold[0]; |
---|
1442 | param.max_size[0] = GET_STR_THRESHOLD_LIMIT; |
---|
1443 | one (get_str_precompute_threshold, 1, ¶m); |
---|
1444 | } |
---|
1445 | } |
---|
1446 | |
---|
1447 | |
---|
1448 | void |
---|
1449 | tune_set_str (void) |
---|
1450 | { |
---|
1451 | static struct param_t param; |
---|
1452 | |
---|
1453 | s.r = 10; /* decimal */ |
---|
1454 | param.step_factor = 0.04; |
---|
1455 | param.name[0] = "SET_STR_THRESHOLD"; |
---|
1456 | param.function = speed_mpn_set_str_basecase; |
---|
1457 | param.function2 = speed_mpn_set_str_subquad; |
---|
1458 | param.min_size[0] = 100; |
---|
1459 | param.max_size[0] = 150000; |
---|
1460 | one (set_str_threshold, 1, ¶m); |
---|
1461 | } |
---|
1462 | |
---|
1463 | |
---|
1464 | void |
---|
1465 | tune_fft_mul (void) |
---|
1466 | { |
---|
1467 | static struct fft_param_t param; |
---|
1468 | |
---|
1469 | if (option_fft_max_size == 0) |
---|
1470 | return; |
---|
1471 | |
---|
1472 | param.table_name = "MUL_FFT_TABLE"; |
---|
1473 | param.threshold_name = "MUL_FFT_THRESHOLD"; |
---|
1474 | param.p_threshold = &MUL_FFT_THRESHOLD; |
---|
1475 | param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; |
---|
1476 | param.p_modf_threshold = &MUL_FFT_MODF_THRESHOLD; |
---|
1477 | param.first_size = MUL_TOOM3_THRESHOLD / 2; |
---|
1478 | param.max_size = option_fft_max_size; |
---|
1479 | param.function = speed_mpn_mul_fft; |
---|
1480 | param.mul_function = speed_mpn_mul_n; |
---|
1481 | param.sqr = 0; |
---|
1482 | fft (¶m); |
---|
1483 | } |
---|
1484 | |
---|
1485 | |
---|
1486 | void |
---|
1487 | tune_fft_sqr (void) |
---|
1488 | { |
---|
1489 | static struct fft_param_t param; |
---|
1490 | |
---|
1491 | if (option_fft_max_size == 0) |
---|
1492 | return; |
---|
1493 | |
---|
1494 | param.table_name = "SQR_FFT_TABLE"; |
---|
1495 | param.threshold_name = "SQR_FFT_THRESHOLD"; |
---|
1496 | param.p_threshold = &SQR_FFT_THRESHOLD; |
---|
1497 | param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; |
---|
1498 | param.p_modf_threshold = &SQR_FFT_MODF_THRESHOLD; |
---|
1499 | param.first_size = SQR_TOOM3_THRESHOLD / 2; |
---|
1500 | param.max_size = option_fft_max_size; |
---|
1501 | param.function = speed_mpn_mul_fft_sqr; |
---|
1502 | param.mul_function = speed_mpn_sqr_n; |
---|
1503 | param.sqr = 0; |
---|
1504 | fft (¶m); |
---|
1505 | } |
---|
1506 | |
---|
1507 | |
---|
1508 | void |
---|
1509 | all (void) |
---|
1510 | { |
---|
1511 | time_t start_time, end_time; |
---|
1512 | TMP_DECL (marker); |
---|
1513 | |
---|
1514 | TMP_MARK (marker); |
---|
1515 | s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0); |
---|
1516 | s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0); |
---|
1517 | |
---|
1518 | mpn_random (s.xp_block, SPEED_BLOCK_SIZE); |
---|
1519 | mpn_random (s.yp_block, SPEED_BLOCK_SIZE); |
---|
1520 | |
---|
1521 | fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); |
---|
1522 | |
---|
1523 | speed_time_init (); |
---|
1524 | fprintf (stderr, "Using: %s\n", speed_time_string); |
---|
1525 | |
---|
1526 | fprintf (stderr, "speed_precision %d", speed_precision); |
---|
1527 | if (speed_unittime == 1.0) |
---|
1528 | fprintf (stderr, ", speed_unittime 1 cycle"); |
---|
1529 | else |
---|
1530 | fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); |
---|
1531 | if (speed_cycletime == 1.0 || speed_cycletime == 0.0) |
---|
1532 | fprintf (stderr, ", CPU freq unknown\n"); |
---|
1533 | else |
---|
1534 | fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); |
---|
1535 | |
---|
1536 | fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", |
---|
1537 | DEFAULT_MAX_SIZE, option_fft_max_size); |
---|
1538 | fprintf (stderr, "\n"); |
---|
1539 | |
---|
1540 | time (&start_time); |
---|
1541 | { |
---|
1542 | struct tm *tp; |
---|
1543 | tp = localtime (&start_time); |
---|
1544 | printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", |
---|
1545 | tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); |
---|
1546 | |
---|
1547 | #ifdef __GNUC__ |
---|
1548 | /* gcc sub-minor version doesn't seem to come through as a define */ |
---|
1549 | printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); |
---|
1550 | #define PRINTED_COMPILER |
---|
1551 | #endif |
---|
1552 | #if defined (__SUNPRO_C) |
---|
1553 | printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); |
---|
1554 | #define PRINTED_COMPILER |
---|
1555 | #endif |
---|
1556 | #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) |
---|
1557 | /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ |
---|
1558 | printf ("MIPSpro C %d.%d.%d */\n", |
---|
1559 | _COMPILER_VERSION / 100, |
---|
1560 | _COMPILER_VERSION / 10 % 10, |
---|
1561 | _COMPILER_VERSION % 10); |
---|
1562 | #define PRINTED_COMPILER |
---|
1563 | #endif |
---|
1564 | #if defined (__DECC) && defined (__DECC_VER) |
---|
1565 | printf ("DEC C %d */\n", __DECC_VER); |
---|
1566 | #define PRINTED_COMPILER |
---|
1567 | #endif |
---|
1568 | #if ! defined (PRINTED_COMPILER) |
---|
1569 | printf ("system compiler */\n"); |
---|
1570 | #endif |
---|
1571 | } |
---|
1572 | printf ("\n"); |
---|
1573 | |
---|
1574 | tune_mul (); |
---|
1575 | printf("\n"); |
---|
1576 | |
---|
1577 | tune_sqr (); |
---|
1578 | printf("\n"); |
---|
1579 | |
---|
1580 | tune_sb_preinv (); |
---|
1581 | tune_dc (); |
---|
1582 | tune_powm (); |
---|
1583 | printf("\n"); |
---|
1584 | |
---|
1585 | tune_gcd_accel (); |
---|
1586 | tune_gcdext (); |
---|
1587 | tune_jacobi_base (); |
---|
1588 | printf("\n"); |
---|
1589 | |
---|
1590 | tune_divrem_1 (); |
---|
1591 | tune_mod_1 (); |
---|
1592 | tune_preinv_divrem_1 (); |
---|
1593 | tune_preinv_mod_1 (); |
---|
1594 | tune_divrem_2 (); |
---|
1595 | tune_divexact_1 (); |
---|
1596 | tune_modexact_1_odd (); |
---|
1597 | printf("\n"); |
---|
1598 | |
---|
1599 | tune_get_str (); |
---|
1600 | tune_set_str (); |
---|
1601 | printf("\n"); |
---|
1602 | |
---|
1603 | tune_fft_mul (); |
---|
1604 | printf("\n"); |
---|
1605 | |
---|
1606 | tune_fft_sqr (); |
---|
1607 | printf ("\n"); |
---|
1608 | |
---|
1609 | time (&end_time); |
---|
1610 | printf ("/* Tuneup completed successfully, took %ld seconds */\n", |
---|
1611 | end_time - start_time); |
---|
1612 | |
---|
1613 | TMP_FREE (marker); |
---|
1614 | } |
---|
1615 | |
---|
1616 | |
---|
1617 | int |
---|
1618 | main (int argc, char *argv[]) |
---|
1619 | { |
---|
1620 | int opt; |
---|
1621 | |
---|
1622 | /* Unbuffered so if output is redirected to a file it isn't lost if the |
---|
1623 | program is killed part way through. */ |
---|
1624 | setbuf (stdout, NULL); |
---|
1625 | setbuf (stderr, NULL); |
---|
1626 | |
---|
1627 | while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) |
---|
1628 | { |
---|
1629 | switch (opt) { |
---|
1630 | case 'f': |
---|
1631 | if (optarg[0] == 't') |
---|
1632 | option_fft_trace = 2; |
---|
1633 | else |
---|
1634 | option_fft_max_size = atol (optarg); |
---|
1635 | break; |
---|
1636 | case 'o': |
---|
1637 | speed_option_set (optarg); |
---|
1638 | break; |
---|
1639 | case 'p': |
---|
1640 | speed_precision = atoi (optarg); |
---|
1641 | break; |
---|
1642 | case 't': |
---|
1643 | option_trace++; |
---|
1644 | break; |
---|
1645 | case '?': |
---|
1646 | exit(1); |
---|
1647 | } |
---|
1648 | } |
---|
1649 | |
---|
1650 | all (); |
---|
1651 | exit (0); |
---|
1652 | } |
---|