source: trunk/third/gmp/tune/tuneup.c @ 18191

Revision 18191, 45.3 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18190, which included commits to RCS files with non-trunk default branches.
Line 
1/* Create tuned thresholds for various algorithms.
2
3Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
4
5This file is part of the GNU MP Library.
6
7The GNU MP Library is free software; you can redistribute it and/or modify
8it under the terms of the GNU Lesser General Public License as published by
9the Free Software Foundation; either version 2.1 of the License, or (at your
10option) any later version.
11
12The GNU MP Library is distributed in the hope that it will be useful, but
13WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15License for more details.
16
17You should have received a copy of the GNU Lesser General Public License
18along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20MA 02111-1307, USA. */
21
22
23/* Usage: tune [-t] [-t] [-p precision]
24
25   -t turns on some diagnostic traces, a second -t turns on more traces.
26
27   The thresholds are determined as follows.  A crossover may not be a
28   single size but rather a range where it oscillates between method A or
29   method B faster.  If the threshold is set making B used where A is faster
30   (or vice versa) that's bad.  Badness is the percentage time lost and
31   total badness is the sum of this over all sizes measured.  The threshold
32   is set to minimize total badness.
33
34   Suppose, as sizes increase, method B becomes faster than method A.  The
35   effect of the rule is that, as you look at increasing sizes, isolated
36   points where B is faster are ignored, but when it's consistently faster,
37   or faster on balance, then the threshold is set there.  The same result
38   is obtained thinking in the other direction of A becoming faster at
39   smaller sizes.
40
41   In practice the thresholds tend to be chosen to bring on the next
42   algorithm fairly quickly.
43
44   This rule is attractive because it's got a basis in reason and is fairly
45   easy to implement, but no work has been done to actually compare it in
46   absolute terms to other possibilities.
47
48   Sometimes running the program twice produces slightly different results.
49   This is probably because there's so little separating algorithms near
50   their crossover, and on that basis it should make little or no difference
51   to the final speed of the relevant routines, but nothing has been done to
52   check that carefully.
53
54   Remarks:
55
56   The code here isn't a vision of loveliness, mainly because it's subject
57   to ongoing modifications according to new things wanting to be tuned and
58   practical requirements of systems tested.
59
60   The way parts of the library are recompiled to insinuate the tuning
61   variables is a bit subtle, but unavoidable since of course the main
62   library has fixed thresholds compiled-in but we want to vary them here.
63   Most of the nonsense for this can be found in tune/Makefile.am and under
64   TUNE_PROGRAM_BUILD in gmp-impl.h.
65
66   The dirty hack which the "second_start_min" feature could perhaps be done
67   more generally, so if say karatsuba is never better than toom3 then it
68   can be detected and omitted.  Currently we're hoping very hard that this
69   doesn't arise in practice, and if it does then it indicates something
70   badly sub-optimal in the karatsuba implementation.
71
72   Limitations:
73
74   The FFTs aren't subject to the same badness rule as the other thresholds,
75   so each k is probably being brought on a touch early.  This isn't likely
76   to make a difference, and the simpler probing means fewer tests.
77
78*/
79
80#define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
81
82#include "config.h"
83
84#include <math.h>
85#include <stdio.h>
86#include <stdlib.h>
87#include <time.h>
88#if HAVE_UNISTD_H
89#include <unistd.h>
90#endif
91
92#include "gmp.h"
93#include "gmp-impl.h"
94#include "longlong.h"
95
96#include "tests.h"
97#include "speed.h"
98
99#if !HAVE_DECL_OPTARG
100extern char *optarg;
101extern int optind, opterr;
102#endif
103
104
105#define DEFAULT_MAX_SIZE   1000  /* limbs */
106#define MAX_TABLE             5  /* entries */
107
108#if WANT_FFT
109mp_size_t  option_fft_max_size = 50000;  /* limbs */
110#else
111mp_size_t  option_fft_max_size = 0;
112#endif
113int        option_trace = 0;
114int        option_fft_trace = 0;
115struct speed_params  s;
116
117struct dat_t {
118  mp_size_t  size;
119  double     d;
120} *dat = NULL;
121int  ndat = 0;
122int  allocdat = 0;
123
124
125/* Each "_threshold" array must be 1 bigger than the number of thresholds
126   being tuned in a set, because one() stores a value in the entry above
127   the one it's determining. */
128
129mp_size_t  mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
130mp_size_t  sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
131mp_size_t  sb_preinv_threshold[2] = { MP_SIZE_T_MAX };
132mp_size_t  dc_threshold[2] = { MP_SIZE_T_MAX };
133mp_size_t  powm_threshold[2] = { MP_SIZE_T_MAX };
134mp_size_t  gcd_accel_threshold[2] = { MP_SIZE_T_MAX };
135mp_size_t  gcdext_threshold[2] = { MP_SIZE_T_MAX };
136mp_size_t  divexact_1_threshold[2] = { MP_SIZE_T_MAX };
137mp_size_t  divrem_1_norm_threshold[2] = { MP_SIZE_T_MAX };
138mp_size_t  divrem_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
139mp_size_t  divrem_2_threshold[2] = { MP_SIZE_T_MAX };
140mp_size_t  mod_1_norm_threshold[2] = { MP_SIZE_T_MAX };
141mp_size_t  mod_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
142mp_size_t  modexact_1_odd_threshold[2] = { MP_SIZE_T_MAX };
143mp_size_t  get_str_basecase_threshold[2] = { MP_SIZE_T_MAX };
144mp_size_t  get_str_precompute_threshold[2] = { MP_SIZE_T_MAX };
145mp_size_t  set_str_threshold[2] = { MP_SIZE_T_MAX };
146
147mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
148mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
149
150#ifndef TUNE_SQR_KARATSUBA_MAX
151#define TUNE_SQR_KARATSUBA_MAX  0 /* meaning no limit */
152#endif
153
154struct param_t {
155  const char        *name[MAX_TABLE];
156  speed_function_t  function;
157  speed_function_t  function2;
158  double            step_factor;    /* how much to step sizes (rounded down) */
159  double            function_fudge; /* multiplier for "function" speeds */
160  int               stop_since_change;
161  double            stop_factor;
162  mp_size_t         min_size[MAX_TABLE];
163  int               min_is_always;
164  int               second_start_min;
165  mp_size_t         max_size[MAX_TABLE];
166  mp_size_t         check_size;
167  mp_size_t         size_extra;
168
169#define DATA_HIGH_LT_R  1
170#define DATA_HIGH_GE_R  2
171  int               data_high;
172
173  int               noprint;
174};
175
176#ifndef UDIV_PREINV_ALWAYS
177#define UDIV_PREINV_ALWAYS 0
178#endif
179
180
181mp_limb_t
182randlimb_norm (void)
183{
184  mp_limb_t  n;
185  mpn_random (&n, 1);
186  n |= GMP_LIMB_HIGHBIT;
187  return n;
188}
189
190#define MP_LIMB_T_HALFMASK  ((CNST_LIMB(1) << (BITS_PER_MP_LIMB/2)) - 1)
191
192mp_limb_t
193randlimb_half (void)
194{
195  mp_limb_t  n;
196  mpn_random (&n, 1);
197  n &= MP_LIMB_T_HALFMASK;
198  n += (n==0);
199  return n;
200}
201
202
203/* Add an entry to the end of the dat[] array, reallocing to make it bigger
204   if necessary.  */
205void
206add_dat (mp_size_t size, double d)
207{
208#define ALLOCDAT_STEP  500
209
210  ASSERT_ALWAYS (ndat <= allocdat);
211
212  if (ndat == allocdat)
213    {
214      dat = (struct dat_t *) __gmp_allocate_or_reallocate
215        (dat, allocdat * sizeof(dat[0]),
216         (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
217      allocdat += ALLOCDAT_STEP;
218    }
219
220  dat[ndat].size = size;
221  dat[ndat].d = d;
222  ndat++;
223}
224
225
226/* Return the threshold size based on the data accumulated. */
227mp_size_t
228analyze_dat (int i, int final)
229{
230  double  x, min_x;
231  int     j, min_j;
232
233  /* If the threshold is set at dat[0].size, any positive values are bad. */
234  x = 0.0;
235  for (j = 0; j < ndat; j++)
236    if (dat[j].d > 0.0)
237      x += dat[j].d;
238
239  if (option_trace >= 2 && final)
240    {
241      printf ("\n");
242      printf ("x is the sum of the badness from setting thresh at given size\n");
243      printf ("  (minimum x is sought)\n");
244      printf ("i=%d size=%ld  first x=%.4f\n", i, dat[j].size, x);
245    }
246
247  min_x = x;
248  min_j = 0;
249
250
251  /* When stepping to the next dat[j].size, positive values are no longer
252     bad (so subtracted), negative values become bad (so add the absolute
253     value, meaning subtract). */
254  for (j = 0; j < ndat; x -= dat[j].d, j++)
255    {
256      if (option_trace >= 2 && final)
257        printf ("i=%d size=%ld  x=%.4f\n", i, dat[j].size, x);
258
259      if (x < min_x)
260        {
261          min_x = x;
262          min_j = j;
263        }
264    }
265
266  return min_j;
267}
268
269
270/* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
271
272mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
273                                    mp_srcptr ap, mp_size_t size,
274                                    mp_limb_t d));
275mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
276
277double
278speed_mpn_mod_1_tune (struct speed_params *s)
279{
280  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
281}
282double
283speed_mpn_divrem_1_tune (struct speed_params *s)
284{
285  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
286}
287
288
289double
290tuneup_measure (speed_function_t fun,
291                const struct param_t *param,
292                struct speed_params *s)
293{
294  static struct param_t  dummy;
295  double   t;
296  TMP_DECL (marker);
297
298  if (! param)
299    param = &dummy;
300
301  s->size += param->size_extra;
302
303  TMP_MARK (marker);
304  s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
305  s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
306
307  mpn_random (s->xp, s->size);
308  mpn_random (s->yp, s->size);
309
310  switch (param->data_high) {
311  case DATA_HIGH_LT_R:
312    s->xp[s->size-1] %= s->r;
313    s->yp[s->size-1] %= s->r;
314    break;
315  case DATA_HIGH_GE_R:
316    s->xp[s->size-1] |= s->r;
317    s->yp[s->size-1] |= s->r;
318    break;
319  }
320
321  t = speed_measure (fun, s);
322
323  s->size -= param->size_extra;
324
325  TMP_FREE (marker);
326  return t;
327}
328
329
330#define PRINT_WIDTH  28
331
332void
333print_define_start (const char *name)
334{
335  printf ("#define %-*s  ", PRINT_WIDTH, name);
336  if (option_trace)
337    printf ("...\n");
338}
339
340void
341print_define_end_remark (const char *name, mp_size_t value, const char *remark)
342{
343  if (option_trace)
344    printf ("#define %-*s  ", PRINT_WIDTH, name);
345
346  if (value == MP_SIZE_T_MAX)
347    printf ("MP_SIZE_T_MAX");
348  else
349    printf ("%5ld", value);
350
351  if (remark != NULL)
352    printf ("  /* %s */", remark);
353  printf ("\n");
354}
355
356void
357print_define_end (const char *name, mp_size_t value)
358{
359  const char  *remark;
360  if (value == MP_SIZE_T_MAX)
361    remark = "never";
362  else if (value == 0)
363    remark = "always";
364  else
365    remark = NULL;
366  print_define_end_remark (name, value, remark);
367}
368
369void
370print_define (const char *name, mp_size_t value)
371{
372  print_define_start (name);
373  print_define_end (name, value);
374}
375
376void
377print_define_remark (const char *name, mp_size_t value, const char *remark)
378{
379  print_define_start (name);
380  print_define_end_remark (name, value, remark);
381}
382
383
384/* table[i+1] needs to be set to a sensible value when testing method i+1
385   because mpn_mul_n uses MUL_TOOM3_THRESHOLD to size the temporary
386   workspace for mpn_kara_mul_n. */
387
388void
389one (mp_size_t table[], size_t max_table, struct param_t *param)
390{
391  mp_size_t  table_save0 = 0;
392  int  since_positive, since_thresh_change;
393  int  thresh_idx, new_thresh_idx;
394  int  i;
395
396  ASSERT_ALWAYS (max_table <= MAX_TABLE);
397
398#define DEFAULT(x,n)  if (! (param->x))  param->x = (n);
399
400  DEFAULT (function_fudge, 1.0);
401  DEFAULT (function2, param->function);
402  DEFAULT (step_factor, 0.01);  /* small steps by default */
403  DEFAULT (stop_since_change, 80);
404  DEFAULT (stop_factor, 1.2);
405  for (i = 0; i < max_table; i++)
406    DEFAULT (min_size[i], 10);
407  for (i = 0; i < max_table; i++)
408    DEFAULT (max_size[i], DEFAULT_MAX_SIZE);
409
410  if (param->check_size != 0)
411    {
412      double   t1, t2;
413      s.size = param->check_size;
414
415      table[0] = s.size+1;
416      table[1] = param->max_size[0];
417      t1 = tuneup_measure (param->function, param, &s);
418
419      table[0] = s.size;
420      table[1] = s.size+1;
421      t2 = tuneup_measure (param->function2, param, &s);
422      if (t1 == -1.0 || t2 == -1.0)
423        {
424          printf ("Oops, can't run both functions at size %ld\n", s.size);
425          abort ();
426        }
427      t1 *= param->function_fudge;
428
429      /* ask that t2 is at least 4% below t1 */
430      if (t1 < t2*1.04)
431        {
432          if (option_trace)
433            printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
434          table[0] = MP_SIZE_T_MAX;
435          if (! param->noprint)
436            print_define (param->name[0], table[0]);
437          return;
438        }
439
440      if (option_trace >= 2)
441        printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
442                s.size, t1, t2);
443    }
444
445  for (i = 0, s.size = 1; i < max_table && s.size < param->max_size[i]; i++)
446    {
447      if (i == 1 && param->second_start_min)
448        s.size = 1;
449
450      if (s.size < param->min_size[i])
451        s.size = param->min_size[i];
452
453      if (! (param->noprint || (i == 1 && param->second_start_min)))
454        print_define_start (param->name[i]);
455
456      ndat = 0;
457      since_positive = 0;
458      since_thresh_change = 0;
459      thresh_idx = 0;
460
461      if (option_trace >= 2)
462        {
463          printf ("             algorithm-A  algorithm-B   ratio  possible\n");
464          printf ("              (seconds)    (seconds)    diff    thresh\n");
465        }
466
467      for (;
468           s.size < param->max_size[i];
469           s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
470        {
471          double   ti, tiplus1, d;
472
473          /* If there's a size limit and it's reached then it should still
474             be sensible to analyze the data since we want the threshold put
475             either at or near the limit.  */
476          if (s.size >= param->max_size[i])
477            {
478              if (option_trace)
479                printf ("Reached maximum size (%ld) without otherwise stopping\n",
480                        param->max_size[i]);
481              break;
482            }
483
484          /*
485            FIXME: check minimum size requirements are met, possibly by just
486            checking for the -1 returns from the speed functions.
487          */
488
489          /* under this hack, don't let method 0 get used at s.size */
490          if (i == 1 && param->second_start_min)
491            table[0] = MIN (s.size-1, table_save0);
492
493          /* using method i at this size */
494          table[i] = s.size+1;
495          table[i+1] = param->max_size[i];
496          ti = tuneup_measure (param->function, param, &s);
497          if (ti == -1.0)
498            abort ();
499          ti *= param->function_fudge;
500
501          /* using method i+1 at this size */
502          table[i] = s.size;
503          table[i+1] = s.size+1;
504          tiplus1 = tuneup_measure (param->function2, param, &s);
505          if (tiplus1 == -1.0)
506            abort ();
507
508          /* Calculate the fraction by which the one or the other routine is
509             slower.  */
510          if (tiplus1 >= ti)
511            d = (tiplus1 - ti) / tiplus1;  /* negative */
512          else
513            d = (tiplus1 - ti) / ti;       /* positive */
514
515          add_dat (s.size, d);
516
517          new_thresh_idx = analyze_dat (i, 0);
518
519
520          if (option_trace >= 2)
521            printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
522                    i, s.size, ti, tiplus1, d,
523                    ti > tiplus1 ? '#' : ' ',
524                    dat[new_thresh_idx].size);
525
526          /* Stop if the last time method i was faster was more than a
527             certain number of measurements ago.  */
528#define STOP_SINCE_POSITIVE  200
529          if (d >= 0)
530            since_positive = 0;
531          else
532            if (++since_positive > STOP_SINCE_POSITIVE)
533              {
534                if (option_trace >= 1)
535                  printf ("i=%d stopped due to since_positive (%d)\n",
536                          i, STOP_SINCE_POSITIVE);
537                break;
538              }
539
540          /* Stop if method i has become slower by a certain factor. */
541          if (ti >= tiplus1 * param->stop_factor)
542            {
543              if (option_trace >= 1)
544                printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n",
545                        i, param->stop_factor);
546              break;
547            }
548
549          /* Stop if the threshold implied hasn't changed in a certain
550             number of measurements.  (It's this condition that ususally
551             stops the loop.) */
552          if (thresh_idx != new_thresh_idx)
553            since_thresh_change = 0, thresh_idx = new_thresh_idx;
554          else
555            if (++since_thresh_change > param->stop_since_change)
556              {
557                if (option_trace >= 1)
558                  printf ("i=%d stopped due to since_thresh_change (%d)\n",
559                          i, param->stop_since_change);
560                break;
561              }
562
563          /* Stop if the threshold implied is more than a certain number of
564             measurements ago.  */
565#define STOP_SINCE_AFTER   500
566          if (ndat - thresh_idx > STOP_SINCE_AFTER)
567            {
568              if (option_trace >= 1)
569                printf ("i=%d stopped due to ndat - thresh_idx > amount (%d)\n",
570                        i, STOP_SINCE_AFTER);
571              break;
572            }
573        }
574
575      /* Stop when the size limit is reached before the end of the
576         crossover, but only show this as an error for >= the default max
577         size.  FIXME: Maybe should make it a param choice whether this is
578         an error.  */
579      if (s.size >= param->max_size[i]
580          && param->max_size[i] >= DEFAULT_MAX_SIZE)
581        {
582          fprintf (stderr, "%s\n", param->name[i]);
583          fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n",
584                   i, dat[0].size, dat[ndat-1].size, ndat);
585          fprintf (stderr, "    max size reached before end of crossover\n");
586          break;
587        }
588
589      if (option_trace >= 1)
590        printf ("i=%d sizes %ld to %ld total %d measurements\n",
591                i, dat[0].size, dat[ndat-1].size, ndat);
592
593      if (ndat == 0)
594        break;
595
596      table[i] = dat[analyze_dat (i, 1)].size;
597
598      /* fudge here, let min_is_always apply only to i==0, that's what the
599         sqr_n thresholds want */
600      if (i == 0 && param->min_is_always && table[i] == param->min_size[i])
601        table[i] = 0;
602
603      /* under the second_start_min fudge, if the second threshold turns out
604         to be lower than the first, then the second method is unwanted, we
605         should go straight from algorithm 1 to algorithm 3.  */
606      if (param->second_start_min)
607        {
608          if (i == 0)
609            {
610              table_save0 = table[0];
611              table[0] = 0;
612            }
613          else if (i == 1)
614            {
615              table[0] = table_save0;
616              if (table[1] <= table[0])
617                {
618                  table[0] = table[1];
619                  table[1] = 0;
620                }
621            }
622          s.size = MAX (table[0], table[1]) + 1;
623        }
624
625      if (! (param->noprint || (i == 0 && param->second_start_min)))
626        {
627          if (i == 1 && param->second_start_min)
628            {
629              print_define_end (param->name[0], table[0]);
630              print_define_start (param->name[1]);
631            }
632
633          print_define_end (param->name[i], table[i]);
634        }
635
636      /* Look for the next threshold starting from the current one. */
637      s.size = table[i]+1;
638
639      /* Take a MAX of all to allow for second_start_min producing a 0. */
640      {
641        int  j;
642        for (j = 0; j < i; j++)
643          s.size = MAX (s.size, table[j]+1);
644      }
645    }
646}
647
648
649/* Special probing for the fft thresholds.  The size restrictions on the
650   FFTs mean the graph of time vs size has a step effect.  See this for
651   example using
652
653       ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
654       gnuplot foo.gnuplot
655
656   The current approach is to compare routines at the midpoint of relevant
657   steps.  Arguably a more sophisticated system of threshold data is wanted
658   if this step effect remains. */
659
660struct fft_param_t {
661  const char        *table_name;
662  const char        *threshold_name;
663  const char        *modf_threshold_name;
664  mp_size_t         *p_threshold;
665  mp_size_t         *p_modf_threshold;
666  mp_size_t         first_size;
667  mp_size_t         max_size;
668  speed_function_t  function;
669  speed_function_t  mul_function;
670  mp_size_t         sqr;
671};
672
673
674/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
675   N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
676   of 2^(k-1) bits. */
677
678mp_size_t
679fft_step_size (int k)
680{
681  mp_size_t  step;
682
683  step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB;
684  step *= (mp_size_t) 1 << k;
685
686  if (step <= 0)
687    {
688      printf ("Can't handle k=%d\n", k);
689      abort ();
690    }
691
692  return step;
693}
694
695mp_size_t
696fft_next_size (mp_size_t pl, int k)
697{
698  mp_size_t  m = fft_step_size (k);
699
700/*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
701
702  if (pl == 0 || (pl & (m-1)) != 0)
703    pl = (pl | (m-1)) + 1;
704
705/*    printf (" %ld\n", pl); */
706  return pl;
707}
708
709void
710fft (struct fft_param_t *p)
711{
712  mp_size_t  size;
713  int        i, k;
714
715  for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++)
716    mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX;
717
718  *p->p_threshold = MP_SIZE_T_MAX;
719  *p->p_modf_threshold = MP_SIZE_T_MAX;
720
721  option_trace = MAX (option_trace, option_fft_trace);
722
723  printf ("#define %s  {", p->table_name);
724  if (option_trace >= 2)
725    printf ("\n");
726
727  k = FFT_FIRST_K;
728  size = p->first_size;
729  for (;;)
730    {
731      double  tk, tk1;
732
733      size = fft_next_size (size+1, k+1);
734
735      if (size >= p->max_size)
736        break;
737      if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))
738        break;
739
740      /* compare k to k+1 in the middle of the current k+1 step */
741      s.size = size + fft_step_size (k+1) / 2;
742      s.r = k;
743      tk = tuneup_measure (p->function, NULL, &s);
744      if (tk == -1.0)
745        abort ();
746
747      s.r = k+1;
748      tk1 = tuneup_measure (p->function, NULL, &s);
749      if (tk1 == -1.0)
750        abort ();
751
752      if (option_trace >= 2)
753        printf ("at %ld   size=%ld  k=%d  %.9f   k=%d %.9f\n",
754                size, s.size, k, tk, k+1, tk1);
755
756      /* declare the k+1 threshold as soon as it's faster at its midpoint */
757      if (tk1 < tk)
758        {
759          mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size;
760          printf (" %ld,", s.size);
761          if (option_trace >= 2) printf ("\n");
762          k++;
763        }
764    }
765
766  mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;
767  printf (" 0 }\n");
768
769
770  size = p->first_size;
771
772  /* Declare an FFT faster than a plain toom3 etc multiplication found as
773     soon as one faster measurement obtained.  A multiplication in the
774     middle of the FFT step is tested.  */
775  for (;;)
776    {
777      int     modf = (*p->p_modf_threshold == MP_SIZE_T_MAX);
778      double  tk, tm;
779
780      /* k=7 should be the first FFT which can beat toom3 on a full
781         multiply, so jump to that threshold and save some probing after the
782         modf threshold is found.  */
783      if (!modf && size < mpn_fft_table[p->sqr][2])
784        {
785          size = mpn_fft_table[p->sqr][2];
786          if (option_trace >= 2)
787            printf ("jump to size=%ld\n", size);
788        }
789
790      size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr));
791      k = mpn_fft_best_k (size, p->sqr);
792
793      if (size >= p->max_size)
794        break;
795
796      s.size = size + fft_step_size (k) / 2;
797      s.r = k;
798      tk = tuneup_measure (p->function, NULL, &s);
799      if (tk == -1.0)
800        abort ();
801
802      if (!modf)  s.size /= 2;
803      tm = tuneup_measure (p->mul_function, NULL, &s);
804      if (tm == -1.0)
805        abort ();
806
807      if (option_trace >= 2)
808        printf ("at %ld   size=%ld   k=%d  %.9f   size=%ld %s mul %.9f\n",
809                size,
810                size + fft_step_size (k) / 2, k, tk,
811                s.size, modf ? "modf" : "full", tm);
812
813      if (tk < tm)
814        {
815          if (modf)
816            {
817              *p->p_modf_threshold = s.size;
818              print_define (p->modf_threshold_name, *p->p_modf_threshold);
819            }
820          else
821            {
822              *p->p_threshold = s.size;
823              print_define (p->threshold_name,      *p->p_threshold);
824              break;
825            }
826        }
827    }
828
829}
830
831
832
833/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
834   giving wrong results.  */
835void
836tune_mul (void)
837{
838  static struct param_t  param;
839  param.name[0] = "MUL_KARATSUBA_THRESHOLD";
840  param.name[1] = "MUL_TOOM3_THRESHOLD";
841  param.function = speed_mpn_mul_n;
842  param.min_size[0] = MAX (4, MPN_KARA_MUL_N_MINSIZE);
843  param.max_size[0] = MUL_TOOM3_THRESHOLD_LIMIT-1;
844  param.max_size[1] = MUL_TOOM3_THRESHOLD_LIMIT-1;
845  one (mul_threshold, 2, &param);
846
847  /* disabled until tuned */
848  MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
849}
850
851
852/* Start the basecase from 3, since 1 is a special case, and if mul_basecase
853   is faster only at size==2 then we don't want to bother with extra code
854   just for that.  Start karatsuba from 4 same as MUL above.  */
855void
856tune_sqr (void)
857{
858  static struct param_t  param;
859  param.name[0] = "SQR_BASECASE_THRESHOLD";
860  param.name[1] = "SQR_KARATSUBA_THRESHOLD";
861  param.name[2] = "SQR_TOOM3_THRESHOLD";
862  param.function = speed_mpn_sqr_n;
863  param.min_is_always = 1;
864  param.second_start_min = 1;
865  param.min_size[0] = 3;
866  param.min_size[1] = MAX (4, MPN_KARA_SQR_N_MINSIZE);
867  param.min_size[2] = MPN_TOOM3_SQR_N_MINSIZE;
868  param.max_size[0] = TUNE_SQR_KARATSUBA_MAX;
869  param.max_size[1] = TUNE_SQR_KARATSUBA_MAX;
870  one (sqr_threshold, 3, &param);
871
872  /* disabled until tuned */
873  SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
874}
875
876
877void
878tune_sb_preinv (void)
879{
880  static struct param_t  param;
881
882  if (UDIV_PREINV_ALWAYS)
883    {
884      print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always");
885      return;
886    }
887
888  param.check_size = 256;
889  param.min_size[0] = 3;
890  param.min_is_always = 1;
891  param.size_extra = 3;
892  param.stop_factor = 2.0;
893  param.name[0] = "DIV_SB_PREINV_THRESHOLD";
894  param.function = speed_mpn_sb_divrem_m3;
895  one (sb_preinv_threshold, 1, &param);
896}
897
898
899void
900tune_dc (void)
901{
902  static struct param_t  param;
903  param.name[0] = "DIV_DC_THRESHOLD";
904  param.function = speed_mpn_dc_tdiv_qr;
905  one (dc_threshold, 1, &param);
906}
907
908
909/* This is an indirect determination, based on a comparison between redc and
910   mpz_mod.  A fudge factor of 1.04 is applied to redc, to represent
911   additional overheads it gets in mpz_powm.
912
913   stop_factor is 1.1 to hopefully help cray vector systems, where otherwise
914   currently it hits the 1000 limb limit with only a factor of about 1.18
915   (threshold should be around 650).  */
916
917void
918tune_powm (void)
919{
920  static struct param_t  param;
921  param.name[0] = "POWM_THRESHOLD";
922  param.function = speed_redc;
923  param.function2 = speed_mpz_mod;
924  param.step_factor = 0.03;
925  param.stop_factor = 1.1;
926  param.function_fudge = 1.04;
927  one (powm_threshold, 1, &param);
928}
929
930
931void
932tune_gcd_accel (void)
933{
934  static struct param_t  param;
935  param.name[0] = "GCD_ACCEL_THRESHOLD";
936  param.function = speed_mpn_gcd;
937  param.min_size[0] = 1;
938  one (gcd_accel_threshold, 1, &param);
939}
940
941
942
943/* A comparison between the speed of a single limb step and a double limb
944   step is made.  On a 32-bit limb the ratio is about 2.2 single steps to
945   equal a double step, or on a 64-bit limb about 2.09.  (These were found
946   from counting the steps on a 10000 limb gcdext.  */
947void
948tune_gcdext (void)
949{
950  static struct param_t  param;
951  param.name[0] = "GCDEXT_THRESHOLD";
952  param.function = speed_mpn_gcdext_one_single;
953  param.function2 = speed_mpn_gcdext_one_double;
954  switch (BITS_PER_MP_LIMB) {
955  case 32: param.function_fudge = 2.2; break;
956  case 64: param.function_fudge = 2.09; break;
957  default:
958    printf ("Don't know GCDEXT_THERSHOLD factor for BITS_PER_MP_LIMB == %d\n",
959            BITS_PER_MP_LIMB);
960    abort ();
961  }
962  param.min_size[0] = 5;
963  param.min_is_always = 1;
964  param.max_size[0] = 300;
965  param.check_size = 300;
966  one (gcdext_threshold, 1, &param);
967}
968
969
970/* size_extra==1 reflects the fact that with high<divisor one division is
971   always skipped.  Forcing high<divisor while testing ensures consistency
972   while stepping through sizes, ie. that size-1 divides will be done each
973   time.
974
975   min_size==2 and min_is_always are used so that if plain division is only
976   better at size==1 then don't bother including that code just for that
977   case, instead go with preinv always and get a size saving.  */
978
979#define DIV_1_PARAMS                    \
980  param.check_size = 256;               \
981  param.min_size[0] = 2;                   \
982  param.min_is_always = 1;              \
983  param.data_high = DATA_HIGH_LT_R;     \
984  param.size_extra = 1;                 \
985  param.stop_factor = 2.0;
986
987
988double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
989
990void
991tune_divrem_1 (void)
992{
993  /* plain version by default */
994  tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
995
996#ifndef HAVE_NATIVE_mpn_divrem_1
997#define HAVE_NATIVE_mpn_divrem_1 0
998#endif
999
1000  /* No support for tuning native assembler code, do that by hand and put
1001     the results in the .asm file, there's no need for such thresholds to
1002     appear in gmp-mparam.h.  */
1003  if (HAVE_NATIVE_mpn_divrem_1)
1004    return;
1005
1006  if (UDIV_PREINV_ALWAYS)
1007    {
1008      print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
1009      print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
1010      return;
1011    }
1012
1013  tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1014
1015  /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
1016     a bit out for the fractional part, but that's too bad, the integer part
1017     is more important. */
1018  {
1019    static struct param_t  param;
1020    param.name[0] = "DIVREM_1_NORM_THRESHOLD";
1021    DIV_1_PARAMS;
1022    s.r = randlimb_norm ();
1023    param.function = speed_mpn_divrem_1_tune;
1024    one (divrem_1_norm_threshold, 1, &param);
1025  }
1026  {
1027    static struct param_t  param;
1028    param.name[0] = "DIVREM_1_UNNORM_THRESHOLD";
1029    DIV_1_PARAMS;
1030    s.r = randlimb_half ();
1031    param.function = speed_mpn_divrem_1_tune;
1032    one (divrem_1_unnorm_threshold, 1, &param);
1033  }
1034}
1035
1036
1037double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
1038
1039void
1040tune_mod_1 (void)
1041{
1042  /* plain version by default */
1043  tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
1044
1045#ifndef HAVE_NATIVE_mpn_mod_1
1046#define HAVE_NATIVE_mpn_mod_1 0
1047#endif
1048
1049  /* No support for tuning native assembler code, do that by hand and put
1050     the results in the .asm file, there's no need for such thresholds to
1051     appear in gmp-mparam.h.  */
1052  if (HAVE_NATIVE_mpn_mod_1)
1053    return;
1054
1055  if (UDIV_PREINV_ALWAYS)
1056    {
1057      print_define ("MOD_1_NORM_THRESHOLD", 0L);
1058      print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
1059      return;
1060    }
1061
1062  tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
1063
1064  {
1065    static struct param_t  param;
1066    param.name[0] = "MOD_1_NORM_THRESHOLD";
1067    DIV_1_PARAMS;
1068    s.r = randlimb_norm ();
1069    param.function = speed_mpn_mod_1_tune;
1070    one (mod_1_norm_threshold, 1, &param);
1071  }
1072  {
1073    static struct param_t  param;
1074    param.name[0] = "MOD_1_UNNORM_THRESHOLD";
1075    DIV_1_PARAMS;
1076    s.r = randlimb_half ();
1077    param.function = speed_mpn_mod_1_tune;
1078    one (mod_1_unnorm_threshold, 1, &param);
1079  }
1080}
1081
1082
1083/* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
1084   imply that udiv_qrnnd_preinv is worth using, but it seems most
1085   straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
1086   directly.  */
1087
1088void
1089tune_preinv_divrem_1 (void)
1090{
1091  static struct param_t  param;
1092  speed_function_t  divrem_1;
1093  const char        *divrem_1_name;
1094  double            t1, t2;
1095
1096#ifndef HAVE_NATIVE_mpn_preinv_divrem_1
1097#define HAVE_NATIVE_mpn_preinv_divrem_1 0
1098#endif
1099
1100  /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
1101     it's faster than mpn_divrem_1.  */
1102  if (HAVE_NATIVE_mpn_preinv_divrem_1)
1103    {
1104      print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
1105      return;
1106    }
1107
1108  /* If udiv_qrnnd_preinv is the only division method then of course
1109     mpn_preinv_divrem_1 should be used.  */
1110  if (UDIV_PREINV_ALWAYS)
1111    {
1112      print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
1113      return;
1114    }
1115
1116  /* If we've got an assembler version of mpn_divrem_1, then compare against
1117     that, not the mpn_divrem_1_div generic C.  */
1118  if (HAVE_NATIVE_mpn_divrem_1)
1119    {
1120      divrem_1 = speed_mpn_divrem_1;
1121      divrem_1_name = "mpn_divrem_1";
1122    }
1123  else
1124    {
1125      divrem_1 = speed_mpn_divrem_1_div;
1126      divrem_1_name = "mpn_divrem_1_div";
1127    }
1128
1129  param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
1130  s.size = 200;                     /* generous but not too big */
1131  /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
1132     since in general that's probably most common, though in fact for a
1133     64-bit limb mp_bases[10].big_base is normalized.  */
1134  s.r = urandom() & (MP_LIMB_T_MAX >> 4);
1135  if (s.r == 0) s.r = 123;
1136
1137  t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
1138  t2 = tuneup_measure (divrem_1, &param, &s);
1139  if (t1 == -1.0 || t2 == -1.0)
1140    {
1141      printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
1142              divrem_1_name, s.size);
1143      abort ();
1144    }
1145  if (option_trace >= 1)
1146    printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
1147            s.size, t1, divrem_1_name, t2);
1148
1149  print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
1150}
1151
1152
1153/* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
1154   that udiv_qrnnd_preinv is worth using, but it seems most straightforward
1155   to compare mpn_preinv_mod_1 and mpn_mod_1_div directly.  */
1156
1157void
1158tune_preinv_mod_1 (void)
1159{
1160  static struct param_t  param;
1161  speed_function_t  mod_1;
1162  const char        *mod_1_name;
1163  double            t1, t2;
1164
1165#ifndef HAVE_NATIVE_mpn_preinv_mod_1
1166#define HAVE_NATIVE_mpn_preinv_mod_1 0
1167#endif
1168
1169  /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
1170     faster than mpn_mod_1.  */
1171  if (HAVE_NATIVE_mpn_preinv_mod_1)
1172    {
1173      print_define_remark ("USE_PREINV_MOD_1", 1, "native");
1174      return;
1175    }
1176
1177  /* If udiv_qrnnd_preinv is the only division method then of course
1178     mpn_preinv_mod_1 should be used.  */
1179  if (UDIV_PREINV_ALWAYS)
1180    {
1181      print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
1182      return;
1183    }
1184
1185  /* If we've got an assembler version of mpn_mod_1, then compare against
1186     that, not the mpn_mod_1_div generic C.  */
1187  if (HAVE_NATIVE_mpn_mod_1)
1188    {
1189      mod_1 = speed_mpn_mod_1;
1190      mod_1_name = "mpn_mod_1";
1191    }
1192  else
1193    {
1194      mod_1 = speed_mpn_mod_1_div;
1195      mod_1_name = "mpn_mod_1_div";
1196    }
1197
1198  param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
1199  s.size = 200;                     /* generous but not too big */
1200  s.r = randlimb_norm();            /* divisor */
1201
1202  t1 = tuneup_measure (speed_mpn_preinv_mod_1, &param, &s);
1203  t2 = tuneup_measure (mod_1, &param, &s);
1204  if (t1 == -1.0 || t2 == -1.0)
1205    {
1206      printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
1207              mod_1_name, s.size);
1208      abort ();
1209    }
1210  if (option_trace >= 1)
1211    printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
1212            s.size, t1, mod_1_name, t2);
1213
1214  print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
1215}
1216
1217
1218void
1219tune_divrem_2 (void)
1220{
1221  static struct param_t  param;
1222
1223#ifndef HAVE_NATIVE_mpn_divrem_2
1224#define HAVE_NATIVE_mpn_divrem_2 0
1225#endif
1226
1227  /* No support for tuning native assembler code, do that by hand and put
1228     the results in the .asm file, and there's no need for such thresholds
1229     to appear in gmp-mparam.h.  */
1230  if (HAVE_NATIVE_mpn_divrem_2)
1231    return;
1232
1233  if (UDIV_PREINV_ALWAYS)
1234    {
1235      print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
1236      return;
1237    }
1238
1239  /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
1240     a bit out for the fractional part, but that's too bad, the integer part
1241     is more important.
1242
1243     min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
1244     code space if plain division is better only at size==2 or size==3. */
1245  param.name[0] = "DIVREM_2_THRESHOLD";
1246  param.check_size = 256;
1247  param.min_size[0] = 4;
1248  param.min_is_always = 1;
1249  param.size_extra = 2;      /* does qsize==nsize-2 divisions */
1250  param.stop_factor = 2.0;
1251
1252  s.r = randlimb_norm ();
1253  param.function = speed_mpn_divrem_2;
1254  one (divrem_2_threshold, 1, &param);
1255}
1256
1257
1258/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
1259   tune for that.  Its speed can differ on odd or even divisor, so take an
1260   average threshold for the two.
1261
1262   mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
1263   might not vary that way, but don't test this since high<divisor isn't
1264   expected to occur often with small divisors.  */
1265
1266void
1267tune_divexact_1 (void)
1268{
1269  static struct param_t  param;
1270  mp_size_t  thresh[2], average;
1271  int        low, i;
1272
1273  ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
1274
1275  param.name[0] = "DIVEXACT_1_THRESHOLD";
1276  param.data_high = DATA_HIGH_GE_R;
1277  param.check_size = 256;
1278  param.min_size[0] = 2;
1279  param.stop_factor = 1.5;
1280  param.function  = tuned_speed_mpn_divrem_1;
1281  param.function2 = speed_mpn_divexact_1;
1282  param.noprint = 1;
1283
1284  print_define_start (param.name[0]);
1285
1286  for (low = 0; low <= 1; low++)
1287    {
1288      s.r = randlimb_half();
1289      if (low == 0)
1290        s.r |= 1;
1291      else
1292        s.r &= ~CNST_LIMB(7);
1293
1294      one (divexact_1_threshold, 1, &param);
1295      if (option_trace)
1296        printf ("low=%d thresh %ld\n", low, divexact_1_threshold[0]);
1297
1298      if (divexact_1_threshold[0] == MP_SIZE_T_MAX)
1299        {
1300          average = MP_SIZE_T_MAX;
1301          goto divexact_1_done;
1302        }
1303
1304      thresh[low] = divexact_1_threshold[0];
1305    }
1306
1307  if (option_trace)
1308    {
1309      printf ("average of:");
1310      for (i = 0; i < numberof(thresh); i++)
1311        printf (" %ld", thresh[i]);
1312      printf ("\n");
1313    }
1314
1315  average = 0;
1316  for (i = 0; i < numberof(thresh); i++)
1317    average += thresh[i];
1318  average /= numberof(thresh);
1319
1320  /* If divexact turns out to be better as early as 3 limbs, then use it
1321     always, so as to reduce code size and conditional jumps.  */
1322  if (average <= 3)
1323    average = 0;
1324
1325 divexact_1_done:
1326  print_define_end (param.name[0], average);
1327}
1328
1329
1330/* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
1331   same as mpn_mod_1, but this might not be true of an assembler
1332   implementation.  The threshold used is an average based on data where a
1333   divide can be skipped and where it can't.
1334
1335   If modexact turns out to be better as early as 3 limbs, then use it
1336   always, so as to reduce code size and conditional jumps.  */
1337
1338void
1339tune_modexact_1_odd (void)
1340{
1341  static struct param_t  param;
1342  mp_size_t  thresh_lt;
1343
1344  ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
1345
1346  param.name[0] = "MODEXACT_1_ODD_THRESHOLD";
1347  param.check_size = 256;
1348  param.min_size[0] = 2;
1349  param.stop_factor = 1.5;
1350  param.function  = tuned_speed_mpn_mod_1;
1351  param.function2 = speed_mpn_modexact_1c_odd;
1352  param.noprint = 1;
1353  s.r = randlimb_half () | 1;
1354
1355  print_define_start (param.name[0]);
1356
1357  param.data_high = DATA_HIGH_LT_R;
1358  one (modexact_1_odd_threshold, 1, &param);
1359  if (option_trace)
1360    printf ("lt thresh %ld\n", modexact_1_odd_threshold[0]);
1361
1362  thresh_lt = modexact_1_odd_threshold[0];
1363  if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
1364    {
1365      param.data_high = DATA_HIGH_GE_R;
1366      one (modexact_1_odd_threshold, 1, &param);
1367      if (option_trace)
1368        printf ("ge thresh %ld\n", modexact_1_odd_threshold[0]);
1369
1370      if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
1371        {
1372          modexact_1_odd_threshold[0]
1373            = (modexact_1_odd_threshold[0] + thresh_lt) / 2;
1374          if (modexact_1_odd_threshold[0] <= 3)
1375            modexact_1_odd_threshold[0] = 0;
1376        }
1377    }
1378
1379  print_define_end (param.name[0], modexact_1_odd_threshold[0]);
1380}
1381
1382
1383void
1384tune_jacobi_base (void)
1385{
1386  static struct param_t  param;
1387  double   t1, t2, t3;
1388  int      method;
1389
1390  s.size = BITS_PER_MP_LIMB * 3 / 4;
1391
1392  t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
1393  if (option_trace >= 1)
1394    printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", s.size, t1);
1395
1396  t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
1397  if (option_trace >= 1)
1398    printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", s.size, t2);
1399
1400  t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
1401  if (option_trace >= 1)
1402    printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", s.size, t3);
1403
1404  if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
1405    {
1406      printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
1407              s.size);
1408      abort ();
1409    }
1410
1411  if (t1 < t2 && t1 < t3)
1412    method = 1;
1413  else if (t2 < t3)
1414    method = 2;
1415  else
1416    method = 3;
1417
1418  print_define ("JACOBI_BASE_METHOD", method);
1419}
1420
1421
1422void
1423tune_get_str (void)
1424{
1425  /* Tune for decimal, it being most common.  Some rough testing suggests
1426     other bases are different, but not by very much.  */
1427  s.r = 10;
1428  {
1429    static struct param_t  param;
1430    get_str_precompute_threshold[0] = 0;
1431    param.name[0] = "GET_STR_DC_THRESHOLD";
1432    param.function = speed_mpn_get_str;
1433    param.min_size[0] = 2;
1434    param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
1435    one (get_str_basecase_threshold, 1, &param);
1436  }
1437  {
1438    static struct param_t  param;
1439    param.name[0] = "GET_STR_PRECOMPUTE_THRESHOLD";
1440    param.function = speed_mpn_get_str;
1441    param.min_size[0] = get_str_basecase_threshold[0];
1442    param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
1443    one (get_str_precompute_threshold, 1, &param);
1444  }
1445}
1446
1447
1448void
1449tune_set_str (void)
1450{
1451  static struct param_t  param;
1452
1453  s.r = 10;  /* decimal */
1454  param.step_factor = 0.04;
1455  param.name[0] = "SET_STR_THRESHOLD";
1456  param.function = speed_mpn_set_str_basecase;
1457  param.function2 = speed_mpn_set_str_subquad;
1458  param.min_size[0] = 100;
1459  param.max_size[0] = 150000;
1460  one (set_str_threshold, 1, &param);
1461}
1462
1463
1464void
1465tune_fft_mul (void)
1466{
1467  static struct fft_param_t  param;
1468
1469  if (option_fft_max_size == 0)
1470    return;
1471
1472  param.table_name          = "MUL_FFT_TABLE";
1473  param.threshold_name      = "MUL_FFT_THRESHOLD";
1474  param.p_threshold         = &MUL_FFT_THRESHOLD;
1475  param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
1476  param.p_modf_threshold    = &MUL_FFT_MODF_THRESHOLD;
1477  param.first_size          = MUL_TOOM3_THRESHOLD / 2;
1478  param.max_size            = option_fft_max_size;
1479  param.function            = speed_mpn_mul_fft;
1480  param.mul_function        = speed_mpn_mul_n;
1481  param.sqr = 0;
1482  fft (&param);
1483}
1484
1485
1486void
1487tune_fft_sqr (void)
1488{
1489  static struct fft_param_t  param;
1490
1491  if (option_fft_max_size == 0)
1492    return;
1493
1494  param.table_name          = "SQR_FFT_TABLE";
1495  param.threshold_name      = "SQR_FFT_THRESHOLD";
1496  param.p_threshold         = &SQR_FFT_THRESHOLD;
1497  param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
1498  param.p_modf_threshold    = &SQR_FFT_MODF_THRESHOLD;
1499  param.first_size          = SQR_TOOM3_THRESHOLD / 2;
1500  param.max_size            = option_fft_max_size;
1501  param.function            = speed_mpn_mul_fft_sqr;
1502  param.mul_function        = speed_mpn_sqr_n;
1503  param.sqr = 0;
1504  fft (&param);
1505}
1506
1507
1508void
1509all (void)
1510{
1511  time_t  start_time, end_time;
1512  TMP_DECL (marker);
1513
1514  TMP_MARK (marker);
1515  s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
1516  s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
1517
1518  mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
1519  mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
1520
1521  fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
1522
1523  speed_time_init ();
1524  fprintf (stderr, "Using: %s\n", speed_time_string);
1525
1526  fprintf (stderr, "speed_precision %d", speed_precision);
1527  if (speed_unittime == 1.0)
1528    fprintf (stderr, ", speed_unittime 1 cycle");
1529  else
1530    fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
1531  if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
1532    fprintf (stderr, ", CPU freq unknown\n");
1533  else
1534    fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
1535
1536  fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
1537           DEFAULT_MAX_SIZE, option_fft_max_size);
1538  fprintf (stderr, "\n");
1539
1540  time (&start_time);
1541  {
1542    struct tm  *tp;
1543    tp = localtime (&start_time);
1544    printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
1545            tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
1546
1547#ifdef __GNUC__
1548    /* gcc sub-minor version doesn't seem to come through as a define */
1549    printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
1550#define PRINTED_COMPILER
1551#endif
1552#if defined (__SUNPRO_C)
1553    printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
1554#define PRINTED_COMPILER
1555#endif
1556#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
1557    /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
1558    printf ("MIPSpro C %d.%d.%d */\n",
1559            _COMPILER_VERSION / 100,
1560            _COMPILER_VERSION / 10 % 10,
1561            _COMPILER_VERSION % 10);
1562#define PRINTED_COMPILER
1563#endif
1564#if defined (__DECC) && defined (__DECC_VER)
1565    printf ("DEC C %d */\n", __DECC_VER);
1566#define PRINTED_COMPILER
1567#endif
1568#if ! defined (PRINTED_COMPILER)
1569    printf ("system compiler */\n");
1570#endif
1571  }
1572  printf ("\n");
1573
1574  tune_mul ();
1575  printf("\n");
1576
1577  tune_sqr ();
1578  printf("\n");
1579
1580  tune_sb_preinv ();
1581  tune_dc ();
1582  tune_powm ();
1583  printf("\n");
1584
1585  tune_gcd_accel ();
1586  tune_gcdext ();
1587  tune_jacobi_base ();
1588  printf("\n");
1589
1590  tune_divrem_1 ();
1591  tune_mod_1 ();
1592  tune_preinv_divrem_1 ();
1593  tune_preinv_mod_1 ();
1594  tune_divrem_2 ();
1595  tune_divexact_1 ();
1596  tune_modexact_1_odd ();
1597  printf("\n");
1598
1599  tune_get_str ();
1600  tune_set_str ();
1601  printf("\n");
1602
1603  tune_fft_mul ();
1604  printf("\n");
1605
1606  tune_fft_sqr ();
1607  printf ("\n");
1608
1609  time (&end_time);
1610  printf ("/* Tuneup completed successfully, took %ld seconds */\n",
1611          end_time - start_time);
1612
1613  TMP_FREE (marker);
1614}
1615
1616
1617int
1618main (int argc, char *argv[])
1619{
1620  int  opt;
1621
1622  /* Unbuffered so if output is redirected to a file it isn't lost if the
1623     program is killed part way through.  */
1624  setbuf (stdout, NULL);
1625  setbuf (stderr, NULL);
1626
1627  while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
1628    {
1629      switch (opt) {
1630      case 'f':
1631        if (optarg[0] == 't')
1632          option_fft_trace = 2;
1633        else
1634          option_fft_max_size = atol (optarg);
1635        break;
1636      case 'o':
1637        speed_option_set (optarg);
1638        break;
1639      case 'p':
1640        speed_precision = atoi (optarg);
1641        break;
1642      case 't':
1643        option_trace++;
1644        break;
1645      case '?':
1646        exit(1);
1647      }
1648    }
1649
1650  all ();
1651  exit (0);
1652}
Note: See TracBrowser for help on using the repository browser.