source: trunk/third/gmp/tune/common.c @ 18191

Revision 18191, 45.1 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18190, which included commits to RCS files with non-trunk default branches.
Line 
1/* Shared speed subroutines.
2
3Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
4
5This file is part of the GNU MP Library.
6
7The GNU MP Library is free software; you can redistribute it and/or modify
8it under the terms of the GNU Lesser General Public License as published by
9the Free Software Foundation; either version 2.1 of the License, or (at your
10option) any later version.
11
12The GNU MP Library is distributed in the hope that it will be useful, but
13WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15License for more details.
16
17You should have received a copy of the GNU Lesser General Public License
18along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20MA 02111-1307, USA. */
21
22#include <errno.h>
23#include <fcntl.h>
24#include <math.h>
25#include <stdio.h>
26#include <stdlib.h> /* for qsort */
27#include <string.h>
28#include <unistd.h>
29#if 0
30#include <sys/ioctl.h>
31#endif
32
33#include "gmp.h"
34#include "gmp-impl.h"
35#include "longlong.h"
36
37#include "tests.h"
38#include "speed.h"
39
40
41int   speed_option_addrs = 0;
42int   speed_option_verbose = 0;
43
44
45/* Provide __clz_tab even if it's not required, for the benefit of new code
46   being tested with many.pl. */
47#ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
48#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
49#include "mp_clz_tab.c"
50#undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
51#endif
52
53
54void
55pentium_wbinvd(void)
56{
57#if 0
58  {
59    static int  fd = -2;
60
61    if (fd == -2)
62      {
63        fd = open ("/dev/wbinvd", O_RDWR);
64        if (fd == -1)
65          perror ("open /dev/wbinvd");
66      }
67
68    if (fd != -1)
69      ioctl (fd, 0, 0);
70  }
71#endif
72
73#if 0
74#define WBINVDSIZE  1024*1024*2
75  {
76    static char  *p = NULL;
77    int   i, sum;
78
79    if (p == NULL)
80      p = malloc (WBINVDSIZE);
81
82#if 0
83    for (i = 0; i < WBINVDSIZE; i++)
84      p[i] = i & 0xFF;
85#endif
86
87    sum = 0;
88    for (i = 0; i < WBINVDSIZE; i++)
89      sum += p[i];
90
91    mpn_cache_fill_dummy (sum);
92  }
93#endif
94}
95
96
97int
98double_cmp_ptr (const double *p, const double *q)
99{
100  if (*p > *q)  return 1;
101  if (*p < *q)  return -1;
102  return 0;
103}
104
105
106/* Measure the speed of a given routine.
107
108   The routine is run with enough repetitions to make it take at least
109   speed_precision * speed_unittime.  This aims to minimize the effects of a
110   limited accuracy time base and the overhead of the measuring itself.
111
112   Measurements are made looking for 4 results within TOLERANCE of each
113   other (or 3 for routines taking longer than 2 seconds).  This aims to get
114   an accurate reading even if some runs are bloated by interrupts or task
115   switches or whatever.
116
117   The given (*fun)() is expected to run its function "s->reps" many times
118   and return the total elapsed time measured using speed_starttime() and
119   speed_endtime().  If the function doesn't support the given s->size or
120   s->r, -1.0 should be returned.  See the various base routines below.  */
121
122double
123speed_measure (double (*fun) _PROTO ((struct speed_params *s)),
124               struct speed_params *s)
125{
126#define TOLERANCE    1.005  /* 0.5% */
127
128  struct speed_params  s_dummy;
129  int     i, j, e;
130  double  t[30];
131  double  t_unsorted[30];
132  double  reps_d;
133
134  /* Use dummy parameters if caller doesn't provide any.  Only a few special
135     "fun"s will cope with this, speed_noop() is one.  */
136  if (s == NULL)
137    {
138      memset (&s_dummy, '\0', sizeof (s_dummy));
139      s = &s_dummy;
140    }
141 
142  s->reps = 1;
143  s->time_divisor = 1.0;
144  for (i = 0; i < numberof (t); i++)
145    {
146      for (;;)
147        {
148          s->src_num = 0;
149          s->dst_num = 0;
150
151          t[i] = (*fun) (s);
152
153          if (speed_option_verbose >= 3)
154            printf("size=%ld reps=%u r=%ld attempt=%d  %.9f\n",
155                   s->size, s->reps, s->r, i, t[i]);
156
157          if (t[i] == -1.0)
158            return -1.0;
159
160          if (t[i] >= speed_unittime * speed_precision)
161            break;
162
163          /* go to a value of reps to make t[i] >= precision */
164          reps_d = ceil (1.1 * s->reps
165                         * speed_unittime * speed_precision
166                         / MAX (t[i], speed_unittime));
167          if (reps_d > 2e9 || reps_d < 1.0)
168            {
169              fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d);
170              fprintf (stderr, "  (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n",
171                       s->reps, speed_unittime, speed_precision, t[i]);
172              abort ();
173            }
174          s->reps = (unsigned) reps_d;
175        }
176      t[i] /= s->reps;
177      t_unsorted[i] = t[i];
178
179      if (speed_precision == 0)
180        return t[i];
181
182      /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
183      if (t[0] >= 2.0)
184        e = 3;
185      else
186        e = 4;
187
188      /* Look for e many t[]'s within TOLERANCE of each other to consider a
189         valid measurement.  Return smallest among them.  */
190      if (i >= e)
191        {
192          qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
193          for (j = e-1; j < i; j++)
194            if (t[j] <= t[j-e+1] * TOLERANCE)
195              return t[j-e+1] / s->time_divisor;
196        }
197    }
198
199  fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
200           e, (TOLERANCE-1.0)*100.0);
201  fprintf (stderr, "    unsorted         sorted\n");
202  fprintf (stderr, "  %.12f    %.12f    is about 0.5%%\n",
203           t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0));
204  for (i = 0; i < numberof (t); i++)
205    fprintf (stderr, "  %.09f       %.09f\n", t_unsorted[i], t[i]);
206
207  return -1.0;
208}
209
210
211/* Read all of ptr,size to get it into the CPU memory cache.
212 
213   A call to mpn_cache_fill_dummy() is used to make sure the compiler
214   doesn't optimize away the whole loop.  Using "volatile mp_limb_t sum"
215   would work too, but the function call means we don't rely on every
216   compiler actually implementing volatile properly.
217
218   mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
219   it can inline it.  */
220
221void
222mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
223{
224  mp_limb_t  sum = 0;
225  mp_size_t  i;
226
227  for (i = 0; i < size; i++)
228    sum += ptr[i];
229
230  mpn_cache_fill_dummy(sum);
231}
232
233
234void
235mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
236{
237  mpn_cache_fill (ptr, size);
238
239#if 0
240  mpn_random (ptr, size);
241#endif
242
243#if 0
244  mp_size_t  i;
245
246  for (i = 0; i < size; i++)
247    ptr[i] = i;
248#endif
249}
250
251
252void
253speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
254{
255  if (s->src_num >= numberof (s->src))
256    {
257      fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
258      abort ();
259    }
260  s->src[s->src_num].ptr = ptr;
261  s->src[s->src_num].size = size;
262  s->src_num++;
263}
264
265
266void
267speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
268{
269  if (s->dst_num >= numberof (s->dst))
270    {
271      fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
272      abort ();
273    }
274  s->dst[s->dst_num].ptr = ptr;
275  s->dst[s->dst_num].size = size;
276  s->dst_num++;
277}
278
279
280void
281speed_cache_fill (struct speed_params *s)
282{
283  static struct speed_params  prev;
284  int  i;
285
286  /* FIXME: need a better way to get the format string for a pointer */
287
288  if (speed_option_addrs)
289    {
290      int  different;
291
292      different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
293      for (i = 0; i < s->dst_num; i++)
294        different |= (s->dst[i].ptr != prev.dst[i].ptr);
295      for (i = 0; i < s->src_num; i++)
296        different |= (s->src[i].ptr != prev.src[i].ptr);
297
298      if (different)
299        {
300          if (s->dst_num != 0)
301            {
302              printf ("dst");
303              for (i = 0; i < s->dst_num; i++)
304                printf (" %08lX", (unsigned long) s->dst[i].ptr);
305              printf (" ");
306            }
307
308          if (s->src_num != 0)
309            {
310              printf ("src");
311              for (i = 0; i < s->src_num; i++)
312                printf (" %08lX", (unsigned long) s->src[i].ptr);
313              printf (" ");
314            }
315          printf ("  (cf sp approx %08lX)\n", (unsigned long) &different);
316
317        }
318
319      memcpy (&prev, s, sizeof(prev));
320    }
321
322  switch (s->cache) {
323  case 0:
324    for (i = 0; i < s->dst_num; i++)
325      mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
326    for (i = 0; i < s->src_num; i++)
327      mpn_cache_fill (s->src[i].ptr, s->src[i].size);
328    break;
329  case 1:
330    pentium_wbinvd();
331    break;
332  }
333}
334
335
336/* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs.  ptr
337   needs to have room for up to CACHE_LINE_SIZE-4 extra bytes.  */
338
339mp_ptr
340speed_tmp_alloc_adjust (void *ptr, mp_size_t align)
341{
342  /*
343  printf("%p %ld -> %p %X %X\n", ptr, align,
344         (mp_ptr) ptr
345         + ((align - ((mp_size_t) ptr >> 2)) &
346            SPEED_TMP_ALLOC_ADJUST_MASK),
347         ((mp_size_t) ptr >> 2) & SPEED_TMP_ALLOC_ADJUST_MASK,
348         SPEED_TMP_ALLOC_ADJUST_MASK);
349  */
350
351  return (mp_ptr) ptr
352    + ((align - ((mp_size_t) ptr >> 2)) & SPEED_TMP_ALLOC_ADJUST_MASK);
353}
354
355
356/* Miscellanous options accepted by tune and speed programs under -o. */
357
358void
359speed_option_set (const char *s)
360{
361  int  n;
362
363  if (strcmp (s, "addrs") == 0)
364    {
365      speed_option_addrs = 1;
366    }
367  else if (strcmp (s, "verbose") == 0)
368    {
369      speed_option_verbose++;
370    }
371  else if (sscanf (s, "verbose=%d", &n) == 1)
372    {
373      speed_option_verbose = n;
374    }
375  else
376    {
377      printf ("Unrecognised -o option: %s\n", s);
378      exit (1);
379    }
380}
381
382
383/* The following are basic speed running routines for various gmp functions.
384   Many are very similar and use speed.h macros.
385
386   Each routine allocates it's own destination space for the result of the
387   function, because only it can know what the function needs.
388
389   speed_starttime() and speed_endtime() are put tight around the code to be
390   measured.  Any setups are done outside the timed portion.
391
392   Each routine is responsible for its own cache priming.
393   speed_cache_fill() is a good way to do this, see examples in speed.h.
394   One cache priming possibility, for CPUs with write-allocate cache, and
395   functions that don't take too long, is to do one dummy call before timing
396   so as to cache everything that gets used.  But speed_measure() runs a
397   routine at least twice and will take the smaller time, so this might not
398   be necessary.
399
400   Data alignment will be important, for source, destination and temporary
401   workspace.  A routine can align its destination and workspace.  Programs
402   using the routines will ensure s->xp and s->yp are aligned.  Aligning
403   onto a CACHE_LINE_SIZE boundary is suggested.  s->align_wp and
404   s->align_wp2 should be respected where it makes sense to do so.
405   SPEED_TMP_ALLOC_LIMBS is a good way to do this.
406
407   A loop of the following form can be expected to turn into good assembler
408   code on most CPUs, thereby minimizing overhead in the measurement.  It
409   can always be assumed s->reps >= 1.
410
411          i = s->reps
412          do
413            foo();
414          while (--i != 0);
415
416   Additional parameters might be added to "struct speed_params" in the
417   future.  Routines should ignore anything they don't use.
418
419   s->size can be used creatively, and s->xp and s->yp can be ignored.  For
420   example, speed_mpz_fac_ui() uses s->size as n for the factorial.  s->r is
421   just a user-supplied parameter.  speed_mpn_lshift() uses it as a shift,
422   speed_mpn_mul_1() uses it as a multiplier.  */
423
424
425/* MPN_COPY etc can be macros, so the _CALL forms are necessary */
426double
427speed_MPN_COPY (struct speed_params *s)
428{
429  SPEED_ROUTINE_MPN_COPY (MPN_COPY);
430}
431double
432speed_MPN_COPY_INCR (struct speed_params *s)
433{
434  SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR);
435}
436double
437speed_MPN_COPY_DECR (struct speed_params *s)
438{
439  SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR);
440}
441#if HAVE_NATIVE_mpn_copyi
442double
443speed_mpn_copyi (struct speed_params *s)
444{
445  SPEED_ROUTINE_MPN_COPY (mpn_copyi);
446}
447#endif
448#if HAVE_NATIVE_mpn_copyd
449double
450speed_mpn_copyd (struct speed_params *s)
451{
452  SPEED_ROUTINE_MPN_COPY (mpn_copyd);
453}
454#endif
455double
456speed_memcpy (struct speed_params *s)
457{
458  SPEED_ROUTINE_MPN_COPY_BYTES (memcpy);
459}
460double
461speed_mpn_com_n (struct speed_params *s)
462{
463  SPEED_ROUTINE_MPN_COPY (mpn_com_n);
464}
465
466
467double
468speed_mpn_addmul_1 (struct speed_params *s)
469{
470  SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
471}
472double
473speed_mpn_submul_1 (struct speed_params *s)
474{
475  SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
476}
477
478
479double
480speed_mpn_mul_1 (struct speed_params *s)
481{
482  SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
483}
484double
485speed_mpn_mul_1_inplace (struct speed_params *s)
486{
487  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1);
488}
489
490#if HAVE_NATIVE_mpn_mul_2
491double
492speed_mpn_mul_2 (struct speed_params *s)
493{
494  SPEED_ROUTINE_MPN_MUL_2 (mpn_mul_2);
495}
496#endif
497
498
499double
500speed_mpn_lshift (struct speed_params *s)
501{
502  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
503}
504double
505speed_mpn_rshift (struct speed_params *s)
506{
507  SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
508}
509
510
511/* The carry-in variants (if available) are good for measuring because they
512   won't skip a division if high<divisor.  Alternately, use -1 as a divisor
513   with the plain _1 forms. */
514double
515speed_mpn_divrem_1 (struct speed_params *s)
516{
517  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
518}
519double
520speed_mpn_divrem_1f (struct speed_params *s)
521{
522  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
523}
524#if HAVE_NATIVE_mpn_divrem_1c
525double
526speed_mpn_divrem_1c (struct speed_params *s)
527{
528  SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
529}
530double
531speed_mpn_divrem_1cf (struct speed_params *s)
532{
533  SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
534}
535#endif
536
537double
538speed_mpn_divrem_1_div (struct speed_params *s)
539{
540  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div);
541}
542double
543speed_mpn_divrem_1f_div (struct speed_params *s)
544{
545  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div);
546}
547double
548speed_mpn_divrem_1_inv (struct speed_params *s)
549{
550  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv);
551}
552double
553speed_mpn_divrem_1f_inv (struct speed_params *s)
554{
555  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv);
556}
557double
558speed_mpn_mod_1_div (struct speed_params *s)
559{
560  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div);
561}
562double
563speed_mpn_mod_1_inv (struct speed_params *s)
564{
565  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv);
566}
567
568double
569speed_mpn_preinv_divrem_1 (struct speed_params *s)
570{
571  SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1);
572}
573double
574speed_mpn_preinv_divrem_1f (struct speed_params *s)
575{
576  SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1);
577}
578
579double
580speed_mpn_mod_34lsub1 (struct speed_params *s)
581{
582  SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1);
583}
584
585double
586speed_mpn_divrem_2 (struct speed_params *s)
587{
588  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
589}
590double
591speed_mpn_divrem_2_div (struct speed_params *s)
592{
593  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div);
594}
595double
596speed_mpn_divrem_2_inv (struct speed_params *s)
597{
598  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv);
599}
600
601double
602speed_mpn_mod_1 (struct speed_params *s)
603{
604  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
605}
606#if HAVE_NATIVE_mpn_mod_1c
607double
608speed_mpn_mod_1c (struct speed_params *s)
609{
610  SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
611}
612#endif
613double
614speed_mpn_preinv_mod_1 (struct speed_params *s)
615{
616  SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1);
617}
618
619double
620speed_mpn_divexact_1 (struct speed_params *s)
621{
622  SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1);
623}
624
625double
626speed_mpn_divexact_by3 (struct speed_params *s)
627{
628  SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3);
629}
630
631#if HAVE_NATIVE_mpn_modexact_1_odd
632double
633speed_mpn_modexact_1_odd (struct speed_params *s)
634{
635  SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd);
636}
637#endif
638
639double
640speed_mpn_modexact_1c_odd (struct speed_params *s)
641{
642  SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd);
643}
644
645
646double
647speed_mpn_dc_tdiv_qr (struct speed_params *s)
648{
649  SPEED_ROUTINE_MPN_DC_TDIV_QR (mpn_tdiv_qr);
650}
651double
652speed_mpn_dc_divrem_n (struct speed_params *s)
653{
654  SPEED_ROUTINE_MPN_DC_DIVREM_N (mpn_dc_divrem_n);
655}
656double
657speed_mpn_dc_divrem_sb (struct speed_params *s)
658{
659  SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn);
660}
661double
662speed_mpn_dc_divrem_sb_div (struct speed_params *s)
663{
664  SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn_div);
665}
666double
667speed_mpn_dc_divrem_sb_inv (struct speed_params *s)
668{
669  SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn_inv);
670}
671
672double
673speed_mpn_sb_divrem_m3 (struct speed_params *s)
674{
675  SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn);
676}
677double
678speed_mpn_sb_divrem_m3_div (struct speed_params *s)
679{
680  SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_div);
681}
682double
683speed_mpn_sb_divrem_m3_inv (struct speed_params *s)
684{
685  SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_inv);
686}
687
688double
689speed_mpz_mod (struct speed_params *s)
690{
691  SPEED_ROUTINE_MPZ_MOD (mpz_mod);
692}
693double
694speed_redc (struct speed_params *s)
695{
696  SPEED_ROUTINE_REDC (redc);
697}
698
699
700double
701speed_mpn_popcount (struct speed_params *s)
702{
703  SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
704}
705double
706speed_mpn_hamdist (struct speed_params *s)
707{
708  SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
709}
710
711
712double
713speed_mpn_add_n (struct speed_params *s)
714{
715  SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
716}
717double
718speed_mpn_sub_n (struct speed_params *s)
719{
720SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
721}
722
723
724/* mpn_and_n etc can be macros and so have to be handled with
725   SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
726double
727speed_mpn_and_n (struct speed_params *s)
728{
729  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size));
730}
731double
732speed_mpn_andn_n (struct speed_params *s)
733{
734SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size));
735}
736double
737speed_mpn_nand_n (struct speed_params *s)
738{
739  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size));
740}
741double
742speed_mpn_ior_n (struct speed_params *s)
743{
744SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size));
745}
746double
747speed_mpn_iorn_n (struct speed_params *s)
748{
749  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size));
750}
751double
752speed_mpn_nior_n (struct speed_params *s)
753{
754  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size));
755}
756double
757speed_mpn_xor_n (struct speed_params *s)
758{
759  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size));
760}
761double
762speed_mpn_xnor_n (struct speed_params *s)
763{
764  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size));
765}
766
767
768double
769speed_mpn_mul_n (struct speed_params *s)
770{
771  SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
772}
773double
774speed_mpn_sqr_n (struct speed_params *s)
775{
776  SPEED_ROUTINE_MPN_SQR (mpn_sqr_n);
777}
778double
779speed_mpn_mul_n_sqr (struct speed_params *s)
780{
781  SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
782}
783
784double
785speed_mpn_mul_basecase (struct speed_params *s)
786{
787  SPEED_ROUTINE_MPN_MUL_BASECASE(mpn_mul_basecase);
788}
789double
790speed_mpn_sqr_basecase (struct speed_params *s)
791{
792  /* FIXME: size restrictions on some versions of sqr_basecase */
793  SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
794}
795
796#if HAVE_NATIVE_mpn_sqr_diagonal
797double
798speed_mpn_sqr_diagonal (struct speed_params *s)
799{
800  SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal);
801}
802#endif
803
804double
805speed_mpn_kara_mul_n (struct speed_params *s)
806{
807  SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n);
808}
809double
810speed_mpn_kara_sqr_n (struct speed_params *s)
811{
812  SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n);
813}
814
815double
816speed_mpn_toom3_mul_n (struct speed_params *s)
817{
818  SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n);
819}
820double
821speed_mpn_toom3_sqr_n (struct speed_params *s)
822{
823  SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n);
824}
825
826double
827speed_mpn_toom3_mul_n_mpn (struct speed_params *s)
828{
829  SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n_mpn);
830}
831double
832speed_mpn_toom3_mul_n_open (struct speed_params *s)
833{
834  SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n_open);
835}
836double
837speed_mpn_toom3_sqr_n_mpn (struct speed_params *s)
838{
839  SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n_mpn);
840}
841double
842speed_mpn_toom3_sqr_n_open (struct speed_params *s)
843{
844  SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n_open);
845}
846
847double
848speed_mpn_mul_fft_full (struct speed_params *s)
849{
850  SPEED_ROUTINE_MPN_MUL_N_CALL
851    (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
852}
853double
854speed_mpn_mul_fft_full_sqr (struct speed_params *s)
855{
856  SPEED_ROUTINE_MPN_SQR_CALL
857    (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
858}
859
860
861/* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
862   used as k, otherwise the best k for the size is used.  If s->size isn't a
863   multiple of 2^k it's rounded up to make the effective operation size.  */
864
865#define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
866  {                                                     \
867    mp_ptr     wp;                                      \
868    mp_size_t  pl;                                      \
869    int        k;                                       \
870    unsigned   i;                                       \
871    double     t;                                       \
872    TMP_DECL (marker);                                  \
873                                                        \
874    SPEED_RESTRICT_COND (s->size >= 1);                 \
875                                                        \
876    if (s->r != 0)                                      \
877      k = s->r;                                         \
878    else                                                \
879      k = mpn_fft_best_k (s->size, sqr);                \
880                                                        \
881    TMP_MARK (marker);                                  \
882    pl = mpn_fft_next_size (s->size, k);                \
883    wp = SPEED_TMP_ALLOC_LIMBS (pl+1, s->align_wp);     \
884                                                        \
885    speed_operand_src (s, s->xp, s->size);              \
886    if (!sqr)                                           \
887      speed_operand_src (s, s->yp, s->size);            \
888    speed_operand_dst (s, wp, pl+1);                    \
889    speed_cache_fill (s);                               \
890                                                        \
891    speed_starttime ();                                 \
892    i = s->reps;                                        \
893    do                                                  \
894      call;                                             \
895    while (--i != 0);                                   \
896    t = speed_endtime ();                               \
897                                                        \
898    TMP_FREE (marker);                                  \
899    return t;                                           \
900  } 
901
902double
903speed_mpn_mul_fft (struct speed_params *s)
904{
905  SPEED_ROUTINE_MPN_MUL_FFT_CALL
906    (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
907}
908
909double
910speed_mpn_mul_fft_sqr (struct speed_params *s)
911{
912  SPEED_ROUTINE_MPN_MUL_FFT_CALL
913    (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
914}
915
916
917double
918speed_mpn_gcd (struct speed_params *s)
919{
920  SPEED_ROUTINE_MPN_GCD (mpn_gcd);
921}
922double
923speed_mpn_gcd_binary (struct speed_params *s)
924{
925  SPEED_ROUTINE_MPN_GCD (mpn_gcd_binary);
926}
927
928#if HAVE_NATIVE_mpn_gcd_finda
929double
930speed_mpn_gcd_finda (struct speed_params *s)
931{
932  SPEED_ROUTINE_MPN_GCD_FINDA (mpn_gcd_finda);
933
934#endif
935
936
937double
938speed_mpn_gcdext (struct speed_params *s)
939{
940  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
941}
942double
943speed_mpn_gcdext_single (struct speed_params *s)
944{
945  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single);
946}
947double
948speed_mpn_gcdext_double (struct speed_params *s)
949{
950  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double);
951}
952double
953speed_mpn_gcdext_one_single (struct speed_params *s)
954{
955  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single);
956}
957double
958speed_mpn_gcdext_one_double (struct speed_params *s)
959{
960  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double);
961}
962double
963speed_mpn_gcd_1 (struct speed_params *s)
964{
965  SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
966}
967double
968speed_mpn_gcd_1N (struct speed_params *s)
969{
970  SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1);
971}
972
973
974double
975speed_mpz_jacobi (struct speed_params *s)
976{
977  SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi);
978}
979double
980speed_mpn_jacobi_base (struct speed_params *s)
981{
982  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
983}
984double
985speed_mpn_jacobi_base_1 (struct speed_params *s)
986{
987  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1);
988}
989double
990speed_mpn_jacobi_base_2 (struct speed_params *s)
991{
992  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2);
993}
994double
995speed_mpn_jacobi_base_3 (struct speed_params *s)
996{
997  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3);
998}
999
1000
1001double
1002speed_mpn_sqrtrem (struct speed_params *s)
1003{
1004  SPEED_ROUTINE_MPN_SQRTREM (mpn_sqrtrem);
1005}
1006
1007
1008double
1009speed_mpz_fac_ui (struct speed_params *s)
1010{
1011  SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
1012}
1013
1014
1015double
1016speed_mpn_fib2_ui (struct speed_params *s)
1017{
1018  SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui);
1019}
1020double
1021speed_mpz_fib_ui (struct speed_params *s)
1022{
1023  SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui);
1024}
1025double
1026speed_mpz_fib2_ui (struct speed_params *s)
1027{
1028  SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui);
1029}
1030double
1031speed_mpz_lucnum_ui (struct speed_params *s)
1032{
1033  SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui);
1034}
1035double
1036speed_mpz_lucnum2_ui (struct speed_params *s)
1037{
1038  SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui);
1039}
1040
1041
1042double
1043speed_mpz_powm (struct speed_params *s)
1044{
1045  SPEED_ROUTINE_MPZ_POWM (mpz_powm);
1046}
1047double
1048speed_mpz_powm_mod (struct speed_params *s)
1049{
1050  SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod);
1051}
1052double
1053speed_mpz_powm_redc (struct speed_params *s)
1054{
1055  SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc);
1056}
1057double
1058speed_mpz_powm_ui (struct speed_params *s)
1059{
1060  SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui);
1061}
1062
1063
1064double
1065speed_modlimb_invert (struct speed_params *s)
1066{
1067  SPEED_ROUTINE_MODLIMB_INVERT (modlimb_invert);
1068}
1069
1070
1071double
1072speed_noop (struct speed_params *s)
1073{
1074  unsigned  i;
1075 
1076  speed_starttime ();
1077  i = s->reps;
1078  do
1079    noop ();
1080  while (--i != 0);
1081  return speed_endtime ();
1082
1083
1084double
1085speed_noop_wxs (struct speed_params *s)
1086{
1087  mp_ptr   wp;
1088  unsigned i;
1089  double   t;
1090  TMP_DECL (marker);
1091
1092  TMP_MARK (marker);
1093  wp = TMP_ALLOC_LIMBS (1);
1094
1095  speed_starttime ();
1096  i = s->reps;
1097  do
1098    noop_wxs (wp, s->xp, s->size);
1099  while (--i != 0);
1100  t = speed_endtime ();
1101
1102  TMP_FREE (marker);
1103  return t;
1104
1105
1106double
1107speed_noop_wxys (struct speed_params *s)
1108{
1109  mp_ptr   wp;
1110  unsigned i;
1111  double   t;
1112  TMP_DECL (marker);
1113
1114  TMP_MARK (marker);
1115  wp = TMP_ALLOC_LIMBS (1);
1116
1117  speed_starttime ();
1118  i = s->reps;
1119  do
1120    noop_wxys (wp, s->xp, s->yp, s->size);
1121  while (--i != 0);
1122  t = speed_endtime ();
1123
1124  TMP_FREE (marker);
1125  return t;
1126
1127
1128
1129#define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
1130  {                                                     \
1131    unsigned  i;                                        \
1132    variables;                                          \
1133                                                        \
1134    speed_starttime ();                                 \
1135    i = s->reps;                                        \
1136    do                                                  \
1137      {                                                 \
1138        calls;                                          \
1139      }                                                 \
1140    while (--i != 0);                                   \
1141    return speed_endtime ();                            \
1142  }
1143
1144
1145/* Compare these to see how much malloc/free costs and then how much
1146   __gmp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
1147   mpq_init/clear will be doing a 1 limb allocate, so use that as the size
1148   when including them in comparisons.  */
1149
1150double
1151speed_malloc_free (struct speed_params *s)
1152{
1153  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1154  SPEED_ROUTINE_ALLOC_FREE (void *p,
1155                            p = malloc (bytes);
1156                            free (p));
1157
1158
1159double
1160speed_malloc_realloc_free (struct speed_params *s)
1161{
1162  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1163  SPEED_ROUTINE_ALLOC_FREE (void *p,
1164                            p = malloc (BYTES_PER_MP_LIMB);
1165                            p = realloc (p, bytes);
1166                            free (p));
1167
1168
1169double
1170speed_gmp_allocate_free (struct speed_params *s)
1171{
1172  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1173  SPEED_ROUTINE_ALLOC_FREE (void *p,
1174                            p = (*__gmp_allocate_func) (bytes);
1175                            (*__gmp_free_func) (p, bytes));
1176
1177
1178double
1179speed_gmp_allocate_reallocate_free (struct speed_params *s)
1180{
1181  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1182  SPEED_ROUTINE_ALLOC_FREE
1183    (void *p,
1184     p = (*__gmp_allocate_func) (BYTES_PER_MP_LIMB);
1185     p = (*__gmp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB);
1186     (*__gmp_free_func) (p, bytes));
1187
1188
1189double
1190speed_mpz_init_clear (struct speed_params *s)
1191{
1192  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1193                            mpz_init (z);
1194                            mpz_clear (z));
1195
1196
1197double
1198speed_mpz_init_realloc_clear (struct speed_params *s)
1199{
1200  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1201                            mpz_init (z);
1202                            _mpz_realloc (z, s->size);
1203                            mpz_clear (z));
1204
1205
1206double
1207speed_mpq_init_clear (struct speed_params *s)
1208{
1209  SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
1210                            mpq_init (q);
1211                            mpq_clear (q));
1212
1213
1214double
1215speed_mpf_init_clear (struct speed_params *s)
1216{
1217  SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
1218                            mpf_init (f);
1219                            mpf_clear (f));
1220
1221
1222
1223/* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
1224   that repeatedly calling mpz_add with the same data gives branch predition
1225   in it an advantage.  */
1226
1227double
1228speed_mpz_add (struct speed_params *s)
1229{
1230  mpz_t     w, x, y;
1231  unsigned  i;
1232  double    t;
1233
1234  mpz_init (w);
1235  mpz_init (x);
1236  mpz_init (y);
1237
1238  mpz_set_n (x, s->xp, s->size);
1239  mpz_set_n (y, s->yp, s->size);
1240  mpz_add (w, x, y);
1241
1242  speed_starttime ();
1243  i = s->reps;
1244  do
1245    {
1246      mpz_add (w, x, y);
1247    }
1248  while (--i != 0);
1249  t = speed_endtime ();
1250
1251  mpz_clear (w);
1252  mpz_clear (x);
1253  mpz_clear (y);
1254  return t;
1255
1256
1257
1258/* If r==0, calculate (size,size/2),
1259   otherwise calculate (size,r). */
1260
1261double
1262speed_mpz_bin_uiui (struct speed_params *s)
1263{
1264  mpz_t          w;
1265  unsigned long  k;
1266  unsigned  i;
1267  double    t;
1268
1269  mpz_init (w);
1270  if (s->r != 0)
1271    k = s->r;
1272  else
1273    k = s->size/2;
1274
1275  speed_starttime ();
1276  i = s->reps;
1277  do
1278    {
1279      mpz_bin_uiui (w, s->size, k);
1280    }
1281  while (--i != 0);
1282  t = speed_endtime ();
1283
1284  mpz_clear (w);
1285  return t;
1286
1287
1288
1289/* The multiplies are successively dependent so the latency is measured, not
1290   the issue rate.  There's only 10 per loop so the code doesn't get too big
1291   since umul_ppmm is several instructions on some cpus.
1292
1293   Putting the arguments as "h,l,l,h" gets slightly better code from gcc
1294   2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
1295   though will probably show up as a bogus extra cycle though.
1296
1297   The measuring function macros are into three parts to avoid overflowing
1298   preprocessor expansion space if umul_ppmm is big.
1299
1300   Limitations:
1301
1302   Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
1303   generated first, especially on CPUs with low latency multipliers.
1304
1305   The default umul_ppmm doing h*l will be getting increasing numbers of
1306   high zero bits in the calculation.  CPUs with data-dependent multipliers
1307   will want to use umul_ppmm.1 to get some randomization into the
1308   calculation.  The extra xors and fetches will be a slowdown of course.  */
1309
1310#define SPEED_MACRO_UMUL_PPMM_A \
1311  {                             \
1312    mp_limb_t  h, l;            \
1313    unsigned   i;               \
1314    double     t;               \
1315                                \
1316    s->time_divisor = 10;       \
1317                                \
1318    h = s->xp[0];               \
1319    l = s->yp[0];               \
1320                                \
1321    if (s->r == 1)              \
1322      {                         \
1323        speed_starttime ();     \
1324        i = s->reps;            \
1325        do                      \
1326          {
1327
1328#define SPEED_MACRO_UMUL_PPMM_B \
1329          }                     \
1330        while (--i != 0);       \
1331        t = speed_endtime ();   \
1332      }                         \
1333    else                        \
1334      {                         \
1335        speed_starttime ();     \
1336        i = s->reps;            \
1337        do                      \
1338          {
1339
1340#define SPEED_MACRO_UMUL_PPMM_C                                         \
1341          }                                                             \
1342        while (--i != 0);                                               \
1343        t = speed_endtime ();                                           \
1344      }                                                                 \
1345                                                                        \
1346    /* stop the compiler optimizing away the whole calculation! */      \
1347    noop_1 (h);                                                         \
1348    noop_1 (l);                                                         \
1349                                                                        \
1350    return t;                                                           \
1351  }
1352
1353
1354double
1355speed_umul_ppmm (struct speed_params *s)
1356{
1357  SPEED_MACRO_UMUL_PPMM_A;
1358  {
1359    umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
1360     umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
1361     umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
1362    umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
1363     umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
1364     umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
1365    umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
1366     umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
1367     umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
1368    umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
1369  }
1370  SPEED_MACRO_UMUL_PPMM_B;
1371  {
1372    umul_ppmm (h, l, l, h);
1373     umul_ppmm (h, l, l, h);
1374     umul_ppmm (h, l, l, h);
1375    umul_ppmm (h, l, l, h);
1376     umul_ppmm (h, l, l, h);
1377     umul_ppmm (h, l, l, h);
1378    umul_ppmm (h, l, l, h);
1379     umul_ppmm (h, l, l, h);
1380     umul_ppmm (h, l, l, h);
1381    umul_ppmm (h, l, l, h);
1382  }
1383  SPEED_MACRO_UMUL_PPMM_C;
1384}
1385
1386
1387#if HAVE_NATIVE_mpn_umul_ppmm
1388
1389#if defined (__hppa) && W_TYPE_SIZE == 64
1390#define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (h, l, &l))
1391#else
1392#define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (&l, h, l))
1393#endif
1394
1395double
1396speed_mpn_umul_ppmm (struct speed_params *s)
1397{
1398  SPEED_MACRO_UMUL_PPMM_A;
1399  {
1400    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[0]; l ^= s->yp_block[0];
1401     CALL_MPN_UMUL_PPMM; h ^= s->xp_block[1]; l ^= s->yp_block[1];
1402     CALL_MPN_UMUL_PPMM; h ^= s->xp_block[2]; l ^= s->yp_block[2];
1403    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[3]; l ^= s->yp_block[3];
1404     CALL_MPN_UMUL_PPMM; h ^= s->xp_block[4]; l ^= s->yp_block[4];
1405     CALL_MPN_UMUL_PPMM; h ^= s->xp_block[5]; l ^= s->yp_block[5];
1406    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[6]; l ^= s->yp_block[6];
1407     CALL_MPN_UMUL_PPMM; h ^= s->xp_block[7]; l ^= s->yp_block[7];
1408     CALL_MPN_UMUL_PPMM; h ^= s->xp_block[8]; l ^= s->yp_block[8];
1409    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[9]; l ^= s->yp_block[9];
1410  }
1411  SPEED_MACRO_UMUL_PPMM_B;
1412  {
1413    CALL_MPN_UMUL_PPMM;
1414     CALL_MPN_UMUL_PPMM;
1415     CALL_MPN_UMUL_PPMM;
1416    CALL_MPN_UMUL_PPMM;
1417     CALL_MPN_UMUL_PPMM;
1418     CALL_MPN_UMUL_PPMM;
1419    CALL_MPN_UMUL_PPMM;
1420     CALL_MPN_UMUL_PPMM;
1421     CALL_MPN_UMUL_PPMM;
1422    CALL_MPN_UMUL_PPMM;
1423  }
1424  SPEED_MACRO_UMUL_PPMM_C;
1425}
1426#endif
1427
1428
1429/* The divisions are successively dependent so latency is measured, not
1430   issue rate.  There's only 10 per loop so the code doesn't get too big,
1431   especially for udiv_qrnnd_preinv and preinv2norm, which are several
1432   instructions each.
1433
1434   Note that it's only the division which is measured here, there's no data
1435   fetching and no shifting if the divisor gets normalized.
1436
1437   In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
1438   generate x86 div instructions with nothing in between.
1439
1440   The measuring function macros are in two parts to avoid overflowing
1441   preprocessor expansion space if udiv_qrnnd etc are big.
1442
1443   Limitations:
1444
1445   Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
1446   generated first.
1447
1448   CPUs with data-dependent divisions may want more attention paid to the
1449   randomness of the data used.  Probably the measurement wanted is over
1450   uniformly distributed numbers, but what's here might not be giving that.  */
1451
1452#define SPEED_ROUTINE_UDIV_QRNND_A(normalize)           \
1453  {                                                     \
1454    double     t;                                       \
1455    unsigned   i;                                       \
1456    mp_limb_t  q, r, d;                                 \
1457    mp_limb_t  dinv;                                    \
1458                                                        \
1459    s->time_divisor = 10;                               \
1460                                                        \
1461    /* divisor from "r" parameter, or a default */      \
1462    d = s->r;                                           \
1463    if (d == 0)                                         \
1464      d = __mp_bases[10].big_base;                      \
1465                                                        \
1466    if (normalize)                                      \
1467      {                                                 \
1468        unsigned  norm;                                 \
1469        count_leading_zeros (norm, d);                  \
1470        d <<= norm;                                     \
1471        invert_limb (dinv, d);                          \
1472      }                                                 \
1473                                                        \
1474    q = s->xp[0];                                       \
1475    r = s->yp[0] % d;                                   \
1476                                                        \
1477    speed_starttime ();                                 \
1478    i = s->reps;                                        \
1479    do                                                  \
1480      {
1481
1482#define SPEED_ROUTINE_UDIV_QRNND_B                                      \
1483      }                                                                 \
1484    while (--i != 0);                                                   \
1485    t = speed_endtime ();                                               \
1486                                                                        \
1487    /* stop the compiler optimizing away the whole calculation! */      \
1488    noop_1 (q);                                                         \
1489    noop_1 (r);                                                         \
1490                                                                        \
1491    return t;                                                           \
1492  }
1493
1494double
1495speed_udiv_qrnnd (struct speed_params *s)
1496{
1497  SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
1498  {
1499    udiv_qrnnd (q, r, r, q, d);
1500     udiv_qrnnd (q, r, r, q, d);
1501     udiv_qrnnd (q, r, r, q, d);
1502    udiv_qrnnd (q, r, r, q, d);
1503     udiv_qrnnd (q, r, r, q, d);
1504     udiv_qrnnd (q, r, r, q, d);
1505    udiv_qrnnd (q, r, r, q, d);
1506     udiv_qrnnd (q, r, r, q, d);
1507     udiv_qrnnd (q, r, r, q, d);
1508    udiv_qrnnd (q, r, r, q, d);
1509  }
1510  SPEED_ROUTINE_UDIV_QRNND_B;
1511}
1512
1513double
1514speed_udiv_qrnnd_preinv (struct speed_params *s)
1515{
1516  SPEED_ROUTINE_UDIV_QRNND_A (1);
1517  {
1518    udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1519     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1520     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1521    udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1522     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1523     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1524    udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1525     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1526     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1527    udiv_qrnnd_preinv (q, r, r, q, d, dinv);
1528  }
1529  SPEED_ROUTINE_UDIV_QRNND_B;
1530
1531
1532double
1533speed_udiv_qrnnd_preinv2norm (struct speed_params *s)
1534{
1535  SPEED_ROUTINE_UDIV_QRNND_A (1);
1536  {
1537    udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1538     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1539     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1540    udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1541     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1542     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1543    udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1544     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1545     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1546    udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
1547  }
1548  SPEED_ROUTINE_UDIV_QRNND_B;
1549}
1550
1551double
1552speed_udiv_qrnnd_c (struct speed_params *s)
1553{
1554  SPEED_ROUTINE_UDIV_QRNND_A (1);
1555  {
1556    __udiv_qrnnd_c (q, r, r, q, d);
1557     __udiv_qrnnd_c (q, r, r, q, d);
1558     __udiv_qrnnd_c (q, r, r, q, d);
1559    __udiv_qrnnd_c (q, r, r, q, d);
1560     __udiv_qrnnd_c (q, r, r, q, d);
1561     __udiv_qrnnd_c (q, r, r, q, d);
1562    __udiv_qrnnd_c (q, r, r, q, d);
1563     __udiv_qrnnd_c (q, r, r, q, d);
1564     __udiv_qrnnd_c (q, r, r, q, d);
1565    __udiv_qrnnd_c (q, r, r, q, d);
1566  }
1567  SPEED_ROUTINE_UDIV_QRNND_B;
1568
1569
1570#if HAVE_NATIVE_mpn_udiv_qrnnd
1571
1572#if defined (__hppa) && W_TYPE_SIZE == 64
1573#define CALL_MPN_UDIV_QRNND  (q = __MPN (udiv_qrnnd) (r, q, d, &r))
1574#else
1575#define CALL_MPN_UDIV_QRNND  (q = __MPN (udiv_qrnnd) (&r, r, q, d))
1576#endif
1577
1578double
1579speed_mpn_udiv_qrnnd (struct speed_params *s)
1580{
1581  SPEED_ROUTINE_UDIV_QRNND_A (1);
1582  {
1583    CALL_MPN_UDIV_QRNND;
1584     CALL_MPN_UDIV_QRNND;
1585     CALL_MPN_UDIV_QRNND;
1586    CALL_MPN_UDIV_QRNND;
1587     CALL_MPN_UDIV_QRNND;
1588     CALL_MPN_UDIV_QRNND;
1589    CALL_MPN_UDIV_QRNND;
1590     CALL_MPN_UDIV_QRNND;
1591     CALL_MPN_UDIV_QRNND;
1592    CALL_MPN_UDIV_QRNND;
1593  }
1594  SPEED_ROUTINE_UDIV_QRNND_B;
1595}
1596#endif
1597
1598
1599double
1600speed_invert_limb (struct speed_params *s)
1601{
1602  SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d));
1603}
1604
1605
1606/* xp[0] might not be particularly random, but should give an indication how
1607   "/" runs.  Same for speed_operator_mod below.  */
1608double
1609speed_operator_div (struct speed_params *s)
1610{
1611  double     t;
1612  unsigned   i;
1613  mp_limb_t  x, q, d;
1614
1615  s->time_divisor = 10;
1616
1617  /* divisor from "r" parameter, or a default */
1618  d = s->r;
1619  if (d == 0)
1620    d = __mp_bases[10].big_base;
1621
1622  x = s->xp[0];
1623  q = 0;
1624
1625  speed_starttime ();
1626  i = s->reps;
1627  do
1628    {
1629      q ^= x; q /= d;
1630       q ^= x; q /= d;
1631       q ^= x; q /= d;
1632      q ^= x; q /= d;
1633       q ^= x; q /= d;
1634       q ^= x; q /= d;
1635      q ^= x; q /= d;
1636       q ^= x; q /= d;
1637       q ^= x; q /= d;
1638      q ^= x; q /= d;
1639    }
1640  while (--i != 0);
1641  t = speed_endtime ();
1642
1643  /* stop the compiler optimizing away the whole calculation! */
1644  noop_1 (q);
1645
1646  return t;
1647}
1648
1649double
1650speed_operator_mod (struct speed_params *s)
1651{
1652  double     t;
1653  unsigned   i;
1654  mp_limb_t  x, r, d;
1655
1656  s->time_divisor = 10;
1657
1658  /* divisor from "r" parameter, or a default */
1659  d = s->r;
1660  if (d == 0)
1661    d = __mp_bases[10].big_base;
1662
1663  x = s->xp[0];
1664  r = 0;
1665
1666  speed_starttime ();
1667  i = s->reps;
1668  do
1669    {
1670      r ^= x; r %= d;
1671       r ^= x; r %= d;
1672       r ^= x; r %= d;
1673      r ^= x; r %= d;
1674       r ^= x; r %= d;
1675       r ^= x; r %= d;
1676      r ^= x; r %= d;
1677       r ^= x; r %= d;
1678       r ^= x; r %= d;
1679      r ^= x; r %= d;
1680    }
1681  while (--i != 0);
1682  t = speed_endtime ();
1683
1684  /* stop the compiler optimizing away the whole calculation! */
1685  noop_1 (r);
1686
1687  return t;
1688}
1689
1690
1691/* r==0 measures on data with the values uniformly distributed.  This will
1692   be typical for count_trailing_zeros in a GCD etc.
1693
1694   r==1 measures on data with the resultant count uniformly distributed
1695   between 0 and BITS_PER_MP_LIMB-1.  This is probably sensible for
1696   count_leading_zeros on the high limbs of divisors.  */
1697
1698int
1699speed_routine_count_zeros_setup (struct speed_params *s,
1700                                 mp_ptr xp, int leading, int zero)
1701{
1702  int        i, c;
1703  mp_limb_t  n;
1704
1705  if (s->r == 0)
1706    {
1707      /* Make uniformly distributed data.  If zero isn't allowed then change
1708         it to 1 for leading, or 0x800..00 for trailing.  */
1709      MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE);
1710      if (! zero)
1711        for (i = 0; i < SPEED_BLOCK_SIZE; i++)
1712          if (xp[i] == 0)
1713            xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT;
1714    }
1715  else if (s->r == 1)
1716    {
1717      /* Make counts uniformly distributed.  A randomly chosen bit is set, and
1718         for leading the rest above it are cleared, or for trailing then the
1719         rest below.  */
1720      for (i = 0; i < SPEED_BLOCK_SIZE; i++)
1721        {
1722          mp_limb_t  set = CNST_LIMB(1) << (s->yp_block[i] % BITS_PER_MP_LIMB);
1723          mp_limb_t  keep_below = set-1;
1724          mp_limb_t  keep_above = MP_LIMB_T_MAX ^ keep_below;
1725          mp_limb_t  keep = (leading ? keep_below : keep_above);
1726          xp[i] = (s->xp_block[i] & keep) | set;
1727        }
1728    }
1729  else
1730    {
1731      return 0;
1732    }
1733
1734  /* Account for the effect of n^=c. */
1735  c = 0;
1736  for (i = 0; i < SPEED_BLOCK_SIZE; i++)
1737    {
1738      n = xp[i];
1739      xp[i] ^= c;
1740
1741      if (leading)
1742        count_leading_zeros (c, n);
1743      else
1744        count_trailing_zeros (c, n);
1745    }
1746
1747  return 1;
1748}
1749
1750double
1751speed_count_leading_zeros (struct speed_params *s)
1752{
1753#ifdef COUNT_LEADING_ZEROS_0
1754#define COUNT_LEADING_ZEROS_0_ALLOWED   1
1755#else
1756#define COUNT_LEADING_ZEROS_0_ALLOWED   0
1757#endif
1758
1759  SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
1760  count_leading_zeros (c, n);
1761  SPEED_ROUTINE_COUNT_ZEROS_B ();
1762}
1763double
1764speed_count_trailing_zeros (struct speed_params *s)
1765{
1766  SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
1767  count_trailing_zeros (c, n);
1768  SPEED_ROUTINE_COUNT_ZEROS_B ();
1769}
1770
1771
1772double
1773speed_mpn_get_str (struct speed_params *s)
1774{
1775  SPEED_ROUTINE_MPN_GET_STR (mpn_get_str);
1776
1777
1778double
1779speed_mpn_set_str (struct speed_params *s)
1780{
1781  SPEED_ROUTINE_MPN_SET_STR (mpn_set_str);
1782
1783double
1784speed_mpn_set_str_basecase (struct speed_params *s)
1785{
1786  SPEED_ROUTINE_MPN_SET_STR (mpn_set_str_basecase);
1787}
1788double
1789speed_mpn_set_str_subquad (struct speed_params *s)
1790{
1791  SPEED_ROUTINE_MPN_SET_STR (mpn_set_str_subquad);
1792}
1793
1794
1795double
1796speed_MPN_ZERO (struct speed_params *s)
1797{
1798  SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size));
1799
Note: See TracBrowser for help on using the repository browser.