1 | /* Shared speed subroutines. |
---|
2 | |
---|
3 | Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
---|
4 | |
---|
5 | This file is part of the GNU MP Library. |
---|
6 | |
---|
7 | The GNU MP Library is free software; you can redistribute it and/or modify |
---|
8 | it under the terms of the GNU Lesser General Public License as published by |
---|
9 | the Free Software Foundation; either version 2.1 of the License, or (at your |
---|
10 | option) any later version. |
---|
11 | |
---|
12 | The GNU MP Library is distributed in the hope that it will be useful, but |
---|
13 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
---|
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
---|
15 | License for more details. |
---|
16 | |
---|
17 | You should have received a copy of the GNU Lesser General Public License |
---|
18 | along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
---|
19 | the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
---|
20 | MA 02111-1307, USA. */ |
---|
21 | |
---|
22 | #include <errno.h> |
---|
23 | #include <fcntl.h> |
---|
24 | #include <math.h> |
---|
25 | #include <stdio.h> |
---|
26 | #include <stdlib.h> /* for qsort */ |
---|
27 | #include <string.h> |
---|
28 | #include <unistd.h> |
---|
29 | #if 0 |
---|
30 | #include <sys/ioctl.h> |
---|
31 | #endif |
---|
32 | |
---|
33 | #include "gmp.h" |
---|
34 | #include "gmp-impl.h" |
---|
35 | #include "longlong.h" |
---|
36 | |
---|
37 | #include "tests.h" |
---|
38 | #include "speed.h" |
---|
39 | |
---|
40 | |
---|
41 | int speed_option_addrs = 0; |
---|
42 | int speed_option_verbose = 0; |
---|
43 | |
---|
44 | |
---|
45 | /* Provide __clz_tab even if it's not required, for the benefit of new code |
---|
46 | being tested with many.pl. */ |
---|
47 | #ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB |
---|
48 | #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB |
---|
49 | #include "mp_clz_tab.c" |
---|
50 | #undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB |
---|
51 | #endif |
---|
52 | |
---|
53 | |
---|
54 | void |
---|
55 | pentium_wbinvd(void) |
---|
56 | { |
---|
57 | #if 0 |
---|
58 | { |
---|
59 | static int fd = -2; |
---|
60 | |
---|
61 | if (fd == -2) |
---|
62 | { |
---|
63 | fd = open ("/dev/wbinvd", O_RDWR); |
---|
64 | if (fd == -1) |
---|
65 | perror ("open /dev/wbinvd"); |
---|
66 | } |
---|
67 | |
---|
68 | if (fd != -1) |
---|
69 | ioctl (fd, 0, 0); |
---|
70 | } |
---|
71 | #endif |
---|
72 | |
---|
73 | #if 0 |
---|
74 | #define WBINVDSIZE 1024*1024*2 |
---|
75 | { |
---|
76 | static char *p = NULL; |
---|
77 | int i, sum; |
---|
78 | |
---|
79 | if (p == NULL) |
---|
80 | p = malloc (WBINVDSIZE); |
---|
81 | |
---|
82 | #if 0 |
---|
83 | for (i = 0; i < WBINVDSIZE; i++) |
---|
84 | p[i] = i & 0xFF; |
---|
85 | #endif |
---|
86 | |
---|
87 | sum = 0; |
---|
88 | for (i = 0; i < WBINVDSIZE; i++) |
---|
89 | sum += p[i]; |
---|
90 | |
---|
91 | mpn_cache_fill_dummy (sum); |
---|
92 | } |
---|
93 | #endif |
---|
94 | } |
---|
95 | |
---|
96 | |
---|
97 | int |
---|
98 | double_cmp_ptr (const double *p, const double *q) |
---|
99 | { |
---|
100 | if (*p > *q) return 1; |
---|
101 | if (*p < *q) return -1; |
---|
102 | return 0; |
---|
103 | } |
---|
104 | |
---|
105 | |
---|
106 | /* Measure the speed of a given routine. |
---|
107 | |
---|
108 | The routine is run with enough repetitions to make it take at least |
---|
109 | speed_precision * speed_unittime. This aims to minimize the effects of a |
---|
110 | limited accuracy time base and the overhead of the measuring itself. |
---|
111 | |
---|
112 | Measurements are made looking for 4 results within TOLERANCE of each |
---|
113 | other (or 3 for routines taking longer than 2 seconds). This aims to get |
---|
114 | an accurate reading even if some runs are bloated by interrupts or task |
---|
115 | switches or whatever. |
---|
116 | |
---|
117 | The given (*fun)() is expected to run its function "s->reps" many times |
---|
118 | and return the total elapsed time measured using speed_starttime() and |
---|
119 | speed_endtime(). If the function doesn't support the given s->size or |
---|
120 | s->r, -1.0 should be returned. See the various base routines below. */ |
---|
121 | |
---|
122 | double |
---|
123 | speed_measure (double (*fun) _PROTO ((struct speed_params *s)), |
---|
124 | struct speed_params *s) |
---|
125 | { |
---|
126 | #define TOLERANCE 1.005 /* 0.5% */ |
---|
127 | |
---|
128 | struct speed_params s_dummy; |
---|
129 | int i, j, e; |
---|
130 | double t[30]; |
---|
131 | double t_unsorted[30]; |
---|
132 | double reps_d; |
---|
133 | |
---|
134 | /* Use dummy parameters if caller doesn't provide any. Only a few special |
---|
135 | "fun"s will cope with this, speed_noop() is one. */ |
---|
136 | if (s == NULL) |
---|
137 | { |
---|
138 | memset (&s_dummy, '\0', sizeof (s_dummy)); |
---|
139 | s = &s_dummy; |
---|
140 | } |
---|
141 | |
---|
142 | s->reps = 1; |
---|
143 | s->time_divisor = 1.0; |
---|
144 | for (i = 0; i < numberof (t); i++) |
---|
145 | { |
---|
146 | for (;;) |
---|
147 | { |
---|
148 | s->src_num = 0; |
---|
149 | s->dst_num = 0; |
---|
150 | |
---|
151 | t[i] = (*fun) (s); |
---|
152 | |
---|
153 | if (speed_option_verbose >= 3) |
---|
154 | printf("size=%ld reps=%u r=%ld attempt=%d %.9f\n", |
---|
155 | s->size, s->reps, s->r, i, t[i]); |
---|
156 | |
---|
157 | if (t[i] == -1.0) |
---|
158 | return -1.0; |
---|
159 | |
---|
160 | if (t[i] >= speed_unittime * speed_precision) |
---|
161 | break; |
---|
162 | |
---|
163 | /* go to a value of reps to make t[i] >= precision */ |
---|
164 | reps_d = ceil (1.1 * s->reps |
---|
165 | * speed_unittime * speed_precision |
---|
166 | / MAX (t[i], speed_unittime)); |
---|
167 | if (reps_d > 2e9 || reps_d < 1.0) |
---|
168 | { |
---|
169 | fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d); |
---|
170 | fprintf (stderr, " (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n", |
---|
171 | s->reps, speed_unittime, speed_precision, t[i]); |
---|
172 | abort (); |
---|
173 | } |
---|
174 | s->reps = (unsigned) reps_d; |
---|
175 | } |
---|
176 | t[i] /= s->reps; |
---|
177 | t_unsorted[i] = t[i]; |
---|
178 | |
---|
179 | if (speed_precision == 0) |
---|
180 | return t[i]; |
---|
181 | |
---|
182 | /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */ |
---|
183 | if (t[0] >= 2.0) |
---|
184 | e = 3; |
---|
185 | else |
---|
186 | e = 4; |
---|
187 | |
---|
188 | /* Look for e many t[]'s within TOLERANCE of each other to consider a |
---|
189 | valid measurement. Return smallest among them. */ |
---|
190 | if (i >= e) |
---|
191 | { |
---|
192 | qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr); |
---|
193 | for (j = e-1; j < i; j++) |
---|
194 | if (t[j] <= t[j-e+1] * TOLERANCE) |
---|
195 | return t[j-e+1] / s->time_divisor; |
---|
196 | } |
---|
197 | } |
---|
198 | |
---|
199 | fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n", |
---|
200 | e, (TOLERANCE-1.0)*100.0); |
---|
201 | fprintf (stderr, " unsorted sorted\n"); |
---|
202 | fprintf (stderr, " %.12f %.12f is about 0.5%%\n", |
---|
203 | t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0)); |
---|
204 | for (i = 0; i < numberof (t); i++) |
---|
205 | fprintf (stderr, " %.09f %.09f\n", t_unsorted[i], t[i]); |
---|
206 | |
---|
207 | return -1.0; |
---|
208 | } |
---|
209 | |
---|
210 | |
---|
211 | /* Read all of ptr,size to get it into the CPU memory cache. |
---|
212 | |
---|
213 | A call to mpn_cache_fill_dummy() is used to make sure the compiler |
---|
214 | doesn't optimize away the whole loop. Using "volatile mp_limb_t sum" |
---|
215 | would work too, but the function call means we don't rely on every |
---|
216 | compiler actually implementing volatile properly. |
---|
217 | |
---|
218 | mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking |
---|
219 | it can inline it. */ |
---|
220 | |
---|
221 | void |
---|
222 | mpn_cache_fill (mp_srcptr ptr, mp_size_t size) |
---|
223 | { |
---|
224 | mp_limb_t sum = 0; |
---|
225 | mp_size_t i; |
---|
226 | |
---|
227 | for (i = 0; i < size; i++) |
---|
228 | sum += ptr[i]; |
---|
229 | |
---|
230 | mpn_cache_fill_dummy(sum); |
---|
231 | } |
---|
232 | |
---|
233 | |
---|
234 | void |
---|
235 | mpn_cache_fill_write (mp_ptr ptr, mp_size_t size) |
---|
236 | { |
---|
237 | mpn_cache_fill (ptr, size); |
---|
238 | |
---|
239 | #if 0 |
---|
240 | mpn_random (ptr, size); |
---|
241 | #endif |
---|
242 | |
---|
243 | #if 0 |
---|
244 | mp_size_t i; |
---|
245 | |
---|
246 | for (i = 0; i < size; i++) |
---|
247 | ptr[i] = i; |
---|
248 | #endif |
---|
249 | } |
---|
250 | |
---|
251 | |
---|
252 | void |
---|
253 | speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size) |
---|
254 | { |
---|
255 | if (s->src_num >= numberof (s->src)) |
---|
256 | { |
---|
257 | fprintf (stderr, "speed_operand_src: no room left in s->src[]\n"); |
---|
258 | abort (); |
---|
259 | } |
---|
260 | s->src[s->src_num].ptr = ptr; |
---|
261 | s->src[s->src_num].size = size; |
---|
262 | s->src_num++; |
---|
263 | } |
---|
264 | |
---|
265 | |
---|
266 | void |
---|
267 | speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size) |
---|
268 | { |
---|
269 | if (s->dst_num >= numberof (s->dst)) |
---|
270 | { |
---|
271 | fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n"); |
---|
272 | abort (); |
---|
273 | } |
---|
274 | s->dst[s->dst_num].ptr = ptr; |
---|
275 | s->dst[s->dst_num].size = size; |
---|
276 | s->dst_num++; |
---|
277 | } |
---|
278 | |
---|
279 | |
---|
280 | void |
---|
281 | speed_cache_fill (struct speed_params *s) |
---|
282 | { |
---|
283 | static struct speed_params prev; |
---|
284 | int i; |
---|
285 | |
---|
286 | /* FIXME: need a better way to get the format string for a pointer */ |
---|
287 | |
---|
288 | if (speed_option_addrs) |
---|
289 | { |
---|
290 | int different; |
---|
291 | |
---|
292 | different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num); |
---|
293 | for (i = 0; i < s->dst_num; i++) |
---|
294 | different |= (s->dst[i].ptr != prev.dst[i].ptr); |
---|
295 | for (i = 0; i < s->src_num; i++) |
---|
296 | different |= (s->src[i].ptr != prev.src[i].ptr); |
---|
297 | |
---|
298 | if (different) |
---|
299 | { |
---|
300 | if (s->dst_num != 0) |
---|
301 | { |
---|
302 | printf ("dst"); |
---|
303 | for (i = 0; i < s->dst_num; i++) |
---|
304 | printf (" %08lX", (unsigned long) s->dst[i].ptr); |
---|
305 | printf (" "); |
---|
306 | } |
---|
307 | |
---|
308 | if (s->src_num != 0) |
---|
309 | { |
---|
310 | printf ("src"); |
---|
311 | for (i = 0; i < s->src_num; i++) |
---|
312 | printf (" %08lX", (unsigned long) s->src[i].ptr); |
---|
313 | printf (" "); |
---|
314 | } |
---|
315 | printf (" (cf sp approx %08lX)\n", (unsigned long) &different); |
---|
316 | |
---|
317 | } |
---|
318 | |
---|
319 | memcpy (&prev, s, sizeof(prev)); |
---|
320 | } |
---|
321 | |
---|
322 | switch (s->cache) { |
---|
323 | case 0: |
---|
324 | for (i = 0; i < s->dst_num; i++) |
---|
325 | mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size); |
---|
326 | for (i = 0; i < s->src_num; i++) |
---|
327 | mpn_cache_fill (s->src[i].ptr, s->src[i].size); |
---|
328 | break; |
---|
329 | case 1: |
---|
330 | pentium_wbinvd(); |
---|
331 | break; |
---|
332 | } |
---|
333 | } |
---|
334 | |
---|
335 | |
---|
336 | /* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs. ptr |
---|
337 | needs to have room for up to CACHE_LINE_SIZE-4 extra bytes. */ |
---|
338 | |
---|
339 | mp_ptr |
---|
340 | speed_tmp_alloc_adjust (void *ptr, mp_size_t align) |
---|
341 | { |
---|
342 | /* |
---|
343 | printf("%p %ld -> %p %X %X\n", ptr, align, |
---|
344 | (mp_ptr) ptr |
---|
345 | + ((align - ((mp_size_t) ptr >> 2)) & |
---|
346 | SPEED_TMP_ALLOC_ADJUST_MASK), |
---|
347 | ((mp_size_t) ptr >> 2) & SPEED_TMP_ALLOC_ADJUST_MASK, |
---|
348 | SPEED_TMP_ALLOC_ADJUST_MASK); |
---|
349 | */ |
---|
350 | |
---|
351 | return (mp_ptr) ptr |
---|
352 | + ((align - ((mp_size_t) ptr >> 2)) & SPEED_TMP_ALLOC_ADJUST_MASK); |
---|
353 | } |
---|
354 | |
---|
355 | |
---|
356 | /* Miscellanous options accepted by tune and speed programs under -o. */ |
---|
357 | |
---|
358 | void |
---|
359 | speed_option_set (const char *s) |
---|
360 | { |
---|
361 | int n; |
---|
362 | |
---|
363 | if (strcmp (s, "addrs") == 0) |
---|
364 | { |
---|
365 | speed_option_addrs = 1; |
---|
366 | } |
---|
367 | else if (strcmp (s, "verbose") == 0) |
---|
368 | { |
---|
369 | speed_option_verbose++; |
---|
370 | } |
---|
371 | else if (sscanf (s, "verbose=%d", &n) == 1) |
---|
372 | { |
---|
373 | speed_option_verbose = n; |
---|
374 | } |
---|
375 | else |
---|
376 | { |
---|
377 | printf ("Unrecognised -o option: %s\n", s); |
---|
378 | exit (1); |
---|
379 | } |
---|
380 | } |
---|
381 | |
---|
382 | |
---|
383 | /* The following are basic speed running routines for various gmp functions. |
---|
384 | Many are very similar and use speed.h macros. |
---|
385 | |
---|
386 | Each routine allocates it's own destination space for the result of the |
---|
387 | function, because only it can know what the function needs. |
---|
388 | |
---|
389 | speed_starttime() and speed_endtime() are put tight around the code to be |
---|
390 | measured. Any setups are done outside the timed portion. |
---|
391 | |
---|
392 | Each routine is responsible for its own cache priming. |
---|
393 | speed_cache_fill() is a good way to do this, see examples in speed.h. |
---|
394 | One cache priming possibility, for CPUs with write-allocate cache, and |
---|
395 | functions that don't take too long, is to do one dummy call before timing |
---|
396 | so as to cache everything that gets used. But speed_measure() runs a |
---|
397 | routine at least twice and will take the smaller time, so this might not |
---|
398 | be necessary. |
---|
399 | |
---|
400 | Data alignment will be important, for source, destination and temporary |
---|
401 | workspace. A routine can align its destination and workspace. Programs |
---|
402 | using the routines will ensure s->xp and s->yp are aligned. Aligning |
---|
403 | onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and |
---|
404 | s->align_wp2 should be respected where it makes sense to do so. |
---|
405 | SPEED_TMP_ALLOC_LIMBS is a good way to do this. |
---|
406 | |
---|
407 | A loop of the following form can be expected to turn into good assembler |
---|
408 | code on most CPUs, thereby minimizing overhead in the measurement. It |
---|
409 | can always be assumed s->reps >= 1. |
---|
410 | |
---|
411 | i = s->reps |
---|
412 | do |
---|
413 | foo(); |
---|
414 | while (--i != 0); |
---|
415 | |
---|
416 | Additional parameters might be added to "struct speed_params" in the |
---|
417 | future. Routines should ignore anything they don't use. |
---|
418 | |
---|
419 | s->size can be used creatively, and s->xp and s->yp can be ignored. For |
---|
420 | example, speed_mpz_fac_ui() uses s->size as n for the factorial. s->r is |
---|
421 | just a user-supplied parameter. speed_mpn_lshift() uses it as a shift, |
---|
422 | speed_mpn_mul_1() uses it as a multiplier. */ |
---|
423 | |
---|
424 | |
---|
425 | /* MPN_COPY etc can be macros, so the _CALL forms are necessary */ |
---|
426 | double |
---|
427 | speed_MPN_COPY (struct speed_params *s) |
---|
428 | { |
---|
429 | SPEED_ROUTINE_MPN_COPY (MPN_COPY); |
---|
430 | } |
---|
431 | double |
---|
432 | speed_MPN_COPY_INCR (struct speed_params *s) |
---|
433 | { |
---|
434 | SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR); |
---|
435 | } |
---|
436 | double |
---|
437 | speed_MPN_COPY_DECR (struct speed_params *s) |
---|
438 | { |
---|
439 | SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR); |
---|
440 | } |
---|
441 | #if HAVE_NATIVE_mpn_copyi |
---|
442 | double |
---|
443 | speed_mpn_copyi (struct speed_params *s) |
---|
444 | { |
---|
445 | SPEED_ROUTINE_MPN_COPY (mpn_copyi); |
---|
446 | } |
---|
447 | #endif |
---|
448 | #if HAVE_NATIVE_mpn_copyd |
---|
449 | double |
---|
450 | speed_mpn_copyd (struct speed_params *s) |
---|
451 | { |
---|
452 | SPEED_ROUTINE_MPN_COPY (mpn_copyd); |
---|
453 | } |
---|
454 | #endif |
---|
455 | double |
---|
456 | speed_memcpy (struct speed_params *s) |
---|
457 | { |
---|
458 | SPEED_ROUTINE_MPN_COPY_BYTES (memcpy); |
---|
459 | } |
---|
460 | double |
---|
461 | speed_mpn_com_n (struct speed_params *s) |
---|
462 | { |
---|
463 | SPEED_ROUTINE_MPN_COPY (mpn_com_n); |
---|
464 | } |
---|
465 | |
---|
466 | |
---|
467 | double |
---|
468 | speed_mpn_addmul_1 (struct speed_params *s) |
---|
469 | { |
---|
470 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1); |
---|
471 | } |
---|
472 | double |
---|
473 | speed_mpn_submul_1 (struct speed_params *s) |
---|
474 | { |
---|
475 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1); |
---|
476 | } |
---|
477 | |
---|
478 | |
---|
479 | double |
---|
480 | speed_mpn_mul_1 (struct speed_params *s) |
---|
481 | { |
---|
482 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1); |
---|
483 | } |
---|
484 | double |
---|
485 | speed_mpn_mul_1_inplace (struct speed_params *s) |
---|
486 | { |
---|
487 | SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1); |
---|
488 | } |
---|
489 | |
---|
490 | #if HAVE_NATIVE_mpn_mul_2 |
---|
491 | double |
---|
492 | speed_mpn_mul_2 (struct speed_params *s) |
---|
493 | { |
---|
494 | SPEED_ROUTINE_MPN_MUL_2 (mpn_mul_2); |
---|
495 | } |
---|
496 | #endif |
---|
497 | |
---|
498 | |
---|
499 | double |
---|
500 | speed_mpn_lshift (struct speed_params *s) |
---|
501 | { |
---|
502 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift); |
---|
503 | } |
---|
504 | double |
---|
505 | speed_mpn_rshift (struct speed_params *s) |
---|
506 | { |
---|
507 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift); |
---|
508 | } |
---|
509 | |
---|
510 | |
---|
511 | /* The carry-in variants (if available) are good for measuring because they |
---|
512 | won't skip a division if high<divisor. Alternately, use -1 as a divisor |
---|
513 | with the plain _1 forms. */ |
---|
514 | double |
---|
515 | speed_mpn_divrem_1 (struct speed_params *s) |
---|
516 | { |
---|
517 | SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1); |
---|
518 | } |
---|
519 | double |
---|
520 | speed_mpn_divrem_1f (struct speed_params *s) |
---|
521 | { |
---|
522 | SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1); |
---|
523 | } |
---|
524 | #if HAVE_NATIVE_mpn_divrem_1c |
---|
525 | double |
---|
526 | speed_mpn_divrem_1c (struct speed_params *s) |
---|
527 | { |
---|
528 | SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c); |
---|
529 | } |
---|
530 | double |
---|
531 | speed_mpn_divrem_1cf (struct speed_params *s) |
---|
532 | { |
---|
533 | SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c); |
---|
534 | } |
---|
535 | #endif |
---|
536 | |
---|
537 | double |
---|
538 | speed_mpn_divrem_1_div (struct speed_params *s) |
---|
539 | { |
---|
540 | SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div); |
---|
541 | } |
---|
542 | double |
---|
543 | speed_mpn_divrem_1f_div (struct speed_params *s) |
---|
544 | { |
---|
545 | SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div); |
---|
546 | } |
---|
547 | double |
---|
548 | speed_mpn_divrem_1_inv (struct speed_params *s) |
---|
549 | { |
---|
550 | SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv); |
---|
551 | } |
---|
552 | double |
---|
553 | speed_mpn_divrem_1f_inv (struct speed_params *s) |
---|
554 | { |
---|
555 | SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv); |
---|
556 | } |
---|
557 | double |
---|
558 | speed_mpn_mod_1_div (struct speed_params *s) |
---|
559 | { |
---|
560 | SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div); |
---|
561 | } |
---|
562 | double |
---|
563 | speed_mpn_mod_1_inv (struct speed_params *s) |
---|
564 | { |
---|
565 | SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv); |
---|
566 | } |
---|
567 | |
---|
568 | double |
---|
569 | speed_mpn_preinv_divrem_1 (struct speed_params *s) |
---|
570 | { |
---|
571 | SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1); |
---|
572 | } |
---|
573 | double |
---|
574 | speed_mpn_preinv_divrem_1f (struct speed_params *s) |
---|
575 | { |
---|
576 | SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1); |
---|
577 | } |
---|
578 | |
---|
579 | double |
---|
580 | speed_mpn_mod_34lsub1 (struct speed_params *s) |
---|
581 | { |
---|
582 | SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1); |
---|
583 | } |
---|
584 | |
---|
585 | double |
---|
586 | speed_mpn_divrem_2 (struct speed_params *s) |
---|
587 | { |
---|
588 | SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2); |
---|
589 | } |
---|
590 | double |
---|
591 | speed_mpn_divrem_2_div (struct speed_params *s) |
---|
592 | { |
---|
593 | SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div); |
---|
594 | } |
---|
595 | double |
---|
596 | speed_mpn_divrem_2_inv (struct speed_params *s) |
---|
597 | { |
---|
598 | SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv); |
---|
599 | } |
---|
600 | |
---|
601 | double |
---|
602 | speed_mpn_mod_1 (struct speed_params *s) |
---|
603 | { |
---|
604 | SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1); |
---|
605 | } |
---|
606 | #if HAVE_NATIVE_mpn_mod_1c |
---|
607 | double |
---|
608 | speed_mpn_mod_1c (struct speed_params *s) |
---|
609 | { |
---|
610 | SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c); |
---|
611 | } |
---|
612 | #endif |
---|
613 | double |
---|
614 | speed_mpn_preinv_mod_1 (struct speed_params *s) |
---|
615 | { |
---|
616 | SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1); |
---|
617 | } |
---|
618 | |
---|
619 | double |
---|
620 | speed_mpn_divexact_1 (struct speed_params *s) |
---|
621 | { |
---|
622 | SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1); |
---|
623 | } |
---|
624 | |
---|
625 | double |
---|
626 | speed_mpn_divexact_by3 (struct speed_params *s) |
---|
627 | { |
---|
628 | SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3); |
---|
629 | } |
---|
630 | |
---|
631 | #if HAVE_NATIVE_mpn_modexact_1_odd |
---|
632 | double |
---|
633 | speed_mpn_modexact_1_odd (struct speed_params *s) |
---|
634 | { |
---|
635 | SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd); |
---|
636 | } |
---|
637 | #endif |
---|
638 | |
---|
639 | double |
---|
640 | speed_mpn_modexact_1c_odd (struct speed_params *s) |
---|
641 | { |
---|
642 | SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd); |
---|
643 | } |
---|
644 | |
---|
645 | |
---|
646 | double |
---|
647 | speed_mpn_dc_tdiv_qr (struct speed_params *s) |
---|
648 | { |
---|
649 | SPEED_ROUTINE_MPN_DC_TDIV_QR (mpn_tdiv_qr); |
---|
650 | } |
---|
651 | double |
---|
652 | speed_mpn_dc_divrem_n (struct speed_params *s) |
---|
653 | { |
---|
654 | SPEED_ROUTINE_MPN_DC_DIVREM_N (mpn_dc_divrem_n); |
---|
655 | } |
---|
656 | double |
---|
657 | speed_mpn_dc_divrem_sb (struct speed_params *s) |
---|
658 | { |
---|
659 | SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn); |
---|
660 | } |
---|
661 | double |
---|
662 | speed_mpn_dc_divrem_sb_div (struct speed_params *s) |
---|
663 | { |
---|
664 | SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn_div); |
---|
665 | } |
---|
666 | double |
---|
667 | speed_mpn_dc_divrem_sb_inv (struct speed_params *s) |
---|
668 | { |
---|
669 | SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn_inv); |
---|
670 | } |
---|
671 | |
---|
672 | double |
---|
673 | speed_mpn_sb_divrem_m3 (struct speed_params *s) |
---|
674 | { |
---|
675 | SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn); |
---|
676 | } |
---|
677 | double |
---|
678 | speed_mpn_sb_divrem_m3_div (struct speed_params *s) |
---|
679 | { |
---|
680 | SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_div); |
---|
681 | } |
---|
682 | double |
---|
683 | speed_mpn_sb_divrem_m3_inv (struct speed_params *s) |
---|
684 | { |
---|
685 | SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_inv); |
---|
686 | } |
---|
687 | |
---|
688 | double |
---|
689 | speed_mpz_mod (struct speed_params *s) |
---|
690 | { |
---|
691 | SPEED_ROUTINE_MPZ_MOD (mpz_mod); |
---|
692 | } |
---|
693 | double |
---|
694 | speed_redc (struct speed_params *s) |
---|
695 | { |
---|
696 | SPEED_ROUTINE_REDC (redc); |
---|
697 | } |
---|
698 | |
---|
699 | |
---|
700 | double |
---|
701 | speed_mpn_popcount (struct speed_params *s) |
---|
702 | { |
---|
703 | SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount); |
---|
704 | } |
---|
705 | double |
---|
706 | speed_mpn_hamdist (struct speed_params *s) |
---|
707 | { |
---|
708 | SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist); |
---|
709 | } |
---|
710 | |
---|
711 | |
---|
712 | double |
---|
713 | speed_mpn_add_n (struct speed_params *s) |
---|
714 | { |
---|
715 | SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n); |
---|
716 | } |
---|
717 | double |
---|
718 | speed_mpn_sub_n (struct speed_params *s) |
---|
719 | { |
---|
720 | SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n); |
---|
721 | } |
---|
722 | |
---|
723 | |
---|
724 | /* mpn_and_n etc can be macros and so have to be handled with |
---|
725 | SPEED_ROUTINE_MPN_BINARY_N_CALL forms */ |
---|
726 | double |
---|
727 | speed_mpn_and_n (struct speed_params *s) |
---|
728 | { |
---|
729 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size)); |
---|
730 | } |
---|
731 | double |
---|
732 | speed_mpn_andn_n (struct speed_params *s) |
---|
733 | { |
---|
734 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size)); |
---|
735 | } |
---|
736 | double |
---|
737 | speed_mpn_nand_n (struct speed_params *s) |
---|
738 | { |
---|
739 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size)); |
---|
740 | } |
---|
741 | double |
---|
742 | speed_mpn_ior_n (struct speed_params *s) |
---|
743 | { |
---|
744 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size)); |
---|
745 | } |
---|
746 | double |
---|
747 | speed_mpn_iorn_n (struct speed_params *s) |
---|
748 | { |
---|
749 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size)); |
---|
750 | } |
---|
751 | double |
---|
752 | speed_mpn_nior_n (struct speed_params *s) |
---|
753 | { |
---|
754 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size)); |
---|
755 | } |
---|
756 | double |
---|
757 | speed_mpn_xor_n (struct speed_params *s) |
---|
758 | { |
---|
759 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size)); |
---|
760 | } |
---|
761 | double |
---|
762 | speed_mpn_xnor_n (struct speed_params *s) |
---|
763 | { |
---|
764 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size)); |
---|
765 | } |
---|
766 | |
---|
767 | |
---|
768 | double |
---|
769 | speed_mpn_mul_n (struct speed_params *s) |
---|
770 | { |
---|
771 | SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n); |
---|
772 | } |
---|
773 | double |
---|
774 | speed_mpn_sqr_n (struct speed_params *s) |
---|
775 | { |
---|
776 | SPEED_ROUTINE_MPN_SQR (mpn_sqr_n); |
---|
777 | } |
---|
778 | double |
---|
779 | speed_mpn_mul_n_sqr (struct speed_params *s) |
---|
780 | { |
---|
781 | SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size)); |
---|
782 | } |
---|
783 | |
---|
784 | double |
---|
785 | speed_mpn_mul_basecase (struct speed_params *s) |
---|
786 | { |
---|
787 | SPEED_ROUTINE_MPN_MUL_BASECASE(mpn_mul_basecase); |
---|
788 | } |
---|
789 | double |
---|
790 | speed_mpn_sqr_basecase (struct speed_params *s) |
---|
791 | { |
---|
792 | /* FIXME: size restrictions on some versions of sqr_basecase */ |
---|
793 | SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase); |
---|
794 | } |
---|
795 | |
---|
796 | #if HAVE_NATIVE_mpn_sqr_diagonal |
---|
797 | double |
---|
798 | speed_mpn_sqr_diagonal (struct speed_params *s) |
---|
799 | { |
---|
800 | SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal); |
---|
801 | } |
---|
802 | #endif |
---|
803 | |
---|
804 | double |
---|
805 | speed_mpn_kara_mul_n (struct speed_params *s) |
---|
806 | { |
---|
807 | SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n); |
---|
808 | } |
---|
809 | double |
---|
810 | speed_mpn_kara_sqr_n (struct speed_params *s) |
---|
811 | { |
---|
812 | SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n); |
---|
813 | } |
---|
814 | |
---|
815 | double |
---|
816 | speed_mpn_toom3_mul_n (struct speed_params *s) |
---|
817 | { |
---|
818 | SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n); |
---|
819 | } |
---|
820 | double |
---|
821 | speed_mpn_toom3_sqr_n (struct speed_params *s) |
---|
822 | { |
---|
823 | SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n); |
---|
824 | } |
---|
825 | |
---|
826 | double |
---|
827 | speed_mpn_toom3_mul_n_mpn (struct speed_params *s) |
---|
828 | { |
---|
829 | SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n_mpn); |
---|
830 | } |
---|
831 | double |
---|
832 | speed_mpn_toom3_mul_n_open (struct speed_params *s) |
---|
833 | { |
---|
834 | SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n_open); |
---|
835 | } |
---|
836 | double |
---|
837 | speed_mpn_toom3_sqr_n_mpn (struct speed_params *s) |
---|
838 | { |
---|
839 | SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n_mpn); |
---|
840 | } |
---|
841 | double |
---|
842 | speed_mpn_toom3_sqr_n_open (struct speed_params *s) |
---|
843 | { |
---|
844 | SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n_open); |
---|
845 | } |
---|
846 | |
---|
847 | double |
---|
848 | speed_mpn_mul_fft_full (struct speed_params *s) |
---|
849 | { |
---|
850 | SPEED_ROUTINE_MPN_MUL_N_CALL |
---|
851 | (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size)); |
---|
852 | } |
---|
853 | double |
---|
854 | speed_mpn_mul_fft_full_sqr (struct speed_params *s) |
---|
855 | { |
---|
856 | SPEED_ROUTINE_MPN_SQR_CALL |
---|
857 | (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size)); |
---|
858 | } |
---|
859 | |
---|
860 | |
---|
861 | /* These are mod 2^N+1 multiplies and squares. If s->r is supplied it's |
---|
862 | used as k, otherwise the best k for the size is used. If s->size isn't a |
---|
863 | multiple of 2^k it's rounded up to make the effective operation size. */ |
---|
864 | |
---|
865 | #define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr) \ |
---|
866 | { \ |
---|
867 | mp_ptr wp; \ |
---|
868 | mp_size_t pl; \ |
---|
869 | int k; \ |
---|
870 | unsigned i; \ |
---|
871 | double t; \ |
---|
872 | TMP_DECL (marker); \ |
---|
873 | \ |
---|
874 | SPEED_RESTRICT_COND (s->size >= 1); \ |
---|
875 | \ |
---|
876 | if (s->r != 0) \ |
---|
877 | k = s->r; \ |
---|
878 | else \ |
---|
879 | k = mpn_fft_best_k (s->size, sqr); \ |
---|
880 | \ |
---|
881 | TMP_MARK (marker); \ |
---|
882 | pl = mpn_fft_next_size (s->size, k); \ |
---|
883 | wp = SPEED_TMP_ALLOC_LIMBS (pl+1, s->align_wp); \ |
---|
884 | \ |
---|
885 | speed_operand_src (s, s->xp, s->size); \ |
---|
886 | if (!sqr) \ |
---|
887 | speed_operand_src (s, s->yp, s->size); \ |
---|
888 | speed_operand_dst (s, wp, pl+1); \ |
---|
889 | speed_cache_fill (s); \ |
---|
890 | \ |
---|
891 | speed_starttime (); \ |
---|
892 | i = s->reps; \ |
---|
893 | do \ |
---|
894 | call; \ |
---|
895 | while (--i != 0); \ |
---|
896 | t = speed_endtime (); \ |
---|
897 | \ |
---|
898 | TMP_FREE (marker); \ |
---|
899 | return t; \ |
---|
900 | } |
---|
901 | |
---|
902 | double |
---|
903 | speed_mpn_mul_fft (struct speed_params *s) |
---|
904 | { |
---|
905 | SPEED_ROUTINE_MPN_MUL_FFT_CALL |
---|
906 | (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0); |
---|
907 | } |
---|
908 | |
---|
909 | double |
---|
910 | speed_mpn_mul_fft_sqr (struct speed_params *s) |
---|
911 | { |
---|
912 | SPEED_ROUTINE_MPN_MUL_FFT_CALL |
---|
913 | (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1); |
---|
914 | } |
---|
915 | |
---|
916 | |
---|
917 | double |
---|
918 | speed_mpn_gcd (struct speed_params *s) |
---|
919 | { |
---|
920 | SPEED_ROUTINE_MPN_GCD (mpn_gcd); |
---|
921 | } |
---|
922 | double |
---|
923 | speed_mpn_gcd_binary (struct speed_params *s) |
---|
924 | { |
---|
925 | SPEED_ROUTINE_MPN_GCD (mpn_gcd_binary); |
---|
926 | } |
---|
927 | |
---|
928 | #if HAVE_NATIVE_mpn_gcd_finda |
---|
929 | double |
---|
930 | speed_mpn_gcd_finda (struct speed_params *s) |
---|
931 | { |
---|
932 | SPEED_ROUTINE_MPN_GCD_FINDA (mpn_gcd_finda); |
---|
933 | } |
---|
934 | #endif |
---|
935 | |
---|
936 | |
---|
937 | double |
---|
938 | speed_mpn_gcdext (struct speed_params *s) |
---|
939 | { |
---|
940 | SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext); |
---|
941 | } |
---|
942 | double |
---|
943 | speed_mpn_gcdext_single (struct speed_params *s) |
---|
944 | { |
---|
945 | SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single); |
---|
946 | } |
---|
947 | double |
---|
948 | speed_mpn_gcdext_double (struct speed_params *s) |
---|
949 | { |
---|
950 | SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double); |
---|
951 | } |
---|
952 | double |
---|
953 | speed_mpn_gcdext_one_single (struct speed_params *s) |
---|
954 | { |
---|
955 | SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single); |
---|
956 | } |
---|
957 | double |
---|
958 | speed_mpn_gcdext_one_double (struct speed_params *s) |
---|
959 | { |
---|
960 | SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double); |
---|
961 | } |
---|
962 | double |
---|
963 | speed_mpn_gcd_1 (struct speed_params *s) |
---|
964 | { |
---|
965 | SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1); |
---|
966 | } |
---|
967 | double |
---|
968 | speed_mpn_gcd_1N (struct speed_params *s) |
---|
969 | { |
---|
970 | SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1); |
---|
971 | } |
---|
972 | |
---|
973 | |
---|
974 | double |
---|
975 | speed_mpz_jacobi (struct speed_params *s) |
---|
976 | { |
---|
977 | SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi); |
---|
978 | } |
---|
979 | double |
---|
980 | speed_mpn_jacobi_base (struct speed_params *s) |
---|
981 | { |
---|
982 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base); |
---|
983 | } |
---|
984 | double |
---|
985 | speed_mpn_jacobi_base_1 (struct speed_params *s) |
---|
986 | { |
---|
987 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1); |
---|
988 | } |
---|
989 | double |
---|
990 | speed_mpn_jacobi_base_2 (struct speed_params *s) |
---|
991 | { |
---|
992 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2); |
---|
993 | } |
---|
994 | double |
---|
995 | speed_mpn_jacobi_base_3 (struct speed_params *s) |
---|
996 | { |
---|
997 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3); |
---|
998 | } |
---|
999 | |
---|
1000 | |
---|
1001 | double |
---|
1002 | speed_mpn_sqrtrem (struct speed_params *s) |
---|
1003 | { |
---|
1004 | SPEED_ROUTINE_MPN_SQRTREM (mpn_sqrtrem); |
---|
1005 | } |
---|
1006 | |
---|
1007 | |
---|
1008 | double |
---|
1009 | speed_mpz_fac_ui (struct speed_params *s) |
---|
1010 | { |
---|
1011 | SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui); |
---|
1012 | } |
---|
1013 | |
---|
1014 | |
---|
1015 | double |
---|
1016 | speed_mpn_fib2_ui (struct speed_params *s) |
---|
1017 | { |
---|
1018 | SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui); |
---|
1019 | } |
---|
1020 | double |
---|
1021 | speed_mpz_fib_ui (struct speed_params *s) |
---|
1022 | { |
---|
1023 | SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui); |
---|
1024 | } |
---|
1025 | double |
---|
1026 | speed_mpz_fib2_ui (struct speed_params *s) |
---|
1027 | { |
---|
1028 | SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui); |
---|
1029 | } |
---|
1030 | double |
---|
1031 | speed_mpz_lucnum_ui (struct speed_params *s) |
---|
1032 | { |
---|
1033 | SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui); |
---|
1034 | } |
---|
1035 | double |
---|
1036 | speed_mpz_lucnum2_ui (struct speed_params *s) |
---|
1037 | { |
---|
1038 | SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui); |
---|
1039 | } |
---|
1040 | |
---|
1041 | |
---|
1042 | double |
---|
1043 | speed_mpz_powm (struct speed_params *s) |
---|
1044 | { |
---|
1045 | SPEED_ROUTINE_MPZ_POWM (mpz_powm); |
---|
1046 | } |
---|
1047 | double |
---|
1048 | speed_mpz_powm_mod (struct speed_params *s) |
---|
1049 | { |
---|
1050 | SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod); |
---|
1051 | } |
---|
1052 | double |
---|
1053 | speed_mpz_powm_redc (struct speed_params *s) |
---|
1054 | { |
---|
1055 | SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc); |
---|
1056 | } |
---|
1057 | double |
---|
1058 | speed_mpz_powm_ui (struct speed_params *s) |
---|
1059 | { |
---|
1060 | SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui); |
---|
1061 | } |
---|
1062 | |
---|
1063 | |
---|
1064 | double |
---|
1065 | speed_modlimb_invert (struct speed_params *s) |
---|
1066 | { |
---|
1067 | SPEED_ROUTINE_MODLIMB_INVERT (modlimb_invert); |
---|
1068 | } |
---|
1069 | |
---|
1070 | |
---|
1071 | double |
---|
1072 | speed_noop (struct speed_params *s) |
---|
1073 | { |
---|
1074 | unsigned i; |
---|
1075 | |
---|
1076 | speed_starttime (); |
---|
1077 | i = s->reps; |
---|
1078 | do |
---|
1079 | noop (); |
---|
1080 | while (--i != 0); |
---|
1081 | return speed_endtime (); |
---|
1082 | } |
---|
1083 | |
---|
1084 | double |
---|
1085 | speed_noop_wxs (struct speed_params *s) |
---|
1086 | { |
---|
1087 | mp_ptr wp; |
---|
1088 | unsigned i; |
---|
1089 | double t; |
---|
1090 | TMP_DECL (marker); |
---|
1091 | |
---|
1092 | TMP_MARK (marker); |
---|
1093 | wp = TMP_ALLOC_LIMBS (1); |
---|
1094 | |
---|
1095 | speed_starttime (); |
---|
1096 | i = s->reps; |
---|
1097 | do |
---|
1098 | noop_wxs (wp, s->xp, s->size); |
---|
1099 | while (--i != 0); |
---|
1100 | t = speed_endtime (); |
---|
1101 | |
---|
1102 | TMP_FREE (marker); |
---|
1103 | return t; |
---|
1104 | } |
---|
1105 | |
---|
1106 | double |
---|
1107 | speed_noop_wxys (struct speed_params *s) |
---|
1108 | { |
---|
1109 | mp_ptr wp; |
---|
1110 | unsigned i; |
---|
1111 | double t; |
---|
1112 | TMP_DECL (marker); |
---|
1113 | |
---|
1114 | TMP_MARK (marker); |
---|
1115 | wp = TMP_ALLOC_LIMBS (1); |
---|
1116 | |
---|
1117 | speed_starttime (); |
---|
1118 | i = s->reps; |
---|
1119 | do |
---|
1120 | noop_wxys (wp, s->xp, s->yp, s->size); |
---|
1121 | while (--i != 0); |
---|
1122 | t = speed_endtime (); |
---|
1123 | |
---|
1124 | TMP_FREE (marker); |
---|
1125 | return t; |
---|
1126 | } |
---|
1127 | |
---|
1128 | |
---|
1129 | #define SPEED_ROUTINE_ALLOC_FREE(variables, calls) \ |
---|
1130 | { \ |
---|
1131 | unsigned i; \ |
---|
1132 | variables; \ |
---|
1133 | \ |
---|
1134 | speed_starttime (); \ |
---|
1135 | i = s->reps; \ |
---|
1136 | do \ |
---|
1137 | { \ |
---|
1138 | calls; \ |
---|
1139 | } \ |
---|
1140 | while (--i != 0); \ |
---|
1141 | return speed_endtime (); \ |
---|
1142 | } |
---|
1143 | |
---|
1144 | |
---|
1145 | /* Compare these to see how much malloc/free costs and then how much |
---|
1146 | __gmp_default_allocate/free and mpz_init/clear add. mpz_init/clear or |
---|
1147 | mpq_init/clear will be doing a 1 limb allocate, so use that as the size |
---|
1148 | when including them in comparisons. */ |
---|
1149 | |
---|
1150 | double |
---|
1151 | speed_malloc_free (struct speed_params *s) |
---|
1152 | { |
---|
1153 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |
---|
1154 | SPEED_ROUTINE_ALLOC_FREE (void *p, |
---|
1155 | p = malloc (bytes); |
---|
1156 | free (p)); |
---|
1157 | } |
---|
1158 | |
---|
1159 | double |
---|
1160 | speed_malloc_realloc_free (struct speed_params *s) |
---|
1161 | { |
---|
1162 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |
---|
1163 | SPEED_ROUTINE_ALLOC_FREE (void *p, |
---|
1164 | p = malloc (BYTES_PER_MP_LIMB); |
---|
1165 | p = realloc (p, bytes); |
---|
1166 | free (p)); |
---|
1167 | } |
---|
1168 | |
---|
1169 | double |
---|
1170 | speed_gmp_allocate_free (struct speed_params *s) |
---|
1171 | { |
---|
1172 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |
---|
1173 | SPEED_ROUTINE_ALLOC_FREE (void *p, |
---|
1174 | p = (*__gmp_allocate_func) (bytes); |
---|
1175 | (*__gmp_free_func) (p, bytes)); |
---|
1176 | } |
---|
1177 | |
---|
1178 | double |
---|
1179 | speed_gmp_allocate_reallocate_free (struct speed_params *s) |
---|
1180 | { |
---|
1181 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |
---|
1182 | SPEED_ROUTINE_ALLOC_FREE |
---|
1183 | (void *p, |
---|
1184 | p = (*__gmp_allocate_func) (BYTES_PER_MP_LIMB); |
---|
1185 | p = (*__gmp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB); |
---|
1186 | (*__gmp_free_func) (p, bytes)); |
---|
1187 | } |
---|
1188 | |
---|
1189 | double |
---|
1190 | speed_mpz_init_clear (struct speed_params *s) |
---|
1191 | { |
---|
1192 | SPEED_ROUTINE_ALLOC_FREE (mpz_t z, |
---|
1193 | mpz_init (z); |
---|
1194 | mpz_clear (z)); |
---|
1195 | } |
---|
1196 | |
---|
1197 | double |
---|
1198 | speed_mpz_init_realloc_clear (struct speed_params *s) |
---|
1199 | { |
---|
1200 | SPEED_ROUTINE_ALLOC_FREE (mpz_t z, |
---|
1201 | mpz_init (z); |
---|
1202 | _mpz_realloc (z, s->size); |
---|
1203 | mpz_clear (z)); |
---|
1204 | } |
---|
1205 | |
---|
1206 | double |
---|
1207 | speed_mpq_init_clear (struct speed_params *s) |
---|
1208 | { |
---|
1209 | SPEED_ROUTINE_ALLOC_FREE (mpq_t q, |
---|
1210 | mpq_init (q); |
---|
1211 | mpq_clear (q)); |
---|
1212 | } |
---|
1213 | |
---|
1214 | double |
---|
1215 | speed_mpf_init_clear (struct speed_params *s) |
---|
1216 | { |
---|
1217 | SPEED_ROUTINE_ALLOC_FREE (mpf_t f, |
---|
1218 | mpf_init (f); |
---|
1219 | mpf_clear (f)); |
---|
1220 | } |
---|
1221 | |
---|
1222 | |
---|
1223 | /* Compare this to mpn_add_n to see how much overhead mpz_add adds. Note |
---|
1224 | that repeatedly calling mpz_add with the same data gives branch predition |
---|
1225 | in it an advantage. */ |
---|
1226 | |
---|
1227 | double |
---|
1228 | speed_mpz_add (struct speed_params *s) |
---|
1229 | { |
---|
1230 | mpz_t w, x, y; |
---|
1231 | unsigned i; |
---|
1232 | double t; |
---|
1233 | |
---|
1234 | mpz_init (w); |
---|
1235 | mpz_init (x); |
---|
1236 | mpz_init (y); |
---|
1237 | |
---|
1238 | mpz_set_n (x, s->xp, s->size); |
---|
1239 | mpz_set_n (y, s->yp, s->size); |
---|
1240 | mpz_add (w, x, y); |
---|
1241 | |
---|
1242 | speed_starttime (); |
---|
1243 | i = s->reps; |
---|
1244 | do |
---|
1245 | { |
---|
1246 | mpz_add (w, x, y); |
---|
1247 | } |
---|
1248 | while (--i != 0); |
---|
1249 | t = speed_endtime (); |
---|
1250 | |
---|
1251 | mpz_clear (w); |
---|
1252 | mpz_clear (x); |
---|
1253 | mpz_clear (y); |
---|
1254 | return t; |
---|
1255 | } |
---|
1256 | |
---|
1257 | |
---|
1258 | /* If r==0, calculate (size,size/2), |
---|
1259 | otherwise calculate (size,r). */ |
---|
1260 | |
---|
1261 | double |
---|
1262 | speed_mpz_bin_uiui (struct speed_params *s) |
---|
1263 | { |
---|
1264 | mpz_t w; |
---|
1265 | unsigned long k; |
---|
1266 | unsigned i; |
---|
1267 | double t; |
---|
1268 | |
---|
1269 | mpz_init (w); |
---|
1270 | if (s->r != 0) |
---|
1271 | k = s->r; |
---|
1272 | else |
---|
1273 | k = s->size/2; |
---|
1274 | |
---|
1275 | speed_starttime (); |
---|
1276 | i = s->reps; |
---|
1277 | do |
---|
1278 | { |
---|
1279 | mpz_bin_uiui (w, s->size, k); |
---|
1280 | } |
---|
1281 | while (--i != 0); |
---|
1282 | t = speed_endtime (); |
---|
1283 | |
---|
1284 | mpz_clear (w); |
---|
1285 | return t; |
---|
1286 | } |
---|
1287 | |
---|
1288 | |
---|
1289 | /* The multiplies are successively dependent so the latency is measured, not |
---|
1290 | the issue rate. There's only 10 per loop so the code doesn't get too big |
---|
1291 | since umul_ppmm is several instructions on some cpus. |
---|
1292 | |
---|
1293 | Putting the arguments as "h,l,l,h" gets slightly better code from gcc |
---|
1294 | 2.95.2 on x86, it puts only one mov between each mul, not two. That mov |
---|
1295 | though will probably show up as a bogus extra cycle though. |
---|
1296 | |
---|
1297 | The measuring function macros are into three parts to avoid overflowing |
---|
1298 | preprocessor expansion space if umul_ppmm is big. |
---|
1299 | |
---|
1300 | Limitations: |
---|
1301 | |
---|
1302 | Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code |
---|
1303 | generated first, especially on CPUs with low latency multipliers. |
---|
1304 | |
---|
1305 | The default umul_ppmm doing h*l will be getting increasing numbers of |
---|
1306 | high zero bits in the calculation. CPUs with data-dependent multipliers |
---|
1307 | will want to use umul_ppmm.1 to get some randomization into the |
---|
1308 | calculation. The extra xors and fetches will be a slowdown of course. */ |
---|
1309 | |
---|
1310 | #define SPEED_MACRO_UMUL_PPMM_A \ |
---|
1311 | { \ |
---|
1312 | mp_limb_t h, l; \ |
---|
1313 | unsigned i; \ |
---|
1314 | double t; \ |
---|
1315 | \ |
---|
1316 | s->time_divisor = 10; \ |
---|
1317 | \ |
---|
1318 | h = s->xp[0]; \ |
---|
1319 | l = s->yp[0]; \ |
---|
1320 | \ |
---|
1321 | if (s->r == 1) \ |
---|
1322 | { \ |
---|
1323 | speed_starttime (); \ |
---|
1324 | i = s->reps; \ |
---|
1325 | do \ |
---|
1326 | { |
---|
1327 | |
---|
1328 | #define SPEED_MACRO_UMUL_PPMM_B \ |
---|
1329 | } \ |
---|
1330 | while (--i != 0); \ |
---|
1331 | t = speed_endtime (); \ |
---|
1332 | } \ |
---|
1333 | else \ |
---|
1334 | { \ |
---|
1335 | speed_starttime (); \ |
---|
1336 | i = s->reps; \ |
---|
1337 | do \ |
---|
1338 | { |
---|
1339 | |
---|
1340 | #define SPEED_MACRO_UMUL_PPMM_C \ |
---|
1341 | } \ |
---|
1342 | while (--i != 0); \ |
---|
1343 | t = speed_endtime (); \ |
---|
1344 | } \ |
---|
1345 | \ |
---|
1346 | /* stop the compiler optimizing away the whole calculation! */ \ |
---|
1347 | noop_1 (h); \ |
---|
1348 | noop_1 (l); \ |
---|
1349 | \ |
---|
1350 | return t; \ |
---|
1351 | } |
---|
1352 | |
---|
1353 | |
---|
1354 | double |
---|
1355 | speed_umul_ppmm (struct speed_params *s) |
---|
1356 | { |
---|
1357 | SPEED_MACRO_UMUL_PPMM_A; |
---|
1358 | { |
---|
1359 | umul_ppmm (h, l, l, h); h ^= s->xp_block[0]; l ^= s->yp_block[0]; |
---|
1360 | umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1]; |
---|
1361 | umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2]; |
---|
1362 | umul_ppmm (h, l, l, h); h ^= s->xp_block[3]; l ^= s->yp_block[3]; |
---|
1363 | umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4]; |
---|
1364 | umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5]; |
---|
1365 | umul_ppmm (h, l, l, h); h ^= s->xp_block[6]; l ^= s->yp_block[6]; |
---|
1366 | umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7]; |
---|
1367 | umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8]; |
---|
1368 | umul_ppmm (h, l, l, h); h ^= s->xp_block[9]; l ^= s->yp_block[9]; |
---|
1369 | } |
---|
1370 | SPEED_MACRO_UMUL_PPMM_B; |
---|
1371 | { |
---|
1372 | umul_ppmm (h, l, l, h); |
---|
1373 | umul_ppmm (h, l, l, h); |
---|
1374 | umul_ppmm (h, l, l, h); |
---|
1375 | umul_ppmm (h, l, l, h); |
---|
1376 | umul_ppmm (h, l, l, h); |
---|
1377 | umul_ppmm (h, l, l, h); |
---|
1378 | umul_ppmm (h, l, l, h); |
---|
1379 | umul_ppmm (h, l, l, h); |
---|
1380 | umul_ppmm (h, l, l, h); |
---|
1381 | umul_ppmm (h, l, l, h); |
---|
1382 | } |
---|
1383 | SPEED_MACRO_UMUL_PPMM_C; |
---|
1384 | } |
---|
1385 | |
---|
1386 | |
---|
1387 | #if HAVE_NATIVE_mpn_umul_ppmm |
---|
1388 | |
---|
1389 | #if defined (__hppa) && W_TYPE_SIZE == 64 |
---|
1390 | #define CALL_MPN_UMUL_PPMM (h = __MPN (umul_ppmm) (h, l, &l)) |
---|
1391 | #else |
---|
1392 | #define CALL_MPN_UMUL_PPMM (h = __MPN (umul_ppmm) (&l, h, l)) |
---|
1393 | #endif |
---|
1394 | |
---|
1395 | double |
---|
1396 | speed_mpn_umul_ppmm (struct speed_params *s) |
---|
1397 | { |
---|
1398 | SPEED_MACRO_UMUL_PPMM_A; |
---|
1399 | { |
---|
1400 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[0]; l ^= s->yp_block[0]; |
---|
1401 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[1]; l ^= s->yp_block[1]; |
---|
1402 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[2]; l ^= s->yp_block[2]; |
---|
1403 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[3]; l ^= s->yp_block[3]; |
---|
1404 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[4]; l ^= s->yp_block[4]; |
---|
1405 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[5]; l ^= s->yp_block[5]; |
---|
1406 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[6]; l ^= s->yp_block[6]; |
---|
1407 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[7]; l ^= s->yp_block[7]; |
---|
1408 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[8]; l ^= s->yp_block[8]; |
---|
1409 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[9]; l ^= s->yp_block[9]; |
---|
1410 | } |
---|
1411 | SPEED_MACRO_UMUL_PPMM_B; |
---|
1412 | { |
---|
1413 | CALL_MPN_UMUL_PPMM; |
---|
1414 | CALL_MPN_UMUL_PPMM; |
---|
1415 | CALL_MPN_UMUL_PPMM; |
---|
1416 | CALL_MPN_UMUL_PPMM; |
---|
1417 | CALL_MPN_UMUL_PPMM; |
---|
1418 | CALL_MPN_UMUL_PPMM; |
---|
1419 | CALL_MPN_UMUL_PPMM; |
---|
1420 | CALL_MPN_UMUL_PPMM; |
---|
1421 | CALL_MPN_UMUL_PPMM; |
---|
1422 | CALL_MPN_UMUL_PPMM; |
---|
1423 | } |
---|
1424 | SPEED_MACRO_UMUL_PPMM_C; |
---|
1425 | } |
---|
1426 | #endif |
---|
1427 | |
---|
1428 | |
---|
1429 | /* The divisions are successively dependent so latency is measured, not |
---|
1430 | issue rate. There's only 10 per loop so the code doesn't get too big, |
---|
1431 | especially for udiv_qrnnd_preinv and preinv2norm, which are several |
---|
1432 | instructions each. |
---|
1433 | |
---|
1434 | Note that it's only the division which is measured here, there's no data |
---|
1435 | fetching and no shifting if the divisor gets normalized. |
---|
1436 | |
---|
1437 | In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d" |
---|
1438 | generate x86 div instructions with nothing in between. |
---|
1439 | |
---|
1440 | The measuring function macros are in two parts to avoid overflowing |
---|
1441 | preprocessor expansion space if udiv_qrnnd etc are big. |
---|
1442 | |
---|
1443 | Limitations: |
---|
1444 | |
---|
1445 | Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code |
---|
1446 | generated first. |
---|
1447 | |
---|
1448 | CPUs with data-dependent divisions may want more attention paid to the |
---|
1449 | randomness of the data used. Probably the measurement wanted is over |
---|
1450 | uniformly distributed numbers, but what's here might not be giving that. */ |
---|
1451 | |
---|
1452 | #define SPEED_ROUTINE_UDIV_QRNND_A(normalize) \ |
---|
1453 | { \ |
---|
1454 | double t; \ |
---|
1455 | unsigned i; \ |
---|
1456 | mp_limb_t q, r, d; \ |
---|
1457 | mp_limb_t dinv; \ |
---|
1458 | \ |
---|
1459 | s->time_divisor = 10; \ |
---|
1460 | \ |
---|
1461 | /* divisor from "r" parameter, or a default */ \ |
---|
1462 | d = s->r; \ |
---|
1463 | if (d == 0) \ |
---|
1464 | d = __mp_bases[10].big_base; \ |
---|
1465 | \ |
---|
1466 | if (normalize) \ |
---|
1467 | { \ |
---|
1468 | unsigned norm; \ |
---|
1469 | count_leading_zeros (norm, d); \ |
---|
1470 | d <<= norm; \ |
---|
1471 | invert_limb (dinv, d); \ |
---|
1472 | } \ |
---|
1473 | \ |
---|
1474 | q = s->xp[0]; \ |
---|
1475 | r = s->yp[0] % d; \ |
---|
1476 | \ |
---|
1477 | speed_starttime (); \ |
---|
1478 | i = s->reps; \ |
---|
1479 | do \ |
---|
1480 | { |
---|
1481 | |
---|
1482 | #define SPEED_ROUTINE_UDIV_QRNND_B \ |
---|
1483 | } \ |
---|
1484 | while (--i != 0); \ |
---|
1485 | t = speed_endtime (); \ |
---|
1486 | \ |
---|
1487 | /* stop the compiler optimizing away the whole calculation! */ \ |
---|
1488 | noop_1 (q); \ |
---|
1489 | noop_1 (r); \ |
---|
1490 | \ |
---|
1491 | return t; \ |
---|
1492 | } |
---|
1493 | |
---|
1494 | double |
---|
1495 | speed_udiv_qrnnd (struct speed_params *s) |
---|
1496 | { |
---|
1497 | SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION); |
---|
1498 | { |
---|
1499 | udiv_qrnnd (q, r, r, q, d); |
---|
1500 | udiv_qrnnd (q, r, r, q, d); |
---|
1501 | udiv_qrnnd (q, r, r, q, d); |
---|
1502 | udiv_qrnnd (q, r, r, q, d); |
---|
1503 | udiv_qrnnd (q, r, r, q, d); |
---|
1504 | udiv_qrnnd (q, r, r, q, d); |
---|
1505 | udiv_qrnnd (q, r, r, q, d); |
---|
1506 | udiv_qrnnd (q, r, r, q, d); |
---|
1507 | udiv_qrnnd (q, r, r, q, d); |
---|
1508 | udiv_qrnnd (q, r, r, q, d); |
---|
1509 | } |
---|
1510 | SPEED_ROUTINE_UDIV_QRNND_B; |
---|
1511 | } |
---|
1512 | |
---|
1513 | double |
---|
1514 | speed_udiv_qrnnd_preinv (struct speed_params *s) |
---|
1515 | { |
---|
1516 | SPEED_ROUTINE_UDIV_QRNND_A (1); |
---|
1517 | { |
---|
1518 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1519 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1520 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1521 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1522 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1523 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1524 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1525 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1526 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1527 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |
---|
1528 | } |
---|
1529 | SPEED_ROUTINE_UDIV_QRNND_B; |
---|
1530 | } |
---|
1531 | |
---|
1532 | double |
---|
1533 | speed_udiv_qrnnd_preinv2norm (struct speed_params *s) |
---|
1534 | { |
---|
1535 | SPEED_ROUTINE_UDIV_QRNND_A (1); |
---|
1536 | { |
---|
1537 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1538 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1539 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1540 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1541 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1542 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1543 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1544 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1545 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1546 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |
---|
1547 | } |
---|
1548 | SPEED_ROUTINE_UDIV_QRNND_B; |
---|
1549 | } |
---|
1550 | |
---|
1551 | double |
---|
1552 | speed_udiv_qrnnd_c (struct speed_params *s) |
---|
1553 | { |
---|
1554 | SPEED_ROUTINE_UDIV_QRNND_A (1); |
---|
1555 | { |
---|
1556 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1557 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1558 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1559 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1560 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1561 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1562 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1563 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1564 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1565 | __udiv_qrnnd_c (q, r, r, q, d); |
---|
1566 | } |
---|
1567 | SPEED_ROUTINE_UDIV_QRNND_B; |
---|
1568 | } |
---|
1569 | |
---|
1570 | #if HAVE_NATIVE_mpn_udiv_qrnnd |
---|
1571 | |
---|
1572 | #if defined (__hppa) && W_TYPE_SIZE == 64 |
---|
1573 | #define CALL_MPN_UDIV_QRNND (q = __MPN (udiv_qrnnd) (r, q, d, &r)) |
---|
1574 | #else |
---|
1575 | #define CALL_MPN_UDIV_QRNND (q = __MPN (udiv_qrnnd) (&r, r, q, d)) |
---|
1576 | #endif |
---|
1577 | |
---|
1578 | double |
---|
1579 | speed_mpn_udiv_qrnnd (struct speed_params *s) |
---|
1580 | { |
---|
1581 | SPEED_ROUTINE_UDIV_QRNND_A (1); |
---|
1582 | { |
---|
1583 | CALL_MPN_UDIV_QRNND; |
---|
1584 | CALL_MPN_UDIV_QRNND; |
---|
1585 | CALL_MPN_UDIV_QRNND; |
---|
1586 | CALL_MPN_UDIV_QRNND; |
---|
1587 | CALL_MPN_UDIV_QRNND; |
---|
1588 | CALL_MPN_UDIV_QRNND; |
---|
1589 | CALL_MPN_UDIV_QRNND; |
---|
1590 | CALL_MPN_UDIV_QRNND; |
---|
1591 | CALL_MPN_UDIV_QRNND; |
---|
1592 | CALL_MPN_UDIV_QRNND; |
---|
1593 | } |
---|
1594 | SPEED_ROUTINE_UDIV_QRNND_B; |
---|
1595 | } |
---|
1596 | #endif |
---|
1597 | |
---|
1598 | |
---|
1599 | double |
---|
1600 | speed_invert_limb (struct speed_params *s) |
---|
1601 | { |
---|
1602 | SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d)); |
---|
1603 | } |
---|
1604 | |
---|
1605 | |
---|
1606 | /* xp[0] might not be particularly random, but should give an indication how |
---|
1607 | "/" runs. Same for speed_operator_mod below. */ |
---|
1608 | double |
---|
1609 | speed_operator_div (struct speed_params *s) |
---|
1610 | { |
---|
1611 | double t; |
---|
1612 | unsigned i; |
---|
1613 | mp_limb_t x, q, d; |
---|
1614 | |
---|
1615 | s->time_divisor = 10; |
---|
1616 | |
---|
1617 | /* divisor from "r" parameter, or a default */ |
---|
1618 | d = s->r; |
---|
1619 | if (d == 0) |
---|
1620 | d = __mp_bases[10].big_base; |
---|
1621 | |
---|
1622 | x = s->xp[0]; |
---|
1623 | q = 0; |
---|
1624 | |
---|
1625 | speed_starttime (); |
---|
1626 | i = s->reps; |
---|
1627 | do |
---|
1628 | { |
---|
1629 | q ^= x; q /= d; |
---|
1630 | q ^= x; q /= d; |
---|
1631 | q ^= x; q /= d; |
---|
1632 | q ^= x; q /= d; |
---|
1633 | q ^= x; q /= d; |
---|
1634 | q ^= x; q /= d; |
---|
1635 | q ^= x; q /= d; |
---|
1636 | q ^= x; q /= d; |
---|
1637 | q ^= x; q /= d; |
---|
1638 | q ^= x; q /= d; |
---|
1639 | } |
---|
1640 | while (--i != 0); |
---|
1641 | t = speed_endtime (); |
---|
1642 | |
---|
1643 | /* stop the compiler optimizing away the whole calculation! */ |
---|
1644 | noop_1 (q); |
---|
1645 | |
---|
1646 | return t; |
---|
1647 | } |
---|
1648 | |
---|
1649 | double |
---|
1650 | speed_operator_mod (struct speed_params *s) |
---|
1651 | { |
---|
1652 | double t; |
---|
1653 | unsigned i; |
---|
1654 | mp_limb_t x, r, d; |
---|
1655 | |
---|
1656 | s->time_divisor = 10; |
---|
1657 | |
---|
1658 | /* divisor from "r" parameter, or a default */ |
---|
1659 | d = s->r; |
---|
1660 | if (d == 0) |
---|
1661 | d = __mp_bases[10].big_base; |
---|
1662 | |
---|
1663 | x = s->xp[0]; |
---|
1664 | r = 0; |
---|
1665 | |
---|
1666 | speed_starttime (); |
---|
1667 | i = s->reps; |
---|
1668 | do |
---|
1669 | { |
---|
1670 | r ^= x; r %= d; |
---|
1671 | r ^= x; r %= d; |
---|
1672 | r ^= x; r %= d; |
---|
1673 | r ^= x; r %= d; |
---|
1674 | r ^= x; r %= d; |
---|
1675 | r ^= x; r %= d; |
---|
1676 | r ^= x; r %= d; |
---|
1677 | r ^= x; r %= d; |
---|
1678 | r ^= x; r %= d; |
---|
1679 | r ^= x; r %= d; |
---|
1680 | } |
---|
1681 | while (--i != 0); |
---|
1682 | t = speed_endtime (); |
---|
1683 | |
---|
1684 | /* stop the compiler optimizing away the whole calculation! */ |
---|
1685 | noop_1 (r); |
---|
1686 | |
---|
1687 | return t; |
---|
1688 | } |
---|
1689 | |
---|
1690 | |
---|
1691 | /* r==0 measures on data with the values uniformly distributed. This will |
---|
1692 | be typical for count_trailing_zeros in a GCD etc. |
---|
1693 | |
---|
1694 | r==1 measures on data with the resultant count uniformly distributed |
---|
1695 | between 0 and BITS_PER_MP_LIMB-1. This is probably sensible for |
---|
1696 | count_leading_zeros on the high limbs of divisors. */ |
---|
1697 | |
---|
1698 | int |
---|
1699 | speed_routine_count_zeros_setup (struct speed_params *s, |
---|
1700 | mp_ptr xp, int leading, int zero) |
---|
1701 | { |
---|
1702 | int i, c; |
---|
1703 | mp_limb_t n; |
---|
1704 | |
---|
1705 | if (s->r == 0) |
---|
1706 | { |
---|
1707 | /* Make uniformly distributed data. If zero isn't allowed then change |
---|
1708 | it to 1 for leading, or 0x800..00 for trailing. */ |
---|
1709 | MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE); |
---|
1710 | if (! zero) |
---|
1711 | for (i = 0; i < SPEED_BLOCK_SIZE; i++) |
---|
1712 | if (xp[i] == 0) |
---|
1713 | xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT; |
---|
1714 | } |
---|
1715 | else if (s->r == 1) |
---|
1716 | { |
---|
1717 | /* Make counts uniformly distributed. A randomly chosen bit is set, and |
---|
1718 | for leading the rest above it are cleared, or for trailing then the |
---|
1719 | rest below. */ |
---|
1720 | for (i = 0; i < SPEED_BLOCK_SIZE; i++) |
---|
1721 | { |
---|
1722 | mp_limb_t set = CNST_LIMB(1) << (s->yp_block[i] % BITS_PER_MP_LIMB); |
---|
1723 | mp_limb_t keep_below = set-1; |
---|
1724 | mp_limb_t keep_above = MP_LIMB_T_MAX ^ keep_below; |
---|
1725 | mp_limb_t keep = (leading ? keep_below : keep_above); |
---|
1726 | xp[i] = (s->xp_block[i] & keep) | set; |
---|
1727 | } |
---|
1728 | } |
---|
1729 | else |
---|
1730 | { |
---|
1731 | return 0; |
---|
1732 | } |
---|
1733 | |
---|
1734 | /* Account for the effect of n^=c. */ |
---|
1735 | c = 0; |
---|
1736 | for (i = 0; i < SPEED_BLOCK_SIZE; i++) |
---|
1737 | { |
---|
1738 | n = xp[i]; |
---|
1739 | xp[i] ^= c; |
---|
1740 | |
---|
1741 | if (leading) |
---|
1742 | count_leading_zeros (c, n); |
---|
1743 | else |
---|
1744 | count_trailing_zeros (c, n); |
---|
1745 | } |
---|
1746 | |
---|
1747 | return 1; |
---|
1748 | } |
---|
1749 | |
---|
1750 | double |
---|
1751 | speed_count_leading_zeros (struct speed_params *s) |
---|
1752 | { |
---|
1753 | #ifdef COUNT_LEADING_ZEROS_0 |
---|
1754 | #define COUNT_LEADING_ZEROS_0_ALLOWED 1 |
---|
1755 | #else |
---|
1756 | #define COUNT_LEADING_ZEROS_0_ALLOWED 0 |
---|
1757 | #endif |
---|
1758 | |
---|
1759 | SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED); |
---|
1760 | count_leading_zeros (c, n); |
---|
1761 | SPEED_ROUTINE_COUNT_ZEROS_B (); |
---|
1762 | } |
---|
1763 | double |
---|
1764 | speed_count_trailing_zeros (struct speed_params *s) |
---|
1765 | { |
---|
1766 | SPEED_ROUTINE_COUNT_ZEROS_A (0, 0); |
---|
1767 | count_trailing_zeros (c, n); |
---|
1768 | SPEED_ROUTINE_COUNT_ZEROS_B (); |
---|
1769 | } |
---|
1770 | |
---|
1771 | |
---|
1772 | double |
---|
1773 | speed_mpn_get_str (struct speed_params *s) |
---|
1774 | { |
---|
1775 | SPEED_ROUTINE_MPN_GET_STR (mpn_get_str); |
---|
1776 | } |
---|
1777 | |
---|
1778 | double |
---|
1779 | speed_mpn_set_str (struct speed_params *s) |
---|
1780 | { |
---|
1781 | SPEED_ROUTINE_MPN_SET_STR (mpn_set_str); |
---|
1782 | } |
---|
1783 | double |
---|
1784 | speed_mpn_set_str_basecase (struct speed_params *s) |
---|
1785 | { |
---|
1786 | SPEED_ROUTINE_MPN_SET_STR (mpn_set_str_basecase); |
---|
1787 | } |
---|
1788 | double |
---|
1789 | speed_mpn_set_str_subquad (struct speed_params *s) |
---|
1790 | { |
---|
1791 | SPEED_ROUTINE_MPN_SET_STR (mpn_set_str_subquad); |
---|
1792 | } |
---|
1793 | |
---|
1794 | |
---|
1795 | double |
---|
1796 | speed_MPN_ZERO (struct speed_params *s) |
---|
1797 | { |
---|
1798 | SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size)); |
---|
1799 | } |
---|