#
source:
trunk/third/gmp/tune/common.c
@
18191

Revision 18191, 45.1 KB checked in by ghudson, 22 years ago (diff) |
---|

Line | |
---|---|

1 | /* Shared speed subroutines. |

2 | |

3 | Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |

4 | |

5 | This file is part of the GNU MP Library. |

6 | |

7 | The GNU MP Library is free software; you can redistribute it and/or modify |

8 | it under the terms of the GNU Lesser General Public License as published by |

9 | the Free Software Foundation; either version 2.1 of the License, or (at your |

10 | option) any later version. |

11 | |

12 | The GNU MP Library is distributed in the hope that it will be useful, but |

13 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |

14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |

15 | License for more details. |

16 | |

17 | You should have received a copy of the GNU Lesser General Public License |

18 | along with the GNU MP Library; see the file COPYING.LIB. If not, write to |

19 | the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |

20 | MA 02111-1307, USA. */ |

21 | |

22 | #include <errno.h> |

23 | #include <fcntl.h> |

24 | #include <math.h> |

25 | #include <stdio.h> |

26 | #include <stdlib.h> /* for qsort */ |

27 | #include <string.h> |

28 | #include <unistd.h> |

29 | #if 0 |

30 | #include <sys/ioctl.h> |

31 | #endif |

32 | |

33 | #include "gmp.h" |

34 | #include "gmp-impl.h" |

35 | #include "longlong.h" |

36 | |

37 | #include "tests.h" |

38 | #include "speed.h" |

39 | |

40 | |

41 | int speed_option_addrs = 0; |

42 | int speed_option_verbose = 0; |

43 | |

44 | |

45 | /* Provide __clz_tab even if it's not required, for the benefit of new code |

46 | being tested with many.pl. */ |

47 | #ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB |

48 | #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB |

49 | #include "mp_clz_tab.c" |

50 | #undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB |

51 | #endif |

52 | |

53 | |

54 | void |

55 | pentium_wbinvd(void) |

56 | { |

57 | #if 0 |

58 | { |

59 | static int fd = -2; |

60 | |

61 | if (fd == -2) |

62 | { |

63 | fd = open ("/dev/wbinvd", O_RDWR); |

64 | if (fd == -1) |

65 | perror ("open /dev/wbinvd"); |

66 | } |

67 | |

68 | if (fd != -1) |

69 | ioctl (fd, 0, 0); |

70 | } |

71 | #endif |

72 | |

73 | #if 0 |

74 | #define WBINVDSIZE 1024*1024*2 |

75 | { |

76 | static char *p = NULL; |

77 | int i, sum; |

78 | |

79 | if (p == NULL) |

80 | p = malloc (WBINVDSIZE); |

81 | |

82 | #if 0 |

83 | for (i = 0; i < WBINVDSIZE; i++) |

84 | p[i] = i & 0xFF; |

85 | #endif |

86 | |

87 | sum = 0; |

88 | for (i = 0; i < WBINVDSIZE; i++) |

89 | sum += p[i]; |

90 | |

91 | mpn_cache_fill_dummy (sum); |

92 | } |

93 | #endif |

94 | } |

95 | |

96 | |

97 | int |

98 | double_cmp_ptr (const double *p, const double *q) |

99 | { |

100 | if (*p > *q) return 1; |

101 | if (*p < *q) return -1; |

102 | return 0; |

103 | } |

104 | |

105 | |

106 | /* Measure the speed of a given routine. |

107 | |

108 | The routine is run with enough repetitions to make it take at least |

109 | speed_precision * speed_unittime. This aims to minimize the effects of a |

110 | limited accuracy time base and the overhead of the measuring itself. |

111 | |

112 | Measurements are made looking for 4 results within TOLERANCE of each |

113 | other (or 3 for routines taking longer than 2 seconds). This aims to get |

114 | an accurate reading even if some runs are bloated by interrupts or task |

115 | switches or whatever. |

116 | |

117 | The given (*fun)() is expected to run its function "s->reps" many times |

118 | and return the total elapsed time measured using speed_starttime() and |

119 | speed_endtime(). If the function doesn't support the given s->size or |

120 | s->r, -1.0 should be returned. See the various base routines below. */ |

121 | |

122 | double |

123 | speed_measure (double (*fun) _PROTO ((struct speed_params *s)), |

124 | struct speed_params *s) |

125 | { |

126 | #define TOLERANCE 1.005 /* 0.5% */ |

127 | |

128 | struct speed_params s_dummy; |

129 | int i, j, e; |

130 | double t[30]; |

131 | double t_unsorted[30]; |

132 | double reps_d; |

133 | |

134 | /* Use dummy parameters if caller doesn't provide any. Only a few special |

135 | "fun"s will cope with this, speed_noop() is one. */ |

136 | if (s == NULL) |

137 | { |

138 | memset (&s_dummy, '\0', sizeof (s_dummy)); |

139 | s = &s_dummy; |

140 | } |

141 | |

142 | s->reps = 1; |

143 | s->time_divisor = 1.0; |

144 | for (i = 0; i < numberof (t); i++) |

145 | { |

146 | for (;;) |

147 | { |

148 | s->src_num = 0; |

149 | s->dst_num = 0; |

150 | |

151 | t[i] = (*fun) (s); |

152 | |

153 | if (speed_option_verbose >= 3) |

154 | printf("size=%ld reps=%u r=%ld attempt=%d %.9f\n", |

155 | s->size, s->reps, s->r, i, t[i]); |

156 | |

157 | if (t[i] == -1.0) |

158 | return -1.0; |

159 | |

160 | if (t[i] >= speed_unittime * speed_precision) |

161 | break; |

162 | |

163 | /* go to a value of reps to make t[i] >= precision */ |

164 | reps_d = ceil (1.1 * s->reps |

165 | * speed_unittime * speed_precision |

166 | / MAX (t[i], speed_unittime)); |

167 | if (reps_d > 2e9 || reps_d < 1.0) |

168 | { |

169 | fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d); |

170 | fprintf (stderr, " (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n", |

171 | s->reps, speed_unittime, speed_precision, t[i]); |

172 | abort (); |

173 | } |

174 | s->reps = (unsigned) reps_d; |

175 | } |

176 | t[i] /= s->reps; |

177 | t_unsorted[i] = t[i]; |

178 | |

179 | if (speed_precision == 0) |

180 | return t[i]; |

181 | |

182 | /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */ |

183 | if (t[0] >= 2.0) |

184 | e = 3; |

185 | else |

186 | e = 4; |

187 | |

188 | /* Look for e many t[]'s within TOLERANCE of each other to consider a |

189 | valid measurement. Return smallest among them. */ |

190 | if (i >= e) |

191 | { |

192 | qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr); |

193 | for (j = e-1; j < i; j++) |

194 | if (t[j] <= t[j-e+1] * TOLERANCE) |

195 | return t[j-e+1] / s->time_divisor; |

196 | } |

197 | } |

198 | |

199 | fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n", |

200 | e, (TOLERANCE-1.0)*100.0); |

201 | fprintf (stderr, " unsorted sorted\n"); |

202 | fprintf (stderr, " %.12f %.12f is about 0.5%%\n", |

203 | t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0)); |

204 | for (i = 0; i < numberof (t); i++) |

205 | fprintf (stderr, " %.09f %.09f\n", t_unsorted[i], t[i]); |

206 | |

207 | return -1.0; |

208 | } |

209 | |

210 | |

211 | /* Read all of ptr,size to get it into the CPU memory cache. |

212 | |

213 | A call to mpn_cache_fill_dummy() is used to make sure the compiler |

214 | doesn't optimize away the whole loop. Using "volatile mp_limb_t sum" |

215 | would work too, but the function call means we don't rely on every |

216 | compiler actually implementing volatile properly. |

217 | |

218 | mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking |

219 | it can inline it. */ |

220 | |

221 | void |

222 | mpn_cache_fill (mp_srcptr ptr, mp_size_t size) |

223 | { |

224 | mp_limb_t sum = 0; |

225 | mp_size_t i; |

226 | |

227 | for (i = 0; i < size; i++) |

228 | sum += ptr[i]; |

229 | |

230 | mpn_cache_fill_dummy(sum); |

231 | } |

232 | |

233 | |

234 | void |

235 | mpn_cache_fill_write (mp_ptr ptr, mp_size_t size) |

236 | { |

237 | mpn_cache_fill (ptr, size); |

238 | |

239 | #if 0 |

240 | mpn_random (ptr, size); |

241 | #endif |

242 | |

243 | #if 0 |

244 | mp_size_t i; |

245 | |

246 | for (i = 0; i < size; i++) |

247 | ptr[i] = i; |

248 | #endif |

249 | } |

250 | |

251 | |

252 | void |

253 | speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size) |

254 | { |

255 | if (s->src_num >= numberof (s->src)) |

256 | { |

257 | fprintf (stderr, "speed_operand_src: no room left in s->src[]\n"); |

258 | abort (); |

259 | } |

260 | s->src[s->src_num].ptr = ptr; |

261 | s->src[s->src_num].size = size; |

262 | s->src_num++; |

263 | } |

264 | |

265 | |

266 | void |

267 | speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size) |

268 | { |

269 | if (s->dst_num >= numberof (s->dst)) |

270 | { |

271 | fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n"); |

272 | abort (); |

273 | } |

274 | s->dst[s->dst_num].ptr = ptr; |

275 | s->dst[s->dst_num].size = size; |

276 | s->dst_num++; |

277 | } |

278 | |

279 | |

280 | void |

281 | speed_cache_fill (struct speed_params *s) |

282 | { |

283 | static struct speed_params prev; |

284 | int i; |

285 | |

286 | /* FIXME: need a better way to get the format string for a pointer */ |

287 | |

288 | if (speed_option_addrs) |

289 | { |

290 | int different; |

291 | |

292 | different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num); |

293 | for (i = 0; i < s->dst_num; i++) |

294 | different |= (s->dst[i].ptr != prev.dst[i].ptr); |

295 | for (i = 0; i < s->src_num; i++) |

296 | different |= (s->src[i].ptr != prev.src[i].ptr); |

297 | |

298 | if (different) |

299 | { |

300 | if (s->dst_num != 0) |

301 | { |

302 | printf ("dst"); |

303 | for (i = 0; i < s->dst_num; i++) |

304 | printf (" %08lX", (unsigned long) s->dst[i].ptr); |

305 | printf (" "); |

306 | } |

307 | |

308 | if (s->src_num != 0) |

309 | { |

310 | printf ("src"); |

311 | for (i = 0; i < s->src_num; i++) |

312 | printf (" %08lX", (unsigned long) s->src[i].ptr); |

313 | printf (" "); |

314 | } |

315 | printf (" (cf sp approx %08lX)\n", (unsigned long) &different); |

316 | |

317 | } |

318 | |

319 | memcpy (&prev, s, sizeof(prev)); |

320 | } |

321 | |

322 | switch (s->cache) { |

323 | case 0: |

324 | for (i = 0; i < s->dst_num; i++) |

325 | mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size); |

326 | for (i = 0; i < s->src_num; i++) |

327 | mpn_cache_fill (s->src[i].ptr, s->src[i].size); |

328 | break; |

329 | case 1: |

330 | pentium_wbinvd(); |

331 | break; |

332 | } |

333 | } |

334 | |

335 | |

336 | /* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs. ptr |

337 | needs to have room for up to CACHE_LINE_SIZE-4 extra bytes. */ |

338 | |

339 | mp_ptr |

340 | speed_tmp_alloc_adjust (void *ptr, mp_size_t align) |

341 | { |

342 | /* |

343 | printf("%p %ld -> %p %X %X\n", ptr, align, |

344 | (mp_ptr) ptr |

345 | + ((align - ((mp_size_t) ptr >> 2)) & |

346 | SPEED_TMP_ALLOC_ADJUST_MASK), |

347 | ((mp_size_t) ptr >> 2) & SPEED_TMP_ALLOC_ADJUST_MASK, |

348 | SPEED_TMP_ALLOC_ADJUST_MASK); |

349 | */ |

350 | |

351 | return (mp_ptr) ptr |

352 | + ((align - ((mp_size_t) ptr >> 2)) & SPEED_TMP_ALLOC_ADJUST_MASK); |

353 | } |

354 | |

355 | |

356 | /* Miscellanous options accepted by tune and speed programs under -o. */ |

357 | |

358 | void |

359 | speed_option_set (const char *s) |

360 | { |

361 | int n; |

362 | |

363 | if (strcmp (s, "addrs") == 0) |

364 | { |

365 | speed_option_addrs = 1; |

366 | } |

367 | else if (strcmp (s, "verbose") == 0) |

368 | { |

369 | speed_option_verbose++; |

370 | } |

371 | else if (sscanf (s, "verbose=%d", &n) == 1) |

372 | { |

373 | speed_option_verbose = n; |

374 | } |

375 | else |

376 | { |

377 | printf ("Unrecognised -o option: %s\n", s); |

378 | exit (1); |

379 | } |

380 | } |

381 | |

382 | |

383 | /* The following are basic speed running routines for various gmp functions. |

384 | Many are very similar and use speed.h macros. |

385 | |

386 | Each routine allocates it's own destination space for the result of the |

387 | function, because only it can know what the function needs. |

388 | |

389 | speed_starttime() and speed_endtime() are put tight around the code to be |

390 | measured. Any setups are done outside the timed portion. |

391 | |

392 | Each routine is responsible for its own cache priming. |

393 | speed_cache_fill() is a good way to do this, see examples in speed.h. |

394 | One cache priming possibility, for CPUs with write-allocate cache, and |

395 | functions that don't take too long, is to do one dummy call before timing |

396 | so as to cache everything that gets used. But speed_measure() runs a |

397 | routine at least twice and will take the smaller time, so this might not |

398 | be necessary. |

399 | |

400 | Data alignment will be important, for source, destination and temporary |

401 | workspace. A routine can align its destination and workspace. Programs |

402 | using the routines will ensure s->xp and s->yp are aligned. Aligning |

403 | onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and |

404 | s->align_wp2 should be respected where it makes sense to do so. |

405 | SPEED_TMP_ALLOC_LIMBS is a good way to do this. |

406 | |

407 | A loop of the following form can be expected to turn into good assembler |

408 | code on most CPUs, thereby minimizing overhead in the measurement. It |

409 | can always be assumed s->reps >= 1. |

410 | |

411 | i = s->reps |

412 | do |

413 | foo(); |

414 | while (--i != 0); |

415 | |

416 | Additional parameters might be added to "struct speed_params" in the |

417 | future. Routines should ignore anything they don't use. |

418 | |

419 | s->size can be used creatively, and s->xp and s->yp can be ignored. For |

420 | example, speed_mpz_fac_ui() uses s->size as n for the factorial. s->r is |

421 | just a user-supplied parameter. speed_mpn_lshift() uses it as a shift, |

422 | speed_mpn_mul_1() uses it as a multiplier. */ |

423 | |

424 | |

425 | /* MPN_COPY etc can be macros, so the _CALL forms are necessary */ |

426 | double |

427 | speed_MPN_COPY (struct speed_params *s) |

428 | { |

429 | SPEED_ROUTINE_MPN_COPY (MPN_COPY); |

430 | } |

431 | double |

432 | speed_MPN_COPY_INCR (struct speed_params *s) |

433 | { |

434 | SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR); |

435 | } |

436 | double |

437 | speed_MPN_COPY_DECR (struct speed_params *s) |

438 | { |

439 | SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR); |

440 | } |

441 | #if HAVE_NATIVE_mpn_copyi |

442 | double |

443 | speed_mpn_copyi (struct speed_params *s) |

444 | { |

445 | SPEED_ROUTINE_MPN_COPY (mpn_copyi); |

446 | } |

447 | #endif |

448 | #if HAVE_NATIVE_mpn_copyd |

449 | double |

450 | speed_mpn_copyd (struct speed_params *s) |

451 | { |

452 | SPEED_ROUTINE_MPN_COPY (mpn_copyd); |

453 | } |

454 | #endif |

455 | double |

456 | speed_memcpy (struct speed_params *s) |

457 | { |

458 | SPEED_ROUTINE_MPN_COPY_BYTES (memcpy); |

459 | } |

460 | double |

461 | speed_mpn_com_n (struct speed_params *s) |

462 | { |

463 | SPEED_ROUTINE_MPN_COPY (mpn_com_n); |

464 | } |

465 | |

466 | |

467 | double |

468 | speed_mpn_addmul_1 (struct speed_params *s) |

469 | { |

470 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1); |

471 | } |

472 | double |

473 | speed_mpn_submul_1 (struct speed_params *s) |

474 | { |

475 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1); |

476 | } |

477 | |

478 | |

479 | double |

480 | speed_mpn_mul_1 (struct speed_params *s) |

481 | { |

482 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1); |

483 | } |

484 | double |

485 | speed_mpn_mul_1_inplace (struct speed_params *s) |

486 | { |

487 | SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1); |

488 | } |

489 | |

490 | #if HAVE_NATIVE_mpn_mul_2 |

491 | double |

492 | speed_mpn_mul_2 (struct speed_params *s) |

493 | { |

494 | SPEED_ROUTINE_MPN_MUL_2 (mpn_mul_2); |

495 | } |

496 | #endif |

497 | |

498 | |

499 | double |

500 | speed_mpn_lshift (struct speed_params *s) |

501 | { |

502 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift); |

503 | } |

504 | double |

505 | speed_mpn_rshift (struct speed_params *s) |

506 | { |

507 | SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift); |

508 | } |

509 | |

510 | |

511 | /* The carry-in variants (if available) are good for measuring because they |

512 | won't skip a division if high<divisor. Alternately, use -1 as a divisor |

513 | with the plain _1 forms. */ |

514 | double |

515 | speed_mpn_divrem_1 (struct speed_params *s) |

516 | { |

517 | SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1); |

518 | } |

519 | double |

520 | speed_mpn_divrem_1f (struct speed_params *s) |

521 | { |

522 | SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1); |

523 | } |

524 | #if HAVE_NATIVE_mpn_divrem_1c |

525 | double |

526 | speed_mpn_divrem_1c (struct speed_params *s) |

527 | { |

528 | SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c); |

529 | } |

530 | double |

531 | speed_mpn_divrem_1cf (struct speed_params *s) |

532 | { |

533 | SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c); |

534 | } |

535 | #endif |

536 | |

537 | double |

538 | speed_mpn_divrem_1_div (struct speed_params *s) |

539 | { |

540 | SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div); |

541 | } |

542 | double |

543 | speed_mpn_divrem_1f_div (struct speed_params *s) |

544 | { |

545 | SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div); |

546 | } |

547 | double |

548 | speed_mpn_divrem_1_inv (struct speed_params *s) |

549 | { |

550 | SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv); |

551 | } |

552 | double |

553 | speed_mpn_divrem_1f_inv (struct speed_params *s) |

554 | { |

555 | SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv); |

556 | } |

557 | double |

558 | speed_mpn_mod_1_div (struct speed_params *s) |

559 | { |

560 | SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div); |

561 | } |

562 | double |

563 | speed_mpn_mod_1_inv (struct speed_params *s) |

564 | { |

565 | SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv); |

566 | } |

567 | |

568 | double |

569 | speed_mpn_preinv_divrem_1 (struct speed_params *s) |

570 | { |

571 | SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1); |

572 | } |

573 | double |

574 | speed_mpn_preinv_divrem_1f (struct speed_params *s) |

575 | { |

576 | SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1); |

577 | } |

578 | |

579 | double |

580 | speed_mpn_mod_34lsub1 (struct speed_params *s) |

581 | { |

582 | SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1); |

583 | } |

584 | |

585 | double |

586 | speed_mpn_divrem_2 (struct speed_params *s) |

587 | { |

588 | SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2); |

589 | } |

590 | double |

591 | speed_mpn_divrem_2_div (struct speed_params *s) |

592 | { |

593 | SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div); |

594 | } |

595 | double |

596 | speed_mpn_divrem_2_inv (struct speed_params *s) |

597 | { |

598 | SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv); |

599 | } |

600 | |

601 | double |

602 | speed_mpn_mod_1 (struct speed_params *s) |

603 | { |

604 | SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1); |

605 | } |

606 | #if HAVE_NATIVE_mpn_mod_1c |

607 | double |

608 | speed_mpn_mod_1c (struct speed_params *s) |

609 | { |

610 | SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c); |

611 | } |

612 | #endif |

613 | double |

614 | speed_mpn_preinv_mod_1 (struct speed_params *s) |

615 | { |

616 | SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1); |

617 | } |

618 | |

619 | double |

620 | speed_mpn_divexact_1 (struct speed_params *s) |

621 | { |

622 | SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1); |

623 | } |

624 | |

625 | double |

626 | speed_mpn_divexact_by3 (struct speed_params *s) |

627 | { |

628 | SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3); |

629 | } |

630 | |

631 | #if HAVE_NATIVE_mpn_modexact_1_odd |

632 | double |

633 | speed_mpn_modexact_1_odd (struct speed_params *s) |

634 | { |

635 | SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd); |

636 | } |

637 | #endif |

638 | |

639 | double |

640 | speed_mpn_modexact_1c_odd (struct speed_params *s) |

641 | { |

642 | SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd); |

643 | } |

644 | |

645 | |

646 | double |

647 | speed_mpn_dc_tdiv_qr (struct speed_params *s) |

648 | { |

649 | SPEED_ROUTINE_MPN_DC_TDIV_QR (mpn_tdiv_qr); |

650 | } |

651 | double |

652 | speed_mpn_dc_divrem_n (struct speed_params *s) |

653 | { |

654 | SPEED_ROUTINE_MPN_DC_DIVREM_N (mpn_dc_divrem_n); |

655 | } |

656 | double |

657 | speed_mpn_dc_divrem_sb (struct speed_params *s) |

658 | { |

659 | SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn); |

660 | } |

661 | double |

662 | speed_mpn_dc_divrem_sb_div (struct speed_params *s) |

663 | { |

664 | SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn_div); |

665 | } |

666 | double |

667 | speed_mpn_dc_divrem_sb_inv (struct speed_params *s) |

668 | { |

669 | SPEED_ROUTINE_MPN_DC_DIVREM_SB (mpn_sb_divrem_mn_inv); |

670 | } |

671 | |

672 | double |

673 | speed_mpn_sb_divrem_m3 (struct speed_params *s) |

674 | { |

675 | SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn); |

676 | } |

677 | double |

678 | speed_mpn_sb_divrem_m3_div (struct speed_params *s) |

679 | { |

680 | SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_div); |

681 | } |

682 | double |

683 | speed_mpn_sb_divrem_m3_inv (struct speed_params *s) |

684 | { |

685 | SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_inv); |

686 | } |

687 | |

688 | double |

689 | speed_mpz_mod (struct speed_params *s) |

690 | { |

691 | SPEED_ROUTINE_MPZ_MOD (mpz_mod); |

692 | } |

693 | double |

694 | speed_redc (struct speed_params *s) |

695 | { |

696 | SPEED_ROUTINE_REDC (redc); |

697 | } |

698 | |

699 | |

700 | double |

701 | speed_mpn_popcount (struct speed_params *s) |

702 | { |

703 | SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount); |

704 | } |

705 | double |

706 | speed_mpn_hamdist (struct speed_params *s) |

707 | { |

708 | SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist); |

709 | } |

710 | |

711 | |

712 | double |

713 | speed_mpn_add_n (struct speed_params *s) |

714 | { |

715 | SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n); |

716 | } |

717 | double |

718 | speed_mpn_sub_n (struct speed_params *s) |

719 | { |

720 | SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n); |

721 | } |

722 | |

723 | |

724 | /* mpn_and_n etc can be macros and so have to be handled with |

725 | SPEED_ROUTINE_MPN_BINARY_N_CALL forms */ |

726 | double |

727 | speed_mpn_and_n (struct speed_params *s) |

728 | { |

729 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size)); |

730 | } |

731 | double |

732 | speed_mpn_andn_n (struct speed_params *s) |

733 | { |

734 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size)); |

735 | } |

736 | double |

737 | speed_mpn_nand_n (struct speed_params *s) |

738 | { |

739 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size)); |

740 | } |

741 | double |

742 | speed_mpn_ior_n (struct speed_params *s) |

743 | { |

744 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size)); |

745 | } |

746 | double |

747 | speed_mpn_iorn_n (struct speed_params *s) |

748 | { |

749 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size)); |

750 | } |

751 | double |

752 | speed_mpn_nior_n (struct speed_params *s) |

753 | { |

754 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size)); |

755 | } |

756 | double |

757 | speed_mpn_xor_n (struct speed_params *s) |

758 | { |

759 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size)); |

760 | } |

761 | double |

762 | speed_mpn_xnor_n (struct speed_params *s) |

763 | { |

764 | SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size)); |

765 | } |

766 | |

767 | |

768 | double |

769 | speed_mpn_mul_n (struct speed_params *s) |

770 | { |

771 | SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n); |

772 | } |

773 | double |

774 | speed_mpn_sqr_n (struct speed_params *s) |

775 | { |

776 | SPEED_ROUTINE_MPN_SQR (mpn_sqr_n); |

777 | } |

778 | double |

779 | speed_mpn_mul_n_sqr (struct speed_params *s) |

780 | { |

781 | SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size)); |

782 | } |

783 | |

784 | double |

785 | speed_mpn_mul_basecase (struct speed_params *s) |

786 | { |

787 | SPEED_ROUTINE_MPN_MUL_BASECASE(mpn_mul_basecase); |

788 | } |

789 | double |

790 | speed_mpn_sqr_basecase (struct speed_params *s) |

791 | { |

792 | /* FIXME: size restrictions on some versions of sqr_basecase */ |

793 | SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase); |

794 | } |

795 | |

796 | #if HAVE_NATIVE_mpn_sqr_diagonal |

797 | double |

798 | speed_mpn_sqr_diagonal (struct speed_params *s) |

799 | { |

800 | SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal); |

801 | } |

802 | #endif |

803 | |

804 | double |

805 | speed_mpn_kara_mul_n (struct speed_params *s) |

806 | { |

807 | SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n); |

808 | } |

809 | double |

810 | speed_mpn_kara_sqr_n (struct speed_params *s) |

811 | { |

812 | SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n); |

813 | } |

814 | |

815 | double |

816 | speed_mpn_toom3_mul_n (struct speed_params *s) |

817 | { |

818 | SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n); |

819 | } |

820 | double |

821 | speed_mpn_toom3_sqr_n (struct speed_params *s) |

822 | { |

823 | SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n); |

824 | } |

825 | |

826 | double |

827 | speed_mpn_toom3_mul_n_mpn (struct speed_params *s) |

828 | { |

829 | SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n_mpn); |

830 | } |

831 | double |

832 | speed_mpn_toom3_mul_n_open (struct speed_params *s) |

833 | { |

834 | SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n_open); |

835 | } |

836 | double |

837 | speed_mpn_toom3_sqr_n_mpn (struct speed_params *s) |

838 | { |

839 | SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n_mpn); |

840 | } |

841 | double |

842 | speed_mpn_toom3_sqr_n_open (struct speed_params *s) |

843 | { |

844 | SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n_open); |

845 | } |

846 | |

847 | double |

848 | speed_mpn_mul_fft_full (struct speed_params *s) |

849 | { |

850 | SPEED_ROUTINE_MPN_MUL_N_CALL |

851 | (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size)); |

852 | } |

853 | double |

854 | speed_mpn_mul_fft_full_sqr (struct speed_params *s) |

855 | { |

856 | SPEED_ROUTINE_MPN_SQR_CALL |

857 | (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size)); |

858 | } |

859 | |

860 | |

861 | /* These are mod 2^N+1 multiplies and squares. If s->r is supplied it's |

862 | used as k, otherwise the best k for the size is used. If s->size isn't a |

863 | multiple of 2^k it's rounded up to make the effective operation size. */ |

864 | |

865 | #define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr) \ |

866 | { \ |

867 | mp_ptr wp; \ |

868 | mp_size_t pl; \ |

869 | int k; \ |

870 | unsigned i; \ |

871 | double t; \ |

872 | TMP_DECL (marker); \ |

873 | \ |

874 | SPEED_RESTRICT_COND (s->size >= 1); \ |

875 | \ |

876 | if (s->r != 0) \ |

877 | k = s->r; \ |

878 | else \ |

879 | k = mpn_fft_best_k (s->size, sqr); \ |

880 | \ |

881 | TMP_MARK (marker); \ |

882 | pl = mpn_fft_next_size (s->size, k); \ |

883 | wp = SPEED_TMP_ALLOC_LIMBS (pl+1, s->align_wp); \ |

884 | \ |

885 | speed_operand_src (s, s->xp, s->size); \ |

886 | if (!sqr) \ |

887 | speed_operand_src (s, s->yp, s->size); \ |

888 | speed_operand_dst (s, wp, pl+1); \ |

889 | speed_cache_fill (s); \ |

890 | \ |

891 | speed_starttime (); \ |

892 | i = s->reps; \ |

893 | do \ |

894 | call; \ |

895 | while (--i != 0); \ |

896 | t = speed_endtime (); \ |

897 | \ |

898 | TMP_FREE (marker); \ |

899 | return t; \ |

900 | } |

901 | |

902 | double |

903 | speed_mpn_mul_fft (struct speed_params *s) |

904 | { |

905 | SPEED_ROUTINE_MPN_MUL_FFT_CALL |

906 | (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0); |

907 | } |

908 | |

909 | double |

910 | speed_mpn_mul_fft_sqr (struct speed_params *s) |

911 | { |

912 | SPEED_ROUTINE_MPN_MUL_FFT_CALL |

913 | (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1); |

914 | } |

915 | |

916 | |

917 | double |

918 | speed_mpn_gcd (struct speed_params *s) |

919 | { |

920 | SPEED_ROUTINE_MPN_GCD (mpn_gcd); |

921 | } |

922 | double |

923 | speed_mpn_gcd_binary (struct speed_params *s) |

924 | { |

925 | SPEED_ROUTINE_MPN_GCD (mpn_gcd_binary); |

926 | } |

927 | |

928 | #if HAVE_NATIVE_mpn_gcd_finda |

929 | double |

930 | speed_mpn_gcd_finda (struct speed_params *s) |

931 | { |

932 | SPEED_ROUTINE_MPN_GCD_FINDA (mpn_gcd_finda); |

933 | } |

934 | #endif |

935 | |

936 | |

937 | double |

938 | speed_mpn_gcdext (struct speed_params *s) |

939 | { |

940 | SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext); |

941 | } |

942 | double |

943 | speed_mpn_gcdext_single (struct speed_params *s) |

944 | { |

945 | SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single); |

946 | } |

947 | double |

948 | speed_mpn_gcdext_double (struct speed_params *s) |

949 | { |

950 | SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double); |

951 | } |

952 | double |

953 | speed_mpn_gcdext_one_single (struct speed_params *s) |

954 | { |

955 | SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single); |

956 | } |

957 | double |

958 | speed_mpn_gcdext_one_double (struct speed_params *s) |

959 | { |

960 | SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double); |

961 | } |

962 | double |

963 | speed_mpn_gcd_1 (struct speed_params *s) |

964 | { |

965 | SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1); |

966 | } |

967 | double |

968 | speed_mpn_gcd_1N (struct speed_params *s) |

969 | { |

970 | SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1); |

971 | } |

972 | |

973 | |

974 | double |

975 | speed_mpz_jacobi (struct speed_params *s) |

976 | { |

977 | SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi); |

978 | } |

979 | double |

980 | speed_mpn_jacobi_base (struct speed_params *s) |

981 | { |

982 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base); |

983 | } |

984 | double |

985 | speed_mpn_jacobi_base_1 (struct speed_params *s) |

986 | { |

987 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1); |

988 | } |

989 | double |

990 | speed_mpn_jacobi_base_2 (struct speed_params *s) |

991 | { |

992 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2); |

993 | } |

994 | double |

995 | speed_mpn_jacobi_base_3 (struct speed_params *s) |

996 | { |

997 | SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3); |

998 | } |

999 | |

1000 | |

1001 | double |

1002 | speed_mpn_sqrtrem (struct speed_params *s) |

1003 | { |

1004 | SPEED_ROUTINE_MPN_SQRTREM (mpn_sqrtrem); |

1005 | } |

1006 | |

1007 | |

1008 | double |

1009 | speed_mpz_fac_ui (struct speed_params *s) |

1010 | { |

1011 | SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui); |

1012 | } |

1013 | |

1014 | |

1015 | double |

1016 | speed_mpn_fib2_ui (struct speed_params *s) |

1017 | { |

1018 | SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui); |

1019 | } |

1020 | double |

1021 | speed_mpz_fib_ui (struct speed_params *s) |

1022 | { |

1023 | SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui); |

1024 | } |

1025 | double |

1026 | speed_mpz_fib2_ui (struct speed_params *s) |

1027 | { |

1028 | SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui); |

1029 | } |

1030 | double |

1031 | speed_mpz_lucnum_ui (struct speed_params *s) |

1032 | { |

1033 | SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui); |

1034 | } |

1035 | double |

1036 | speed_mpz_lucnum2_ui (struct speed_params *s) |

1037 | { |

1038 | SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui); |

1039 | } |

1040 | |

1041 | |

1042 | double |

1043 | speed_mpz_powm (struct speed_params *s) |

1044 | { |

1045 | SPEED_ROUTINE_MPZ_POWM (mpz_powm); |

1046 | } |

1047 | double |

1048 | speed_mpz_powm_mod (struct speed_params *s) |

1049 | { |

1050 | SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod); |

1051 | } |

1052 | double |

1053 | speed_mpz_powm_redc (struct speed_params *s) |

1054 | { |

1055 | SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc); |

1056 | } |

1057 | double |

1058 | speed_mpz_powm_ui (struct speed_params *s) |

1059 | { |

1060 | SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui); |

1061 | } |

1062 | |

1063 | |

1064 | double |

1065 | speed_modlimb_invert (struct speed_params *s) |

1066 | { |

1067 | SPEED_ROUTINE_MODLIMB_INVERT (modlimb_invert); |

1068 | } |

1069 | |

1070 | |

1071 | double |

1072 | speed_noop (struct speed_params *s) |

1073 | { |

1074 | unsigned i; |

1075 | |

1076 | speed_starttime (); |

1077 | i = s->reps; |

1078 | do |

1079 | noop (); |

1080 | while (--i != 0); |

1081 | return speed_endtime (); |

1082 | } |

1083 | |

1084 | double |

1085 | speed_noop_wxs (struct speed_params *s) |

1086 | { |

1087 | mp_ptr wp; |

1088 | unsigned i; |

1089 | double t; |

1090 | TMP_DECL (marker); |

1091 | |

1092 | TMP_MARK (marker); |

1093 | wp = TMP_ALLOC_LIMBS (1); |

1094 | |

1095 | speed_starttime (); |

1096 | i = s->reps; |

1097 | do |

1098 | noop_wxs (wp, s->xp, s->size); |

1099 | while (--i != 0); |

1100 | t = speed_endtime (); |

1101 | |

1102 | TMP_FREE (marker); |

1103 | return t; |

1104 | } |

1105 | |

1106 | double |

1107 | speed_noop_wxys (struct speed_params *s) |

1108 | { |

1109 | mp_ptr wp; |

1110 | unsigned i; |

1111 | double t; |

1112 | TMP_DECL (marker); |

1113 | |

1114 | TMP_MARK (marker); |

1115 | wp = TMP_ALLOC_LIMBS (1); |

1116 | |

1117 | speed_starttime (); |

1118 | i = s->reps; |

1119 | do |

1120 | noop_wxys (wp, s->xp, s->yp, s->size); |

1121 | while (--i != 0); |

1122 | t = speed_endtime (); |

1123 | |

1124 | TMP_FREE (marker); |

1125 | return t; |

1126 | } |

1127 | |

1128 | |

1129 | #define SPEED_ROUTINE_ALLOC_FREE(variables, calls) \ |

1130 | { \ |

1131 | unsigned i; \ |

1132 | variables; \ |

1133 | \ |

1134 | speed_starttime (); \ |

1135 | i = s->reps; \ |

1136 | do \ |

1137 | { \ |

1138 | calls; \ |

1139 | } \ |

1140 | while (--i != 0); \ |

1141 | return speed_endtime (); \ |

1142 | } |

1143 | |

1144 | |

1145 | /* Compare these to see how much malloc/free costs and then how much |

1146 | __gmp_default_allocate/free and mpz_init/clear add. mpz_init/clear or |

1147 | mpq_init/clear will be doing a 1 limb allocate, so use that as the size |

1148 | when including them in comparisons. */ |

1149 | |

1150 | double |

1151 | speed_malloc_free (struct speed_params *s) |

1152 | { |

1153 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |

1154 | SPEED_ROUTINE_ALLOC_FREE (void *p, |

1155 | p = malloc (bytes); |

1156 | free (p)); |

1157 | } |

1158 | |

1159 | double |

1160 | speed_malloc_realloc_free (struct speed_params *s) |

1161 | { |

1162 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |

1163 | SPEED_ROUTINE_ALLOC_FREE (void *p, |

1164 | p = malloc (BYTES_PER_MP_LIMB); |

1165 | p = realloc (p, bytes); |

1166 | free (p)); |

1167 | } |

1168 | |

1169 | double |

1170 | speed_gmp_allocate_free (struct speed_params *s) |

1171 | { |

1172 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |

1173 | SPEED_ROUTINE_ALLOC_FREE (void *p, |

1174 | p = (*__gmp_allocate_func) (bytes); |

1175 | (*__gmp_free_func) (p, bytes)); |

1176 | } |

1177 | |

1178 | double |

1179 | speed_gmp_allocate_reallocate_free (struct speed_params *s) |

1180 | { |

1181 | size_t bytes = s->size * BYTES_PER_MP_LIMB; |

1182 | SPEED_ROUTINE_ALLOC_FREE |

1183 | (void *p, |

1184 | p = (*__gmp_allocate_func) (BYTES_PER_MP_LIMB); |

1185 | p = (*__gmp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB); |

1186 | (*__gmp_free_func) (p, bytes)); |

1187 | } |

1188 | |

1189 | double |

1190 | speed_mpz_init_clear (struct speed_params *s) |

1191 | { |

1192 | SPEED_ROUTINE_ALLOC_FREE (mpz_t z, |

1193 | mpz_init (z); |

1194 | mpz_clear (z)); |

1195 | } |

1196 | |

1197 | double |

1198 | speed_mpz_init_realloc_clear (struct speed_params *s) |

1199 | { |

1200 | SPEED_ROUTINE_ALLOC_FREE (mpz_t z, |

1201 | mpz_init (z); |

1202 | _mpz_realloc (z, s->size); |

1203 | mpz_clear (z)); |

1204 | } |

1205 | |

1206 | double |

1207 | speed_mpq_init_clear (struct speed_params *s) |

1208 | { |

1209 | SPEED_ROUTINE_ALLOC_FREE (mpq_t q, |

1210 | mpq_init (q); |

1211 | mpq_clear (q)); |

1212 | } |

1213 | |

1214 | double |

1215 | speed_mpf_init_clear (struct speed_params *s) |

1216 | { |

1217 | SPEED_ROUTINE_ALLOC_FREE (mpf_t f, |

1218 | mpf_init (f); |

1219 | mpf_clear (f)); |

1220 | } |

1221 | |

1222 | |

1223 | /* Compare this to mpn_add_n to see how much overhead mpz_add adds. Note |

1224 | that repeatedly calling mpz_add with the same data gives branch predition |

1225 | in it an advantage. */ |

1226 | |

1227 | double |

1228 | speed_mpz_add (struct speed_params *s) |

1229 | { |

1230 | mpz_t w, x, y; |

1231 | unsigned i; |

1232 | double t; |

1233 | |

1234 | mpz_init (w); |

1235 | mpz_init (x); |

1236 | mpz_init (y); |

1237 | |

1238 | mpz_set_n (x, s->xp, s->size); |

1239 | mpz_set_n (y, s->yp, s->size); |

1240 | mpz_add (w, x, y); |

1241 | |

1242 | speed_starttime (); |

1243 | i = s->reps; |

1244 | do |

1245 | { |

1246 | mpz_add (w, x, y); |

1247 | } |

1248 | while (--i != 0); |

1249 | t = speed_endtime (); |

1250 | |

1251 | mpz_clear (w); |

1252 | mpz_clear (x); |

1253 | mpz_clear (y); |

1254 | return t; |

1255 | } |

1256 | |

1257 | |

1258 | /* If r==0, calculate (size,size/2), |

1259 | otherwise calculate (size,r). */ |

1260 | |

1261 | double |

1262 | speed_mpz_bin_uiui (struct speed_params *s) |

1263 | { |

1264 | mpz_t w; |

1265 | unsigned long k; |

1266 | unsigned i; |

1267 | double t; |

1268 | |

1269 | mpz_init (w); |

1270 | if (s->r != 0) |

1271 | k = s->r; |

1272 | else |

1273 | k = s->size/2; |

1274 | |

1275 | speed_starttime (); |

1276 | i = s->reps; |

1277 | do |

1278 | { |

1279 | mpz_bin_uiui (w, s->size, k); |

1280 | } |

1281 | while (--i != 0); |

1282 | t = speed_endtime (); |

1283 | |

1284 | mpz_clear (w); |

1285 | return t; |

1286 | } |

1287 | |

1288 | |

1289 | /* The multiplies are successively dependent so the latency is measured, not |

1290 | the issue rate. There's only 10 per loop so the code doesn't get too big |

1291 | since umul_ppmm is several instructions on some cpus. |

1292 | |

1293 | Putting the arguments as "h,l,l,h" gets slightly better code from gcc |

1294 | 2.95.2 on x86, it puts only one mov between each mul, not two. That mov |

1295 | though will probably show up as a bogus extra cycle though. |

1296 | |

1297 | The measuring function macros are into three parts to avoid overflowing |

1298 | preprocessor expansion space if umul_ppmm is big. |

1299 | |

1300 | Limitations: |

1301 | |

1302 | Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code |

1303 | generated first, especially on CPUs with low latency multipliers. |

1304 | |

1305 | The default umul_ppmm doing h*l will be getting increasing numbers of |

1306 | high zero bits in the calculation. CPUs with data-dependent multipliers |

1307 | will want to use umul_ppmm.1 to get some randomization into the |

1308 | calculation. The extra xors and fetches will be a slowdown of course. */ |

1309 | |

1310 | #define SPEED_MACRO_UMUL_PPMM_A \ |

1311 | { \ |

1312 | mp_limb_t h, l; \ |

1313 | unsigned i; \ |

1314 | double t; \ |

1315 | \ |

1316 | s->time_divisor = 10; \ |

1317 | \ |

1318 | h = s->xp[0]; \ |

1319 | l = s->yp[0]; \ |

1320 | \ |

1321 | if (s->r == 1) \ |

1322 | { \ |

1323 | speed_starttime (); \ |

1324 | i = s->reps; \ |

1325 | do \ |

1326 | { |

1327 | |

1328 | #define SPEED_MACRO_UMUL_PPMM_B \ |

1329 | } \ |

1330 | while (--i != 0); \ |

1331 | t = speed_endtime (); \ |

1332 | } \ |

1333 | else \ |

1334 | { \ |

1335 | speed_starttime (); \ |

1336 | i = s->reps; \ |

1337 | do \ |

1338 | { |

1339 | |

1340 | #define SPEED_MACRO_UMUL_PPMM_C \ |

1341 | } \ |

1342 | while (--i != 0); \ |

1343 | t = speed_endtime (); \ |

1344 | } \ |

1345 | \ |

1346 | /* stop the compiler optimizing away the whole calculation! */ \ |

1347 | noop_1 (h); \ |

1348 | noop_1 (l); \ |

1349 | \ |

1350 | return t; \ |

1351 | } |

1352 | |

1353 | |

1354 | double |

1355 | speed_umul_ppmm (struct speed_params *s) |

1356 | { |

1357 | SPEED_MACRO_UMUL_PPMM_A; |

1358 | { |

1359 | umul_ppmm (h, l, l, h); h ^= s->xp_block[0]; l ^= s->yp_block[0]; |

1360 | umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1]; |

1361 | umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2]; |

1362 | umul_ppmm (h, l, l, h); h ^= s->xp_block[3]; l ^= s->yp_block[3]; |

1363 | umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4]; |

1364 | umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5]; |

1365 | umul_ppmm (h, l, l, h); h ^= s->xp_block[6]; l ^= s->yp_block[6]; |

1366 | umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7]; |

1367 | umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8]; |

1368 | umul_ppmm (h, l, l, h); h ^= s->xp_block[9]; l ^= s->yp_block[9]; |

1369 | } |

1370 | SPEED_MACRO_UMUL_PPMM_B; |

1371 | { |

1372 | umul_ppmm (h, l, l, h); |

1373 | umul_ppmm (h, l, l, h); |

1374 | umul_ppmm (h, l, l, h); |

1375 | umul_ppmm (h, l, l, h); |

1376 | umul_ppmm (h, l, l, h); |

1377 | umul_ppmm (h, l, l, h); |

1378 | umul_ppmm (h, l, l, h); |

1379 | umul_ppmm (h, l, l, h); |

1380 | umul_ppmm (h, l, l, h); |

1381 | umul_ppmm (h, l, l, h); |

1382 | } |

1383 | SPEED_MACRO_UMUL_PPMM_C; |

1384 | } |

1385 | |

1386 | |

1387 | #if HAVE_NATIVE_mpn_umul_ppmm |

1388 | |

1389 | #if defined (__hppa) && W_TYPE_SIZE == 64 |

1390 | #define CALL_MPN_UMUL_PPMM (h = __MPN (umul_ppmm) (h, l, &l)) |

1391 | #else |

1392 | #define CALL_MPN_UMUL_PPMM (h = __MPN (umul_ppmm) (&l, h, l)) |

1393 | #endif |

1394 | |

1395 | double |

1396 | speed_mpn_umul_ppmm (struct speed_params *s) |

1397 | { |

1398 | SPEED_MACRO_UMUL_PPMM_A; |

1399 | { |

1400 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[0]; l ^= s->yp_block[0]; |

1401 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[1]; l ^= s->yp_block[1]; |

1402 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[2]; l ^= s->yp_block[2]; |

1403 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[3]; l ^= s->yp_block[3]; |

1404 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[4]; l ^= s->yp_block[4]; |

1405 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[5]; l ^= s->yp_block[5]; |

1406 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[6]; l ^= s->yp_block[6]; |

1407 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[7]; l ^= s->yp_block[7]; |

1408 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[8]; l ^= s->yp_block[8]; |

1409 | CALL_MPN_UMUL_PPMM; h ^= s->xp_block[9]; l ^= s->yp_block[9]; |

1410 | } |

1411 | SPEED_MACRO_UMUL_PPMM_B; |

1412 | { |

1413 | CALL_MPN_UMUL_PPMM; |

1414 | CALL_MPN_UMUL_PPMM; |

1415 | CALL_MPN_UMUL_PPMM; |

1416 | CALL_MPN_UMUL_PPMM; |

1417 | CALL_MPN_UMUL_PPMM; |

1418 | CALL_MPN_UMUL_PPMM; |

1419 | CALL_MPN_UMUL_PPMM; |

1420 | CALL_MPN_UMUL_PPMM; |

1421 | CALL_MPN_UMUL_PPMM; |

1422 | CALL_MPN_UMUL_PPMM; |

1423 | } |

1424 | SPEED_MACRO_UMUL_PPMM_C; |

1425 | } |

1426 | #endif |

1427 | |

1428 | |

1429 | /* The divisions are successively dependent so latency is measured, not |

1430 | issue rate. There's only 10 per loop so the code doesn't get too big, |

1431 | especially for udiv_qrnnd_preinv and preinv2norm, which are several |

1432 | instructions each. |

1433 | |

1434 | Note that it's only the division which is measured here, there's no data |

1435 | fetching and no shifting if the divisor gets normalized. |

1436 | |

1437 | In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d" |

1438 | generate x86 div instructions with nothing in between. |

1439 | |

1440 | The measuring function macros are in two parts to avoid overflowing |

1441 | preprocessor expansion space if udiv_qrnnd etc are big. |

1442 | |

1443 | Limitations: |

1444 | |

1445 | Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code |

1446 | generated first. |

1447 | |

1448 | CPUs with data-dependent divisions may want more attention paid to the |

1449 | randomness of the data used. Probably the measurement wanted is over |

1450 | uniformly distributed numbers, but what's here might not be giving that. */ |

1451 | |

1452 | #define SPEED_ROUTINE_UDIV_QRNND_A(normalize) \ |

1453 | { \ |

1454 | double t; \ |

1455 | unsigned i; \ |

1456 | mp_limb_t q, r, d; \ |

1457 | mp_limb_t dinv; \ |

1458 | \ |

1459 | s->time_divisor = 10; \ |

1460 | \ |

1461 | /* divisor from "r" parameter, or a default */ \ |

1462 | d = s->r; \ |

1463 | if (d == 0) \ |

1464 | d = __mp_bases[10].big_base; \ |

1465 | \ |

1466 | if (normalize) \ |

1467 | { \ |

1468 | unsigned norm; \ |

1469 | count_leading_zeros (norm, d); \ |

1470 | d <<= norm; \ |

1471 | invert_limb (dinv, d); \ |

1472 | } \ |

1473 | \ |

1474 | q = s->xp[0]; \ |

1475 | r = s->yp[0] % d; \ |

1476 | \ |

1477 | speed_starttime (); \ |

1478 | i = s->reps; \ |

1479 | do \ |

1480 | { |

1481 | |

1482 | #define SPEED_ROUTINE_UDIV_QRNND_B \ |

1483 | } \ |

1484 | while (--i != 0); \ |

1485 | t = speed_endtime (); \ |

1486 | \ |

1487 | /* stop the compiler optimizing away the whole calculation! */ \ |

1488 | noop_1 (q); \ |

1489 | noop_1 (r); \ |

1490 | \ |

1491 | return t; \ |

1492 | } |

1493 | |

1494 | double |

1495 | speed_udiv_qrnnd (struct speed_params *s) |

1496 | { |

1497 | SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION); |

1498 | { |

1499 | udiv_qrnnd (q, r, r, q, d); |

1500 | udiv_qrnnd (q, r, r, q, d); |

1501 | udiv_qrnnd (q, r, r, q, d); |

1502 | udiv_qrnnd (q, r, r, q, d); |

1503 | udiv_qrnnd (q, r, r, q, d); |

1504 | udiv_qrnnd (q, r, r, q, d); |

1505 | udiv_qrnnd (q, r, r, q, d); |

1506 | udiv_qrnnd (q, r, r, q, d); |

1507 | udiv_qrnnd (q, r, r, q, d); |

1508 | udiv_qrnnd (q, r, r, q, d); |

1509 | } |

1510 | SPEED_ROUTINE_UDIV_QRNND_B; |

1511 | } |

1512 | |

1513 | double |

1514 | speed_udiv_qrnnd_preinv (struct speed_params *s) |

1515 | { |

1516 | SPEED_ROUTINE_UDIV_QRNND_A (1); |

1517 | { |

1518 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1519 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1520 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1521 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1522 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1523 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1524 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1525 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1526 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1527 | udiv_qrnnd_preinv (q, r, r, q, d, dinv); |

1528 | } |

1529 | SPEED_ROUTINE_UDIV_QRNND_B; |

1530 | } |

1531 | |

1532 | double |

1533 | speed_udiv_qrnnd_preinv2norm (struct speed_params *s) |

1534 | { |

1535 | SPEED_ROUTINE_UDIV_QRNND_A (1); |

1536 | { |

1537 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1538 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1539 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1540 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1541 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1542 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1543 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1544 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1545 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1546 | udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv); |

1547 | } |

1548 | SPEED_ROUTINE_UDIV_QRNND_B; |

1549 | } |

1550 | |

1551 | double |

1552 | speed_udiv_qrnnd_c (struct speed_params *s) |

1553 | { |

1554 | SPEED_ROUTINE_UDIV_QRNND_A (1); |

1555 | { |

1556 | __udiv_qrnnd_c (q, r, r, q, d); |

1557 | __udiv_qrnnd_c (q, r, r, q, d); |

1558 | __udiv_qrnnd_c (q, r, r, q, d); |

1559 | __udiv_qrnnd_c (q, r, r, q, d); |

1560 | __udiv_qrnnd_c (q, r, r, q, d); |

1561 | __udiv_qrnnd_c (q, r, r, q, d); |

1562 | __udiv_qrnnd_c (q, r, r, q, d); |

1563 | __udiv_qrnnd_c (q, r, r, q, d); |

1564 | __udiv_qrnnd_c (q, r, r, q, d); |

1565 | __udiv_qrnnd_c (q, r, r, q, d); |

1566 | } |

1567 | SPEED_ROUTINE_UDIV_QRNND_B; |

1568 | } |

1569 | |

1570 | #if HAVE_NATIVE_mpn_udiv_qrnnd |

1571 | |

1572 | #if defined (__hppa) && W_TYPE_SIZE == 64 |

1573 | #define CALL_MPN_UDIV_QRNND (q = __MPN (udiv_qrnnd) (r, q, d, &r)) |

1574 | #else |

1575 | #define CALL_MPN_UDIV_QRNND (q = __MPN (udiv_qrnnd) (&r, r, q, d)) |

1576 | #endif |

1577 | |

1578 | double |

1579 | speed_mpn_udiv_qrnnd (struct speed_params *s) |

1580 | { |

1581 | SPEED_ROUTINE_UDIV_QRNND_A (1); |

1582 | { |

1583 | CALL_MPN_UDIV_QRNND; |

1584 | CALL_MPN_UDIV_QRNND; |

1585 | CALL_MPN_UDIV_QRNND; |

1586 | CALL_MPN_UDIV_QRNND; |

1587 | CALL_MPN_UDIV_QRNND; |

1588 | CALL_MPN_UDIV_QRNND; |

1589 | CALL_MPN_UDIV_QRNND; |

1590 | CALL_MPN_UDIV_QRNND; |

1591 | CALL_MPN_UDIV_QRNND; |

1592 | CALL_MPN_UDIV_QRNND; |

1593 | } |

1594 | SPEED_ROUTINE_UDIV_QRNND_B; |

1595 | } |

1596 | #endif |

1597 | |

1598 | |

1599 | double |

1600 | speed_invert_limb (struct speed_params *s) |

1601 | { |

1602 | SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d)); |

1603 | } |

1604 | |

1605 | |

1606 | /* xp[0] might not be particularly random, but should give an indication how |

1607 | "/" runs. Same for speed_operator_mod below. */ |

1608 | double |

1609 | speed_operator_div (struct speed_params *s) |

1610 | { |

1611 | double t; |

1612 | unsigned i; |

1613 | mp_limb_t x, q, d; |

1614 | |

1615 | s->time_divisor = 10; |

1616 | |

1617 | /* divisor from "r" parameter, or a default */ |

1618 | d = s->r; |

1619 | if (d == 0) |

1620 | d = __mp_bases[10].big_base; |

1621 | |

1622 | x = s->xp[0]; |

1623 | q = 0; |

1624 | |

1625 | speed_starttime (); |

1626 | i = s->reps; |

1627 | do |

1628 | { |

1629 | q ^= x; q /= d; |

1630 | q ^= x; q /= d; |

1631 | q ^= x; q /= d; |

1632 | q ^= x; q /= d; |

1633 | q ^= x; q /= d; |

1634 | q ^= x; q /= d; |

1635 | q ^= x; q /= d; |

1636 | q ^= x; q /= d; |

1637 | q ^= x; q /= d; |

1638 | q ^= x; q /= d; |

1639 | } |

1640 | while (--i != 0); |

1641 | t = speed_endtime (); |

1642 | |

1643 | /* stop the compiler optimizing away the whole calculation! */ |

1644 | noop_1 (q); |

1645 | |

1646 | return t; |

1647 | } |

1648 | |

1649 | double |

1650 | speed_operator_mod (struct speed_params *s) |

1651 | { |

1652 | double t; |

1653 | unsigned i; |

1654 | mp_limb_t x, r, d; |

1655 | |

1656 | s->time_divisor = 10; |

1657 | |

1658 | /* divisor from "r" parameter, or a default */ |

1659 | d = s->r; |

1660 | if (d == 0) |

1661 | d = __mp_bases[10].big_base; |

1662 | |

1663 | x = s->xp[0]; |

1664 | r = 0; |

1665 | |

1666 | speed_starttime (); |

1667 | i = s->reps; |

1668 | do |

1669 | { |

1670 | r ^= x; r %= d; |

1671 | r ^= x; r %= d; |

1672 | r ^= x; r %= d; |

1673 | r ^= x; r %= d; |

1674 | r ^= x; r %= d; |

1675 | r ^= x; r %= d; |

1676 | r ^= x; r %= d; |

1677 | r ^= x; r %= d; |

1678 | r ^= x; r %= d; |

1679 | r ^= x; r %= d; |

1680 | } |

1681 | while (--i != 0); |

1682 | t = speed_endtime (); |

1683 | |

1684 | /* stop the compiler optimizing away the whole calculation! */ |

1685 | noop_1 (r); |

1686 | |

1687 | return t; |

1688 | } |

1689 | |

1690 | |

1691 | /* r==0 measures on data with the values uniformly distributed. This will |

1692 | be typical for count_trailing_zeros in a GCD etc. |

1693 | |

1694 | r==1 measures on data with the resultant count uniformly distributed |

1695 | between 0 and BITS_PER_MP_LIMB-1. This is probably sensible for |

1696 | count_leading_zeros on the high limbs of divisors. */ |

1697 | |

1698 | int |

1699 | speed_routine_count_zeros_setup (struct speed_params *s, |

1700 | mp_ptr xp, int leading, int zero) |

1701 | { |

1702 | int i, c; |

1703 | mp_limb_t n; |

1704 | |

1705 | if (s->r == 0) |

1706 | { |

1707 | /* Make uniformly distributed data. If zero isn't allowed then change |

1708 | it to 1 for leading, or 0x800..00 for trailing. */ |

1709 | MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE); |

1710 | if (! zero) |

1711 | for (i = 0; i < SPEED_BLOCK_SIZE; i++) |

1712 | if (xp[i] == 0) |

1713 | xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT; |

1714 | } |

1715 | else if (s->r == 1) |

1716 | { |

1717 | /* Make counts uniformly distributed. A randomly chosen bit is set, and |

1718 | for leading the rest above it are cleared, or for trailing then the |

1719 | rest below. */ |

1720 | for (i = 0; i < SPEED_BLOCK_SIZE; i++) |

1721 | { |

1722 | mp_limb_t set = CNST_LIMB(1) << (s->yp_block[i] % BITS_PER_MP_LIMB); |

1723 | mp_limb_t keep_below = set-1; |

1724 | mp_limb_t keep_above = MP_LIMB_T_MAX ^ keep_below; |

1725 | mp_limb_t keep = (leading ? keep_below : keep_above); |

1726 | xp[i] = (s->xp_block[i] & keep) | set; |

1727 | } |

1728 | } |

1729 | else |

1730 | { |

1731 | return 0; |

1732 | } |

1733 | |

1734 | /* Account for the effect of n^=c. */ |

1735 | c = 0; |

1736 | for (i = 0; i < SPEED_BLOCK_SIZE; i++) |

1737 | { |

1738 | n = xp[i]; |

1739 | xp[i] ^= c; |

1740 | |

1741 | if (leading) |

1742 | count_leading_zeros (c, n); |

1743 | else |

1744 | count_trailing_zeros (c, n); |

1745 | } |

1746 | |

1747 | return 1; |

1748 | } |

1749 | |

1750 | double |

1751 | speed_count_leading_zeros (struct speed_params *s) |

1752 | { |

1753 | #ifdef COUNT_LEADING_ZEROS_0 |

1754 | #define COUNT_LEADING_ZEROS_0_ALLOWED 1 |

1755 | #else |

1756 | #define COUNT_LEADING_ZEROS_0_ALLOWED 0 |

1757 | #endif |

1758 | |

1759 | SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED); |

1760 | count_leading_zeros (c, n); |

1761 | SPEED_ROUTINE_COUNT_ZEROS_B (); |

1762 | } |

1763 | double |

1764 | speed_count_trailing_zeros (struct speed_params *s) |

1765 | { |

1766 | SPEED_ROUTINE_COUNT_ZEROS_A (0, 0); |

1767 | count_trailing_zeros (c, n); |

1768 | SPEED_ROUTINE_COUNT_ZEROS_B (); |

1769 | } |

1770 | |

1771 | |

1772 | double |

1773 | speed_mpn_get_str (struct speed_params *s) |

1774 | { |

1775 | SPEED_ROUTINE_MPN_GET_STR (mpn_get_str); |

1776 | } |

1777 | |

1778 | double |

1779 | speed_mpn_set_str (struct speed_params *s) |

1780 | { |

1781 | SPEED_ROUTINE_MPN_SET_STR (mpn_set_str); |

1782 | } |

1783 | double |

1784 | speed_mpn_set_str_basecase (struct speed_params *s) |

1785 | { |

1786 | SPEED_ROUTINE_MPN_SET_STR (mpn_set_str_basecase); |

1787 | } |

1788 | double |

1789 | speed_mpn_set_str_subquad (struct speed_params *s) |

1790 | { |

1791 | SPEED_ROUTINE_MPN_SET_STR (mpn_set_str_subquad); |

1792 | } |

1793 | |

1794 | |

1795 | double |

1796 | speed_MPN_ZERO (struct speed_params *s) |

1797 | { |

1798 | SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size)); |

1799 | } |

**Note:**See TracBrowser for help on using the repository browser.