source: trunk/third/gmp/mpz/n_pow_ui.c @ 18191

Revision 18191, 16.8 KB checked in by ghudson, 22 years ago
This commit was generated by cvs2svn to compensate for changes in r18190, which included commits to RCS files with non-trunk default branches.
/* mpz_n_pow_ui -- mpn raised to ulong.

Copyright 2001, 2002 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   "power of 2" power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using __mp_bases[b].big_base for small b, and thereby
   get better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.
   It's felt that doing so would be more complicated than it's worth, and
   could well end up being a slowdown for small e.  For big e on the other
   hand the algorithm is dominated by mpn_sqr_n so there wouldn't be much
   of a saving.  The current code can be viewed as simply doing the first
   few steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
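
/* (Illustrative note, not from the original source) for b=10 on a 32-bit
   limb the initial powering squares through 10, 100, 10^4, 10^8 within a
   single limb (10^16 would overflow); starting instead from
   __mp_bases[10].big_base = 10^9 is the bigger-power alternative the
   comment above judges not worth the complication.  */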


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   Note this formula is an underestimate when GMP_NUMB_BITS is odd, but
   that's harmless: it only makes the single-limb powering stop one step
   early. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
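
/* For instance (illustrative, assuming a 32-bit limb with no nail):
   GMP_NUMB_HALFMAX is 0xFFFF, and 0xFFFF * 0xFFFF = 0xFFFE0001 still
   fits in a limb, whereas squaring 0x10000 would overflow.  */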


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR_N(dst, alloc, src, size)        \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr_n (dst, src, size);                 \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
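
/* (Illustrative note) the size update above reflects that an n-limb
   square occupies 2n limbs, or 2n-1 when its top limb is zero: eg.
   squaring the 1-limb value 2 gives dst[0]=4, dst[1]=0, and the
   subtraction brings size back to 1.  */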

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  unsigned long  rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL (marker);

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--;
      ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
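
  /* Worked example (illustrative): a single-limb b = 12 = 0b1100 with
     e = 5 gives btwos = 2 and rtwos_bits = 5*2 = 10, so the powering
     below runs on the odd part 3, and 12^5 = 3^5 * 2^10 is restored by
     the final shift.  */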

  TMP_MARK (marker);

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 with the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */
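      /* For instance (illustrative) b=3, e=5: the loop takes rl=3,
         blimb=9, e=2; then blimb=81, e=1; then rl=3*81=243 with e=0,
         so 3^5 is completed entirely within rl.  */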

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change from zero to non-zero, since mul_1+lshift
         is probably faster than mul_2 (FIXME: is this true?)  */
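      /* Eg. (illustrative) rl_high:rl = 0:3 with rtwos_bits = 4 becomes
         0:48, so the final mpn_mul_1 multiplies by 48 and a separate
         mpn_lshift by 4 is avoided.  */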

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
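
  /* Worked instance (illustrative, 32-bit limb, no nail): bsize=1,
     blimb=0x10001 (cnt=15, so 17 bits) and e=100 gives
     ralloc = 17*100/32 + 5 = 58 limbs, against a true result of about
     1601 bits = 51 limbs -- the worst-case 6.25% overestimate described
     above plus the fixed slack.  */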

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */
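      /* Eg. (illustrative) with a 100-limb final result and the low bit
         of e zero, the buffer that doesn't receive the last squaring only
         ever holds the half-sized previous step, so ralloc/2 limbs are
         enough for it.  */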

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, e);
      i = GMP_LIMB_BITS - cnt - 2;
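
      /* Eg. (illustrative, 32-bit long) e = 13 = 0b1101 gives cnt = 28
         and i = 2, so bits 2,1,0 of e drive square+multiply, square,
         square+multiply in the loops below.  */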

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;
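
          /* (Illustrative note) each loop pass below swaps rp and tp
             exactly once (the squaring goes into tp, the optional mul_2
             is in-place), so there are i+1 swaps in total; pre-swapping
             when i+1 is odd, ie. i even, lands the result in r's space. */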

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange that the final result ends up in r, not in the temp
             space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;
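
          /* (Illustrative note) here each pass swaps once for the
             squaring plus once more for each 1 bit of e below the top,
             so the total swap count is (i+1) + popcount(e)-1; the
             parity^i test pre-swaps exactly when that total is odd.  */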

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE (marker);

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}