source: trunk/third/gettext/lib/linebreak.c @ 16931

Revision 16931, 59.0 KB checked in by ghudson, 23 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r16930, which included commits to RCS files with non-trunk default branches.
Line 
1/* linebreak.c - line breaking of Unicode strings
2   Copyright (C) 2001 Free Software Foundation, Inc.
3   Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5This program is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 2, or (at your option)
8any later version.
9
10This program is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with this program; if not, write to the Free Software
17Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
18
19#ifdef HAVE_CONFIG_H
20# include <config.h>
21#endif
22
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "linebreak.h"
#include "c-ctype.h"
27
28
/* Decode the first character of the UTF-8 string S, of which N units
   (N >= 1) are available, and store its 'ucs4_t' representation in *PUC.
   Return the number of units consumed.

   This is the slow path, meant for a lead byte >= 0x80:
     - a well-formed sequence of 2, 3 or 4 bytes yields its code point
       and its length;
     - a sequence cut short by the end of the buffer yields U+FFFD and
       consumes all N remaining units;
     - anything else (stray continuation byte, overlong form, bad
       continuation byte, an encoded UTF-16 surrogate U+D800..U+DFFF,
       a value above U+10FFFF, or a lead byte >= 0xf8) yields U+FFFD
       and consumes exactly one unit.

   The historical 5- and 6-byte forms are not valid UTF-8 and are
   therefore not decoded.  */
static int
u8_mbtouc_aux (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c = *s;

  /* 0x80..0xbf are continuation bytes and 0xc0..0xc1 could only start
     overlong forms, so valid lead bytes start at 0xc2.  */
  if (c >= 0xc2)
    {
      if (c < 0xe0)
        {
          /* Two-byte sequence.  */
          if (n < 2)
            goto incomplete;
          if ((s[1] ^ 0x80) < 0x40)
            {
              *puc = ((unsigned int) (c & 0x1f) << 6)
                     | (unsigned int) (s[1] ^ 0x80);
              return 2;
            }
        }
      else if (c < 0xf0)
        {
          /* Three-byte sequence.  */
          if (n < 3)
            goto incomplete;
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              /* Reject overlong forms (below U+0800).  */
              && (c >= 0xe1 || s[1] >= 0xa0)
              /* Reject encoded surrogates U+D800..U+DFFF.  */
              && (c != 0xed || s[1] < 0xa0))
            {
              *puc = ((unsigned int) (c & 0x0f) << 12)
                     | ((unsigned int) (s[1] ^ 0x80) << 6)
                     | (unsigned int) (s[2] ^ 0x80);
              return 3;
            }
        }
      else if (c < 0xf8)
        {
          /* Four-byte sequence.  */
          if (n < 4)
            goto incomplete;
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              && (s[3] ^ 0x80) < 0x40
              /* Reject overlong forms (below U+10000).  */
              && (c >= 0xf1 || s[1] >= 0x90)
              /* Reject values above U+10FFFF.  */
              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
            {
              *puc = ((unsigned int) (c & 0x07) << 18)
                     | ((unsigned int) (s[1] ^ 0x80) << 12)
                     | ((unsigned int) (s[2] ^ 0x80) << 6)
                     | (unsigned int) (s[3] ^ 0x80);
              return 4;
            }
        }
    }

  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;

 incomplete:
  /* incomplete multibyte character: swallow whatever is left */
  *puc = 0xfffd;
  return (int) n;
}
/* Decode the first character of the UTF-8 string S (N units available)
   into *PUC and return the number of units it occupies.  ASCII bytes
   are handled inline; everything else goes to u8_mbtouc_aux.  */
static inline int
u8_mbtouc (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char lead = *s;

  if (lead >= 0x80)
    return u8_mbtouc_aux (puc, s, n);

  /* Plain ASCII: the byte is the code point.  */
  *puc = lead;
  return 1;
}
177
178#ifdef unused
/* Decode a UTF-16 character whose first unit *S lies in the surrogate
   range, with N units available.  A high surrogate followed by a low
   surrogate yields the combined code point and a length of 2.  A high
   surrogate with no unit after it yields U+FFFD and a length of N.
   Everything else yields U+FFFD and a length of 1.  */
static int
u16_mbtouc_aux (unsigned int *puc, const unsigned short *s, size_t n)
{
  unsigned short lead = *s;

  if (lead < 0xdc00)
    {
      unsigned short trail;

      if (n < 2)
        {
          /* incomplete multibyte character */
          *puc = 0xfffd;
          return n;
        }

      trail = s[1];
      if (trail >= 0xdc00 && trail < 0xe000)
        {
          *puc = 0x10000 + ((lead - 0xd800) << 10) + (trail - 0xdc00);
          return 2;
        }
    }

  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}
/* Decode the first character of the UTF-16 string S (N units available)
   into *PUC and return the number of units it occupies.  Units outside
   the surrogate range are handled inline; surrogates go to
   u16_mbtouc_aux.  */
static inline int
u16_mbtouc (unsigned int *puc, const unsigned short *s, size_t n)
{
  unsigned short unit = *s;

  if (unit >= 0xd800 && unit < 0xe000)
    return u16_mbtouc_aux (puc, s, n);

  *puc = unit;
  return 1;
}
225
/* UTF-32 counterpart of u8_mbtouc: copy the first unit of S to *PUC
   unchanged and report a length of 1.  N is accepted only for
   signature symmetry and is not examined.  */
static inline int
u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
{
  (void) n;
  *puc = s[0];
  return 1;
}
235#endif
236
237
238/* Help GCC to generate good code for string comparisons with
239   immediate strings. */
240#if defined (__GNUC__) && defined (__OPTIMIZE__)
241
/* Last link of the STREQ chain: compare the tails of S1 and S2 starting
   at offset 9 with strcmp.  Returns 1 if they are equal, 0 otherwise.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return !strcmp (&s1[9], &s2[9]);
}
247
/* Compare S1[8] with S28.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq9.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  if (s1[8] != s28)
    return 0;
  return s28 == 0 ? 1 : streq9 (s1, s2);
}
261
/* Compare S1[7] with S27.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq8.  */
static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  if (s1[7] != s27)
    return 0;
  return s27 == 0 ? 1 : streq8 (s1, s2, s28);
}
275
/* Compare S1[6] with S26.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq7.  */
static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  if (s1[6] != s26)
    return 0;
  return s26 == 0 ? 1 : streq7 (s1, s2, s27, s28);
}
289
/* Compare S1[5] with S25.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq6.  */
static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  if (s1[5] != s25)
    return 0;
  return s25 == 0 ? 1 : streq6 (s1, s2, s26, s27, s28);
}
303
/* Compare S1[4] with S24.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq5.  */
static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[4] != s24)
    return 0;
  return s24 == 0 ? 1 : streq5 (s1, s2, s25, s26, s27, s28);
}
317
/* Compare S1[3] with S23.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq4.  */
static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[3] != s23)
    return 0;
  return s23 == 0 ? 1 : streq4 (s1, s2, s24, s25, s26, s27, s28);
}
331
/* Compare S1[2] with S22.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq3.  */
static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[2] != s22)
    return 0;
  return s22 == 0 ? 1 : streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
}
345
/* Compare S1[1] with S21.  A mismatch gives 0.  A match on the NUL
   terminator gives 1.  Any other match continues with streq2.  */
static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[1] != s21)
    return 0;
  return s21 == 0 ? 1 : streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
}
359
/* Entry point of the STREQ chain.  Compare S1[0] with S20.  A mismatch
   gives 0.  A match on the NUL terminator gives 1.  Any other match
   continues with streq1.  */
static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[0] != s20)
    return 0;
  return s20 == 0 ? 1 : streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
}
373
/* STREQ (str, lit, c0, ..., c8): test STR for equality with LIT.
   c0..c8 are passed on to streq0, which compares them with the first
   nine characters of STR one by one.  */
#define STREQ(str, lit, c0, c1, c2, c3, c4, c5, c6, c7, c8) \
  streq0 ((str), (lit), (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), (c8))
376
377#else
378
/* Fallback STREQ: a plain strcmp of STR against LIT.  The character
   arguments c0..c8 are accepted and ignored.  */
#define STREQ(str, lit, c0, c1, c2, c3, c4, c5, c6, c7, c8) \
  (strcmp ((str), (lit)) == 0)
381
382#endif
383
384
/* Return 1 if ENCODING is exactly (case-sensitively) the name of one of
   the legacy CJK encodings listed below, 0 otherwise.  */
static int
is_cjk_encoding (const char *encoding)
{
  return (/* Legacy Japanese encodings */
          STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
          /* Legacy Chinese encodings */
          || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
          || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
          || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
          || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
          /* Legacy Korean encodings */
          || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
          || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
          || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0));
}
404
/* Return 1 if ENCODING is exactly the string "UTF-8", 0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  return STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0) ? 1 : 0;
}
413
414
415/* Determine number of column positions required for UC. */
416int uc_width PARAMS ((unsigned int uc, const char *encoding));
417
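/*
 * Non-spacing attribute table.
 * See PropList.txt, or grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt
 * Control characters are also marked as non-spacing here because they are
 * not printable, and so are zero-width characters.
 *
 * Layout: each group of 64 bytes is a bitmap for 512 consecutive code
 * points.  Within a group, code point UC is bit (UC & 7) of byte
 * ((UC >> 3) & 63).  nonspacing_table_ind maps UC >> 9 to the group number.
 */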
424static const unsigned char nonspacing_table_data[15*64] = {
425  /* 0x0000-0x01ff */
426  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
427  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
428  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0080-0x00bf */
429  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
430  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
431  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
432  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
433  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
434  /* 0x0200-0x03ff */
435  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
436  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
437  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
438  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
439  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
440  0xff, 0x7f, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, /* 0x0340-0x037f */
441  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
442  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
443  /* 0x0400-0x05ff */
444  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
445  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
446  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
447  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
448  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
449  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
450  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
451  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
452  /* 0x0600-0x07ff */
453  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
454  0x00, 0xf8, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
455  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
456  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
457  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
458  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
459  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
460  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
461  /* 0x0800-0x09ff */
462  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
463  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
464  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
465  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
466  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
467  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
468  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
469  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
470  /* 0x0a00-0x0bff */
471  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
472  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
473  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
474  0xbe, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
475  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
476  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
477  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
478  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
479  /* 0x0c00-0x0dff */
480  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
481  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
482  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0c80-0x0cbf */
483  0x40, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
484  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
485  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
486  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
487  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
488  /* 0x0e00-0x0fff */
489  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
490  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
491  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
492  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
493  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
494  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
495  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
496  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
497  /* 0x1000-0x11ff */
498  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
499  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
500  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
501  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
502  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
503  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
504  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
505  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
506  /* 0x1600-0x17ff */
507  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
508  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
509  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
510  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
511  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1700-0x173f */
512  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1740-0x177f */
513  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, /* 0x1780-0x17bf */
514  0x40, 0xfe, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
515  /* 0x1800-0x19ff */
516  0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
517  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
518  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
519  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
520  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1900-0x193f */
521  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
522  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
523  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
524  /* 0x2000-0x21ff */
525  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
526  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
527  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
528  0x00, 0x00, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, /* 0x20c0-0x20ff */
529  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
530  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
531  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
532  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
533  /* 0x3000-0x31ff */
534  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
535  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
536  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
537  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
538  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
539  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
540  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
541  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
542  /* 0xfa00-0xfbff */
543  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
544  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
545  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
546  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
547  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
548  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
549  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
550  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
551  /* 0xfe00-0xffff */
552  0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
553  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
554  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
555  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
556  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
557  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
558  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
559  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e  /* 0xffc0-0xffff */
560};
/* Group selector for nonspacing_table_data, indexed by UC >> 9 (one entry
   per 0x200 code points of the range 0x0000-0xffff).  An entry is either
   the number of the 64-byte group holding the bitmap for that range, or
   -1 when the table has no group for it.  Sixteen entries per row.  */
static const signed char nonspacing_table_ind[128] = {
  /* 0x0000-0x1fff */
   0,  1,  2,  3,  4,  5,  6,  7,  8, -1, -1,  9, 10, -1, -1, -1,
  /* 0x2000-0x3fff */
  11, -1, -1, -1, -1, -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1,
  /* 0x4000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x6000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x8000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0xa000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0xc000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0xe000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, -1, 14
};
579
580/* Determine number of column positions required for UC. */
581int
582uc_width (uc, encoding)
583     unsigned int uc;
584     const char *encoding;
585{
586  /* Test for non-spacing or control character. */
587  if ((uc >> 9) < 128)
588    {
589      int ind = nonspacing_table_ind[uc >> 9];
590      if (ind >= 0)
591        if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
592          {
593            if (uc > 0 && uc < 0x100)
594              return -1;
595            else
596              return 0;
597          }
598    }
599  /* Test for double-width character.
600   * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
601   * and            "grep '^....;[^WF]' EastAsianWidth.txt"
602   */
603  if (uc >= 0x1100
604      && ((uc < 0x1160) /* Hangul Jamo */
605          || (uc >= 0x2e80 && uc < 0xa4d0  /* CJK ... Yi */
606              && !((uc & ~0x0011) == 0x300a || uc == 0x303f))
607          || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
608          || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
609          || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
610          || (uc >= 0xff00 && uc < 0xff60) /* Fullwidth Forms */
611          || (uc >= 0xffe0 && uc < 0xffe7)))
612    return 2;
613  /* In ancient CJK encodings, Cyrillic and most other characters are
614     double-width as well. */
615  if (uc >= 0x00A1 && uc < 0xFF60 && uc != 0x20A9
616      && is_cjk_encoding (encoding))
617    return 2;
618  return 1;
619}
620
621
622#ifdef unused
623
624/* Determine number of column positions required for first N units
625   (or fewer if S ends before this) in S.  */
626
/* Sum uc_width over the characters in the first N units of the UTF-8
   string S, stopping early at a NUL character.  Characters for which
   uc_width returns a negative value contribute nothing.  */
int
u8_width (const unsigned char *s, size_t n, const char *encoding)
{
  const unsigned char *limit = s + n;
  int columns = 0;

  while (s < limit)
    {
      unsigned int uc;
      int w;

      s += u8_mbtouc (&uc, s, limit - s);
      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w < 0)
        continue; /* ignore control characters in the string */
      columns += w;
    }

  return columns;
}
653
/* Sum uc_width over the characters in the first N units of the UTF-16
   string S, stopping early at a NUL character.  Characters for which
   uc_width returns a negative value contribute nothing.  */
int
u16_width (const unsigned short *s, size_t n, const char *encoding)
{
  const unsigned short *limit = s + n;
  int columns = 0;

  while (s < limit)
    {
      unsigned int uc;
      int w;

      s += u16_mbtouc (&uc, s, limit - s);
      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w < 0)
        continue; /* ignore control characters in the string */
      columns += w;
    }

  return columns;
}
680
/* Sum uc_width over the first N units of the UTF-32 string S, stopping
   early at a NUL character.  Characters for which uc_width returns a
   negative value contribute nothing.  */
int
u32_width (const unsigned int *s, size_t n, const char *encoding)
{
  const unsigned int *limit = s + n;
  int columns = 0;

  for (; s < limit; s++)
    {
      unsigned int uc = *s;
      int w;

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        columns += w;
    }

  return columns;
}
705
706#endif
707
708
709/* Determine the line break points in S, and store the result at p[0..n-1].  */
710/* We don't support line breaking of complex-context dependent characters
711   (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
712
713/* Line breaking classification.  */
714
enum
{
  /* LBP_BK is handled on its own by the scanner.  Classes 1..19 are
     used as (value - 1) to index lbrk_table.  */
  LBP_BK =  0, /* mandatory break */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */

  /* Values >= 20 are resolved at run time.  */
  LBP_CM = 20, /* attached characters and combining marks */
  LBP_SP = 21, /* space */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */

  /* Deliberately absent:
       LBP_CR  carriage return - not used here because it's a DOSism
       LBP_LF  line feed - not used here because it's a DOSism
       LBP_SG  surrogates - not used here because they are not characters  */
};
748
749#include "lbrkprop.h"
750
751static inline unsigned char
752lbrkprop_lookup (uc)
753     unsigned int uc;
754{
755  unsigned int index1 = uc >> lbrkprop_header_0;
756  if (index1 < lbrkprop_header_1)
757    {
758      int lookup1 = lbrkprop.level1[index1];
759      if (lookup1 >= 0)
760        {
761          unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
762          int lookup2 = lbrkprop.level2[lookup1 + index2];
763          if (lookup2 >= 0)
764            {
765              unsigned int index3 = uc & lbrkprop_header_4;
766              return lbrkprop.level3[lookup2 + index3];
767            }
768        }
769    }
770  return LBP_XX;
771}
772
773/* Table indexed by two line breaking classifications.  */
774#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
775#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
776#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
777static const unsigned char lbrk_table[19][19] = {
778                                /* after */
779        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
780/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
781/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
782/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
783/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
784/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
785/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
786/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
787/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
788/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
789/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
790/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
791/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
792/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
793/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
794/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
795/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
796/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
797/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
798/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
799/* "" */
800/* before */
801};
802/* Note: The (B2,B2) entry should probably be D instead of P.  */
803/* Note: The (PR,ID) entry should probably be D instead of I.  */
804
805void
806u8_possible_linebreaks (s, n, encoding, p)
807     const unsigned char *s;
808     size_t n;
809     const char *encoding;
810     char *p;
811{
812  int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
813  const unsigned char *s_end = s + n;
814  int last_prop = LBP_BK; /* line break property of last non-space character */
815  char *seen_space = NULL; /* Was a space seen after the last non-space character? */
816  char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
817
818  /* Don't break inside multibyte characters.  */
819  memset (p, UC_BREAK_PROHIBITED, n);
820
821  while (s < s_end)
822    {
823      unsigned int uc;
824      int count = u8_mbtouc (&uc, s, s_end - s);
825      int prop = lbrkprop_lookup (uc);
826
827      if (prop == LBP_BK)
828        {
829          /* Mandatory break.  */
830          *p = UC_BREAK_MANDATORY;
831          last_prop = LBP_BK;
832          seen_space = NULL;
833          seen_space2 = NULL;
834        }
835      else
836        {
837          char *q;
838
839          /* Resolve property values whose behaviour is not fixed.  */
840          switch (prop)
841            {
842              case LBP_AI:
843                /* Resolve ambiguous.  */
844                prop = LBP_AI_REPLACEMENT;
845                break;
846              case LBP_CB:
847                /* This is arbitrary.  */
848                prop = LBP_ID;
849                break;
850              case LBP_SA:
851                /* We don't handle complex scripts yet.
852                   Treat LBP_SA like LBP_XX.  */
853              case LBP_XX:
854                /* This is arbitrary.  */
855                prop = LBP_AL;
856                break;
857            }
858
859          /* Deal with combining characters.  */
860          q = p;
861          if (prop == LBP_CM)
862            {
863              /* Don't break just before a combining character.  */
864              *p = UC_BREAK_PROHIBITED;
865              /* A combining character turns a preceding space into LBP_AL.  */
866              if (seen_space != NULL)
867                {
868                  q = seen_space;
869                  seen_space = seen_space2;
870                  prop = LBP_AL;
871                  goto lookup_via_table;
872                }
873            }
874          else if (prop == LBP_SP)
875            {
876              /* Don't break just before a space.  */
877              *p = UC_BREAK_PROHIBITED;
878              seen_space2 = seen_space;
879              seen_space = p;
880            }
881          else
882            {
883             lookup_via_table:
884              /* prop must be usable as an index for table 7.3 of UTR #14.  */
885              if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
886                abort ();
887
888              if (last_prop == LBP_BK)
889                {
890                  /* Don't break at the beginning of a line.  */
891                  *q = UC_BREAK_PROHIBITED;
892                }
893              else
894                {
895                  switch (lbrk_table [last_prop-1] [prop-1])
896                    {
897                      case D:
898                        *q = UC_BREAK_POSSIBLE;
899                        break;
900                      case I:
901                        *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
902                        break;
903                      case P:
904                        *q = UC_BREAK_PROHIBITED;
905                        break;
906                      default:
907                        abort ();
908                    }
909                }
910              last_prop = prop;
911              seen_space = NULL;
912              seen_space2 = NULL;
913            }
914        }
915
916      s += count;
917      p += count;
918    }
919}
920
921#ifdef unused
922
923void
924u16_possible_linebreaks (s, n, encoding, p)
925     const unsigned short *s;
926     size_t n;
927     const char *encoding;
928     char *p;
929{
930  int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
931  const unsigned short *s_end = s + n;
932  int last_prop = LBP_BK; /* line break property of last non-space character */
933  char *seen_space = NULL; /* Was a space seen after the last non-space character? */
934  char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
935
936  /* Don't break inside multibyte characters.  */
937  memset (p, UC_BREAK_PROHIBITED, n);
938
939  while (s < s_end)
940    {
941      unsigned int uc;
942      int count = u16_mbtouc (&uc, s, s_end - s);
943      int prop = lbrkprop_lookup (uc);
944
945      if (prop == LBP_BK)
946        {
947          /* Mandatory break.  */
948          *p = UC_BREAK_MANDATORY;
949          last_prop = LBP_BK;
950          seen_space = NULL;
951          seen_space2 = NULL;
952        }
953      else
954        {
955          char *q;
956
957          /* Resolve property values whose behaviour is not fixed.  */
958          switch (prop)
959            {
960              case LBP_AI:
961                /* Resolve ambiguous.  */
962                prop = LBP_AI_REPLACEMENT;
963                break;
964              case LBP_CB:
965                /* This is arbitrary.  */
966                prop = LBP_ID;
967                break;
968              case LBP_SA:
969                /* We don't handle complex scripts yet.
970                   Treat LBP_SA like LBP_XX.  */
971              case LBP_XX:
972                /* This is arbitrary.  */
973                prop = LBP_AL;
974                break;
975            }
976
977          /* Deal with combining characters.  */
978          q = p;
979          if (prop == LBP_CM)
980            {
981              /* Don't break just before a combining character.  */
982              *p = UC_BREAK_PROHIBITED;
983              /* A combining character turns a preceding space into LBP_AL.  */
984              if (seen_space != NULL)
985                {
986                  q = seen_space;
987                  seen_space = seen_space2;
988                  prop = LBP_AL;
989                  goto lookup_via_table;
990                }
991            }
992          else if (prop == LBP_SP)
993            {
994              /* Don't break just before a space.  */
995              *p = UC_BREAK_PROHIBITED;
996              seen_space2 = seen_space;
997              seen_space = p;
998            }
999          else
1000            {
1001             lookup_via_table:
1002              /* prop must be usable as an index for table 7.3 of UTR #14.  */
1003              if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
1004                abort ();
1005
1006              if (last_prop == LBP_BK)
1007                {
1008                  /* Don't break at the beginning of a line.  */
1009                  *q = UC_BREAK_PROHIBITED;
1010                }
1011              else
1012                {
1013                  switch (lbrk_table [last_prop-1] [prop-1])
1014                    {
1015                      case D:
1016                        *q = UC_BREAK_POSSIBLE;
1017                        break;
1018                      case I:
1019                        *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
1020                        break;
1021                      case P:
1022                        *q = UC_BREAK_PROHIBITED;
1023                        break;
1024                      default:
1025                        abort ();
1026                    }
1027                }
1028              last_prop = prop;
1029              seen_space = NULL;
1030              seen_space2 = NULL;
1031            }
1032        }
1033
1034      s += count;
1035      p += count;
1036    }
1037}
1038
1039void
1040u32_possible_linebreaks (s, n, encoding, p)
1041     const unsigned int *s;
1042     size_t n;
1043     const char *encoding;
1044     char *p;
1045{
1046  int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
1047  const unsigned int *s_end = s + n;
1048  int last_prop = LBP_BK; /* line break property of last non-space character */
1049  char *seen_space = NULL; /* Was a space seen after the last non-space character? */
1050  char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
1051
1052  while (s < s_end)
1053    {
1054      unsigned int uc = *s;
1055      int prop = lbrkprop_lookup (uc);
1056
1057      if (prop == LBP_BK)
1058        {
1059          /* Mandatory break.  */
1060          *p = UC_BREAK_MANDATORY;
1061          last_prop = LBP_BK;
1062          seen_space = NULL;
1063          seen_space2 = NULL;
1064        }
1065      else
1066        {
1067          char *q;
1068
1069          /* Resolve property values whose behaviour is not fixed.  */
1070          switch (prop)
1071            {
1072              case LBP_AI:
1073                /* Resolve ambiguous.  */
1074                prop = LBP_AI_REPLACEMENT;
1075                break;
1076              case LBP_CB:
1077                /* This is arbitrary.  */
1078                prop = LBP_ID;
1079                break;
1080              case LBP_SA:
1081                /* We don't handle complex scripts yet.
1082                   Treat LBP_SA like LBP_XX.  */
1083              case LBP_XX:
1084                /* This is arbitrary.  */
1085                prop = LBP_AL;
1086                break;
1087            }
1088
1089          /* Deal with combining characters.  */
1090          q = p;
1091          if (prop == LBP_CM)
1092            {
1093              /* Don't break just before a combining character.  */
1094              *p = UC_BREAK_PROHIBITED;
1095              /* A combining character turns a preceding space into LBP_AL.  */
1096              if (seen_space != NULL)
1097                {
1098                  q = seen_space;
1099                  seen_space = seen_space2;
1100                  prop = LBP_AL;
1101                  goto lookup_via_table;
1102                }
1103            }
1104          else if (prop == LBP_SP)
1105            {
1106              /* Don't break just before a space.  */
1107              *p = UC_BREAK_PROHIBITED;
1108              seen_space2 = seen_space;
1109              seen_space = p;
1110            }
1111          else
1112            {
1113             lookup_via_table:
1114              /* prop must be usable as an index for table 7.3 of UTR #14.  */
1115              if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
1116                abort ();
1117
1118              if (last_prop == LBP_BK)
1119                {
1120                  /* Don't break at the beginning of a line.  */
1121                  *q = UC_BREAK_PROHIBITED;
1122                }
1123              else
1124                {
1125                  switch (lbrk_table [last_prop-1] [prop-1])
1126                    {
1127                      case D:
1128                        *q = UC_BREAK_POSSIBLE;
1129                        break;
1130                      case I:
1131                        *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
1132                        break;
1133                      case P:
1134                        *q = UC_BREAK_PROHIBITED;
1135                        break;
1136                      default:
1137                        abort ();
1138                    }
1139                }
1140              last_prop = prop;
1141              seen_space = NULL;
1142              seen_space2 = NULL;
1143            }
1144        }
1145
1146      s++;
1147      p++;
1148    }
1149}
1150
1151#endif
1152
1153
1154/* Choose the best line breaks, assuming the uc_width function.
1155   Return the column after the end of the string.  */
1156
1157int
1158u8_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p)
1159     const unsigned char *s;
1160     size_t n;
1161     int width;
1162     int start_column;
1163     int at_end_columns;
1164     const char *o;
1165     const char *encoding;
1166     char *p;
1167{
1168  const unsigned char *s_end;
1169  char *last_p;
1170  int last_column;
1171  int piece_width;
1172
1173  u8_possible_linebreaks (s, n, encoding, p);
1174
1175  s_end = s + n;
1176  last_p = NULL;
1177  last_column = start_column;
1178  piece_width = 0;
1179  while (s < s_end)
1180    {
1181      unsigned int uc;
1182      int count = u8_mbtouc (&uc, s, s_end - s);
1183
1184      /* Respect the override.  */
1185      if (o != NULL && *o != UC_BREAK_UNDEFINED)
1186        *p = *o;
1187
1188      if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1189        {
1190          /* An atomic piece of text ends here.  */
1191          if (last_p != NULL && last_column + piece_width > width)
1192            {
1193              /* Insert a line break.  */
1194              *last_p = UC_BREAK_POSSIBLE;
1195              last_column = 0;
1196            }
1197        }
1198
1199      if (*p == UC_BREAK_MANDATORY)
1200        {
1201          /* uc is a line break character.  */
1202          /* Start a new piece at column 0.  */
1203          last_p = NULL;
1204          last_column = 0;
1205          piece_width = 0;
1206        }
1207      else
1208        {
1209          /* uc is not a line break character.  */
1210          int w;
1211
1212          if (*p == UC_BREAK_POSSIBLE)
1213            {
1214              /* Start a new piece.  */
1215              last_p = p;
1216              last_column += piece_width;
1217              piece_width = 0;
1218              /* No line break for the moment, may be turned into
1219                 UC_BREAK_POSSIBLE later, via last_p. */
1220            }
1221         
1222          *p = UC_BREAK_PROHIBITED;
1223
1224          w = uc_width (uc, encoding);
1225          if (w >= 0) /* ignore control characters in the string */
1226            piece_width += w;
1227         }
1228
1229      s += count;
1230      p += count;
1231      if (o != NULL)
1232        o += count;
1233    }
1234
1235  /* The last atomic piece of text ends here.  */
1236  if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1237    {
1238      /* Insert a line break.  */
1239      *last_p = UC_BREAK_POSSIBLE;
1240      last_column = 0;
1241    }
1242
1243  return last_column + piece_width;
1244}
1245
1246#ifdef unused
1247
1248int
1249u16_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p)
1250     const unsigned short *s;
1251     size_t n;
1252     int width;
1253     int start_column;
1254     int at_end_columns;
1255     const char *o;
1256     const char *encoding;
1257     char *p;
1258{
1259  const unsigned short *s_end;
1260  char *last_p;
1261  int last_column;
1262  int piece_width;
1263
1264  u16_possible_linebreaks (s, n, encoding, p);
1265
1266  s_end = s + n;
1267  last_p = NULL;
1268  last_column = start_column;
1269  piece_width = 0;
1270  while (s < s_end)
1271    {
1272      unsigned int uc;
1273      int count = u16_mbtouc (&uc, s, s_end - s);
1274
1275      /* Respect the override.  */
1276      if (o != NULL && *o != UC_BREAK_UNDEFINED)
1277        *p = *o;
1278
1279      if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1280        {
1281          /* An atomic piece of text ends here.  */
1282          if (last_p != NULL && last_column + piece_width > width)
1283            {
1284              /* Insert a line break.  */
1285              *last_p = UC_BREAK_POSSIBLE;
1286              last_column = 0;
1287            }
1288        }
1289
1290      if (*p == UC_BREAK_MANDATORY)
1291        {
1292          /* uc is a line break character.  */
1293          /* Start a new piece at column 0.  */
1294          last_p = NULL;
1295          last_column = 0;
1296          piece_width = 0;
1297        }
1298      else
1299        {
1300          /* uc is not a line break character.  */
1301          int w;
1302
1303          if (*p == UC_BREAK_POSSIBLE)
1304            {
1305              /* Start a new piece.  */
1306              last_p = p;
1307              last_column += piece_width;
1308              piece_width = 0;
1309              /* No line break for the moment, may be turned into
1310                 UC_BREAK_POSSIBLE later, via last_p. */
1311            }
1312         
1313          *p = UC_BREAK_PROHIBITED;
1314
1315          w = uc_width (uc, encoding);
1316          if (w >= 0) /* ignore control characters in the string */
1317            piece_width += w;
1318         }
1319
1320      s += count;
1321      p += count;
1322      if (o != NULL)
1323        o += count;
1324    }
1325
1326  /* The last atomic piece of text ends here.  */
1327  if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1328    {
1329      /* Insert a line break.  */
1330      *last_p = UC_BREAK_POSSIBLE;
1331      last_column = 0;
1332    }
1333
1334  return last_column + piece_width;
1335}
1336
1337int
1338u32_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p)
1339     const unsigned int *s;
1340     size_t n;
1341     int width;
1342     int start_column;
1343     int at_end_columns;
1344     const char *o;
1345     const char *encoding;
1346     char *p;
1347{
1348  const unsigned int *s_end;
1349  char *last_p;
1350  int last_column;
1351  int piece_width;
1352
1353  u32_possible_linebreaks (s, n, encoding, p);
1354
1355  s_end = s + n;
1356  last_p = NULL;
1357  last_column = start_column;
1358  piece_width = 0;
1359  while (s < s_end)
1360    {
1361      unsigned int uc = *s;
1362
1363      /* Respect the override.  */
1364      if (o != NULL && *o != UC_BREAK_UNDEFINED)
1365        *p = *o;
1366
1367      if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1368        {
1369          /* An atomic piece of text ends here.  */
1370          if (last_p != NULL && last_column + piece_width > width)
1371            {
1372              /* Insert a line break.  */
1373              *last_p = UC_BREAK_POSSIBLE;
1374              last_column = 0;
1375            }
1376        }
1377
1378      if (*p == UC_BREAK_MANDATORY)
1379        {
1380          /* uc is a line break character.  */
1381          /* Start a new piece at column 0.  */
1382          last_p = NULL;
1383          last_column = 0;
1384          piece_width = 0;
1385        }
1386      else
1387        {
1388          /* uc is not a line break character.  */
1389          int w;
1390
1391          if (*p == UC_BREAK_POSSIBLE)
1392            {
1393              /* Start a new piece.  */
1394              last_p = p;
1395              last_column += piece_width;
1396              piece_width = 0;
1397              /* No line break for the moment, may be turned into
1398                 UC_BREAK_POSSIBLE later, via last_p. */
1399            }
1400         
1401          *p = UC_BREAK_PROHIBITED;
1402
1403          w = uc_width (uc, encoding);
1404          if (w >= 0) /* ignore control characters in the string */
1405            piece_width += w;
1406         }
1407
1408      s++;
1409      p++;
1410      if (o != NULL)
1411        o++;
1412    }
1413
1414  /* The last atomic piece of text ends here.  */
1415  if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1416    {
1417      /* Insert a line break.  */
1418      *last_p = UC_BREAK_POSSIBLE;
1419      last_column = 0;
1420    }
1421
1422  return last_column + piece_width;
1423}
1424
1425#endif
1426
1427
1428#ifdef TEST1
1429
1430#include <stdio.h>
1431
/* Slurp everything that can be read from STREAM into a freshly allocated
   buffer and return it, terminated with a NUL byte.  The caller owns the
   buffer.  On allocation failure or read error a message is printed to
   stderr and the program exits with status 1.  */
char *
read_file (stream)
     FILE *stream;
{
#define BUFSIZE 4096
  char *data = NULL;
  int capacity = 0; /* bytes allocated for data */
  int used = 0; /* bytes of data filled so far */

  for (;;)
    {
      int got;

      if (feof (stream))
        break;

      /* Make room for one more full chunk, growing by at least 50%.  */
      if (capacity < used + BUFSIZE)
        {
          int grown = capacity + capacity / 2;

          capacity = (grown < used + BUFSIZE ? used + BUFSIZE : grown);
          data = realloc (data, capacity);
          if (data == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
        }

      got = fread (data + used, 1, BUFSIZE, stream);
      if (got != 0)
        used += got;
      else if (ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Trim to the exact size plus the terminator.  */
  data = realloc (data, used + 1);
  if (data == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  data[used] = '\0';
  return data;
#undef BUFSIZE
}
1480
1481int
1482main (argc, argv)
1483     int argc;
1484     char * argv[];
1485{
1486  if (argc == 1)
1487    {
1488      /* Display all the break opportunities in the input string.  */
1489      char *input = read_file (stdin);
1490      int length = strlen (input);
1491      char *breaks = malloc (length);
1492      int i;
1493
1494      u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1495
1496      for (i = 0; i < length; i++)
1497        {
1498          switch (breaks[i])
1499            {
1500              case UC_BREAK_POSSIBLE:
1501                /* U+2027 in UTF-8 encoding */
1502                putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1503                break;
1504              case UC_BREAK_MANDATORY:
1505                /* U+21B2 (or U+21B5) in UTF-8 encoding */
1506                putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1507                break;
1508              case UC_BREAK_PROHIBITED:
1509                break;
1510              default:
1511                abort ();
1512            }
1513          putc (input[i], stdout);
1514        }
1515
1516      free (breaks);
1517
1518      return 0;
1519    }
1520  else if (argc == 2)
1521    {
1522      /* Insert line breaks for a given width.  */
1523      int width = atoi (argv[1]);
1524      char *input = read_file (stdin);
1525      int length = strlen (input);
1526      char *breaks = malloc (length);
1527      int i;
1528
1529      u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1530
1531      for (i = 0; i < length; i++)
1532        {
1533          switch (breaks[i])
1534            {
1535              case UC_BREAK_POSSIBLE:
1536                putc ('\n', stdout);
1537                break;
1538              case UC_BREAK_MANDATORY:
1539                break;
1540              case UC_BREAK_PROHIBITED:
1541                break;
1542              default:
1543                abort ();
1544            }
1545          putc (input[i], stdout);
1546        }
1547
1548      free (breaks);
1549
1550      return 0;
1551    }
1552  else
1553    return 1;
1554}
1555
1556#endif /* TEST1 */
1557
1558
1559/* Now the same thing with an arbitrary encoding.
1560
1561   We convert the input string to Unicode.
1562
1563   The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1564   UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
1565   \U0000FFFF.  UTF-16 and variants support only characters up to
1566   \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
1567   UCS-4 specification leaves doubts about endianness and byte order mark.
1568   glibc currently interprets it as big endian without byte order mark,
1569   but this is not backed by an RFC.  So we use UTF-8. It supports
1570   characters up to \U7FFFFFFF and is unambiguously defined.  */
1571
1572#if HAVE_ICONV
1573
1574#include <iconv.h>
1575#include <errno.h>
1576
1577/* Luckily, the encoding's name is platform independent.  */
1578#define UTF8_NAME "UTF-8"
1579
/* Return the length, in bytes, of the string S of N bytes after conversion
   through the iconv_t CD, without keeping the converted text.
   Return (size_t)(-1) if the string cannot be converted (invalid or
   incomplete input, or any other iconv failure).  On success CD is back in
   its initial state, except on the platforms excluded below.  */
static size_t
iconv_string_length (cd, s, n)
     iconv_t cd;
     const char *s;
     size_t n;
{
#define TMPBUFSIZE 4096
  size_t count = 0;
  char tmpbuf[TMPBUFSIZE];
  const char *inptr = s;
  size_t insize = n;
  while (insize > 0)
    {
      /* Convert one scratch buffer's worth; only the byte count is kept.  */
      char *outptr = tmpbuf;
      size_t outsize = TMPBUFSIZE;
      size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
      /* E2BIG only says that tmpbuf is full while input remains; that is
         the normal case for long strings, so count what was produced and
         go around again.  Every other failure is a real conversion
         error.  */
      if (res == (size_t)(-1) && errno != E2BIG)
        return (size_t)(-1);
      count += outptr - tmpbuf;
    }
  /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  {
    /* Account for bytes emitted when flushing the shift state.  */
    char *outptr = tmpbuf;
    size_t outsize = TMPBUFSIZE;
    size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
    if (res == (size_t)(-1))
      return (size_t)(-1);
    count += outptr - tmpbuf;
  }
  /* Return to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
#endif
  return count;
#undef TMPBUFSIZE
}
1618
/* Convert the string S of N bytes through the iconv_t CD into the buffer T,
   which must have room for exactly M bytes (M as computed beforehand for
   the same input), and record the correspondence between both strings:
   for every input offset i at which a conversion step begins,
   OFFTABLE[i] is the offset in T where its output starts; all other
   entries of OFFTABLE[0..N-1] are (size_t)(-1).
   Aborts if the conversion fails or does not produce exactly M bytes.  */
static void
iconv_string_keeping_offsets (cd, s, n, offtable, t, m)
     iconv_t cd;
     const char *s;
     size_t n;
     size_t *offtable;
     char *t;
     size_t m;
{
  /* Avoid glibc-2.1 bug.  */
#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
  const size_t extra = 1;
#else
  const size_t extra = 0;
#endif
  const char *src = s;
  const char *src_end = s + n;
  char *dst = t;
  size_t room = m + extra;
  size_t k;

  /* Start with "no character begins here" everywhere.  */
  k = n;
  while (k > 0)
    offtable[--k] = (size_t)(-1);

  while (src < src_end)
    {
      size_t chunk = 1;
      size_t rc = (size_t)(-1);

      offtable[src - s] = dst - t;

      /* Feed one more byte at a time for as long as iconv reports the
         input as incomplete (EINVAL).  */
      while (src + chunk <= src_end)
        {
          rc = iconv (cd, (ICONV_CONST char **) &src, &chunk, &dst, &room);
          if (rc != (size_t)(-1) || errno != EINVAL)
            break;
          chunk++;
        }

      /* The caller verified the convertibility and computed the size M of
         the translation, so there shouldn't be any conversion error.  */
      if (rc == (size_t)(-1))
        abort ();
    }

  /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  if (iconv (cd, NULL, NULL, &dst, &room) == (size_t)(-1))
    abort ();
#endif

  /* We should have produced exactly m output bytes.  */
  if (room != extra)
    abort ();
}
1676
1677#endif /* HAVE_ICONV */
1678
1679#if C_CTYPE_ASCII
1680
/* Tests whether the string S of N bytes is entirely ASCII, that is,
   whether every byte satisfies c_isprint or c_isspace.  Returns 1 if yes.
   Returns 0 if the string is in an 8-bit encoding or an ISO-2022
   encoding.  */
static int
is_all_ascii (s, n)
     const char *s;
     size_t n;
{
  const char *end = s + n;

  while (s < end)
    {
      unsigned char c = (unsigned char) *s++;

      if (c_isprint (c))
        continue;
      if (c_isspace (c))
        continue;
      /* Neither printable nor whitespace.  */
      return 0;
    }
  return 1;
}
1697
1698#endif /* C_CTYPE_ASCII */
1699
1700#if defined unused || defined TEST2
1701
1702void
1703mbs_possible_linebreaks (s, n, encoding, p)
1704     const char *s;
1705     size_t n;
1706     const char *encoding;
1707     char *p;
1708{
1709  if (is_utf8_encoding (encoding))
1710    u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1711  else
1712    {
1713#if HAVE_ICONV
1714      iconv_t to_utf8;
1715      /* Avoid glibc-2.1 bug with EUC-KR.  */
1716# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1717      if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1718        to_utf8 = (iconv_t)(-1);
1719      else
1720# endif
1721      to_utf8 = iconv_open (UTF8_NAME, encoding);
1722      if (to_utf8 != (iconv_t)(-1))
1723        {
1724          /* Determine the length of the resulting UTF-8 string.  */
1725          size_t m = iconv_string_length (to_utf8, s, n);
1726          if (m != (size_t)(-1))
1727            {
1728              /* Convert the string to UTF-8 and build a translation table
1729                 from offsets into s to offsets into the translated string.  */
1730              char *memory = malloc (n * sizeof (size_t) + m + m);
1731              if (memory != NULL)
1732                {
1733                  size_t *offtable = (size_t *) memory;
1734                  char *t = (char *) (offtable + n);
1735                  char *q = (char *) (t + m);
1736                  size_t i;
1737
1738                  iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1739
1740                  /* Determine the possible line breaks of the UTF-8 string.  */
1741                  u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1742
1743                  /* Translate the result back to the original string.  */
1744                  memset (p, UC_BREAK_PROHIBITED, n);
1745                  for (i = 0; i < n; i++)
1746                    if (offtable[i] != (size_t)(-1))
1747                      p[i] = q[offtable[i]];
1748
1749                  free (memory);
1750                  iconv_close (to_utf8);
1751                  return;
1752                }
1753            }
1754          iconv_close (to_utf8);
1755        }
1756#endif
1757      /* Impossible to convert.  */
1758#if C_CTYPE_ASCII
1759      if (is_all_ascii (s, n))
1760        {
1761          /* ASCII is a subset of UTF-8.  */
1762          u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1763          return;
1764        }
1765#endif
1766      /* We have a non-ASCII string and cannot convert it.
1767         Don't produce line breaks except those already present in the
1768         input string.  All we assume here is that the encoding is
1769         minimally ASCII compatible.  */
1770      {
1771        const char *s_end = s + n;
1772        while (s < s_end)
1773          {
1774            *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1775            s++;
1776            p++;
1777          }
1778      }
1779    }
1780}
1781
1782#endif
1783
1784int
1785mbs_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p)
1786     const char *s;
1787     size_t n;
1788     int width;
1789     int start_column;
1790     int at_end_columns;
1791     const char *o;
1792     const char *encoding;
1793     char *p;
1794{
1795  if (is_utf8_encoding (encoding))
1796    return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1797  else
1798    {
1799#if HAVE_ICONV
1800      iconv_t to_utf8;
1801      /* Avoid glibc-2.1 bug with EUC-KR.  */
1802# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1803      if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1804        to_utf8 = (iconv_t)(-1);
1805      else
1806# endif
1807      to_utf8 = iconv_open (UTF8_NAME, encoding);
1808      if (to_utf8 != (iconv_t)(-1))
1809        {
1810          /* Determine the length of the resulting UTF-8 string.  */
1811          size_t m = iconv_string_length (to_utf8, s, n);
1812          if (m != (size_t)(-1))
1813            {
1814              /* Convert the string to UTF-8 and build a translation table
1815                 from offsets into s to offsets into the translated string.  */
1816              char *memory = malloc (n * sizeof (size_t) + m + m + (o != NULL ? m : 0));
1817              if (memory != NULL)
1818                {
1819                  size_t *offtable = (size_t *) memory;
1820                  char *t = (char *) (offtable + n);
1821                  char *q = (char *) (t + m);
1822                  char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1823                  int res_column;
1824                  size_t i;
1825
1826                  iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1827
1828                  /* Translate the overrides to the UTF-8 string.  */
1829                  if (o != NULL)
1830                    {
1831                      memset (o8, UC_BREAK_UNDEFINED, m);
1832                      for (i = 0; i < n; i++)
1833                        if (offtable[i] != (size_t)(-1))
1834                          o8[offtable[i]] = o[i];
1835                    }
1836
1837                  /* Determine the line breaks of the UTF-8 string.  */
1838                  res_column =
1839                    u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1840
1841                  /* Translate the result back to the original string.  */
1842                  memset (p, UC_BREAK_PROHIBITED, n);
1843                  for (i = 0; i < n; i++)
1844                    if (offtable[i] != (size_t)(-1))
1845                      p[i] = q[offtable[i]];
1846
1847                  free (memory);
1848                  iconv_close (to_utf8);
1849                  return res_column;
1850                }
1851            }
1852          iconv_close (to_utf8);
1853        }
1854#endif
1855      /* Impossible to convert.  */
1856#if C_CTYPE_ASCII
1857      if (is_all_ascii (s, n))
1858        {
1859          /* ASCII is a subset of UTF-8.  */
1860          return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1861        }
1862#endif
1863      /* We have a non-ASCII string and cannot convert it.
1864         Don't produce line breaks except those already present in the
1865         input string.  All we assume here is that the encoding is
1866         minimally ASCII compatible.  */
1867      {
1868        const char *s_end = s + n;
1869        while (s < s_end)
1870          {
1871            *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1872                  ? UC_BREAK_MANDATORY
1873                  : UC_BREAK_PROHIBITED);
1874            s++;
1875            p++;
1876            if (o != NULL)
1877              o++;
1878          }
1879        /* We cannot compute widths in this case.  */
1880        return start_column;
1881      }
1882    }
1883}
1884
1885
1886#ifdef TEST2
1887
1888#include <stdio.h>
1889#include <locale.h>
1890
/* Read everything that remains on STREAM into a freshly allocated buffer
   and return it, terminated with a NUL byte.  The caller owns the result
   and releases it with free().
   On a read error, report it with perror ("fread") and exit with status 1.
   When memory runs out, print "out of memory" to stderr and exit with
   status 1.  The function therefore never returns NULL.  */
char *
read_file (stream)
     FILE *stream;
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;   /* bytes currently allocated for buf */
  size_t size = 0;    /* bytes of buf filled so far; always <= alloc */
  size_t count;
  char *tmp;

  while (! feof (stream))
    {
      /* Make sure there is room for one more full chunk.  */
      if (alloc - size < BUFSIZE)
        {
          /* Grow by a factor of 1.5, but at least enough for one chunk.  */
          size_t grown = alloc + alloc / 2;

          /* size + BUFSIZE must be representable as a size_t.  */
          if (size > (size_t)(-1) - BUFSIZE)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          /* The first test catches wrap-around of the 1.5x computation.  */
          if (grown < alloc || grown < size + BUFSIZE)
            grown = size + BUFSIZE;

          /* Keep the old pointer valid until realloc has succeeded.  */
          tmp = realloc (buf, grown);
          if (tmp == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = tmp;
          alloc = grown;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      /* A short read is either end-of-file or an error.  Check for the
         error right away, so that a partially failed read is not silently
         followed by more data.  */
      if (count < BUFSIZE && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
      size += count;
    }

  /* Resize to the exact length plus room for the terminating NUL.  This
     also performs the initial allocation when nothing was read.  */
  tmp = realloc (buf, size + 1);
  if (tmp == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = tmp;
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1939
1940int
1941main (argc, argv)
1942     int argc;
1943     char * argv[];
1944{
1945  setlocale (LC_CTYPE, "");
1946  if (argc == 1)
1947    {
1948      /* Display all the break opportunities in the input string.  */
1949      char *input = read_file (stdin);
1950      int length = strlen (input);
1951      char *breaks = malloc (length);
1952      int i;
1953
1954      mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1955
1956      for (i = 0; i < length; i++)
1957        {
1958          switch (breaks[i])
1959            {
1960              case UC_BREAK_POSSIBLE:
1961                putc ('|', stdout);
1962                break;
1963              case UC_BREAK_MANDATORY:
1964                break;
1965              case UC_BREAK_PROHIBITED:
1966                break;
1967              default:
1968                abort ();
1969            }
1970          putc (input[i], stdout);
1971        }
1972
1973      free (breaks);
1974
1975      return 0;
1976    }
1977  else if (argc == 2)
1978    {
1979      /* Insert line breaks for a given width.  */
1980      int width = atoi (argv[1]);
1981      char *input = read_file (stdin);
1982      int length = strlen (input);
1983      char *breaks = malloc (length);
1984      int i;
1985
1986      mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1987
1988      for (i = 0; i < length; i++)
1989        {
1990          switch (breaks[i])
1991            {
1992              case UC_BREAK_POSSIBLE:
1993                putc ('\n', stdout);
1994                break;
1995              case UC_BREAK_MANDATORY:
1996                break;
1997              case UC_BREAK_PROHIBITED:
1998                break;
1999              default:
2000                abort ();
2001            }
2002          putc (input[i], stdout);
2003        }
2004
2005      free (breaks);
2006
2007      return 0;
2008    }
2009  else
2010    return 1;
2011}
2012
2013#endif /* TEST2 */
Note: See TracBrowser for help on using the repository browser.