1 | /* linebreak.c - line breaking of Unicode strings |
---|
2 | Copyright (C) 2001 Free Software Foundation, Inc. |
---|
3 | Written by Bruno Haible <haible@clisp.cons.org>, 2001. |
---|
4 | |
---|
5 | This program is free software; you can redistribute it and/or modify |
---|
6 | it under the terms of the GNU General Public License as published by |
---|
7 | the Free Software Foundation; either version 2, or (at your option) |
---|
8 | any later version. |
---|
9 | |
---|
10 | This program is distributed in the hope that it will be useful, |
---|
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
13 | GNU General Public License for more details. |
---|
14 | |
---|
15 | You should have received a copy of the GNU General Public License |
---|
16 | along with this program; if not, write to the Free Software |
---|
17 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ |
---|
18 | |
---|
19 | #ifdef HAVE_CONFIG_H |
---|
20 | # include <config.h> |
---|
21 | #endif |
---|
22 | |
---|
23 | #include <stddef.h> |
---|
24 | #include <string.h> |
---|
25 | #include "linebreak.h" |
---|
26 | #include "c-ctype.h" |
---|
27 | |
---|
28 | |
---|
/* Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC.

   This is the slow path used by u8_mbtouc for a first byte >= 0x80.
   S must point to at least one readable byte and N is the number of
   bytes available starting at S.

   Return values:
     2, 3 or 4  a well-formed multibyte sequence was decoded into *PUC;
     N          the lead byte announces more bytes than are available
                (incomplete character); *PUC is set to U+FFFD;
     1          the sequence is invalid; *PUC is set to U+FFFD.

   Overlong forms, encodings of the surrogates U+D800..U+DFFF and of
   values above U+10FFFF are treated as invalid.  The obsolete 5- and
   6-byte forms are not accepted either.  */
static int
u8_mbtouc_aux (puc, s, n)
     unsigned int *puc;
     const unsigned char *s;
     size_t n;
{
  unsigned char c = *s;

  if (c >= 0xc2 && c < 0xe0)
    {
      /* Two-byte form, U+0080..U+07FF.  0xc0 and 0xc1 would be overlong.  */
      if (n < 2)
        goto incomplete;
      if ((s[1] ^ 0x80) < 0x40)
        {
          *puc = ((unsigned int) (c & 0x1f) << 6)
                 | (unsigned int) (s[1] ^ 0x80);
          return 2;
        }
    }
  else if (c >= 0xe0 && c < 0xf0)
    {
      /* Three-byte form, U+0800..U+FFFF.  */
      if (n < 3)
        goto incomplete;
      if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
          /* Reject overlong encodings of values below U+0800.  */
          && (c >= 0xe1 || s[1] >= 0xa0)
          /* Reject the surrogate range U+D800..U+DFFF.  */
          && (c != 0xed || s[1] < 0xa0))
        {
          *puc = ((unsigned int) (c & 0x0f) << 12)
                 | ((unsigned int) (s[1] ^ 0x80) << 6)
                 | (unsigned int) (s[2] ^ 0x80);
          return 3;
        }
    }
  else if (c >= 0xf0 && c < 0xf8)
    {
      /* Four-byte form, U+10000..U+10FFFF.  */
      if (n < 4)
        goto incomplete;
      if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
          && (s[3] ^ 0x80) < 0x40
          /* Reject overlong encodings of values below U+10000.  */
          && (c >= 0xf1 || s[1] >= 0x90)
          /* Reject values above U+10FFFF.  */
          && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
        {
          *puc = ((unsigned int) (c & 0x07) << 18)
                 | ((unsigned int) (s[1] ^ 0x80) << 12)
                 | ((unsigned int) (s[2] ^ 0x80) << 6)
                 | (unsigned int) (s[3] ^ 0x80);
          return 4;
        }
    }

  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;

 incomplete:
  /* incomplete multibyte character: consume everything that is left */
  *puc = 0xfffd;
  return (int) n;
}
---|
/* Decode the first character of the UTF-8 string S, of which N bytes are
   available, into *PUC and return the number of bytes consumed.
   ASCII bytes are handled inline; every other lead byte is passed on to
   u8_mbtouc_aux.  S must point to at least one readable byte.  */
static inline int
u8_mbtouc (puc, s, n)
     unsigned int *puc;
     const unsigned char *s;
     size_t n;
{
  unsigned char c = *s;

  if (c >= 0x80)
    return u8_mbtouc_aux (puc, s, n);

  /* Fast path: a single ASCII byte is its own code point.  */
  *puc = c;
  return 1;
}
---|
177 | |
---|
178 | #ifdef unused |
---|
/* Decode the UTF-16 character starting at S when its first unit is a
   surrogate; N is the number of 16-bit units available at S.

   Return values:
     2  a high surrogate followed by a low surrogate was combined into
        *PUC;
     N  the first unit is below 0xdc00 but fewer than two units are
        available (incomplete character); *PUC is set to U+FFFD;
     1  invalid sequence (first unit >= 0xdc00, or second unit is not a
        low surrogate); *PUC is set to U+FFFD.

   The caller u16_mbtouc only gets here for units in 0xd800..0xdfff, so
   "below 0xdc00" means a high surrogate.  */
static int
u16_mbtouc_aux (puc, s, n)
     unsigned int *puc;
     const unsigned short *s;
     size_t n;
{
  unsigned short c = *s;

  if (c < 0xdc00)
    {
      if (n < 2)
        {
          /* incomplete multibyte character */
          *puc = 0xfffd;
          return n;
        }
      if (s[1] >= 0xdc00 && s[1] < 0xe000)
        {
          *puc = 0x10000 + ((c - 0xd800) << 10) + (s[1] - 0xdc00);
          return 2;
        }
    }

  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}
---|
/* Decode the first character of the UTF-16 string S, of which N units are
   available, into *PUC and return the number of units consumed.
   Units outside 0xd800..0xdfff are code points by themselves; surrogates
   are passed on to u16_mbtouc_aux.  */
static inline int
u16_mbtouc (puc, s, n)
     unsigned int *puc;
     const unsigned short *s;
     size_t n;
{
  unsigned short c = *s;

  if (c >= 0xd800 && c < 0xe000)
    return u16_mbtouc_aux (puc, s, n);

  *puc = c;
  return 1;
}
---|
225 | |
---|
/* Store the first unit of the UTF-32 string S in *PUC and return 1, the
   number of units consumed.  N is accepted for symmetry with u8_mbtouc
   and u16_mbtouc but is not used; no validation is performed.  */
static inline int
u32_mbtouc (puc, s, n)
     unsigned int *puc;
     const unsigned int *s;
     size_t n;
{
  *puc = *s;
  return 1;
}
---|
235 | #endif |
---|
236 | |
---|
237 | |
---|
/* Help GCC to generate good code for string comparisons with
   immediate strings.

   STREQ (S1, S2, C0, ..., C8) tests whether the strings S1 and S2 are
   equal.  C0..C8 must be the first nine characters of S2, padded with 0
   once S2 has ended.  With an optimizing GCC the comparison is unrolled
   into character tests: streqK compares S1[K] with the K-th character of
   S2, stops successfully at the terminating NUL and otherwise continues
   with position K+1.  Anything beyond the ninth character is compared
   with strcmp.  Other compilers simply use strcmp.  */
#if defined (__GNUC__) && defined (__OPTIMIZE__)

static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}

static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  if (s1[8] != s28)
    return 0;
  if (s28 == 0)
    return 1;
  return streq9 (s1, s2);
}

static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  if (s1[7] != s27)
    return 0;
  if (s27 == 0)
    return 1;
  return streq8 (s1, s2, s28);
}

static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  if (s1[6] != s26)
    return 0;
  if (s26 == 0)
    return 1;
  return streq7 (s1, s2, s27, s28);
}

static inline int
streq5 (const char *s1, const char *s2,
        char s25, char s26, char s27, char s28)
{
  if (s1[5] != s25)
    return 0;
  if (s25 == 0)
    return 1;
  return streq6 (s1, s2, s26, s27, s28);
}

static inline int
streq4 (const char *s1, const char *s2,
        char s24, char s25, char s26, char s27, char s28)
{
  if (s1[4] != s24)
    return 0;
  if (s24 == 0)
    return 1;
  return streq5 (s1, s2, s25, s26, s27, s28);
}

static inline int
streq3 (const char *s1, const char *s2,
        char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[3] != s23)
    return 0;
  if (s23 == 0)
    return 1;
  return streq4 (s1, s2, s24, s25, s26, s27, s28);
}

static inline int
streq2 (const char *s1, const char *s2,
        char s22, char s23, char s24, char s25, char s26, char s27,
        char s28)
{
  if (s1[2] != s22)
    return 0;
  if (s22 == 0)
    return 1;
  return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
}

static inline int
streq1 (const char *s1, const char *s2,
        char s21, char s22, char s23, char s24, char s25, char s26,
        char s27, char s28)
{
  if (s1[1] != s21)
    return 0;
  if (s21 == 0)
    return 1;
  return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
}

static inline int
streq0 (const char *s1, const char *s2,
        char s20, char s21, char s22, char s23, char s24, char s25,
        char s26, char s27, char s28)
{
  if (s1[0] != s20)
    return 0;
  if (s20 == 0)
    return 1;
  return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
}

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)

#else

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp ((s1), (s2)) == 0)

#endif
---|
383 | |
---|
384 | |
---|
/* Return 1 if ENCODING is exactly one of the legacy Japanese, Chinese or
   Korean encoding names listed below, 0 otherwise.  The comparison is
   case-sensitive.  */
static int
is_cjk_encoding (encoding)
     const char *encoding;
{
  if (0
      /* Legacy Japanese encodings */
      || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
      /* Legacy Chinese encodings */
      || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
      || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
      || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
      || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
      /* Legacy Korean encodings */
      || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
      || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
      || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
    return 1;
  return 0;
}
---|
404 | |
---|
/* Return 1 if ENCODING is exactly the string "UTF-8", 0 otherwise.
   The comparison is case-sensitive.  */
static int
is_utf8_encoding (encoding)
     const char *encoding;
{
  if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
    return 1;
  return 0;
}
---|
413 | |
---|
414 | |
---|
415 | /* Determine number of column positions required for UC. */ |
---|
416 | int uc_width PARAMS ((unsigned int uc, const char *encoding)); |
---|
417 | |
---|
/*
 * Non-spacing attribute table.
 * See PropList.txt, or grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt
 * Control characters are also marked non-spacing here, because they are not
 * printable.  Zero width characters are also marked non-spacing here.
 *
 * Layout: 15 blocks of 64 bytes.  Each block is a bitmap covering 512
 * consecutive code points, least significant bit first, so that the bit
 * for code point UC within its block is bit (UC & 7) of byte
 * ((UC >> 3) & 63).  The block number for a given UC comes from
 * nonspacing_table_ind[UC >> 9].
 */
static const unsigned char nonspacing_table_data[15*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0x7f, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0c80-0x0cbf */
  0x40, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e  /* 0xffc0-0xffff */
};
---|
/* Index into nonspacing_table_data, one entry per run of 512 code points
   (entry number UC >> 9, for UC below 0x10000).  A value K >= 0 selects
   the 64-byte block starting at nonspacing_table_data[64*K]; -1 means
   that the run has no bitmap block, which uc_width treats as "no
   non-spacing character in this run".  */
static const signed char nonspacing_table_ind[128] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14  /* 0xf000-0xffff */
};
---|
579 | |
---|
580 | /* Determine number of column positions required for UC. */ |
---|
581 | int |
---|
582 | uc_width (uc, encoding) |
---|
583 | unsigned int uc; |
---|
584 | const char *encoding; |
---|
585 | { |
---|
586 | /* Test for non-spacing or control character. */ |
---|
587 | if ((uc >> 9) < 128) |
---|
588 | { |
---|
589 | int ind = nonspacing_table_ind[uc >> 9]; |
---|
590 | if (ind >= 0) |
---|
591 | if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1) |
---|
592 | { |
---|
593 | if (uc > 0 && uc < 0x100) |
---|
594 | return -1; |
---|
595 | else |
---|
596 | return 0; |
---|
597 | } |
---|
598 | } |
---|
599 | /* Test for double-width character. |
---|
600 | * Generated from "grep '^....;[WF]' EastAsianWidth.txt" |
---|
601 | * and "grep '^....;[^WF]' EastAsianWidth.txt" |
---|
602 | */ |
---|
603 | if (uc >= 0x1100 |
---|
604 | && ((uc < 0x1160) /* Hangul Jamo */ |
---|
605 | || (uc >= 0x2e80 && uc < 0xa4d0 /* CJK ... Yi */ |
---|
606 | && !((uc & ~0x0011) == 0x300a || uc == 0x303f)) |
---|
607 | || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */ |
---|
608 | || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */ |
---|
609 | || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */ |
---|
610 | || (uc >= 0xff00 && uc < 0xff60) /* Fullwidth Forms */ |
---|
611 | || (uc >= 0xffe0 && uc < 0xffe7))) |
---|
612 | return 2; |
---|
613 | /* In ancient CJK encodings, Cyrillic and most other characters are |
---|
614 | double-width as well. */ |
---|
615 | if (uc >= 0x00A1 && uc < 0xFF60 && uc != 0x20A9 |
---|
616 | && is_cjk_encoding (encoding)) |
---|
617 | return 2; |
---|
618 | return 1; |
---|
619 | } |
---|
620 | |
---|
621 | |
---|
622 | #ifdef unused |
---|
623 | |
---|
/* Determine number of column positions required for first N units
   (or fewer if S ends before this) in S.

   S is a UTF-8 string; scanning stops early at a NUL character.
   Characters for which uc_width returns a negative value (control
   characters) do not contribute to the result.  */

int
u8_width (s, n, encoding)
     const unsigned char *s;
     size_t n;
     const char *encoding;
{
  const unsigned char *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      /* u8_mbtouc has no prototype, so convert the count explicitly.  */
      s += u8_mbtouc (&uc, s, (size_t) (s_end - s));

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
---|
653 | |
---|
/* Determine number of column positions required for the first N units
   (or fewer if S ends before this) of the UTF-16 string S.
   Scanning stops early at a NUL character.  Characters for which
   uc_width returns a negative value (control characters) do not
   contribute to the result.  */
int
u16_width (s, n, encoding)
     const unsigned short *s;
     size_t n;
     const char *encoding;
{
  const unsigned short *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      /* u16_mbtouc has no prototype, so convert the count explicitly.  */
      s += u16_mbtouc (&uc, s, (size_t) (s_end - s));

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
---|
680 | |
---|
/* Determine number of column positions required for the first N units
   (or fewer if S ends before this) of the UTF-32 string S.
   Scanning stops early at a NUL character.  Characters for which
   uc_width returns a negative value (control characters) do not
   contribute to the result.  */
int
u32_width (s, n, encoding)
     const unsigned int *s;
     size_t n;
     const char *encoding;
{
  const unsigned int *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc = *s++;
      int w;

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
---|
705 | |
---|
706 | #endif |
---|
707 | |
---|
708 | |
---|
709 | /* Determine the line break points in S, and store the result at p[0..n-1]. */ |
---|
710 | /* We don't support line breaking of complex-context dependent characters |
---|
711 | (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */ |
---|
712 | |
---|
/* Line breaking classification.

   The classes numbered 1..19 index the rows and columns of lbrk_table
   (as class - 1).  LBP_BK and the classes numbered 20 and above have no
   table entry.  */

enum
{
  /* Values >= 20 are resolved at run time. */
  LBP_BK =  0, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 20, /* attached characters and combining marks */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SP = 21, /* space */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
---|
748 | |
---|
749 | #include "lbrkprop.h" |
---|
750 | |
---|
751 | static inline unsigned char |
---|
752 | lbrkprop_lookup (uc) |
---|
753 | unsigned int uc; |
---|
754 | { |
---|
755 | unsigned int index1 = uc >> lbrkprop_header_0; |
---|
756 | if (index1 < lbrkprop_header_1) |
---|
757 | { |
---|
758 | int lookup1 = lbrkprop.level1[index1]; |
---|
759 | if (lookup1 >= 0) |
---|
760 | { |
---|
761 | unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3; |
---|
762 | int lookup2 = lbrkprop.level2[lookup1 + index2]; |
---|
763 | if (lookup2 >= 0) |
---|
764 | { |
---|
765 | unsigned int index3 = uc & lbrkprop_header_4; |
---|
766 | return lbrkprop.level3[lookup2 + index3]; |
---|
767 | } |
---|
768 | } |
---|
769 | } |
---|
770 | return LBP_XX; |
---|
771 | } |
---|
772 | |
---|
/* Table indexed by two line breaking classifications.
   The first index is the class of the character before the potential
   break, the second the class of the character after it, each taken
   from the classes ZW..PO and reduced by one.  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
                                /* after */
        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* "" */
/* before */
};
/* Note: The (B2,B2) entry should probably be D instead of P.  */
/* Note: The (PR,ID) entry should probably be D instead of I.  */
---|
804 | |
---|
805 | void |
---|
806 | u8_possible_linebreaks (s, n, encoding, p) |
---|
807 | const unsigned char *s; |
---|
808 | size_t n; |
---|
809 | const char *encoding; |
---|
810 | char *p; |
---|
811 | { |
---|
812 | int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); |
---|
813 | const unsigned char *s_end = s + n; |
---|
814 | int last_prop = LBP_BK; /* line break property of last non-space character */ |
---|
815 | char *seen_space = NULL; /* Was a space seen after the last non-space character? */ |
---|
816 | char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ |
---|
817 | |
---|
818 | /* Don't break inside multibyte characters. */ |
---|
819 | memset (p, UC_BREAK_PROHIBITED, n); |
---|
820 | |
---|
821 | while (s < s_end) |
---|
822 | { |
---|
823 | unsigned int uc; |
---|
824 | int count = u8_mbtouc (&uc, s, s_end - s); |
---|
825 | int prop = lbrkprop_lookup (uc); |
---|
826 | |
---|
827 | if (prop == LBP_BK) |
---|
828 | { |
---|
829 | /* Mandatory break. */ |
---|
830 | *p = UC_BREAK_MANDATORY; |
---|
831 | last_prop = LBP_BK; |
---|
832 | seen_space = NULL; |
---|
833 | seen_space2 = NULL; |
---|
834 | } |
---|
835 | else |
---|
836 | { |
---|
837 | char *q; |
---|
838 | |
---|
839 | /* Resolve property values whose behaviour is not fixed. */ |
---|
840 | switch (prop) |
---|
841 | { |
---|
842 | case LBP_AI: |
---|
843 | /* Resolve ambiguous. */ |
---|
844 | prop = LBP_AI_REPLACEMENT; |
---|
845 | break; |
---|
846 | case LBP_CB: |
---|
847 | /* This is arbitrary. */ |
---|
848 | prop = LBP_ID; |
---|
849 | break; |
---|
850 | case LBP_SA: |
---|
851 | /* We don't handle complex scripts yet. |
---|
852 | Treat LBP_SA like LBP_XX. */ |
---|
853 | case LBP_XX: |
---|
854 | /* This is arbitrary. */ |
---|
855 | prop = LBP_AL; |
---|
856 | break; |
---|
857 | } |
---|
858 | |
---|
859 | /* Deal with combining characters. */ |
---|
860 | q = p; |
---|
861 | if (prop == LBP_CM) |
---|
862 | { |
---|
863 | /* Don't break just before a combining character. */ |
---|
864 | *p = UC_BREAK_PROHIBITED; |
---|
865 | /* A combining character turns a preceding space into LBP_AL. */ |
---|
866 | if (seen_space != NULL) |
---|
867 | { |
---|
868 | q = seen_space; |
---|
869 | seen_space = seen_space2; |
---|
870 | prop = LBP_AL; |
---|
871 | goto lookup_via_table; |
---|
872 | } |
---|
873 | } |
---|
874 | else if (prop == LBP_SP) |
---|
875 | { |
---|
876 | /* Don't break just before a space. */ |
---|
877 | *p = UC_BREAK_PROHIBITED; |
---|
878 | seen_space2 = seen_space; |
---|
879 | seen_space = p; |
---|
880 | } |
---|
881 | else |
---|
882 | { |
---|
883 | lookup_via_table: |
---|
884 | /* prop must be usable as an index for table 7.3 of UTR #14. */ |
---|
885 | if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) |
---|
886 | abort (); |
---|
887 | |
---|
888 | if (last_prop == LBP_BK) |
---|
889 | { |
---|
890 | /* Don't break at the beginning of a line. */ |
---|
891 | *q = UC_BREAK_PROHIBITED; |
---|
892 | } |
---|
893 | else |
---|
894 | { |
---|
895 | switch (lbrk_table [last_prop-1] [prop-1]) |
---|
896 | { |
---|
897 | case D: |
---|
898 | *q = UC_BREAK_POSSIBLE; |
---|
899 | break; |
---|
900 | case I: |
---|
901 | *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); |
---|
902 | break; |
---|
903 | case P: |
---|
904 | *q = UC_BREAK_PROHIBITED; |
---|
905 | break; |
---|
906 | default: |
---|
907 | abort (); |
---|
908 | } |
---|
909 | } |
---|
910 | last_prop = prop; |
---|
911 | seen_space = NULL; |
---|
912 | seen_space2 = NULL; |
---|
913 | } |
---|
914 | } |
---|
915 | |
---|
916 | s += count; |
---|
917 | p += count; |
---|
918 | } |
---|
919 | } |
---|
920 | |
---|
921 | #ifdef unused |
---|
922 | |
---|
923 | void |
---|
924 | u16_possible_linebreaks (s, n, encoding, p) |
---|
925 | const unsigned short *s; |
---|
926 | size_t n; |
---|
927 | const char *encoding; |
---|
928 | char *p; |
---|
929 | { |
---|
930 | int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); |
---|
931 | const unsigned short *s_end = s + n; |
---|
932 | int last_prop = LBP_BK; /* line break property of last non-space character */ |
---|
933 | char *seen_space = NULL; /* Was a space seen after the last non-space character? */ |
---|
934 | char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ |
---|
935 | |
---|
936 | /* Don't break inside multibyte characters. */ |
---|
937 | memset (p, UC_BREAK_PROHIBITED, n); |
---|
938 | |
---|
939 | while (s < s_end) |
---|
940 | { |
---|
941 | unsigned int uc; |
---|
942 | int count = u16_mbtouc (&uc, s, s_end - s); |
---|
943 | int prop = lbrkprop_lookup (uc); |
---|
944 | |
---|
945 | if (prop == LBP_BK) |
---|
946 | { |
---|
947 | /* Mandatory break. */ |
---|
948 | *p = UC_BREAK_MANDATORY; |
---|
949 | last_prop = LBP_BK; |
---|
950 | seen_space = NULL; |
---|
951 | seen_space2 = NULL; |
---|
952 | } |
---|
953 | else |
---|
954 | { |
---|
955 | char *q; |
---|
956 | |
---|
957 | /* Resolve property values whose behaviour is not fixed. */ |
---|
958 | switch (prop) |
---|
959 | { |
---|
960 | case LBP_AI: |
---|
961 | /* Resolve ambiguous. */ |
---|
962 | prop = LBP_AI_REPLACEMENT; |
---|
963 | break; |
---|
964 | case LBP_CB: |
---|
965 | /* This is arbitrary. */ |
---|
966 | prop = LBP_ID; |
---|
967 | break; |
---|
968 | case LBP_SA: |
---|
969 | /* We don't handle complex scripts yet. |
---|
970 | Treat LBP_SA like LBP_XX. */ |
---|
971 | case LBP_XX: |
---|
972 | /* This is arbitrary. */ |
---|
973 | prop = LBP_AL; |
---|
974 | break; |
---|
975 | } |
---|
976 | |
---|
977 | /* Deal with combining characters. */ |
---|
978 | q = p; |
---|
979 | if (prop == LBP_CM) |
---|
980 | { |
---|
981 | /* Don't break just before a combining character. */ |
---|
982 | *p = UC_BREAK_PROHIBITED; |
---|
983 | /* A combining character turns a preceding space into LBP_AL. */ |
---|
984 | if (seen_space != NULL) |
---|
985 | { |
---|
986 | q = seen_space; |
---|
987 | seen_space = seen_space2; |
---|
988 | prop = LBP_AL; |
---|
989 | goto lookup_via_table; |
---|
990 | } |
---|
991 | } |
---|
992 | else if (prop == LBP_SP) |
---|
993 | { |
---|
994 | /* Don't break just before a space. */ |
---|
995 | *p = UC_BREAK_PROHIBITED; |
---|
996 | seen_space2 = seen_space; |
---|
997 | seen_space = p; |
---|
998 | } |
---|
999 | else |
---|
1000 | { |
---|
1001 | lookup_via_table: |
---|
1002 | /* prop must be usable as an index for table 7.3 of UTR #14. */ |
---|
1003 | if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) |
---|
1004 | abort (); |
---|
1005 | |
---|
1006 | if (last_prop == LBP_BK) |
---|
1007 | { |
---|
1008 | /* Don't break at the beginning of a line. */ |
---|
1009 | *q = UC_BREAK_PROHIBITED; |
---|
1010 | } |
---|
1011 | else |
---|
1012 | { |
---|
1013 | switch (lbrk_table [last_prop-1] [prop-1]) |
---|
1014 | { |
---|
1015 | case D: |
---|
1016 | *q = UC_BREAK_POSSIBLE; |
---|
1017 | break; |
---|
1018 | case I: |
---|
1019 | *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); |
---|
1020 | break; |
---|
1021 | case P: |
---|
1022 | *q = UC_BREAK_PROHIBITED; |
---|
1023 | break; |
---|
1024 | default: |
---|
1025 | abort (); |
---|
1026 | } |
---|
1027 | } |
---|
1028 | last_prop = prop; |
---|
1029 | seen_space = NULL; |
---|
1030 | seen_space2 = NULL; |
---|
1031 | } |
---|
1032 | } |
---|
1033 | |
---|
1034 | s += count; |
---|
1035 | p += count; |
---|
1036 | } |
---|
1037 | } |
---|
1038 | |
---|
1039 | void |
---|
1040 | u32_possible_linebreaks (s, n, encoding, p) |
---|
1041 | const unsigned int *s; |
---|
1042 | size_t n; |
---|
1043 | const char *encoding; |
---|
1044 | char *p; |
---|
1045 | { |
---|
1046 | int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); |
---|
1047 | const unsigned int *s_end = s + n; |
---|
1048 | int last_prop = LBP_BK; /* line break property of last non-space character */ |
---|
1049 | char *seen_space = NULL; /* Was a space seen after the last non-space character? */ |
---|
1050 | char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ |
---|
1051 | |
---|
1052 | while (s < s_end) |
---|
1053 | { |
---|
1054 | unsigned int uc = *s; |
---|
1055 | int prop = lbrkprop_lookup (uc); |
---|
1056 | |
---|
1057 | if (prop == LBP_BK) |
---|
1058 | { |
---|
1059 | /* Mandatory break. */ |
---|
1060 | *p = UC_BREAK_MANDATORY; |
---|
1061 | last_prop = LBP_BK; |
---|
1062 | seen_space = NULL; |
---|
1063 | seen_space2 = NULL; |
---|
1064 | } |
---|
1065 | else |
---|
1066 | { |
---|
1067 | char *q; |
---|
1068 | |
---|
1069 | /* Resolve property values whose behaviour is not fixed. */ |
---|
1070 | switch (prop) |
---|
1071 | { |
---|
1072 | case LBP_AI: |
---|
1073 | /* Resolve ambiguous. */ |
---|
1074 | prop = LBP_AI_REPLACEMENT; |
---|
1075 | break; |
---|
1076 | case LBP_CB: |
---|
1077 | /* This is arbitrary. */ |
---|
1078 | prop = LBP_ID; |
---|
1079 | break; |
---|
1080 | case LBP_SA: |
---|
1081 | /* We don't handle complex scripts yet. |
---|
1082 | Treat LBP_SA like LBP_XX. */ |
---|
1083 | case LBP_XX: |
---|
1084 | /* This is arbitrary. */ |
---|
1085 | prop = LBP_AL; |
---|
1086 | break; |
---|
1087 | } |
---|
1088 | |
---|
1089 | /* Deal with combining characters. */ |
---|
1090 | q = p; |
---|
1091 | if (prop == LBP_CM) |
---|
1092 | { |
---|
1093 | /* Don't break just before a combining character. */ |
---|
1094 | *p = UC_BREAK_PROHIBITED; |
---|
1095 | /* A combining character turns a preceding space into LBP_AL. */ |
---|
1096 | if (seen_space != NULL) |
---|
1097 | { |
---|
1098 | q = seen_space; |
---|
1099 | seen_space = seen_space2; |
---|
1100 | prop = LBP_AL; |
---|
1101 | goto lookup_via_table; |
---|
1102 | } |
---|
1103 | } |
---|
1104 | else if (prop == LBP_SP) |
---|
1105 | { |
---|
1106 | /* Don't break just before a space. */ |
---|
1107 | *p = UC_BREAK_PROHIBITED; |
---|
1108 | seen_space2 = seen_space; |
---|
1109 | seen_space = p; |
---|
1110 | } |
---|
1111 | else |
---|
1112 | { |
---|
1113 | lookup_via_table: |
---|
1114 | /* prop must be usable as an index for table 7.3 of UTR #14. */ |
---|
1115 | if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) |
---|
1116 | abort (); |
---|
1117 | |
---|
1118 | if (last_prop == LBP_BK) |
---|
1119 | { |
---|
1120 | /* Don't break at the beginning of a line. */ |
---|
1121 | *q = UC_BREAK_PROHIBITED; |
---|
1122 | } |
---|
1123 | else |
---|
1124 | { |
---|
1125 | switch (lbrk_table [last_prop-1] [prop-1]) |
---|
1126 | { |
---|
1127 | case D: |
---|
1128 | *q = UC_BREAK_POSSIBLE; |
---|
1129 | break; |
---|
1130 | case I: |
---|
1131 | *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); |
---|
1132 | break; |
---|
1133 | case P: |
---|
1134 | *q = UC_BREAK_PROHIBITED; |
---|
1135 | break; |
---|
1136 | default: |
---|
1137 | abort (); |
---|
1138 | } |
---|
1139 | } |
---|
1140 | last_prop = prop; |
---|
1141 | seen_space = NULL; |
---|
1142 | seen_space2 = NULL; |
---|
1143 | } |
---|
1144 | } |
---|
1145 | |
---|
1146 | s++; |
---|
1147 | p++; |
---|
1148 | } |
---|
1149 | } |
---|
1150 | |
---|
1151 | #endif |
---|
1152 | |
---|
1153 | |
---|
1154 | /* Choose the best line breaks, assuming the uc_width function. |
---|
1155 | Return the column after the end of the string. */ |
---|
1156 | |
---|
1157 | int |
---|
1158 | u8_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p) |
---|
1159 | const unsigned char *s; |
---|
1160 | size_t n; |
---|
1161 | int width; |
---|
1162 | int start_column; |
---|
1163 | int at_end_columns; |
---|
1164 | const char *o; |
---|
1165 | const char *encoding; |
---|
1166 | char *p; |
---|
1167 | { |
---|
1168 | const unsigned char *s_end; |
---|
1169 | char *last_p; |
---|
1170 | int last_column; |
---|
1171 | int piece_width; |
---|
1172 | |
---|
1173 | u8_possible_linebreaks (s, n, encoding, p); |
---|
1174 | |
---|
1175 | s_end = s + n; |
---|
1176 | last_p = NULL; |
---|
1177 | last_column = start_column; |
---|
1178 | piece_width = 0; |
---|
1179 | while (s < s_end) |
---|
1180 | { |
---|
1181 | unsigned int uc; |
---|
1182 | int count = u8_mbtouc (&uc, s, s_end - s); |
---|
1183 | |
---|
1184 | /* Respect the override. */ |
---|
1185 | if (o != NULL && *o != UC_BREAK_UNDEFINED) |
---|
1186 | *p = *o; |
---|
1187 | |
---|
1188 | if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) |
---|
1189 | { |
---|
1190 | /* An atomic piece of text ends here. */ |
---|
1191 | if (last_p != NULL && last_column + piece_width > width) |
---|
1192 | { |
---|
1193 | /* Insert a line break. */ |
---|
1194 | *last_p = UC_BREAK_POSSIBLE; |
---|
1195 | last_column = 0; |
---|
1196 | } |
---|
1197 | } |
---|
1198 | |
---|
1199 | if (*p == UC_BREAK_MANDATORY) |
---|
1200 | { |
---|
1201 | /* uc is a line break character. */ |
---|
1202 | /* Start a new piece at column 0. */ |
---|
1203 | last_p = NULL; |
---|
1204 | last_column = 0; |
---|
1205 | piece_width = 0; |
---|
1206 | } |
---|
1207 | else |
---|
1208 | { |
---|
1209 | /* uc is not a line break character. */ |
---|
1210 | int w; |
---|
1211 | |
---|
1212 | if (*p == UC_BREAK_POSSIBLE) |
---|
1213 | { |
---|
1214 | /* Start a new piece. */ |
---|
1215 | last_p = p; |
---|
1216 | last_column += piece_width; |
---|
1217 | piece_width = 0; |
---|
1218 | /* No line break for the moment, may be turned into |
---|
1219 | UC_BREAK_POSSIBLE later, via last_p. */ |
---|
1220 | } |
---|
1221 | |
---|
1222 | *p = UC_BREAK_PROHIBITED; |
---|
1223 | |
---|
1224 | w = uc_width (uc, encoding); |
---|
1225 | if (w >= 0) /* ignore control characters in the string */ |
---|
1226 | piece_width += w; |
---|
1227 | } |
---|
1228 | |
---|
1229 | s += count; |
---|
1230 | p += count; |
---|
1231 | if (o != NULL) |
---|
1232 | o += count; |
---|
1233 | } |
---|
1234 | |
---|
1235 | /* The last atomic piece of text ends here. */ |
---|
1236 | if (last_p != NULL && last_column + piece_width + at_end_columns > width) |
---|
1237 | { |
---|
1238 | /* Insert a line break. */ |
---|
1239 | *last_p = UC_BREAK_POSSIBLE; |
---|
1240 | last_column = 0; |
---|
1241 | } |
---|
1242 | |
---|
1243 | return last_column + piece_width; |
---|
1244 | } |
---|
1245 | |
---|
1246 | #ifdef unused |
---|
1247 | |
---|
1248 | int |
---|
1249 | u16_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p) |
---|
1250 | const unsigned short *s; |
---|
1251 | size_t n; |
---|
1252 | int width; |
---|
1253 | int start_column; |
---|
1254 | int at_end_columns; |
---|
1255 | const char *o; |
---|
1256 | const char *encoding; |
---|
1257 | char *p; |
---|
1258 | { |
---|
1259 | const unsigned short *s_end; |
---|
1260 | char *last_p; |
---|
1261 | int last_column; |
---|
1262 | int piece_width; |
---|
1263 | |
---|
1264 | u16_possible_linebreaks (s, n, encoding, p); |
---|
1265 | |
---|
1266 | s_end = s + n; |
---|
1267 | last_p = NULL; |
---|
1268 | last_column = start_column; |
---|
1269 | piece_width = 0; |
---|
1270 | while (s < s_end) |
---|
1271 | { |
---|
1272 | unsigned int uc; |
---|
1273 | int count = u16_mbtouc (&uc, s, s_end - s); |
---|
1274 | |
---|
1275 | /* Respect the override. */ |
---|
1276 | if (o != NULL && *o != UC_BREAK_UNDEFINED) |
---|
1277 | *p = *o; |
---|
1278 | |
---|
1279 | if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) |
---|
1280 | { |
---|
1281 | /* An atomic piece of text ends here. */ |
---|
1282 | if (last_p != NULL && last_column + piece_width > width) |
---|
1283 | { |
---|
1284 | /* Insert a line break. */ |
---|
1285 | *last_p = UC_BREAK_POSSIBLE; |
---|
1286 | last_column = 0; |
---|
1287 | } |
---|
1288 | } |
---|
1289 | |
---|
1290 | if (*p == UC_BREAK_MANDATORY) |
---|
1291 | { |
---|
1292 | /* uc is a line break character. */ |
---|
1293 | /* Start a new piece at column 0. */ |
---|
1294 | last_p = NULL; |
---|
1295 | last_column = 0; |
---|
1296 | piece_width = 0; |
---|
1297 | } |
---|
1298 | else |
---|
1299 | { |
---|
1300 | /* uc is not a line break character. */ |
---|
1301 | int w; |
---|
1302 | |
---|
1303 | if (*p == UC_BREAK_POSSIBLE) |
---|
1304 | { |
---|
1305 | /* Start a new piece. */ |
---|
1306 | last_p = p; |
---|
1307 | last_column += piece_width; |
---|
1308 | piece_width = 0; |
---|
1309 | /* No line break for the moment, may be turned into |
---|
1310 | UC_BREAK_POSSIBLE later, via last_p. */ |
---|
1311 | } |
---|
1312 | |
---|
1313 | *p = UC_BREAK_PROHIBITED; |
---|
1314 | |
---|
1315 | w = uc_width (uc, encoding); |
---|
1316 | if (w >= 0) /* ignore control characters in the string */ |
---|
1317 | piece_width += w; |
---|
1318 | } |
---|
1319 | |
---|
1320 | s += count; |
---|
1321 | p += count; |
---|
1322 | if (o != NULL) |
---|
1323 | o += count; |
---|
1324 | } |
---|
1325 | |
---|
1326 | /* The last atomic piece of text ends here. */ |
---|
1327 | if (last_p != NULL && last_column + piece_width + at_end_columns > width) |
---|
1328 | { |
---|
1329 | /* Insert a line break. */ |
---|
1330 | *last_p = UC_BREAK_POSSIBLE; |
---|
1331 | last_column = 0; |
---|
1332 | } |
---|
1333 | |
---|
1334 | return last_column + piece_width; |
---|
1335 | } |
---|
1336 | |
---|
1337 | int |
---|
1338 | u32_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p) |
---|
1339 | const unsigned int *s; |
---|
1340 | size_t n; |
---|
1341 | int width; |
---|
1342 | int start_column; |
---|
1343 | int at_end_columns; |
---|
1344 | const char *o; |
---|
1345 | const char *encoding; |
---|
1346 | char *p; |
---|
1347 | { |
---|
1348 | const unsigned int *s_end; |
---|
1349 | char *last_p; |
---|
1350 | int last_column; |
---|
1351 | int piece_width; |
---|
1352 | |
---|
1353 | u32_possible_linebreaks (s, n, encoding, p); |
---|
1354 | |
---|
1355 | s_end = s + n; |
---|
1356 | last_p = NULL; |
---|
1357 | last_column = start_column; |
---|
1358 | piece_width = 0; |
---|
1359 | while (s < s_end) |
---|
1360 | { |
---|
1361 | unsigned int uc = *s; |
---|
1362 | |
---|
1363 | /* Respect the override. */ |
---|
1364 | if (o != NULL && *o != UC_BREAK_UNDEFINED) |
---|
1365 | *p = *o; |
---|
1366 | |
---|
1367 | if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) |
---|
1368 | { |
---|
1369 | /* An atomic piece of text ends here. */ |
---|
1370 | if (last_p != NULL && last_column + piece_width > width) |
---|
1371 | { |
---|
1372 | /* Insert a line break. */ |
---|
1373 | *last_p = UC_BREAK_POSSIBLE; |
---|
1374 | last_column = 0; |
---|
1375 | } |
---|
1376 | } |
---|
1377 | |
---|
1378 | if (*p == UC_BREAK_MANDATORY) |
---|
1379 | { |
---|
1380 | /* uc is a line break character. */ |
---|
1381 | /* Start a new piece at column 0. */ |
---|
1382 | last_p = NULL; |
---|
1383 | last_column = 0; |
---|
1384 | piece_width = 0; |
---|
1385 | } |
---|
1386 | else |
---|
1387 | { |
---|
1388 | /* uc is not a line break character. */ |
---|
1389 | int w; |
---|
1390 | |
---|
1391 | if (*p == UC_BREAK_POSSIBLE) |
---|
1392 | { |
---|
1393 | /* Start a new piece. */ |
---|
1394 | last_p = p; |
---|
1395 | last_column += piece_width; |
---|
1396 | piece_width = 0; |
---|
1397 | /* No line break for the moment, may be turned into |
---|
1398 | UC_BREAK_POSSIBLE later, via last_p. */ |
---|
1399 | } |
---|
1400 | |
---|
1401 | *p = UC_BREAK_PROHIBITED; |
---|
1402 | |
---|
1403 | w = uc_width (uc, encoding); |
---|
1404 | if (w >= 0) /* ignore control characters in the string */ |
---|
1405 | piece_width += w; |
---|
1406 | } |
---|
1407 | |
---|
1408 | s++; |
---|
1409 | p++; |
---|
1410 | if (o != NULL) |
---|
1411 | o++; |
---|
1412 | } |
---|
1413 | |
---|
1414 | /* The last atomic piece of text ends here. */ |
---|
1415 | if (last_p != NULL && last_column + piece_width + at_end_columns > width) |
---|
1416 | { |
---|
1417 | /* Insert a line break. */ |
---|
1418 | *last_p = UC_BREAK_POSSIBLE; |
---|
1419 | last_column = 0; |
---|
1420 | } |
---|
1421 | |
---|
1422 | return last_column + piece_width; |
---|
1423 | } |
---|
1424 | |
---|
1425 | #endif |
---|
1426 | |
---|
1427 | |
---|
1428 | #ifdef TEST1 |
---|
1429 | |
---|
1430 | #include <stdio.h> |
---|
1431 | |
---|
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with realloc; the caller may free it.
   On a read error or when memory runs out, a message is printed to stderr
   and the program exits with status 1.  */
char *
read_file (stream)
     FILE *stream;
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Sizes are kept in size_t: fread returns size_t, and int counters would
     overflow on very large inputs.  */
  size_t alloc = 0;
  size_t size = 0;

  for (;;)
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  */
      if (size + BUFSIZE > alloc)
        {
          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          buf = realloc (buf, alloc);
          if (buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;

      /* fread returns a short count only on error or at end of file.  */
      if (count < BUFSIZE)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          break;
        }
    }

  /* Trim the buffer to the data plus the terminating NUL.  */
  buf = realloc (buf, size + 1);
  if (buf == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
---|
1480 | |
---|
1481 | int |
---|
1482 | main (argc, argv) |
---|
1483 | int argc; |
---|
1484 | char * argv[]; |
---|
1485 | { |
---|
1486 | if (argc == 1) |
---|
1487 | { |
---|
1488 | /* Display all the break opportunities in the input string. */ |
---|
1489 | char *input = read_file (stdin); |
---|
1490 | int length = strlen (input); |
---|
1491 | char *breaks = malloc (length); |
---|
1492 | int i; |
---|
1493 | |
---|
1494 | u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks); |
---|
1495 | |
---|
1496 | for (i = 0; i < length; i++) |
---|
1497 | { |
---|
1498 | switch (breaks[i]) |
---|
1499 | { |
---|
1500 | case UC_BREAK_POSSIBLE: |
---|
1501 | /* U+2027 in UTF-8 encoding */ |
---|
1502 | putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); |
---|
1503 | break; |
---|
1504 | case UC_BREAK_MANDATORY: |
---|
1505 | /* U+21B2 (or U+21B5) in UTF-8 encoding */ |
---|
1506 | putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout); |
---|
1507 | break; |
---|
1508 | case UC_BREAK_PROHIBITED: |
---|
1509 | break; |
---|
1510 | default: |
---|
1511 | abort (); |
---|
1512 | } |
---|
1513 | putc (input[i], stdout); |
---|
1514 | } |
---|
1515 | |
---|
1516 | free (breaks); |
---|
1517 | |
---|
1518 | return 0; |
---|
1519 | } |
---|
1520 | else if (argc == 2) |
---|
1521 | { |
---|
1522 | /* Insert line breaks for a given width. */ |
---|
1523 | int width = atoi (argv[1]); |
---|
1524 | char *input = read_file (stdin); |
---|
1525 | int length = strlen (input); |
---|
1526 | char *breaks = malloc (length); |
---|
1527 | int i; |
---|
1528 | |
---|
1529 | u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks); |
---|
1530 | |
---|
1531 | for (i = 0; i < length; i++) |
---|
1532 | { |
---|
1533 | switch (breaks[i]) |
---|
1534 | { |
---|
1535 | case UC_BREAK_POSSIBLE: |
---|
1536 | putc ('\n', stdout); |
---|
1537 | break; |
---|
1538 | case UC_BREAK_MANDATORY: |
---|
1539 | break; |
---|
1540 | case UC_BREAK_PROHIBITED: |
---|
1541 | break; |
---|
1542 | default: |
---|
1543 | abort (); |
---|
1544 | } |
---|
1545 | putc (input[i], stdout); |
---|
1546 | } |
---|
1547 | |
---|
1548 | free (breaks); |
---|
1549 | |
---|
1550 | return 0; |
---|
1551 | } |
---|
1552 | else |
---|
1553 | return 1; |
---|
1554 | } |
---|
1555 | |
---|
1556 | #endif /* TEST1 */ |
---|
1557 | |
---|
1558 | |
---|
1559 | /* Now the same thing with an arbitrary encoding. |
---|
1560 | |
---|
1561 | We convert the input string to Unicode. |
---|
1562 | |
---|
1563 | The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16, |
---|
1564 | UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to |
---|
1565 | \U0000FFFF. UTF-16 and variants support only characters up to |
---|
1566 | \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1. |
---|
1567 | UCS-4 specification leaves doubts about endianness and byte order mark. |
---|
1568 | glibc currently interprets it as big endian without byte order mark, |
---|
1569 | but this is not backed by an RFC. So we use UTF-8. It supports |
---|
1570 | characters up to \U7FFFFFFF and is unambiguously defined. */ |
---|
1571 | |
---|
1572 | #if HAVE_ICONV |
---|
1573 | |
---|
1574 | #include <iconv.h> |
---|
1575 | #include <errno.h> |
---|
1576 | |
---|
1577 | /* Luckily, the encoding's name is platform independent. */ |
---|
1578 | #define UTF8_NAME "UTF-8" |
---|
1579 | |
---|
/* Return the length of a string after conversion through an iconv_t:
   the number of bytes that the N bytes at S occupy once converted by CD.
   The converted text itself is discarded.  Return (size_t)(-1) if the
   conversion fails.  */
static size_t
iconv_string_length (cd, s, n)
     iconv_t cd;
     const char *s;
     size_t n;
{
#define TMPBUFSIZE 4096
  size_t count = 0;
  char tmpbuf[TMPBUFSIZE];
  const char *inptr = s;
  size_t insize = n;
  while (insize > 0)
    {
      char *outptr = tmpbuf;
      size_t outsize = TMPBUFSIZE;
      size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
      if (res == (size_t)(-1))
        {
          /* E2BIG only says that tmpbuf filled up before the input was
             exhausted; the bytes produced so far are counted and the
             conversion continues with an empty buffer.  Every other error
             is a real failure, and so is E2BIG without any output, since
             retrying could not make progress.  */
          if (errno != E2BIG || outptr == tmpbuf)
            return (size_t)(-1);
        }
      count += outptr - tmpbuf;
    }
  /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  {
    /* Flush the converter; this may emit a final shift sequence.  */
    char *outptr = tmpbuf;
    size_t outsize = TMPBUFSIZE;
    size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
    if (res == (size_t)(-1))
      return (size_t)(-1);
    count += outptr - tmpbuf;
  }
  /* Return to the initial state. */
  iconv (cd, NULL, NULL, NULL, NULL);
#endif
  return count;
#undef TMPBUFSIZE
}
---|
1618 | |
---|
/* Convert the N bytes at S through CD into the buffer T, which must hold
   exactly the M bytes that the conversion produces, and record where each
   input character lands: OFFTABLE[i] is the offset in T of the character
   that starts at S + i, or (size_t)(-1) if no character starts there.
   OFFTABLE must have room for N entries.  The input is expected to have
   been checked for convertibility beforehand; any conversion error, or an
   output size different from M, aborts the program.  */
static void
iconv_string_keeping_offsets (cd, s, n, offtable, t, m)
     iconv_t cd;
     const char *s;
     size_t n;
     size_t *offtable;
     char *t;
     size_t m;
{
  /* Avoid glibc-2.1 bug: announce one more byte of output room there.  */
#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
  const size_t slack = 1;
#else
  const size_t slack = 0;
#endif
  const char *src_end = s + n;
  const char *src = s;
  char *dst = t;
  size_t room = m + slack;
  size_t k;

  /* Start with "no character begins here" for every input byte.  */
  for (k = 0; k < n; k++)
    offtable[k] = (size_t)(-1);

  while (src < src_end)
    {
      size_t chunk = 1;
      size_t status = (size_t)(-1);

      offtable[src - s] = dst - t;

      /* Feed the converter one more byte at a time, for as long as it
         reports the input as an incomplete character (EINVAL).  */
      while (src + chunk <= src_end)
        {
          status = iconv (cd, (ICONV_CONST char **) &src, &chunk, &dst, &room);
          if (status != (size_t)(-1) || errno != EINVAL)
            break;
          chunk++;
        }

      /* After we verified the convertibility and computed the translation's
         size m, there shouldn't be any conversion error here.  */
      if (status == (size_t)(-1))
        abort ();
    }

  /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  if (iconv (cd, NULL, NULL, &dst, &room) == (size_t)(-1))
    abort ();
#endif

  /* We should have produced exactly m output bytes.  */
  if (room != slack)
    abort ();
}
---|
1676 | |
---|
1677 | #endif /* HAVE_ICONV */ |
---|
1678 | |
---|
1679 | #if C_CTYPE_ASCII |
---|
1680 | |
---|
/* Tests whether the N bytes at S are all ASCII text, that is, whether each
   of them satisfies c_isprint or c_isspace.  Returns 1 if yes.
   Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.  */
static int
is_all_ascii (s, n)
     const char *s;
     size_t n;
{
  const char *end = s + n;

  while (s < end)
    {
      unsigned char c = (unsigned char) *s++;

      /* The first byte that is neither printable nor whitespace decides.  */
      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
---|
1697 | |
---|
1698 | #endif /* C_CTYPE_ASCII */ |
---|
1699 | |
---|
1700 | #if defined unused || defined TEST2 |
---|
1701 | |
---|
1702 | void |
---|
1703 | mbs_possible_linebreaks (s, n, encoding, p) |
---|
1704 | const char *s; |
---|
1705 | size_t n; |
---|
1706 | const char *encoding; |
---|
1707 | char *p; |
---|
1708 | { |
---|
1709 | if (is_utf8_encoding (encoding)) |
---|
1710 | u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); |
---|
1711 | else |
---|
1712 | { |
---|
1713 | #if HAVE_ICONV |
---|
1714 | iconv_t to_utf8; |
---|
1715 | /* Avoid glibc-2.1 bug with EUC-KR. */ |
---|
1716 | # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION |
---|
1717 | if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) |
---|
1718 | to_utf8 = (iconv_t)(-1); |
---|
1719 | else |
---|
1720 | # endif |
---|
1721 | to_utf8 = iconv_open (UTF8_NAME, encoding); |
---|
1722 | if (to_utf8 != (iconv_t)(-1)) |
---|
1723 | { |
---|
1724 | /* Determine the length of the resulting UTF-8 string. */ |
---|
1725 | size_t m = iconv_string_length (to_utf8, s, n); |
---|
1726 | if (m != (size_t)(-1)) |
---|
1727 | { |
---|
1728 | /* Convert the string to UTF-8 and build a translation table |
---|
1729 | from offsets into s to offsets into the translated string. */ |
---|
1730 | char *memory = malloc (n * sizeof (size_t) + m + m); |
---|
1731 | if (memory != NULL) |
---|
1732 | { |
---|
1733 | size_t *offtable = (size_t *) memory; |
---|
1734 | char *t = (char *) (offtable + n); |
---|
1735 | char *q = (char *) (t + m); |
---|
1736 | size_t i; |
---|
1737 | |
---|
1738 | iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); |
---|
1739 | |
---|
1740 | /* Determine the possible line breaks of the UTF-8 string. */ |
---|
1741 | u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q); |
---|
1742 | |
---|
1743 | /* Translate the result back to the original string. */ |
---|
1744 | memset (p, UC_BREAK_PROHIBITED, n); |
---|
1745 | for (i = 0; i < n; i++) |
---|
1746 | if (offtable[i] != (size_t)(-1)) |
---|
1747 | p[i] = q[offtable[i]]; |
---|
1748 | |
---|
1749 | free (memory); |
---|
1750 | iconv_close (to_utf8); |
---|
1751 | return; |
---|
1752 | } |
---|
1753 | } |
---|
1754 | iconv_close (to_utf8); |
---|
1755 | } |
---|
1756 | #endif |
---|
1757 | /* Impossible to convert. */ |
---|
1758 | #if C_CTYPE_ASCII |
---|
1759 | if (is_all_ascii (s, n)) |
---|
1760 | { |
---|
1761 | /* ASCII is a subset of UTF-8. */ |
---|
1762 | u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); |
---|
1763 | return; |
---|
1764 | } |
---|
1765 | #endif |
---|
1766 | /* We have a non-ASCII string and cannot convert it. |
---|
1767 | Don't produce line breaks except those already present in the |
---|
1768 | input string. All we assume here is that the encoding is |
---|
1769 | minimally ASCII compatible. */ |
---|
1770 | { |
---|
1771 | const char *s_end = s + n; |
---|
1772 | while (s < s_end) |
---|
1773 | { |
---|
1774 | *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); |
---|
1775 | s++; |
---|
1776 | p++; |
---|
1777 | } |
---|
1778 | } |
---|
1779 | } |
---|
1780 | } |
---|
1781 | |
---|
1782 | #endif |
---|
1783 | |
---|
1784 | int |
---|
1785 | mbs_width_linebreaks (s, n, width, start_column, at_end_columns, o, encoding, p) |
---|
1786 | const char *s; |
---|
1787 | size_t n; |
---|
1788 | int width; |
---|
1789 | int start_column; |
---|
1790 | int at_end_columns; |
---|
1791 | const char *o; |
---|
1792 | const char *encoding; |
---|
1793 | char *p; |
---|
1794 | { |
---|
1795 | if (is_utf8_encoding (encoding)) |
---|
1796 | return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); |
---|
1797 | else |
---|
1798 | { |
---|
1799 | #if HAVE_ICONV |
---|
1800 | iconv_t to_utf8; |
---|
1801 | /* Avoid glibc-2.1 bug with EUC-KR. */ |
---|
1802 | # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION |
---|
1803 | if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) |
---|
1804 | to_utf8 = (iconv_t)(-1); |
---|
1805 | else |
---|
1806 | # endif |
---|
1807 | to_utf8 = iconv_open (UTF8_NAME, encoding); |
---|
1808 | if (to_utf8 != (iconv_t)(-1)) |
---|
1809 | { |
---|
1810 | /* Determine the length of the resulting UTF-8 string. */ |
---|
1811 | size_t m = iconv_string_length (to_utf8, s, n); |
---|
1812 | if (m != (size_t)(-1)) |
---|
1813 | { |
---|
1814 | /* Convert the string to UTF-8 and build a translation table |
---|
1815 | from offsets into s to offsets into the translated string. */ |
---|
1816 | char *memory = malloc (n * sizeof (size_t) + m + m + (o != NULL ? m : 0)); |
---|
1817 | if (memory != NULL) |
---|
1818 | { |
---|
1819 | size_t *offtable = (size_t *) memory; |
---|
1820 | char *t = (char *) (offtable + n); |
---|
1821 | char *q = (char *) (t + m); |
---|
1822 | char *o8 = (o != NULL ? (char *) (q + m) : NULL); |
---|
1823 | int res_column; |
---|
1824 | size_t i; |
---|
1825 | |
---|
1826 | iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); |
---|
1827 | |
---|
1828 | /* Translate the overrides to the UTF-8 string. */ |
---|
1829 | if (o != NULL) |
---|
1830 | { |
---|
1831 | memset (o8, UC_BREAK_UNDEFINED, m); |
---|
1832 | for (i = 0; i < n; i++) |
---|
1833 | if (offtable[i] != (size_t)(-1)) |
---|
1834 | o8[offtable[i]] = o[i]; |
---|
1835 | } |
---|
1836 | |
---|
1837 | /* Determine the line breaks of the UTF-8 string. */ |
---|
1838 | res_column = |
---|
1839 | u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q); |
---|
1840 | |
---|
1841 | /* Translate the result back to the original string. */ |
---|
1842 | memset (p, UC_BREAK_PROHIBITED, n); |
---|
1843 | for (i = 0; i < n; i++) |
---|
1844 | if (offtable[i] != (size_t)(-1)) |
---|
1845 | p[i] = q[offtable[i]]; |
---|
1846 | |
---|
1847 | free (memory); |
---|
1848 | iconv_close (to_utf8); |
---|
1849 | return res_column; |
---|
1850 | } |
---|
1851 | } |
---|
1852 | iconv_close (to_utf8); |
---|
1853 | } |
---|
1854 | #endif |
---|
1855 | /* Impossible to convert. */ |
---|
1856 | #if C_CTYPE_ASCII |
---|
1857 | if (is_all_ascii (s, n)) |
---|
1858 | { |
---|
1859 | /* ASCII is a subset of UTF-8. */ |
---|
1860 | return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); |
---|
1861 | } |
---|
1862 | #endif |
---|
1863 | /* We have a non-ASCII string and cannot convert it. |
---|
1864 | Don't produce line breaks except those already present in the |
---|
1865 | input string. All we assume here is that the encoding is |
---|
1866 | minimally ASCII compatible. */ |
---|
1867 | { |
---|
1868 | const char *s_end = s + n; |
---|
1869 | while (s < s_end) |
---|
1870 | { |
---|
1871 | *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' |
---|
1872 | ? UC_BREAK_MANDATORY |
---|
1873 | : UC_BREAK_PROHIBITED); |
---|
1874 | s++; |
---|
1875 | p++; |
---|
1876 | if (o != NULL) |
---|
1877 | o++; |
---|
1878 | } |
---|
1879 | /* We cannot compute widths in this case. */ |
---|
1880 | return start_column; |
---|
1881 | } |
---|
1882 | } |
---|
1883 | } |
---|
1884 | |
---|
1885 | |
---|
1886 | #ifdef TEST2 |
---|
1887 | |
---|
1888 | #include <stdio.h> |
---|
1889 | #include <locale.h> |
---|
1890 | |
---|
/* Read the remaining contents of the input stream STREAM and return them in
   a freshly allocated buffer, terminated with a NUL byte.  The caller is
   responsible for freeing the result with free().
   The function never returns NULL: if memory runs out or a read error
   occurs, it prints a message to stderr and exits with status 1.  */
char *
read_file (stream)
     FILE *stream;
{
  /* Number of bytes requested from the stream per fread call.  */
  static const size_t chunk_size = 4096;
  char *buf = NULL;
  /* Sizes are kept in size_t: fread returns a size_t, and int counters
     would overflow on inputs of 2 GB or more.  */
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more full chunk, growing the
         buffer by 50% or by what is needed, whichever is larger.  */
      if (size + chunk_size > alloc)
        {
          alloc = alloc + alloc / 2;
          if (alloc < size + chunk_size)
            alloc = size + chunk_size;
          buf = realloc (buf, alloc);
          if (buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
        }

      count = fread (buf + size, 1, chunk_size, stream);
      size += count;

      /* A short read means either end of file or an error.  Report the
         error right away, even if some bytes were delivered, instead of
         relying on a later fread call to fail again.  */
      if (count < chunk_size && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Trim the buffer to the exact size plus the terminating NUL.  */
  buf = realloc (buf, size + 1);
  if (buf == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf[size] = '\0';
  return buf;
}
---|
1939 | |
---|
1940 | int |
---|
1941 | main (argc, argv) |
---|
1942 | int argc; |
---|
1943 | char * argv[]; |
---|
1944 | { |
---|
1945 | setlocale (LC_CTYPE, ""); |
---|
1946 | if (argc == 1) |
---|
1947 | { |
---|
1948 | /* Display all the break opportunities in the input string. */ |
---|
1949 | char *input = read_file (stdin); |
---|
1950 | int length = strlen (input); |
---|
1951 | char *breaks = malloc (length); |
---|
1952 | int i; |
---|
1953 | |
---|
1954 | mbs_possible_linebreaks (input, length, locale_charset (), breaks); |
---|
1955 | |
---|
1956 | for (i = 0; i < length; i++) |
---|
1957 | { |
---|
1958 | switch (breaks[i]) |
---|
1959 | { |
---|
1960 | case UC_BREAK_POSSIBLE: |
---|
1961 | putc ('|', stdout); |
---|
1962 | break; |
---|
1963 | case UC_BREAK_MANDATORY: |
---|
1964 | break; |
---|
1965 | case UC_BREAK_PROHIBITED: |
---|
1966 | break; |
---|
1967 | default: |
---|
1968 | abort (); |
---|
1969 | } |
---|
1970 | putc (input[i], stdout); |
---|
1971 | } |
---|
1972 | |
---|
1973 | free (breaks); |
---|
1974 | |
---|
1975 | return 0; |
---|
1976 | } |
---|
1977 | else if (argc == 2) |
---|
1978 | { |
---|
1979 | /* Insert line breaks for a given width. */ |
---|
1980 | int width = atoi (argv[1]); |
---|
1981 | char *input = read_file (stdin); |
---|
1982 | int length = strlen (input); |
---|
1983 | char *breaks = malloc (length); |
---|
1984 | int i; |
---|
1985 | |
---|
1986 | mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks); |
---|
1987 | |
---|
1988 | for (i = 0; i < length; i++) |
---|
1989 | { |
---|
1990 | switch (breaks[i]) |
---|
1991 | { |
---|
1992 | case UC_BREAK_POSSIBLE: |
---|
1993 | putc ('\n', stdout); |
---|
1994 | break; |
---|
1995 | case UC_BREAK_MANDATORY: |
---|
1996 | break; |
---|
1997 | case UC_BREAK_PROHIBITED: |
---|
1998 | break; |
---|
1999 | default: |
---|
2000 | abort (); |
---|
2001 | } |
---|
2002 | putc (input[i], stdout); |
---|
2003 | } |
---|
2004 | |
---|
2005 | free (breaks); |
---|
2006 | |
---|
2007 | return 0; |
---|
2008 | } |
---|
2009 | else |
---|
2010 | return 1; |
---|
2011 | } |
---|
2012 | |
---|
2013 | #endif /* TEST2 */ |
---|