source: trunk/third/libxml2/genUnicode.py @ 20735

Revision 20735, 12.5 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20734, which included commits to RCS files with non-trunk default branches.
  • Property svn:executable set to *
Line 
1#!/usr/bin/python -u
2#
3# Original script modified in November 2003 to take advantage of
4# the character-validation range routines, and updated to the
5# current Unicode information (Version 4.0.1)
6#
7# NOTE: there is an 'alias' facility for blocks which are not present in
8#       the current release, but are needed for ABI compatibility.  This
9#       must be accomplished MANUALLY!  Please see the comments below under
10#     'blockAliases'
11#
12import sys
13import string
14import time
15
# URL quoted in the banner of the generated files, and the names of the
# two data files read by this script (block list first, character data
# second).
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1d5b.html"
sources = "Blocks-4.0.1d1b.txt UnicodeData-4.0.1d1b.txt"

#
# blockAliases is a small hack - it maps block names which were used in
# the 3.1 release, but are missing or changed in the current release.
# Each entry has the form "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize is the threshold on the number of ranges of a category:
# a category with more ranges than this gets a range table, any other
# category is tested with inline comparisons
minTableSize = 8

blockfile, catfile = sources.split()
35
36
#
# Now process the "blocks" file, reducing it to a dictionary indexed by
# blockname (with embedded spaces removed).  Each entry is a list of
# (start, end) tuples; the bounds are kept as "0x"-prefixed hex strings
# because they are later substituted verbatim into the generated C code.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

try:
    for line in blocks:
        line = line.strip()
        # skip blank lines and comment lines
        if line == '' or line[0] == '#':
            continue
        try:
            # expected layout: "<start>..<end>; <Block Name>"
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (ValueError, IndexError):
            # wrong number of range bounds, or no name field
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
finally:
    # release the file even if an unexpected error interrupts the loop
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
72
#
# Register the aliases: every old block name receives the combined
# ranges of all the current blocks listed for it.  A listed block which
# is not present in BlockNames is reported and skipped.
#
for block in blockAliases:
    alias = block.split(':')
    oldname = alias[0]
    for comp in alias[1].split(','):
        if comp in BlockNames:
            # the alias entry is only created once a listed block is found
            BlockNames.setdefault(oldname, []).extend(BlockNames[comp])
        else:
            print("Alias %s: %s not in Blocks" % (oldname, comp))
85
#
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary indexed by category-name, with each entry containing
# all the codepoints of that category.  Note that category names
# comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both the "general category" (which
# is the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
try:
    for line in data:
        line = line.strip()
        # skip blank lines and comment lines
        if line == '' or line[0] == '#':
            continue
        try:
            fields = line.split(';')
            # field 0 is the code point in hex; int() raises ValueError on
            # a malformed value, so bad digits are not silently dropped
            value = int(fields[0].strip(), 16)
            # field 2 is the category name
            name = fields[2]
            if name == '':
                # an empty name would yield a generated function called
                # plain "xmlUCSIsCat", clashing with the lookup entry point
                raise ValueError("empty category name")
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(name[0], []).append(value)
finally:
    # close the categories file itself (not the blocks file handle)
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
148
#
# The data is now all read.  Time to process it into a more useful form.
#
# Collapse each category's list of code points into a list of inclusive
# (first, last) ranges; an isolated code point n becomes (n, n).  A run is
# only extended when an entry is exactly one more than the previous entry.
for cat in Categories.keys():
    first = -1
    last = -1
    spans = []
    for code in Categories[cat]:
        if first == -1:
            # very first code point of the category
            first = last = code
        elif code == last + 1:
            # contiguous with the current run: extend it
            last = code
        else:
            # gap: flush the current run and start a new one
            spans.append((first, last))
            first = last = code
    # flush the run still pending at the end of the list
    spans.append((first, last))
    Categories[cat] = spans
182
#
# Keep the names in sorted order: the generated C name tables are
# searched with a binary search.
#
bkeys = list(BlockNames)
bkeys.sort()

ckeys = list(Categories)
ckeys.sort()
192
#
# Generate the resulting files: the public header and the C source.
# Both paths are relative to the current directory.
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    # do not leave the already-opened header handle dangling on the way out
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# timestamp written into the banner of both generated files;
# asctime() without argument formats the current local time
date = time.asctime()
209
# Banner and opening boilerplate of the generated header.  The three %s
# slots receive, in order, the web page, the generation date and the
# source file names.
headerProlog = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef __cplusplus
extern "C" {
#endif

"""
header.write(headerProlog % (webpage, date, sources))
235
# Banner, includes and type declarations of the generated C file, ending
# with the opening line of the block name table.  The three %s slots
# receive, in order, the web page, the generation date and the source
# file names.
sourceProlog = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int);  /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    xmlUnicodeRange *table;
    int             numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""
output.write(sourceProlog % (webpage, date, sources))
277
# Body of the block name table: one {"name", function} initialiser per
# block.  Dashes are dropped from the block name to form the C function
# name, while the quoted lookup name keeps them.
entries = []
for block in bkeys:
    entries.append('  {"%s", xmlUCSIs%s}' % (block, block.replace('-', '')))
output.write(',\n'.join(entries))
output.write('};\n\n')

# Same layout for the category name table.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
entries = []
for name in ckeys:
    entries.append('  {"%s", xmlUCSIsCat%s}' % (name, name))
output.write(',\n'.join(entries))
output.write('};\n\n')
297
#
# Every category with more than minTableSize ranges gets a range table
# suitable for xmlCharInRange: an xmlChSRange array for the ranges ending
# below 0x10000, an xmlChLRange array for the others, and an
# xmlChRangeGroup tying both together.
#
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        # small categories are handled with inline comparisons instead
        continue
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high < 0x10000:
            if numshort == 0:
                # first short range: open the short array
                pline = "static xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            else:
                pline += ", "
            numshort += 1
        else:
            if numlong == 0:
                # first long range: terminate the short array, if one was
                # started, then open the long array
                if numshort > 0:
                    output.write(pline + " };\n")
                pline = "static xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            else:
                pline += ", "
            numlong += 1
        # wrap the generated line once it has grown past 60 characters
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = "    "
        pline += "{%s, %s}" % (hex(low), hex(high))
    group = "static xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n" % (
        name, numshort, numlong, sptr, lptr)
    output.write(pline + " };\n" + group)
333
334
# Emit the two name-table descriptors followed by the binary-search helper
# used by the generated lookup entry points.  The two %s slots receive the
# number of entries of the block table and of the category table.  The
# generated doc comment names the second parameter @tname, matching the
# C signature.
output.write(
"""xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);   
}

""" % (len(BlockNames), len(Categories)))
369
# One predicate per block: the prototype goes to the header, the
# documented definition to the C file.  The body is the "or" of one
# bounds test per (start, end) range of the block.
for block in bkeys:
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
    tests = []
    for (start, end) in BlockNames[block]:
        tests.append("((code >= %s) && (code <= %s))" % (start, end))
    output.write(" ||\n           ".join(tests))
    output.write(");\n}\n\n")
386
# Prototype and definition of the by-name block test, which dispatches
# through the block name table.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
isBlockSource = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

"""
output.write(isBlockSource)
409
# One predicate per category: the prototype goes to the header, the
# documented definition to the C file.  Categories with more than
# minTableSize ranges call xmlCharInRange on their range group, the
# others get their comparisons spelled out inline.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"
                 % (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                # single code point: plain equality test
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        # "return(" is only emitted when there is at least one test
        if tests:
            output.write("    return(" + " ||\n           ".join(tests))
    output.write(");\n}\n\n")
436
# Prototype and definition of the by-name category test, which dispatches
# through the category name table; this also closes the
# LIBXML_UNICODE_ENABLED section of the C file.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
isCatSource = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}


#endif /* LIBXML_UNICODE_ENABLED */
"""
output.write(isCatSource)
461
# Close the extern "C" section and the include guard of the header, then
# release both generated files.
headerEpilog = """
#ifdef __cplusplus
}
#endif
#endif /* __XML_UNICODE_H__ */
"""
header.write(headerEpilog)

header.close()
output.close()
Note: See TracBrowser for help on using the repository browser.