source: trunk/third/perl-xml-parser/Parser.pm @ 21085

Revision 21085, 30.6 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r21084, which included commits to RCS files with non-trunk default branches.
RevLine 
[21084]1# XML::Parser
2#
3# Copyright (c) 1998-2000 Larry Wall and Clark Cooper
4# All rights reserved.
5#
6# This program is free software; you can redistribute it and/or
7# modify it under the same terms as Perl itself.
8
9package XML::Parser;
10
11use Carp;
12use IO::File;
13
14BEGIN {
15  require XML::Parser::Expat;
16  $VERSION = '2.31';
17  die "Parser.pm and Expat.pm versions don't match"
18    unless $VERSION eq $XML::Parser::Expat::VERSION;
19}
20
21use strict;
22
23use vars qw($VERSION %Built_In_Styles $LWP_load_failed);
24
25$LWP_load_failed = 0;
26
# Constructor.  Options are given as key/value pairs and are stored in the
# blessed hash that is returned.
#
#   Style             - built-in style name, or a package name containing '::'
#   Handlers          - hash ref mapping handler type to code ref
#   Pkg               - defaults to the package that called new
#   NoLWP             - if true, install the file based external entity handler
#   Non_Expat_Options - hash ref whose keys are not forwarded to Expat
#
# Croaks if Style names a built-in style that has not been registered in
# %XML::Parser::Built_In_Styles.
sub new {
  my ($class, %args) = @_;
  my $style = $args{Style};

  # Keys listed here are skipped by parse and parse_start when they build
  # the option list for the Expat constructor.
  my $nonexopt = $args{Non_Expat_Options} ||= {};

  $nonexopt->{Style}             = 1;
  $nonexopt->{Non_Expat_Options} = 1;
  $nonexopt->{Handlers}          = 1;
  $nonexopt->{_HNDL_TYPES}       = 1;
  $nonexopt->{NoLWP}             = 1;

  # Valid handler types: everything Expat knows about plus Init and Final.
  $args{_HNDL_TYPES} = {%XML::Parser::Expat::Handler_Setters};
  $args{_HNDL_TYPES}->{Init}  = 1;
  $args{_HNDL_TYPES}->{Final} = 1;

  $args{Handlers} ||= {};
  my $handlers = $args{Handlers};

  if (defined($style)) {
    my $stylepkg = $style;

    if ($stylepkg !~ /::/) {
      $stylepkg = "\u$style";

      # The Built_In_Styles hash defines the valid internal styles,
      # since a style doesn't need to define any particular handler
      # to be valid, so the existence of a particular sub can't be
      # used as the check.

      croak "Undefined style: $style"
        unless defined($XML::Parser::Built_In_Styles{$stylepkg});
      $stylepkg = 'XML::Parser::' . $stylepkg;
    }

    foreach my $htype (keys %{$args{_HNDL_TYPES}}) {
      # Handlers explicitly given override
      # handlers from the Style package
      next if defined($handlers->{$htype});

      # A handler in the style package must either have
      # exactly the right case as the type name or a
      # completely lower case version of it.
      foreach my $hname ("${stylepkg}::$htype", "${stylepkg}::\L$htype") {
        if (defined(&$hname)) {
          $handlers->{$htype} = \&$hname;
          last;
        }
      }
    }
  }

  unless (defined($handlers->{ExternEnt})
          or defined($handlers->{ExternEntFin})) {

    if ($args{NoLWP} or $XML::Parser::LWP_load_failed) {
      $handlers->{ExternEnt}    = \&file_ext_ent_handler;
      $handlers->{ExternEntFin} = \&file_ext_ent_cleanup;
    }
    else {
      # The following just bootstraps the real LWP external entity
      # handler

      $handlers->{ExternEnt} = \&initial_ext_ent_handler;

      # No cleanup function available until LWPExternEnt.pl loaded
    }
  }

  $args{Pkg} ||= caller;
  bless \%args, $class;
}                               # End of new
107
# Register handlers on the parser object.
#
# Takes a list of (TYPE, HANDLER) pairs and returns a list of
# (TYPE, PREVIOUS_HANDLER) pairs in the same order.  Croaks on an odd number
# of arguments, or on a TYPE that is not a key of $self->{_HNDL_TYPES}.
# Pairs that precede an unknown TYPE have already been applied when the
# croak happens.
sub setHandlers {
  my ($self, @handler_pairs) = @_;

  croak("Uneven number of arguments to setHandlers method")
    if @handler_pairs % 2;

  my @ret;
  while (@handler_pairs) {
    my ($type, $handler) = splice(@handler_pairs, 0, 2);

    unless (defined($self->{_HNDL_TYPES}->{$type})) {
      my @types = sort keys %{$self->{_HNDL_TYPES}};

      croak("Unknown Parser handler type: $type\n Valid types: @types");
    }

    push(@ret, $type, $self->{Handlers}->{$type});
    $self->{Handlers}->{$type} = $handler;
  }

  return @ret;
}                               # End of setHandlers
129
# Create and return an XML::Parser::ExpatNB object for incremental parsing.
#
# Every key of the parser object that is not listed in Non_Expat_Options is
# passed to the ExpatNB constructor, followed by any extra arguments given
# to this method.  The Init handler, if any, is called with the new object
# before it is returned; the Final handler, if any, is stored in the
# object's FinalHandler slot.
sub parse_start {
  my $self = shift;

  # Iterate over keys() rather than each(): each() resumes from wherever a
  # previous, unfinished iteration over this hash stopped.
  my @expat_options;
  foreach my $key (keys %{$self}) {
    push(@expat_options, $key, $self->{$key})
      unless exists $self->{Non_Expat_Options}->{$key};
  }

  # Init and Final are handled here and are not passed to setHandlers.
  my %handlers = %{$self->{Handlers}};
  my $init     = delete $handlers{Init};
  my $final    = delete $handlers{Final};

  my $expatnb = XML::Parser::ExpatNB->new(@expat_options, @_);
  $expatnb->setHandlers(%handlers);

  $init->($expatnb)
    if defined($init);

  $expatnb->{_State_} = 1;

  $expatnb->{FinalHandler} = $final
    if defined($final);

  return $expatnb;
}
157
# Parse a complete document.
#
# $arg is handed unchanged to the parse method of a freshly created
# XML::Parser::Expat object; extra arguments are passed to the Expat
# constructor after the options stored in the parser object.
#
# Returns nothing in void context.  In scalar context returns the result of
# Expat's parse, or the Final handler's result if the parse result was true
# and a Final handler is installed.  In list context returns the Final
# handler's list, which is empty when there is no Final handler.
#
# Any exception is re-thrown after the Expat object has been released.
sub parse {
  my $self = shift;
  my $arg  = shift;

  # Iterate over keys() rather than each(): each() resumes from wherever a
  # previous, unfinished iteration over this hash stopped.
  my @expat_options;
  foreach my $key (keys %{$self}) {
    push(@expat_options, $key, $self->{$key})
      unless exists $self->{Non_Expat_Options}->{$key};
  }

  my $expat    = XML::Parser::Expat->new(@expat_options, @_);
  my %handlers = %{$self->{Handlers}};
  my $init     = delete $handlers{Init};
  my $final    = delete $handlers{Final};

  # Inside the eval below, wantarray would report the eval's context,
  # so capture the caller's context first.
  my $want = wantarray;

  my @result = ();
  my $result;

  # Everything that can die once the Expat object exists runs inside one
  # eval, so the object is released on every exit path.
  my $ok = eval {
    $expat->setHandlers(%handlers);

    if ($self->{Base}) {
      $expat->base($self->{Base});
    }

    $init->($expat)
      if defined($init);

    $result = $expat->parse($arg);

    if ($result and defined($final)) {
      if ($want) {
        @result = $final->($expat);
      }
      else {
        $result = $final->($expat);
      }
    }
    1;
  };
  my $err = $@;    # saved before release, which could overwrite $@

  $expat->release;
  die $err unless $ok;

  return unless defined $want;
  return $want ? @result : $result;
}                               # End of parse
207
# Alias for parse: forwards all arguments and returns whatever parse
# returns, in the caller's context.
sub parsestring {
  my $self = shift;
  return $self->parse(@_);
}                               # End of parsestring
212
# Open $file for reading in binary mode and pass the handle to parse,
# followed by any extra arguments.
#
# Sets $self->{Base} to $file.  The handle is closed whether or not parse
# dies, and any exception from parse is re-thrown after the close.  Croaks
# if the file cannot be opened.  Returns what parse returns; parse is
# called in list context only when parsefile itself is.
sub parsefile {
  my $self = shift;
  my $file = shift;

  local(*FILE);

  # Three-argument open: the name is taken literally, so characters such
  # as '>' or '|' in it cannot select a write mode or start a command.
  open(FILE, '<', $file) or croak "Couldn't open $file:\n$!";
  binmode(FILE);

  $self->{Base} = $file;

  # Inside the eval below, wantarray would report the eval's context,
  # so capture the caller's context first.
  my $want = wantarray;
  my @ret;
  my $ret;

  eval {
    if ($want) {
      @ret = $self->parse(*FILE, @_);
    }
    else {
      $ret = $self->parse(*FILE, @_);
    }
  };
  my $err = $@;
  close(FILE);
  die $err if $err;

  return unless defined $want;
  return $want ? @ret : $ret;
}                               # End of parsefile
241
242
# Bootstrap ExternEnt handler, installed by new when NoLWP is not set.
#
# On its first call it tries to load XML/Parser/LWPExternEnt.pl.  If that
# succeeds, the LWP handlers are installed on the object passed as the
# first argument and control is transferred to lwp_ext_ent_handler with
# the original arguments.  If it fails, $LWP_load_failed is set, a warning
# is issued, the file based handlers are installed instead and control is
# transferred to file_ext_ent_handler.
sub initial_ext_ent_handler {
  unless ($XML::Parser::LWP_load_failed) {
    local($^W) = 0;

    my $stat =
      eval {
        require('XML/Parser/LWPExternEnt.pl');
      };

    if ($stat) {
      $_[0]->setHandlers(ExternEnt    => \&lwp_ext_ent_handler,
                         ExternEntFin => \&lwp_ext_ent_cleanup);

      # goto &sub keeps the current @_ for the real handler.
      goto &lwp_ext_ent_handler;
    }

    # Failed to load lwp handler, act as if NoLWP

    $XML::Parser::LWP_load_failed = 1;

    my $cmsg = "Couldn't load LWP based external entity handler\n";
    $cmsg .= "Switching to file-based external entity handler\n";
    $cmsg .= " (To avoid this message, use NoLWP option to XML::Parser)\n";
    warn($cmsg);
  }

  $_[0]->setHandlers(ExternEnt    => \&file_ext_ent_handler,
                     ExternEntFin => \&file_ext_ent_cleanup);
  goto &file_ext_ent_handler;

}  # End initial_ext_ent_handler
277
# File based ExternEnt handler.
#
# Arguments: the Expat object, the current base (may be undef) and the
# system id of the entity.  A relative system id is resolved against the
# base by replacing the base's last path component.
#
# On success returns an open IO::File handle, after pushing the previous
# base and the handle onto the object's _BaseStack and _FhStack and setting
# the object's base to the resolved path.  On failure appends a message to
# the object's ErrorMessage and returns undef.
sub file_ext_ent_handler {
  my ($xp, $base, $path) = @_;

  # Prepend base only for relative paths

  if (defined($base)
      and $path !~ m!^(?:[\\/]|\w+:)!)
    {
      my $newpath = $base;
      $newpath =~ s{[^\\/:]*$}{$path};
      $path = $newpath;
    }

  if ($path =~ /^\s*[|>+]/
      or $path =~ /\|\s*$/) {
    $xp->{ErrorMessage}
        .= "System ID ($path) contains Perl IO control characters";
    return undef;
  }

  # An explicit read mode keeps IO::File from interpreting anything in the
  # path itself as an open mode (the path comes from the document).
  my $fh = IO::File->new($path, 'r');
  unless (defined $fh) {
    $xp->{ErrorMessage}
      .= "Failed to open $path:\n$!";
    return undef;
  }

  $xp->{_BaseStack} ||= [];
  $xp->{_FhStack}   ||= [];

  # Saved so that file_ext_ent_cleanup can close the handle and restore
  # the previous base.
  push(@{$xp->{_BaseStack}}, $base);
  push(@{$xp->{_FhStack}}, $fh);

  $xp->base($path);

  return $fh;
}  # End file_ext_ent_handler
315
# File based ExternEntFin handler: undoes what file_ext_ent_handler did.
#
# Closes the most recently pushed file handle and restores the most
# recently pushed base on the Expat object.  If nothing was pushed (for
# example because a different ExternEnt handler opened the entity), it
# returns without touching the object instead of dying on an undefined
# handle.
sub file_ext_ent_cleanup {
  my ($xp) = @_;

  my $fhs   = $xp->{_FhStack}   || [];
  my $bases = $xp->{_BaseStack} || [];

  if (@$fhs) {
    my $fh = pop(@$fhs);
    $fh->close if defined $fh;
  }

  # A saved base may legitimately be undef, so test the stack depth rather
  # than the popped value.
  return unless @$bases;

  my $base = pop(@$bases);
  $xp->base($base);
}  # End file_ext_ent_cleanup
325
326###################################################################
327
package XML::Parser::Debug;
$XML::Parser::Built_In_Styles{Debug} = 1;

# Built-in "Debug" style.  Every handler prints one line to STDERR that
# starts with the space-joined elements of the Expat object's Context list.

# Start handler: prints the context, two backslashes and the attribute
# name/value list in parentheses.
sub Start {
  my $expat = shift;
  shift;                        # element name, not printed separately
  print STDERR "@{$expat->{Context}} \\\\ (@_)\n";
}

# End handler: prints the context followed by "//".
sub End {
  my $expat = shift;
  print STDERR "@{$expat->{Context}} //\n";
}

# Char handler: prints the context, "||" and the text.  Characters in the
# range \x80-\xff are shown as "#xHH;" and tabs/newlines as "#9;"/"#10;".
sub Char {
  my $expat = shift;
  my $text  = shift;
  $text =~ s/([\x80-\xff])/sprintf "#x%X;", ord $1/eg;
  $text =~ s/([\t\n])/sprintf "#%d;", ord $1/eg;
  print STDERR "@{$expat->{Context}} || $text\n";
}

# Proc handler: prints the context followed by "target(data)".
sub Proc {
  my $expat  = shift;
  my $target = shift;
  my $text   = shift;
  my @foo = @{$expat->{Context}};
  print STDERR "@foo $target($text)\n";
}
358
359###################################################################
360
package XML::Parser::Subs;
$XML::Parser::Built_In_Styles{Subs} = 1;

# Built-in "Subs" style.  Element events are dispatched to subs named after
# the element in the package given by the Expat object's Pkg entry.
#
# Both handlers make the call inside eval, so a missing sub is ignored.
# NOTE(review): the eval also discards any exception raised inside the
# called sub; kept as is because callers may rely on it.

# Start handler: calls Pkg::TAG with the Expat object, the element name and
# the attribute name/value list.
sub Start {
  no strict 'refs';
  my $expat = shift;
  my $tag   = shift;
  my $sub   = $expat->{Pkg} . "::$tag";
  eval { &$sub($expat, $tag, @_) };
}

# End handler: calls Pkg::TAG_ (element name plus an underscore) with the
# Expat object and the element name.
sub End {
  no strict 'refs';
  my $expat = shift;
  my $tag   = shift;
  my $sub   = $expat->{Pkg} . "::${tag}_";
  eval { &$sub($expat, $tag) };
}
379
380###################################################################
381
package XML::Parser::Tree;
$XML::Parser::Built_In_Styles{Tree} = 1;

# Built-in "Tree" style.  Builds nested array refs of tag/content pairs in
# $expat->{Tree}.  An element's content is an array ref whose first item is
# the attribute hash; text is stored under the pseudo-tag 0.
#
# Working state kept on the Expat object while parsing:
#   Curlist - the list that is currently being appended to
#   Lists   - stack of the enclosing lists

# Init handler: start with an empty tree and an empty stack.
sub Init {
  my $expat = shift;
  $expat->{Lists}   = [];
  $expat->{Curlist} = $expat->{Tree} = [];
}

# Start handler: append "tag => [ {attributes} ]" to the current list and
# make the new content list current.
sub Start {
  my $expat   = shift;
  my $tag     = shift;
  my $newlist = [ { @_ } ];
  push @{ $expat->{Lists} }, $expat->{Curlist};
  push @{ $expat->{Curlist} }, $tag => $newlist;
  $expat->{Curlist} = $newlist;
}

# End handler: make the enclosing list current again.
sub End {
  my $expat = shift;
  $expat->{Curlist} = pop @{ $expat->{Lists} };
}

# Char handler: append the text to the preceding text node if the last
# pair in the current list is one, otherwise add a new "0 => text" pair.
sub Char {
  my $expat = shift;
  my $text  = shift;
  my $clist = $expat->{Curlist};
  my $pos   = $#$clist;

  if ($pos > 0 and $clist->[$pos - 1] eq '0') {
    $clist->[$pos] .= $text;
  } else {
    push @$clist, 0 => $text;
  }
}

# Final handler: drop the working state and return the tree.
sub Final {
  my $expat = shift;
  delete $expat->{Curlist};
  delete $expat->{Lists};
  $expat->{Tree};
}
425
426###################################################################
427
package XML::Parser::Objects;
$XML::Parser::Built_In_Styles{Objects} = 1;

# Built-in "Objects" style.  Builds a tree of blessed hashes in
# $expat->{Tree}.  Each element becomes a hash blessed into "Pkg::TAG" that
# holds its attributes plus a Kids array ref; text becomes a hash blessed
# into "Pkg::Characters" with a Text entry.  Pkg is the Expat object's Pkg
# entry.
#
# Working state kept on the Expat object while parsing:
#   Curlist - the list that is currently being appended to
#   Lists   - stack of the enclosing lists

# Init handler: start with an empty tree and an empty stack.
sub Init {
  my $expat = shift;
  $expat->{Lists}   = [];
  $expat->{Curlist} = $expat->{Tree} = [];
}

# Start handler: append a new element object to the current list and make
# its Kids list current.  Kids is placed after the attributes in the hash
# constructor, so an attribute called "Kids" is overwritten.
sub Start {
  my $expat   = shift;
  my $tag     = shift;
  my $newlist = [ ];
  my $class   = "${$expat}{Pkg}::$tag";
  my $newobj  = bless { @_, Kids => $newlist }, $class;
  push @{ $expat->{Lists} }, $expat->{Curlist};
  push @{ $expat->{Curlist} }, $newobj;
  $expat->{Curlist} = $newlist;
}

# End handler: make the enclosing list current again.
sub End {
  my $expat = shift;
  $expat->{Curlist} = pop @{ $expat->{Lists} };
}

# Char handler: append the text to the last object in the current list if
# that is a Characters object, otherwise add a new Characters object.
sub Char {
  my $expat = shift;
  my $text  = shift;
  my $class = "${$expat}{Pkg}::Characters";
  my $clist = $expat->{Curlist};
  my $pos   = $#$clist;

  if ($pos >= 0 and ref($clist->[$pos]) eq $class) {
    $clist->[$pos]->{Text} .= $text;
  } else {
    push @$clist, bless { Text => $text }, $class;
  }
}

# Final handler: drop the working state and return the tree.
sub Final {
  my $expat = shift;
  delete $expat->{Curlist};
  delete $expat->{Lists};
  $expat->{Tree};
}
474
475################################################################
476
package XML::Parser::Stream;
$XML::Parser::Built_In_Styles{Stream} = 1;

# This style invented by Tim Bray <tbray@textuality.com>
#
# Built-in "Stream" style.  For each event the handler puts a textual form
# of the event into $_ and then calls a sub in the package given by the
# Expat object's Pkg entry (StartDocument, StartTag, EndTag, Text, PI,
# EndDocument).  If the tag, text or PI sub is not defined, $_ is printed
# to the currently selected output handle instead.  Character data is
# buffered in $expat->{Text} and flushed by doText before the next tag or
# processing instruction.
#
# The handlers assign to the global $_ (and Start also to %_) without
# localizing them; that is how the values reach the called subs.

# Init handler: reset the text buffer and call Pkg::StartDocument if it is
# defined.
sub Init {
  no strict 'refs';
  my $expat = shift;
  $expat->{Text} = '';
  my $sub = $expat->{Pkg} ."::StartDocument";
  &$sub($expat)
    if defined(&$sub);
}

# Start handler: flush pending text, build the start tag in $_ and the
# attributes in %_, then call Pkg::StartTag with the Expat object and the
# element type, or print $_.
sub Start {
  no strict 'refs';
  my $expat = shift;
  my $type  = shift;

  doText($expat);
  $_ = "<$type";

  # %_ must be filled before the loop below consumes @_.
  %_ = @_;
  while (@_) {
    $_ .= ' ' . shift() . '="' . shift() . '"';
  }
  $_ .= '>';

  my $sub = $expat->{Pkg} . "::StartTag";
  if (defined(&$sub)) {
    &$sub($expat, $type);
  } else {
    print;
  }
}

# End handler: flush pending text, build the end tag in $_ and call
# Pkg::EndTag with the Expat object and the element type, or print $_.
sub End {
  no strict 'refs';
  my $expat = shift;
  my $type  = shift;

  # Set right context for Text handler
  push(@{$expat->{Context}}, $type);
  doText($expat);
  pop(@{$expat->{Context}});

  $_ = "</$type>";

  my $sub = $expat->{Pkg} . "::EndTag";
  if (defined(&$sub)) {
    &$sub($expat, $type);
  } else {
    print;
  }
}

# Char handler: append the text to the buffer; nothing is emitted yet.
sub Char {
  my $expat = shift;
  $expat->{Text} .= shift;
}

# Proc handler: flush pending text, build the processing instruction in $_
# and call Pkg::PI with the Expat object, the target and the data, or
# print $_.
sub Proc {
  no strict 'refs';
  my $expat  = shift;
  my $target = shift;
  my $text   = shift;

  doText($expat);

  $_ = "<?$target $text?>";

  my $sub = $expat->{Pkg} . "::PI";
  if (defined(&$sub)) {
    &$sub($expat, $target, $text);
  } else {
    print;
  }
}

# Final handler: call Pkg::EndDocument if it is defined; its result is the
# result of this handler.
sub Final {
  no strict 'refs';
  my $expat = shift;
  my $sub = $expat->{Pkg} . "::EndDocument";
  &$sub($expat)
    if defined(&$sub);
}

# Helper: if the text buffer is not empty, put it in $_, call Pkg::Text
# with the Expat object (or print $_), then empty the buffer.
sub doText {
  no strict 'refs';
  my $expat = shift;
  $_ = $expat->{Text};

  if (length($_)) {
    my $sub = $expat->{Pkg} . "::Text";
    if (defined(&$sub)) {
      &$sub($expat);
    } else {
      print;
    }

    $expat->{Text} = '';
  }
}
580
5811;
582
583__END__
584
585=head1 NAME
586
587XML::Parser - A perl module for parsing XML documents
588
589=head1 SYNOPSIS
590
591  use XML::Parser;
592 
593  $p1 = new XML::Parser(Style => 'Debug');
594  $p1->parsefile('REC-xml-19980210.xml');
595  $p1->parse('<foo id="me">Hello World</foo>');
596
597  # Alternative
598  $p2 = new XML::Parser(Handlers => {Start => \&handle_start,
599                                     End   => \&handle_end,
600                                     Char  => \&handle_char});
601  $p2->parse($socket);
602
603  # Another alternative
604  $p3 = new XML::Parser(ErrorContext => 2);
605
606  $p3->setHandlers(Char    => \&text,
607                   Default => \&other);
608
609  open(FOO, 'xmlgenerator |');
610  $p3->parse(*FOO, ProtocolEncoding => 'ISO-8859-1');
611  close(FOO);
612
613  $p3->parsefile('junk.xml', ErrorContext => 3);
614
615=begin man
616.ds PI PI
617
618=end man
619
620=head1 DESCRIPTION
621
622This module provides ways to parse XML documents. It is built on top of
623L<XML::Parser::Expat>, which is a lower level interface to James Clark's
624expat library. Each call to one of the parsing methods creates a new
625instance of XML::Parser::Expat which is then used to parse the document.
626Expat options may be provided when the XML::Parser object is created.
627These options are then passed on to the Expat object on each parse call.
628They can also be given as extra arguments to the parse methods, in which
629case they override options given at XML::Parser creation time.
630
631The behavior of the parser is controlled either by C<L</Style>> and/or
632C<L</Handlers>> options, or by L</setHandlers> method. These all provide
633mechanisms for XML::Parser to set the handlers needed by XML::Parser::Expat.
634If neither C<Style> nor C<Handlers> are specified, then parsing just
635checks the document for being well-formed.
636
637When underlying handlers get called, they receive as their first parameter
638the I<Expat> object, not the Parser object.
639
640=head1 METHODS
641
642=over 4
643
644=item new
645
646This is a class method, the constructor for XML::Parser. Options are passed
647as keyword value pairs. Recognized options are:
648
649=over 4
650
651=item * Style
652
653This option provides an easy way to create a given style of parser. The
654built in styles are: L<"Debug">, L<"Subs">, L<"Tree">, L<"Objects">,
655and L<"Stream">.
656Custom styles can be provided by giving a full package name containing
657at least one '::'. This package should then have subs defined for each
658handler it wishes to have installed. See L<"STYLES"> below
659for a discussion of each built in style.
660
661=item * Handlers
662
663When provided, this option should be an anonymous hash containing as
664keys the type of handler and as values a sub reference to handle that
665type of event. All the handlers get passed as their 1st parameter the
666instance of expat that is parsing the document. Further details on
667handlers can be found in L<"HANDLERS">. Any handler set here
668overrides the corresponding handler set with the Style option.
669
670=item * Pkg
671
672Some styles will refer to subs defined in this package. If not provided,
673it defaults to the package which called the constructor.
674
675=item * ErrorContext
676
677This is an Expat option. When this option is defined, errors are reported
678in context. The value should be the number of lines to show on either side
679of the line in which the error occurred.
680
681=item * ProtocolEncoding
682
683This is an Expat option. This sets the protocol encoding name. It defaults
684to none. The built-in encodings are: C<UTF-8>, C<ISO-8859-1>, C<UTF-16>, and
685C<US-ASCII>. Other encodings may be used if they have encoding maps in one
686of the directories in the @Encoding_Path list. Check L<"ENCODINGS"> for
687more information on encoding maps. Setting the protocol encoding overrides
688any encoding in the XML declaration.
689
690=item * Namespaces
691
692This is an Expat option. If this is set to a true value, then namespace
693processing is done during the parse. See L<XML::Parser::Expat/"Namespaces">
694for further discussion of namespace processing.
695
696=item * NoExpand
697
698This is an Expat option. Normally, the parser will try to expand references
699to entities defined in the internal subset. If this option is set to a true
700value, and a default handler is also set, then the default handler will be
701called when an entity reference is seen in text. This has no effect if a
702default handler has not been registered, and it has no effect on the expansion
703of entity references inside attribute values.
704
705=item * Stream_Delimiter
706
707This is an Expat option. It takes a string value. When this string is found
708alone on a line while parsing from a stream, then the parse is ended as if it
709saw an end of file. The intended use is with a stream of xml documents in a
710MIME multipart format. The string should not contain a trailing newline.
711
712=item * ParseParamEnt
713
714This is an Expat option. Unless standalone is set to "yes" in the XML
715declaration, setting this to a true value allows the external DTD to be read,
716and parameter entities to be parsed and expanded.
717
718=item * NoLWP
719
720This option has no effect if the ExternEnt or ExternEntFin handlers are
721directly set. Otherwise, if true, it forces the use of a file based external
722entity handler.
723
=item * Non_Expat_Options
725
726If provided, this should be an anonymous hash whose keys are options that
727shouldn't be passed to Expat. This should only be of concern to those
728subclassing XML::Parser.
729
730=back
731
732=item  setHandlers(TYPE, HANDLER [, TYPE, HANDLER [...]])
733
734This method registers handlers for various parser events. It overrides any
previous handlers registered through the Style or Handlers options or through
736earlier calls to setHandlers. By providing a false or undefined value as
737the handler, the existing handler can be unset.
738
739This method returns a list of type, handler pairs corresponding to the
740input. The handlers returned are the ones that were in effect prior to
741the call.
742
743See a description of the handler types in L<"HANDLERS">.
744
745=item parse(SOURCE [, OPT => OPT_VALUE [...]])
746
747The SOURCE parameter should either be a string containing the whole XML
748document, or it should be an open IO::Handle. Constructor options to
749XML::Parser::Expat given as keyword-value pairs may follow the SOURCE
750parameter. These override, for this call, any options or attributes passed
751through from the XML::Parser instance.
752
753A die call is thrown if a parse error occurs. Otherwise it will return 1
754or whatever is returned from the B<Final> handler, if one is installed.
755In other words, what parse may return depends on the style.
756
757=item parsestring
758
759This is just an alias for parse for backwards compatibility.
760
761=item parsefile(FILE [, OPT => OPT_VALUE [...]])
762
763Open FILE for reading, then call parse with the open handle. The file
764is closed no matter how parse returns. Returns what parse returns.
765
766=item parse_start([ OPT => OPT_VALUE [...]])
767
768Create and return a new instance of XML::Parser::ExpatNB. Constructor
769options may be provided. If an init handler has been provided, it is
770called before returning the ExpatNB object. Documents are parsed by
771making incremental calls to the parse_more method of this object, which
772takes a string. A single call to the parse_done method of this object,
773which takes no arguments, indicates that the document is finished.
774
775If there is a final handler installed, it is executed by the parse_done
776method before returning and the parse_done method returns whatever is
777returned by the final handler.
778
779=back
780
781=head1 HANDLERS
782
783Expat is an event based parser. As the parser recognizes parts of the
784document (say the start or end tag for an XML element), then any handlers
785registered for that type of an event are called with suitable parameters.
786All handlers receive an instance of XML::Parser::Expat as their first
787argument. See L<XML::Parser::Expat/"METHODS"> for a discussion of the
788methods that can be called on this object.
789
790=head2 Init             (Expat)
791
792This is called just before the parsing of the document starts.
793
794=head2 Final            (Expat)
795
796This is called just after parsing has finished, but only if no errors
797occurred during the parse. Parse returns what this returns.
798
799=head2 Start            (Expat, Element [, Attr, Val [,...]])
800
801This event is generated when an XML start tag is recognized. Element is the
802name of the XML element type that is opened with the start tag. The Attr &
803Val pairs are generated for each attribute in the start tag.
804
805=head2 End              (Expat, Element)
806
807This event is generated when an XML end tag is recognized. Note that
808an XML empty tag (<foo/>) generates both a start and an end event.
809
810=head2 Char             (Expat, String)
811
812This event is generated when non-markup is recognized. The non-markup
813sequence of characters is in String. A single non-markup sequence of
814characters may generate multiple calls to this handler. Whatever the
815encoding of the string in the original document, this is given to the
816handler in UTF-8.
817
818=head2 Proc             (Expat, Target, Data)
819
820This event is generated when a processing instruction is recognized.
821
822=head2 Comment          (Expat, Data)
823
824This event is generated when a comment is recognized.
825
826=head2 CdataStart       (Expat)
827
828This is called at the start of a CDATA section.
829
830=head2 CdataEnd         (Expat)
831
832This is called at the end of a CDATA section.
833
834=head2 Default          (Expat, String)
835
836This is called for any characters that don't have a registered handler.
837This includes both characters that are part of markup for which no
838events are generated (markup declarations) and characters that
839could generate events, but for which no handler has been registered.
840
841Whatever the encoding in the original document, the string is returned to
842the handler in UTF-8.
843
844=head2 Unparsed         (Expat, Entity, Base, Sysid, Pubid, Notation)
845
846This is called for a declaration of an unparsed entity. Entity is the name
847of the entity. Base is the base to be used for resolving a relative URI.
848Sysid is the system id. Pubid is the public id. Notation is the notation
849name. Base and Pubid may be undefined.
850
851=head2 Notation         (Expat, Notation, Base, Sysid, Pubid)
852
853This is called for a declaration of notation. Notation is the notation name.
854Base is the base to be used for resolving a relative URI. Sysid is the system
855id. Pubid is the public id. Base, Sysid, and Pubid may all be undefined.
856
857=head2 ExternEnt        (Expat, Base, Sysid, Pubid)
858
859This is called when an external entity is referenced. Base is the base to be
860used for resolving a relative URI. Sysid is the system id. Pubid is the public
id. Base and Pubid may be undefined.
862
863This handler should either return a string, which represents the contents of
864the external entity, or return an open filehandle that can be read to obtain
865the contents of the external entity, or return undef, which indicates the
866external entity couldn't be found and will generate a parse error.
867
868If an open filehandle is returned, it must be returned as either a glob
869(*FOO) or as a reference to a glob (e.g. an instance of IO::Handle).
870
A default handler is installed for this event. Unless the NoLWP option was
provided with a true value, the default handler is
XML::Parser::lwp_ext_ent_handler; with NoLWP set, it is
XML::Parser::file_ext_ent_handler. Even without the NoLWP option, if the
875URI or LWP modules are missing, the file based handler ends up being used
876after giving a warning on the first external entity reference.
877
878The LWP external entity handler will use proxies defined in the environment
879(http_proxy, ftp_proxy, etc.).
880
881Please note that the LWP external entity handler reads the entire
entity into a string and returns it, whereas the file handler opens a
883filehandle.
884
885Also note that the file external entity handler will likely choke on
886absolute URIs or file names that don't fit the conventions of the local
887operating system.
888
889The expat base method can be used to set a basename for
890relative pathnames. If no basename is given, or if the basename is itself
891a relative name, then it is relative to the current working directory.
892
893=head2 ExternEntFin     (Expat)
894
895This is called after parsing an external entity. It's not called unless
896an ExternEnt handler is also set. There is a default handler installed
897that pairs with the default ExternEnt handler.
898
899If you're going to install your own ExternEnt handler, then you should
900set (or unset) this handler too.
901
902=head2 Entity           (Expat, Name, Val, Sysid, Pubid, Ndata, IsParam)
903
904This is called when an entity is declared. For internal entities, the Val
905parameter will contain the value and the remaining three parameters will be
906undefined. For external entities, the Val parameter will be undefined, the
907Sysid parameter will have the system id, the Pubid parameter will have the
908public id if it was provided (it will be undefined otherwise), the Ndata
909parameter will contain the notation for unparsed entities. If this is a
910parameter entity declaration, then the IsParam parameter is true.
911
912Note that this handler and the Unparsed handler above overlap. If both are
913set, then this handler will not be called for unparsed entities.
914
915=head2 Element          (Expat, Name, Model)
916
917The element handler is called when an element declaration is found. Name
is the element name, and Model is the content model as an
XML::Parser::ContentModel
919object. See L<XML::Parser::Expat/"XML::Parser::ContentModel Methods">
920for methods available for this class.
921
922=head2 Attlist          (Expat, Elname, Attname, Type, Default, Fixed)
923
924This handler is called for each attribute in an ATTLIST declaration.
925So an ATTLIST declaration that has multiple attributes will generate multiple
926calls to this handler. The Elname parameter is the name of the element with
927which the attribute is being associated. The Attname parameter is the name
928of the attribute. Type is the attribute type, given as a string. Default is
929the default value, which will either be "#REQUIRED", "#IMPLIED" or a quoted
930string (i.e. the returned string will begin and end with a quote character).
931If Fixed is true, then this is a fixed attribute.
932
933=head2 Doctype          (Expat, Name, Sysid, Pubid, Internal)
934
935This handler is called for DOCTYPE declarations. Name is the document type
936name. Sysid is the system id of the document type, if it was provided,
937otherwise it's undefined. Pubid is the public id of the document type,
938which will be undefined if no public id was given. Internal is the internal
939subset, given as a string. If there was no internal subset, it will be
940undefined. Internal will contain all whitespace, comments, processing
941instructions, and declarations seen in the internal subset. The declarations
942will be there whether or not they have been processed by another handler
943(except for unparsed entities processed by the Unparsed handler). However,
944comments and processing instructions will not appear if they've been processed
945by their respective handlers.
946
=head2 DoctypeFin       (Expat)
948
949This handler is called after parsing of the DOCTYPE declaration has finished,
950including any internal or external DTD declarations.
951
952=head2 XMLDecl          (Expat, Version, Encoding, Standalone)
953
This handler is called for xml declarations. Version is a string containing
955the version. Encoding is either undefined or contains an encoding string.
956Standalone will be either true, false, or undefined if the standalone attribute
957is yes, no, or not made respectively.
958
959=head1 STYLES
960
961=head2 Debug
962
963This just prints out the document in outline form. Nothing special is
964returned by parse.
965
966=head2 Subs
967
968Each time an element starts, a sub by that name in the package specified
969by the Pkg option is called with the same parameters that the Start
970handler gets called with.
971
972Each time an element ends, a sub with that name appended with an underscore
973("_"), is called with the same parameters that the End handler gets called
974with.
975
976Nothing special is returned by parse.
977
978=head2 Tree
979
980Parse will return a parse tree for the document. Each node in the tree
981takes the form of a tag, content pair. Text nodes are represented with
982a pseudo-tag of "0" and the string that is their content. For elements,
983the content is an array reference. The first item in the array is a
984(possibly empty) hash reference containing attributes. The remainder of
985the array is a sequence of tag-content pairs representing the content
986of the element.
987
988So for example the result of parsing:
989
990  <foo><head id="a">Hello <em>there</em></head><bar>Howdy<ref/></bar>do</foo>
991
992would be:
993             Tag   Content
994  ==================================================================
995  [foo, [{}, head, [{id => "a"}, 0, "Hello ",  em, [{}, 0, "there"]],
996              bar, [         {}, 0, "Howdy",  ref, [{}]],
997                0, "do"
998        ]
999  ]
1000
1001The root document "foo", has 3 children: a "head" element, a "bar"
1002element and the text "do". After the empty attribute hash, these are
represented in its contents by 3 tag-content pairs.
1004
1005=head2 Objects
1006
1007This is similar to the Tree style, except that a hash object is created for
1008each element. The corresponding object will be in the class whose name
1009is created by appending "::" and the element name to the package set with
1010the Pkg option. Non-markup text will be in the ::Characters class. The
1011contents of the corresponding object will be in an anonymous array that
1012is the value of the Kids property for that object.
1013
1014=head2 Stream
1015
1016This style also uses the Pkg package. If none of the subs that this
1017style looks for is there, then the effect of parsing with this style is
1018to print a canonical copy of the document without comments or declarations.
1019All the subs receive as their 1st parameter the Expat instance for the
1020document they're parsing.
1021
1022It looks for the following routines:
1023
1024=over 4
1025
1026=item * StartDocument
1027
Called at the start of the parse.
1029
1030=item * StartTag
1031
1032Called for every start tag with a second parameter of the element type. The $_
1033variable will contain a copy of the tag and the %_ variable will contain
1034attribute values supplied for that element.
1035
1036=item * EndTag
1037
1038Called for every end tag with a second parameter of the element type. The $_
1039variable will contain a copy of the end tag.
1040
1041=item * Text
1042
1043Called just before start or end tags with accumulated non-markup text in
1044the $_ variable.
1045
1046=item * PI
1047
1048Called for processing instructions. The $_ variable will contain a copy of
1049the PI and the target and data are sent as 2nd and 3rd parameters
1050respectively.
1051
1052=item * EndDocument
1053
1054Called at conclusion of the parse.
1055
1056=back
1057
1058=head1 ENCODINGS
1059
1060XML documents may be encoded in character sets other than Unicode as
1061long as they may be mapped into the Unicode character set. Expat has
1062further restrictions on encodings. Read the xmlparse.h header file in
1063the expat distribution to see details on these restrictions.
1064
1065Expat has built-in encodings for: C<UTF-8>, C<ISO-8859-1>, C<UTF-16>, and
1066C<US-ASCII>. Encodings are set either through the XML declaration
1067encoding attribute or through the ProtocolEncoding option to XML::Parser
1068or XML::Parser::Expat.
1069
1070For encodings other than the built-ins, expat calls the function
1071load_encoding in the Expat package with the encoding name. This function
1072looks for a file in the path list @XML::Parser::Expat::Encoding_Path, that
1073matches the lower-cased name with a '.enc' extension. The first one it
1074finds, it loads.
1075
1076If you wish to build your own encoding maps, check out the XML::Encoding
1077module from CPAN.
1078
1079=head1 AUTHORS
1080
1081Larry Wall <F<larry@wall.org>> wrote version 1.0.
1082
1083Clark Cooper <F<coopercc@netheaven.com>> picked up support, changed the API
1084for this version (2.x), provided documentation,
1085and added some standard package features.
1086
1087=cut
Note: See TracBrowser for help on using the repository browser.