XML-Filter-Sort-1.01/0000755000175000017500000000000010231533634014513 5ustar grantgrant00000000000000XML-Filter-Sort-1.01/lib/0000755000175000017500000000000010231533634015261 5ustar grantgrant00000000000000XML-Filter-Sort-1.01/lib/XML/0000755000175000017500000000000010231533634015721 5ustar grantgrant00000000000000XML-Filter-Sort-1.01/lib/XML/Filter/0000755000175000017500000000000010231533634017146 5ustar grantgrant00000000000000XML-Filter-Sort-1.01/lib/XML/Filter/Sort/0000755000175000017500000000000010231533634020075 5ustar grantgrant00000000000000XML-Filter-Sort-1.01/lib/XML/Filter/Sort/DiskBuffer.pm0000644000175000017500000000717007502452245022471 0ustar grantgrant00000000000000# $Id: DiskBuffer.pm,v 1.1.1.1 2002/06/14 20:40:05 grantm Exp $ package XML::Filter::Sort::DiskBuffer; use strict; require XML::Filter::Sort::Buffer; use Storable; ############################################################################## # G L O B A L V A R I A B L E S ############################################################################## use vars qw($VERSION @ISA); $VERSION = '0.91'; @ISA = qw(XML::Filter::Sort::Buffer); ############################################################################## # M E T H O D S ############################################################################## ############################################################################## # Method: freeze() # # Serialises a buffer and either writes it to a supplied file descriptor or # returns it as a scalar. # # If a list of sort key values is supplied (presumably a filtered version), it # will replace any values currently stored in the object. # sub freeze { my $self = shift; my $fd = shift; $self->{key_values} = [ @_ ] if(@_); my $data = Storable::freeze( [ $self->{key_values}, $self->{tree} ] ); if($fd) { $fd->print(pack('L', length($data))); $fd->print($data); return; } return($data); } ############################################################################## # Constructor: thaw() # # Alternative constructor for reconstructing buffer objects serialised using # Storable.pm. Argument can be a scalar containing the raw serialised data, or # a filehandle from which the next object will be read. # Returns false on EOF. # If called in a list context, returns the thawed object followed by an integer # approximating the object's in-memory byte count. # sub thaw { my $class = shift; my $data = shift; if(ref($data)) { # Read the data from the file if required my $fd = $data; $fd->read($data, 4) || return; my($size) = unpack('L', $data); $fd->read($data, $size) || return; } my $ref = Storable::thaw($data); my $self = bless( { tree => $ref->[1], key_values => $ref->[0], }, $class); if(wantarray) { my $size = length($data) * 2; # Approximation of in-memory size return($self, $size); } else { return($self); } } ############################################################################## # Method: close() # # Returns keys if this is a thawed buffer or calls base class method to get # keys if buffer has never been frozen. # sub close { my $self = shift; unless($self->{key_values}) { $self->{key_values} = [ $self->SUPER::close() ]; } return(@{$self->{key_values}}); } ############################################################################## # Method: key_values() # # Returns the stored values for each of the sort keys. In a scalar context, # returns a reference to an array of key values. # sub key_values { my $self = shift; return(@{$self->{key_values}}) if(wantarray); return($self->{key_values}); } 1; __END__ =head1 NAME XML::Filter::Sort::DiskBuffer - Implementation class used by XML::Filter::Sort =head1 DESCRIPTION The documentation is targetted at developers wishing to extend or replace this class. For user documentation, see L. For an overview of the classes and methods used for buffering, see L. =head1 METHODS This class inherits from B and adds the following methods: ... =head1 COPYRIGHT Copyright 2002 Grant McLean Egrantm@cpan.orgE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut XML-Filter-Sort-1.01/lib/XML/Filter/Sort/DiskBufferMgr.pm0000644000175000017500000002522107502452251023131 0ustar grantgrant00000000000000# $Id: DiskBufferMgr.pm,v 1.1.1.1 2002/06/14 20:40:09 grantm Exp $ package XML::Filter::Sort::DiskBufferMgr; use strict; require XML::Filter::Sort::BufferMgr; require XML::Filter::Sort::DiskBuffer; use IO::File; use File::Spec; use File::Path; use File::Temp qw(tempdir); ############################################################################## # G L O B A L V A R I A B L E S ############################################################################## use vars qw($VERSION @ISA); $VERSION = '0.91'; @ISA = qw(XML::Filter::Sort::BufferMgr); use constant DEF_BUCKET_SIZE => 1024 * 1024 * 10; use constant STREAM_FILENAME => 0; use constant STREAM_FILENUM => 1; use constant STREAM_FILEDESC => 2; use constant STREAM_BUFFER => 3; use constant STREAM_KEYS => 4; ############################################################################## # M E T H O D S ############################################################################## ############################################################################## # Constructor: new() # # Extends base class constructor by adding tests for required options. # sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self = $class->SUPER::new(@_); # Check/clean up supplied options if(!ref($proto) and !$self->{TempDir}) { die "You must set the 'TempDir' option for disk buffering"; } $self->{MaxMem} = DEF_BUCKET_SIZE unless($self->{MaxMem}); # Initialise structures if($self->{TempDir}) { $self->{_temp_dir} = tempdir( DIR => $self->{TempDir}); $self->{buffered_bytes} = 0; } return(bless($self,$class)); } ############################################################################## # Destructor: DESTROY() # # Cleans up the temporary directory. # sub DESTROY { my $self = shift; return unless($self->{_temp_dir}); rmtree($self->{_temp_dir}); } ############################################################################ # Method: new_buffer() # # Creates and returns an object for buffering a single record. # sub new_buffer { my $self = shift; my %opt = ( Keys => $self->{Keys} ); if($self->{_match_subs}) { $opt{_match_subs} = [ @{$self->{_match_subs}} ]; } return(XML::Filter::Sort::DiskBuffer->new(%opt)); } ############################################################################## # Method: close_buffer() # # Takes a buffer, calls its close() method to get the frozen representation of # the buffer and the list of sort key values. Filters the key values and # stores the frozen buffer using those values. If the accumulated frozen # buffers exceed the configured threshold, they will all be serialised out to # a disk file. # sub close_buffer { my $self = shift; my $record = shift; my @sort_keys = $record->close(); @sort_keys = $self->fix_keys(@sort_keys); my $data = $record->freeze(undef, @sort_keys); $self->store($data, @sort_keys); $self->{buffered_bytes} += length($data); if($self->{buffered_bytes} >= $self->{MaxMem}) { $self->save_to_disk(); } } ############################################################################## # Method: save_to_disk() # # Checks for buffered records. If there are some, creates a disk file and # writes out the frozen buffers to it in sorted order. # sub save_to_disk { my $self = shift; my $fd = shift; # Create the file if required unless($fd) { return unless($self->{buffered_bytes}); $self->{files} = [ ] unless($self->{files}); my $count = @{$self->{files}}; my $filename = File::Spec->catfile($self->{_temp_dir}, $count); $fd = IO::File->new(">$filename") || die "Error creating temporary file ($filename): $!"; binmode($fd); $self->{files}->[$count] = $filename; } # Write out the records in sorted order my $keys = $self->sorted_keys(); foreach my $key (@$keys) { if(ref($self->{records}->{$key}) eq 'ARRAY') { foreach my $record (@{$self->{records}->{$key}}) { $fd->print(pack('L', length($record))); $fd->print($record); } } else { # it must be a XML::Filter::Sort::DiskBufferMgr $self->{records}->{$key}->save_to_disk($fd); } } $fd->close() if($self->{files}); $self->{records} = {}; $self->{buffered_bytes} = 0; } ############################################################################## # Method: to_sax() # # Streams buffered data back out as SAX events. # sub to_sax { my $self = shift; my $filter = shift; $self->save_to_disk(); # OPTIMISATION: sax_from_mem if no $self->{files} while(@{$self->{files}}) { $self->prepare_merge(); if(@{$self->{files}}) { $self->merge_to_disk(); } else { $self->merge_to_sax($filter); } } } ############################################################################## # Method: merge_to_sax() # # Takes the record from the head of the list and writes it out as SAX events; # takes the next record from that stream and repositions the stream in the # list; repeats until all streams empty. # sub merge_to_sax { my $self = shift; my $filter = shift; while(my $stream = pop @{$self->{streams}}) { $stream->[STREAM_BUFFER]->to_sax($filter); $stream->[STREAM_BUFFER] = XML::Filter::Sort::DiskBuffer->thaw($stream->[STREAM_FILEDESC]); if($stream->[STREAM_BUFFER]) { $stream->[STREAM_KEYS] = $stream->[STREAM_BUFFER]->key_values(); $self->push_stream($stream); } else { $stream->[STREAM_FILEDESC]->close(); unlink($stream->[STREAM_FILENAME]); } } } ############################################################################## # Method: prepare_merge() # # The merge process treats each temporary file as a 'stream' of records. A # linked list data structure (actually just an array - go Perl!) is used to # keep track of the next available record from each stream. This routine # builds the linked list by opening each temp file, reading the first record # and 'pushing' the stream down into the list. The record at the head of the # list will be first against the wall when the revolution comes. # sub prepare_merge { my $self = shift; my $buffered_bytes = 0; while(@{$self->{files}}) { my $filename = shift @{$self->{files}}; my($filenum) = ($filename =~ /(\d+)$/); my $fd = IO::File->new("<$filename") || die "Error opening temporary file ($filename): $!"; binmode($fd); my($buffer, $size) = XML::Filter::Sort::DiskBuffer->thaw($fd); die "Temporary file ($filename) unexpectedly empty" unless($buffer); my $keys = $buffer->key_values(); $self->push_stream( [ $filename, $filenum, $fd, $buffer, $keys ] ); $buffered_bytes += $size; if($buffered_bytes >= $self->{MaxMem} and @{$self->{streams}} > 1) { $self->merge_to_disk(); $buffered_bytes = 0; } } } ############################################################################## # Method: merge_to_disk() # # This routine is called from prepare_merge() if there are too many temporary # files to merge in one operation. Merges records from all the currently open # streams into a new temporary file and pushes the new filename onto the start # of the list of files. # sub merge_to_disk { my $self = shift; my $filename = File::Spec->catfile($self->{_temp_dir}, '0'); my $fd = IO::File->new(">$filename.tmp") || die "Error creating temporary file ($filename): $!"; binmode($fd); while(my $stream = pop @{$self->{streams}}) { $stream->[STREAM_BUFFER]->freeze($fd); $stream->[STREAM_BUFFER] = XML::Filter::Sort::DiskBuffer->thaw($stream->[STREAM_FILEDESC]); if($stream->[STREAM_BUFFER]) { $stream->[STREAM_KEYS] = $stream->[STREAM_BUFFER]->key_values(); $self->push_stream($stream); } else { $stream->[STREAM_FILEDESC]->close(); unlink($stream->[STREAM_FILENAME]); } } $fd->close(); rename("$filename.tmp", $filename); unshift @{$self->{files}}, $filename; } ############################################################################## # Method: push_stream() # # Inserts a 'stream' at its proper position in the 'linked list'. # sub push_stream { my $self = shift; my $stream = shift; # Create the list if it does not already exist; my $list = $self->{streams}; unless($list) { $self->{streams} = [ $stream ]; # Create the 'linked list' return; } # Push this record in above an existing one ... for(my $i = @$list - 1; $i >= 0; $i--) { if($self->stream_cmp($stream, $list->[$i]) == -1) { splice @$list, $i, 1, $list->[$i], $stream; return; } } # ... or push it right down to the bottom unshift @$list, $stream; } ############################################################################## # Method: stream_cmp() # # Used by the merge process to determine the sort order of the buffers at the # head of two streams. # Returns -1 or 1 depending on which one sorts first. (Never returns 0 since # as a last resort, file numbers are compared to give a stable sort). # sub stream_cmp { my($self, $streama, $streamb) = @_; my $result; for(my $k = 0; $k < @{$streama->[STREAM_KEYS]}; $k++) { my $cmp = $self->{Keys}->[$k]->[1]; my $dir = $self->{Keys}->[$k]->[2]; my $a = $streama->[STREAM_KEYS]->[$k]; my $b = $streamb->[STREAM_KEYS]->[$k]; # coderef sort comparator if(ref($cmp)) { if($dir eq 'desc') { $result = $cmp->($b, $a) and return($result); } else { $result = $cmp->($a, $b) and return($result); } } # numeric comparator elsif($cmp eq 'num') { if($dir eq 'desc') { $result = ($b <=> $a) and return($result); } else { $result = ($a <=> $b) and return($result); } } # alpha comparator (default) else { if($dir eq 'desc') { $result = ($b cmp $a) and return($result); } else { $result = ($a cmp $b) and return($result); } } } # Fall through to file number to ensure a stable sort return($streama->[STREAM_FILENUM] <=> $streamb->[STREAM_FILENUM]); } 1; __END__ =head1 NAME XML::Filter::Sort::DiskBufferMgr - Implementation class used by XML::Filter::Sort =head1 DESCRIPTION The documentation is targetted at developers wishing to extend or replace this class. For user documentation, see L. For an overview of the classes and methods used for buffering, see L. =head1 METHODS This class inherits from B and adds the following methods: ... =head1 COPYRIGHT Copyright 2002 Grant McLean Egrantm@cpan.orgE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut XML-Filter-Sort-1.01/lib/XML/Filter/Sort/BufferMgr.pm0000644000175000017500000002070207502452245022320 0ustar grantgrant00000000000000# $Id: BufferMgr.pm,v 1.1.1.1 2002/06/14 20:40:05 grantm Exp $ package XML::Filter::Sort::BufferMgr; use strict; require XML::Filter::Sort::Buffer; ############################################################################## # G L O B A L V A R I A B L E S ############################################################################## use vars qw($VERSION); $VERSION = '0.91'; ############################################################################## # M E T H O D S ############################################################################## ############################################################################## # Constructor: new() # # Allocates in-memory structures for buffering records. # sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self = { @_ }; $self->{records} = {}; return(bless($self, $class)); } ############################################################################## # Method: compile_matches() # # Returns a list of closures for matching each of the sort keys. # sub compile_matches { my $self = shift; return(XML::Filter::Sort::Buffer->compile_matches(@_)); } ############################################################################## # Method: new_buffer() # # Creates and returns an object for buffering a single record. # sub new_buffer { my $self = shift; my %opt = ( Keys => $self->{Keys} ); if($self->{_match_subs}) { $opt{_match_subs} = [ @{$self->{_match_subs}} ]; } return(XML::Filter::Sort::Buffer->new(%opt)); } ############################################################################## # Method: close_buffer() # # Takes a buffer, calls its close() method to get the sort key values, filters # the key values and stores the buffer using those values. # sub close_buffer { my $self = shift; my $record = shift; my @sort_keys = $record->close(); @sort_keys = $self->fix_keys(@sort_keys); $self->store($record, @sort_keys); } ############################################################################## # Method: fix_keys() # # Takes a list of sort key values and applies various fixes/cleanups to them. # sub fix_keys { my $self = shift; my @sort_keys = @_; if($self->{IgnoreCase}) { @sort_keys = map { lc($_) } @sort_keys; } if($self->{NormaliseKeySpace}) { foreach (@sort_keys) { s/^\s+//s; s/\s+$//s; s/\s+/ /sg; } } if($self->{KeyFilterSub}) { @sort_keys = $self->{KeyFilterSub}->(@sort_keys); } return(@sort_keys); } ############################################################################## # Method: store() # # Takes a buffer, and a series of key values. Stores the buffer using those # values. # sub store { my $self = shift; my $record = shift; my $key = shift; if(@_) { unless($self->{records}->{$key}) { my @key_defs = @{$self->{Keys}}; shift @key_defs; $self->{records}->{$key} = $self->new(Keys => \@key_defs); } $self->{records}->{$key}->store($record, @_); } else { unless($self->{records}->{$key}) { $self->{records}->{$key} = []; } push @{$self->{records}->{$key}}, $record; } } ############################################################################## # Method: to_sax() # # Takes a reference to the parent XML::Filter::Sort object. Cycles through # each of the buffered records (in appropriate sorted sequence) and streams # them out to the handler object as SAX events. # sub to_sax { my $self = shift; my $filter = shift; my $keys = $self->sorted_keys(); foreach my $key (@$keys) { if(ref($self->{records}->{$key}) eq 'ARRAY') { foreach my $record (@{$self->{records}->{$key}}) { $record->to_sax($filter); } } else { $self->{records}->{$key}->to_sax($filter); } } } ############################################################################## # Method: sorted_keys() # # Returns a reference to an array of all the sort keys in order. # sub sorted_keys { my $self = shift; my @keys = keys(%{$self->{records}}); my $cmp = $self->{Keys}->[0]->[1]; my $dir = $self->{Keys}->[0]->[2]; # coderef sort comparator if(ref($cmp)) { if($dir eq 'desc') { @keys = sort { $cmp->($b, $a) } @keys; } else { @keys = sort { $cmp->($a, $b) } @keys; } } # numeric comparator elsif($cmp eq 'num') { if($dir eq 'desc') { @keys = sort { $b <=> $a } @keys; } else { @keys = sort { $a <=> $b } @keys; } } # alpha comparator (default) else { if($dir eq 'desc') { @keys = sort { $b cmp $a } @keys; } else { @keys = sort { $a cmp $b } @keys; } } return(\@keys); } 1; __END__ =head1 NAME XML::Filter::Sort::BufferMgr - Implementation class used by XML::Filter::Sort =head1 DESCRIPTION The documentation is targetted at developers wishing to extend or replace this class. For user documentation, see L. Two classes are used to implement buffering records and spooling them back out in sorted order as SAX events. One instance of the B class is used to buffer each record and one or more instances of the B class are used to manage the buffers. =head1 API METHODS The API of this module as used by B consists of the following sequence of method calls: =over 4 =item 1 When the first 'record' in a sequence is encountered, B creates a B object using the C method. =item 2 B calls the buffer manager's C method to get a B object and all SAX events are directed to this object until the end of the record is encountered. The following events are supported by the current buffer implementation: start_element() characters() comment() processing_instruction() end_element() =item 3 When the end of the record is detected, B calls the buffer manager's C method, which in turn calls the buffer's C method. The C method returns a list of values for the sort keys and the buffer manager uses these to store the buffer for later recall. Subsequent records are handled as per step 2. =item 4 When the last record has been buffered, B calls the buffer manager's C method. The buffer manager retrieves each of the buffers in sorted order and calls the buffer's C method. =back Each buffer attempts to match the sort key paths as SAX events are received. Once a value has been found for a given key, that same path match is not attempted against subsequent events. For efficiency, the code to match each key is compiled into a closure. For even more efficiency, this compilation is done once when the B object is created. The C method in the buffer manager class calls the C method in the buffer class to achieve this. =head1 DATA STRUCTURES In the current implementation, the B class simply uses a hash to store the buffer objects. If only one sort key was defined, only a single hash is required. The values in the hash are arrayrefs containing the list of buffers for records with identical keys. If two or more sort keys are defined, the hash values will be B objects which in turn will contain the buffers. The following illustration may clarify the relationship (BM=buffer manager, B=buffer): BM +----------------+---------------+ | | BM BM +-----+--------+ +-----+----------+ | | | | BM BM BM BM +-----+----+ +----+------+ +----+----+ +------+------+ | | | | | | | | | | | | [B,B,B] [B] [B,B] [B] [B,B] [B,B,B] [B] [B,B] [B] [B,B] [B,B,B] [B,B] This layered storage structure is transparent to the B object which instantiates and interacts with only one buffer manager (the one at the top of the tree). =head1 COPYRIGHT Copyright 2002 Grant McLean Egrantm@cpan.orgE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut XML-Filter-Sort-1.01/lib/XML/Filter/Sort/Buffer.pm0000644000175000017500000002633310231532722021650 0ustar grantgrant00000000000000# $Id: Buffer.pm,v 1.2 2005/04/20 20:04:34 grantm Exp $ package XML::Filter::Sort::Buffer; use strict; ############################################################################## # G L O B A L V A R I A B L E S ############################################################################## use vars qw($VERSION @ISA); $VERSION = '0.91'; use constant NODE_TYPE => 0; use constant NODE_DATA => 1; use constant NODE_CONTENT => 2; ############################################################################## # M E T H O D S ############################################################################## ############################################################################## # Contructor: new() # # Prepare to build a tree and match nodes against patterns to extract sort # key values. # sub new { my $class = shift; my $self = { @_, }; bless($self, $class); # Prepare to match sort key nodes $self->{Keys} ||= [ [ '.' ] ]; $self->{_match_subs} ||= [ $self->compile_matches($self->{Keys}) ]; $self->{_key_values} = [ ('') x @{$self->{Keys}} ]; $self->{_path_name} = []; $self->{_path_ns} = []; $self->{_depth} = -1; # Initialise tree building structures $self->{tree} = []; $self->{_lists} = []; $self->{_curr_list} = $self->{tree}; return($self); } ############################################################################## # Class Method: compile_matches() # # Generates a closure to match each of the supplied sort keys. Returns a # list of closures. # sub compile_matches { my $class = shift; my $keys = shift; my @match_subs = (); foreach my $i (0..$#{$keys}) { my $key_num = $i; # local copy for closure my($pattern, $comparison, $direction) = @{$keys->[$key_num]}; my($path, $attr) = split(/\@/, $pattern); my $abs = ($path =~ m{^\.}); $path =~ s{^\.?/*}{}; $path =~ s{/*$}{}; my @name_list = (); my @ns_list = (); foreach (split(/\//, $path)) { my($ns, $name) = m/^(?:\{(.*?)\})?(.*)$/; push @name_list, $name; push @ns_list, $ns; }; my $required_depth = @name_list; my($attr_name, $attr_nsname); if($attr and $attr =~ m/^(\{.*?\})?(.*)$/ ) { $attr_name = $2; if($1) { $attr_nsname = $attr; } } # Closure which matches the path push @match_subs, sub { my $self = shift; if($abs) { return if($self->{_depth} != $required_depth); } else { return if($self->{_depth} < $required_depth); } foreach my $i (1..$required_depth) { return unless($self->{_path_name}->[-$i] eq $name_list[-$i]); if(defined($ns_list[-$i])) { return unless($self->{_path_ns}->[-$i] eq $ns_list[-$i]); } } return $self->save_key_value($key_num, $attr_name, $attr_nsname); }; } return(@match_subs); } ############################################################################## # Method: save_key_value() # # Once a match has been found, the matching closure will call this method to # extract the key value and save it. Returns true to indicate the reference # to the closure can be deleted since there is no need to try and match the # same pattern again. # sub save_key_value { my($self, $key_num, $attr_name, $attr_nsname) = @_; # Locate the element whose end event we're processing (ie: the element # which owns the content list we're about to close) my $node = $self->{_lists}->[-1]->[-1]; # Extract the appropriate value if($attr_name) { my $value = undef; if(defined($attr_nsname)) { if(exists($node->[NODE_DATA]->{Attributes}->{$attr_nsname})) { $value = $node->[NODE_DATA]->{Attributes}->{$attr_nsname}->{Value}; } } else { foreach my $attr (values %{$node->[NODE_DATA]->{Attributes}}) { if($attr->{LocalName} eq $attr_name) { $value = $attr->{Value}; last; } } } return unless(defined($value)); # keep looking for elem with rqd attr $self->{_key_values}->[$key_num] = $value; } else { $self->{_key_values}->[$key_num] = $self->text_content(@{$node->[NODE_CONTENT]}); } return(1); } ############################################################################## # Method: text_content() # # Takes a list of nodes and recursively builds up a string containing the # text content. # sub text_content { my $self = shift; my $text = ''; while(@_) { my $node = shift; if(ref($node)) { if($node->[NODE_TYPE] eq 'e') { if(@{$node->[NODE_CONTENT]}) { $text .= $self->text_content(@{$node->[NODE_CONTENT]}) } } } else { $text .= $node; } } return($text); } ############################################################################## # Method: close() # # Called by the buffer manager to signify that the record is complete. # sub close { my $self = shift; my @key_values = @{$self->{_key_values}}; foreach my $key (grep(/^_/, keys(%$self))) { delete($self->{$key}); } return(@key_values); } ############################################################################## # Method: to_sax() # # Takes a reference to the parent XML::Filter::Sort object and a list of node # structures. Passes each node to the handler as SAX events, recursing into # nodes as required. On initial call, node list will default to top of stored # tree. # sub to_sax { my $self = shift; my $filter = shift; @_ = @{$self->{tree}} unless(@_); while(@_) { my $node = shift; if(ref($node)) { if($node->[NODE_TYPE] eq 'e') { $filter->start_element($node->[NODE_DATA]); if(@{$node->[NODE_CONTENT]}) { $self->to_sax($filter, @{$node->[NODE_CONTENT]}) } $filter->end_element($node->[NODE_DATA]); } elsif($node->[NODE_TYPE] eq 'p') { $filter->processing_instruction($node->[NODE_DATA]); } elsif($node->[NODE_TYPE] eq 'c') { $filter->comment($node->[NODE_DATA]); } else { die "Unhandled node type: '" . $node->[NODE_TYPE] . "'"; } } else { $filter->characters( { Data => $node } ); } } } ############################################################################## # SAX handlers to build buffered event tree ############################################################################## sub start_element { my($self, $elem) = @_; $self->{_depth}++; if($self->{_depth} > 0) { push @{$self->{_path_name}}, $elem->{LocalName}; push @{$self->{_path_ns}}, (defined($elem->{NamespaceURI}) ? $elem->{NamespaceURI} : ''); } my $new_list = []; my $new_node = [ 'e', { %$elem }, $new_list ]; push @{$self->{_curr_list}}, $new_node; push @{$self->{_lists}}, $self->{_curr_list}; $self->{_curr_list} = $new_list; } sub characters { my($self, $char) = @_; push @{$self->{_curr_list}}, $char->{Data}; } sub comment { my($self, $comment) = @_; push @{$self->{_curr_list}}, [ 'c', { %{$comment} } ]; } sub processing_instruction { my($self, $pi) = @_; push @{$self->{_curr_list}}, [ 'p', { %{$pi} } ]; } sub end_element { my $self = shift; # Check for matches against sort key patterns my $i = 0; while(exists($self->{_match_subs}->[$i])) { if($self->{_match_subs}->[$i]->($self)) { splice(@{$self->{_match_subs}}, $i, 1); # Delete the match sub } else { $i++; } } $self->{_depth}--; pop @{$self->{_path_name}}; pop @{$self->{_path_ns}}; $self->{_curr_list} = pop @{$self->{_lists}}; } 1; __END__ =head1 NAME XML::Filter::Sort::Buffer - Implementation class used by XML::Filter::Sort =head1 DESCRIPTION The documentation is targetted at developers wishing to extend or replace this class. For user documentation, see L. For an overview of the classes and methods used for buffering, see L. =head1 BUFFER LIFE CYCLE A B object is created by a B object using the C method. The B object will then propagate any SAX events it receives, to the buffer object until the end of the record is reached. As each element is added to the buffer, its contents are compared to the sort key paths and the sort key values are extracted. When the end of the record is reached, the C method is called. The return value from this method is the list of sort keys. The buffer manager will store the buffer until the end of the record sequence is reached. Then it will retrieve each buffer in order of the sort key values and call the buffer's C method to send all buffered events to the downstream handler. Following the call to C, the buffer is discarded. No destructor method is used - everything is handled by Perl's garbage collector. =head1 DATA STRUCTURES The buffer contains a 'tree' of SAX events. The tree is simply an array of 'nodes'. Text nodes are represented as scalars. Other nodes are represented as arrayrefs. The first element of a node array is a single character identifying the node type: e - element c - comment p - processing instruction The second element is the node data (the hash from the original SAX event). The child nodes of an element node are represented by the third element as an arrayref. For example, this XML: smith Would be buffered as this data structure: [ [ 'e', { 'Name' => 'person' 'Prefix' => '', 'LocalName' => 'person', 'NamespaceURI' => '', 'Attributes' => { '{}age' => { 'LocalName' => 'age', 'NamespaceURI' => '', 'Value' => '27', 'Prefix' => '', 'Name' => 'age' } }, }, [ "\n ", [ 'e', { 'Name' => 'lastname' 'Prefix' => '', 'LocalName' => 'lastname', 'NamespaceURI' => '', 'Attributes' => {}, }, [ 'smith' ] ], "\n ", ] ] ] =head1 COPYRIGHT Copyright 2002 Grant McLean Egrantm@cpan.orgE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut ############################################################################## # Method: to_sax() # # The following version of to_sax() uses an iterative design rather than the # conceptually simpler recursive implementation. Strangely (and unfortunately) # it's about 20% slower than the recursive version - anyone know why? # sub to_sax { my $self = shift; my $filter = shift; my @lists = $self->{tree}; my($node); while(@lists) { if(@{$lists[-1]}) { $node = $lists[-1]->[0]; if(ref($node)) { if($node->[NODE_TYPE] eq 'e') { $filter->start_element($node->[NODE_DATA]); push @lists, pop(@$node); } elsif($node->[NODE_TYPE] eq 'c') { $filter->comment($node->[NODE_DATA]); shift(@{$lists[-1]}); } elsif($node->[NODE_TYPE] eq 'p') { $filter->processing_instruction($node->[NODE_DATA]); shift(@{$lists[-1]}); } else { die "Unexpected node type: '$node->[NODE_TYPE]'"; } } else { $filter->characters({ Data => $node }); shift(@{$lists[-1]}); } } else { pop @lists; # discard empty content list if(@lists) { $node = shift(@{$lists[-1]}); $filter->end_element($node->[NODE_DATA]); } } } return; } XML-Filter-Sort-1.01/lib/XML/Filter/Sort.pm0000644000175000017500000005616310231533150020437 0ustar grantgrant00000000000000package XML::Filter::Sort; use strict; use Carp; require XML::SAX::Base; ############################################################################## # G L O B A L V A R I A B L E S ############################################################################## use vars qw($VERSION @ISA); $VERSION = '1.01'; @ISA = qw(XML::SAX::Base); use constant DEFAULT_BUFFER_MANAGER_CLASS => 'XML::Filter::Sort::BufferMgr'; use constant DISK_BUFFER_MANAGER_CLASS => 'XML::Filter::Sort::DiskBufferMgr'; ############################################################################## # M E T H O D S ############################################################################## ############################################################################## # Contructor: new() # # Set defaults for required properties and parse 'Keys' value from scalar to # a list of lists if required. # sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self = $class->SUPER::new(@_); croak "You must set the 'Record' option" unless($self->{Record}); # Select memory vs disk buffering (or custom buffering class) if($self->{TempDir}) { $self->{BufferManagerClass} ||= DISK_BUFFER_MANAGER_CLASS; } unless($self->{BufferManagerClass}) { $self->{BufferManagerClass} = DEFAULT_BUFFER_MANAGER_CLASS; } my $mod_path = join('/', split(/::/, $self->{BufferManagerClass} . '.pm')); require $mod_path; # Organise sort keys into a list of 3-element lists $self->{Keys} = '.' unless($self->{Keys}); unless(ref($self->{Keys})) { # parse scalar to a list of lists my @keys = (); foreach (split(/[\r\n;]/, $self->{Keys})) { next unless(/\S/); s/,/ /g; my @key = /(\S+)/g; push @keys, \@key; } $self->{Keys} = \@keys; } foreach my $key (@{$self->{Keys}}) { croak "Keys must be a list of lists" unless(ref($key)); $key->[1] ||= 'alpha'; unless(ref($key->[1])) { $key->[1] = ($key->[1] =~ /^n/i ? 'num' : 'alpha'); } $key->[2] = ($key->[2] && $key->[2] =~ /^d/i ? 'desc' : 'asc'); } # Precompile a closure to match each key if($self->{BufferManagerClass}->can('compile_matches')) { $self->{_match_subs} = [ $self->{BufferManagerClass}->compile_matches($self->{Keys}) ]; } # Build up a list of options to be passed to buffers/buffer managers if($self->{MaxMem}) { if(uc($self->{MaxMem}) =~ /^\s*(\d+)(K|M)?$/) { $self->{MaxMem} = $1; $self->{MaxMem} *= 1024 if($2 and $2 eq 'K'); $self->{MaxMem} *= 1024 * 1024 if($2 and $2 eq 'M'); } else { croak "Illegal value for 'MaxMem': $self->{MaxMem}"; } } $self->{BufferOpts} = { Keys => [ @{$self->{Keys}} ], _match_subs => $self->{_match_subs}, IgnoreCase => $self->{IgnoreCase}, NormaliseKeySpace => $self->{NormaliseKeySpace} || $self->{NormalizeKeySpace}, KeyFilterSub => $self->{KeyFilterSub}, TempDir => $self->{TempDir}, MaxMem => $self->{MaxMem}, }; return(bless($self,$class)); } ############################################################################## # Method: start_document() # # Initialise handler structures and propagate event. # sub start_document { my $self = shift; # Track path to current element $self->{path_name} = []; $self->{path_ns} = []; $self->{prefixes} = []; $self->{depth} = 0; # Initialise pattern matching for record elements my @parts = split(/\//, $self->{Record}); if($parts[0] eq '') { $self->{abs_match} = 1; shift @parts; } else { $self->{abs_match} = 0; } $self->{rec_path_name} = [ ]; $self->{rec_path_ns} = [ ]; foreach (@parts) { if(/^(?:\{(.*?)\})?(.*)$/) { push @{$self->{rec_path_name}}, $2; push @{$self->{rec_path_ns}}, $1; } } $self->{required_depth} = @parts; $self->SUPER::start_document(@_); } ############################################################################## # Method: start_element() # # Marshalls events either to the default handler or to a record buffer. # Also handles the creation of buffers as record elements are encountered. # Two extra considerations increase complexity: contiguous character events # are being merged; and each 'record' element takes it's leading whitespace # with it. # sub start_element { my $self = shift; my $element = shift; return $self->start_prefixed_element($element) if($self->{passthru}); # Add this element's details to the end of the list (for recognising records) push @{$self->{path_name}}, $element->{LocalName}; push @{$self->{path_ns}}, (defined($element->{NamespaceURI}) ? $element->{NamespaceURI} : ''); $self->{depth}++; # Do we have a record buffer open? if($self->{buffer}) { $self->{record_depth}++; $self->send_characters(); $self->{buffer}->start_element($element); return; } # Any leading (non-whitespace) text? if($self->{buffered_text}) { $self->flush_buffers(); $self->send_characters(); } # Is this a record? if($self->match_record()) { $self->{record_depth} = 1; unless($self->{buffer_manager}) { $self->{buffer_manager} = $self->{BufferManagerClass}->new( %{$self->{BufferOpts}} ); } $self->{buffer} = $self->{buffer_manager}->new_buffer(); $self->send_characters(); $self->{buffer}->start_element($element); return; } # Send buffered data and this event to the downstream handler $self->flush_buffers(); $self->send_characters(); $self->start_prefixed_element($element); } ############################################################################## # Method: end_element() # # Marshalls events either to the default handler or to a record buffer. # Also handles closing the current buffer object as the end of a record is # encountered. # sub end_element { my $self = shift; my $element = shift; return $self->end_prefixed_element($element) if($self->{passthru}); pop @{$self->{path_name}}; pop @{$self->{path_ns}}; $self->{depth}--; # Do we have a record buffer open? if($self->{buffer}) { $self->send_characters(); $self->{buffer}->end_element($element); $self->{record_depth}--; if($self->{record_depth} == 0) { $self->{buffer_manager}->close_buffer($self->{buffer}); delete($self->{buffer}); } return; } # No, then do we have any complete buffered records? $self->flush_buffers(); $self->send_characters(); $self->end_prefixed_element($element); } ############################################################################## # Method: characters() # # Buffer character events for two reasons: # - to merge contiguous character data (simplifies pattern matching logic) # - to enable 'record' elements to take their leading whitespace with them # sub characters { my $self = shift; my $char = shift; return $self->SUPER::characters($char) if($self->{passthru}); unless(exists($self->{char_buffer})) { $self->{char_buffer} = ''; $self->{buffered_text} = 0; } $self->{char_buffer} .= $char->{Data}; $self->{buffered_text} |= ($char->{Data} =~ /\S/); } ############################################################################## # Method: ignorable_whitespace() # # Discard ignorable whitespace if required, otherwise send it on as # character events. # # Yes, this is a dirty hack, but it's getting late and I haven't got a # parser that generates them anyway. # sub ignorable_whitespace { my $self = shift; my $char = shift; $self->characters($char) unless($self->{SkipIgnorableWS}); } ############################################################################## # Method: start_prefix_mapping() # Method: end_prefix_mapping() # # Suppress these events as they need to remain synchronised with the # start/end_element events (which may be re-ordered). Replacement events are # generated by start/end_prefixed_element(). # sub start_prefix_mapping { } sub end_prefix_mapping { } ############################################################################## # Method: start_prefixed_element() # # Sends a start_element() event to the downstream handler, but re-generates # start_prefix_mapping() events first. # sub start_prefixed_element { my $self = shift; my $elem = shift; my @prefixes; foreach my $attr (values %{$elem->{Attributes}}) { if($attr->{Name} and $attr->{Name} eq 'xmlns') { unshift @prefixes, '', $attr->{Value}; } elsif($attr->{Prefix} and $attr->{Prefix} eq 'xmlns') { push @prefixes, $attr->{LocalName}, $attr->{Value}; } } if(@prefixes) { push @{$self->{prefixes}}, [ @prefixes ]; while(@prefixes) { my $prefix = shift @prefixes; my $uri = shift @prefixes; $self->SUPER::start_prefix_mapping({ Prefix => $prefix, NamespaceURI => $uri, }); } } else { push @{$self->{prefixes}}, undef; } $self->SUPER::start_element($elem); } ############################################################################## # Method: end_prefixed_element() # # Sends an end_element() event to the downstream handler, and follows it with # re-generated end_prefix_mapping() events. # sub end_prefixed_element { my $self = shift; my $elem = shift; $self->SUPER::end_element($elem); my $prefixes = pop @{$self->{prefixes}}; if($prefixes) { while(@$prefixes) { my $prefix = shift @$prefixes; my $uri = shift @$prefixes; $self->SUPER::end_prefix_mapping({ Prefix => $prefix, NamespaceURI => $uri, }); } } } ############################################################################## # Method: comment() # # Send comments to buffer if we have one open, otherwise flush any buffered # records before propagating event. # sub comment { my $self = shift; my $comment = shift; return $self->SUPER::comment($comment) if($self->{passthru}); if($self->{buffer}) { $self->send_characters(); $self->{buffer}->comment($comment); return; } $self->flush_buffers(); $self->send_characters(); $self->SUPER::comment($comment); } ############################################################################## # Method: processing_instruction() # # Send PIs to downstream handler but flush buffered records & text first. # sub processing_instruction { my $self = shift; my $pi = shift; return $self->SUPER::processing_instruction($pi) if($self->{passthru}); if($self->{buffer}) { $self->send_characters(); $self->{buffer}->processing_instruction($pi); return; } $self->flush_buffers(); $self->send_characters(); $self->SUPER::processing_instruction($pi); } ############################################################################## # Method: send_characters() # # Contiguous character events are concatenated into a buffer. This routine # sends the buffer contents to the open buffer if there is one, or the # downstream handler otherwise. # sub send_characters { my $self = shift; return unless(exists $self->{char_buffer}); if($self->{buffer}) { $self->{buffer}->characters({Data => $self->{char_buffer}}); } else { $self->SUPER::characters({Data => $self->{char_buffer}}); } delete($self->{char_buffer}); delete($self->{buffered_text}); } ############################################################################## # Method: flush_buffers() # # If there are any records buffered, sends them to the downstream handler. # sub flush_buffers { my $self = shift; if($self->{buffer_manager}) { $self->{passthru} = 1; $self->{buffer_manager}->to_sax($self); $self->{passthru} = 0; delete($self->{buffer_manager}); } } ############################################################################## # Method: match_record() # # Returns true if the path to the current element matches the 'Record' option # passed to the constructor. # sub match_record { my $self = shift; if($self->{abs_match}) { return if($self->{depth} != $self->{required_depth}); } else { return if($self->{depth} < $self->{required_depth}); } foreach my $i (1..$self->{required_depth}) { return unless($self->{path_name}->[-$i] eq $self->{rec_path_name}->[-$i]); if(defined($self->{rec_path_ns}->[-$i])) { return unless($self->{path_ns}->[-$i] eq $self->{rec_path_ns}->[-$i]); } } return(1); } 1; __END__ =head1 NAME XML::Filter::Sort - SAX filter for sorting elements in XML =head1 SYNOPSIS use XML::Filter::Sort; use XML::SAX::Machines qw( :all ); my $sorter = XML::Filter::Sort->new( Record => 'person', Keys => [ [ 'lastname', 'alpha', 'asc' ], [ 'firstname', 'alpha', 'asc' ], [ '@age', 'num', 'desc'] ], ); my $filter = Pipeline( $sorter => \*STDOUT ); $filter->parse_file(\*STDIN); Or from the command line: xmlsort =head1 DESCRIPTION This module is a SAX filter for sorting 'records' in XML documents (including documents larger than available memory). The C utility which is included with this distribution can be used to sort an XML file from the command line without writing Perl code (see C). =head1 EXAMPLES These examples assume that you will create an XML::Filter::Sort object and use it in a SAX::Machines pipeline (as in the synopsis above). Of course you could use the object directly by hooking up to a SAX generator and a SAX handler but such details are omitted from the sample code. When you create an XML::Filter::Sort object (with the C method), you must use the 'Record' option to identify which elements you want sorted. The simplest way to do this is to simply use the element name, eg: my $sorter = XML::Filter::Sort->new( Record => 'colour' ); Which could be used to transform this XML: red green blue to this: blue green red You can define a more specific path to the record by adding a prefix of element names separated by forward slashes, eg: my $sorter = XML::Filter::Sort->new( Record => 'hair/colour' ); which would only sort elements contained directly within a element (and would therefore leave our sample document above unchanged). A path which starts with a slash is an 'absolute' path and must specify all intervening elements from the root element to the record elements. A record element may contain other elements. The order of the record elements may be changed by the sorting process but the order of any child elements within them will not. The default sort uses the full text of each 'record' element and uses an alphabetic comparison. You can use the 'Keys' option to specify a list of elements within each record whose text content should be used as sort keys. You can also use this option to specify whether the keys should be compared alphabetically or numerically and whether the resulting order should be ascending or descending, eg: my $sorter = XML::Filter::Sort->new( Record => 'person', Keys => [ [ 'lastname', 'alpha', 'asc' ], [ 'firstname', 'alpha', 'asc' ], [ '@age', 'alpha', 'desc' ], ] ); Given this record ... Aardvark Zebedee The above code would use 'Zebedee' as the first (primary) sort key, 'Aardvark' as the second sort key and the number 35 as the third sort key. In this case, records with the same first and last name would be sorted from oldest to youngest. As with the 'record' path, it is possible to specify a path to the sort key elements (or attributes). To make a path relative to the record element itself, use './' at the start of the path. =head1 OPTIONS =over 4 =item Record => 'path string' A simple path string defining which elements should be treated as 'records' to be sorted (see L<"PATH SYNTAX">). Elements which do not match this path will not be altered by the filter. Elements which do match this path will be re-ordered depending on their contents and the value of the Keys option. When a record element is re-ordered, it takes its leading whitespace with it. Only lists of contiguous record elements will be sorted. A list of records which has a 'foreign body' (a non-record element, non-whitespace text, a comment or a processing instruction) between two elements will be treated as two separate lists and each will be sorted in isolation of the other. =item Keys => [ [ 'path string', comparator, order ], ... ] =item Keys => 'delimited string' This option specifies which parts of the records should be used as sort keys. The first form uses a list-of-lists syntax. Each key is defined using a list of three elements: =over 4 =item 1 The 'path string' defines the path to an element or an attribute whose text contents should be used as the value of the sort key (see L<"PATH SYNTAX">). =item 2 The 'comparator' defines how these values should be compared. This can be the string 'alpha' for alphabetic, the string 'num' for numeric or a reference to a subroutine taking two parameters and returning -1, 0 or 1 (similar to the standard Perl sort function but without the $a, $b magic). This item is optional and defaults to 'alpha'. =item 3 The 'order' should be 'asc' for ascending or 'desc' for descending and if omitted, defaults to 'asc'. =back You may prefer to define the Keys using a delimited string rather than a list of lists. Keys in the string should be separated by either newlines or semicolons and the components of a key should be separated by whitespace or commas. It is not possible to define a subroutine reference comparator using the string syntax. =item IgnoreCase => 1 Enabling this option will make sort comparisions case-insensitive (rather than the default case-sensitive). =item NormaliseKeySpace => 1 The sort key values for each record will be the text content of the child elements specified using the Keys option (above). If you enable this option, leading and trailing whitespace will be stripped from the keys and each internal run of spaces will be collapsed to a single space. The default value for this option is off for efficiency. Note: The contents of the record are not affected by this setting - merely the copy of the data that is used in the sort comparisons. =item KeyFilterSub => coderef You can also supply your own custom 'fix-ups' by passing this option a reference to a subroutine. The subroutine will be called once for each record and will be passed a list of the key values for the record. The routine must return the same number of elements each time it is called, but this may be less than the number of values passed to it. You might use this option to combine multiple key values into one (eg: using sprintf). Note: You can enable both the NormaliseKeySpace and the KeyFilterSub options - space normalisation will occur first. =item TempDir => 'directory path' This option serves two purposes: it enables disk buffering rather than the default memory buffering and it allows you to specify where on disk the data should be buffered. Disk buffering will be slower than memory buffering, so don't ask for it if you don't need it. For more details, see L<"IMPLEMENTATION">. Note: It is safe to specify the same temporary directory path for multiple instances since each will create a uniquely named subdirectory (and clean it up afterwards). =item MaxMem => bytes The disk buffering mode actually sorts chunks of records in memory before saving them to disk. The default chunk size is 10 megabytes. You can use this option to specify an alternative chunk size (in bytes) which is more attuned to your available resources (more is better). A suffix of 'K' or 'M' is recognised as kilobytes or megabytes respectively. If you have not enabled disk buffering (using 'TempDir'), the MaxMem option has no effect. Attempting to sort a large document using only memory buffering may result in Perl dying with an 'out of memory' error. =item SkipIgnorableWS If your SAX parser can do validation and generates ignorable_whitespace() events, you can enable this option to discard these events. If you leave this option at it's default value (implying you want the whitespace), the events will be translated to characters() events. =back =head1 PATH SYNTAX A simple element path syntax is used in two places: =over 4 =item 1 with the 'Record' option to define which elements should be sorted =item 2 with the 'Keys' option to define which parts of each record should be used as sort keys. =back In each case you can use a just an element name, or a list of element names separated by forward slashes. eg: Record => 'ul/li', Keys => 'name' If a 'Record' path begins with a '/' then it will be anchored at the document root. If a 'Keys' path begins with './' then it is anchored at the current record element. Unanchored paths can match at any level. A 'Keys' path can include an attribute name prefixed with an '@' symbol, eg: Keys => './@href' Each element or attribute name can include a namespace URI prefix in curly braces, eg: Record => '{http://www.w3.org/1999/xhtml}li' If you do not include a namespace prefix, all elements with the specified name will be matched, regardless of any namespace URI association they might have. If you include an empty namespace prefix (eg: C<'{}li'>) then only records which do not have a namespace association will be matched. =head1 IMPLEMENTATION In order to arrange records into sorted order, this module uses buffering. It does not need to buffer the whole document, but for any sequence of records within a document, all records must be buffered. Unless you specify otherwise, the records will be buffered in memory. The memory requirements are similar to DOM implementations - 10 to 50 times the character count of the source XML. If your documents are so large that you would not process them with a DOM parser then you should enable disk buffering. If you enable disk buffering, sequences of records will be assembled into 'chunks' of approximately 10 megabytes (this value is configurable). Each chunk will be sorted and saved to disk. At the end of the record sequence, all the sorted chunks will be merged and written out as SAX events. The memory buffering mode represents each record an a B object and uses B objects to manage the buffers. For details of the internals, see L. The disk buffering mode represents each record an a B object and uses B objects to manage the buffers. For details of the internals, see L. =head1 BUGS ignorable_whitespace() events shouldn't be translated to normal characters() events - perhaps in a later release they won't be. =head1 SEE ALSO B requires L and plays nicely with L. =head1 COPYRIGHT Copyright 2002-2005 Grant McLean Egrantm@cpan.orgE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut XML-Filter-Sort-1.01/bin/0000755000175000017500000000000010231533634015263 5ustar grantgrant00000000000000XML-Filter-Sort-1.01/bin/xmlsort0000644000175000017500000001067407502452254016733 0ustar grantgrant00000000000000#!/usr/bin/perl -w ############################################################################## # $Id: xmlsort,v 1.1.1.1 2002/06/14 20:40:12 grantm Exp $ # # Title: xmlsort # # Author: Grant McLean # # Script for sorting 'records' in XML files. Use -h option for help. # use strict; use Getopt::Long; use Pod::Usage; use IO::File; use XML::SAX::ParserFactory; use XML::SAX::Writer; use XML::Filter::Sort; ############################################################################## # Handle command line parameters # my %opt = (); GetOptions(\%opt, 'r=s', 'k=s', 'i', 's', 't=s', 'm=s', 'h') || pod2usage(0); pod2usage({-verbose => 2, -exitval => 0}) if($opt{h}); pod2usage(0) unless($opt{r}); my $filename = shift || '-'; ############################################################################## # Build up the list of options for constructing the sort filter object. # my %sort_opts = ( Record => $opt{r} ); $sort_opts{Keys} = $opt{k} if($opt{k}); $sort_opts{IgnoreCase} = 1 if($opt{i}); $sort_opts{NormaliseKeySpace} = 1 if($opt{s}); $sort_opts{TempDir} = $opt{t} if($opt{t}); $sort_opts{MaxMem} = $opt{m} if($opt{m}); ############################################################################## # Create a filter pipeline and 'run' it # my $writer = XML::SAX::Writer->new( Output => \*STDOUT ); my $sorter = XML::Filter::Sort->new( %sort_opts, Handler => $writer ); my $parser = XML::SAX::ParserFactory->parser(Handler => $sorter); my $fd = IO::File->new("<$filename") || die "$!"; $parser->parse_file($fd); print "\n"; exit; __END__ =head1 NAME xmlsort - sorts 'records' in XML files =head1 SYNOPSIS xmlsort -r= [ ] [ ] Options: -r name of the elements to be sorted -k child nodes to be used as sort keys -i ignore case when sorting -s normalise whitespace when comparing sort keys -t buffer records to named directory rather than in memory -m set memory chunk size for disk buffering -h help - display the full documentation Example: xmlsort -r 'person' -k 'lastname;firstname' -i -s in.xml >out.xml =head1 DESCRIPTION This script takes an XML document either on STDIN or from a named file and writes a sorted version of the file to STDOUT. The C<-r> option should be used to identify 'records' in the document - the bits you want sorted. Elements before and after the records will be unaffected by the sort. =head1 OPTIONS Here is a brief summary of the command line options (and the XML::Filter::Sort options which they correspond to). For more details see L. =over 4 =item -r (Record) The name of the elements to be sorted. This can be a simple element name like C<'person'> or a pathname like C<'employees/person'> (only person elements contained directly within an employees element). =item -k (Keys) Semicolon separated list of elements (or attributes) within a record which should be used as sort keys. Each key can optionally be followed by 'alpha' or 'num' to indicate alphanumeric of numeric sorting and 'asc' or 'desc' for ascending or descending order (eg: -k 'lastname;firstname;age,n,d'). =item -i (IgnoreCase) This option makes sort comparisons case insensitive. =item -s (NormaliseKeySpace) By default all whitespace in the sort key elements is considered significant. Specifying -s will case leading and trailing whitespace to be stripped and internal whitespace runs to be collapsed to a single space. =item -t (TempDir) When sorting large documents, it may be prudent to use disk buffering rather than memory buffering. This option allows you to specify where temporary files should be written. =item -m (MaxMem) If you use the -t option to enable disk buffering, records will be collected in memory in 'chunks' of up to about 10 megabytes before being sorted and spooled to temporary files. This option allows you to specify a larger chunk size. A suffix of K or M indicates kilobytes or megabytes respectively. =back =head1 SEE ALSO This script uses the following modules: XML::SAX::ParserFactory XML::Filter::Sort XML::SAX::Writer =head1 AUTHOR Grant McLean =head1 COPYRIGHT Copyright (c) 2002 Grant McLean. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut XML-Filter-Sort-1.01/t/0000755000175000017500000000000010231533634014756 5ustar grantgrant00000000000000XML-Filter-Sort-1.01/t/6_diskworkout.t0000644000175000017500000000064207502452224017761 0ustar grantgrant00000000000000# $Id: 6_diskworkout.t,v 1.1.1.1 2002/06/14 20:39:48 grantm Exp $ use strict; use File::Spec; # Enable disk buffering via a global flag and then 'do' test script 4 again my $workout_script = $0; $workout_script =~ s{6_diskworkout}{4_workout}; my $temp_dir = File::Spec->catfile('t', 'temp'); @main::TempOpts = (TempDir => $temp_dir); push @main::TempOpts, MaxMem => 200; do $workout_script; if($@) { die $@; } XML-Filter-Sort-1.01/t/5_diskbuffer.t0000644000175000017500000001667107502452235017532 0ustar grantgrant00000000000000# $Id: 5_diskbuffer.t,v 1.1.1.1 2002/06/14 20:39:57 grantm Exp $ use strict; use Test::More; use File::Spec; BEGIN { # Seems to be required by older Perls unless(eval { require Storable }) { plan skip_all => 'Storable not installed'; } unless(eval { require XML::SAX::Writer }) { plan skip_all => 'XML::SAX::Writer not installed'; } unless(eval { require XML::SAX::ParserFactory }) { plan skip_all => 'XML::SAX::ParserFactory not installed'; } unless(eval { require XML::SAX::Machines }) { plan skip_all => 'XML::SAX::Machines not installed'; } } plan tests => 25; use XML::Filter::Sort; use XML::SAX::ParserFactory; use XML::SAX::Machines qw( :all ); $^W = 1; my($xmlin, $xmlout, $sorter); ############################################################################## # Confirm that the modules compile OK # use XML::Filter::Sort::DiskBuffer; ok(1, 'XML::Filter::Sort::DiskBuffer compiled OK'); use XML::Filter::Sort::DiskBufferMgr; ok(1, 'XML::Filter::Sort::DiskBufferMgr compiled OK'); ############################################################################## # Try freezing a buffer # $xmlin = q( Zebedee Boozle ); my $buffer = XML::Filter::Sort::DiskBuffer->new( Keys => [ [ 'lastname' ], [ 'firstname' ], [ '@age' ] ] ); is(ref($buffer), 'XML::Filter::Sort::DiskBuffer', 'Successfully created a XML::Filter::Sort::DiskBuffer object'); my $parser = XML::SAX::ParserFactory->parser(Handler => $buffer); $buffer->characters({ Data => "\n " }); $parser->parse_string($xmlin); my @keys = $buffer->close(); my $expected_keys = [ qw(Boozle Zebedee 37) ]; is_deeply(\@keys, $expected_keys, 'Inherited keys functionality OK'); my $icicle = $buffer->freeze(undef, @keys); my $data = Storable::thaw($icicle); is_deeply($data->[0], $expected_keys, 'Frozen keys manually thawed out OK'); ############################################################################## # Now try thawing it out # my $new_buffer = XML::Filter::Sort::DiskBuffer->thaw($icicle); is(ref($new_buffer), 'XML::Filter::Sort::DiskBuffer', 'Disk buffer thaw() constructor OK'); isnt($new_buffer, $buffer, 'New buffer is deep copy'); @keys = $new_buffer->key_values(); is_deeply(\@keys, $expected_keys, 'Key values successfully retrieved'); $xmlout = ''; my $writer = XML::SAX::Writer->new(Output => \$xmlout); $writer->start_document(); $new_buffer->to_sax($writer); $writer->end_document(); fix_xml($xmlout); is($xmlout, $xmlin, 'Original XML reconstructed successfully from thawed buffer'); ############################################################################## # Try re-freezing the thawed buffer and then try thawing it out # $icicle = $new_buffer->freeze(); my $newer_buffer = XML::Filter::Sort::DiskBuffer->thaw($icicle); is(ref($newer_buffer), 'XML::Filter::Sort::DiskBuffer', 'Re-thawed re-frozen buffer re-constructed OK'); isnt($newer_buffer, $new_buffer, 'New buffer is deep copy'); @keys = $newer_buffer->key_values(); is_deeply(\@keys, $expected_keys, 'Key values successfully retrieved'); $xmlout = ''; $writer = XML::SAX::Writer->new(Output => \$xmlout); $writer->start_document(); $newer_buffer->to_sax($writer); $writer->end_document(); fix_xml($xmlout); is($xmlout, $xmlin, 'Original XML reconstructed successfully from re-thawed buffer'); ############################################################################## # Now try creating a disk buffer manager object - confirm it fails if no # temp directory is specified # my %opts = ( Keys => [ ['firstname', 'alpha', 'asc'] ], ); my $buffer_mgr = eval { XML::Filter::Sort::DiskBufferMgr->new(%opts); }; ok($@, 'Failed to create XML::Filter::Sort::DiskBufferMgr object...'); ok($@ =~ /You must set the 'TempDir' option/i, '... as expected'); ############################################################################## # Create temp directory then try again # my $temp_dir = File::Spec->catfile('t', 'temp'); unless(-d $temp_dir) { mkdir($temp_dir, 0777); } ok(-d $temp_dir, 'Temporary directory exists'); $opts{TempDir} = $temp_dir; $buffer_mgr = XML::Filter::Sort::DiskBufferMgr->new(%opts); is(ref($buffer_mgr), 'XML::Filter::Sort::DiskBufferMgr', 'Successfully created a XML::Filter::Sort::DiskBufferMgr object'); ############################################################################## # Try creating a slave buffer manager # my $slave = eval { $buffer_mgr->new() }; ok(!$@, 'Successfully created a slave buffer manager'); is(ref($slave), 'XML::Filter::Sort::DiskBufferMgr', 'Slave is a XML::Filter::Sort::DiskBufferMgr too'); $slave = undef; # discard it ############################################################################## # Now feed some data into the disk buffer manager and confirm it gets # written to disk. # my @rec = ( q( Zebedee Boozle ), q( Yorick Cabbage ), q( Yorick Cabbage ), q( Xavier Aardvark ), ); store_records($buffer_mgr, @rec); my $byte_count = $buffer_mgr->{buffered_bytes}; $buffer_mgr->save_to_disk(); my $buffer_dir = $buffer_mgr->{_temp_dir}; ok(-d $buffer_mgr->{_temp_dir}, "Temp directory was created ($buffer_dir)"); my $temp_file = File::Spec->catfile($buffer_dir, '0'); ok(-f $temp_file, "Temp file was created ($temp_file)"); my $file_size = (-s $temp_file); is(4 * @rec + $byte_count, $file_size, "Disk file size is plausible ($file_size)"); ############################################################################## # Generate SAX events from disk buffer and confirm output. my $elem = { Name => 'list', LocalName => 'list', Prefix => '', NamespaceURI => '', Attributes => {}, }; $xmlout = ''; $writer = XML::SAX::Writer->new(Output => \$xmlout); $writer->start_document(); $writer->start_element($elem); $buffer_mgr->to_sax($writer); $writer->end_element($elem); $writer->end_document(); fix_xml($xmlout); is($xmlout, "$rec[3]$rec[1]$rec[2]$rec[0]", 'XML from disk buffer looks good'); ok(!-f $temp_file, 'Temp file was deleted'); $buffer_mgr = undef; # destroy buffer manager object ok(!-d $buffer_dir, 'Temp directory was deleted'); exit; ############################################################################## # S U B R O U T I N E S ############################################################################## ############################################################################## # Sometimes the output from the SAX pipeline may not be exactly what we're # expecting - for benign reasons. This routine strips the initial XML # declaration which gets added by LibXML but not by other parsers. It also # changes attribute double quotes to single. # sub fix_xml { $_[0] =~ s{^<\?xml\s.*?\?>\s*}{}s; $_[0] =~ s{(\w+)="([^>]*?)"}{$1='$2'}sg; } ############################################################################## # Takes a buffer and a list of well formed XML 'records'. Takes each record, # parses it to a buffer and stores it. # sub store_records{ my $buffer_mgr = shift; foreach my $rec (@_) { my $buffer = $buffer_mgr->new_buffer(); XML::SAX::ParserFactory->parser(Handler => $buffer)->parse_string($rec); $buffer_mgr->close_buffer($buffer); } } XML-Filter-Sort-1.01/t/3_sort.t0000644000175000017500000000671607502452231016366 0ustar grantgrant00000000000000# $Id: 3_sort.t,v 1.1.1.1 2002/06/14 20:39:53 grantm Exp $ use strict; use Test::More; BEGIN { # Seems to be required by older Perls unless(eval { require XML::SAX::Writer }) { plan skip_all => 'XML::SAX::Writer not installed'; } unless(eval { require XML::SAX::ParserFactory }) { plan skip_all => 'XML::SAX::ParserFactory not installed'; } } plan tests => 9; $^W = 1; ############################################################################## # Confirm that the module compiles # use XML::Filter::Sort; ok(1, 'XML::Filter::Sort compiled OK'); ############################################################################## # Try creating a Sort object and test that it fails when 'Record' option is # omitted. # my($sorter); eval { $sorter = XML::Filter::Sort->new() }; like($@, qr{You must set the 'Record' option}, "Can't omit 'Record' option"); ############################################################################## # Try again, this time supplying required 'Record' option as well as a handler # object. Confirm that object was created and default value for 'Keys' was # used. # my $xml = ''; my $writer = XML::SAX::Writer->new(Output => \$xml); $sorter = XML::Filter::Sort->new(Record => 'rec', Handler => $writer); ok(ref($sorter), 'Created a sort filter object'); isa_ok($sorter, 'XML::Filter::Sort'); is_deeply($sorter->{Keys}, [ [ '.', 'alpha', 'asc' ] ], 'Default value for sort keys OK'); ############################################################################## # Poke some SAX events into it and confirm it doesn't die # my $list_elem = { Name => 'list', LocalName => 'list', Prefix => '', NamespaceURI => '', Attributes => {}, }; my $rec_elem = { Name => 'rec', LocalName => 'rec', Prefix => '', NamespaceURI => '', Attributes => {}, }; $sorter->start_document(); $sorter->start_element($list_elem); foreach my $text (qw(Tom Dick Larry)) { $sorter->start_element($rec_elem); $sorter->characters({ Data => $text}); $sorter->end_element($rec_elem); } $sorter->end_element($list_elem); $sorter->end_document(); ok(1, 'Filtered a document without crashing'); ############################################################################## # Confirm that the output was actually sorted # is($xml, 'DickLarryTom', 'Records sorted correctly'); ############################################################################## # Create another object and confirm that non-default 'Keys' value is # accepted. # my $keys = [ [ 'firstname', 'alpha', 'asc' ], [ 'lastname', 'alpha', 'asc' ], [ 'age', 'num', 'desc' ], ]; $sorter = XML::Filter::Sort->new( Record => 'rec', Handler => $writer, Keys => $keys ); is_deeply($sorter->{Keys}, $keys, 'Multi-key array looks OK'); ############################################################################## # Do it again, but this time specify 'Keys' using a scalar rather than nested # arrays. # $sorter = XML::Filter::Sort->new( Record => 'rec', Handler => $writer, Keys => " firstname lastname age num desc " ); is_deeply($sorter->{Keys}, $keys, 'Multi-key array from scalar looks OK'); ############################################################################## # More complex tests of the sorting functionality are deferred to the next # script which requires XML::SAX::Machines (which is surely installed if this # punter is serious). # XML-Filter-Sort-1.01/t/4_workout.t0000644000175000017500000006133710231532651017110 0ustar grantgrant00000000000000# $Id: 4_workout.t,v 1.3 2005/04/20 20:03:53 grantm Exp $ use strict; use Test::More; BEGIN { # Seems to be required by older Perls unless(eval { require XML::SAX::Writer }) { plan skip_all => 'XML::SAX::Writer not installed'; } unless(eval { require XML::SAX::ParserFactory }) { plan skip_all => 'XML::SAX::ParserFactory not installed'; } unless(eval { require XML::SAX::Machines }) { plan skip_all => 'XML::SAX::Machines not installed'; } } plan tests => 26; use XML::Filter::Sort; use XML::SAX::Machines qw( :all ); $^W = 1; my(@opts, $xmlin, $xmlout, $sorter); ############################################################################## # Global used to flag disk rather than memory buffering # @main::TempOpts = () unless(@main::TempOpts); ############################################################################## # Sort using full text content as key (including leading digits) # $xmlin = q( 1Zebedee 2Yorick 3Wayne 4Xavier ); $xmlout = ''; @opts = (Record => 'person'); push @opts, @main::TempOpts; $sorter = Pipeline( XML::Filter::Sort->new(@opts) => \$xmlout ); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, $xmlin, 'Default key to full text content, alpha, asc'); ############################################################################## # Sort using text content of specified child element as a key # $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 3Wayne 4Xavier 2Yorick 1Zebedee ), 'Parsed key from string and extracted element content'); ############################################################################## # Check that a 'foreign' element in the middle of a sequence of records # causes the records before and the records after to be sorted as two # independent lists. # $xmlin = q( 1Zebedee 2Yorick popcorn 3Wayne 4Xavier 0 ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 2Yorick 1Zebedee popcorn 3Wayne 4Xavier 0 ), 'Sorted two independent lists (element between)'); ############################################################################## # Check that non-whitespace text causes the same effect. # $xmlin = q( 1Zebedee 2Yorick popcorn 3Wayne 4Xavier ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 2Yorick 1Zebedee popcorn 3Wayne 4Xavier ), 'Sorted two independent lists (text between - easy case)'); ############################################################################## # Repeat that last test with slightly different input data to expose a flaw # which probably ought to be fixed. # $xmlin = q( 1Zebedee 2Yorick popcorn 4Xavier 3Wayne ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); TODO: { local $TODO = 'Trailing whitespace on leading text not quite done'; is($xmlout, q( 2Yorick 1Zebedee popcorn 3Wayne 4Xavier ), 'Sorted two independent lists (text between - pathological case)'); } ############################################################################## # Now do a similar test with a comment separating the two record lists. # $xmlin = q( 1Zebedee 2Yorick 4Xavier 3Wayne ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 2Yorick 1Zebedee 3Wayne 4Xavier ), 'Sorted two independent lists (comment between)'); ############################################################################## # Same again but with a processing instruction separating the two record lists. # $xmlin = q( 1Zebedee 2Yorick 4Xavier 3Wayne ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 2Yorick 1Zebedee 3Wayne 4Xavier ), 'Sorted two independent lists (PI between)'); ############################################################################## # Check that as each record is buffered, reordered and spat back out, it # retains its own leading whitespace. # $xmlin = q( Zebedee Yorick Wayne Xavier ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( Wayne Xavier Yorick Zebedee ), 'Funky indentation preserved'); ############################################################################## # Throw a namespace definition into the mix and confirm it is ignored. # $xmlin = q( 1Zebedee 2Yorick 3Wayne 4Xavier ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 3Wayne 4Xavier 2Yorick 1Zebedee ), 'Record selection with optional namespace works'); ############################################################################## # Now sort only the records with no namespace # $xmlin = q( 1Zebedee 2Yorick 3Wayne 4Xavier 5Vernon 6Trevor 7Ulbrecht ); $xmlout = ''; @opts = ( Record => '{}person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 1Zebedee 3Wayne 2Yorick 4Xavier 6Trevor 7Ulbrecht 5Vernon ), 'Record selection with no namespace works'); ############################################################################## # Now sort only the records with specified namespace # $xmlin = q( 1Zebedee 2Yorick 3Wayne 4Xavier 5Vernon 6Trevor 7Ulbrecht ); $xmlout = ''; @opts = ( Record => '{bob.com}person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 3Wayne 2Yorick 1Zebedee 4Xavier 5Vernon 6Trevor 7Ulbrecht ), 'Record selection with specific namespace works'); ############################################################################## # Put some comments into the mix and confirm they are handled correctly. # $xmlin = q( Zebedee Xavier Yorick ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( Xavier Yorick Zebedee ), 'Buffering of comments works'); ############################################################################## # Do the same with processing instructions. # $xmlin = q( Zebedee Xavier Yorick ); $xmlout = ''; @opts = ( Record => 'person', Keys => 'firstname', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( Xavier Yorick Zebedee ), 'Buffering of PIs works'); ############################################################################## # Run a multi-key sort - two alpha keys. # $xmlin = q( This is a list of names & ages Zebedee Boozle Yorick Cabbage Yorick Cabbage Xavier Aardvark
The End!
); $xmlout = ''; @opts = ( Record => 'person', Keys => ' lastname firstname @age ', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( This is a list of names & ages Xavier Aardvark Zebedee Boozle Yorick Cabbage Yorick Cabbage
The End!
), 'Multi-element records and multi-key sort OK'); ############################################################################## # Introduce a third sort key - numeric. # $xmlout = ''; @opts = ( Record => 'person', Keys => ' lastname, alpha, asc firstname, alpha, asc @age, num, asc ', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( This is a list of names & ages Xavier Aardvark Zebedee Boozle Yorick Cabbage Yorick Cabbage
The End!
), 'Numeric sort key OK'); ############################################################################## # Check that descending order works for both alpha and numeric sorts # $xmlout = ''; @opts = ( Record => 'person', Keys => ' firstname, alpha, desc lastname, alpha, asc @age, num, desc ', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( This is a list of names & ages Zebedee Boozle Yorick Cabbage Yorick Cabbage Xavier Aardvark
The End!
), 'Descending order OK'); ############################################################################## # Use a code reference rather than alpha or numeric comparator # $xmlin = q( QX54763 AS87645 YT19895 RS04198 ); $xmlout = ''; @opts = ( Record => 'part', Keys => [ [ '.' => sub { my @nums = map { /(\d+)/ } @_; $nums[0] <=> $nums[1]; } ] ] ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( RS04198 YT19895 QX54763 AS87645 ), 'Coderef comparator OK'); ############################################################################## # Test that by default case of keys is significant # $xmlin = q( red Green blue ); $xmlout = ''; @opts = ( Record => 'colour', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( Green blue red ), 'Case is significant by default'); ############################################################################## # But the IgnoreCase option fixes that # $xmlout = ''; @opts = ( Record => 'colour', IgnoreCase => 1, ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( blue Green red ), 'IgnoreCase makes case insignificant'); ############################################################################## # Test that by default space in keys is significant # $xmlin = q( red green blue light blue light blue light blue light blue ); $xmlout = ''; @opts = ( Record => 'colour', ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( blue light blue light blue light blue red green light blue ), 'Space is significant by default'); ############################################################################## # But the NormaliseKeySpace option fixes that # $xmlout = ''; @opts = ( Record => 'colour', NormaliseKeySpace => 1, ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( blue green light blue light blue light blue light blue red ), 'NormaliseKeySpace makes spaces insignificant'); ############################################################################## # And it fixes it for Americanz too # $xmlout = ''; $xmlin = q( red green blue light blue light blue light blue light blue ); $xmlout = ''; @opts = ( Record => 'color', NormalizeKeySpace => 1, # ^======= this is the bit we're testing ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( blue green light blue light blue light blue light blue red ), 'And it works for Americanz too'); ############################################################################## # Now try out the KeyFilterSub option. # $xmlout = ''; $xmlin = q( red green orange pink blue ); $xmlout = ''; @opts = ( Record => 'color', KeyFilterSub => sub { map { scalar reverse($_) } @_; }, ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( red orange blue pink green ), 'KeyFilterSub does its job'); ############################################################################## # Now try IgnoreCase, NormaliseKeySpace and KeyFilterSub simultaneously (and # at the same time). # $xmlout = ''; $xmlin = q( RED green light blue LIGHT BLUE orange PINK blue ); $xmlout = ''; @opts = ( Record => 'color', NormaliseKeySpace => 1, IgnoreCase => 1, KeyFilterSub => sub { map { scalar reverse($_) } @_; }, ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( RED orange blue light blue LIGHT BLUE PINK green ), 'IgnoreCase, NormaliseKeySpace & KeyFilterSub play nicely'); ############################################################################## # Slightly unusual version of KeyFilterSub which combine multiple keys # into one. # $xmlout = ''; $xmlin = q( red green orange BLUE RED Green orange blue ); $xmlout = ''; @opts = ( Record => 'color', Keys => '@prime, asc, desc; .', IgnoreCase => 1, KeyFilterSub => sub { sprintf("%02u%s", @_); }, ); push @opts, @main::TempOpts; $sorter = Pipeline(XML::Filter::Sort->new(@opts) => \$xmlout); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( orange green Green blue BLUE orange RED red ), 'Synthetic key generation via KeyFilterSub'); ############################################################################## # Test that text content of '0' doesn't give us grief (any more). # $xmlin = q( 0 9 5 0 7 0 ); $xmlout = ''; @opts = (Record => 'item', Keys => '., num, asc'); push @opts, @main::TempOpts; $sorter = Pipeline( XML::Filter::Sort->new(@opts) => \$xmlout ); $sorter->parse_string($xmlin); fix_xml($xmlout); is($xmlout, q( 0 0 5 7 9 0 ), 'No problem with text content of "0" even in sort key'); ############################################################################## # S U B R O U T I N E S ############################################################################## ############################################################################## # Sometimes the output from the SAX pipeline may not be exactly what we're # expecting - for benign reasons. This routine strips the initial XML # declaration which gets added by LibXML but not by other parsers. It also # changes attribute double quotes to single. # sub fix_xml { $_[0] =~ s{^<\?xml\s.*?\?>\s*}{}s; $_[0] =~ s{(\w+)="([^>]*?)"}{$1='$2'}sg; } XML-Filter-Sort-1.01/t/1_buffer.t0000644000175000017500000002146310231532650016637 0ustar grantgrant00000000000000# $Id: 1_buffer.t,v 1.2 2005/04/20 20:03:52 grantm Exp $ use strict; use Test::More; BEGIN { # Seems to be required by older Perls unless(eval { require XML::SAX::Writer }) { plan skip_all => 'XML::SAX::Writer not installed'; } unless(eval { require XML::SAX::ParserFactory }) { plan skip_all => 'XML::SAX::ParserFactory not installed'; } } plan tests => 17; $^W = 1; ############################################################################## # Confirm that the module compiles # use XML::Filter::Sort::Buffer; ok(1, 'XML::Filter::Sort::Buffer compiled OK'); ############################################################################## # Try creating a Buffer object # my $buffer = XML::Filter::Sort::Buffer->new( Keys => [ [ '.', 'a', 'a' ] # All text alpha ascending ] ); ok(ref($buffer), 'Created a buffer object'); isa_ok($buffer, 'XML::Filter::Sort::Buffer'); ############################################################################## # Poke some SAX events into it, close it and confirm the sort key value was # extracted correctly. # my $rec_elem = { Name => 'record', LocalName => 'record', Prefix => '', NamespaceURI => '', Attributes => {}, }; $buffer->start_element($rec_elem); $buffer->characters({ Data => 'text content'}); $buffer->end_element($rec_elem); my($keyval) = $buffer->close(); is($keyval, 'text content', 'Extracted sort key value from element content'); ############################################################################## # Spool the buffered contents out via SAX to a Writer and confirm it is what # we expected. # my $xmlout = ''; my $writer = XML::SAX::Writer->new(Output => \$xmlout); $writer->start_document(); $buffer->to_sax($writer); $writer->end_document(); is($xmlout, 'text content', 'Simple XML buffered OK'); ############################################################################## # Now try again but with sort key value in an attribute # $buffer = XML::Filter::Sort::Buffer->new( Keys => [ [ './@height', 'n', 'a' ] # value of 'height' attribute ] ); $rec_elem->{Attributes} = { '{}width' => { Name => 'width', LocalName => 'width', Prefix => '', NamespaceURI => '', Value => '1024', }, '{}height' => { Name => 'height', LocalName => 'height', Prefix => '', NamespaceURI => '', Value => '768', }, }; $buffer->characters({ Data => ' '}); $buffer->start_element($rec_elem); $buffer->characters({ Data => 'text content'}); $buffer->end_element($rec_elem); ($keyval) = $buffer->close(); is($keyval, '768', 'Extracted sort key value from attribute'); ############################################################################## # Make sure it comes back out as expected XML # $xmlout = ''; $writer = XML::SAX::Writer->new(Output => \$xmlout); $writer->start_document(); $buffer->to_sax($writer); $writer->end_document(); $xmlout =~ s/"/'/sg; like($xmlout, qr{^ text content}, 'XML containing attributes returned OK' ); ############################################################################## # Try creating a Buffer object configured with multiple (3) sort keys. This # time use a parser to generate SAX events rather than doing it manually. # Confirm correct sort key values were extracted and that output from the # buffer exactly matches the input. # my $xmlin = q( Zebedee Boozle 35 0 ); my(@keyvals); ($buffer, @keyvals) = buffer_from_xml( [ [ './lastname', 'a', 'a' ], [ './firstname', 'a', 'a' ], [ './age', 'a', 'a' ], ], $xmlin ); is_deeply(\@keyvals, [qw(Boozle Zebedee 35)], 'Multiple sort keys returned OK'); $xmlout = xml_from_buffer($buffer); $xmlout =~ s/"year"/'year'/; $xmlout =~ s{/>}{/>}; $xmlout =~ s{}{}; is($xmlout, $xmlin, 'Round-tripped XML containing elements and attributes'); ############################################################################## # Throw an XML comment into the mix. Confirm that the key value extraction # mechanism ignores the comment contents (obviously) and also that the # comment is correctly buffered and regurgitated. # $xmlin = q( Dougal Boozle ); ($buffer, @keyvals) = buffer_from_xml( [ [ './firstname', 'a', 'a' ], ], $xmlin ); is_deeply(\@keyvals, [qw(Dougal)], 'Ignored value in comment'); $xmlout = xml_from_buffer($buffer, 'simple sort key value'); $xmlout =~ s{<}{<}sg; # work around (old) XML::SAX::Writer bug $xmlout =~ s{>}{>}sg; is($xmlout, $xmlin, 'Round-tripped XML containing a comment'); ############################################################################## # Similar test, but with a Processing Instruction # $xmlin = q( Zebedee Boozle ); ok(($buffer, @keyvals) = buffer_from_xml( [ [ './lastname', 'a', 'a' ], ], $xmlin ), 'No crash when presented with PI' ); is_deeply(\@keyvals, [qw(Boozle)], 'Extracted another simple sort key value'); $xmlout = xml_from_buffer($buffer); is($xmlout, $xmlin, 'Round-tripped XML containing a Processing Instruction'); ############################################################################## # Ask for matches against non-existant elements confirm we get one empty # string back for each. # $xmlin = q( Zebedee Boozle ); ($buffer, @keyvals) = buffer_from_xml( [ [ './address', 'a', 'a' ], [ './email', 'a', 'a' ], ], $xmlin ); is_deeply(\@keyvals, ['', ''], 'Correct key values returned when match failed'); ############################################################################## # Now create a buffer configured with a long list of sort keys of varying # forms and confirm they all match what we expect them to match. # $xmlin = q( Boozle Zebedee 35 100x???Fore! ); ($buffer, @keyvals) = buffer_from_xml( [ ['lastname'], ['firstname'], ['age'], ['./lastname'], ['./alpha/beta/carotine'], ['./firstname/@initial'], ['firstname/@initial'], ['@initial'], ['@age'], ['@gender'], ['alpha/age'], ['alpha/beta/gamma'], ['alpha/beta'], ], $xmlin ); is_deeply(\@keyvals, [ 'Boozle', 'Zebedee', '35', 'Boozle', 'Fore!', 'Z', 'Z', 'Z', 12, '', 100, '???', 'x???Fore!' ], 'Longish list of more complex keys'); ############################################################################## # Now do a similar test, but this time with namespaces thrown into the mix. # $xmlin = q( Smith Jones O'Toole Xavier Yorick Patrick ); ($buffer, @keyvals) = buffer_from_xml( [ ['./names/lastname'], ['./names/{}lastname'], ['./names/alias/{pat.ie}lastname'], ['lastname'], ['{}lastname'], ['{pat.ie}lastname'], ['firstname/@initial'], ['firstname/@{}initial'], ['firstname/@{pat.ie}initial'], ['./names/firstname/@initial'], ['./names/firstname/@{}initial'], ['./names/alias/firstname/@{pat.ie}initial'], ], $xmlin ); is_deeply(\@keyvals, [ 'Smith', 'Jones', 'O\'Toole', 'Smith', 'Jones', 'O\'Toole', 'X', 'Y', 'P', 'X', 'Y', 'P', ], 'Keys with namespace elements'); ############################################################################## # S U B R O U T I N E S ############################################################################## sub buffer_from_xml { my($keys, $xml) = @_; $buffer = XML::Filter::Sort::Buffer->new(Keys => $keys); my $parser = XML::SAX::ParserFactory->parser(Handler => $buffer); $parser->parse_string($xml); my @keyvals = $buffer->close(); return($buffer, @keyvals); } sub xml_from_buffer { my($buffer) = @_; my $xml = ''; $writer = XML::SAX::Writer->new(Output => \$xml); $writer->start_document(); $buffer->to_sax($writer); $writer->end_document(); return($xml); } XML-Filter-Sort-1.01/t/0_config.t0000644000175000017500000000250407502452226016634 0ustar grantgrant00000000000000# $Id: 0_config.t,v 1.1.1.1 2002/06/14 20:39:50 grantm Exp $ use Test::More tests => 1; use strict; use File::Spec; eval { # Build up a list of installed modules my @mod_list = qw( XML::SAX XML::SAX::Writer XML::SAX::Machines XML::NamespaceSupport ); # If XML::SAX is installed, add a list of installed SAX parsers eval { require XML::SAX; }; my $default_parser = ''; unless($@) { push @mod_list, map { $_->{Name} } @{XML::SAX->parsers()}; $default_parser = ref(XML::SAX::ParserFactory->parser()); } # Extract the version number from each module my(%version); foreach my $module (@mod_list) { eval " require $module; "; unless($@) { no strict 'refs'; $version{$module} = ${$module . '::VERSION'} || "Unknown"; } } # Add version number of the Perl binary eval ' use Config; $version{perl} = $Config{version} '; # Should never fail if($@) { $version{perl} = $]; } unshift @mod_list, 'perl'; # Print details of installed modules on STDERR printf STDERR "\r%-30s %s\n", 'Package', 'Version'; foreach my $module (@mod_list) { $version{$module} = 'Not Installed' unless(defined($version{$module})); $version{$module} .= " (default parser)" if($module eq $default_parser); printf STDERR " %-30s %s\n", $module, $version{$module}; } }; ok(1); XML-Filter-Sort-1.01/t/2_buffermgr.t0000644000175000017500000001054607502452225017354 0ustar grantgrant00000000000000# $Id: 2_buffermgr.t,v 1.1.1.1 2002/06/14 20:39:49 grantm Exp $ use strict; use Test::More; BEGIN { # Seems to be required by older Perls unless(eval { require XML::SAX::Writer }) { plan skip_all => 'XML::SAX::Writer not installed'; } unless(eval { require XML::SAX::ParserFactory }) { plan skip_all => 'XML::SAX::ParserFactory not installed'; } } plan tests => 13; $^W = 1; ############################################################################## # Confirm that the module compiles # use XML::Filter::Sort::BufferMgr; ok(1, 'XML::Filter::Sort::BufferMgr compiled OK'); ############################################################################## # Try creating a BufferMgr object # my $bm = XML::Filter::Sort::BufferMgr->new( Keys => [ [ '.', 'a', 'a' ] # All text alpha ascending ] ); ok(ref($bm), 'Created a buffer manager object'); isa_ok($bm, 'XML::Filter::Sort::BufferMgr'); ############################################################################## # Use it to create a Buffer object # my $buffer = $bm->new_buffer(); ok(ref($buffer), 'Created a buffer object'); isa_ok($buffer, 'XML::Filter::Sort::Buffer'); ############################################################################## # Poke some SAX events into the Buffer, close it and confirm that the # BufferMgr got the correct sort key value out and stored the Buffer in the # expected place. # my $rec_elem = { Name => 'record', LocalName => 'record', Prefix => '', NamespaceURI => '', Attributes => {}, }; $buffer->start_element($rec_elem); $buffer->characters({ Data => 'text content'}); $buffer->end_element($rec_elem); $bm->close_buffer($buffer); my($keyval) = keys(%{$bm->{records}}); is($keyval, 'text content', 'Sort key value extracted and buffer stored'); is(ref($bm->{records}->{$keyval}), 'ARRAY', 'Container for single-key records'); ############################################################################## # Ask the BufferMgr to regurgitate its buffers as SAX events to a SAX Writer # handler and confirm the results. # my $xmlout = ''; my $writer = XML::SAX::Writer->new(Output => \$xmlout); $writer->start_document(); $bm->to_sax($writer); $writer->end_document(); is($xmlout, 'text content', 'XML returned OK from single-level buffer' ); ############################################################################## # Create a new BufferMgr object to handle multiple (2) sort keys. Get a # Buffer, poke some SAX events into it; confirm the sort key values were # extracted correctly and the storage of the buffer uses a layer of indirection # for the second key value. # $bm = XML::Filter::Sort::BufferMgr->new( Keys => [ [ './@height', 'n', 'a' ], # primary sort key [ './@width', 'n', 'a' ], # secondary sort key ] ); $rec_elem->{Attributes} = { '{}width' => { Name => 'width', LocalName => 'width', Prefix => '', NamespaceURI => '', Value => '1024', }, '{}height' => { Name => 'height', LocalName => 'height', Prefix => '', NamespaceURI => '', Value => '768', }, }; $buffer = $bm->new_buffer(); $buffer->characters({ Data => ' '}); $buffer->start_element($rec_elem); $buffer->characters({ Data => 'text content'}); $buffer->end_element($rec_elem); $bm->close_buffer($buffer); my($pkeyval) = keys(%{$bm->{records}}); is($pkeyval, 768, 'Primary sort key value extracted and buffer stored'); is(ref($bm->{records}->{$pkeyval}), 'XML::Filter::Sort::BufferMgr', 'High level container for multi-key records'); my($skeyval) = keys(%{$bm->{records}->{$pkeyval}->{records}}); is($skeyval, 1024, 'Secondary sort key value extracted and buffer stored'); is(ref($bm->{records}->{$pkeyval}->{records}->{$skeyval}), 'ARRAY', 'Lowest level container for multi-key records'); ############################################################################## # Get the buffer contents back via a SAX Writer and confirm they are as # expected. # $xmlout = ''; $writer = XML::SAX::Writer->new(Output => \$xmlout); $writer->start_document(); $bm->to_sax($writer); $writer->end_document(); $xmlout =~ s/"/'/sg; like($xmlout, qr{^ text content}, 'XML returned OK from multi-level buffer' ); XML-Filter-Sort-1.01/Changes0000644000175000017500000000103310231533055016000 0ustar grantgrant00000000000000Revision history for Perl extension XML::Filter::Sort. 1.01 21-Apr-2005 - Fixed similar serious bug with text content of sort key was '0', also reported by Rishi Dhupar 1.00 20-Apr-2005 - Fixed serious bug with text content = '0' reported by Rishi Dhupar 0.91 11-Jun-2002 - CPAN release - imported into CVS on SourceForge 0.90 11-Jun-2002 - pre-CPAN release 0.01 11-Apr-2002 - original version; created by h2xs 1.20 with options -AX -n XML::Filter::Sort XML-Filter-Sort-1.01/MANIFEST0000644000175000017500000000060210231413305015632 0ustar grantgrant00000000000000Changes MANIFEST README Makefile.PL lib/XML/Filter/Sort.pm lib/XML/Filter/Sort/Buffer.pm lib/XML/Filter/Sort/BufferMgr.pm lib/XML/Filter/Sort/DiskBuffer.pm lib/XML/Filter/Sort/DiskBufferMgr.pm bin/xmlsort t/0_config.t t/1_buffer.t t/2_buffermgr.t t/3_sort.t t/4_workout.t t/5_diskbuffer.t t/6_diskworkout.t META.yml Module meta-data (added by MakeMaker) XML-Filter-Sort-1.01/META.yml0000644000175000017500000000065210231533633015766 0ustar grantgrant00000000000000# http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: XML-Filter-Sort version: 1.01 version_from: lib/XML/Filter/Sort.pm installdirs: site requires: Test::Simple: 0.41 XML::SAX: XML::SAX::Writer: distribution_type: module generated_by: ExtUtils::MakeMaker version 6.17 XML-Filter-Sort-1.01/Makefile.PL0000644000175000017500000000121507502452216016467 0ustar grantgrant00000000000000use ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( 'NAME' => 'XML::Filter::Sort', 'VERSION_FROM' => 'lib/XML/Filter/Sort.pm', 'PREREQ_PM' => { Test::Simple => 0.41, XML::SAX => undef, XML::SAX::Writer => undef, }, 'EXE_FILES' => [ qw(bin/xmlsort) ], ($] >= 5.005 ? ( 'AUTHOR' => 'Grant McLean ', 'ABSTRACT_FROM' => 'lib/XML/Filter/Sort.pm', ) : () ) ); XML-Filter-Sort-1.01/README0000644000175000017500000000134010231533126015365 0ustar grantgrant00000000000000DESCRIPTION XML::Filter::Sort - SAX filter for sorting elements in XML This module is a SAX filter for sorting 'records' in XML documents (including documents larger than available memory). The `xmlsort' utility which is included with this distribution can be used to sort an XML file from the command line without writing Perl code (see: `perldoc xmlsort'). PREREQUISITES Requires XML::SAX BUILDING/INSTALLING Once the archive is unpacked, use these commands: perl Makefile.PL make make test make install COPYRIGHT Copyright 2002-2005 Grant McLean This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.