Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:Tomcat42
perl-HTML-TagParser
HTML-TagParser-0.16-subtree.diff
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File HTML-TagParser-0.16-subtree.diff of Package perl-HTML-TagParser
--- HTML-TagParser-0.16/t/12_navigation.t.orig 2008-07-11 01:34:16.000000000 +0200 +++ HTML-TagParser-0.16/t/12_navigation.t 2008-07-11 02:06:45.000000000 +0200 @@ -0,0 +1,40 @@ +# ---------------------------------------------------------------- +use strict; +use Test::More tests => 8; +BEGIN { use_ok('HTML::TagParser') }; +# ---------------------------------------------------------------- + +my $SOURCE = <<EOT; +<html> +<body> +<div id="foo"> + <span>AAA</span> + <div id="bar"selected> + BBB + <span>CCC</span> + DDD + <div/> + EEE + </div> + <span>FFF</span> +</div> +</body> +</html> +EOT +# ---------------------------------------------------------------- + +my $document = HTML::TagParser->new( $SOURCE ); +ok( ref $document, "new()" ); +my $bar = $document->getElementById('bar'); +my $fff = $bar->nextSibling(); +like( $fff->innerText(), qr/FFF/s, "nextSibling" ); +is( $fff->nextSibling(), undef, "no nextSibling" ); +my $ch = $bar->childNodes(); +is( $#$ch, 1, "childNodes" ); +is( $ch->[1]->parentNode()->id(), "bar", "parentNode" ); +is( $ch->[1]->parentNode()->parentNode()->id(), "foo", "parent.parentNode" ); +is( $ch->[1]->parentNode()->parentNode()->parentNode->parentNode()->parentNode(), undef, "root parentNode" ); + +# ---------------------------------------------------------------- +;1; +# ---------------------------------------------------------------- --- HTML-TagParser-0.16/t/08_nest.t.orig 2006-05-05 21:14:24.000000000 +0200 +++ HTML-TagParser-0.16/t/08_nest.t 2008-07-11 01:29:17.000000000 +0200 @@ -28,12 +28,12 @@ like( $body->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "body" ); my $foo = $html->getElementById( "foo" ); - like( $foo->innerText(), qr/AAA/s, "foo" ); -# like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" ); +# like( $foo->innerText(), qr/AAA/s, "foo" ); + like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" ); my $bar = $html->getElementById( "bar" ); - like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" ); -# like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" ); +# like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" ); + like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" ); # ---------------------------------------------------------------- ;1; # ---------------------------------------------------------------- --- HTML-TagParser-0.16/lib/HTML/TagParser.pm.orig 2007-04-06 12:48:08.000000000 +0200 +++ HTML-TagParser-0.16/lib/HTML/TagParser.pm 2008-07-11 02:15:14.000000000 +0200 @@ -10,12 +10,15 @@ my $elem = $html->getElementsByTagName( "title" ); print "<title>", $elem->innerText(), "</title>\n" if ref $elem; -Parse a HTML source and find its first <form action=""> attribute's value. +Parse a HTML source and find its first <form action=""> attribute's value +and find all input elements belonging to this form. my $src = '<html><form action="hoge.cgi">...</form></html>'; my $html = HTML::TagParser->new( $src ); my $elem = $html->getElementsByTagName( "form" ); print "<form action=\"", $elem->getAttribute("action"), "\">\n" if ref $elem; + my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); + my $form = $first_inputs[0]->getParent(); Fetch a HTML file via HTTP, and display its all <a> elements and attributes. @@ -120,6 +123,43 @@ This method returns $elem's innerText without tags. +=head2 $subhtml = $elem->subTree(); + +This method returns a new object of class HTML::Parser, +with all the elements that are in the DOM hierarchy under $elem. + +=head2 $elem = $elem->nextSibling(); + +This method returns the next sibling within the same parent. +It returns undef when called on a closing tag or on the lastChild node +of a parentNode. + +=head2 $elem = $elem->previousSibling(); + +This method returns the previous sibling within the same parent. +It returns undef when called on the firstChild node of a parentNode. + +=head2 $child_elem = $elem->firstChild(); + +This method returns the first child node of $elem. +It returns undef when called on a closing tag element or on a +non-container or empty container element. + +=head2 $child_elems = $elem->childNodes(); + +This method creates an array of all child nodes of $elem and returns the array by reference. +It returns an empty array-ref [] whenever firstChild() would return undef. + +=head2 $child_elem = $elem->lastChild(); + +This method returns the last child node of $elem. +It returns undef whenever firstChild() would return undef. + +=head2 $parent = $elem->parentNode(); + +This method returns the parent node of $elem. +It returns undef when called on root nodes. + =head2 $attr = $elem->attributes(); This method returns a hash of $elem's all attributes. @@ -128,6 +168,17 @@ This method returns the value of $elem's attributes which name is $key. +=head1 BUGS + +The HTML-Parser is simple. Methods innerText and subTree may be +fooled by nested tags or embedded javascript code. + +The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results. +The most expensive ones are lastChild() and previousSibling(). +parentNode() is also expensive, but only once. It does caching. + +The DOM tree is read-only, as this is just a parser. + =head1 INTERNATIONALIZATION This module natively understands the character encoding used in document @@ -157,12 +208,21 @@ use Carp; use vars qw( $VERSION ); -$VERSION = "0.16"; +$VERSION = "0.16.1"; my $J2E = {qw( jis ISO-2022-JP sjis Shift_JIS euc EUC-JP ucs2 UCS2 )}; my $E2J = { map { lc($_) } reverse %$J2E }; my $SEC_OF_DAY = 60 * 60 * 24; +# [000] '/' if closing tag. +# [001] tagName +# [002] attributes string (with trailing /, if self-closing tag). +# [003] content until next (nested) tag. +# [004] attributes hash cache. +# [005] innerText combined strings cache. +# [006] index of matching closing tag (or opening tag, if [000]=='/') +# [007] index of parent (aka container) tag. +# sub new { my $package = shift; my $src = shift; @@ -330,10 +390,10 @@ return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); # <xxx/> my $tagname = $elem->[001]; + my $closing = HTML::TagParser::Util::find_closing($flat, $cur); my $list = []; - for ( ; $cur < $#$flat ; $cur++ ) { + for ( ; $cur < $closing ; $cur++ ) { push( @$list, $flat->[$cur]->[003] ); - last if ( $flat->[ $cur + 1 ]->[001] eq $tagname ); } my $text = join( "", grep { $_ ne "" } @$list ); $text =~ s/^\s+//s; @@ -342,6 +402,127 @@ $elem->[005] = HTML::TagParser::Util::xml_unescape( $text ); } +sub subTree +{ + my $self = shift; + my ( $flat, $cur ) = @$self; + my $elem = $flat->[$cur]; + return if $elem->[000]; # </xxx> + my $closing = HTML::TagParser::Util::find_closing($flat, $cur); + my $list = []; + while (++$cur < $closing) + { + push @$list, $flat->[$cur]; + } + + # allow the getElement...() methods on the returned object. + return bless { flat => $list }, 'HTML::TagParser'; +} + + +sub nextSibling +{ + my $self = shift; + my ( $flat, $cur ) = @$self; + my $elem = $flat->[$cur]; + + return undef if $elem->[000]; # </xxx> + my $closing = HTML::TagParser::Util::find_closing($flat, $cur); + my $next_s = $flat->[$closing+1]; + return undef unless $next_s; + return undef if $next_s->[000]; # parent's </xxx> + return HTML::TagParser::Element->new( $flat, $closing+1 ); +} + +sub firstChild +{ + my $self = shift; + my ( $flat, $cur ) = @$self; + my $elem = $flat->[$cur]; + return undef if $elem->[000]; # </xxx> + my $closing = HTML::TagParser::Util::find_closing($flat, $cur); + return undef if $closing <= $cur+1; # no children here. + return HTML::TagParser::Element->new( $flat, $cur+1 ); +} + +sub childNodes +{ + my $self = shift; + my ( $flat, $cur ) = @$self; + my $child = firstChild($self); + return [] unless $child; # an empty array is easier for our callers than undef + my @c = ( $child ); + while (defined ($child = nextSibling($child))) + { + push @c, $child; + } + return \@c; +} + +sub lastChild +{ + my $c = childNodes(@_); + return undef unless $c->[0]; + return $c->[-1]; +} + +sub previousSibling +{ + my $self = shift; + my ( $flat, $cur ) = @$self; + + ## This one is expensive. + ## We use find_closing() which walks forward. + ## We'd need a find_opening() which walks backwards. + ## So we walk backwards one by one and consult find_closing() + ## until we find $cur-1 or $cur. + + my $idx = $cur-1; + while ($idx >= 0) + { + if ($flat->[$idx][000] && defined($flat->[$idx][006])) + { + $idx = $flat->[$idx][006]; # use cache for backwards skipping + next; + } + + my $closing = HTML::TagParser::Util::find_closing($flat, $idx); + return HTML::TagParser::Element->new( $flat, $idx ) + if defined $closing and ($closing == $cur || $closing == $cur-1); + $idx--; + } + return undef; +} + +sub parentNode +{ + my $self = shift; + my ( $flat, $cur ) = @$self; + + return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache + + ## + ## This one is very expensive. + ## We use previousSibling() to walk backwards, and + ## previousSibling() is expensive. + ## + my $ps = $self; + my $first = $self; + + while (defined($ps = previousSibling($ps))) { $first = $ps; } + + my $parent = $first->[1] - 1; + return undef if $parent < 0; + die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; + + $flat->[$cur][007] = $parent; # cache + return HTML::TagParser::Element->new( $flat, $parent ) +} + +## +## feature: +## self-closing tags have an additional attribute '/' => '/'. +## sub attributes { my $self = shift; my ( $flat, $cur ) = @$self; @@ -420,6 +601,66 @@ $flat; } +## returns 1 beyond the end, if not found. +## returns undef if called on a </xxx> closing tag +sub find_closing +{ + my ($flat, $cur) = @_; + + return $flat->[$cur][006] if $flat->[$cur][006]; # cache + return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing + + my $name = $flat->[$cur][001]; + my $pre_nest = 0; + ## count how many levels deep this type of tag is nested. + my $idx; + for ($idx = 0; $idx <= $cur; $idx++) + { + my $e = $flat->[$idx]; + next unless $e->[001] eq $name; + next if (($e->[002]||'') =~ m{/$}); # self-closing + $pre_nest += ($e->[000]) ? -1 : 1; + $pre_nest = 0 if $pre_nest < 0; + $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. + } + my $last_idx = $#$flat; + + ## we move last_idx closer, in case this container + ## has not all its subcontainers closed properly. + my $post_nest = 0; + for ($idx = $last_idx; $idx > $cur; $idx--) + { + my $e = $flat->[$idx]; + next unless $e->[001] eq $name; + $last_idx = $idx-1; # remember where a matching tag was + next if (($e->[002]||'') =~ m{/$}); # self-closing + $post_nest -= ($e->[000]) ? -1 : 1; + $post_nest = 0 if $post_nest < 0; + last if $pre_nest <= $post_nest; + $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards. + } + + my $nest = 1; # we know it is not self-closing. start behind. + + for ($idx = $cur+1; $idx <= $last_idx; $idx++) + { + my $e = $flat->[$idx]; + next unless $e->[001] eq $name; + next if (($e->[002]||'') =~ m{/$}); # self-closing + $nest += ($e->[000]) ? -1 : 1; + if ($nest <= 0) + { + die "assert </xxx>" unless $e->[000]; + $e->[006] = $cur; # point back to opening tag + return $flat->[$cur][006] = $idx; + } + $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. + } + + # not all closed, but cannot go further + return $flat->[$cur][006] = $last_idx+1; +} + sub find_meta_charset { my $txtref = shift; # reference while ( $$txtref =~ m{
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor