Index: lib/Plagger/Plugin/Filter/EntryFullText.pm
===================================================================
--- lib/Plagger/Plugin/Filter/EntryFullText.pm	(revision 1902)
+++ lib/Plagger/Plugin/Filter/EntryFullText.pm	(working copy)
@@ -18,8 +18,10 @@
     my($self, $context) = @_;
     $context->register_hook(
         $self,
+        'aggregator.finalize' => \&finalize,
         'customfeed.handle'  => \&handle,
         'update.entry.fixup' => \&filter,
+        'update.fixup'       => \&fixup,
     );
 }
 
@@ -80,6 +82,150 @@
     map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) } @data;
 }
 
+# Fetch the page behind an entry and add every link on it that is selected by
+# $args->{match} (a regexp tried against the href) or, failing that, by
+# $args->{xpath} as a new entry of $args->{feed}.  The URLs found are recorded
+# in $feed->meta->{merge}{$url} so that fixup() can later fold those entries
+# into one.  Returns 1 once the page was fetched, nothing if the GET failed.
+sub merge_get_entries {
+    my($self, $context, $args) = @_;
+
+    my $url = $args->{entry}->permalink || $args->{entry}->link;
+    $context->log(info => "merge_get_entries GET $url");
+
+    my $agent = Plagger::UserAgent->new;
+    my $res = $agent->fetch($url, $self);
+
+    if ($res->http_response->is_error) {
+        $context->log(error => "merge_get_entries GET $url failed: " . $res->status);
+        return;
+    }
+
+    my $content = decode_content($res);
+
+    my $feed = $args->{feed};
+    if (my $re = $args->{match}) {
+        my $resolver = HTML::ResolveLink->new(base => $url);
+        $content = $resolver->resolve($content);
+
+        my %seen;
+        my $parser = HTML::TokeParser->new(\$content);
+        while (my $token = $parser->get_tag('a')) {
+            next unless ($token->[1]->{href} || '') =~ /$re/;
+
+            my $text = $parser->get_trimmed_text('/a');
+            next if !$text || $text eq '[IMG]';
+
+            my $item_url = URI->new_abs($token->[1]->{href}, $url);
+            next if $seen{$item_url->as_string}++;
+
+            my $entry = Plagger::Entry->new;
+            $entry->title($text);
+            $entry->link($item_url);
+            $feed->add_entry($entry);
+            push(@{$feed->meta->{merge}->{$url}}, $item_url);
+            $context->log(debug => "Add $token->[1]->{href} ($text)");
+        }
+    } elsif (my $xpath = $args->{xpath}) {
+        my $tree = HTML::TreeBuilder::XPath->new;
+        $tree->parse($content);
+        $tree->eof;
+
+        my %seen;
+        for my $child ($tree->findnodes($xpath)) {
+            my $href = $child->attr('href') or next;
+            my $title = $child->attr('title') || $child->as_text;
+
+            # Dedupe on the absolute URL, like the regexp branch does, so
+            # that "a.html" and "./a.html" do not yield two entries.
+            my $item_url = URI->new_abs($href, $url);
+            next if $seen{$item_url->as_string}++;
+
+            my $entry = Plagger::Entry->new;
+            $entry->title($title);
+            $entry->link($item_url);
+            $feed->add_entry($entry);
+            push(@{$feed->meta->{merge}->{$url}}, $item_url);
+            $context->log(debug => "Add $href ($title)");
+        }
+
+        # HTML::TreeBuilder trees are self-referential; free them by hand.
+        $tree->delete;
+    }
+
+    return 1;
+}
+
+# aggregator.finalize hook.  When the "merge_pages" option is set, hand each
+# entry to the first site plugin whose merge_entry_handle accepts it and
+# expand the entry into the pages it links to (see merge_get_entries).
+sub finalize {
+    my($self, $context) = @_;
+
+    return unless $self->conf->{merge_pages};
+
+    for my $feed ($context->subscription->feeds) {
+        for my $entry ($feed->entries) {
+            my $args = { feed => $feed, entry => $entry };
+            my $handler = first { $_->merge_entry_handle($args) } @{ $self->{plugins} }
+                or next;
+
+            $context->log(debug => " handled by " . $handler->site_name);
+            $args->{match} = $handler->merge_entry_follow_link;
+            $args->{xpath} = $handler->merge_entry_follow_xpath;
+            $self->merge_get_entries($context, $args);
+        }
+    }
+}
+
+# update.fixup hook.  For every page recorded by merge_get_entries, append the
+# bodies of the entries found on that page to the first one (in sorted URL
+# order) and delete the entries that were merged away from the feed.
+sub fixup {
+    my($self, $context) = @_;
+
+    return unless $self->conf->{merge_pages};
+
+    for my $feed ($context->update->feeds) {
+        for my $key (keys %{$feed->meta->{merge}}) {
+            my %seen;
+
+            push(@{$feed->meta->{merge}->{$key}}, $key);
+            my @mergelist = sort(@{$feed->meta->{merge}->{$key}});
+            my @urls;
+            my @delurls;
+            for my $url (@mergelist) {
+                # "dir/" and "dir/index.html" are the same page: keep $key and
+                # schedule the alias for deletion.  \Q..\E keeps the "." and
+                # "?" of a URL from being read as regexp metacharacters.
+                if (($key =~ m!/$! and $url =~ m!^\Q$key\Eindex\.s?html?$!) or
+                    ($url =~ m!/$! and $key =~ m!^\Q$url\Eindex\.s?html?$!)) {
+                    push(@delurls, $url);
+                    $url = $key;
+                }
+                push(@urls, $url) unless $seen{$url}++;
+            }
+
+            my $top = shift(@urls);
+            my $entry = first { $_->permalink eq $top } $feed->entries;
+            $context->error("not found $top") unless $entry;
+
+            for my $val (@urls) {
+                my $add = first { $_->permalink eq $val } $feed->entries;
+                $context->error("not found $val") unless $add;
+
+                $entry->body($entry->body . $add->body);
+                $feed->delete_entry($add);
+            }
+            for my $val (@delurls) {
+                my $del = first { $_->permalink eq $val } $feed->entries;
+                $context->error("not found $val") unless $del;
+                $feed->delete_entry($del);
+            }
+        }
+    }
+}
+
 sub handle {
     my($self, $context, $args) = @_;
 
@@ -185,6 +331,10 @@
 
 package Plagger::Plugin::Filter::EntryFullText::Site;
 sub new { bless {}, shift }
+# Page merging is off by default: never handles an entry, follows nothing.
+sub merge_entry_handle { 0 }
+sub merge_entry_follow_link { }
+sub merge_entry_follow_xpath { }
 sub custom_feed_handle { 0 }
 sub custom_feed_follow_link { }
 sub custom_feed_follow_xpath { }
@@ -199,7 +349,7 @@
     my($class, $data, $base) = @_;
 
     # add ^ if handle method starts with http://
-    for my $key ( qw(custom_feed_handle handle handle_force) ) {
+    for my $key ( qw(custom_feed_handle handle handle_force merge_entry_handle) ) {
         next unless defined $data->{$key};
         $data->{$key} = "^$data->{$key}" if $data->{$key} =~ m!^https?://!;
     }
@@ -222,6 +372,25 @@
     $self->{base};
 }
 
+# True when the entry's permalink (or link) matches this site's
+# "merge_entry_handle" regexp; 0 when the site defines none.
+sub merge_entry_handle {
+    my($self, $args) = @_;
+    my $link = $args->{entry}->permalink || $args->{entry}->link;
+    $self->{merge_entry_handle} ?
+        $link =~ /$self->{merge_entry_handle}/ : 0;
+}
+
+# Accessor: regexp that merge_get_entries tries against each href.
+sub merge_entry_follow_link {
+    $_[0]->{merge_entry_follow_link};
+}
+
+# Accessor: XPath expression used when no follow_link regexp is set.
+sub merge_entry_follow_xpath {
+    $_[0]->{merge_entry_follow_xpath};
+}
+
 sub custom_feed_handle {
     my($self, $args) = @_;
     $self->{custom_feed_handle} ?