package Plagger::Plugin::Filter::EntryFullText;
use strict;
use base qw( Plagger::Plugin );

use DirHandle;
use Encode;
use File::Spec;
use List::Util qw(first);
use HTML::ResolveLink;
use Plagger::Date; # for metadata in plugins
use Plagger::Util qw( decode_content );
use Plagger::Plugin::CustomFeed::Simple;
use Plagger::UserAgent;

# Returns the hook name 'update.entry.fixup'.
# NOTE(review): presumably consulted by the Plagger::Plugin base class when
# evaluating rules -- confirm against the base class.
sub rule_hook { 'update.entry.fixup' }

# Registers this plugin's callbacks with the given context:
#   aggregator.finalize -> finalize
#   customfeed.handle   -> handle
#   update.entry.fixup  -> filter
#   update.fixup        -> fixup
sub register {
    my($self, $context) = @_;
    $context->register_hook(
        $self,
        'aggregator.finalize' => \&finalize,
        'customfeed.handle'   => \&handle,
        'update.entry.fixup'  => \&filter,
        'update.fixup'        => \&fixup,
    );
}

# Runs the parent initialisation, loads the site plugins from the assets and
# creates the user agent that filter() uses for fetching.
sub init {
    my $self = shift;
    $self->SUPER::init(@_);
    $self->load_plugins();
    $self->{ua} = Plagger::UserAgent->new;
}

# Loads every '*.yaml' and '*.pl' asset through load_assets(), handing each
# one to the matching loader below.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}

# Compiles a Perl site plugin from $file and appends an instance of it to
# $self->{plugins}.
#
#   $file - path of the asset file to read
#   $base - name of the asset; a trailing '.pl' is stripped to get the
#           package suffix under ...::EntryFullText::Site::
#
# If the target class already responds to new(), nothing is compiled and a
# fresh instance is returned WITHOUT being added to $self->{plugins}.
# NOTE(review): presumably this lets an asset of the same name that was loaded
# earlier take precedence -- confirm against load_assets().
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        return $plugin_class->new;
    }

    my $code = do { local $/; <$fh> };
    $code = '' unless defined $code;    # empty file: match join('', ()) semantics
    close $fh;

    # A file that does not declare its own package gets wrapped in the
    # standard Site subclass boilerplate.
    unless ($code =~ /^\s*package/s) {
        $code = join "\n", (
            "package $plugin_class;",
            "use strict;",
            "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
            "sub site_name { '$pkg' }",
            $code,
            "1;"
        );
    }

    # NOTE(review): string eval executes the asset file as Perl code; only
    # trusted files must be reachable through the assets path.
    eval $code;
    Plagger->context->error($@) if $@;

    push @{ $self->{plugins} }, $plugin_class->new;
}

# Loads every YAML document found in $file and appends one
# Plagger::Plugin::Filter::EntryFullText::YAML instance per document to
# $self->{plugins}.
sub load_plugin_yaml {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");
    my @data = YAML::LoadFile($file);

    push @{ $self->{plugins} },
        map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) } @data;
}

# Fetches the page behind an entry and adds one new entry to the feed for
# every link on that page selected by $args->{match} (a regex applied to the
# href of <a> tags) or, failing that, by $args->{xpath}.
#
#   $args->{entry} - entry whose permalink (or link) is fetched
#   $args->{feed}  - feed that receives the new entries; the discovered URLs
#                    are also recorded in $feed->meta->{merge}->{$url}
#
# Returns 1 when the page was processed, and an empty return when the HTTP
# response is an error.
sub merge_get_entries {
    my($self, $context, $args) = @_;

    my $url = $args->{entry}->permalink || $args->{entry}->link;
    $context->log(info => "merge_get_entries GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res   = $agent->fetch($url, $self);

    if ($res->http_response->is_error) {
        $context->log(error => "merge_get_entries GET $url failed: " . $res->status);
        return;
    }

    my $content = decode_content($res);
    my $feed    = $args->{feed};

    if ( my $re = $args->{match} ) {
        my $resolver = HTML::ResolveLink->new(base => $url);
        $content = $resolver->resolve($content);

        my %seen;
        my $parser = HTML::TokeParser->new(\$content);
        while (my $token = $parser->get_tag('a')) {
            # $re comes from the site plugin and is meant to be a pattern,
            # so it is interpolated unescaped on purpose.
            next unless ($token->[1]->{href} || '') =~ /$re/;
            my $text = $parser->get_trimmed_text('/a');
            next if !$text || $text eq '[IMG]';

            my $item_url = URI->new_abs($token->[1]->{href}, $url);
            next if $seen{$item_url->as_string}++;

            my $entry = Plagger::Entry->new;
            $entry->title($text);
            $entry->link($item_url);
            $feed->add_entry($entry);
            push(@{$feed->meta->{merge}->{$url}}, $item_url);

            $context->log(debug => "Add $token->[1]->{href} ($text)");
        }
    }
    elsif (my $xpath = $args->{xpath}) {
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;

        my %seen;
        for my $child ( $tree->findnodes($xpath) ) {
            my $href  = $child->attr('href') or next;
            my $title = $child->attr('title') || $child->as_text;
            next if $seen{$href}++;

            my $entry = Plagger::Entry->new;
            $entry->title($title);
            $entry->link(URI->new_abs($href, $url));
            $feed->add_entry($entry);
            push(@{$feed->meta->{merge}->{$url}}, URI->new_abs($href, $url));

            $context->log(debug => "Add $href ($title)");
        }

        # Release the parse tree explicitly so it does not linger in memory.
        $tree->delete;
    }

    return 1;
}

# 'aggregator.finalize' hook. Does nothing unless the 'merge_pages' config
# value is true. Otherwise, for every entry of every subscribed feed, the
# first site plugin whose merge_entry_handle() accepts it supplies the link
# pattern / XPath that merge_get_entries() then follows.
sub finalize {
    my($self, $context) = @_;
    return unless $self->conf->{merge_pages};

    for my $feed ($context->subscription->feeds) {
        for my $entry ($feed->entries) {
            my $args    = { feed => $feed, entry => $entry };
            my $handler = first { $_->merge_entry_handle($args) } @{ $self->{plugins} };
            if ($handler) {
                $context->log(debug => " handled by " . $handler->site_name);
                $args->{match} = $handler->merge_entry_follow_link;
                $args->{xpath} = $handler->merge_entry_follow_xpath;
                $self->merge_get_entries($context, $args);
            }
        }
    }
}

# 'update.fixup' hook. Does nothing unless the 'merge_pages' config value is
# true. For every URL group recorded in $feed->meta->{merge}, the bodies of
# the group's entries are appended (in sorted URL order) to the entry with the
# first URL, and the merged entries are removed from the feed. A URL of the
# form "<dir>/index.html" (also .htm/.shtml/.shtm) is treated as a duplicate
# of "<dir>/" and its entry is removed.
#
# Calls $context->error() when an expected entry cannot be found in the feed.
sub fixup {
    my($self, $context) = @_;
    return unless $self->conf->{merge_pages};

    for my $feed ($context->update->feeds) {
        for my $key (keys %{$feed->meta->{merge}}) {
            my %seen;
            push(@{$feed->meta->{merge}->{$key}}, $key);
            my @mergelist = sort(@{$feed->meta->{merge}->{$key}});
            my @urls;
            my @delurls;

            for my $url (@mergelist) {
                # URLs are literal text here: \Q...\E keeps '.', '?', '+'
                # and friends from being read as regex metacharacters.
                if (($key =~ m!/$! and $url =~ m!^\Q${key}\Eindex\.s?html?$!) or
                    ($url =~ m!/$! and $key =~ m!^\Q${url}\Eindex\.s?html?$!)) {
                    push(@delurls, $url);
                    $url = $key;
                }
                push(@urls, $url) unless $seen{$url}++;
            }

            my $top   = shift(@urls);
            my $entry = first { $_->permalink eq $top } $feed->entries;
            $context->error("not found $key") unless $entry;

            for my $val (@urls) {
                my $add = first { $_->permalink eq $val } $feed->entries;
                $context->error("not found $val") unless $add;
                $entry->body($entry->body . $add->body);
                $feed->delete_entry($add);
            }

            for my $val (@delurls) {
                my $del = first { $_->permalink eq $val } $feed->entries;
                $context->error("not found $val") unless $del;
                $feed->delete_entry($del);
            }
        }
    }
}

# 'customfeed.handle' hook. When a site plugin's custom_feed_handle() accepts
# $args, its link pattern / XPath are stored in $args and the aggregation is
# delegated to Plagger::Plugin::CustomFeed::Simple::aggregate.
sub handle {
    my($self, $context, $args) = @_;

    my $handler = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };
    if ($handler) {
        $args->{match} = $handler->custom_feed_follow_link;
        $args->{xpath} = $handler->custom_feed_follow_xpath;
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}

# 'update.entry.fixup' hook: fetches the entry's permalink and lets the site
# plugins extract the full text from it.
# NOTE(review): this definition continues past the visible part of the file;
# the code below is unchanged apart from layout.
sub filter {
    my($self, $context, $args) = @_;

    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    if ( !$handler && $args->{entry}->body && $args->{entry}->body->is_html && !$self->conf->{force_upgrade} ) {
        $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
        return;
    }

    if (! $args->{entry}->permalink) {
        $self->log(debug => "Entry " . $args->{entry}->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        if ( $base ne $args->{entry}->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $args->{entry}->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    # TODO: make this a separate plugin
    if ($res->last_modified && !$args->{entry}->date) {
        $args->{entry}->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };
    my $upgraded;
    for my $plugin (@plugins) {
        if ( $handler || $plugin->handle($args) ) {
            $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);
            my $data = $plugin->extract($args);
               $data = { body => $data } if $data && !ref $data;
            if ($data) {
                $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
                my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );

                # if body was already there, set that to summary
                if ($args->{entry}->body) {
                    $args->{entry}->summary($args->{entry}->body);
                }

                $data->{body} = $resolver->resolve( $data->{body} );
                $args->{entry}->body($data->{body});
                $args->{entry}->title($data->{title}) if $data->{title};
                $args->{entry}->author($data->{author}) if $data->{author};
                $args->{entry}->icon({ url => $data->{icon} }) if $data->{icon};
                $args->{entry}->summary($data->{summary}) if $data->{summary};

                # extract date using found one
                if ($data->{date}) {
                    $args->{entry}->date($data->{date});
                }

                $upgraded++;
                last;
            }
        }
    }

    # extract TITLE tag if title is not set yet
    # TODO: make this a separate plugin
    if (!$args->{entry}->title and $args->{content} =~ m!