package Plagger::Plugin::Filter::EntryFullText;
use strict;
use base qw( Plagger::Plugin );

use DirHandle;
use Encode;
use File::Spec;
use List::Util qw(first);
use HTML::ResolveLink;
use Plagger::Date; # for metadata in plugins
use Plagger::Util qw( decode_content );
use Plagger::Plugin::CustomFeed::Simple;
use Plagger::UserAgent;

# Returns the hook name 'update.entry.fixup'.
# NOTE(review): presumably consulted by the Plagger::Plugin base class when
# dispatching rules -- confirm against the base class.
sub rule_hook { 'update.entry.fixup' }

# Registers this plugin's four hook callbacks with the given context.
sub register {
    my($self, $context) = @_;
    $context->register_hook(
        $self,
        'aggregator.finalize' => \&finalize,
        'customfeed.handle'   => \&handle,
        'update.entry.fixup'  => \&filter,
        'update.fixup'        => \&fixup,
    );
}

# Runs the parent initialisation, loads the site handlers from the assets
# directory and creates the user agent that filter() fetches pages with.
sub init {
    my $self = shift;
    $self->SUPER::init(@_);
    $self->load_plugins();

    $self->{ua} = Plagger::UserAgent->new;
}

# Loads every '*.yaml' and '*.pl' asset; each file found is passed to
# load_plugin_yaml() / load_plugin_perl() respectively.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}

# Compiles a Perl site handler and appends an instance of it to
# $self->{plugins}.
#
#   $file - path of the .pl file to read
#   $base - file name; with the '.pl' suffix removed it becomes the last
#           component of the handler's package name
#
# If the handler class already responds to new(), the file is not compiled
# again and a fresh instance is returned (it is NOT pushed onto
# $self->{plugins} in that case -- NOTE(review): confirm this is intended).
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        return $plugin_class->new;
    }

    my $code = join '', <$fh>;
    close $fh;

    # Files that do not open with their own 'package' statement are wrapped
    # in a generated package deriving from the ::Site base class.
    unless ($code =~ /^\s*package/s) {
        $code = join "\n", (
            "package $plugin_class;",
            "use strict;",
            "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
            "sub site_name { '$pkg' }",
            $code,
            "1;",
        );
    }

    # NOTE(review): string eval of file contents -- asset files must be
    # trusted code.
    eval $code;
    Plagger->context->error($@) if $@;

    push @{ $self->{plugins} }, $plugin_class->new;
}

# Loads a YAML handler file and appends one ::YAML handler object per YAML
# document in the file to $self->{plugins}.
sub load_plugin_yaml {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");
    my @data = YAML::LoadFile($file);

    push @{ $self->{plugins} },
        map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) } @data;
}

# Fetches the page of $args->{entry} and adds one new entry to
# $args->{feed} for every link selected by either
#
#   $args->{match} - a regexp applied to the href of each <a> tag, or
#   $args->{xpath} - an XPath expression selecting nodes with an href.
#
# Each added URL is also recorded in $feed->meta->{merge}{$url}, which
# fixup() reads later. Returns 1 on success and nothing if the fetch fails
# or HTML::TreeBuilder::XPath cannot be loaded.
sub merge_get_entries {
    my($self, $context, $args) = @_;

    my $url = $args->{entry}->permalink || $args->{entry}->link;
    $context->log(info => "merge_get_entries GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res   = $agent->fetch($url, $self);

    if ($res->http_response->is_error) {
        $context->log(error => "merge_get_entries GET $url failed: " . $res->status);
        return;
    }

    my $content = decode_content($res);
    my $feed    = $args->{feed};

    if ( my $re = $args->{match} ) {
        # Make links absolute before scanning them.
        my $resolver = HTML::ResolveLink->new(base => $url);
        $content = $resolver->resolve($content);

        my %seen;
        my $parser = HTML::TokeParser->new(\$content);
        while (my $token = $parser->get_tag('a')) {
            next unless ($token->[1]->{href} || '') =~ /$re/;

            my $text = $parser->get_trimmed_text('/a');
            next if !$text || $text eq '[IMG]';

            my $item_url = URI->new_abs($token->[1]->{href}, $url);
            next if $seen{$item_url->as_string}++;

            my $entry = Plagger::Entry->new;
            $entry->title($text);
            $entry->link($item_url);
            $feed->add_entry($entry);
            push @{ $feed->meta->{merge}->{$url} }, $item_url;

            $context->log(debug => "Add $token->[1]->{href} ($text)");
        }
    }
    elsif (my $xpath = $args->{xpath}) {
        # Same guarded load as in the YAML handler's extract().
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            $context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;

        my %seen;
        for my $child ( $tree->findnodes($xpath) ) {
            my $href  = $child->attr('href') or next;
            my $title = $child->attr('title') || $child->as_text;
            next if $seen{$href}++;

            my $entry = Plagger::Entry->new;
            $entry->title($title);
            $entry->link(URI->new_abs($href, $url));
            $feed->add_entry($entry);
            push @{ $feed->meta->{merge}->{$url} }, URI->new_abs($href, $url);

            $context->log(debug => "Add $href ($title)");
        }

        # Everything needed was copied out as strings; release the tree.
        $tree->delete;
    }

    return 1;
}

# 'aggregator.finalize' hook. Does nothing unless the 'merge_pages' config
# is set. For every entry of every subscribed feed, finds the first handler
# whose merge_entry_handle() accepts it and follows that handler's link /
# xpath rule via merge_get_entries().
sub finalize {
    my($self, $context) = @_;

    return unless $self->conf->{merge_pages};

    for my $feed ($context->subscription->feeds) {
        for my $entry ($feed->entries) {
            my $args    = { feed => $feed, entry => $entry };
            my $handler = first { $_->merge_entry_handle($args) } @{ $self->{plugins} };
            next unless $handler;

            $context->log(debug => " handled by " . $handler->site_name);
            $args->{match} = $handler->merge_entry_follow_link;
            $args->{xpath} = $handler->merge_entry_follow_xpath;
            $self->merge_get_entries($context, $args);
        }
    }
}

# 'update.fixup' hook. Does nothing unless the 'merge_pages' config is set.
# For each group recorded in $feed->meta->{merge}, concatenates the bodies
# of the group's entries onto the entry with the lexically smallest URL and
# deletes the others from the feed. A ".../" URL and its
# ".../index.html"-style twin (index.htm, index.shtml, ...) are treated as
# the same page.
sub fixup {
    my($self, $context) = @_;

    return unless $self->conf->{merge_pages};

    for my $feed ($context->update->feeds) {
        for my $key (keys %{ $feed->meta->{merge} }) {
            my %seen;
            push @{ $feed->meta->{merge}->{$key} }, $key;
            my @mergelist = sort @{ $feed->meta->{merge}->{$key} };

            my @urls;
            my @delurls;
            for my $url (@mergelist) {
                # \Q...\E: URLs routinely contain regexp metacharacters
                # ('.', '?', '+'), which must be matched literally.
                if (   ($key =~ m!/$! and $url =~ m!^\Q$key\Eindex\.s?html?$!)
                    or ($url =~ m!/$! and $key =~ m!^\Q$url\Eindex\.s?html?$!) ) {
                    push @delurls, $url;
                    $url = $key;
                }
                push @urls, $url unless $seen{$url}++;
            }

            my $top   = shift @urls;
            my $entry = first { $_->permalink eq $top } $feed->entries;
            $context->error("not found $key") unless $entry;

            for my $val (@urls) {
                my $add = first { $_->permalink eq $val } $feed->entries;
                $context->error("not found $val") unless $add;
                $entry->body($entry->body . $add->body);
                $feed->delete_entry($add);
            }

            for my $val (@delurls) {
                my $del = first { $_->permalink eq $val } $feed->entries;
                $context->error("not found $val") unless $del;
                $feed->delete_entry($del);
            }
        }
    }
}

# 'customfeed.handle' hook. If a handler's custom_feed_handle() accepts
# $args, copies its follow-link / follow-xpath rules into $args and
# delegates to Plagger::Plugin::CustomFeed::Simple::aggregate, returning
# its result. Otherwise falls through with a false value.
sub handle {
    my($self, $context, $args) = @_;

    my $handler = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };
    if ($handler) {
        $args->{match} = $handler->custom_feed_follow_link;
        $args->{xpath} = $handler->custom_feed_follow_xpath;
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}

# 'update.entry.fixup' hook: fetches the entry's permalink and lets the
# site handlers extract the full text from the page.
#
# Entries that already have an HTML body are skipped unless a handler's
# handle_force() matches or the 'force_upgrade' config is set. Entries
# without a permalink are skipped too. Returns 1 when the body was upgraded
# (or when the raw HTML was stored because 'store_html_on_failure' is set).
sub filter {
    my($self, $context, $args) = @_;

    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    if (   !$handler
        && $args->{entry}->body
        && $args->{entry}->body->is_html
        && !$self->conf->{force_upgrade} ) {
        $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
        return;
    }

    if (! $args->{entry}->permalink) {
        $self->log(debug => "Entry " . $args->{entry}->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        if ( $base ne $args->{entry}->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $args->{entry}->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    # TODO: make this a separate plugin
    if ($res->last_modified && !$args->{entry}->date) {
        $args->{entry}->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # A forcing handler is used unconditionally; otherwise every handler is
    # asked in turn whether it handles this entry.
    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };

    my $upgraded;
    for my $plugin (@plugins) {
        next unless $handler || $plugin->handle($args);

        $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);

        # extract() may return a plain body string or a hash ref of fields.
        my $data = $plugin->extract($args);
        $data = { body => $data } if $data && !ref $data;
        next unless $data;

        $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
        my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );

        # if body was already there, set that to summary
        if ($args->{entry}->body) {
            $args->{entry}->summary($args->{entry}->body);
        }

        $data->{body} = $resolver->resolve( $data->{body} );
        $args->{entry}->body($data->{body});
        $args->{entry}->title($data->{title})          if $data->{title};
        $args->{entry}->author($data->{author})        if $data->{author};
        $args->{entry}->icon({ url => $data->{icon} }) if $data->{icon};
        $args->{entry}->summary($data->{summary})      if $data->{summary};

        # extract date using found one
        if ($data->{date}) {
            $args->{entry}->date($data->{date});
        }

        $upgraded++;
        last;
    }

    # extract TITLE tag if title is not set yet
    # TODO: make this a separate plugin
    if (!$args->{entry}->title and $args->{content} =~ m!<title>\s*(.*?)\s*</title>!is ) {
        $args->{entry}->title( HTML::Entities::decode($1) );
    }

    return 1 if $upgraded;

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $args->{entry}->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $args->{entry}->permalink);
}

# Base class for site handlers written in Perl (see load_plugin_perl in the
# main package). Every predicate defaults to "does not handle" and every
# follow rule defaults to nothing; subclasses override what they need.
package Plagger::Plugin::Filter::EntryFullText::Site;

sub new { bless {}, shift }

sub merge_entry_handle       { 0 }
sub merge_entry_follow_link  { }
sub merge_entry_follow_xpath { }
sub custom_feed_handle       { 0 }
sub custom_feed_follow_link  { }
sub custom_feed_follow_xpath { }
sub handle_force             { 0 }
sub handle                   { 0 }

# Site handler driven by one YAML document (see load_plugin_yaml in the
# main package).
package Plagger::Plugin::Filter::EntryFullText::YAML;
use Encode;
use List::Util qw(first);

# Builds a handler from the YAML hash $data; $base is kept as the site
# name. Note that $data itself is normalised in place before being copied
# into the object.
sub new {
    my($class, $data, $base) = @_;

    # add ^ if handle method starts with http://
    for my $key ( qw(custom_feed_handle handle handle_force merge_entry_handle) ) {
        next unless defined $data->{$key};
        $data->{$key} = "^$data->{$key}" if $data->{$key} =~ m!^https?://!;
    }

    # decode as UTF-8
    for my $key ( qw(extract extract_date_format) ) {
        next unless defined $data->{$key};
        if (ref $data->{$key} && ref $data->{$key} eq 'ARRAY') {
            $data->{$key} = [ map decode("UTF-8", $_), @{ $data->{$key} } ];
        }
        else {
            $data->{$key} = decode("UTF-8", $data->{$key});
        }
    }

    bless { %$data, base => $base }, $class;
}

# Name used in log messages: the $base value given to new().
sub site_name {
    my $self = shift;
    $self->{base};
}

# True if the entry's permalink (or link, when there is no permalink)
# matches the 'merge_entry_handle' pattern; 0 if no pattern is configured.
sub merge_entry_handle {
    my($self, $args) = @_;
    my $link = $args->{entry}->permalink || $args->{entry}->link;
    $self->{merge_entry_handle}
        ? $link =~ /$self->{merge_entry_handle}/ : 0;
}

sub merge_entry_follow_link {
    $_[0]->{merge_entry_follow_link};
}

sub merge_entry_follow_xpath {
    $_[0]->{merge_entry_follow_xpath};
}

# True if the feed URL matches the 'custom_feed_handle' pattern; 0 if no
# pattern is configured.
sub custom_feed_handle {
    my($self, $args) = @_;
    $self->{custom_feed_handle}
        ? $args->{feed}->url =~ /$self->{custom_feed_handle}/ : 0;
}

sub custom_feed_follow_link {
    $_[0]->{custom_feed_follow_link};
}

sub custom_feed_follow_xpath {
    $_[0]->{custom_feed_follow_xpath};
}

# True if the entry's permalink matches the 'handle_force' pattern; 0 if no
# pattern is configured.
sub handle_force {
    my($self, $args) = @_;
    $self->{handle_force}
        ? $args->{entry}->permalink =~ /$self->{handle_force}/ : 0;
}

# True if the entry's permalink matches the 'handle' pattern; 0 if no
# pattern is configured.
sub handle {
    my($self, $args) = @_;
    $self->{handle}
        ? $args->{entry}->permalink =~ /$self->{handle}/ : 0;
}

# Replaces each argument, in place (through the @_ aliases), with the
# result of Plagger::Util::encode_xml. Installed temporarily as
# HTML::Element::_xml_escape by extract().
sub xml_escape {
    for my $x (@_) {
        $x = Plagger::Util::encode_xml($x);
    }
}

# Extracts fields from $args->{content} and returns them as a hash ref, or
# nothing when no rule matched.
#
#   extract + extract_capture - regexp applied to the content; captures are
#                               assigned to the whitespace-separated field
#                               names in extract_capture (default: 'body')
#   extract_xpath             - hash of field name => XPath expression; the
#                               first matching node supplies the field
#   extract_after_hook        - Perl code evaluated after extraction
#   extract_date_format       - strptime format, or list of formats, for
#                               the 'date' field; without it the date is
#                               parsed with Plagger::Date->parse_dwim
#   extract_date_timezone     - time zone set on the parsed date
#
# The lexicals $self, $args and $data are visible to the
# extract_after_hook code, so they must keep these names.
sub extract {
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "YAML doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my $names   = defined $self->{extract_capture} ? $self->{extract_capture} : '';
            my @capture = split /\s+/, $names;
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{ $self->{extract_xpath} }) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Swap in our own escaping routine while serialising.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                $data->{$capture} = $children[0]->isElementNode
                    ? $children[0]->as_XML
                    : $children[0]->getValue;
            }
            else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # All captured values are plain strings by now; release the tree.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # NOTE(review): string eval of handler-supplied code -- handler
            # files must be trusted.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;

                # Take the first format that actually parses, whether a
                # failed strptime yields undef or an empty list.
                $data->{date} = (
                    grep { defined $_ }
                    map  { Plagger::Date->strptime($_, $data->{date}) } @$format
                )[0];

                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            }
            else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }
}

1;

__END__

=head1 NAME

Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class

=head1 SYNOPSIS

  - module: Filter::EntryFullText

=head1 DESCRIPTION

This plugin allows you to fetch entry full text by doing HTTP GET and
applying a regexp to the HTML. It's just like upgrading your flight ticket
from economy class to business class!

You can write a custom fulltext handler by putting C<.pl> or C<.yaml>
files under the assets plugin directory.

=head1 CONFIG

=over 4

=item store_html_on_failure

Even if fulltext handlers fail to extract the content body from HTML, this
option makes the plugin store the whole document HTML as the entry body.
It will be useful to use with search engines like Gmail and Search::
plugins. Defaults to 0.

=item force_upgrade

Even if the entry body already contains HTML, this config forces the
plugin to upgrade the body. Defaults to 0.

=item merge_pages

When set to a true value, enables the C<aggregator.finalize> and
C<update.fixup> hooks of this plugin, which follow links from entries
accepted by a handler's C<merge_entry_handle> rule and merge the bodies of
the linked pages into a single entry. When unset, those hooks do nothing.

=back

=head1 WRITING CUSTOM FULLTEXT HANDLER

(To be documented)

=head1 AUTHOR

Tatsuhiko Miyagawa

=head1 SEE ALSO

L<Plagger>

=cut