--- lib/Plagger/Plugin/Filter/EntryFullText.pm	2006-12-31 10:28:10.000000000 +0900
+++ lib/Plagger/Plugin/Filter/EFTXango.pm	2006-12-31 12:52:27.000000000 +0900
@@ -1,4 +1,4 @@
-package Plagger::Plugin::Filter::EntryFullText;
+package Plagger::Plugin::Filter::EFTXango;
 use strict;
 use base qw( Plagger::Plugin );
 
@@ -12,14 +12,42 @@
 use Plagger::Plugin::CustomFeed::Simple;
 use Plagger::UserAgent;
 
+use POE;
+use URI;
+use Xango::Broker::Push;
+use Xango::Job;
+
 sub rule_hook { 'update.entry.fixup' }
 
+# Spawn the Xango broker and the crawler session that handles its
+# responses, then hook the plugin into the Plagger update cycle.
 sub register {
     my($self, $context) = @_;
+
+    # Defaults come first so that the xango_args config can override them.
+    my %xango_args = (
+        Alias        => 'xgbroker',
+        HandlerAlias => 'xghandler',
+        HttpCompArgs => [ Agent => "Plagger/$Plagger::VERSION (http://plagger.org/)", Timeout => $self->conf->{timeout} || 10 ],
+        %{$self->conf->{xango_args} || {}},
+    );
+    $self->{xango_alias} = $xango_args{Alias};
+    Plagger::Plugin::Filter::EFTXango::Crawler->spawn(
+        Plugin       => $self,
+        UseCache     => exists $self->conf->{use_cache} ?
+            $self->conf->{use_cache} : 1,
+        BrokerAlias  => $xango_args{Alias},
+        # xango_args may rename the handler; the crawler must follow.
+        HandlerAlias => $xango_args{HandlerAlias},
+        MaxRedirect  => $self->conf->{max_redirect} || 3,
+    );
+    Xango::Broker::Push->spawn(%xango_args);
+
     $context->register_hook(
         $self,
         'customfeed.handle'  => \&handle,
         'update.entry.fixup' => \&filter,
+        'update.fixup'       => \&update,
     );
 }
 
@@ -27,16 +55,17 @@
     my $self = shift;
     $self->SUPER::init(@_);
     $self->load_plugins();
-
-    $self->{ua} = Plagger::UserAgent->new;
 }
 
 sub load_plugins {
     my $self = shift;
     my $context = Plagger->context;
 
-    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
-    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
+    # NOTE(review): the extra argument presumably points load_assets at the
+    # assets shipped for Filter::EntryFullText -- confirm against load_assets.
+    my $fake = "Filter-EntryFullText";
+    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) }, $fake);
+    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) }, $fake);
 }
 
 sub load_plugin_perl {
@@ -46,7 +75,7 @@
 
     open my $fh, '<', $file or Plagger->context->error("$file: $!");
     (my $pkg = $base) =~ s/\.pl$//;
-    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";
+    my $plugin_class = "Plagger::Plugin::Filter::EFTXango::Site::$pkg";
 
     if ($plugin_class->can('new')) {
         Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
@@ -58,7 +87,7 @@
         $code = join "\n",
             ( "package $plugin_class;",
               "use strict;",
-              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
+              "use base qw( Plagger::Plugin::Filter::EFTXango::Site );",
               "sub site_name { '$pkg' }",
               $code,
               "1;" );
@@ -77,7 +106,7 @@
     my @data = YAML::LoadFile($file);
 
     push @{ $self->{plugins} },
-        map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) } @data;
+        map { Plagger::Plugin::Filter::EFTXango::YAML->new($_, $base) } @data;
 }
 
 sub handle {
@@ -105,28 +134,50 @@
         return;
     }
 
-    # NoNetwork: don't connect for 3 hours
-    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
-    if (!$res->status && $res->is_error) {
-        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
-        return;
-    }
+    # Remember which entry asked for this URL; the crawler hands the
+    # response back through entry_fixup() once POE has fetched it.
+    my $url = $args->{entry}->permalink;
+    $self->{_url2entry}->{$url} = $args;
+
+    $context->log(info => "Fetch $url");
+    POE::Kernel->post($self->{xango_alias}, 'enqueue_job', Xango::Job->new(uri => URI->new($url), redirect => 0, origin => $url));
+}
 
-    $args->{content} = decode_content($res);
+# Apply the fulltext handlers to a document fetched by the crawler.
+#   $url       - key filter() stored the entry under (its permalink)
+#   $content   - reference to the response body
+#   $lastmod   - Last-Modified as epoch seconds, if known
+#   $final_url - optional; where the document finally came from when the
+#                request was redirected
+sub entry_fixup {
+    my($self, $url, $content, $lastmod, $final_url) = @_;
 
-    # if the request was redirected, set it as permalink
-    if ($res->http_response) {
-        my $base = $res->http_response->request->uri;
-        if ( $base ne $args->{entry}->permalink ) {
-            $context->log(info => "rewrite permalink to $base");
-            $args->{entry}->permalink($base);
-        }
+    my $args = $self->{_url2entry}->{$url};
+    unless ($args) {
+        # No entry to upgrade; bail out instead of dying on undef below.
+        $self->log(warn => "No entry is waiting for $url");
+        return;
+    }
+
+    $self->log(info => "fixup for " . $args->{entry}->link);
+    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
+    if ( !$handler && $args->{entry}->body && $args->{entry}->body->is_html && !$self->conf->{force_upgrade} ) {
+        $self->log(info => $args->{entry}->link . " should efted");
+        return;
     }
 
+    $args->{content} = decode_content($$content);
+
+    # if the request was redirected, set it as permalink
+    if (defined $final_url && $final_url ne $args->{entry}->permalink) {
+        $self->log(info => "rewrite permalink to $final_url");
+        $args->{entry}->permalink($final_url);
+    }
+
     # use Last-Modified to populate entry date, even if handler doesn't find one
     # TODO: make this a separate plugin
-    if ($res->last_modified && !$args->{entry}->date) {
-        $args->{entry}->date( Plagger::Date->from_epoch($res->last_modified) );
+    if ($lastmod && !$args->{entry}->date) {
+        $args->{entry}->date( Plagger::Date->from_epoch($lastmod) );
     }
 
     my @plugins = $handler ? ($handler) : @{ $self->{plugins} };
@@ -134,11 +185,11 @@
     my $upgraded;
     for my $plugin (@plugins) {
         if ( $handler || $plugin->handle($args) ) {
-            $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);
+            $self->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);
             my $data = $plugin->extract($args);
             $data = { body => $data } if $data && !ref $data;
             if ($data) {
-                $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
+                $self->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
                 my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );
 
                 # if body was already there, set that to summary
@@ -179,11 +230,18 @@
         return 1;
     }
 
-    $context->log(warn => "Extract content failed on " . $args->{entry}->permalink);
+    $self->log(warn => "Extract content failed on " . $args->{entry}->permalink);
+}
+
+# update.fixup hook: run the POE kernel so that the jobs queued by
+# filter() are fetched and handed to entry_fixup().
+sub update {
+    my($self, $context, $args) = @_;
+    POE::Kernel->run;
 }
 
 
-package Plagger::Plugin::Filter::EntryFullText::Site;
+package Plagger::Plugin::Filter::EFTXango::Site;
 sub new { bless {}, shift }
 sub custom_feed_handle { 0 }
 sub custom_feed_follow_link { }
@@ -191,7 +249,7 @@
 sub handle_force { 0 }
 sub handle { 0 }
 
-package Plagger::Plugin::Filter::EntryFullText::YAML;
+package Plagger::Plugin::Filter::EFTXango::YAML;
 use Encode;
 use List::Util qw(first);
 
@@ -318,53 +376,177 @@
     }
 }
 
+package Plagger::Plugin::Filter::EFTXango::Crawler;
+use strict;
+use POE;
+use Storable qw(freeze thaw);
+
+# POE session acting as the Xango handler: it adds cache validators to
+# outgoing requests and feeds the responses back to the plugin.
+
+# Always true; presumably tells the broker that every job may be fetched.
+sub apply_policy { 1 }
+
+# Create the handler session.  Named arguments: Plugin, UseCache,
+# BrokerAlias, MaxRedirect and, optionally, HandlerAlias.
+sub spawn {
+    my $class = shift;
+    my %args  = @_;
+
+    POE::Session->create(
+        heap => {
+            PLUGIN        => $args{Plugin},
+            USE_CACHE     => $args{UseCache},
+            BROKER_ALIAS  => $args{BrokerAlias},
+            HANDLER_ALIAS => $args{HandlerAlias} || 'xghandler',
+            MaxRedirect   => $args{MaxRedirect},
+        },
+        package_states => [
+            $class => [ qw(_start _stop apply_policy prep_request handle_response) ]
+        ]
+    );
+}
+
+sub _start { $_[KERNEL]->alias_set($_[HEAP]->{HANDLER_ALIAS}) }
+sub _stop  { }
+
+# Add If-Modified-Since / If-None-Match from the plugin cache to the
+# outgoing request.
+sub prep_request {
+    return unless $_[HEAP]->{USE_CACHE};
+
+    my $job    = $_[ARG0];
+    my $req    = $_[ARG1];
+    my $plugin = $_[HEAP]->{PLUGIN};
+
+    my $ref = $plugin->cache->get($job->uri);
+    if ($ref) {
+        # LastModified is cached as the raw header string, so send it back
+        # verbatim; if_modified_since() would expect epoch seconds.
+        $req->header('If-Modified-Since', $ref->{LastModified})
+            if $ref->{LastModified};
+        $req->header('If-None-Match', $ref->{ETag})
+            if $ref->{ETag};
+    }
+}
+
+# Follow redirects (at most MaxRedirect of them) and hand successful
+# responses to the plugin.
+sub handle_response {
+    my $job    = $_[ARG0];
+    my $heap   = $_[HEAP];
+    my $plugin = $heap->{PLUGIN};
+
+    my $r        = $job->notes('http_response');
+    my $followed = $job->notes('redirect') || 0;
+
+    # Key of the waiting entry in the plugin; it travels with the job so
+    # that it survives redirects.
+    my $origin = $job->notes('origin');
+    $origin = $job->uri unless defined $origin;
+
+    if ($r->code =~ /^30[12]$/) {
+        # The limit only applies when another hop would be needed; a final
+        # answer reached after exactly MaxRedirect hops is still used.
+        return if $followed >= $heap->{MaxRedirect};
+
+        my $location = $r->header('location');
+        return unless defined $location;
+
+        # Location may be relative to the URL that was just fetched.
+        my $url = URI->new_abs($location, $job->uri);
+        return unless $url =~ m!^https?://!i;
+        $_[KERNEL]->post($heap->{BROKER_ALIAS}, 'enqueue_job', Xango::Job->new(uri => $url, redirect => $followed + 1, origin => "$origin"));
+        return;
+    }
+
+    return unless $r->is_success;
+
+    # entry_fixup() feeds this to from_epoch(), so pass epoch seconds
+    # (last_modified) rather than the raw header string.
+    $plugin->entry_fixup(
+        "$origin", $r->content_ref, $r->last_modified,
+        ($followed ? $job->uri : undef),
+    );
+
+    if ($heap->{USE_CACHE}) {
+        $plugin->cache->set(
+            $job->uri,
+            {ETag         => $r->header('ETag'),
+             LastModified => $r->header('Last-Modified')}
+        );
+    }
+}
+
 1;
 
 __END__
 
 =head1 NAME
 
-Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
+Plagger::Plugin::Filter::EFTXango - Upgrade your feeds to fulltext class using Xango
 
 =head1 SYNOPSIS
 
-  - module: Filter::EntryFullText
+  - module: Filter::EFTXango
 
 =head1 DESCRIPTION
 
 This plugin allows you to fetch entry full text by doing HTTP GET and
 apply regexp to HTML. It's just like upgrading your flight ticket from
 economy class to business class!
 
+Pages are not fetched while each entry is filtered: the requests are
+queued on a Xango broker and processed when the POE kernel is run in
+the C<update.fixup> phase.
+
 You can write custom fulltext handler by putting C<.pl> or C<.yaml>
 files under assets plugin directory.
 
 =head1 CONFIG
 
 =over 4
 
 =item store_html_on_failure
 
 Even if fulltext handlers fail to extract content body from HTML, this
 option enables to store the whole document HTML as entry body. It will
 be useful to use with search engines like Gmail and Search:: plugins.
 Defaults to 0.
 
 =item force_upgrade
 
 Even if entry body already contains HTML, this config forces the
 plugin to upgrade the body. Defaults to 0.
 
+=item timeout
+
+Timeout handed to the HTTP component. Defaults to 10.
+
+=item max_redirect
+
+How many HTTP redirects (301 and 302) are followed per entry. Defaults to 3.
+
+=item use_cache
+
+Send C<If-Modified-Since> and C<If-None-Match> based on previously seen
+responses. Defaults to 1.
+
+=item xango_args
+
+Hash of arguments passed to C<< Xango::Broker::Push->spawn >>, overriding
+the defaults set by this plugin.
+
 =back
 
 =head1 WRITING CUSTOM FULLTEXT HANDLER
 
 (To be documented)
 
 =head1 AUTHOR
 
 Tatsuhiko Miyagawa
 
 =head1 SEE ALSO
 
-L<Plagger>
+L<Plagger>, L<Xango::Broker::Push>, L<POE>