I used this to extract image URLs from the sports photo site All Sports Community (allsports.jp).
Web::Scraper and WWW::Mechanize - "improve it!"
This is the script I mentioned in that entry. Feed it a YAML file containing the URL and password, and it outputs the URLs of that event's photos (JPEGs) as YAML*1.
It has reached a reasonable stopping point and seems to work properly, so I'm posting the source here. I'd be glad if someone reviewed it.
Incidentally, with YAML::Syck the UTF-8 flag didn't seem to be handled correctly when Dumping, and the output came out garbled. YAML::Dump has no such problem.
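For reference, here is a minimal sketch of what I mean. It assumes YAML::Syck's $YAML::Syck::ImplicitUnicode switch; the sample string and the script itself are only an illustration, not part of the module.

use strict;
use warnings;
use Encode qw(decode_utf8);
use YAML ();
use YAML::Syck ();

binmode STDOUT, ':utf8';

# A UTF-8-flagged string, like the titles Web::Scraper returns.
my $data = { title => decode_utf8("サッカー") };

# YAML::Dump had no problem with the flagged string (as noted above).
print YAML::Dump($data);

# YAML::Syck garbled it for me by default; ImplicitUnicode is supposed to
# make it honor the UTF-8 flag (untested here).
$YAML::Syck::ImplicitUnicode = 1;
print YAML::Syck::Dump($data);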
Usage
use WWW::AllSportsJp;
WWW::AllSportsJp->new(shift)->do();
Put something like that in perl.pl, then run:
$ perl perl.pl password.yml
and it should work.
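The last thing do() prints is a YAML::Dump of everything it collected, so once that dump is saved to a file (the progress messages go to the same standard output, so copy the YAML part out or strip those lines first), a small downloader can work from it. The following is only a sketch of that idea; download.pl and result.yml are hypothetical names, and it relies only on the image_urls key the module stores in $self->{save}.

# $ perl perl.pl password.yml > result.yml   (then remove the progress lines)
# $ perl download.pl result.yml
use strict;
use warnings;
use YAML ();
use LWP::UserAgent;
use URI;
use File::Basename qw(basename);

my $save = YAML::LoadFile(shift);                  # the saved dump
my $ua   = LWP::UserAgent->new;

foreach my $url ( @{ $save->{image_urls} || [] } ) {
    my $file = basename( URI->new($url)->path );   # e.g. photo_xxxx.jpg
    print "GET $url -> $file\n";
    my $res = $ua->mirror( $url, $file );          # save into the current directory
    warn "failed: " . $res->status_line . "\n" if $res->is_error;
}

If the image files themselves sit behind the event password, the downloader would also need the session cookie from the unlock step, which this sketch does not handle.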
Configuration file (YAML)
For example, for http://www.allsports.jp/event/00000628.html, write password.yml like this:
name: Soccer
url: http://www.allsports.jp/event/00000628.html
form:
  name: unlockform
  input: ev_password
passwords:
  - name: All
    password: "1234"
The name field is only a memo for yourself, so put whatever you like there.
If there are multiple passwords, write:
name: Soccer
url: http://www.allsports.jp/event/00000628.html
form:
  name: unlockform
  input: ev_password
passwords:
  - name: 1
    password: "1234"
  - name: 2
    password: "2345"
and it will try each one.
WWW::AllSportsJp::Old is for http://www2.allsports.jp/.
For example, for http://www2.allsports.jp/event_detail.php?ev_id=6149, write password.yml like this:
name: H17 Inter High School - Archery
url: http://www2.allsports.jp/event_detail.php?ev_id=6149
form:
  name: loginform
  input: ev_password
passwords:
  - name: All
    password: "1234"
That's it (note that the form name here is loginform rather than unlockform).
If there is no password, leaving it blank should work.
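Running the old-site module is the same as above except for the module name (it inherits new() and do() from WWW::AllSportsJp); something like this should do it. Untested sketch, same calling convention:

use WWW::AllSportsJp::Old;
WWW::AllSportsJp::Old->new(shift)->do();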
WWW::AllSportsJp.pm
package WWW::AllSportsJp;
use warnings;
use strict;
use Carp;
use WWW::Mechanize;
use WWW::Mechanize::DecodedContent;
use Web::Scraper;
use YAML;
use Data::Dumper;
$Data::Dumper::Indent = 1;
use URI;
use Encode;
use encoding 'utf8';
use version; our $VERSION = qv('0.0.4');
use Perl6::Say;

# Constructor: takes the path to the config YAML ($self here is the class name).
sub new(){
    my $self = shift;
    bless {
        yaml      => '',
        mech      => undef,
        scraper   => undef,
        file_name => shift,
        save      => {},
        debug     => 0,
    }, $self;
}

# Main driver: load config, log in, then walk categories -> list pages
# -> photo pages -> image URLs, and dump everything as YAML at the end.
sub do(){
    my $self = shift;

    # Load config YAML
    my $file = $self->{file_name};
    say "Load Config YAML: $file";
    $self->{yaml} = YAML::LoadFile($file);
    say " -> OK";

    # Start accessing
    say "URL: $self->{yaml}->{url}";
    $self->{mech} = WWW::Mechanize->new( keep_alive => 4 );
    my $result = $self->{mech}->get( $self->{yaml}->{url} );
    $self->abort( Dumper $self->{mech}->response ) unless $self->{mech}->success;
    say " -> Accessing ...";

    # Get Information
    say "Get information";
    $self->getInformation;
    say " -> OK";

    # Authorization
    say "Authorize";
    $self->unlock;
    say " -> OK";

    # Get
    say "Get listing urls";
    $self->getListPages;
    say " -> OK";
    say YAML::Dump $self->{save}->{categories} if $self->{debug};

    say "Get page urls";
    $self->getPageUrls;
    say " -> OK";

    say "Get image urls";
    $self->getImageUrls;
    say " -> OK";

    say YAML::Dump $self->{save};
}

# Scrape the event title plus the label/value cells of the detail table.
sub getInformation(){
    my $self = shift;
    my $mech = $self->{mech};
    my $scraper = scraper{
        process 'span.abbreviate',        'title'    => 'TEXT';
        process 'div.festa_detail tr th', 'types[]'  => 'TEXT';
        process 'div.festa_detail tr td', 'values[]' => 'TEXT';
    };
    $self->{save}->{information} = $scraper->scrape( $mech->decoded_content, $mech->uri );
}

# Scrape the category tables under div#photo_sort: each gets a title
# and its list-page links.
sub getListPages(){
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = scraper{
        process 'div#photo_sort table', 'categories[]' => scraper{
            process 'table tr th',   'title'         => 'TEXT';
            process 'table tr li a', 'list_pages[]'  => {
                url   => '@href',
                title => 'TEXT',
            };
        };
    };
    # Fixed 2008-03-21
    #$self->{save} = $categories->scrape( $mech->decoded_content, $mech->uri );
    $self->{save}->{categories}
        = $categories->scrape( $mech->decoded_content, $mech->uri )->{categories};
}

# Visit every list page and collect links to the individual photo pages
# (plus photo number and time).
sub getPageUrls(){
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};
    my $scraper = scraper{
        process 'div.photo_list_box div.photo_box', 'items[]' => scraper{
            # process 'table tr td a[href]', 'url' => '@href';
            process 'p.detail a[href]', 'url'    => '@href';
            process 'p.num',            'number' => 'TEXT';
            process 'p.time',           'time'   => 'TEXT';
        };
    };
    my $i = 0;
    my $count = 0;
    foreach my $category ( @$categories ){
        print " $i ->";
        my $j = 0;
        foreach my $list_page ( @{$category->{list_pages}} ){
            print " $j"; $j++;
            my $result;
            next if $list_page->{title} eq '';
            say "scrape $list_page->{url}" if $self->{debug};
            $mech->get( $list_page->{url} );
            if( $mech->success ){
                $result = $scraper->scrape( $mech->decoded_content, $mech->uri );
                push @{$categories->[$i]->{photo_pages}}, @{$result->{items}};
                $count += @{$result->{items}};
                say YAML::Dump $result->{items} if $self->{debug};
            }
            else{
                say Dumper $mech->response;
                next;
            }
            last if $self->{debug};
        }
        $i++;
        say " (total $count)";
        last if $self->{debug};
    }
    say " (total $count)";
}

# Visit each photo page and extract the JPEG URL from the ?pic= parameter.
sub getImageUrls(){
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};
    $self->{yaml}->{url} =~ m|(https?://[^/]+)|;
    my $base_url = $1;
    say $base_url;
    my $count = 0;
    my $i = 0;
    foreach my $category ( @$categories ){
        print " $i ->"; $i++;
        my $j = 0;
        foreach my $photo_page ( @{$category->{photo_pages}} ){
            print " $j";
            $mech->get( $photo_page->{url} );
            if( $mech->success ){
                if( $mech->decoded_content =~ /\?pic=(photo_[^"&]+)["&]/ ){
                    $category->{photo_pages}->[$j++]->{image_url} = $base_url . "/photo/" . $1;
                    $self->{save}->{image_urls}->[$count++]       = $base_url . "/photo/" . $1;
                }else{
                    say "fail: " . $photo_page->{url};
                }
            }
            last if $self->{debug};
        }
        say " (total $count)";
        last if $self->{debug};
    }
    say " (total $count)";
}

# Try each password from the config against the unlock form.
sub unlock(){
    my $self = shift;
    my $mech = $self->{mech};
    my $form = $self->{yaml}->{form};
    my $passwords = $self->{yaml}->{passwords};
    foreach my $password ( @$passwords ){
        say "$self->{yaml}->{name} $password->{name} $password->{password} ";
        $mech->submit_form(
            form_name => $form->{name},
            fields    => { $form->{input} => $password->{password} },
        );
        $self->abort(Dumper $mech->response) unless $mech->success;
        say " -> Successfully Authorized.";
        last if $self->{debug};
    }
}

# Print whatever we were given and bail out.
sub abort(){
    shift;
    print(@_);
    exit 8;
}

1;
__END__
WWW::AllSportsJp::Old.pm
package WWW::AllSportsJp::Old;
use warnings;
use strict;
use Carp;
use WWW::Mechanize;
use WWW::Mechanize::DecodedContent;
use Web::Scraper;
use YAML;
use Data::Dumper;
$Data::Dumper::Indent = 1;
use URI;
use Encode;
use encoding 'utf8';
use base qw(WWW::AllSportsJp);
use version; our $VERSION = qv('0.0.4');
use Perl6::Say;

# The old site keeps the title in h2 and the detail cells in td.bc_bub / td.wk_bu.
sub getInformation(){
    my $self = shift;
    my $mech = $self->{mech};
    my $scraper = scraper{
        process 'h2',                   'title'    => 'TEXT';
        process 'td.bc_bub',            'types[]'  => 'TEXT';
        process 'td.wk_bu,td.wk_bu_b',  'values[]' => 'TEXT';
    };
    $self->{save}->{information} = $scraper->scrape( $mech->decoded_content, $mech->uri );
}

# On the old site, category headings and list-page links come out of one
# table, so sort them into categories by hand.
sub getListPages(){
    my $self = shift;
    my $mech = $self->{mech};
    my $links = scraper{
        process 'table.wk_bu table tr td[valign="top"]', 'links[]' => scraper{
            process 'td[align="right"]',         'category' => 'TEXT';
            process 'h3',                        'title'    => 'TEXT';
            process 'td[align!="center"] a[href]', 'url'    => '@href';
        };
    };
    my $tmp_links = $links->scrape( $mech->decoded_content, $mech->uri );
    my $categories = [];
    my $list_pages;
    my ($i,$j) = (0,0);
    foreach my $tmp_link (@{$tmp_links->{links}}){
        if( exists $tmp_link->{category} ){
            if( $tmp_link->{category} eq " " ){
                next;
            }else{
                say undef if $i != 0;
                print " $i ->";
                $categories->[$i] = {
                    'title'       => $tmp_link->{category},
                    'list_pages'  => [],
                    'photo_pages' => [],
                };
                $list_pages = $categories->[$i++]->{list_pages};
                $j = 0;
                next;
            }
        }elsif( exists $tmp_link->{url} ){
            print " $j";
            $list_pages->[$j++] = $tmp_link;
        }
    }
    say undef;
    $self->{save}->{categories} = $categories;
    say " -> OK";
    say "More";
    $self->getMoreListPages;
}

# Each list page may have pager links; add those as additional list pages.
sub getMoreListPages(){
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};
    my $more_pages = scraper{
        process 'table[width="800"] tr td.pdd_10lr table tr td.txt_p a.bld[href]', 'more_pages[]' => scraper{
            process 'a', 'url'   => '@href';
            process 'a', 'title' => 'TEXT';
        };
    };
    my $h = 0;
    foreach my $category (@$categories){
        my $new_list_pages = [];
        my ($i,$j) = (0,0);
        say " $h ->"; $h++;
        foreach my $list_page ( @{$category->{list_pages}} ){
            print " $i ->"; $i++;
            $mech->get($list_page->{url});
            if( $mech->success ){
                my $result = $more_pages->scrape( $mech->decoded_content, $mech->uri );
                foreach my $page ( @{$result->{more_pages}} ){
                    print " $j"; $j++;
                    push @$new_list_pages, {
                        'url'   => $page->{url},
                        'title' => $list_page->{title} . "-" . $page->{title},
                    };
                }
                say undef;
                $j = 0;
            }else{
                say Dumper $mech->response;
                next;
            }
        }
        push @{$category->{list_pages}}, @$new_list_pages;
    }
}

# Photo boxes live under div.skinList on the old site.
sub getPageUrls(){
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};
    my $scraper = scraper{
        process 'div.skinList ul li.skinList_li div[style]', 'items[]' => scraper{
            process 'div[align="center"] table[height="175"] tr td a[href]', 'url' => '@href';
            process 'div a.bld a',           'number' => 'TEXT';
            process 'div font[color="gray"]', 'time'  => 'TEXT';
        };
    };
    my $i = 0;
    my $count = 0;
    foreach my $category ( @$categories ){
        print " $i ->";
        my $j = 0;
        foreach my $list_page ( @{$category->{list_pages}} ){
            print " $j"; $j++;
            my $result;
            next if $list_page->{title} eq '';
            $mech->get( $list_page->{url} );
            if( $mech->success ){
                $result = $scraper->scrape( $mech->decoded_content, $mech->uri );
                push @{$categories->[$i]->{photo_pages}}, @{$result->{items}};
                $count += @{$result->{items}};
            }
            else{
                say Dumper $mech->response;
            }
        }
        $i++;
        say " (total $count)";
    }
    say " (total $count)";
}

# The image URL is embedded in an inline style (background url) inside a noscript block.
sub getImageUrls(){
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};
    my $scraper = scraper{
        process 'noscript table tr td[style]', 'style' => '@style';
    };
    my $count = 0;
    my $i = 0;
    foreach my $category ( @$categories ){
        print " $i ->"; $i++;
        my $j = 0;
        foreach my $photo_page ( @{$category->{photo_pages}} ){
            print " $j";
            $mech->get( $photo_page->{url} );
            if( $mech->success ){
                my $result = $scraper->scrape( $mech->decoded_content, $mech->uri );
                if( $result->{style} =~ /url\(\'(.*)\'\)/ ){
                    $category->{photo_pages}->[$j++]->{image_url} = $1;
                    $self->{save}->{image_urls}->[$count++]       = $1;
                }else{
                    say 'fail';
                }
            }
        }
        say " (total $count)";
    }
    say " (total $count)";
}

# The old login form is submitted by field name only.
sub unlock(){
    my $self = shift;
    my $mech = $self->{mech};
    my $form = $self->{yaml}->{form};
    my $passwords = $self->{yaml}->{passwords};
    foreach my $password ( @$passwords ){
        say " $self->{yaml}->{name} $password->{name} $password->{password}";
        $mech->submit_form( fields => { $form->{input} => $password->{password} } );
        $self->abort(Dumper $mech->response) unless $mech->success;
        say " -> Successfully Authorized";
    }
}

1;
__END__
*1: The event name and the shoot date/time are also included in the output.