- 论坛徽章:
- 3
|
之前常听PY说Coro+Anyevent会很牛X。。。然后晚上刚好被妹子放了鸽子,
心情略微郁闷了下,写了个DEMO让自己心情好起来。。
回头研究下怎么优化,改天把爬虫的DEMO也弄出来。
这个东西其实还有很多扩展的空间,例如BS算法加入、多进程等,大家都懂的。
随意扩展。。。
- #!perl
- use utf8;
- use AnyEvent;
- use AnyEvent::HTTP;
- use Data::Dumper;
- use Coro;
- # Channels between the main program and the proxy-testing coroutine:
- # $calculate carries proxy URLs to be tested, $result carries one report
- # line back per tested proxy.
- my $calculate = Coro::Channel->new;
- my $result    = Coro::Channel->new;
- # all proxy results
- our @proxy_results;
- # proxy page list
- my $url = "http://proxy.com.ru/";
- # Tracks outstanding page downloads: recv() returns once every begin()
- # has been matched by an end().
- my $cv = AnyEvent->condvar;
- # begin() is called outside the eval so that the error path below can
- # always balance it; otherwise a failure would leave recv() blocked.
- $cv->begin;
- my $started = eval {
-     http_get(
-         $url,
-         sub {
-             # AnyEvent::HTTP passes exactly two arguments: body and headers.
-             my ( $content, $header ) = @_;
-             if ( defined $content
-                 && $content =~ m/<a href='list_([0-9]{1,3}).html'>\[<b>(.*)<\/b>\]<\/a>/ )
-             {
-                 # The greedy capture runs up to the last page link on the
-                 # line, so the final "[<b>"-separated piece is the label
-                 # of the last page.
-                 my $tmp_result = $2;
-                 my @tmp_pages  = split( /\[<b>/, $tmp_result );
-                 # count page
-                 my $all_pages = $tmp_pages[-1];
-                 ru_proxy( $url, $all_pages, $content );
-             }
-             else {
-                 # Download failed or the page index was not recognised:
-                 # say so instead of silently ending with no proxies.
-                 print "Error! no page list found at " . $url
-                     . " (status " . $header->{Status} . " " . $header->{Reason} . ")\n";
-             }
-             $cv->end;
-         }
-     );
-     1;
- };
- if ( !$started ) {
-     print "Error!\n";
-     # The callback never reached its end(), so release the condvar here.
-     $cv->end;
- }
- $cv->recv();
- # ip picking up
- #
- # ru_proxy($url, $pages, $content)
- #   $url     - base URL of the proxy list site
- #   $pages   - number of the last list page
- #   $content - HTML of the first list page, already downloaded by the caller
- #
- # Starts one asynchronous download per page list_2.html .. list_$pages.html
- # and appends every "http://ip:port/" entry found in $content and in each
- # downloaded page to @proxy_results. Every download is registered on $cv
- # with begin()/end(), so the caller's $cv->recv waits for all of them.
- sub ru_proxy {
-     my ( $url, $pages, $content ) = @_;
-     # The base URL ends with "/"; strip it so page URLs do not contain "//".
-     ( my $base = $url ) =~ s{/+\z}{};
-     # A non-numeric page count would turn the range below into a string
-     # range; in that case only the first page is parsed.
-     $pages = 1 unless defined $pages && $pages =~ m/\A[0-9]+\z/;
-     foreach my $p ( 2 .. $pages ) {
-         $cv->begin;
-         http_get(
-             $base . "/list_" . $p . ".html",
-             sub {
-                 my ( $page_content, $header ) = @_;
-                 push( @proxy_results, _extract_proxies($page_content) );
-                 $cv->end;
-             }
-         );
-     }
-     push( @proxy_results, _extract_proxies($content) );
-     return;
- }
- # _extract_proxies($html)
- # Returns the list of "http://ip:port/" strings for every
- # <td>ip</td><td>port</td> pair in $html, in document order. Returns an
- # empty list when $html is undefined (failed download).
- sub _extract_proxies {
-     my ($html) = @_;
-     return unless defined $html;
-     my @found;
-     # /g scans the whole document, so several table rows on one physical
-     # line are all picked up, not just the first one.
-     while ( $html =~ m/<td>([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})<\/td><td>([0-9]{1,5})<\/td>/g ) {
-         push( @found, "http://" . $1 . ":" . $2 . "/" );
-     }
-     return @found;
- }
- # using coro module
- #
- # Worker coroutine: takes proxy URLs from $calculate, fetches a test page
- # through each one and puts exactly one report line per proxy on $result.
- # The driver loop below relies on that one-report-per-proxy guarantee.
- #
- # Seconds to wait for a proxy before reporting it as failed.
- my $proxy_timeout = 10;
- async {
-     while () {
-         my $task = $calculate->get;
-         # testing proxy
-         # set_proxy changes the module-wide default proxy; it is read when
-         # the request below is created.
-         AnyEvent::HTTP::set_proxy($task);
-         http_get(
-             "http://bbs.chinaunix.net/forum.php",
-             timeout => $proxy_timeout,
-             sub {
-                 my ( $content, $header ) = @_;
-                 if ( defined $content && $content =~ m/<title>(.*)<\/title>/ ) {
-                     $result->put( "proxy:" . $task . "title:" . $1 . "\n" );
-                 }
-                 else {
-                     # Report dead or misbehaving proxies too; without this
-                     # the driver loop would wait forever on $result->get.
-                     $result->put( "proxy:" . $task . "failed:"
-                             . $header->{Status} . " " . $header->{Reason} . "\n" );
-                 }
-             }
-         );
-     }
- };
- # Driver: hand the proxies to the worker one at a time and print each report.
- foreach my $t (@proxy_results) {
-     $calculate->put($t);
-     print $result->get . "\n";
- }
复制代码 |
|