- 论坛徽章:
- 1
|
Windows 或者 Linux 平台都可以运行,只需要自己设置要搜索的关键词即可
[Perl]代码- use LWP::UserAgent;
- use HTML::TreeBuilder;
- use LWP::Simple;
- use URI;
- use Encode;
# Build one search-result URL per page. Searching Sina Vdisk for "perl"
# yields 16 pages of results, so pages 1 through 16 are requested.
# @list_url receives the URI objects; @download_url starts out empty and is
# filled in later with the entries to download.
@download_url = ();
@list_url     = map {
    my $page_number = $_;
    my $search_url  = URI->new('http://vdisk.weibo.com/search/');

    # "perl" is the keyword used for this test run.
    $search_url->query_form(
        'keyword' => "perl",
        'sortby'  => "default",
        'page'    => $page_number,
    );
    $search_url;
} 1 .. 16;
# Crawl every search-result page in @list_url and, for each listed file,
# append a "href<TAB>file name" record to @download_url.
#
# Pages that cannot be fetched and entries without a usable link are
# reported on STDERR and skipped; the crawl itself keeps going.
my $ua = LWP::UserAgent->new;
foreach my $page_url (@list_url) {
    my $response = $ua->get($page_url);
    unless ($response->is_success) {
        warn "error : cannot fetch $page_url : ", $response->status_line, "\n";
        next;
    }

    my $tree = HTML::TreeBuilder->new;    # empty tree

    # parse() only feeds the parser; eof() has to follow so that any
    # buffered input ends up in the tree before it is searched.
    $tree->parse($response->content);
    $tree->eof;

    my @name_nodes = $tree->find_by_attribute("class", "sort_name_intro");
    warn "error : cannot find pdf_name in $page_url\n" unless @name_nodes;

    foreach my $name_node (@name_nodes) {
        my $link = $name_node->look_down(_tag => 'a');
        next unless defined $link;        # entry without an <a> element

        my $href = $link->attr('href');
        next unless defined $href && length $href;

        # The title attribute is treated as UTF-8 bytes and re-encoded to
        # cp936 for use as the local file name. A missing title becomes an
        # empty name instead of triggering undef warnings.
        my $title     = $link->attr('title');
        my $file_name = encode("cp936",
            decode("utf-8", defined $title ? $title : ''));

        # Record the detail-page URL and the file name for the download step.
        push @download_url, "$href\t$file_name";
    }

    # Release the tree explicitly so that one tree per page does not
    # accumulate in memory for the rest of the run.
    $tree->delete;
}
# Download every entry collected in @download_url. Each entry has the form
# "detail page URL<TAB>file name". The detail page is fetched, the real file
# URL is extracted from it, and the file is stored under the given name.
# Entries that fail at any step are reported on STDERR and skipped.
foreach my $entry (@download_url) {

    # Split on the TAB separator only (at most two fields): file names may
    # themselves contain spaces, which a whitespace split would cut off.
    my ($page_url, $file_name) = split /\t/, $entry, 2;
    unless (defined $file_name && length $file_name) {
        warn "error : no file name for $page_url\n";
        next;
    }

    # The name comes from a remote page; never let it point into another
    # directory. NOTE(review): only "/" is replaced here because the name is
    # cp936 bytes, where a 0x5C byte can be the second half of a character.
    $file_name =~ tr{/}{_};

    my $html = get($page_url);
    unless (defined $html) {
        warn "error : cannot fetch $page_url\n";
        next;
    }

    # This is the key step: the real download address of the file is the
    # "url" value in the data handed to fileDown.init on the detail page.
    my ($file_url) = $html =~ /fileDown\.init.*?\"url\":\"(.*?)\",/;
    unless (defined $file_url && length $file_url) {
        warn "error : cannot find real download url in $page_url\n";
        next;
    }
    $file_url =~ s/\\//g;    # drop the backslashes escaping "/" in the value

    print "$file_url\n";
    my $status = getstore($file_url, $file_name);
    warn "error : download of $file_url failed with status $status\n"
        unless is_success($status);
}
复制代码 |
|