- 论坛徽章:
- 0
|
本帖最后由 iamlimeng 于 2010-11-21 12:46 编辑
研究了一下,写了段代码,希望对大家有帮助。
- #!/usr/bin/perl
- use strict;
- #use warnings;
- use LWP::UserAgent;
- use LWP::ConnCache;
# Entry point: crawl a Taobao shop's product listing, follow its pagination
# links, and save every product description through get_desc().

# Shop listing page to start from, and the directory that receives the
# product_<n>.txt files.
my $url_taobao = "http://meilibody.taobao.com/?search=y";
my $path = "d:/taobao";
if (not -d $path) {
    # Without the output directory every later file write would fail.
    mkdir($path) or die qq~ Create directory "$path" error: $!\n~;
}

# One user agent shared by every request, with a connection cache attached.
my $lwp = LWP::UserAgent->new(agent => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; CIBA)');
my $conncache = LWP::ConnCache->new;
$lwp->conn_cache($conncache);

# Running product counter; get_desc() reads and increments it.
my $n = 1;
my @url_main_page;

# First listing page: collect the pagination links, then its products.
my $content = get_html($url_taobao);
if ($content) {
    $content =~ s/\n|\r//g;
    my ($page) = ($content =~ /<div class="page-bottom">(.*?)<\/div>/ig);
    # \d+ (not \d) so links to page 10 and beyond are collected as well;
    # skip the match entirely when the pagination block is absent.
    @url_main_page = ($page =~ /<a href="(.*?)">\d+<\/a>/ig) if defined $page;
    _save_listed_products($content);
}
else {
    print qq~ Get "$url_taobao" error!\n~;
}

# Remaining listing pages found in the pagination block.
for my $url_page (@url_main_page) {
    my $page_content = get_html($url_page);
    if ($page_content) {
        $page_content =~ s/\n|\r//g;
        _save_listed_products($page_content);
    }
    else {
        print qq~ Get "$url_page" error!\n~;
    }
}

print "\n Get All product infomation OK!\7";
<STDIN>;    # wait for Enter before the script exits

# Extract every (link, title) pair from one listing page's HTML (newlines
# already stripped) and hand each product to get_desc(). Products are handled
# in page order; a link that appears more than once on the page is fetched
# only once.
sub _save_listed_products {
    my ($html) = @_;
    my @pairs = ($html =~ /<div class="desc"><a target="_blank" href="(.*?)" class="permalink">(.*?)<\/a>/ig);
    my %seen;
    while (my ($url_product, $name) = splice(@pairs, 0, 2)) {
        next if $seen{$url_product}++;
        get_desc($url_product, $name);
    }
    return;
}
# Download one product's description and save it as "$path/product_<n>.txt".
#
# Arguments:
#   $url  - URL of the product page.
#   $name - product title; written as the first line of the saved file.
#
# Uses the file-level $path (output directory) and $n (product counter).
# $n is advanced exactly once per call, whether or not the product could be
# saved. A progress or error line is printed for every call.
sub get_desc {
    my ($url, $name) = @_;

    # Claim this product's number up front so every exit path below still
    # advances the counter.
    my $index = $n++;

    my $content = get_html($url);
    if (!$content) {
        print qq~ $index Get "$url" error!\n~;
        return;
    }
    $content =~ s/\n|\r//g;

    # The description lives at a separate URL embedded in the product page
    # as a single-quoted string; take the first such URL.
    my ($url_desc) = ($content =~ m{(http://dsc\.taobaocdn\.com/[^']*)'}i);

    # Only request the description when a URL was actually found.
    my $desc = defined $url_desc ? get_html($url_desc) : 0;
    if (!$desc) {
        print " $index $name Error!\n";
        return;
    }

    my $file = "product_$index.txt";
    my $fh;
    if (!open($fh, '>', "$path/$file")) {
        # Do not write to an unopened handle or report success.
        print " Create file $file error!\n\7";
        return;
    }
    print {$fh} "$name\n\n$desc";
    if (!close($fh)) {
        # Buffered write errors surface at close.
        print " Write file $file error!\n\7";
        return;
    }

    print " $index $name OK!\n";
    return;
}
# Fetch a URL with the shared $lwp user agent.
#
# Takes the URL as its only argument. Returns the response content when the
# response is successful, and 0 otherwise.
sub get_html {
    my ($target) = @_;

    # Build a GET request that asks for HTML.
    my $req = HTTP::Request->new(GET => $target);
    $req->header(Accept => 'text/html');

    my $res = $lwp->request($req);
    return 0 unless $res->is_success;
    return $res->content();
}
复制代码
正则写得不太好,请大家包涵。 |
|