- 论坛徽章:
- 0
|
本帖最后由 sx98083714 于 2010-11-19 13:21 编辑
#!/usr/bin/perl
# Scrape the product list of a Taobao shop and print the description HTML
# of every product that the listing page links to.
use strict;
use warnings;
use utf8;

use Encode;
use HTML::TokeParser;
use HTTP::Cookies;
use LWP;
use URI;
use Web::Scraper;

# Encode everything printed to STDOUT as UTF-8.
binmode(STDOUT, ":encoding(utf8)");

# Shop listing page to scrape.
my $url = "http://meilibody.taobao.com/?search=y";

# Scraper rule: for every element matching ".permalink", record its href
# attribute as "url" and its text as "txt" in the "links" list.
my $scraper = scraper {
    process ".permalink", "links[]" => { "url" => '@href', "txt" => 'TEXT' };
};
my $result = $scraper->scrape( URI->new($url) );

# Walk over the scraped product links. Fall back to an empty list when the
# page contained no ".permalink" element, so that nothing is autovivified.
for my $row ( @{ $result->{links} || [] } ) {
    my $purl = $row->{"url"};

    # Fetch the product page and pull out its description markup.
    my $pcontent = getcontent($purl);

    print "content:", $pcontent, "\n";

    # Pause between requests (presumably to throttle load on the remote site).
    sleep 10;
}
# Fetch a single product page and return the HTML of its description section.
#
# Parameters:
#   $url - address of the product page.
#
# Returns the markup collected from the first <div class="content"> start tag
# up to (but not including) the next <script> start tag. Returns an empty
# string when no URL is given, when the HTTP request fails, or when the page
# has no such <div>.
#
# NOTE(review): the report accompanying this script says the shop page only
# carries a "描述加载中...." placeholder in this <div>, apparently because the
# real description is filled in by JavaScript. This sub parses static HTML
# only and never runs scripts, so it cannot see that content.
sub getcontent {
    my ($url) = @_;

    return '' unless defined $url and length $url;

    my $browser = LWP::UserAgent->new;
    $browser->cookie_jar( {} );
    $browser->timeout(500);

    # Send browser-like request headers along with the GET.
    my $response = $browser->get(
        $url,
        'User-Agent'      => 'Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.0.12) Gecko/2009072711 CentOS/3.0.12-1.el5.centos Firefox/3.0.12',
        'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language' => 'zh-cn,zh;q=0.5',
        'Accept-Charset'  => 'gb2312,utf-8;q=0.7,*;q=0.7',
        'Referer'         => 'http://meilibody.taobao.com/?search=y',
    );

    # Do not parse an error page as if it were a product page.
    unless ( $response->is_success ) {
        warn "getcontent: GET $url failed: ", $response->status_line, "\n";
        return '';
    }

    # The response body is treated as GBK bytes; decode it to Perl characters.
    my $html = decode( 'gbk', $response->content );

    return _extract_description($html);
}

# Return the markup between the first <div class="content"> start tag
# (inclusive) and the following <script> start tag (exclusive), or up to the
# end of the document when no <script> follows.
sub _extract_description {
    my ($html) = @_;

    # Position of the original source text inside each HTML::TokeParser
    # token, keyed by token type. Types not listed here (e.g. "PI") are
    # not copied to the output.
    my %text_index_of = ( S => 4, E => 2, T => 1, C => 1, D => 1 );

    my $stream      = HTML::TokeParser->new( \$html );
    my $inside      = 0;     # true once the description <div> has been seen
    my $description = '';

    while ( my $token = $stream->get_token ) {
        my $type = $token->[0];

        if ( !$inside ) {
            # Skip everything until the opening <div class="content">.
            next unless $type eq 'S' and $token->[1] eq 'div';
            my $class = $token->[2]{class};
            next unless defined $class and $class eq 'content';
            $inside = 1;
        }
        elsif ( $type eq 'S' and $token->[1] eq 'script' ) {
            # The first <script> after the <div> marks the end of the section.
            last;
        }

        my $index = $text_index_of{$type};
        $description .= $token->[$index] if defined $index;
    }

    return $description;
}
复制代码 匹配应该是没有问题,但是抓取到的产品简介是“描述加载中....”,看起来产品简介似乎是通过 js 来得到的,过往神仙如何解决这一问题?谢谢! |
|