请帮忙 html::parser ,从mp3.baidu.搜索mp3 [复制链接]

gangjh

丰衣足食

求职 : 数据库管理员

论坛徽章:: 0

电梯直达

1楼 [收藏(0)] [报告]

发表于 2005-08-01 22:03 |只看该作者 |倒序浏览

根据HTML::LinkExtor修改过来的package

1.<a href=http://aa.aa/>; <img=aa.img alt="asdf">;</a>;
2.<a href=http://aa.aa/>; fffff</a>;
3 .<a href=http://aa.aa/>;test <font size=12>;text2</font>; </a>;
第一,二种可以
第三种只能抓到 test ,后面的 text2抓不到,
请帮忙

刚学perl不久,玩玩而已,呵呵

perl 真是个好东西

package myLinkExtor;
# $Id: LinkExtor.pm,v 1.33 2003/10/10 10:20:56 gisle Exp $
require HTML::Parser;
@ISA = qw(HTML::Parser);
$VERSION = sprintf("%d.%02d", q$Revision: 1.33 $ =~ /(\d+)\.(\d+)/);
use strict;
use HTML::Tagset ();
# legacy (some applications grabs this hash directly)
use vars qw(%LINK_ELEMENT);
*LINK_ELEMENT = \%HTML::Tagset::linkElements;
sub new
{
my($class, $cb, $base) = @_;
my $self = $class->;SUPER::new(
start_h =>; [\&a_start_handler, "self,tagname,attr"],
report_tags =>; [qw(a img)],
);
if ($base) {
require URI;
$self->;{extractlink_base} = URI->;new($base);
}
$self;
}
sub _found_link
{
my $self = shift;
push(@{$self->;{'links'}}, [@_]);
}
sub links
{
my $self = shift;
exists($self->;{'links'}) ? @{delete $self->;{'links'}} : ();
}
# We override the parse_file() method so that we can clear the links
# before we start a new file.
sub parse_file
{
my $self = shift;
delete $self->;{'links'};
$self->;SUPER::parse_file(@_);
}
######
##
#
#my declare
#
sub a_start_handler
{
my($self, $tag, $attr) = @_;
return unless $tag eq "a";
return unless exists $attr->;{href};
$self->;handler(text =>; [], '@{dtext}' );
push(@{$self->;handler("text")}, "href" );
push(@{$self->;handler("text")}, $attr->;{href} );
$self->;{'tag'}=$tag ;
$self->;handler(start =>; \&img_handler);
# $self->;handler(end =>; \&a_end_handler, "self,tagname");
$self->;handler(end =>; \&a_end_handler, "self,tagname");
}
sub img_handler
{
my($self, $tag, $attr) = @_;
return unless (($tag eq "img") or ($tag eq "font")) ;
push(@{$self->;handler("text")}, $attr->;{alt} || "[IMG]");
}
sub a_end_handler
{
my($self, $tag) = @_;
# my $text = join("", @{$self->;handler("text")});
# $text =~ s/^\s+//;
# $text =~ s/\s+$//;
# $text =~ s/\s+/ /g;
$self->;_found_link($self->;{'tag'} ,@{$self->;handler("text")}) ;
$self->;handler("text", undef);
$self->;handler("start", \&a_start_handler);
$self->;handler("end", undef);
}

复制代码

findmp3.pl
可按歌手搜索mp3列表,
因为不好判断有多少页,就默认5 可传入参数改变,

#!/usr/bin/perl
#
#Coder : Gan Jian Hui
#
#Last Modify :2005-07-29
#
use LWP::UserAgent;
use URI::URL ;
use strict ;
use myLinkExtor;
use URI::Escape;
if (@ARGV <= 0)
{
print STDERR "usage: get liu de hua \n";
exit(1) ;
}
my ($word,$maxpage) =@ARGV ;
$maxpage=$maxpage || 5 ;
$word=uri_escape($word) ;
my $browser = LWP::UserAgent->;new;
$browser->;env_proxy;
$browser->;agent('Mozilla/5.0');
my $p = myLinkExtor->;new() ;
my $i=0 ;
my $fsurl ;
for ($i=0; $i<$maxpage; $i++)
{
$fsurl = &findnext($i,$word);
&geturl($browser, $p, $fsurl) ;
}
sub geturl()
{
my ($bs, $lkextor , $url) = @_ ;
my $req=HTTP::Request->;new(GET =>; $url) ;
my $response = $bs->;request($req) ;
$lkextor->;parse($response->;content) ;
my $base = $response->;base;
my $nurl ;
# <a href=aa.htm>;asdf</a>;
# 0 1 2 3
foreach my $link ($lkextor->;links)
{
next if @$link[0] ne 'a' ;
$nurl = url(@$link[2],$base)->;abs ;
shift (@$link) ;
shift (@$link) ;
shift (@$link) ;
if ($nurl =~ /.(mp3|rm|wma)$/ )
{
my $text = join("",@$link) ;
$text =~ s/^\s+//;
$text =~ s/\s+$//;
$text =~ s/\s+/ /g;
#
print $nurl."\t " . $text ."\n" ;
}
}
}
sub findnext(){
my ($page,$WORD) = @_ ;
my $RM=30 ;
my $PM=$RM*$page ;
my $MAXPM=100 ;
my $NL= "http://mp3.baidu.com/m?z=0&cl=3&ct=134217728&sn=&lm=-1&cm=1&sc=1&bu=&rn=30&tn=baidump3&word=$WORD&pn=$PM" ;
return $NL ;
}