- 求职 : 数据库管理员
- 论坛徽章:
- 0
|
根据HTML::LinkExtor修改过来的package
1.<a href=http://aa.aa/>; <img=aa.img alt="asdf">;</a>;
2.<a href=http://aa.aa/>; fffff</a>;
3 .<a href=http://aa.aa/>;test <font size=12>;text2</font>; </a>;
第一,二种可以
第三种只能抓到 test ,后面的 text2抓不到,
请帮忙
刚学perl不久,玩玩而已,呵呵
perl 真是个好东西
- package myLinkExtor;
- # $Id: LinkExtor.pm,v 1.33 2003/10/10 10:20:56 gisle Exp $
- require HTML::Parser;
- @ISA = qw(HTML::Parser);
- $VERSION = sprintf("%d.%02d", q$Revision: 1.33 $ =~ /(\d+)\.(\d+)/);
- use strict;
- use HTML::Tagset ();
- # legacy (some applications grabs this hash directly)
- use vars qw(%LINK_ELEMENT);
- *LINK_ELEMENT = \%HTML::Tagset::linkElements;
- sub new
- {
- my($class, $cb, $base) = @_;
- my $self = $class->;SUPER::new(
- start_h =>; [\&a_start_handler, "self,tagname,attr"],
- report_tags =>; [qw(a img)],
- );
-
- if ($base) {
- require URI;
- $self->;{extractlink_base} = URI->;new($base);
- }
- $self;
- }
- sub _found_link
- {
- my $self = shift;
- push(@{$self->;{'links'}}, [@_]);
-
- }
- sub links
- {
- my $self = shift;
- exists($self->;{'links'}) ? @{delete $self->;{'links'}} : ();
- }
- # We override the parse_file() method so that we can clear the links
- # before we start a new file.
- sub parse_file
- {
- my $self = shift;
- delete $self->;{'links'};
- $self->;SUPER::parse_file(@_);
- }
- ######
- ##
- #
- #my declare
- #
- sub a_start_handler
- {
- my($self, $tag, $attr) = @_;
- return unless $tag eq "a";
- return unless exists $attr->;{href};
- $self->;handler(text =>; [], '@{dtext}' );
- push(@{$self->;handler("text")}, "href" );
- push(@{$self->;handler("text")}, $attr->;{href} );
- $self->;{'tag'}=$tag ;
-
-
- $self->;handler(start =>; \&img_handler);
- # $self->;handler(end =>; \&a_end_handler, "self,tagname");
- $self->;handler(end =>; \&a_end_handler, "self,tagname");
- }
- sub img_handler
- {
- my($self, $tag, $attr) = @_;
- return unless (($tag eq "img") or ($tag eq "font")) ;
-
- push(@{$self->;handler("text")}, $attr->;{alt} || "[IMG]");
- }
- sub a_end_handler
- {
- my($self, $tag) = @_;
- # my $text = join("", @{$self->;handler("text")});
- # $text =~ s/^\s+//;
- # $text =~ s/\s+$//;
- # $text =~ s/\s+/ /g;
- $self->;_found_link($self->;{'tag'} ,@{$self->;handler("text")}) ;
- $self->;handler("text", undef);
- $self->;handler("start", \&a_start_handler);
- $self->;handler("end", undef);
- }
-
复制代码
findmp3.pl
可按歌手搜索mp3列表,
因为不好判断有多少页,就默认5 可传入参数改变,
- #!/usr/bin/perl
- #
- #Coder : Gan Jian Hui
- #
- #Last Modify :2005-07-29
- #
- use LWP::UserAgent;
- use URI::URL ;
- use strict ;
- use myLinkExtor;
- use URI::Escape;
- if (@ARGV <= 0)
- {
- print STDERR "usage: get liu de hua \n";
- exit(1) ;
- }
- my ($word,$maxpage) =@ARGV ;
- $maxpage=$maxpage || 5 ;
- $word=uri_escape($word) ;
- my $browser = LWP::UserAgent->;new;
- $browser->;env_proxy;
- $browser->;agent('Mozilla/5.0');
- my $p = myLinkExtor->;new() ;
- my $i=0 ;
- my $fsurl ;
- for ($i=0; $i<$maxpage; $i++)
- {
- $fsurl = &findnext($i,$word);
- &geturl($browser, $p, $fsurl) ;
- }
- sub geturl()
- {
- my ($bs, $lkextor , $url) = @_ ;
-
- my $req=HTTP::Request->;new(GET =>; $url) ;
- my $response = $bs->;request($req) ;
- $lkextor->;parse($response->;content) ;
- my $base = $response->;base;
-
- my $nurl ;
- # <a href=aa.htm>;asdf</a>;
- # 0 1 2 3
- foreach my $link ($lkextor->;links)
- {
- next if @$link[0] ne 'a' ;
- $nurl = url(@$link[2],$base)->;abs ;
- shift (@$link) ;
- shift (@$link) ;
- shift (@$link) ;
- if ($nurl =~ /.(mp3|rm|wma)$/ )
- {
- my $text = join("",@$link) ;
- $text =~ s/^\s+//;
- $text =~ s/\s+$//;
- $text =~ s/\s+/ /g;
- #
- print $nurl."\t " . $text ."\n" ;
- }
-
- }
- }
- sub findnext(){
- my ($page,$WORD) = @_ ;
- my $RM=30 ;
- my $PM=$RM*$page ;
- my $MAXPM=100 ;
- my $NL= "http://mp3.baidu.com/m?z=0&cl=3&ct=134217728&sn=&lm=-1&cm=1&sc=1&bu=&rn=30&tn=baidump3&word=$WORD&pn=$PM" ;
- return $NL ;
- }
复制代码 |
|