- 论坛徽章:
- 0
|
代码是要实现批量的抓取网页,由于数量太多,考虑到多进程,可是运行就提示内存不能为read,该怎么写这种多进程的脚本呢?
还有采用多线程不知能否实现,本人尝试多线程时,线程执行时并没有提交cookies。- use LWP::UserAgent;
- use HTTP::Cookies;
- use LWP::ConnCache;
- use strict;
- my $ua=LWP::UserAgent->new;
- my $cache=LWP::ConnCache->new();
- $ua->conn_cache($cache);
- my $user="yongchun_wsu";
- my $password="trial01";
- my $url="https://portal.biobase-international.com/cgi-bin/portal/login.cgi";
- my $cookie=HTTP::Cookies->new(file=>'lwp_cookie.dat',autosave=>1,ignore_discard=>1);
- $ua->cookie_jar($cookie);
- my $res=$ua->post("$url",[login=>"$user",
- password=>"$password"]) or die "can't login $!";
- print "login successed !\n";
- for(my $count=1;$count<10;$count++){
- my $c=fork();
- if($c){
- print "parent running \n";
- }
- else
- {
- &getpage();
- exit 0;
- }
- }
- sub getpage {
- my $url1="https://portal.biobase-international.com/cgi-bin/build_t/idb/1.0/pageview.cgi?view=MatrixReport&matrix_acc=M0";
- for(my $i=1;$i<1357;$i++){
- my $id=sprintf("%04d",$i);
- if(! -e "M0".$id.".html"){
- my $bro=$ua->get($url1.$id);
- if($bro->is_success){
- open(FH,">","M0".$id.".html");
- print FH $bro->content;
- print "page M0$id".".html finished\n";
- close(FH);
-
- }
- else
- {
- print "can't get page $id";
- redo;
- }
- }
- }
- }
复制代码 |
|