- 论坛徽章:
- 0
|
本帖最后由 horsley 于 2011-05-04 10:28 编辑
数据帮你抽取出来了,具体保存成什么格式需要自己处理。
HTML::TableExtract 模块用于提取 HTML 中的表格(table)数据,它不是 Perl 核心模块,需要自行从 CPAN 安装。
# Crawl the CMP table of www.medicine.csdb.cn:
#   1. GET the listing once and read the total page count from it.
#   2. POST for every listing page and collect the record ids linked on it.
#   3. GET each record page, extract its detail table with
#      HTML::TableExtract, and print it as a hash via Data::Dumper.
# Output goes to STDOUT; fetch failures are reported on STDERR.
use strict;
use warnings;
use LWP::UserAgent;
use HTML::TableExtract;
use Data::Dumper;

my $list_url   = 'http://www.medicine.csdb.cn/viewTable.jsp?ds=dataset@@medicine&tab=CMP';
my $record_fmt = 'http://www.medicine.csdb.cn/viewRecord.jsp?ds=dataset@@medicine&tab=CMP&id=%s';

my $ie = LWP::UserAgent->new;
$ie->requests_redirectable([]);    # empty list: do not follow redirects for any method
$ie->timeout(100);
$ie->cookie_jar( {} );             # turn on cookies
$ie->agent('Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)');

my $res = $ie->get($list_url);
die "GET $list_url failed: " . $res->status_line . "\n"
    unless $res->is_success;

# content() is the raw response body, so the non-ASCII literal in this
# pattern is compared byte-for-byte.
# NOTE(review): this presumably requires the script file to be saved in the
# same encoding as the page -- confirm.
my ($page) = $res->content =~ m#页次: \d+/(\d+)页#;
die "Could not find the page count in the listing at $list_url\n"
    unless defined $page;
print "一共 $page 页\n";

foreach my $i (1 .. $page) {
    my $listing = $ie->post(
        $list_url,
        [
            'gotoPage' => $i,
            # NOTE(review): this value looks already URL-encoded, and post()
            # form-encodes array-ref values again -- confirm the server
            # really expects the literal '+' and '%2C' characters.
            'sql' => 'select+CMP.YXCF%2CCMP.IID+from+CMP+order+by+IID+desc',
            'ds'  => 'dataset@@medicine',
            'tab' => 'CMP',
        ]
    );
    unless ($listing->is_success) {
        # Best effort: report the failed page and carry on with the next one.
        warn "POST page $i failed: " . $listing->status_line . "\n";
        next;
    }

    # Every record link on the listing page carries the id as ...&id=NNN"
    foreach my $id ($listing->content =~ m#&id=(\d+)"#g) {
        my $url    = sprintf $record_fmt, $id;
        my $record = $ie->get($url);
        unless ($record->is_success) {
            warn "GET $url failed: " . $record->status_line . "\n";
            next;
        }

        # A fresh extractor per document; depth/count select which table
        # of the record page is read.
        my $te = HTML::TableExtract->new(depth => 2, count => 3);
        $te->parse($record->content);

        # Flatten the rows into one list and read it as key/value pairs.
        my %result = map { @$_ } $te->rows;
        print Dumper(\%result);
    }
}
复制代码 |
|