- 论坛徽章:
- 0
|
原帖由 cobrawgl 于 2008-7-16 08:09 发表 ![]()
我看看了看,你的 regex 好像是可以的啊?
title=后漏了个引号。
修改后可以下载全部网页,但原来清理html的那堆,网页格式变了,我又没耐心写,所以就那么搁着了……
哪位熟悉RegEx的人来改写一下?
以下是我初步修改后的版本
——————————————————————————————
- #!/usr/bin/perl -w
- use strict;
- use LWP::Simple;
- use Encode;
- getstore( 'http://www.pgsqldb.org/twiki/pub/TWiki/PatternSkin/layout.css', 'layout.css' );
- getstore( 'http://www.pgsqldb.org/twiki/pub/TWiki/PatternSkin/style.css', 'style.css' );
- getstore( 'http://www.pgsqldb.org/twiki/pub/TWiki/PatternSkin/i_arrow_down.gif', 'i_arrow_down.gif' );
- my $urlBase = 'http://www.pgsqldb.org/mwiki/index.php';
- my $index = get( "$urlBase/ProgrammingPerl" ) || die "get index.html failed!\n";
- #&trim( $index );
- # $index =~ s{a class="twikiLink" href="/bin/view/Perl/([^"]+?)"}{a class="twikiLink" href="$1.html"}sg;
- # a href="/bin/view/Perl/PerlThread" class="twikiLink"
- # $index =~ s{a class="twikiLink" href="/bin/view/Perl/([^"]+?)"}{a class="twikiLink" href="$1.html"}sg;
- # </li><li> <a href="/mwiki/index.php/PerlUnaryandBinaryOperators" title="PerlUnaryandBinaryOperators">第三章 单目和双目操作符</a>
- $index =~ s{a href="/mwiki/index.php/([^"]+?)" title="([^"]+?)"}{a href="$1.html" title="$2"}sg;
- &save( $index, "index.html" );
- my $hhp_begin = q{
- [OPTIONS]
- Compatibility=1.1 or later
- Compiled file=Perl 语言编程.chm
- Contents file=Perl 语言编程.hhc
- Default topic=index.html
- Display compile progress=No
- Language=0x804 中文(中国)
- [FILES]
- };
- my $hhp_end = q{
- layout.css
- style.css
- i_arrow_down.gif
- [INFOTYPES]
- };
- open HHP, ">Perl 语言编程.hhp" or die "Create file failed!\n";
- print HHP $hhp_begin;
- #<ul><li> <a href="PerlPerlBitsandPieces.html" title="PerlPerlBitsandPieces">第二章 集腋成裘</a>
- #</li><li> <a href="PerlUnaryandBinaryOperators.html" title="PerlUnaryandBinaryOperators">第三章 单目和双目操作符</a>
- while( $index =~ /<li>..?<a href="(.+?)\.html" title="([^"]+?)"/mg ) {
- my $url = $1;
- print "$url\n";
- next if grep { $_ eq $url } qw( WebHome WebChanges WebIndex WebSearch );
- my $content = get( "$urlBase/$url" );
- next unless $content;
- &trim( $content );
- &save( $content, "$url.html" );
- print HHP "$url.html\n";
- }
- print HHP $hhp_end;
- my $hhc_begin = q{
- <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
- <HTML>
- <HEAD>
- <!-- Sitemap 1.0 -->
- </HEAD><BODY>
- <OBJECT type="text/site properties">
- <param name="ExWindow Styles" value="0x200">
- <param name="Window Styles" value="0x800025">
- <param name="Font" value="MS Sans Serif,9,0">
- </OBJECT>
- <UL>
- <LI> <OBJECT type="text/sitemap">
- <param name="Name" value="Perl 语言编程">
- <param name="Local" value="index.html">
- <param name="ImageNumber" value="11">
- </OBJECT>
- <UL>
- };
- my $hhc_end = q{
- </UL>
- </UL>
- </BODY></HTML>
- };
- open FH, ">Perl 语言编程.hhc" or die "create file failed!\n";
- print FH $hhc_begin;
- Encode::from_to( $index, 'utf-8', 'gb2312' );
- while( $index =~ m{<a href="([^"]+)" title=([^"]+?)">(第[^<]*?)</a>}sg ){
- print FH qq{<LI> <OBJECT type="text/sitemap">
- <param name="Name" value="$2">
- <param name="Local" value="$1">
- <param name="ImageNumber" value="11">
- </OBJECT>
- };
- }
- print FH $hhc_end;
- close FH;
- sub save {
- my ($str, $fileName) = @_;
- open FH, ">$fileName" or die "gen $fileName failed!\n";
- print FH $str;
- close FH;
- }
- sub trim {
- $_[0] =~ s/\n<u style="display: none">.*<\/u>\n//s;
- $_[0] =~ s/<div class="([^"]+)".*?<\/div>/($1 eq 'twikiTopic' or $1 eq 'twikiToc' or $1 eq 'fragment' or $1 eq 'portlet') ? $& : ''/esg;
- $_[0] =~ s{[url]http://www.pgsqldb.org/pub/TWiki/PatternSkin/[/url]}{./}sg;
- $_[0] =~ s{<base href="http://www.pgsqldb.org/bin/view/Perl/.*?"[^>]*>}{}sg;
- $_[0] =~ s{\n<script src="http://www.google-analytics.com/urchin.js" type="text/javascript">\n</script>}{}sg;
- $_[0] =~ s{\n<script type="text/javascript">\n_uacct = "UA-179457-1";\nurchinTracker\(\);\n</script>}{}sg;
- $_[0] =~ s{\n<div class="twikiTopic">\n}{\n<div class="twikiTopic" style="padding:0 1.5em"><br>\n}sg;
- }
复制代码 |
|