- 论坛徽章:
- 0
|
另附上我的问题:- #!perl -w
use strict;
use warnings;
use 5.010;
use utf8;      # the marker characters below are literal UTF-8 in this source
use Encode;

# Extract footnote-style affiliation lines from an HTML fragment.
#
# Reads in.txt (UTF-8 encoded HTML), echoes it to tst.txt followed by a
# separator line, then writes one "marker<TAB>text" line for every
# <p class="articleCitationIndent1"> paragraph that starts with one of the
# footnote markers (pilcrow, dagger, double dagger, asterisk, section sign).
#
# Why the encoding handling matters: a character class only works when the
# pattern and the subject are both character strings.
#   * Without "use utf8" the marker literal is a string of UTF-8 bytes, so
#     the class matches single bytes instead of whole marker characters.
#   * Without decoding, the file content is bytes as well, and a multi-byte
#     marker can never equal a single pattern character.
# Hence the results the original author observed: decoding only the input
# gave 3 correct results, "use utf8" alone gave 1, and both together gave 5.
#
# The author reported that "use open ':encoding(UTF-8)'" made the real job
# two to three times slower. Here the file is slurped as bytes and decoded
# with a single Encode::decode call instead of a per-handle encoding layer.
# NOTE(review): this is expected to be cheaper than the PerlIO layer, but
# measure it on the real data set before relying on it.

my $in_file  = 'in.txt';
my $out_file = 'tst.txt';

open my $in, '<', $in_file
    or die "Cannot open '$in_file' for reading: $!";
open my $out, '>', $out_file
    or die "Cannot open '$out_file' for writing: $!";

# Slurp the whole file; "local" keeps the record-separator change scoped.
my $raw = do { local $/; <$in> } // '';
close $in or die "Cannot close '$in_file': $!";

# Echo the input unchanged (still bytes), then a separator line.
say {$out} $raw;
say {$out} '-' x 100;

# Sample content of in.txt (stored as UTF-8):
#<p class="articleCitationIndent1">*Institute of Mountain Risk Engineering, Austria</p>
#<p class="articleCitationIndent1"><sup>†</sup>Laboratory of Dendrogeomorphology, Institute of Geological Sciences, Switzerland</p>
#<p class="articleCitationIndent1"><sup>‡</sup>Climate Change and Climatic Impacts, Environmental Sciences, Switzerland</p>
#<p class="articleCitationIndent1"><sup>§</sup>Federal Ministry for Agriculture, Forestry, Environment and Water Management, Austria</p></div>
#<p class="articleCitationIndent1">¶Corresponding author: <a class="email" href="mailto:xxx@die-wildbach.at">xxx@die-wildbach.at</a></p>'

# One bulk conversion from bytes to characters. Malformed sequences are
# replaced with U+FFFD by Encode's default CHECK behaviour.
my $text = decode('UTF-8', $raw);

# Footnote markers that may introduce an affiliation paragraph.
my $marker = qr/[¶†‡*§]/;

my $citation_re = qr{
    <p\ class="articleCitationIndent1">
    (?: </?a[^>]*> )*                  # optional anchor tags before the marker
    (?: <sup> )?+ ,?
    (?<marker> $marker ) ,?
    (?: </sup> )?+ ,?
    (?: </?a[^>]*> )*                  # optional anchor tags after the marker
    (?<text> (?: (?! </p> ) . )+ )     # rest of the paragraph, same line only
    </p>
}xi;

while ($text =~ /$citation_re/g) {
    # Encode explicitly because the output handle carries no encoding layer.
    say {$out} encode('UTF-8', "$+{marker}\t$+{text}");
}

close $out or die "Cannot close '$out_file': $!";
复制代码 |
|