- 论坛徽章:
- 0
|
修正了属性值中包含空格时被截断的问题,例如
alt="Google Search" 会被截取为 alt="Google
- <?php
- /********************************************************************
- * 原文件名:Filter1.php
- * 文件说明:过滤HTML字串
- * 文件编写:xuefengal@sohu.com
- * 流程说明:
- * 当符合要求的参数传递进filter函数后,filter()函数首先
- * 把字串中所有要过滤的标签$tag通过preg_match_all()
- * 取出来,然后循环preg_match_all的匹配数组,通过preg_split()
- * 函数分割每个标签为 "左边属性" = "右边值"的形式,再从要保
- * 留的属性数组中循环,将preg_split()匹配的内容对应取出,构成
- * 可以替换的值,最后通过str_replace()替换掉字串中相应的标签
- * 函数列表:
- * function filter(&$str,$tag,$keep_attribute)
- * function match($reg,&$str,$arr)
- * function show($str,$title='',$debug = True)
- * 使用示例:
- * //取得搜狐新闻首页
- * $str = @file_get_contents("http://news.sohu.com");
- * //过滤
- * filter($str,'a','href,target,alt');
- * filter($str,'p','align');
- * show($str,'过滤后的内容');
- ********************************************************************/
// Demo driver: filters a fixed HTML sample and prints it before and after,
// followed by the elapsed wall-clock time.

// microtime(true) returns the timestamp as a float directly, so the
// "sec usec" string no longer has to be split and summed by hand.
$start_time = microtime(true);

// Sample markup: quoted, single-quoted and unquoted attribute values,
// including unquoted values that contain spaces.
$str = <<<HTML
<A style="a" target=_blank href='http://www.a.com' xxx=xadsfa alt="a a a" style="aa">site a</A>
<A alt='b b b' xxx=xadsfa target=_blank href='http://www.b.com' style="b" style="bb">site b</A>
<A xxx=xadsfa style="c" href='http://www.c.com' target=_blank alt=c c c style="cc">site c</A>
<A style="d" href='http://www.d.com' xxx=xadsfa alt=d d d target=_blank style="dd">site d</A>
<A target=_blank style="e" xxx=xadsfa style="ee" alt=e e e href='http://www.e.com'>site e</A>
<p align=right style="font-size:10px">adasdfasdf</p>
<p style="font-color:red;" align='left'>asdfasdfasdfasdf</p>
<p align=left right center>asdfasdfasdf</p>
<font color="red" alt=adasd adsasd>asdfadsfasdf</font>
<font align='left' color=red>asdfasdfadf</font>
<font align=left right color=red black>asdfasdf</font>
HTML;

// Show the untouched input.
show($str, 'Html');

// Filter in place: each call keeps only the listed attributes on one tag type.
filter($str, 'a', 'href,target,alt');
filter($str, 'p', 'align');
filter($str, 'font', 'color,alt');

// Show the filtered markup.
show($str, 'Result');

// Report how long the script ran.
$run_time = microtime(true) - $start_time;
echo('<center>Script Run Time: '.$run_time.'</center>');
/**
 * Remove every attribute except the listed ones from all <$tag ...> opening
 * tags found in $str. The string is modified in place.
 *
 * Kept attributes are emitted in the order of the keep list (not document
 * order); if a tag carries the same attribute several times, every occurrence
 * is kept. Attribute names are compared case-insensitively. An unquoted value
 * runs up to the whitespace before the next `name=` pair, so `alt=c c c` is
 * kept as a whole. Only the opening tag is rewritten; element content and the
 * closing tag are left untouched.
 *
 * A one-line diagnostic naming the tag and the kept attributes is echoed on
 * every call.
 *
 * @param string       $str            HTML to filter; updated by reference.
 * @param string       $tag            Tag name whose attributes are filtered, e.g. 'a'.
 * @param string|array $keep_attribute Attributes to keep, in any of these forms:
 *                                       'href'
 *                                       'href,target,alt'
 *                                       array('href','target','alt')
 * @return void
 */
function filter(&$str, $tag, $keep_attribute) {
    // Normalise the keep list to an array of trimmed, non-empty names.
    // explode() on a string without a comma already yields a one-element
    // array, so no separate "contains a comma" test is needed.
    if (!is_array($keep_attribute)) {
        $keep_attribute = explode(',', (string) $keep_attribute);
    }
    $keep = array();
    foreach ($keep_attribute as $name) {
        $name = trim((string) $name);
        if ($name !== '') {
            $keep[] = $name;
        }
    }

    // Diagnostic line; the text is part of the script's visible output.
    echo("·过滤[$tag]标签,保留属性:".implode(',',$keep).'<br>');

    // An empty tag name would make the pattern below match closing tags.
    if ((string) $tag === '') {
        return;
    }

    // Match one opening tag at a time:
    //  - preg_quote() keeps regex metacharacters in $tag literal;
    //  - the lookahead requires the name to end here, so 'a' does not hit <abbr>;
    //  - ="..." / ='...' are consumed as units so a '>' inside a quoted value
    //    does not end the tag;
    //  - the possessive *+ stops the engine from retrying the alternatives,
    //    which keeps matching linear on malformed input.
    $pattern = '/(<' . preg_quote($tag, '/') . ')(?=[\s\/>])'
             . '((?:=\s*"[^"]*"|=\s*\'[^\']*\'|[^>])*+)>/i';

    if (!preg_match_all($pattern, $str, $found, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
        // No such tag (or a PCRE error): leave $str as it is.
        return;
    }

    // Rebuild the string piecewise from the match offsets so that each tag is
    // rewritten exactly once, at its own position.
    $result = '';
    $cursor = 0;
    foreach ($found as $m) {
        $start   = $m[0][1];
        $result .= substr($str, $cursor, $start - $cursor);
        // $m[1][0] is '<tag' exactly as written in the source (case preserved).
        $result .= $m[1][0] . filter_keep_attributes($m[2][0], $keep) . '>';
        $cursor  = $start + strlen($m[0][0]);
    }
    $str = $result . substr($str, $cursor);
}

/**
 * Internal helper of filter(): pick the wanted attributes out of the text
 * between '<tag' and '>'.
 *
 * It takes over the job of the former match() helper; that name cannot be
 * declared on PHP 8.0 and later, where `match` is a reserved keyword.
 *
 * Bare words that appear before the first `name=` pair are discarded; bare
 * words after a pair are treated as part of that pair's value.
 *
 * @param string $attrs Raw attribute text of one opening tag (without '<tag' and '>').
 * @param array  $keep  Attribute names to keep, in output order.
 * @return string The kept attributes, each preceded by one space, followed by
 *                ' /' when the tag was written in self-closing form.
 */
function filter_keep_attributes($attrs, $keep) {
    // A trailing '/' that follows whitespace, a quote, or nothing at all marks
    // a self-closing tag. A '/' glued to an unquoted value (href=http://a.com/)
    // belongs to the value and is left alone.
    $tail = '';
    if (preg_match('/(?<![^\s"\'])\/\s*$/', $attrs, $slash, PREG_OFFSET_CAPTURE)) {
        $attrs = (string) substr($attrs, 0, $slash[0][1]);
        $tail  = ' /';
    }

    // Locate the start of every `name=` pair. A quoted value directly after
    // the '=' is consumed with it, so text such as ` x=1` inside the quotes is
    // not mistaken for a new attribute.
    preg_match_all(
        '/(?<!\S)([^\s=<>"\'\/]+)\s*=\s*(?:"[^"]*"|\'[^\']*\')?/',
        $attrs,
        $names,
        PREG_OFFSET_CAPTURE | PREG_SET_ORDER
    );

    // Each attribute spans from its name to just before the next name.
    $segments = array();
    $count = count($names);
    for ($i = 0; $i < $count; $i++) {
        $from = $names[$i][0][1];
        $to   = ($i + 1 < $count) ? $names[$i + 1][0][1] : strlen($attrs);
        $segments[] = array(
            strtolower($names[$i][1][0]),
            rtrim(substr($attrs, $from, $to - $from)),
        );
    }

    // Emit in keep-list order; duplicates of one name follow document order.
    $kept = '';
    foreach ($keep as $wanted) {
        $wanted = strtolower($wanted);
        foreach ($segments as $segment) {
            if ($segment[0] === $wanted) {
                $kept .= ' ' . $segment[1];
            }
        }
    }

    return $kept . $tail;
}
/**
 * Echo a labelled <TEXTAREA> holding $str, for inspecting intermediate results.
 *
 * @param mixed  $str   Text to display; an array is rendered with print_r() first.
 * @param string $title Label printed in front of the textarea (emitted as-is).
 * @param bool   $debug When false, nothing is printed.
 * @return void
 */
function show($str, $title = '', $debug = True) {
    if (!$debug) {
        return;
    }

    if (is_array($str)) {
        $str = print_r($str, true);
    }
    $str = (string) $str;

    // One row per line of text, plus one spare row.
    $txtRows = substr_count($str, "\n") + 2;

    // Escape the markup-significant characters so the text is shown literally:
    // unescaped, a "</textarea>" in the content would end the box early and
    // entities such as "&lt;" would be displayed decoded. The replacement is
    // byte-wise on ASCII characters only, so it does not depend on the
    // configured charset.
    $escaped = str_replace(
        array('&', '<', '>'),
        array('&amp;', '&lt;', '&gt;'),
        $str
    );

    echo($title.':<br><TEXTAREA NAME="txt" ROWS="'.$txtRows.'" COLS="130">'.$escaped.'</TEXTAREA><br><br>');
}
- ?>
复制代码
运行结果
原字串:
- <A style="a" target=_blank href='http://www.a.com' xxx=xadsfa alt="a a a" style="aa">site a</A>
- <A alt='b b b' xxx=xadsfa target=_blank href='http://www.b.com' style="b" style="bb">site b</A>
- <A xxx=xadsfa style="c" href='http://www.c.com' target=_blank alt=c c c style="cc">site c</A>
- <A style="d" href='http://www.d.com' xxx=xadsfa alt=d d d target=_blank style="dd">site d</A>
- <A target=_blank style="e" xxx=xadsfa style="ee" alt=e e e href='http://www.e.com'>site e</A>
- <p align=right style="font-size:10px">adasdfasdf</p>
- <p style="font-color:red;" align='left'>asdfasdfasdfasdf</p>
- <p align=left right center>asdfasdfasdf</p>
- <font color="red" alt=adasd adsasd>asdfadsfasdf</font>
- <font align='left' color=red>asdfasdfadf</font>
- <font align=left right color=red black>asdfasdf</font>
复制代码
·过滤[a]标签,保留属性:href,target,alt
·过滤[p]标签,保留属性:align
·过滤[font]标签,保留属性:color,alt
- <A href='http://www.a.com' target=_blank alt="a a a">site a</A>
- <A href='http://www.b.com' target=_blank alt='b b b'>site b</A>
- <A href='http://www.c.com' target=_blank alt=c c c>site c</A>
- <A href='http://www.d.com' target=_blank alt=d d d>site d</A>
- <A href='http://www.e.com' target=_blank alt=e e e>site e</A>
- <p align=right>adasdfasdf</p>
- <p align='left'>asdfasdfasdfasdf</p>
- <p align=left right center>asdfasdfasdf</p>
- <font color="red" alt=adasd adsasd>asdfadsfasdf</font>
- <font color=red>asdfasdfadf</font>
- <font color=red black>asdfasdf</font>
复制代码 |
|