tkchks 发表于 2011-12-23 02:01

PHP信息采集程序源码

[ 本帖最后由 tkchks 于 2011-12-25 19:24 编辑 ]

此信息采集程序的实现原理:首先,给定一个文章列表页面的URL,通过crawl爬行抓取该页面中各篇article的URL;然后,对同一个web应用的每一篇文章而言,其title、content都是可以定位到的,通过匹配即可取出其中内容。此程序包含两个页面,保存为*.php后可以运行在SAE平台下,以下是详细代码:
crawler.php
<h2>Please input a valid url to CRAWLING</h2>
<form action="" method="post">
    <input type="text" name="site" value="http://www.baidu.com" style="width:200px;" />
    <input type="submit" name="submit" value="CRAWLING" />
</form>
<?php
// Crawl handler: fetches the posted page, keeps only its <a> tags, turns
// root-relative and query-only hrefs into absolute URLs, and dumps every
// matched link as escaped text inside a <pre> block.
if (isset($_POST['submit'])) {
    $link_to_dig = (isset($_POST['site']) && is_string($_POST['site'])) ? trim($_POST['site']) : '';

    // The URL is echoed back in error messages, so escape it once up front.
    $safe_link = htmlspecialchars($link_to_dig, ENT_QUOTES);

    // Validate before fetching: parse_url() returns false or omits keys for
    // malformed input, and only web URLs make sense for this tool.
    $path_info = parse_url($link_to_dig);
    if ($path_info === false
        || !isset($path_info['scheme'], $path_info['host'])
        || !in_array(strtolower($path_info['scheme']), array('http', 'https'), true)) {
        die("Invalid URL {$safe_link}");
    }

    // SaeFetchurl is the fetch service provided by the SAE platform.
    $f = new SaeFetchurl();
    $original_file = $f->fetch($link_to_dig);
    if (!$original_file) {
        die("Error loading {$safe_link}");
    }

    // scheme://host[:port] - the prefix for root-relative links.
    $base = $path_info['scheme'] . "://" . $path_info['host'];
    if (isset($path_info['port'])) {
        $base .= ":" . $path_info['port'];
    }
    // The page URL without query/fragment - the prefix for "?query" links.
    $page_url = $base . (isset($path_info['path']) ? $path_info['path'] : "/");

    // "$" and "\" are special inside a preg_replace() replacement string.
    $base_repl = addcslashes($base, '\\$');
    $page_repl = addcslashes($page_url, '\\$');

    $stripped_file = strip_tags($original_file, "<a>");
    // href="/path" -> href="{base}/path"; the lookahead leaves protocol-relative
    // links (href="//host/...") untouched instead of gluing the base onto them.
    $fixed_file = preg_replace("/<a([^>]*)href=\"\/(?!\/)/is", '<a$1href="' . $base_repl . '/', $stripped_file);
    // href="?query" -> href="{page}?query"
    $fixed_file = preg_replace("/<a([^>]*)href=\"\?/is", '<a$1href="' . $page_repl . '?', $fixed_file);
    preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $fixed_file, $matches);

    // Byte-wise escaping is used on purpose: the fetched page may not be UTF-8,
    // and this replacement does not depend on the page encoding.
    $result = print_r($matches, true);
    $result = str_replace(array("&", "<", ">"), array("&amp;", "&lt;", "&gt;"), $result);
    print "<pre>" . $result . "</pre>";
}

?>
gatherinformation.php
<html >
<head>
<title>Gather information</title>
</head>
<body>
<h2>Gather information</h2>
<form action="" method="post">
    <table style="width:800px;">
   <tr><th>URL:</th><td ><input type="text" name="url" value="http://hnnu.myubbs.com/viewthread.php?tid=108154" style="width:400px;" />increase tid if you like</td></tr>
   <tr><th>Start:</th><td><input type="text" name="start" value="" style="width:400px;" />Input: <?php print htmlentities("<div class=\"t_msgfontfix\">"); ?></td></tr>
   <tr><th>End:</th><td><input type="text" name="end" value="" style="width:400px;" />Input: <?php print htmlentities("<div id=\"post_rate_div_"); ?></td></tr>
   <tr><th></th><td><input type="submit" name="submit" value="Grasp" /></td></tr>   
    </table>
<div>
      <?php
        // Form handler: downloads the posted URL and prints the part of the
        // page found between the "start" and "end" marker strings.
        if (isset($_POST['submit'])) {
            $url   = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';
            $start = (isset($_POST['start']) && is_string($_POST['start'])) ? $_POST['start'] : '';
            $end   = (isset($_POST['end']) && is_string($_POST['end'])) ? $_POST['end'] : '';

            // file_get_contents() also opens local paths and php:// wrappers,
            // so only let real web URLs through.
            $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
            if ($scheme !== 'http' && $scheme !== 'https') {
                echo 'Only http:// and https:// URLs can be fetched.';
            } elseif ($source = file_get_contents($url)) {
                // NOTE(review): the extracted fragment is printed as raw HTML so
                // that it renders; it is untrusted markup from the remote site.
                echo extract_content($source, $start, $end);
            }
        }
      ?>
</div>
</form>
</body>
</html>
<?php

/**
 * Return the trimmed text located between two markers.
 *
 * Both markers are searched case-insensitively; the end marker is searched
 * only in the text that follows the start marker.
 *
 * @param string $string Text to search in.
 * @param string $start  Marker that precedes the wanted text; an empty
 *                       marker means "from the beginning of $string".
 * @param string $end    Marker that follows the wanted text.
 * @return string The text between the markers with surrounding whitespace
 *                removed, or '' when either marker cannot be found.
 */
function extract_content($string,$start,$end)
{
    $string = (string) $string;
    $start = (string) $start;
    $end = (string) $end;

    // stripos() returns false when the marker is missing; without this check
    // false would be used as offset 0 and an unrelated slice returned.
    $start_pos = ($start === '') ? 0 : stripos($string, $start);
    if ($start_pos === false) {
        return '';
    }

    // Everything after the start marker.
    $tail = (string) substr($string, $start_pos + strlen($start));

    $end_pos = ($end === '') ? false : stripos($tail, $end);
    if ($end_pos === false) {
        return '';
    }

    return trim(substr($tail, 0, $end_pos));
}
/**
 * Collect the text content of every element with the given tag name.
 *
 * NOTE(review): loadHTML() guesses the character set from the document
 * itself; pages without a charset declaration may come out mis-decoded -
 * confirm against the pages this is used on.
 *
 * @param string $tag    Tag name to look for, e.g. "b".
 * @param string $html   Markup to parse.
 * @param int    $strict 1 to parse $html as XML, anything else to parse it
 *                       as HTML.
 * @return array List of nodeValue strings in document order; empty when the
 *               input is blank or contains no such element.
 */
function getTextBetweenTags($tag, $html, $strict=0)
{
    $out = array();

    // Nothing to parse; this also avoids the loader's empty-input failure.
    if (trim((string) $html) === '') {
        return $out;
    }

    $dom = new DOMDocument('1.0', 'utf-8');

    /*** discard white space - has to be configured before the document is loaded ***/
    $dom->preserveWhiteSpace = false;

    // Collect parser complaints internally instead of emitting one PHP warning
    // per problem; the caller's libxml setting is restored afterwards.
    $previous = libxml_use_internal_errors(true);
    if($strict==1)
    {
      $dom->loadXML($html);
    }
    else
    {
      $dom->loadHTML($html);
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    foreach ($dom->getElementsByTagName($tag) as $item)
    {
      $out[] = $item->nodeValue;
    }
    return $out;
}

/**
 * Find the first "<$start>...</$end>" span inside $string.
 *
 * Both markers are inserted into the regular expression as-is, so they are
 * interpreted as regex fragments. The match is non-greedy and, because no
 * modifiers are used, does not cross line breaks.
 *
 * @param string $string Text to search in.
 * @param string $start  Name placed inside the opening "<...>".
 * @param string $end    Name placed inside the closing "</...>".
 * @return array The preg_match() captures: index 0 is the whole span,
 *               index 1 the text between the tags; empty when nothing matched.
 */
function getTextBetweenComments($string, $start,$end)
{
    $regex = sprintf('/<%s>(.*?)<\/%s>/', $start, $end);
    preg_match($regex, $string, $captures);

    return $captures;
}

/**
 * Return up to $count bytes of $source that directly follow the first
 * occurrence of $start.
 *
 * @param string $source Text to search in.
 * @param string $start  Marker to look for (case-sensitive); an empty marker
 *                       means "from the beginning of $source".
 * @param int    $count  Maximum number of bytes to return.
 * @return string The slice after the marker, or '' when the marker does not
 *                occur in $source.
 */
function search($source,$start,$count){
      // strpos() returns false when the marker is missing; without this check
      // false would count as offset 0 and an unrelated slice would be returned.
      $offset = ($start === '') ? 0 : strpos($source, $start);
      if ($offset === false) {
            return '';
      }
      return substr($source, $offset + strlen($start), (int)$count);
}


/**
 * Demo: split a "<b>street</b> - city" string and print the list of cities
 * inside a <pre> block.
 */
function test(){
    $string ='<b>Street23</b> -Paris<b>Street 33</b>- Berlin <b>Street 453</b>- London';
    $desired_array = array();
    foreach(explode("<b>",$string) as $value)
    {
        $value = trim(strip_tags($value));
        if($value === '')
        {
            // The piece in front of the first <b> is empty.
            continue;
        }
        // "street - city": only the first dash separates the two parts.
        $parts = explode("-",$value,2);
        if(count($parts) < 2)
        {
            continue;
        }
        // Append the city; a plain assignment here would overwrite the array
        // so that only the last city survived.
        $desired_array[] = trim($parts[1]);
    }
    echo "<pre>";
    print_r($desired_array);
    echo "</pre>";
}

?>
页: [1]
查看完整版本: PHP信息采集程序源码