php实现的web采集神器

gjfer 发表于 2015-07-09 11:14

PHP 实现的 Web 采集神器：只需要通过简单配置，就可以采集任意没有严格校验的站点。
还可以扩展 IP 代理功能以及伪原创功能。

代码：

<?php
/**
*可以灵活配置使用的采集器
*作者：Rain
*创建时间：2015-02-03 15:17:30
*版本信息：V1.0
*/

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// ---- Database settings: adjust these to match your own database ----
const DB_HOST = 'localhost';
const DB_USER = 'root';
const DB_PWD = 'test123456';
const DB_NAME = 'test_dbname';
const DB_CHARSET = 'utf8';
const TABLE_NAME = 'tb_book';
// ---- end of database settings ----

// ---- Target-site settings: adjust these to the site being crawled ----
// Character set the remote pages are served in.
const WEB_CHARSET = 'gbk';
// List-page URL template. The varying part is written as %d; only numeric
// substitution is supported.
const WEB_LIST_URL = 'http://www.pcbookcn.com/book/1_%d.htm';
// Number of the last list page to crawl.
const PAGE_COUNT = 14;
// Number of the first list page to crawl.
const PAGE_START = 1;
// Regular expression (including its / delimiters) that recognises a
// content-page URL, e.g. /\/xuefu2008\/article\/details\/(\d)+/i
const WEB_CONTENT_URL_REG = '/\/book\/(\d)+\.htm/i';
// Scheme and host of the site, without a trailing slash,
// e.g. http://blog.csdn.net
const WEB_HOST = 'http://www.pcbookcn.com';
// Regular expression that isolates the part of a list page which holds the
// links to content pages.
const WEB_LIST_POSTION = '/book_name\.gif(.*?)<td\swidth="15\%"\snowrap>/i';
// ---- end of target-site settings ----

// ---- Fine tuning: the defaults normally do not need to be changed ----
// Pause, in seconds, after each processed content page.
const SLEEP_TIME = 1;
// When true, the generated SQL and every extracted field value are printed.
const IS_DEBUG = false;
// When false, records are extracted but the INSERT is not executed.
const INSERT_DB = true;
// Pause, in seconds, after each line of console output.
const OUTPUT_SPEED = 1;
// ---- end of fine tuning ----

// Text fragments to strip from every extracted value (search => replacement).
// Matching is case-insensitive; tailor the list to the crawled site.
$text_filter = array(
    '- 中华电脑书库' => '',
    '_电脑电子书' => '',
    '_电脑书籍' => '',
    '下载' => '',
);

// Mapping from table column to the source of its value.
// A value that starts with "/" is a regular expression run against the content
// page; any other value is stored as a literal constant. Every NOT NULL column
// of the table must appear here.
$table_mapping = array(
    'size' => '/软件大小.*?000000>(.*?)<\/font>/i',
    'logo' => 'http://www.94cto.com/index/uploads/images/20150105/0b8461910de101cc51a07684cdab797e.jpg',
    'field1' => '/<title>(.*?)<\/title>/i',
    'field2' => '/软件简介.*?000000>(.*?)<\/font>/i',
    'field3' => '1',
    'field4' => '1',
    'field5' => '1',
    'field6' => '电子书,计算机,图像,图形',
    'platform' => 'window/Linux',
    'ishot' => '1',
    'agreement' => '免费',
    'downurl' => '/(\/down\.asp\?id=.*?)"/i',
    'istop' => '1',
);
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Build the crawler (its constructor runs the environment checks) and start it.
$ga = new Gather();
$ga->run();

/**
 * Configurable web scraper.
 *
 * Walks the numbered list pages described by WEB_LIST_URL, extracts the
 * content-page links from each, pulls one value per configured table column
 * out of every content page and inserts the result as a row into TABLE_NAME.
 *
 * All settings are read from the constants and the two global arrays
 * ($table_mapping, $text_filter) defined at the top of the file.
 */
class Gather
{
    /**
     * Database connection shared by the whole run, opened by
     * check_mysql_connect().
     *
     * @var PDO|null
     */
    private $dbh = null;

    /**
     * Runs the environment checks; the script is terminated if one fails.
     */
    public function __construct()
    {
        $this->init_check();
    }

    /**
     * Crawls every list page from PAGE_START to PAGE_COUNT and stores the
     * records found on them. Always ends the script via write('', true).
     *
     * @return void
     */
    public function run()
    {
        // The configuration arrays live in the global scope of the script.
        global $table_mapping, $text_filter;

        if (!is_array($table_mapping) || empty($table_mapping)) {
            $this->write('对不起，$table_mapping 未配置任何字段，无法生成入库语句', true);
        }
        $filters = is_array($text_filter) ? $text_filter : array();

        // The INSERT statement is identical for every record, so it is built
        // and prepared once instead of once per content page.
        $sth = $this->prepare_insert(array_keys($table_mapping));

        for ($page = PAGE_START; $page <= PAGE_COUNT; $page++) {
            $this->write('开始采集列表第'.$page.'页的内容...');

            $urls = $this->collect_content_urls($page);
            if (empty($urls)) {
                continue;
            }

            $this->write('当前的列表页面，总共匹配到：'.count($urls).'个内容页');
            foreach ($urls as $url) {
                if ($this->gather_content_page($url, $table_mapping, $filters, $sth)) {
                    $this->write('休息，暂停'.SLEEP_TIME.'秒后继续抓取...');
                    sleep(SLEEP_TIME);
                }
            }
        }
        $this->write('', true);
    }

    /**
     * Downloads one list page and returns the content-page URLs found in it.
     *
     * @param int $page list page number substituted into WEB_LIST_URL
     * @return array list of distinct URLs as they appear in the page; empty
     *               when the page could not be fetched or nothing matched
     */
    private function collect_content_urls($page)
    {
        $list_content = $this->get(sprintf(WEB_LIST_URL, $page));
        if (empty($list_content)) {
            $this->write('抓取的列表页的内容为空，所以过滤掉');
            return array();
        }

        // Flatten the page to one line so "." in the patterns spans it all.
        $list_content = str_replace(array("\r", "\n"), '', $list_content);

        // Narrow the page down to the module that holds the links.
        if (!preg_match(WEB_LIST_POSTION, $list_content, $list_search)) {
            $this->write('精准匹配列表页的内容失败，所以过滤掉');
            return array();
        }
        // Prefer the first capture group; fall back to the whole match when
        // the configured pattern has no group.
        $list_content = isset($list_search[1]) ? $list_search[1] : $list_search[0];

        if (!preg_match_all(WEB_CONTENT_URL_REG, $list_content, $match) || empty($match[0])) {
            $this->write('列表页面没有抓取到内容，所以过滤掉');
            return array();
        }

        // $match[0] holds the full matches. A list page commonly links the
        // same item more than once, so duplicates are dropped.
        return array_values(array_unique($match[0]));
    }

    /**
     * Downloads one content page, extracts every mapped column and inserts
     * the record.
     *
     * @param string       $url     content-page URL, absolute or site-relative
     * @param array        $mapping column name => regex or literal value
     * @param array        $filters search => replacement, applied to each value
     * @param PDOStatement $sth     prepared INSERT with one :column placeholder
     *                              per mapping key
     * @return bool true when the page was processed, false when it was skipped
     */
    private function gather_content_page($url, array $mapping, array $filters, PDOStatement $sth)
    {
        $web_content = $this->get($this->absolute_url($url));
        if (empty($web_content)) {
            $this->write('抓取的内容页为空,所以过滤掉');
            return false;
        }

        // Newlines are swapped for a marker so the single-line patterns can
        // match across them; the marker is turned back into CRLF below.
        $web_content = str_replace("\r", '', $web_content);
        $web_content = str_replace("\n", '【】', $web_content);

        $params = array();
        foreach ($mapping as $field => $reg) {
            $reg = (string) $reg;
            if (substr($reg, 0, 1) != '/') {
                // Not a regular expression: store the configured constant.
                $value = $reg;
            } else {
                if (!preg_match($reg, $web_content, $tmp_match)) {
                    $this->write('对不起,匹配字段：'.$field.'失败，过滤此记录');
                    return false;
                }
                $value = $this->clean_html(isset($tmp_match[1]) ? $tmp_match[1] : $tmp_match[0]);
            }

            // Restore the real line breaks before the value is stored.
            $value = str_replace('【】', "\r\n", $value);

            foreach ($filters as $tk => $tv) {
                $value = str_ireplace($tk, $tv, $value);
            }

            // The download link is stored as an absolute URL.
            if ('downurl' == $field) {
                $value = $this->absolute_url($value);
            }

            if (IS_DEBUG) {
                $this->write('*'."\t".'字段：'.$field.'值：'."\n****************************************************\n".$value."\n****************************************************");
            }

            $params[':'.$field] = trim($value);
        }

        if (INSERT_DB) {
            try {
                $sth->execute($params);
            } catch (PDOException $e) {
                // One bad row must not abort the whole crawl.
                $this->write('写入数据库失败，具体的错误信息: '.$e->getMessage());
            }
        }
        $sth->closeCursor();

        return true;
    }

    /**
     * Normalises an extracted HTML fragment: balances tags, removes scripts,
     * unwraps links, makes image sources absolute and tags <pre> blocks.
     *
     * @param string $html fragment in which newlines are still the marker
     * @return string cleaned fragment
     */
    private function clean_html($html)
    {
        $html = $this->closetags($html);

        // Remove JavaScript blocks.
        $html = preg_replace('/<script\b(.*?)>(.*?)<\/script>/i', '', $html);

        // Replace every link by its inner content.
        $html = preg_replace('/<a\b(.*?)>(.*?)<\/a>/i', '${2}', $html);

        // Make image addresses absolute.
        $html = preg_replace_callback(
            '/(<img\b[^>]*?\bsrc=)(["\'])(.*?)\2/i',
            array($this, 'absolutize_img_src'),
            $html
        );

        // Give every <pre> block the class used for syntax highlighting.
        $html = preg_replace('/<pre\b[^>]*>(.*?)<\/pre>/i', '<pre class="prettyprint">${1}</pre>', $html);

        return $html;
    }

    /**
     * preg_replace_callback handler that rewrites the src of one <img> tag.
     *
     * @param array $m 1 = text up to and including "src=", 2 = quote character,
     *                 3 = original src value
     * @return string the same tag prefix with an absolute src
     */
    private function absolutize_img_src(array $m)
    {
        return $m[1].$m[2].$this->absolute_url($m[3]).$m[2];
    }

    /**
     * Turns a site-relative URL into an absolute one based on WEB_HOST.
     *
     * @param string $url URL as found in the page
     * @return string the trimmed URL, prefixed with WEB_HOST when it had no
     *                scheme; empty input is returned as an empty string
     */
    private function absolute_url($url)
    {
        $url = trim($url);

        // Anything that already carries a scheme (http:, https:, data:, ...)
        // is left alone.
        if ($url === '' || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)) {
            return $url;
        }

        // Protocol-relative URL: borrow the scheme of the configured host.
        if (substr($url, 0, 2) == '//') {
            $scheme = parse_url(WEB_HOST, PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'http').':'.$url;
        }

        if (substr($url, 0, 1) == '/') {
            return WEB_HOST.$url;
        }
        return WEB_HOST.'/'.$url;
    }

    /**
     * Builds and prepares the INSERT statement for the mapped columns.
     *
     * @param array $fields column names, in insertion order
     * @return PDOStatement statement with one :column placeholder per field
     */
    private function prepare_insert(array $fields)
    {
        $placeholders = array();
        foreach ($fields as $name) {
            $placeholders[] = ':'.$name;
        }
        $sql = 'INSERT INTO '.TABLE_NAME.'('.implode(', ', $fields).')VALUES('.implode(',', $placeholders).')';

        if (IS_DEBUG) {
            $this->write('执行SQL '.$sql);
        }

        // Connects on demand in case a subclass replaced init_check().
        $this->check_mysql_connect();
        try {
            return $this->dbh->prepare($sql);
        } catch (PDOException $e) {
            $this->write('预处理入库语句失败，具体的错误信息: '.$e->getMessage(), true);
        }
    }

    /**
     * Appends closing tags for elements that were opened but never closed.
     *
     * Opening tags are examined innermost-first. An opening tag consumes one
     * matching closing tag if one exists; otherwise its closing tag is
     * appended to the end of the fragment.
     *
     * @param string $html HTML fragment
     * @return string fragment with the missing closing tags appended
     */
    protected function closetags($html)
    {
        // Elements that never take a closing tag.
        $arr_single_tags = array(
            'meta', 'img', 'br', 'link', 'area', 'hr', 'input',
            'base', 'col', 'embed', 'param', 'source', 'track', 'wbr',
        );

        // Opening tags; the look-behind skips self-closed ones such as <br/>.
        preg_match_all('#<([a-z][a-z0-9]*)\b[^>]*(?<!/)>#i', $html, $result);
        $openedtags = array();
        foreach ($result[1] as $tag) {
            $tag = strtolower($tag);
            if (!in_array($tag, $arr_single_tags, true)) {
                $openedtags[] = $tag;
            }
        }

        // Closing tags.
        preg_match_all('#</([a-z][a-z0-9]*)\s*>#i', $html, $result);
        $closedtags = array_map('strtolower', $result[1]);

        // The most recently opened element has to be closed first.
        foreach (array_reverse($openedtags) as $tag) {
            $pos = array_search($tag, $closedtags, true);
            if ($pos === false) {
                $html .= '</'.$tag.'>';
            } else {
                // Each closing tag may satisfy only one opening tag.
                unset($closedtags[$pos]);
            }
        }

        return $html;
    }

    /**
     * Verifies that cURL is available and that the database can be reached.
     * The script is terminated when either check fails.
     */
    protected function init_check()
    {
        if (!$this->check_curl_support()) {
            $this->write('对不起，请先开启CURL的类库的支持，否则无法执行', true);
        }
        $this->check_mysql_connect();
        $this->write('程序初始化检查通过,执行后续的流程...');
    }

    /**
     * Fetches a URL and returns its body converted to UTF-8.
     *
     * @param string $url  absolute URL to download
     * @param array  $data extra HTTP request header lines
     * @return string|false page body in UTF-8, or false on a transfer or
     *                      conversion error
     */
    private function get($url, $data = array())
    {
        $this->write('开始执行抓取: '.$url);
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $data);
        // Without limits a single stalled server would hang the crawl forever.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        $ret = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);
        unset($ch);

        if ($ret === false || !empty($error)) {
            $this->write('程序抓取URL: '.$url.'发生错误，错误信息: '.$error);
            return false;
        }

        $is_utf8 = strcasecmp(WEB_CHARSET, 'utf-8') == 0 || strcasecmp(WEB_CHARSET, 'utf8') == 0;
        if (!$is_utf8) {
            // //IGNORE drops bytes that are invalid in the source charset
            // instead of discarding the whole page because of one bad byte.
            $converted = iconv(WEB_CHARSET, 'UTF-8//IGNORE', $ret);
            if ($converted === false) {
                $this->write('程序抓取URL: '.$url.'发生错误，错误信息: 编码转换失败');
                return false;
            }
            $ret = $converted;
        }
        return $ret;
    }

    /**
     * Opens the database connection, terminating the script on failure.
     *
     * The connection is kept in $this->dbh and reused for the inserts.
     * PDO is used because the mysql_* functions no longer exist in PHP 7+.
     */
    private function check_mysql_connect()
    {
        if ($this->dbh instanceof PDO) {
            return;
        }

        // The charset goes into the DSN so it applies from the first query.
        $dsn = 'mysql:host='.DB_HOST.';dbname='.DB_NAME.';charset='.DB_CHARSET;
        try {
            $this->dbh = new PDO($dsn, DB_USER, DB_PWD, array(
                PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            ));
        } catch (PDOException $e) {
            $this->write('程序无法成功链接到数据库,具体的错误信息:'.$e->getMessage(), true);
        }
    }

    /**
     * Reports whether the cURL extension is loaded and usable.
     *
     * @return bool
     */
    private function check_curl_support()
    {
        return extension_loaded('curl') && function_exists('curl_init');
    }

    /**
     * Prints a progress message followed by a blank line, then pauses for
     * OUTPUT_SPEED seconds.
     *
     * @param string $str message in UTF-8
     * @param bool   $end when true the script terminates after printing
     */
    private function write($str, $end = false)
    {
        // A path separator other than ":" is taken to mean a Windows console,
        // for which the message is converted to GBK.
        if (PATH_SEPARATOR != ':') {
            $converted = iconv('UTF-8', 'GBK//IGNORE', $str);
            // Print the original text rather than nothing if conversion fails.
            if ($converted !== false) {
                $str = $converted;
            }
        }
        echo $str,PHP_EOL,PHP_EOL;

        if ($end) {
            die("program exit");
        }

        sleep(OUTPUT_SPEED);
    }
}

renxiao2003 发表于 2015-07-10 09:53

这个到底是要采集什么。楼主只贴上代码。为什么不把功能说明白呢。

页: [1]

Chinaunix's Archiver

php实现的web采集神器