- 论坛徽章:
- 0
|
本帖最后由 luojiannx 于 2010-11-29 22:02 编辑
用火车头和狂人总是这里那里的不如意,搞来搞去,一烦躁,觉着还不如自己写个,于是就这么做了。
本采集器功能:采集cnbeta新闻,同时采集图片,发布于discuz!X1.5论坛。
包含的两个文件config.inc.php、db_mysql.class.php是我以前用惯了的phpwind里面的。
用了一些我很早以前写的旧代码,能工作,但目前已知的bug或者缺陷有:
1、偶尔(采集几百帖后)会在保存几十个图片的时候数据库连接中断,用了持久连接(pconnect)也没用。
2、帖数的统计要手动在后台更新
3、当日新帖无体现
4、因为用的是nginx,输出缓冲很讨嫌,难以及时输出调试信息,所以在debug()函数里用str_repeat填充了4KB的空格来撑满缓冲区(从而立即输出)。如果你用的是apache,可以把debug()函数中str_repeat那一行去掉,因为它很占浏览器内存。
这是第一次写面向对象的实际应用程序,因为想来想去,也只有这样写比较简洁,条理清楚。
程序质量虽然不高,但贵在实用,网上同样功能的程序(还没这么好用)有卖到150的。
- <?php
// Entry script: walks an inclusive range of cnbeta article ids taken from the
// query string (?startid=..&endid=..), fetches every id that is not yet listed
// in the `cnbetaid` table, and publishes each article via dopost().
define('IN_NXDLJ', 'index');
// NOTE(review): $dbhost/$dbuser/$dbpw/$pconnect are presumably defined by
// config.inc.php, and dbstuff by db_mysql.class.php - confirm.
include 'config.inc.php';
include 'include/db_mysql.class.php';
ob_implicit_flush(true);
global $DEBUG;
$DEBUG=true;
echo "DEBUG=".$DEBUG."</br>";
debug("debug test success</br>");

$db = new dbstuff;
$db->connect($dbhost, $dbuser, $dbpw, 'haixin_portal', $pconnect);
// Drop the credentials from scope once the connection is open.
unset($dbhost, $dbuser, $dbpw, $pconnect);

// A missing query parameter falls back to 0 instead of raising an
// undefined-index notice (a warning on PHP 8).
$startid = isset($_GET['startid']) ? intval($_GET['startid']) : 0;
$endid = isset($_GET['endid']) ? intval($_GET['endid']) : 0;
if ($endid < $startid) {
    // Accept the bounds in either order.
    list($startid, $endid) = array($endid, $startid);
}

debug("start </br>");
for ($id = $startid; $id <= $endid; $id++) {
    $sql = "select id from cnbetaid where id=$id";
    $done = intval($db->get_one($sql));
    debug($id.".done=".$done."</br>");
    if ($done) {
        continue;
    }

    // NOTE(review): the id is recorded as handled BEFORE the fetch, so an
    // article that fails transiently is never retried on a later run.
    // Confirm whether this is intended (it does avoid re-requesting ids
    // that genuinely do not exist).
    $sql = "INSERT INTO cnbetaid (`id`) VALUES (\"$id\");";
    $db->query($sql);

    $lj = new cnbeta($id);
    $subject = $lj->getsubject();
    $timestamp = strtotime($lj->gettime());
    $message = $lj->getmessage();
    // strtotime() yields false on an unparsable date; the loose == 0 covers it.
    if ($timestamp == 0 || $message == "" || !$subject) {
        debug($id. " not found</br>");
        continue;
    }

    dopost($subject, $timestamp, $message, $db);
    debug( $id."done</br>");
}
exit(" All done");
/**
 * One cnbeta article, fetched over HTTP at construction time.
 *
 * The constructor downloads the article page, rewrites the topic icon to a
 * local path, extracts the body between the "news_content" and "digbox"
 * markers, mirrors every image referenced in that body through SaveHTTPFile()
 * and rewrites the image URLs to the local copies.
 */
class cnbeta {
    /** Article id the page was requested for. */
    private $id;
    /** Raw page (headers + HTML) after the topic-icon rewrite; "" on failure. */
    private $content="";
    /** Article body with image URLs rewritten to local paths; "" on failure. */
    private $message="";

    /**
     * @param int $id cnbeta article id (used in /articles/<id>.htm).
     */
    public function __construct($id)
    {
        debug("__construct START </br>");
        $url="http://www.cnbeta.com/articles/".$id.".htm";
        $ref_url="http://www.cnbeta.com/index.php";
        $this->id=$id;
        $out=curl_grab_page($url, $ref_url, " ", "false", "null", "false");
        $pattern = "/<div id=\"news_content\"><a href=\"\/topics\/(\d+)\.htm\" ><img src=\"http:\/\/img\.cnbeta\.com\/topics\/(.*)\" alt=\"(.*)\" name=\"sign\" align=\"right\" id=\"sign\" onload=\"fixPNG\(this\)\"\/><\/a>/";
        $replacement = "<div id=\"news_content\"><img src=\"/portal/topics/\$2\" alt=\"\$3\" name=\"sign\" align=\"right\" id=\"sign\" />";
        // preg_replace() returns null on a PCRE error and curl may hand back
        // false; the cast keeps $content a string for the getters below.
        $this->content=(string)preg_replace($pattern, $replacement, (string)$out);
        if($this->content === "")
        {
            return;
        }

        debug( "__construct UPLOAD START </br>");
        $body=strbetween($this->content,"<div id=\"news_content\">","<div class=\"digbox\">");
        $pattern="/http:\/\/(\w+\.)+([0-9.]+|net|com|cn|org|cc|tv)(\S*\/)(\S)+\.(gif|GIF|jpg|jpeg|JPG|png|PNG|bmp|BMP)/i";
        preg_match_all($pattern,$body,$found,PREG_PATTERN_ORDER);

        // Remembers URLs already mirrored: the first str_replace() rewrites
        // every occurrence, so a repeated URL must not be downloaded again.
        $mirrored=array();
        // Iterate exactly over the matches (the old "<= count()" loop read
        // one index past the end).
        foreach($found[0] as $i => $imageUrl)
        {
            if($imageUrl=='' || isset($mirrored[$imageUrl]))
            {
                continue;
            }
            $mirrored[$imageUrl]=true;

            // Zero-padded timestamp + article id + match index: unambiguous,
            // and unique across articles handled within the same second.
            // $found[5] is the file-extension capture group of the pattern.
            $filename=date('YmdHis')."_".$this->id."_".$i.".".$found[5][$i];
            SaveHTTPFile($imageUrl,"/portal/cnbeta/images",$filename);

            $body=str_replace($imageUrl,"/portal/cnbeta/images/".$filename,$body);

            debug( "__construct UPLOADING...... </br>");
        }
        $this->message=$body;
        debug( "__construct UPLOAD END </br>");
    }

    /**
     * @return string Article body with local image paths, "" if the fetch failed.
     */
    public function getmessage()
    {
        return $this->message;
    }

    /**
     * @return string Text between the news_title <h3> tags of the fetched page.
     */
    public function getsubject()
    {
        $a="<h3 id=\"news_title\">";
        $b="</h3>";
        return strbetween($this->content,$a,$b);
    }

    /**
     * @return string Publication time text found between the "发布于 " label
     *                and the counter script tag (the caller feeds it to strtotime()).
     */
    public function gettime()
    {
        $a="发布于 ";
        $b="|<script src=\"/counter.php?sid=";
        return strbetween($this->content,$a,$b);
    }
}
/**
 * Returns the text found between the first occurrence of $a and the first
 * occurrence of $b that follows it.
 *
 * @param string $content Text to search.
 * @param string $a       Opening marker.
 * @param string $b       Closing marker.
 * @return string The enclosed text, or "" when either marker is empty or
 *                missing (or $b does not occur after $a).
 */
function strbetween($content,$a,$b){
    $content=(string)$content;
    $a=(string)$a;
    $b=(string)$b;
    if($a==='' || $b===''){
        return "";
    }

    $open=strpos($content,$a);
    if($open===false){
        // Previously false was silently treated as offset 0.
        return "";
    }
    $start=$open+strlen($a);

    // Search for the closing marker only AFTER the opening one; an earlier
    // occurrence used to produce a negative length.
    $close=strpos($content,$b,$start);
    if($close===false){
        return "";
    }

    return substr($content,$start,$close-$start);
}
/**
 * Fetches a URL with cURL and returns the raw response (headers included,
 * because CURLOPT_HEADER is enabled), following redirects.
 *
 * @param string $url         Address to fetch.
 * @param string $ref_url     Value sent as the Referer header.
 * @param mixed  $data        POST payload, only used when $login == 'true'.
 * @param string $login       Pass 'true' to send $data as a POST request.
 * @param string $proxy       Proxy address, only used when $proxystatus == 'true'.
 * @param string $proxystatus Pass 'true' to tunnel through $proxy.
 * @return string|false Response headers + body, or false on failure.
 */
function curl_grab_page($url,$ref_url,$data,$login,$proxy,$proxystatus){
    debug( "curl_grab_page START...... </br>");
    $user_agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
    $cookie_jar = tempnam('./tmp','cookie');

    $ch = curl_init();
    if ($ch === false) {
        if ($cookie_jar !== false && is_file($cookie_jar)) {
            unlink($cookie_jar);
        }
        debug("curl_grab_page END...... </br>");
        return false;
    }

    if ($cookie_jar !== false) {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_jar);
    }
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
    curl_setopt($ch, CURLOPT_TIMEOUT, 40);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    if ($proxystatus == 'true') {
        curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, TRUE);
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
    }
    // NOTE(review): TLS verification is switched off. Harmless for the
    // plain-http URLs used in this file, unsafe for https targets.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_REFERER, $ref_url);
    curl_setopt($ch, CURLOPT_HEADER, TRUE);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    if($login == 'true'){
        curl_setopt($ch, CURLOPT_POST, TRUE);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    }

    // The buffer swallows anything cURL might print while executing.
    ob_start();
    $out=curl_exec($ch);
    ob_end_clean();

    // The cookie jar is written when the handle is released, so release it
    // first and only then delete the temp file; previously one cookie file
    // was leaked per request.
    curl_close ($ch);
    unset($ch);
    if ($cookie_jar !== false && is_file($cookie_jar)) {
        unlink($cookie_jar);
    }

    debug("curl_grab_page END...... </br>");
    return $out;
}
/**
 * Current Unix timestamp with microsecond resolution.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
function getmicrotime(){
    // microtime(true) returns the float directly; no need to split and
    // re-add the "usec sec" string form.
    return microtime(true);
}
/**
 * Downloads a remote file and stores it under /var/www/html/.
 *
 * @param string $fFileHTTPPath URL of the file to download.
 * @param string $fFileSavePath Directory below /var/www/html/ to save into.
 * @param string $fFileSaveName File name to save as.
 * @return float Elapsed time in seconds (returned whether or not the
 *               download/save succeeded, as before).
 */
function SaveHTTPFile($fFileHTTPPath,$fFileSavePath,$fFileSaveName)
{
    // Record when the transfer started.
    $BeginTime=getmicrotime();

    // Absolute path of the local copy.
    $fFileSaveName="/var/www/html/".$fFileSavePath."/".$fFileSaveName;

    // Read the remote file straight into a string. The previous
    // readfile()-into-output-buffer approach could capture warnings or stray
    // output into the image, and it tore down the caller's output buffer.
    $img=file_get_contents($fFileHTTPPath);

    // Only write when something was actually downloaded, and overwrite rather
    // than append: appending to an existing file produced corrupt images.
    if($img!==false && $img!=='')
    {
        file_put_contents($fFileSaveName,$img);
    }

    // Record when the transfer finished and report the duration.
    $EndTime=getmicrotime();
    return($EndTime-$BeginTime);
}
/**
 * Publishes one article as a new thread: inserts a row into forum_thread,
 * a first-post row into forum_post, and a row into forum_post_tableid.
 *
 * @param string $subject   Thread/post title.
 * @param int    $timestamp Unix time used as dateline and lastpost.
 * @param string $message   Post body (stored with htmlon = 1).
 * @param object $db        Database handle exposing update() and insert_id().
 */
function dopost($subject,$timestamp,$message,$db)
{
    debug( "dopost START </br>");

    // Fixed target forum and posting account shared by both rows.
    $forumId=38;
    $posterName="罗建";
    $posterId=24;

    $threadRow=array(
        'fid'=>$forumId,
        'posttableid'=>0,
        'readperm'=>0,
        'price'=>0,
        'typeid'=>0,
        'sortid'=>0,
        'author'=>$posterName,
        'authorid'=>$posterId,
        'subject'=>$subject,
        'dateline'=>$timestamp,
        'lastpost'=>$timestamp,
        'lastposter'=>$posterName,
        'displayorder'=>0,
        'digest'=>0,
        'special'=>0,
        'attachment'=>0,
        'moderated'=>0,
        'status'=>0,
        'isgroup'=>0
    );
    $threadSql="insert into forum_thread "." SET ".pwSqlSingle($threadRow);
    $db->update($threadSql);

    // The id generated for the thread links the first post to it.
    $threadId=$db->insert_id();

    $postRow=array(
        'fid'=>$forumId,
        'tid'=>$threadId,
        'first'=>'1',
        'author'=>$posterName,
        'authorid'=>$posterId,
        'subject'=>$subject,
        'dateline'=>$timestamp,
        'message'=>$message,
        'useip'=>'10.232.154.8',
        'invisible'=>0,
        'anonymous'=>0,
        'usesig'=>1,
        'htmlon'=>1,
        'bbcodeoff'=>1,
        'smileyoff'=>1,
        'parseurloff'=>1,
        'attachment'=>'0',
        'tags'=>''
    );
    $postSql="insert into forum_post "." SET ".pwSqlSingle($postRow);
    $db->update($postSql);

    // Register the id generated by the post insert in forum_post_tableid.
    $postId=$db->insert_id();
    $tableIdSql="insert into forum_post_tableid"." set ".pwSqlSingle(array('pid'=>$postId));
    $db->update($tableIdSql);

    debug("dopost END </br>");
}
/**
 * Emits a debug message immediately when the global $DEBUG flag is truthy.
 *
 * The message is followed by 4 KiB of spaces so that buffering layers between
 * PHP and the browser are filled and the text shows up right away.
 *
 * @param string $str Message to print (may contain HTML).
 */
function debug($str)
{
    global $DEBUG;

    if (!$DEBUG) {
        return;
    }

    ob_start();
    echo $str, str_repeat(' ', 4 * 1024);
    // Push the buffered text down one level, then flush it to the client.
    ob_flush();
    flush();
    ob_end_clean();
}
- ?>
复制代码 采集效率大概在半个小时800帖左右,要看网速以及文章图片的多少。 |
|