注意:看到还有人在找用这个程序,我在此说明一下,新浪博客在去年做过改版调整,以下抓取程序已经不能用了,请周知 仅做为学习PHP代码的示例,如果以后有空再写新版的抓取程序,多谢! <?php
/**
 * Sina blog article-list scraper (teaching example).
 *
 * Request parameters (query string):
 *   - username: blog user name appended to http://blog.sina.com.cn/ (default 'liuxingliang').
 *   - page:     1-based page number of the article list (default 1, clamped to the valid range).
 *
 * Flow: fetch the blog home page, extract the blog title and a numeric user id,
 * fetch the article count, fetch one page of the article list, then print the
 * articles as an HTML table followed by pagination links.
 *
 * Every value that originates from the request or from the remote pages is
 * HTML-escaped before it is echoed.
 */

error_reporting(0);
set_time_limit(0);

// NOTE(review): header() has no effect once output has started. Any text that
// sits before the opening PHP tag of this file is emitted first, so keep the
// file free of leading text if the charset header matters.
header("Content-Type:text/html; charset=utf-8");

/**
 * Escape a string for safe use in HTML text and quoted attribute values.
 *
 * @param string $text Raw text.
 * @return string Escaped text.
 */
function escape_html($text)
{
    return htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
}

/**
 * Turn an HTML fragment scraped from a remote page into safe display text.
 *
 * Tags are removed and the remainder is escaped. Entities already present in
 * the fragment (e.g. "&amp;") are not encoded a second time.
 *
 * @param string $html Fragment captured from the remote page.
 * @return string Text that is safe to echo into the page.
 */
function clean_remote_text($html)
{
    return htmlspecialchars(strip_tags($html), ENT_QUOTES, 'UTF-8', false);
}

/**
 * Download a URL and return its body, terminating the script on failure.
 *
 * Uses cURL when the extension is loaded, otherwise file_get_contents().
 *
 * @param string $url Absolute URL to fetch.
 * @return string Response body (never empty; the script dies instead).
 */
function get_url_content($url)
{
    if (extension_loaded('curl')) {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // set_time_limit(0) removes PHP's own limit, so bound the request
        // itself; otherwise an unresponsive host blocks the script forever.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $content = curl_exec($ch);
        curl_close($ch);
    } else {
        $content = file_get_contents($url);
    }

    if ($content === false || $content === '') {
        // The URL is built from request input, so escape it before echoing.
        die("获取地址:" . escape_html($url) . " 内容出错.");
    }

    return $content;
}

// Sina blog user name. Only a non-empty string is accepted; anything else
// (missing, empty, array-valued) falls back to the default.
$username = 'liuxingliang';
if (!empty($_GET['username']) && is_string($_GET['username'])) {
    $username = $_GET['username'];
}

// Encode the user name so it cannot alter the structure of the request URL.
$home_url = "http://blog.sina.com.cn/" . rawurlencode($username);
$content = get_url_content($home_url);

// Blog title (optional: the heading is skipped when it cannot be found).
$blog_title = '';
if (preg_match('/id="blogname"\s*><a\s*.*?>(.*?)<\/a>/', $content, $matches)) {
    $blog_title = $matches[1];
}

// Numeric user id.
// NOTE(review): this pattern matches the first double-quoted run of digits
// anywhere in the page; it looks like part of the original pattern was lost
// when the code was posted. Confirm against the real page markup.
if (!preg_match("/\"([0-9]+)\"/i", $content, $matches)) {
    die("获取用户编号出错.");
}
$uid = $matches[1];

// Total number of articles.
$config_url = "http://blogcnf.sinajs.cn/acate?jv=x&{$uid}";
$content = get_url_content($config_url);
if (!preg_match('/:{"total":([0-9]+)/', $content, $matches)) {
    die("获取博客总数出错.");
}
$total = (int) $matches[1];
$perpage = 50;

// Number of list pages; at least 1 so the requested page is always valid.
$totalpage = max(1, (int) ceil($total / $perpage));

// Requested page, clamped to [1, $totalpage] (rejects 0, negatives, overshoot).
$page = empty($_GET['page']) ? 1 : intval($_GET['page']);
$page = max(1, min($page, $totalpage));

$blog_url = "http://blog.sina.com.cn/s/articlelist_{$uid}_0_{$page}.html";

if ($blog_title !== '') {
    echo "<h1>新浪博客:<a href=\"" . escape_html($home_url) . "\" target=\"_blank\">"
        . clean_remote_text($blog_title) . "</a></h1>";
}

// Article list: capture groups are 1 = article URL, 2 = title, 3 = timestamp.
$blog_url_content = get_url_content($blog_url);
$pattern = '/<div class="articleTitle_d">\s*<div class="floatLeft">.*?<a.*?href="(http:\/\/blog.sina.com.cn\/s\/blog_.*?\.html)".*?>(.*?)<\/a>.*?<span class="time space_d01">\((.*?)\)<\/span><\/div>/s';
if (!preg_match_all($pattern, $blog_url_content, $matches)) {
    // Bail out before the table is opened so the page is not left with an
    // unclosed <table>.
    die("没有匹配的记录");
}

echo "<table><thead><tr><th colspan=\"2\">记录总数:$total</th></tr></thead><tbody>";
$count = count($matches[0]);
for ($i = 0; $i < $count; $i++) {
    $article_href = escape_html($matches[1][$i]);
    $article_title = clean_remote_text($matches[2][$i]);
    $article_time = clean_remote_text($matches[3][$i]);
    echo "<tr><td><a href=\"{$article_href}\" target=\"_blank\">{$article_title}</a></td><td>{$article_time}</td></tr>";
}
echo "</tbody></table>";

// Pagination links (only when there is more than one page).
if ($total > $perpage) {
    // PHP_SELF can carry user-controlled path info, so the finished link is
    // escaped as a whole before it is written into the href attribute.
    $php_self = !empty($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : $_SERVER['SCRIPT_NAME'];
    $base_link = $php_self . '?username=' . urlencode($username);
    for ($num = 1; $num <= $totalpage; $num++) {
        if ($page == $num) {
            echo $num . " ";
        } else {
            echo "<a href=\"" . escape_html($base_link . '&page=' . $num) . "\">$num</a> ";
        }
    }
}
?> |
欢迎光临 Chinaunix (http://bbs.chinaunix.net/) | Powered by Discuz! X3.2 |