- 论坛徽章:
- 0
|
本帖最后由 网鬼 于 2010-06-30 18:06 编辑
注意:看到还有人在寻找和使用这个程序,我在此说明一下:新浪博客在去年做过改版调整,以下抓取程序已经不能用了,请周知。
以下代码仅作为学习 PHP 的示例;如果以后有空,我会再写新版的抓取程序,多谢!
<?php
// Script prologue. The order matters: error reporting is switched off first,
// so any warning raised by the two calls below is not displayed.
error_reporting(0);
// Remove the execution time limit; the script performs several remote fetches.
set_time_limit(0);
// Declare the response as HTML encoded in UTF-8.
header("Content-Type:text/html; charset=utf-8");
/**
 * Downloads the body of a URL and returns it as a string.
 *
 * cURL is used when the extension is loaded; otherwise the request falls back
 * to file_get_contents(). When nothing could be fetched (transport failure or
 * an empty body) the script is terminated with an error message.
 *
 * @param string $url Address to fetch.
 * @return string Non-empty response body.
 */
function get_url_content($url) {
    $content = false;
    if (extension_loaded('curl')) {
        $ch = curl_init($url);
        // curl_init() can return false; never hand that to the other curl_* calls.
        if ($ch !== false) {
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            $content = curl_exec($ch);
            curl_close($ch);
        }
    } else {
        $content = file_get_contents($url);
    }
    // Compare explicitly instead of relying on truthiness: a body consisting
    // of "0" is valid content, not a failed download.
    if (!is_string($content) || $content === '') {
        // The URL may contain request input and this message is sent as HTML,
        // so it is escaped before being echoed.
        die('获取地址:' . htmlspecialchars((string) $url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . ' 内容出错.');
    }
    return $content;
}
// ---------------------------------------------------------------------------
// Main script: looks up a Sina blog by account name, fetches one page of its
// article list and prints it as an HTML table followed by pagination links.
// Request parameters: "username" (blog account) and "page" (1-based page).
// ---------------------------------------------------------------------------

/**
 * Escapes text for use in HTML content or inside a quoted attribute.
 *
 * Entities that are already present are left untouched (double_encode is
 * false) because the scraped fragments may already be entity-encoded, and
 * invalid byte sequences are substituted rather than blanking the whole
 * string (ENT_SUBSTITUTE, PHP 5.4+).
 *
 * @param string $text Text to escape.
 * @return string Text that cannot open a tag or terminate an attribute.
 */
function sina_escape_html($text)
{
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
}

// Sina blog account name; an empty or non-string value (e.g. ?username[]=x)
// falls back to the sample account.
$username = (!empty($_GET['username']) && is_string($_GET['username']))
    ? $_GET['username']
    : 'liuxingliang';

// Percent-encode every path segment so the value cannot smuggle a query
// string or fragment into the request; slashes are kept as separators.
$username_path = implode('/', array_map('rawurlencode', explode('/', $username)));
$home_url = "http://blog.sina.com.cn/$username_path";
$content = get_url_content($home_url);

// Blog title (optional: the heading is simply omitted when it is not found).
if (preg_match('/id="blogname"\s*><a\s*.*?>(.*?)<\/a>/', $content, $matches)) {
    $blog_title = $matches[1];
}

// Numeric user id.
// NOTE(review): this pattern takes the first double-quoted run of digits found
// anywhere in the page; it may have lost a more specific prefix - confirm.
if (!preg_match("/\"([0-9]+)\"/i", $content, $matches)) {
    die("获取用户编号出错.");
}
$uid = $matches[1];

// Total number of articles, read from the blog's configuration endpoint.
$config_url = "http://blogcnf.sinajs.cn/acate?jv=x&{$uid}";
$content = get_url_content($config_url);
if (!preg_match('/:{"total":([0-9]+)/', $content, $matches)) {
    die("获取博客总数出错.");
}
$total = (int) $matches[1];
$perpage = 50;

// Number of list pages.
$totalpage = (int) ceil($total / $perpage);

// Requested page, clamped to 1..$totalpage. The lower bound also covers
// negative input and the case of a blog with no articles ($totalpage == 0).
$page = empty($_GET['page']) ? 1 : intval($_GET['page']);
$page = max(1, min($page, $totalpage));
$blog_url = "http://blog.sina.com.cn/s/articlelist_{$uid}_0_{$page}.html";

if (!empty($blog_title)) {
    echo '<h1>新浪博客:<a href="' . sina_escape_html($home_url) . '" target="_blank">'
        . sina_escape_html(strip_tags($blog_title)) . '</a></h1>';
}

// Fetch and parse the article list before any table markup is emitted, so a
// failure does not leave an unclosed <table> in the output.
$blog_url_content = get_url_content($blog_url);
$pattern = '/<div class="articleTitle_d">\s*<div class="floatLeft">.*?<a.*?href="(http:\/\/blog.sina.com.cn\/s\/blog_.*?\.html)".*?>(.*?)<\/a>.*?<span class="time space_d01">\((.*?)\)<\/span><\/div>/s';
if (!preg_match_all($pattern, $blog_url_content, $matches)) {
    die("没有匹配的记录");
}

// Article table: one row per match (link + title, then the date text).
// The record count goes into a <caption>; bare text is not allowed in <thead>.
echo "<table><caption>记录总数:$total</caption>";
$count = count($matches[0]);
for ($i = 0; $i < $count; $i++) {
    // Everything below comes from a remote page, so it is escaped on output;
    // tags inside the title are dropped.
    $link  = sina_escape_html($matches[1][$i]);
    $title = sina_escape_html(strip_tags($matches[2][$i]));
    $date  = sina_escape_html($matches[3][$i]);
    echo "<tr><td><a href=\"{$link}\" target=\"_blank\">{$title}</a></td><td>{$date}</td></tr>";
}
echo "</table>";

// Pagination links (only when there is more than one page).
if ($total > $perpage) {
    $php_self = !empty($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : $_SERVER['SCRIPT_NAME'];
    // PHP_SELF and the user name are both request-controlled: URL-encode the
    // parameter value and HTML-escape the complete attribute value.
    $base_href = sina_escape_html($php_self . '?username=' . urlencode($username));
    for ($num = 1; $num <= $totalpage; $num++) {
        if ($page == $num) {
            echo $num . " ";
        } else {
            echo "<a href=\"{$base_href}&amp;page=$num\">$num</a> ";
        }
    }
}
?> |
[ 本帖最后由 网鬼 于 2009-10-25 21:38 编辑 ] |
|