- 论坛徽章:
- 0
|
本帖最后由 ljx2380000 于 2013-08-01 08:58 编辑
关于simple_html_dom的使用,我的博客里讲得很清楚。Simple HTML DOM是一款非常强大的HTML DOM解析器,它能帮助我们用PHP解析HTML文档(包括不符合W3C标准的HTML文档),并且可以像jQuery那样操作DOM元素,通过元素的id、class、tag等来查找定位。我们可以利用Simple HTML DOM来采集我们所需要的数据。
需求:我们需要采集新浪的【新浪天气】频道的所有城市的今、明、后3天的天气情况。
分析:根据要求我分析出,所有城市天气的List页(列表页)地址如下http://php.weather.sina.com.cn/search_sheng.php
第一步:
新建数据库及数据表:
CREATE DATABASE weather;
-- One row per city per forecast date. The scraper script writes with
-- REPLACE INTO, so the (city_name, date) primary key makes re-runs overwrite
-- the existing row instead of duplicating it.
-- The yesterday_* column names are kept as in the original post.
CREATE TABLE `weather` (
  `city_name` varchar(50) NOT NULL DEFAULT '',
  `detail_url` varchar(255) DEFAULT '',
  `day_conditions` varchar(255) DEFAULT '',
  `day_wind` varchar(255) DEFAULT '',
  `day_highertemp` varchar(255) DEFAULT '',
  `yesterday_conditions` varchar(255) DEFAULT '',
  `yesterday_wind` varchar(255) DEFAULT '',
  `yesterday_lowertemp` varchar(255) DEFAULT '',
  `date` varchar(255) NOT NULL DEFAULT '',
  PRIMARY KEY (`city_name`,`date`)
) ENGINE=MyISAM DEFAULT CHARSET=gb2312;
第二步:
编写PHP脚本getWeather.php(一个文件,需要在DOS命令行下运行)。
注意:在此php文件中需要引入simple_html_dom.php文件,可以来我的博客下载。
require_once("simple_html_dom.php");
// Runtime limits: the scrape parses many pages and runs for minutes.
ini_set('memory_limit', '1000M');
ini_set('max_execution_time', '0');
header('Content-type: text/html;charset=gb2312');

// Wall-clock start of the whole run (read again by the final report).
$startS = utime();

// ---- Database connection used for all inserts ----
$mysqli = mysqli_connect('localhost', 'root', 'root', 'weather')
    or die('Failed to connect!' . mysqli_connect_error());

// mysqli_set_charset() sets the connection charset like "set names" does and
// also informs the client library, so escaping and prepared statements agree
// with the server about the encoding.
mysqli_set_charset($mysqli, 'gb2312');
/**
 * Run a query and return its first row.
 *
 * The SQL text is executed exactly as given; no escaping is done here.
 *
 * @param mysqli $conn Open connection.
 * @param string $sql  Query to execute.
 * @return array First row as an associative array, or an empty array when the
 *               query fails, produces no result set, or matches no rows.
 */
function select_one($conn, $sql)
{
    $res = mysqli_query($conn, $sql);

    // mysqli_query() yields false on failure and true for statements without
    // a result set; neither can be fetched from.
    if (!($res instanceof mysqli_result)) {
        return array();
    }

    $row = mysqli_fetch_assoc($res);
    mysqli_free_result($res);

    // mysqli_fetch_assoc() returns null when there are no rows; callers are
    // promised an array.
    return is_array($row) ? $row : array();
}
// ---------------------------------------------------------------------------
// Phase 1: read the index page and build, for every list link found there,
// the three URLs for today, tomorrow and the day after.
// ---------------------------------------------------------------------------
$start1 = utime();
$urls = 'http://php.weather.sina.com.cn/search_sheng.php';
$htmlCN = file_get_html($urls);
if (!$htmlCN) {
    // file_get_html() gives a falsy value when the page cannot be loaded.
    die('Failed to load ' . $urls);
}

$listUrl = array();
foreach ($htmlCN->find('.city_nav ol li a') as $k => $v) {
    $href = 'http://php.weather.sina.com.cn' . $v->href;
    // Drop the last query parameter; it is re-added per day below.
    // A link without '&' is kept whole instead of collapsing to ''.
    $cut = strrpos($href, '&');
    $listUrl[$k] = ($cut === false) ? $href : substr($href, 0, $cut);
}
$htmlCN->clear(); // release the parsed DOM's memory
unset($htmlCN);

$now = date('Y-m-d');
$tom = date('Y-m-d', strtotime('+1 days'));
$tom2 = date('Y-m-d', strtotime('+2 days'));

// $allDay[listIndex][date] => URL of that list page for that date.
$allDay = array();
foreach ($listUrl as $k => $v) {
    $allDay[$k] = array(
        $now => $v . '&day=0&dpc=1',
        $tom => $v . '&day=1&dpc=1',
        $tom2 => $v . '&day=2&dpc=1',
    );
}
unset($listUrl);

$end1 = utime();
$run1 = $end1 - $start1;
echo date('Y-m-d H:i:s') . '__The first weather is going :' . substr($run1, 0, 5) . " seconds \r\n";

// ---------------------------------------------------------------------------
// Phase 2: fetch every list page, flatten its table rows and store them.
// Scraped text is untrusted, so values are bound to one prepared statement
// rather than concatenated into the SQL text.
// ---------------------------------------------------------------------------
$insertStmt = mysqli_prepare(
    $mysqli,
    'REPLACE INTO weather(city_name,detail_url,day_conditions,day_wind,day_highertemp,'
    . 'yesterday_conditions,yesterday_wind,yesterday_lowertemp,date) VALUES (?,?,?,?,?,?,?,?,?)'
) or die('Failed' . mysqli_error($mysqli));

// Bound by reference: assigning these variables later changes what the next
// mysqli_stmt_execute() sends.
mysqli_stmt_bind_param(
    $insertStmt,
    'sssssssss',
    $cityName,
    $detailUrl,
    $dayConditions,
    $dayWind,
    $dayHigherTemp,
    $nightConditions,
    $nightWind,
    $nightLowerTemp,
    $forecastDate
);

foreach ($allDay as $keys => $days) {
    $start2 = utime();

    foreach ($days as $day => $value) {
        $start3 = utime();

        $htmlS = file_get_html($value);
        if (!$htmlS) {
            // Skip a page that failed to load instead of aborting the run.
            echo date('Y-m-d H:i:s') . '__Failed to load ' . $value . " \r\n";
            continue;
        }

        // Links inside the table, in document order, without the "详情" ones.
        // These are matched to the data rows by position further down.
        $UrlDetail = array();
        foreach ($htmlS->find('#tab_01_ctn tbody tr td a') as $v) {
            if (trim($v->plaintext) === '详情') {
                continue;
            }
            $UrlDetail[] = trim($v->href);
        }

        // Each table row as one string, whitespace runs replaced by '@'.
        $rows = array();
        foreach ($htmlS->find('#tab_01_ctn tbody tr') as $k => $v) {
            $rows[$k] = preg_replace('/\s+/', '@', trim($v->plaintext));
        }

        // Free this page's DOM before moving on, as done for the index page.
        $htmlS->clear();
        unset($htmlS);

        // The first two rows are not data rows.
        unset($rows[0], $rows[1]);

        foreach (array_values($rows) as $k => $row) {
            if (substr_count($row, '@') == 10) {
                // Rows with ten separators have one extra leading cell
                // (presumably a province cell -- TODO confirm); drop the first
                // and the last field.
                $first = strpos($row, '@');
                $last = strrpos($row, '@');
                $row = substr($row, $first + 1, $last - $first - 1);
            } else {
                // Otherwise only the trailing field is dropped.
                $last = strrpos($row, '@');
                $row = ($last === false) ? '' : substr($row, 0, $last);
            }

            // Pad so short rows read as empty strings, not undefined indexes.
            $v = array_pad(explode('@', $row), 9, '');

            // Single quotes were always stripped before storage; keep doing
            // so to leave stored values unchanged.
            $fields = str_replace("'", '', array(
                $v[0],
                isset($UrlDetail[$k]) ? $UrlDetail[$k] : '',
                $v[1],
                $v[2] . $v[3],
                $v[4],
                $v[5],
                $v[6] . $v[7],
                $v[8],
                $day,
            ));

            if ($fields[0] === '') {
                // No city name means no usable primary key; skip the row.
                continue;
            }

            list(
                $cityName,
                $detailUrl,
                $dayConditions,
                $dayWind,
                $dayHigherTemp,
                $nightConditions,
                $nightWind,
                $nightLowerTemp,
                $forecastDate
            ) = $fields;

            mysqli_stmt_execute($insertStmt) or die('Failed' . mysqli_stmt_error($insertStmt));
            echo date('Y-m-d H:i:s') . '__The insertSQL is going : -+- [' . $k . "] -+- \r\n";
        }
        unset($rows, $UrlDetail);

        $end3 = utime();
        $run3 = $end3 - $start3;
        echo date('Y-m-d H:i:s') . '__The three weather is going :' . substr($run3, 0, 5) . '-+-' . $day . " seconds \r\n";
    }

    $end2 = utime();
    $run2 = $end2 - $start2;
    echo date('Y-m-d H:i:s') . '__The second weather is going :' . substr($run2, 0, 5) . '-+-' . $keys . " seconds \r\n";
}

mysqli_stmt_close($insertStmt);
- //————————一些功能函数-START———————-//
/**
 * Prepare a value for embedding inside a single-quoted SQL string literal.
 *
 * Single quotes are removed outright; addslashes() then backslash-escapes the
 * remaining special characters (double quote, backslash, NUL).
 *
 * @param string $string Raw value; null is treated as an empty string.
 * @return string The stripped and escaped value.
 */
function removeQuot($string)
{
    // str_replace() is a no-op when there is nothing to replace, so no
    // separate "contains a quote" check is needed.
    return addslashes(str_replace("'", '', (string) $string));
}
/**
 * Current Unix time with microsecond precision, used for run-time measurement.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
function utime()
{
    // microtime(true) returns seconds + microseconds as one float, which is
    // what the manual "usec sec" string split computed.
    return microtime(true);
}
- //————————–一些功能函数-ENDS———————-//
echo date('Y-m-d H:i:s') . "get date of sina_weather ok \r\n";

// Total wall-clock time, measured from $startS taken at the top of the script.
$endAll = utime();
$runAll = $endAll - $startS;
echo date('Y-m-d H:i:s') . 'The All Weather is going :' . substr($runAll, 0, 5) . " seconds \r\n";
第三步:
在命令行下运行(通常这种采集脚本数据量很大),确保你的当前环境能用命令行运行。
具体看情况而行,下面截图是我命令行的运行方式:
第四步:
如上图(2)脚本已经运行完毕,花费170.5秒。最终得到数据如下(只截一部分):
PS:
我的博客地址:http://jianxuns.com
Email:15071414515@163.com
本人QQ 389750060 有事可加我扣扣聊。
|
|