- 论坛徽章:
- 0
|
config.ini
<?php
// Raise PHP's execution time limit to 360 seconds for this run.
set_time_limit(360);

// Example crawl target, left disabled. normalize_links() reads URL_SEED and
// DOMAIN whenever SINGLE_SITE is true, so both must be defined before it runs.
//define('URL_SEED','http://games.hawkenterprises.org/keyworddensity/test.html');//trailing slash
//define('DOMAIN','hawkenterprises.com');

// Crawl-scope switches.
define('SINGLE_SITE', true);
define('SINGLE_PAGE', true);

// Keyword drop threshold. NOTE(review): the earlier comment said keywords seen
// "less than 4 times" are dropped, but the value is 0 - confirm the intended
// threshold against the code that reads DROP_COUNT.
define('DROP_COUNT', 0);

// Substrings that disqualify a link from being crawled (matched case-insensitively).
$exclude_terms_url = [
    'javascript:',
    'mailto:',
    'file:',
    '.pdf',
    '.jpg',
    '.gif',
    '.png',
    '.doc',
    '#',
    '.xls',
    '.tar',
    '.gz',
    'feed:',
];

// Words listed for exclusion from keyword counting (includes the single-space token).
$dropwords = [
    'and',
    'the',
    'is',
    'it',
    'a',
    ' ',
    'an',
    'or',
    'of',
    'on',
    'for',
    'to',
];
?>
|
run.php
<?php
/**
 * Fetch a URL with cURL and hand back whatever curl_exec() produced.
 *
 * The transfer is configured to return the response instead of printing it
 * and to follow HTTP redirects.
 *
 * @param string $urltofetch URL to request.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function get_document($urltofetch){
    $handle = curl_init();
    // Options are applied in the same order as before: target, capture, redirects.
    curl_setopt_array($handle, array(
        CURLOPT_URL => $urltofetch,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
    ));
    $body = curl_exec($handle);
    curl_close($handle);
    return $body;
}
/**
 * Collect the href attribute of every <a> element in an HTML document.
 *
 * @param string|false|null $document_string Raw HTML, e.g. the return value
 *        of get_document(); false/null/blank input is treated as "no document".
 * @return array href values in document order. An anchor without an href
 *         contributes an empty string. Empty array when there is no markup.
 */
function parsed_document_links($document_string){
    $hrefs = array();
    // A failed fetch yields false, and DOMDocument::loadHTML() rejects empty
    // input (ValueError on PHP 8, which "@" cannot silence), so stop early.
    if(trim((string) $document_string) === ''){
        return $hrefs;
    }
    $doc = new DOMDocument();
    // Real-world markup is rarely valid: buffer libxml's parse errors
    // internally rather than emitting one warning per malformed tag, then
    // restore the caller's libxml error mode.
    $previous_mode = libxml_use_internal_errors(true);
    $doc->loadHTML($document_string);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_mode);
    foreach($doc->getElementsByTagName('a') as $anchor){
        $hrefs[] = $anchor->getAttribute('href');
    }
    return $hrefs;
}
/**
 * Reduce a list of raw hrefs to the URLs that should be crawled.
 *
 * A link is discarded when it contains any entry of the global
 * $exclude_terms_url (case-insensitive substring match). Surviving links are
 * only emitted when SINGLE_SITE is true:
 *  - absolute http/https URLs are kept as-is if they contain DOMAIN;
 *  - absolute URLs with any other scheme are dropped;
 *  - links without a scheme are treated as relative and prefixed with URL_SEED.
 * When SINGLE_SITE is false nothing is emitted (there is no multi-site branch).
 *
 * @param array $links_array Raw href strings, e.g. from parsed_document_links().
 * @return array Crawlable URLs in input order.
 */
function normalize_links($links_array){
    global $exclude_terms_url;
    $excluded = is_array($exclude_terms_url) ? $exclude_terms_url : array();
    $url_normalized = array();
    foreach($links_array as $link){
        $skip_url = false;
        foreach($excluded as $term){
            if(stristr($link,$term) !== false){
                $skip_url = true;
                break; // one hit is enough; no need to scan the remaining terms
            }
        }
        if($skip_url || !SINGLE_SITE){
            continue;
        }
        // parse_url() returns false on seriously malformed input; isset() on
        // false is simply false, so such links fall through as relative ones.
        $parsed_link = @parse_url($link);
        if(!isset($parsed_link['scheme'])){
            $url_normalized[] = URL_SEED . $link;
            continue;
        }
        // Schemes are case-insensitive, and https pages belong to the site as
        // much as http ones. Previously anything but a literal "http" was
        // glued onto URL_SEED, producing URLs like "<seed>https://...".
        $scheme = strtolower($parsed_link['scheme']);
        if(($scheme === 'http' || $scheme === 'https') && stristr($link,DOMAIN) !== false){
            $url_normalized[] = $link;
        }
    }
    return $url_normalized;
}
?>
|
crawl.php
<?php
/**
 * Fetch a URL with cURL and hand back whatever curl_exec() produced.
 *
 * The transfer is configured to return the response instead of printing it
 * and to follow HTTP redirects.
 *
 * @param string $urltofetch URL to request.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function get_document($urltofetch){
    $handle = curl_init();
    // Options are applied in the same order as before: target, capture, redirects.
    curl_setopt_array($handle, array(
        CURLOPT_URL => $urltofetch,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
    ));
    $body = curl_exec($handle);
    curl_close($handle);
    return $body;
}
/**
 * Collect the href attribute of every <a> element in an HTML document.
 *
 * @param string|false|null $document_string Raw HTML, e.g. the return value
 *        of get_document(); false/null/blank input is treated as "no document".
 * @return array href values in document order. An anchor without an href
 *         contributes an empty string. Empty array when there is no markup.
 */
function parsed_document_links($document_string){
    $hrefs = array();
    // A failed fetch yields false, and DOMDocument::loadHTML() rejects empty
    // input (ValueError on PHP 8, which "@" cannot silence), so stop early.
    if(trim((string) $document_string) === ''){
        return $hrefs;
    }
    $doc = new DOMDocument();
    // Real-world markup is rarely valid: buffer libxml's parse errors
    // internally rather than emitting one warning per malformed tag, then
    // restore the caller's libxml error mode.
    $previous_mode = libxml_use_internal_errors(true);
    $doc->loadHTML($document_string);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_mode);
    foreach($doc->getElementsByTagName('a') as $anchor){
        $hrefs[] = $anchor->getAttribute('href');
    }
    return $hrefs;
}
/**
 * Reduce a list of raw hrefs to the URLs that should be crawled.
 *
 * A link is discarded when it contains any entry of the global
 * $exclude_terms_url (case-insensitive substring match). Surviving links are
 * only emitted when SINGLE_SITE is true:
 *  - absolute http/https URLs are kept as-is if they contain DOMAIN;
 *  - absolute URLs with any other scheme are dropped;
 *  - links without a scheme are treated as relative and prefixed with URL_SEED.
 * When SINGLE_SITE is false nothing is emitted (there is no multi-site branch).
 *
 * @param array $links_array Raw href strings, e.g. from parsed_document_links().
 * @return array Crawlable URLs in input order.
 */
function normalize_links($links_array){
    global $exclude_terms_url;
    $excluded = is_array($exclude_terms_url) ? $exclude_terms_url : array();
    $url_normalized = array();
    foreach($links_array as $link){
        $skip_url = false;
        foreach($excluded as $term){
            if(stristr($link,$term) !== false){
                $skip_url = true;
                break; // one hit is enough; no need to scan the remaining terms
            }
        }
        if($skip_url || !SINGLE_SITE){
            continue;
        }
        // parse_url() returns false on seriously malformed input; isset() on
        // false is simply false, so such links fall through as relative ones.
        $parsed_link = @parse_url($link);
        if(!isset($parsed_link['scheme'])){
            $url_normalized[] = URL_SEED . $link;
            continue;
        }
        // Schemes are case-insensitive, and https pages belong to the site as
        // much as http ones. Previously anything but a literal "http" was
        // glued onto URL_SEED, producing URLs like "<seed>https://...".
        $scheme = strtolower($parsed_link['scheme']);
        if(($scheme === 'http' || $scheme === 'https') && stristr($link,DOMAIN) !== false){
            $url_normalized[] = $link;
        }
    }
    return $url_normalized;
}
?>
|
[ 本帖最后由 workingbeijing 于 2008-3-4 09:58 编辑 ] |
|