免费注册 查看新帖 |

Chinaunix

  平台 论坛 博客 文库
最近访问板块 发新帖
查看: 1283 | 回复: 0
打印 上一主题 下一主题

一个简单的搜索机器人例子(Java) [复制链接]

论坛徽章:
0
跳转到指定楼层
1 [收藏(0)] [报告]
发表于 2006-05-24 16:36 |只看该作者 |倒序浏览

import java.awt.*;
import java.net.*;
import java.io.*;
import java.lang.*;
import java.util.*;
class node{
    private Object data;
    private node next;
    private node prev;
    public node(Object o){
data = o;
prev = next = null;
    }
    public String toString(){
if(next!=null)return data.toString() + "\n"+ next.toString();
return data.toString();
    }
    public node getNext(){return next;}
    public void setNext(node n){next = n;}
    public node getPrev(){return prev;}
    public void setPrev(node n){prev = n;}
    public Object getData(){return data;}
}
class linkedlist{
    node head;
    node tail;
    public linkedlist(){
tail = head = null;
    }
    public String toString(){
if(head==null)return "Empty list";
return head.toString();
    }
    public void insert(Object o){
if(tail==null){
     head = tail = new node(o);
}else{
     node nn = new node(o);
     tail.setNext(nn);
     tail=nn;
}
    }
    public boolean contains(Object o){
for(node n = head;n!=null;n=n.getNext()){
     if(o.equals(n.getData()))return true;
}
return false;
    }
    public Object pop(){
if(head==null)return null;
Object ret = head.getData();
head = head.getNext();
if(head==null)tail = null;
return ret;
    }
    public boolean isEmpty(){
return head==null;
    }
}
class list{
protected node tail;
protected node ptr;
private boolean stop;
public list(){
  ptr=tail=null;
  stop=false;
}
public boolean isEmpty(){return tail==null;}
public void reset(){
  stop=false;
  ptr=tail;
}
public String toString(){
  if(tail==null)return "Empty list";
  String ret="";
  for(node n = tail.getNext();n!=tail;n=n.getNext())ret+=n.getData().toString()+"\n";
  ret+=tail.getData().toString();
  return ret;
}
public Object get(){
  if(ptr==null)return null;
  ptr = ptr.getNext();
  if(ptr==tail.getNext()){
   if(stop)return null;
   stop=true;
   return tail.getNext().getData();
  }
  return ptr.getData();
}
public void insert(Object o, boolean attail){
  node nn = new node(o);
  if(tail==null){
   nn.setNext(nn);
   nn.setPrev(nn);
   ptr=tail=nn;
   return;
  }
  if(attail){
  tail.getNext().setPrev(nn);
   nn.setNext(tail.getNext());
   tail.setNext(nn);
   nn.setPrev(tail);
   tail=nn;
  }else{
   nn.setNext(tail.getNext());
   nn.setPrev(tail);
   tail.setNext(nn);
   nn.getNext().setPrev(nn);
  }
}
public void insert(Object o){}
}
/*
 * Ring whose single-argument insert() adds at the front, so the most
 * recently inserted element is the first one in the ring.
 */
class stack extends list{
    /* Creates an empty stack. */
    public stack(){
        super();
    }

    /* Inserts o without moving the tail, i.e. in front of the current first element. */
    public void insert(Object o){
        final boolean atTail = false;
        insert(o, atTail);
    }
}
class queue extends list{
public queue(){super();}
public void insert(Object o){insert(o, true);}
public String peek(){
   if(tail==null)return "";
   return tail.getNext().getData().toString();
}
public Object pop(){
  if(tail==null)return null;
  Object ret = tail.getNext().getData();
  if(tail.getNext()==tail){
    tail=ptr=null;
  }else{
    if(tail.getNext()==ptr)ptr=ptr.getNext();
    tail.setNext(tail.getNext().getNext());
  }
  return ret;
}
}
class hashtable{
    private Vector table;
    private int size;
    public hashtable(){
size = 991;
table = new Vector();
for(int i=0;i", loc);
  if(loc==-1){
      errors.insert("malformed frame at "+site.toString());
      loc = beg;
  }
  else{
      try{
   parseFrame(site, source.substring(beg, loc));
      }
      catch(Exception e){
   errors.insert("while parsing "+site.toString()+", error parsing frame: "+e.toString());
      }
  }
     }
     //found "", loc);
  if(loc==-1){
      errors.insert("malformed linked at "+site.toString());
      loc = beg;
  }
  else{
      try{
   parseLink(site, source.substring(beg, loc));
      }
      catch(Exception e){
   errors.insert("while parsing "+site.toString()+", error parsing link: "+e.toString());
      }
  }
     }
}
    }
   
    /*
     * parses a frame
     *
     * s is the text of one frame tag found on at_page. The method looks for a
     * src attribute (lower case first, then upper case), gives up silently if
     * there is none, and throws if no = follows the attribute name.
     * The extracted target is then resolved in up to three ways, in order:
     * relative to at_page, as an absolute URL, and relative to at_page with
     * /index.html appended. The first attempt that does not throw hands the
     * URL to addSite and returns; if all three throw, a message is added to
     * errors.
     *
     * NOTE(review): the scanning loops just below the declaration of start,
     * and the prefix checks that follow the substring call, were damaged when
     * this listing was published (text between angle brackets was dropped and
     * a forum e-mail tag was injected). They appear to locate the quoted
     * attribute value and to skip mailto, javascript and news targets, but
     * that cannot be confirmed from the text here; restore them from a clean
     * copy before compiling.
     */
    private void parseFrame(URL at_page, String s) throws Exception{
// position of the attribute name; -1 when absent in that spelling
int beg=s.indexOf("src");
if(beg==-1)beg=s.indexOf("SRC");
if(beg==-1)return;//doesn't have a src, ignore
// move on to the = that follows the attribute name
beg = s.indexOf("=", beg);
if(beg==-1)throw new Exception("while parsing "+at_page.toString()+", bad frame, missing \'=\' after src: "+s);
// remember where the = was; the fallback scan restarts from start+1
int start = beg;
for(;beg=end){//missing quotes... just take the first token after "src="
     for(beg=start+1;beg');end++){}
}
if(beg>=end){
     errors.insert("while parsing "+at_page.toString()+", bad frame: "+s);
     return;
}
String linkto=s.substring(beg,end);
if(linkto.startsWith("
[email=")||linkto.startsWith("Mailto:"))return]mailto:")||linkto.startsWith("Mailto:"))return[/email]
;
if(linkto.startsWith("javascript:")||linkto.startsWith("Javascript:"))return;
if(linkto.startsWith("
news:")||linkto.startsWith("Javascript:"))return
;
try{
     addSite(new URL(at_page, linkto));
     return;
}catch(Exception e1){}
try{
     addSite(new URL(linkto));
     return;
}catch(Exception e2){}
try{
     URL cp = new URL(at_page.toString()+"/index.html");
     System.out.println("attemping to use "+cp);
     addSite(new URL(cp, linkto));
     return;
}catch(Exception e3){}
errors.insert("while parsing "+at_page.toString()+", bad frame: "+linkto+", formed from: "+s);
    }
    /*
     * given a link at a URL, will parse it and add it to the list of sites to do
     *
     * s is the text of one link tag found on at_page. The method looks for an
     * href attribute (lower case first, then upper case), returns silently if
     * there is none, and throws if no = follows the attribute name.
     * The extracted target is then resolved in up to three ways, in order:
     * relative to at_page, as an absolute URL, and relative to at_page with
     * /index.html appended. The first attempt that does not throw hands the
     * URL to addSite and returns; if all three throw, a message is added to
     * errors.
     *
     * NOTE(review): the scanning loops just below the declaration of start,
     * and the prefix checks that follow the substring call, were damaged when
     * this listing was published (text between angle brackets was dropped and
     * a forum e-mail tag was injected). They appear to locate the quoted
     * attribute value and to skip mailto, javascript and news targets, but
     * that cannot be confirmed from the text here; restore them from a clean
     * copy before compiling.
     */
    private void parseLink(URL at_page, String s) throws Exception{
//System.out.println("parsing link "+s);
// position of the attribute name; -1 when absent in that spelling
int beg=s.indexOf("href");
if(beg==-1)beg=s.indexOf("HREF");
if(beg==-1)return;//doesn't have a href, must be an anchor
// move on to the = that follows the attribute name
beg = s.indexOf("=", beg);
if(beg==-1)throw new Exception("while parsing "+at_page.toString()+", bad link, missing \'=\' after href: "+s);
// remember where the = was; the fallback scan restarts from start+1
int start = beg;
for(;beg=end){//missing quotes... just take the first token after "href="
     for(beg=start+1;beg');end++){}
}
if(beg>=end){
     errors.insert("while parsing "+at_page.toString()+", bad href: "+s);
     return;
}
String linkto=s.substring(beg,end);
if(linkto.startsWith("
[email=")||linkto.startsWith("Mailto:"))return]mailto:")||linkto.startsWith("Mailto:"))return[/email]
;
if(linkto.startsWith("javascript:")||linkto.startsWith("Javascript:"))return;
if(linkto.startsWith("
news:")||linkto.startsWith("Javascript:"))return
;
try{
     addSite(new URL(at_page, linkto));
     return;
}catch(Exception e1){}
try{
     addSite(new URL(linkto));
     return;
}catch(Exception e2){}
try{
     addSite(new URL(new URL(at_page.toString()+"/index.html"), linkto));
     return;
}catch(Exception e3){}
errors.insert("while parsing "+at_page.toString()+", bad link: "+linkto+", formed from: "+s);
    }
    /*
     * gets the title of a web page with content s
     */
    private String getTitle(String s){
try{
     int beg=s.indexOf("");
     if(beg==-1)beg=s.indexOf("");
     int end=s.indexOf("");
     if(end==-1)end=s.indexOf("");
     return s.substring(beg,end);
}
catch(Exception e){return "";}
    }
    /*
     * gets the text of a web page, times out after 10s
     */
    private String getText(URL site) throws Exception
    {
urlReader u = new urlReader(site);
Thread t = new Thread(u);
t.setDaemon(true);
t.start();
t.join(TIMEOUT);
String ret = u.poll();
if(ret==null){
  throw new Exception("connection timed out");
}else if(ret.equals("Not html")){
  throw new Exception("Not an HTML document");
}
return ret;
    }
    /*
     * returns how many sites have been visited so far
     */
    public int Visited(){return visitedsites;}
}
/*
 * Downloads one URL so that a caller can run it on a separate thread and
 * give up waiting after a timeout.
 *
 * After run() has finished, poll() returns one of:
 *   - the page text, when the content type looks textual;
 *   - the marker "Not html", when it does not;
 *   - null, when the download failed (or has not finished yet).
 */
class urlReader implements Runnable{
    URL site;
    // Written by the worker thread and read by the thread that waits on it
    // with a timed join, so it must be visible without further locking.
    volatile String s;

    /* Prepares a reader for u; nothing is fetched until run() is called. */
    public urlReader(URL u){
        site = u;
        s=null;
    }

    /*
     * Opens the connection, checks the content type and reads the whole body.
     * Any failure leaves the result as null.
     */
    public void run(){
        InputStream in = null;
        try{
            URLConnection u = site.openConnection();
            String type = u.getContentType();
            // getContentType() may return null when the type is unknown;
            // treat that like any other non-text type instead of failing.
            if(type==null ||
               (type.indexOf("text")==-1 &&
                type.indexOf("txt")==-1 &&
                type.indexOf("HTM")==-1 &&
                type.indexOf("htm")==-1)){
                System.out.println("bad content type "+type+" at site "+site);
                // Publish the marker through s; assigning it to a local
                // would leave poll() returning null.
                s = "Not html";
                return;
            }
            in = u.getInputStream();
            // ISO-8859-1 maps every byte to the char with the same value,
            // which is what a byte-by-byte (char) cast produces.
            Reader reader = new InputStreamReader(new BufferedInputStream(in), "ISO-8859-1");
            StringBuilder page = new StringBuilder();
            char[] buf = new char[4096];
            int n;
            while((n = reader.read(buf)) != -1){
                page.append(buf, 0, n);
            }
            s = page.toString();
        }catch(Exception e){
            s=null;
        }finally{
            if(in!=null){
                try{
                    in.close();
                }catch(IOException ignored){
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /* Returns the current result; see the class comment for its meaning. */
    public String poll(){return s;}
}
public class spidergui extends Frame{
private spider s;
private Color txtColor;
private Color errColor;
private Color topColor;
private Color numColor;
private Color curColor;
/*
 * Creates the status window for spi with the given frame title, sets up the
 * colours used when painting, places the window at (0,0) with size 800x600,
 * makes it visible, brings it to the front and requests a first repaint.
 */
public spidergui(spider spi, String title){
  super(title);
  curColor = new Color(40, 40, 200);
  txtColor = new Color(0, 0, 0);
  errColor = new Color(255, 0, 0);
  topColor = new Color(40, 40, 100);
  numColor = new Color(50, 150, 50);
  s=spi;
  setBounds(0, 0, 800, 600);
  // setVisible(true) is the supported replacement for the deprecated show()
  setVisible(true);
  toFront();
  repaint();
}
public void endShow(){
  System.out.println(s);
  hide();
  dispose();
}
public void paint(Graphics g){
  super.paint(g);
  s.todo.reset();
  s.done.reset();
  s.errors.reset();
  s.omittions.reset();
  String txt;
  Object o;
  g.setColor(curColor);
  g.setFont(new Font("arial", Font.PLAIN, 18));
  String cur = s.getCurrent();
  if(cur.length()>80)g.drawString(
   cur.substring(0, 40)+
   " . . . "+
   cur.substring(cur.length()-30, cur.length()),
  50, 50);
  else g.drawString(cur, 50, 50);
  g.setColor(numColor);
  g.setFont(new Font("arial", Font.BOLD, 24));
  g.drawString(Integer.toString(s.Visited()), 350, 80);
  g.setFont(new Font("arial", Font.PLAIN, 14));
  g.setColor(topColor);
  g.drawString("To Do:", 100, 80);
  g.drawString("Completed:", 500, 80);
  g.drawString("Ignored:", 500, 250);
  g.drawString("Errors:", 100, 420);
  g.setColor(txtColor);
  g.setFont(new Font("arial", Font.PLAIN, 12));
  for(int i=0;i65)g.drawString(
   txt.substring(0, 38) +
   " . . . " +
   txt.substring(txt.length()-18, txt.length()),
  20, 100+13*i);
else g.drawString(txt, 20, 100+13*i);
  }
  for(int i=0;i60)g.drawString(txt.substring(0, 57)+"...", 400, 100+13*i);
else g.drawString(txt, 400, 100+13*i);
  }
  for(int i=0;i60)g.drawString(txt.substring(0, 57)+"...", 400, 270+13*i);
else g.drawString(txt, 400, 270+13*i);
  }
  g.setColor(errColor);
  for(int i=0;i switch to set, or -help for more info.");
     System.exit(1);
}
spider spi=new spider(site, max, base);
if(time>0)spi.setTimer(time);
  spidergui s = new spidergui(spi, "Spider: "+site);
  s.run();
  System.out.println(spi);
}
}


本文来自ChinaUnix博客,如果查看原文请点:http://blog.chinaunix.net/u/10400/showart_117482.html
您需要登录后才可以回帖 登录 | 注册

本版积分规则 发表回复

  

北京盛拓优讯信息技术有限公司. 版权所有 京ICP备16024965号-6 北京市公安局海淀分局网监中心备案编号:11010802020122 niuxiaotong@pcpop.com 17352615567
未成年举报专区
中国互联网协会会员  联系我们:huangweiwei@itpub.net
感谢所有关心和支持过ChinaUnix的朋友们 转载本站内容请注明原作者名及出处

清除 Cookies - ChinaUnix - Archiver - WAP - TOP