Web crawler
Revised from: http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;
public class WebCrawler implements Runnable {
public static final String SEARCH = "Search";
public static final String STOP = "Stop";
public static final String DISALLOW = "Disallow:";
public static final int SEARCH_LIMIT = 50;
Vector vectorToSearch = new Vector();
Vector vectorSearched = new Vector();
Vector vectorMatches = new Vector();
Thread searchThread;
public WebCrawler() {
// ("text/html");
// ("audio/basic");
// ("audio/au");
// ("audio/aiff");
// ("audio/wav");
// ("video/mpeg");
// ("video/x-avi");
URLConnection.setDefaultAllowUserInteraction(false);
searchThread = new Thread(this);
searchThread.start();
}
public void run() {
String strURL = "http://www.google.com";
String strTargetType = "text/html";
int numberSearched = 0;
int numberFound = 0;
if (strURL.length() == 0) {
System.out.println("ERROR: must enter a starting URL");
return;
}
vectorToSearch = new Vector();
vectorSearched = new Vector();
vectorMatches = new Vector();
vectorToSearch.addElement(strURL);
while ((vectorToSearch.size() > 0)
&& (Thread.currentThread() == searchThread)) {
strURL = (String) vectorToSearch.elementAt(0);
System.out.println("searching " + strURL);
URL url = null;
try {
url = new URL(strURL);
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL);
try {
URLConnection urlConnection = url.openConnection();
urlConnection.setAllowUserInteraction(false);
InputStream urlStream = url.openStream();
String type = urlConnection.guessContentTypeFromStream(urlStream);
if (type == null)
break;
if (type.compareTo("text/html") != 0)
break;
byte b[] = new byte[5000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close();
if (Thread.currentThread() != searchThread)
break;
String lowerCaseContent = content.toLowerCase();
int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;
if (Thread.currentThread() != searchThread)
break;
index++;
String remaining = content.substring(index);
StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();
URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
System.out.println("ERROR: bad URL " + strLink);
continue;
}
if (urlLink.getProtocol().compareTo("http") != 0)
break;
if (Thread.currentThread() != searchThread)
break;
try {
URLConnection urlLinkConnection = urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType = urlLinkConnection
.guessContentTypeFromStream(linkStream);
linkStream.close();
if (strType == null)
break;
if (strType.compareTo("text/html") == 0) {
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) {
vectorToSearch.addElement(strLink);
}
}
if (strType.compareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
System.out.println(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
System.out.println("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
System.out.println("ERROR: couldn't open URL " + strURL);
break;
}
numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
}
if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
System.out.println("reached search limit of " + SEARCH_LIMIT);
else
System.out.println("done");
searchThread = null;
}
public static void main(String argv[]) {
WebCrawler applet = new WebCrawler();
/*
* Behind a firewall set your proxy and port here!
*/
Properties props = new Properties(System.getProperties());
props.put("http.proxySet", "true");
props.put("http.proxyHost", "webcache-cup");
props.put("http.proxyPort", "8080");
Properties newprops = new Properties(props);
System.setProperties(newprops);
}
}
Related examples in the same category