Get all urls from a web page – Simple Web Crawler

Pavan July 11, 2009 Java No Comments

Here I’m posting a class that extracts all valid URLs from a web page. The example can be treated as a basic web crawler. The class relies on the “URLConnectionReader” class from the Sun Java tutorial.

The class defines 2 constructors.

The first, by default, gives you a vector containing only the text/html URL objects found on the page.
With the other you can specify the type of URLs you want from a page. This is helpful when you want to get all image, video or other media URLs.

The class also handles relative URLs: it returns them with http and the host name prefixed.
E.g. if the page has a URL like “/about.php”, the class will return “http://hostname.domain/about.php”.

The URLFinder

import java.util.*;
import java.net.*;
import java.io.*;

public class URLFinder
{
String content;
Vector urlVector;
String type;
URL tUrl;
URLConnectionReader ucreader;
String host;

public URLFinder(String uri)
{
 ucreader = new URLConnectionReader(uri);
 this.content = ucreader.content.toLowerCase();
 this.host = ucreader.host;

 urlVector = new Vector(30,10);
 int index = 0;
 while ((index = this.content.indexOf("&lt;a", index))="" !="-1)" {="" if="" ((index="this.content.indexOf("href"," -1)="" break;="" index++;="" string="" remaining="this.content.substring(index);" stringtokenizer="" st="new" stringtokenizer(remaining,="" "tnr"'=""&gt;#");
	String strLink = st.nextToken();

	tUrl = isValidUrl(strLink);
	try
	{
		URLConnection urlConnection = tUrl.openConnection();
		String type = (String)urlConnection.getContentType();
		if (type == null)
		{
			System.out.println("ERROR: invalid type "
			+strLink+ "=" + type);
			continue;
		}
		if (type.indexOf("ext/html")==0)
		{
			System.out.println("ERROR: invalid type "
			+strLink+ "=" + type);
			continue;
		}
		urlVector.addElement(tUrl);
	}
	catch(Exception e){
		System.out.println("ERROR: invalid URL "
			+ strLink);
		continue;
	}
 }
}

public URLFinder(String data, String type)
{
 this.content = data.toLowerCase();
 urlVector = new Vector(30,10);
 int index = 0;
 while ((index = this.content.indexOf("&lt;a", index))="" !="-1)" {="" if="" ((index="this.content.indexOf("href"," -1)="" break;="" index++;="" string="" remaining="this.content.substring(index);" stringtokenizer="" st="new" stringtokenizer(remaining,="" "tnr"'=""&gt;#");
	String strLink = st.nextToken();

	tUrl = isValidUrl(strLink);
	try
	{
		URLConnection urlConnection = tUrl.openConnection();
		String utype = (String)urlConnection.getContentType();

		if (utype == null)
		{
			System.out.println("ERROR: invalid type "
			+strLink+ "=" + utype);
			continue;
		}
		if (utype.indexOf(type)==0)
		{
			System.out.println("ERROR: invalid type "
			+strLink+ "=" + utype);
			continue;
		}
		urlVector.addElement(tUrl.toURI());
	}
	catch(Exception e){
		System.out.println("ERROR: invalid URL "
			+ strLink);
		continue;
	}
 }
}

public URL isValidUrl(String strLink)
{
	try
	{
		tUrl = new URL(strLink);
	}
	catch(MalformedURLException e)
	{
		strLink = "http://"+this.host+strLink;
		tUrl = isValidUrl(strLink);
	}
	return tUrl;
}
}

100

101

import java.util.*;

import java.net.*;

import java.io.*;

public class URLFinder

{

String content;

Vector urlVector;

String type;

URL tUrl;

URLConnectionReader ucreader;

String host;

public URLFinder(String uri)

{

ucreader = new URLConnectionReader(uri);

this.content = ucreader.content.toLowerCase();

this.host = ucreader.host;

urlVector = new Vector(30,10);

int index = 0;

while ((index = this.content.indexOf("<a", index))="" !="-1)" {="" if="" ((index="this.content.indexOf("href"," -1)="" break;="" index++;="" string="" remaining="this.content.substring(index);" stringtokenizer="" st="new" stringtokenizer(remaining,="" "tnr"'="">#");

String strLink = st.nextToken();

tUrl = isValidUrl(strLink);

try

{

URLConnection urlConnection = tUrl.openConnection();

String type = (String)urlConnection.getContentType();

if (type == null)

{

System.out.println("ERROR: invalid type "

+strLink+ "=" + type);

continue;

}

if (type.indexOf("ext/html")==0)

{

System.out.println("ERROR: invalid type "

+strLink+ "=" + type);

continue;

}

urlVector.addElement(tUrl);

}

catch(Exception e){

System.out.println("ERROR: invalid URL "

+ strLink);

continue;

}

public URLFinder(String data, String type)

{

this.content = data.toLowerCase();

urlVector = new Vector(30,10);

int index = 0;

String strLink = st.nextToken();

tUrl = isValidUrl(strLink);

try

{

URLConnection urlConnection = tUrl.openConnection();

String utype = (String)urlConnection.getContentType();

if (utype == null)

{

System.out.println("ERROR: invalid type "

+strLink+ "=" + utype);

continue;

}

if (utype.indexOf(type)==0)

{

System.out.println("ERROR: invalid type "

+strLink+ "=" + utype);

continue;

}

urlVector.addElement(tUrl.toURI());

}

catch(Exception e){

System.out.println("ERROR: invalid URL "

+ strLink);

continue;

}

public URL isValidUrl(String strLink)

{

try

{

tUrl = new URL(strLink);

}

catch(MalformedURLException e)

{

strLink = "http://"+this.host+strLink;

tUrl = isValidUrl(strLink);

}

return tUrl;

}

Usage

// Scan the page and print every link that was accepted.
URLFinder finder = new URLFinder("http://www.mydomain.com/");
int cap = finder.urlVector.size();
for (int j = 0; j < cap; j++) {
    // The single-argument constructor stores java.net.URL objects.
    URL tmp = (URL) finder.urlVector.get(j);
    // toExternalForm() prints the same text as toURI() without a checked exception.
    System.out.println(tmp.toExternalForm());
}

// Scan the page and print every link that was accepted.
URLFinder finder = new URLFinder("http://www.mydomain.com/");
int cap = finder.urlVector.size();
for (int j = 0; j < cap; j++) {
    // The single-argument constructor stores java.net.URL objects.
    URL tmp = (URL) finder.urlVector.get(j);
    // toExternalForm() prints the same text as toURI() without a checked exception.
    System.out.println(tmp.toExternalForm());
}

This will get you all the URLs from any web page. So it's pretty simple to come up with your own basic version of a web crawler. I am sure you will be able to build something more on top of this.

Happy Sharing!!

Tags:crawler, Java, url

Get all urls from a web page – Simple Web Crawler

Add a Comment

Openstrap 2.0.3

Aadya 2.0.3

Awakening 2.0.3

Related Posts

Add a Comment

Openstrap 2.0.3

Aadya 2.0.3

Awakening 2.0.3