Tuesday, December 1, 2009

MyCrawler

Well, I thought I'd share the crawler that I wrote!

'''
Created on Nov 16, 2009
My very first, very own Crawler in Python !!
@author: Anish Rajan Kurian
'''
from sys import argv # Gets the argument(starting url),
from random import random # Random number appended to file name
from urlparse import urlsplit # Split the url for the filename
from htmllib import HTMLParser # Parse html to obtain the anchors
from urllib import urlretrieve # Retrieve the url n save it
from urllib2 import urlopen # Open the url !
from formatter import NullFormatter # Sends a Null Formatter to the parser

class Crawler:
'''
The crawler which initially starts with the user provided url
Then saves the page
Then crawls through that page to find more URLs
'''
def __init__(self, *tocrawl):
'''Constructor'''
self.crawllist = list(tocrawl)
self.crawled = []
self.url = ''
def save(self):
'''Saves the page to HD'''
print 'To crawl:', self.crawllist
self.url = self.crawllist.pop()
filename = urlsplit(self.url)[1] + str(random()) + '.html'
urlretrieve(self.url, filename)
print 'Saved : ', filename
def crawl(self):
'''Crawls through the page to find more URLs'''
try:
if self.url not in self.crawled:
urlobj = urlopen(self.url)
response = urlobj.read()
parser = HTMLParser(NullFormatter())
parser.feed(response)
parser.close()
for newlink in parser.anchorlist:
self.crawllist.append(newlink)
print 'New Links Obtained:', newlink
print 'Number of URLs to crawl:', len(self.crawllist)
except:
pass
def addtolist(self):
'''Adds to crawled URLs list'''
self.crawled.append(self.url)

def usage():
'''
Prints usage
'''
usage1 = '''
Usage : python crawler.py [url]
Example : python crawler.py http://www.google.com

Note : Python VM required to run.
'''
usage2 = '''
Alternately,
Run directly by adding
1) #!/usr/bin/env python to top of crawler.py
2) Give executable permission
Usage : ./crawler.py [url]
Example : ./crawler.py http://www.google.com
'''
mymsg = '''
Now that you are clear with the usage, next time you can run without
this being printed!!
Anyways, for now..
'''
print usage1 + '\n' + '-' * 75
print usage2 + '\n' + '-' * 75
print mymsg

def main():
'''
Main Program
'''
try:
if len(argv) < 2 or len(argv) > 2:
usage()
start = str(raw_input('Enter start url [example:'\
'http://www.google.com] :'))
else:
start = str(argv[1])
tocrawl = (start)
crawler = Crawler(tocrawl)
while len(crawler.crawllist) != 0:
try:
crawler.save()
except:
crawler.addtolist()
continue
crawler.crawl()
crawler.addtolist()
else:
print 'Crawl Over!!'
except KeyboardInterrupt, kbi:
print kbi
exit(1)

# Script entry point: crawl only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()