Python: Web Crawler
Here's a simple web crawler in Python. Two versions follow: the first is for Python 3, the second is the original Python 2 version.
# crawl a website, list all urls under a specific given path

import pprint
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup  # get html links

inputURL = "http://ergoemacs.github.io/ergoemacs-mode/"

# key is a url we want. value is True or False. True means already crawled
resultUrl = {inputURL: False}


def processOneUrl(url):
    """Fetch url, record every link under inputURL, and mark url as crawled.

    Newly discovered links are added to resultUrl with the value False.
    Any failure while fetching or parsing (for example a 404) is reported
    on stderr; url is still marked as crawled so it is never retried.
    """
    try:
        # close the connection as soon as the page has been parsed
        with urllib.request.urlopen(url) as html_page:
            # bs4 exports the class itself, so it is called directly
            soup = BeautifulSoup(html_page, "html.parser")
        # href=True skips anchors that have no href attribute
        for link in soup.find_all("a", href=True):
            fullurl = urllib.parse.urljoin(url, link["href"])
            if fullurl.startswith(inputURL) and fullurl not in resultUrl:
                resultUrl[fullurl] = False
    except Exception as err:
        # best effort: one bad page must not stop the crawl, but do not
        # hide the failure either (Ctrl-C is deliberately not caught here)
        print("failed to crawl {}: {}".format(url, err), file=sys.stderr)
    resultUrl[url] = True  # set as crawled, whether or not the fetch worked


def moreToCrawl():
    """Return a url that has not been crawled yet, or False if none is left."""
    for url, crawled in resultUrl.items():
        if not crawled:
            print("moreToCrawl found {}".format(url))
            return url
    return False


while True:
    toCrawl = moreToCrawl()
    if not toCrawl:
        break
    processOneUrl(toCrawl)
    time.sleep(2)  # be polite to the server between requests

pprint.pprint(resultUrl)
# -*- coding: utf-8 -*-
# python 2

# crawl a website, list all urls under a specific given path

import pprint
import sys
import time
import urllib2
import urlparse

import BeautifulSoup  # get html links

inputURL = "http://ergoemacs.github.io/ergoemacs-mode/"

# key is a url we want. value is True or False. True means already crawled
resultUrl = {inputURL: False}


def processOneUrl(url):
    """Fetch url, record every link under inputURL, and mark url as crawled.

    Newly discovered links are added to resultUrl with the value False.
    Any failure while fetching or parsing (for example a 404) is reported
    on stderr; url is still marked as crawled so it is never retried.
    """
    try:
        html_page = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup.BeautifulSoup(html_page)
        finally:
            # the python 2 response object is not a context manager
            html_page.close()
        for link in soup.findAll('a'):
            fullurl = urlparse.urljoin(url, link.get('href'))
            if fullurl.startswith(inputURL) and fullurl not in resultUrl:
                resultUrl[fullurl] = False
    except Exception as err:
        # best effort: one bad page must not stop the crawl, but do not
        # hide the failure either (Ctrl-C is deliberately not caught here).
        # %r keeps the message safe for non-ascii urls.
        sys.stderr.write("failed to crawl %r: %r\n" % (url, err))
    resultUrl[url] = True  # set as crawled, whether or not the fetch worked


def moreToCrawl():
    """Return a url that has not been crawled yet, or False if none is left."""
    for url, crawled in resultUrl.iteritems():
        if not crawled:
            print("moreToCrawl found {}".format(url))
            return url
    return False


while True:
    toCrawl = moreToCrawl()
    if not toCrawl:
        break
    processOneUrl(toCrawl)
    time.sleep(2)  # be polite to the server between requests

pprint.pprint(resultUrl)
sample output:
◆ python xx-testscript-4925873.py moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/ moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/design-basis.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/testimonials.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/archives.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/key-setup.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/bug-report.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/faq.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/changelog.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/customize-keys.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/minor-modes.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/features.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/smart-commands.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/aliases.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/tags/index.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/standard-shortcuts.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/gradual-adoption.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/index.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/key-themes.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/tags/design-basis.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/keyboard-layouts.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/cua-conflict.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/roadmap.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/banish-key-chords.html moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/system-wide.html {'http://ergoemacs.github.io/ergoemacs-mode/': True, u'http://ergoemacs.github.io/ergoemacs-mode/aliases.html': True, 
u'http://ergoemacs.github.io/ergoemacs-mode/archives.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/banish-key-chords.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/bug-report.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/changelog.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/cua-conflict.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/customize-keys.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/design-basis.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/faq.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/features.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/gradual-adoption.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/index.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/key-setup.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/key-themes.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/keyboard-layouts.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/minor-modes.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/roadmap.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/smart-commands.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/standard-shortcuts.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/system-wide.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/tags/design-basis.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/tags/index.html': True, u'http://ergoemacs.github.io/ergoemacs-mode/testimonials.html': True}
2014-01-25 Thanks to Sorawee Porncharoenwase [https://plus.google.com/+SoraweePorncharoenwase/posts] for the improvements.
discuss on Google Plus
https://plus.google.com/+XahLee/posts/63YyJ79TypK