Python: Web Crawler

By Xah Lee. Date: . Last updated: .

Here's a simple web crawler in Python.

# python 3

# Crawl a website and list every URL found under one given path prefix.

# Starting page of the crawl. It also serves as the prefix filter: only
# links whose absolute URL starts with this string are recorded.
inputURL = "http://ergoemacs.github.io/ergoemacs-mode/"

# Crawl state. Each key is a URL we want; each value is a flag:
# False = discovered but not fetched yet, True = already crawled.
resultUrl = dict.fromkeys([inputURL], False)

# from urllib import urlopen
import urllib.request, urllib.error, urllib.parse
import urllib.parse
import time
import pprint

# get html links
from bs4 import BeautifulSoup
# import BeautifulSoup

def processOneUrl(url):
    """Fetch one page and register every in-scope link found on it.

    Each <a> tag's href is resolved against ``url``. Any resulting absolute
    URL that starts with ``inputURL`` and is not yet a key of ``resultUrl``
    is added with the value False (discovered, not crawled yet). ``url``
    itself is always marked True, whether or not the fetch succeeded, so a
    failing page is never retried.

    Args:
        url: absolute URL of the page to fetch.

    Returns:
        None. The module-level dict ``resultUrl`` is updated in place.
    """
    # Mark first: success or failure, this URL must not be picked again.
    resultUrl[url] = True
    try:
        # The response is closed as soon as the document has been parsed.
        with urllib.request.urlopen(url) as html_page:
            # ``from bs4 import BeautifulSoup`` binds the class itself, so it
            # is called directly. Naming the parser keeps the result
            # independent of which optional parsers happen to be installed.
            soup = BeautifulSoup(html_page, "html.parser")
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                # Anchor without a target: nothing to follow.
                continue
            fullurl = urllib.parse.urljoin(url, href)
            if fullurl.startswith(inputURL) and fullurl not in resultUrl:
                resultUrl[fullurl] = False
    except Exception as err:
        # Best effort: a page that cannot be fetched or parsed (e.g. a 404)
        # is reported and skipped. KeyboardInterrupt and SystemExit are not
        # subclasses of Exception, so Ctrl-C still stops the crawl.
        print("processOneUrl failed for {}: {!r}".format(url, err))

def moreToCrawl():
    """Return the first URL in resultUrl that is not crawled yet.

    Scans the module-level dict ``resultUrl`` in its iteration order,
    prints a progress line for the URL it picks, and returns that URL.
    Returns False when every entry is already marked as crawled (or the
    dict is empty).
    """
    for address, done in resultUrl.items():
        if done:
            continue
        print("moreToCrawl found {}".format(address))
        return address
    return False

# Driver: keep fetching the next uncrawled URL until moreToCrawl() reports
# that nothing is left. There is a 2-second pause after every page
# (presumably to avoid hammering the server).
toCrawl = moreToCrawl()
while toCrawl:
    processOneUrl(toCrawl)
    time.sleep(2)
    toCrawl = moreToCrawl()

# Final report: every recorded URL with its crawled flag.
pprint.pprint(resultUrl)
# -*- coding: utf-8 -*-
# python 2

# Crawl a website and list every URL found under one given path prefix.

# Starting page of the crawl. It also serves as the prefix filter: only
# links whose absolute URL starts with this string are recorded.
inputURL = "http://ergoemacs.github.io/ergoemacs-mode/"

# Crawl state. Each key is a URL we want; each value is a flag:
# False = discovered but not fetched yet, True = already crawled.
resultUrl = dict([(inputURL, False)])

# from urllib import urlopen
import urllib2
import urlparse
import time
import pprint

import BeautifulSoup # get html links

def processOneUrl(url):
    """Fetch one page and register every in-scope link found on it.

    Each <a> tag's href is resolved against ``url``. Any resulting absolute
    URL that starts with ``inputURL`` and is not yet a key of ``resultUrl``
    is added with the value False (discovered, not crawled yet). ``url``
    itself is always marked True, whether or not the fetch succeeded, so a
    failing page is never retried.

    Args:
        url: absolute URL of the page to fetch.

    Returns:
        None. The module-level dict ``resultUrl`` is updated in place.
    """
    # Mark first: success or failure, this URL must not be picked again.
    resultUrl[url] = True
    try:
        html_page = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup.BeautifulSoup(html_page)
        finally:
            # Release the connection even when parsing fails.
            html_page.close()
        for link in soup.findAll('a'):
            href = link.get('href')
            if not href:
                # Anchor without a target: nothing to follow.
                continue
            fullurl = urlparse.urljoin(url, href)
            if fullurl.startswith(inputURL) and fullurl not in resultUrl:
                resultUrl[fullurl] = False
    except Exception as err:
        # Best effort: a page that cannot be fetched or parsed (e.g. a 404)
        # is reported and skipped. KeyboardInterrupt and SystemExit are not
        # subclasses of Exception, so Ctrl-C still stops the crawl.
        # %r keeps the message printable even for non-ASCII text.
        print("processOneUrl failed for %r: %r" % (url, err))

def moreToCrawl():
    """Return the first URL in resultUrl that is not crawled yet.

    Scans the module-level dict ``resultUrl`` in its iteration order,
    prints a progress line for the URL it picks, and returns that URL.
    Returns False when every entry is already marked as crawled (or the
    dict is empty).
    """
    for address in resultUrl:
        if resultUrl[address]:
            continue
        print("moreToCrawl found {}".format(address))
        return address
    return False

# Driver: keep fetching the next uncrawled URL until moreToCrawl() reports
# that nothing is left. There is a 2-second pause after every page
# (presumably to avoid hammering the server).
toCrawl = moreToCrawl()
while toCrawl:
    processOneUrl(toCrawl)
    time.sleep(2)
    toCrawl = moreToCrawl()

# Final report: every recorded URL with its crawled flag.
pprint.pprint(resultUrl)

Sample output (from the Python 2 version):

◆ python xx-testscript-4925873.py
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/design-basis.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/testimonials.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/archives.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/key-setup.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/bug-report.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/faq.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/changelog.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/customize-keys.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/minor-modes.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/features.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/smart-commands.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/aliases.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/tags/index.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/standard-shortcuts.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/gradual-adoption.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/index.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/key-themes.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/tags/design-basis.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/keyboard-layouts.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/cua-conflict.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/roadmap.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/banish-key-chords.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/system-wide.html
{'http://ergoemacs.github.io/ergoemacs-mode/': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/aliases.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/archives.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/banish-key-chords.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/bug-report.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/changelog.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/cua-conflict.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/customize-keys.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/design-basis.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/faq.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/features.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/gradual-adoption.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/index.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/key-setup.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/key-themes.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/keyboard-layouts.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/minor-modes.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/roadmap.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/smart-commands.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/standard-shortcuts.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/system-wide.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/tags/design-basis.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/tags/index.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/testimonials.html': True}

2014-01-25: Thanks to Sorawee Porncharoenwase [ https://plus.google.com/+SoraweePorncharoenwase/posts ] for the improvements.

discuss on Google Plus https://plus.google.com/+XahLee/posts/63YyJ79TypK

If you have a question, put $5 at patreon and message me.

Python

  1. Python 3 Basics
  2. Python 2 Basics
  3. Python 2 and 3 Difference
  4. Print Version
  5. Builtin Help
  6. Quote String
  7. String Methods
  8. Format String
  9. Operators
  10. Complex Numbers
  11. True, False
  12. if then else
  13. Loop
  14. List Basics
  15. Loop Thru List
  16. Map f to List
  17. Copy Nested List
  18. List Comprehension
  19. List Methods
  20. Sort
  21. Dictionary
  22. Loop Thru Dict
  23. Dict Methods
  24. Tuple
  25. Sets
  26. Function
  27. Closure
  28. 2 Closure
  29. Decorator
  30. Class
  31. Object, ID, Type
  32. List Modules
  33. Write a Module
  34. Unicode 🐍

Regex

  1. Regex Basics
  2. Regex Reference

Text Processing

  1. Read/Write File
  2. Traverse Directory
  3. File Path
  4. Process Unicode
  5. Convert File Encoding
  6. Find Replace in dir
  7. Find Replace by Regex
  8. Count Word Frequency

Web

  1. Send Email
  2. GET Web Page
  3. Web Crawler
  4. HTTP POST

Misc

  1. JSON
  2. Find Script Path
  3. Get Env Var
  4. System Call
  5. Decompress Gzip
  6. Append String in Loop
  7. Timing f timeit
  8. Keyword Arg Default Value Unstable
  9. Check Page Load Size
  10. Thumbnail Generation