Golang: Validate Links

By Xah Lee. Date: . Last updated: .

Here's a script that checks all local links and inline image links in the HTML files of a given directory, and prints a report of the broken ones.

// given a dir, check all local links and inline image links in the html files. Print a report.
// http://xahlee.info/golang/golang_validate_links.html
// Version 2018-08-30

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"
)

// inDir is the directory to start from. It must be a full path. If it is a
// file, the parent dir is used.
var inDir = "/Users/xah/web/"

// fnameRegex is a regexp pattern matched against each file's base name
// during the walk; only files that match are checked.
const fnameRegex = `\.html$`

// dirsToSkip lists directory base names that the walk does not descend into.
var dirsToSkip = []string{
	".git",
	"emacs_manual",
	"godoc",
	"REC-SVG11-20110816",
	"clojure-doc-1.8",
	"css_2.1_spec",
	"css_transitions",
	"javascript_ecma-262_5.1_2011",
	"javascript_ecma-262_6_2015",
	"javascript_es2016",
	"javascript_es6",
	"jquery_doc",
	"node_api",
	"ocaml_doc",
	"python_doc_2.7.6",
	"python_doc_3.3.3",
	"w3c_ui_events",
	"xx_arabian_nights_full_2017-05-13",
}

// ------------------------------------------------------------

// pass reports whether x is absent from y: it returns true when no element
// of y equals x, and false as soon as an equal element is found.
// An empty or nil y therefore always passes.
// version 2018-11-12
func pass(x string, y []string) bool {
	for i := 0; i < len(y); i++ {
		if y[i] == x {
			return false
		}
	}
	return true
}

// isFileExist reports whether something exists at fpath, i.e. whether
// os.Stat succeeds on it.
//
// Every Stat failure counts as "does not exist", not only the not-exist
// error. Otherwise a path such as "page.html/extra" (parent is a regular
// file) or a path containing an invalid byte would be reported as existing,
// and the broken link that produced it would go unnoticed.
func isFileExist(fpath string) bool {
	_, err := os.Stat(fpath)
	return err == nil
}

// removeFrac removes the fragment part of a url, that is, the first "#" and
// everything after it. For example "cat.html#food" becomes "cat.html" and
// "#top" becomes "". A url without "#" is returned unchanged.
//
// The cut is made at the first "#", because that is where a fragment
// starts; cutting at the last one would leave "a.html#x" for "a.html#x#y".
// version 2018-11-12
func removeFrac(url string) string {
	if i := strings.Index(url, "#"); i != -1 {
		return url[:i]
	}
	return url
}

// linkAttrRegex matches the start of a tag up to and including a href or src
// attribute, capturing the attribute value. Everything between the "<" and
// the attribute name must consist of lowercase letters, digits, space and
// the characters - _ = ", so a tag with anything else in front of its
// href/src is not matched.
//
// Earlier patterns that were tried:
//   <[-_=a-z0-9" ]+(?:href|src|poster)+="([^"]+)"
//    (?:href|src|poster)="([^"]+)"
//
// It is compiled once here, not on every getLinks call.
var linkAttrRegex = regexp.MustCompile(`<[-_=a-z0-9" ]+(?:href|src)="([^"]+)"`)

// getLinks returns all href and src links found in ss, the text of a html
// file. Each element of the result is a 2-item slice: the whole matched
// text, then the link value. The return value looks like this
// [[ `<a href="cat.html"` `cat.html`] ...]
// The result is nil when there is no match.
// version 2018-11-12
func getLinks(ss string) [][]string {
	return linkAttrRegex.FindAllStringSubmatch(ss, -1)
}

// xahSiteRoots maps the url prefix of each xah site to the local directory
// that holds that site's files.
var xahSiteRoots = []struct{ urlPrefix, dirPrefix string }{
	{"http://ergoemacs.org/", "/Users/xah/web/ergoemacs_org/"},
	{"http://wordyenglish.com/", "/Users/xah/web/wordyenglish_com/"},
	{"http://xaharts.org/", "/Users/xah/web/xaharts_org/"},
	{"http://xahlee.info/", "/Users/xah/web/xahlee_info/"},
	{"http://xahlee.org/", "/Users/xah/web/xahlee_org/"},
	{"http://xahmusic.org/", "/Users/xah/web/xahmusic_org/"},
	{"http://xahsl.org/", "/Users/xah/web/xahsl_org/"},
}

// transformLink changes a xah http link to a file full path.
// eg http://ergoemacs.org/index.html becomes /Users/xah/web/ergoemacs_org/index.html
// If ss is not a xah site link, it is returned unchanged.
//
// Only a site url at the very start of ss is rewritten. A link to some other
// site that merely contains a xah url (for example inside an archive or
// redirect url) is left alone.
// version 2018-11-12
func transformLink(ss string) string {
	for _, site := range xahSiteRoots {
		if strings.HasPrefix(ss, site.urlPrefix) {
			return site.dirPrefix + ss[len(site.urlPrefix):]
		}
	}
	return ss
}

// xahSiteRegex matches a link that starts with the http url of one of the
// xah sites. Dots are escaped and the "/" after the host is required, so
// look-alike hosts (eg http://xahlee.org.example.com/) and bare-domain links
// (eg http://xahlee.info) are treated as ordinary external links and are not
// looked up on disk under a path that can never exist.
var xahSiteRegex = regexp.MustCompile(`^http://(?:ergoemacs\.org|wordyenglish\.com|xaharts\.org|xahlee\.info|xahlee\.org|xahmusic\.org|xahsl\.org)/`)

// skipLinkRegex matches links that are not checked against local files:
// protocol-relative links and http, https, mailto, irc, ftp, javascript links.
var skipLinkRegex = regexp.MustCompile(`^//|^http:|^https:|^mailto:|^irc:|^ftp:|^javascript:`)

// doFile takes a html file path and extracts all links from the file. For
// each link that should map to a local file, it checks that the file exists
// and, if not, prints a line with the html file path and the link.
//
// A xah site link is mapped to a local path by transformLink. Any other link
// not matched by skipLinkRegex is taken as relative to the directory of
// path. The fragment part (#...) is dropped before the check.
//
// doFile panics if the file cannot be read. The returned error is always nil.
func doFile(path string) error {
	contentBytes, err := ioutil.ReadFile(path)
	if err != nil {
		panic(err)
	}

	for _, m := range getLinks(string(contentBytes)) {
		linkVal := m[1] // m[0] is the whole match, m[1] the attribute value

		switch {
		case xahSiteRegex.MatchString(linkVal):
			lnkPath := removeFrac(transformLink(linkVal))
			if !isFileExist(lnkPath) {
				fmt.Printf("〈%v〉 %#v\n", path, linkVal)
			}
		case skipLinkRegex.MatchString(linkVal):
			// external link, nothing on disk to check
		default:
			noFrag := removeFrac(linkVal)
			// Join also cleans the result, resolving "../" segments.
			linkedPath := filepath.Join(filepath.Dir(path), noFrag)
			if !isFileExist(linkedPath) {
				fmt.Printf("〈%v〉 %#v\n", path, noFrag)
			}
		}
	}
	return nil
}

// main walks inDir, runs doFile on every file whose base name matches
// fnameRegex, and prints a report header before and a closing line after.
// Directories whose base name is in dirsToSkip are not descended into.
func main() {
	scriptName, errPath := os.Executable()
	if errPath != nil {
		panic(errPath)
	}

	// If inDir is a file, its parent dir is used. An existing directory is
	// only cleaned; taking filepath.Dir of it would silently walk its parent
	// whenever the path has no trailing slash.
	if info, errStat := os.Stat(inDir); errStat == nil && info.IsDir() {
		inDir = filepath.Clean(inDir)
	} else {
		inDir = filepath.Dir(inDir)
	}

	// Compile the file name filter once, not once per visited file.
	// MustCompile panics if fnameRegex is not a valid pattern.
	fnameRe := regexp.MustCompile(fnameRegex)

	fmt.Println("-*- coding: utf-8; mode: xah-find-output -*-")
	fmt.Printf("%v\n", time.Now())
	fmt.Printf("Script: %v\n", filepath.Base(scriptName))
	fmt.Printf("In dir: %v\n", inDir)
	fmt.Printf("File regex filter: %v\n", fnameRegex)
	fmt.Println()

	var pWalker = func(pathX string, infoX os.FileInfo, errX error) error {
		if errX != nil {
			fmt.Printf("error 「%v」 at a path 「%q」\n", errX, pathX)
			return errX
		}
		if infoX.IsDir() {
			if !pass(filepath.Base(pathX), dirsToSkip) {
				return filepath.SkipDir
			}
			return nil
		}
		if !fnameRe.MatchString(filepath.Base(pathX)) {
			return nil
		}
		// Report a failure on one file and keep going, so the rest of the
		// tree is still checked.
		if err := doFile(pathX); err != nil {
			fmt.Printf("error 「%v」 at a path 「%q」\n", err, pathX)
		}
		return nil
	}

	err := filepath.Walk(inDir, pWalker)
	if err != nil {
		fmt.Printf("error walking the path %q: %v\n", inDir, err)
	}

	fmt.Printf("\n%v\n", "Done. bad links are printed above, if any.")
}

If you have a question, put $5 at patreon and message me.

Golang

  1. Compile, Run
  2. Source Encoding
  3. Package, Import
  4. Comment
  5. Print
  6. String
  7. Rune
  8. String as Chars
  9. Variable
  10. Zero Value
  11. Constant
  12. If Then Else
  13. Switch/Case
  14. Loop
  15. Basic Types
  16. Array
  17. Slice
  18. Map
  19. Struct
  20. Function
  21. regexp
  22. Read File
  23. Write to File
  24. Walk Dir
  25. Check File Exist
  26. System Call
  27. Get Script Path
  28. Pointer
  29. Defer
  30. Random Number

Examples

  1. match any regexp
  2. Validate Links
  3. Generate Sitemap

Reference

  1. Go Spec