Golang: Validate Links

By Xah Lee. Date: . Last updated: .

Here's a script that checks all local links and inline image links in the HTML files of a given directory, and prints a report.

// -*- coding: utf-8 -*-

/* [

File name:
check_local_links.go

Description:
Given a directory, check all local links and inline image links in its HTML files.
Print a report.
In Emacs, run M-x xah-find-output-mode (from xah-find.el) on the output to make it pretty and to jump to links.

website:
http://xahlee.info/golang/golang_validate_links.html

Version: 2018-11-24 2022-07-24 2022-07-29

] */

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"
	"unicode/utf8"
)

// inDir is the directory where the scan starts. It must be a full path;
// write a directory with a trailing slash. If it names a file, the parent
// directory is scanned.
var inDir = "c:/Users/xah/web/"

// fnameRegex selects, by base name, the files whose links are checked.
const fnameRegex = `\.xml$|\.html$`

var (
	// dirsToSkip lists directory base names that the walk does not descend into.
	dirsToSkip = []string{
		".git",
		"emacs_manual",
		"godoc",
		"clojure-doc-1.8",
		"js_es2011",
		"js_es2015",
		"node_api",
	}

	// xahSiteDomainRegexs are patterns for relative links that reach into a
	// sibling site directory. A link value matching any of them is reported
	// as bad by isLinkBad.
	xahSiteDomainRegexs = []string{
		`\.\./ergoemacs_org`,
		`\.\./wordyenglish_com`,
		`\.\./xaharts_org`,
		`\.\./xahlee_info`,
		`\.\./xahlee_org`,
		`\.\./xahmusic_org`,
		`\.\./xahsl_org`,
	}
)

// fileSep is the separator line printed after each reported link.
const fileSep = "━━━━━M6bcX━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

// Bracket runes that delimit the fields of a report line.
const (
	// occurBracketL and occurBracketR wrap the link value as it occurs in the file.
	occurBracketL = '⦋'
	occurBracketR = '⦌'

	// posBracketL and posBracketR wrap the position of the link.
	posBracketL = '⁅'
	posBracketR = '⁆'

	// fileBracketL and fileBracketR wrap the path of the file containing the link.
	fileBracketL = '❬'
	fileBracketR = '❭'
)

// HHH___________________________________________________________________

// xahDomainRegex matches a link that starts with http:// followed by one of
// xahlee's domains. The dots are escaped so that they match only a literal
// dot, and the pattern is compiled once instead of on every call.
var xahDomainRegex = regexp.MustCompile(`^http://(?:ergoemacs\.org|wordyenglish\.com|xaharts\.org|xahlee\.info|xahlee\.org|xahmusic\.org|xahporn\.org|xahsl\.org/)`)

// isXahDomain reports whether linkVal starts with http:// followed by one of
// xahlee's domains. Links using https:// do not match.
func isXahDomain(linkVal string) bool {
	return xahDomainRegex.MatchString(linkVal)
}

// xahSiteURLRoots pairs each xah site URL root with the local directory that
// holds that site's files.
// NOTE(review): the local paths carry no drive letter, unlike inDir
// ("c:/Users/xah/web/") — confirm this is intended.
var xahSiteURLRoots = [][2]string{
	{"http://ergoemacs.org/", "/Users/xah/web/ergoemacs_org/"},
	{"http://wordyenglish.com/", "/Users/xah/web/wordyenglish_com/"},
	{"http://xaharts.org/", "/Users/xah/web/xaharts_org/"},
	{"http://xahlee.info/", "/Users/xah/web/xahlee_info/"},
	{"http://xahlee.org/", "/Users/xah/web/xahlee_org/"},
	{"http://xahmusic.org/", "/Users/xah/web/xahmusic_org/"},
	{"http://xahporn.org/", "/Users/xah/web/xahporn_org/"},
	{"http://xahsl.org/", "/Users/xah/web/xahsl_org/"},
}

// xahSiteUrlToFilePath changes a xah http link to a local file path.
// For example
//
//	http://xahlee.info/index.html
//
// becomes
//
//	/Users/xah/web/xahlee_info/index.html
//
// If ss does not start with one of the xah site URL roots it is returned
// unchanged. Only a leading URL root is rewritten, so a link that merely
// contains a xah URL somewhere inside it (for example in a query string) is
// left alone.
func xahSiteUrlToFilePath(ss string) string {
	for _, pair := range xahSiteURLRoots {
		urlRoot, dirRoot := pair[0], pair[1]
		if strings.HasPrefix(ss, urlRoot) {
			return dirRoot + ss[len(urlRoot):]
		}
	}
	return ss
}

// HHH___________________________________________________________________

// stringMatchAnyRegex reports whether at least one pattern in regexes matches ss.
// Patterns are tried in order and the search stops at the first match.
// It panics if a pattern that gets tried is not a valid regular expression.
func stringMatchAnyRegex(ss string, regexes []string) bool {
	for i := 0; i < len(regexes); i++ {
		matched, compileErr := regexp.MatchString(regexes[i], ss)
		switch {
		case compileErr != nil:
			panic(compileErr)
		case matched:
			return true
		}
	}
	return false
}

// strInArray reports whether x is equal to at least one element of y.
func strInArray(x string, y []string) bool {
	for i := 0; i < len(y); i++ {
		if y[i] == x {
			return true
		}
	}
	return false
}

// isFileExist reports whether os.Stat succeeds on fpath.
//
// Any Stat error counts as "does not exist", not only a not-exist error.
// This matters for a link checker: a path that runs through a regular file
// (for example "page.html/child") fails with a not-a-directory error, and
// treating that as "exists" would hide a broken link.
func isFileExist(fpath string) bool {
	_, err := os.Stat(fpath)
	return err == nil
}

// removeFrac removes the fragment part of a url, that is, the first "#" and
// everything after it. For example "a.html#sec" becomes "a.html".
// A url without "#" is returned unchanged.
//
// The cut is made at the first "#" because that is where the fragment
// starts; any later "#" is part of the fragment itself.
func removeFrac(url string) string {
	if i := strings.IndexByte(url, '#'); i != -1 {
		return url[:i]
	}
	return url
}

// Patterns used by getLinkIndexes, compiled once rather than once per call
// (and, for the attribute pattern, once per tag).
var (
	// openingTagRegex matches an opening tag that has at least one attribute.
	openingTagRegex = regexp.MustCompile(`<[A-Za-z][A-Za-z0-9]* [^<>]+>`)
	// linkAttrRegex matches a double-quoted href, src or poster attribute;
	// submatch 1 is the attribute value.
	linkAttrRegex = regexp.MustCompile(` (?:href|src|poster)="([^"]+)"`)
)

// getLinkIndexes returns the position of every link in the html text textB.
// A link is the value of a double-quoted href, src or poster attribute inside
// an opening tag.
//
// Each element of the result is a pair [pos1 pos2] of byte indexes into
// textB such that textB[pos1:pos2] is the link value, e.g. i/cat.png.
// The result looks like
//
//	[[100 108] ...]
//
// and is empty (not nil) when no link is found.
func getLinkIndexes(textB []byte) [][]int {
	result := make([][]int, 0, 300)
	for _, tag := range openingTagRegex.FindAllIndex(textB, -1) {
		tagStart := tag[0]
		// Attribute positions are relative to the tag, so shift them by
		// tagStart to index into textB.
		for _, m := range linkAttrRegex.FindAllSubmatchIndex(textB[tagStart:tag[1]], -1) {
			result = append(result, []int{tagStart + m[2], tagStart + m[3]})
		}
	}
	return result
}

// printBad prints one bad-link report to standard output: a line with the
// file path, the position and the link value, each wrapped in its bracket
// pair, followed by the fileSep separator line and a blank line.
//
// fileFullPath is the full path of the file in which the link occurs.
// linkPath is the full path the link resolves to; it is accepted for the
// caller's convenience but is not part of the printed report.
// linkVal is the value that occurs in href= or src=.
// startPos is the position of the link in the file, already formatted as text.
//
// It always returns nil.
func printBad(fileFullPath string, linkPath string, linkVal string, startPos string) error {
	fmt.Printf("%c%v%c %c%s%c %c%s%c\n", fileBracketL, fileFullPath, fileBracketR, posBracketL, startPos, posBracketR, occurBracketL, linkVal, occurBracketR)
	// Same bytes as Println(fileSep, "\n") — separator, space, two newlines —
	// but without the redundant-newline Println call that go vet rejects.
	fmt.Printf("%v \n\n", fileSep)
	return nil
}

// isLinkBad reports whether a link is bad.
// linkVal is the link as written in the file; linkPath is the file path it
// resolves to.
//
// A link is bad when any of these holds:
//   - linkVal starts with file://, c:/ or C:/
//   - linkVal matches one of xahSiteDomainRegexs
//   - no file exists at linkPath
func isLinkBad(linkVal string, linkPath string) bool {
	for _, prefix := range []string{"file://", "c:/", "C:/"} {
		if strings.HasPrefix(linkVal, prefix) {
			return true
		}
	}
	if stringMatchAnyRegex(linkVal, xahSiteDomainRegexs) {
		return true
	}
	return !isFileExist(linkPath)
}

// skipLinkRegex matches the link schemes that are not checked. It is compiled
// once instead of on every call.
var skipLinkRegex = regexp.MustCompile(`^http://|^https://|^mailto:|^irc:|^ftp:|^javascript:`)

// isSkipLink reports whether a link is to be skipped for checking, which is
// the case when linkVal starts with http://, https://, mailto:, irc:, ftp: or
// javascript:. The comparison is case-sensitive.
func isSkipLink(linkVal string) bool {
	return skipLinkRegex.MatchString(linkVal)
}

// checkFile, takes a html file path, extract all links, if local link and file does not exist, print it
func checkFile(fullPath string) error {
	textB, er := os.ReadFile(fullPath)
	if er != nil {
		panic(er)
	}
	var allLinks = getLinkIndexes(textB)
	for _, val := range allLinks {
		var pos1 = val[0]
		var pos2 = val[1]
		var linkVal = string(textB[pos1:pos2])
		var linkValNoFrag = removeFrac(linkVal)

		if isXahDomain(linkVal) {
			if isLinkBad(linkVal, xahSiteUrlToFilePath(linkValNoFrag)) {
				printBad(fullPath, xahSiteUrlToFilePath(linkValNoFrag), linkVal, fmt.Sprintf("%d", utf8.RuneCount(textB[0:pos1])))
			}
		} else {
			if !isSkipLink(linkVal) {
				var linkFullPath = filepath.Clean(filepath.Dir(fullPath) + "/" + linkValNoFrag)
				if isLinkBad(linkVal, linkFullPath) {
					printBad(fullPath, linkFullPath, linkVal, fmt.Sprintf("%d", utf8.RuneCount(textB[0:pos1])))
				}
			}
		}
	}
	return nil
}

// main walks inDir, checks the links of every file whose base name matches
// fnameRegex, and prints a report to standard output. Directories whose base
// name is listed in dirsToSkip are not descended into.
func main() {
	scriptName, errPath := os.Executable()
	if errPath != nil {
		panic(errPath)
	}

	// If inDir is an existing directory, walk it as is, with or without a
	// trailing slash. Otherwise (a file, or a path that cannot be stat'ed)
	// use its parent directory. Calling filepath.Dir unconditionally would
	// walk the parent of a directory written without a trailing slash.
	if info, errStat := os.Stat(inDir); errStat == nil && info.IsDir() {
		inDir = filepath.Clean(inDir)
	} else {
		inDir = filepath.Dir(inDir)
	}

	fmt.Println("-*- coding: utf-8; mode: xah-find-output -*-")
	fmt.Printf("%v\n", time.Now())
	fmt.Printf("Script: %v\n", filepath.Base(scriptName))
	fmt.Printf("In dir: %v\n", inDir)
	fmt.Printf("File regex filter: %v\n", fnameRegex)
	fmt.Println()
	// Rule line followed by one blank line.
	fmt.Print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n")

	// Compile the file name filter once; panics if fnameRegex is invalid.
	var fnameRe = regexp.MustCompile(fnameRegex)

	var pWalker = func(pathX string, infoX os.FileInfo, errX error) error {
		if errX != nil {
			fmt.Printf("error 「%v」 at a path 「%q」\n", errX, pathX)
			return errX
		}
		if infoX.IsDir() {
			if strInArray(filepath.Base(pathX), dirsToSkip) {
				return filepath.SkipDir
			}
			return nil
		}
		if fnameRe.MatchString(filepath.Base(pathX)) {
			// Report a failed check and keep walking, so that one bad file
			// does not hide the results for the rest.
			if errCheck := checkFile(pathX); errCheck != nil {
				fmt.Printf("error checking file %q: %v\n", pathX, errCheck)
			}
		}
		return nil
	}

	err := filepath.Walk(inDir, pWalker)
	if err != nil {
		fmt.Printf("error walking the path %q: %v\n", inDir, err)
	}

	fmt.Printf("\n%v\n", "Done. bad links are printed above, if any.")
}
validate link output 2021-10-16
validate link output 2021-10-16

Validate HTML File Local Links