Golang: Validate Links
Here's a script that checks all local links and inline image links in the HTML files of a given directory, then prints a report.
// -*- coding: utf-8 -*-

/*
[
File name: check_local_links.go
Description: given a dir, check all local links and inline image links in the
html files. Print a report.
In emacs, you can M-x xah-find-output-mode from xah-find.el to make it pretty
and jump to links.
website: http://xahlee.info/golang/golang_validate_links.html
Version: 2018-11-24 2022-07-24 2022-07-29
]
*/
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"
	"unicode/utf8"
)

// inDir is the directory to start from. It must be a full path. If it names a
// file, the file's parent directory is used.
var inDir = "c:/Users/xah/web/"

// fnameRegex selects, by base name, the files whose links get checked.
const fnameRegex = `\.xml$|\.html$`

// dirsToSkip lists directory base names that are never descended into.
var dirsToSkip = []string{
	".git",
	"emacs_manual",
	"godoc",
	"clojure-doc-1.8",
	"js_es2011",
	"js_es2015",
	"node_api",
}

// xahSiteDomainRegexs are patterns for relative links that climb out of one
// site directory into a sibling site directory. Such links are always
// reported as bad, even when the target file exists on disk.
var xahSiteDomainRegexs = []string{
	"\\.\\./ergoemacs_org",
	"\\.\\./wordyenglish_com",
	"\\.\\./xaharts_org",
	"\\.\\./xahlee_info",
	"\\.\\./xahlee_org",
	"\\.\\./xahmusic_org",
	"\\.\\./xahsl_org",
}

// xahWebRoot is the local directory that holds one sub-directory per site.
// Note it has no drive letter, unlike inDir.
const xahWebRoot = "/Users/xah/web/"

// xahSites maps each of the author's domains to its directory under
// xahWebRoot. It is the single source of truth for both isXahDomain and
// xahSiteUrlToFilePath, so the two can never disagree.
var xahSites = []struct {
	domain string
	dir    string
}{
	{"ergoemacs.org", "ergoemacs_org"},
	{"wordyenglish.com", "wordyenglish_com"},
	{"xaharts.org", "xaharts_org"},
	{"xahlee.info", "xahlee_info"},
	{"xahlee.org", "xahlee_org"},
	{"xahmusic.org", "xahmusic_org"},
	{"xahporn.org", "xahporn_org"},
	{"xahsl.org", "xahsl_org"},
}

// Markers used in the report. xah-find-output-mode relies on them.
const (
	fileSep       = "━━━━━M6bcX━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
	occurBracketL = '⦋'
	occurBracketR = '⦌'
	posBracketL   = '⁅'
	posBracketR   = '⁆'
	fileBracketL  = '❬'
	fileBracketR  = '❭'
)

// Regexes are compiled once here instead of on every call.
var (
	// fnameRe is the compiled form of fnameRegex.
	fnameRe = regexp.MustCompile(fnameRegex)

	// openingTagRe matches an opening html tag that has at least one attribute.
	openingTagRe = regexp.MustCompile(`<[A-Za-z][A-Za-z0-9]* [^<>]+>`)

	// linkAttrRe matches a double-quoted href/src/poster attribute inside a
	// tag. Group 1 is the attribute value.
	linkAttrRe = regexp.MustCompile(` (?:href|src|poster)="([^"]+)"`)

	// skipLinkRe matches links that are not local files and are never checked.
	skipLinkRe = regexp.MustCompile(`^http://|^https://|^mailto:|^irc:|^ftp:|^javascript:`)

	// xahDomainRe matches a http url whose host is exactly one of xahSites.
	xahDomainRe = buildXahDomainRe()

	// crossSiteRes is the compiled form of xahSiteDomainRegexs.
	crossSiteRes = mustCompileAll(xahSiteDomainRegexs)
)

// buildXahDomainRe builds the regex used by isXahDomain from xahSites. Each
// domain is quoted so that "." only matches a literal dot, and the host must
// be followed by "/", "?", "#" or end of string so that a longer host name
// such as xahlee.information.example does not match.
func buildXahDomainRe() *regexp.Regexp {
	alts := make([]string, 0, len(xahSites))
	for _, site := range xahSites {
		alts = append(alts, regexp.QuoteMeta(site.domain))
	}
	return regexp.MustCompile(`^http://(?:` + strings.Join(alts, "|") + `)(?:[/?#]|$)`)
}

// mustCompileAll compiles every pattern, panicking on an invalid one.
func mustCompileAll(patterns []string) []*regexp.Regexp {
	out := make([]*regexp.Regexp, 0, len(patterns))
	for _, p := range patterns {
		out = append(out, regexp.MustCompile(p))
	}
	return out
}

// HHHH------------------------------

// isXahDomain returns true if linkVal is a http url on one of the domains in
// xahSites. https urls are not matched.
func isXahDomain(linkVal string) bool {
	return xahDomainRe.MatchString(linkVal)
}

// xahSiteUrlToFilePath changes a http url on one of the xahSites domains to
// the local file path under xahWebRoot, eg
// http://xahlee.info/index.html
// becomes
// /Users/xah/web/xahlee_info/index.html
// A bare domain url (no trailing slash) maps to the site directory.
// Only a leading domain is rewritten. Any other string is returned unchanged.
func xahSiteUrlToFilePath(ss string) string {
	for _, site := range xahSites {
		base := "http://" + site.domain
		if ss == base {
			return xahWebRoot + site.dir + "/"
		}
		if strings.HasPrefix(ss, base+"/") {
			// ss[len(base):] begins with the "/" that follows the host.
			return xahWebRoot + site.dir + ss[len(base):]
		}
	}
	return ss
}

// HHHH------------------------------

// stringMatchAnyRegex returns true if ss is matched by any of regexes.
// It panics if one of the patterns is not a valid regex.
// version 2018-09-01
func stringMatchAnyRegex(ss string, regexes []string) bool {
	for _, re := range regexes {
		matched, err := regexp.MatchString(re, ss)
		if err != nil {
			panic(err)
		}
		if matched {
			return true
		}
	}
	return false
}

// strInArray returns true if x equals any element of y.
// version 2018-11-12
func strInArray(x string, y []string) bool {
	for _, v := range y {
		if x == v {
			return true
		}
	}
	return false
}

// isFileExist returns true only if fpath can be stat'ed. Any Stat error
// (missing file, a path component that is not a directory, invalid name,
// permission problem) counts as "does not exist", so a link that cannot be
// verified gets reported instead of silently passing.
func isFileExist(fpath string) bool {
	_, err := os.Stat(fpath)
	return err == nil
}

// removeFrac removes the fragment part of a url, ie everything from the
// first "#" on. A url without "#" is returned unchanged.
func removeFrac(url string) string {
	if i := strings.IndexByte(url, '#'); i >= 0 {
		return url[:i]
	}
	return url
}

// getLinkIndexes returns the position of every link value in a html text, in
// the format [pos1 pos2]. The positions are byte indexes into textB, and
// textB[pos1:pos2] is the link, e.g. i/cat.png
// The return value looks like this
// [[100 108] ...]
// Only double-quoted href, src and poster attributes that are preceded by a
// space inside an opening tag are found.
func getLinkIndexes(textB []byte) [][]int {
	result := make([][]int, 0, 300)
	for _, tag := range openingTagRe.FindAllIndex(textB, -1) {
		tagStart := tag[0]
		for _, m := range linkAttrRe.FindAllSubmatchIndex(textB[tagStart:tag[1]], -1) {
			// m[2] and m[3] bound group 1, relative to the start of the tag.
			result = append(result, []int{tagStart + m[2], tagStart + m[3]})
		}
	}
	return result
}

// printBad prints the report entry for one bad link.
// fileFullPath is the full path of the file the link occurs in.
// linkPath is the full path the link resolves to. It is not printed; the
// parameter is kept so that existing callers stay valid.
// linkVal is the value that occurs in href= or src=
// startPos is the position of the link in the file, already formatted.
// The returned error is always nil.
func printBad(fileFullPath string, linkPath string, linkVal string, startPos string) error {
	fmt.Printf("%c%v%c %c%s%c %c%s%c\n",
		fileBracketL, fileFullPath, fileBracketR,
		posBracketL, startPos, posBracketR,
		occurBracketL, linkVal, occurBracketR)
	// Separator, one space, then a blank line.
	fmt.Printf("%s \n\n", fileSep)
	return nil
}

// isLinkBad returns true if the link is bad. A link is bad if its value is a
// file:// url or starts with a c:/ drive path, if it matches any of
// xahSiteDomainRegexs, or if linkPath does not exist.
func isLinkBad(linkVal string, linkPath string) bool {
	for _, prefix := range []string{"file://", "c:/", "C:/"} {
		if strings.HasPrefix(linkVal, prefix) {
			return true
		}
	}
	for _, re := range crossSiteRes {
		if re.MatchString(linkVal) {
			return true
		}
	}
	return !isFileExist(linkPath)
}

// isSkipLink returns true if the link is to be skipped for checking, ie it
// starts with http://, https://, mailto:, irc:, ftp: or javascript:
func isSkipLink(linkVal string) bool {
	return skipLinkRe.MatchString(linkVal)
}

// checkFile takes a html file path and extracts all links. Each link to one
// of the xahSites domains, and each local link, is resolved to a file path;
// if isLinkBad says the link is bad, it is printed with printBad.
// It returns an error if the file cannot be read.
func checkFile(fullPath string) error {
	textB, err := os.ReadFile(fullPath)
	if err != nil {
		return fmt.Errorf("reading %q: %w", fullPath, err)
	}

	dir := filepath.Dir(fullPath)
	for _, span := range getLinkIndexes(textB) {
		start, end := span[0], span[1]
		linkVal := string(textB[start:end])
		noFrag := removeFrac(linkVal)

		var linkPath string
		switch {
		case isXahDomain(linkVal):
			linkPath = xahSiteUrlToFilePath(noFrag)
		case isSkipLink(linkVal):
			continue
		default:
			linkPath = filepath.Join(dir, noFrag)
		}

		if !isLinkBad(linkVal, linkPath) {
			continue
		}
		// The position is reported as a count of characters, not bytes.
		pos := fmt.Sprintf("%d", utf8.RuneCount(textB[:start]))
		if err := printBad(fullPath, linkPath, linkVal, pos); err != nil {
			return err
		}
	}
	return nil
}

// resolveStartDir returns the directory to walk. If p is an existing
// directory it is used itself, with or without a trailing slash. Otherwise
// the parent of p is used.
func resolveStartDir(p string) string {
	if info, err := os.Stat(p); err == nil && info.IsDir() {
		return filepath.Clean(p)
	}
	return filepath.Dir(p)
}

// main walks inDir, checks every file whose name matches fnameRegex, and
// prints the report to stdout.
func main() {
	scriptName, err := os.Executable()
	if err != nil {
		panic(err)
	}

	inDir = resolveStartDir(inDir)

	fmt.Println("-*- coding: utf-8; mode: xah-find-output -*-")
	fmt.Printf("%v\n", time.Now())
	fmt.Printf("Script: %v\n", filepath.Base(scriptName))
	fmt.Printf("In dir: %v\n", inDir)
	fmt.Printf("File regex filter: %v\n", fnameRegex)
	fmt.Println()
	fmt.Print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n")

	walker := func(pathX string, infoX os.FileInfo, errX error) error {
		if errX != nil {
			fmt.Printf("error 「%v」 at a path 「%q」\n", errX, pathX)
			return errX
		}
		if infoX.IsDir() {
			if strInArray(filepath.Base(pathX), dirsToSkip) {
				return filepath.SkipDir
			}
			return nil
		}
		if !fnameRe.MatchString(filepath.Base(pathX)) {
			return nil
		}
		if err := checkFile(pathX); err != nil {
			// One unreadable file must not abort the whole run. Report it
			// on stderr so the report on stdout stays parseable.
			fmt.Fprintf(os.Stderr, "skipping file: %v\n", err)
		}
		return nil
	}

	if err := filepath.Walk(inDir, walker); err != nil {
		fmt.Printf("error walking the path %q: %v\n", inDir, err)
	}

	fmt.Printf("\n%v\n", "Done. bad links are printed above, if any.")
}