Golang: Generate Sitemap

By Xah Lee. Date: . Last updated: .

Here's a script that generates a sitemap.xml file for each directory in a given list.

// given a dir, generate a sitemap.xml file for all its html files
// version 2018-11-04

package main

import (
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"
)

// dirsToProcess lists the root directories to scan. main generates one
// sitemap per entry and saves it inside that same directory. Each entry must
// start with some key of dirPathToUrl, otherwise getMatched panics.
var dirsToProcess = []string{
	"/Users/xah/web/ergoemacs_org",
	"/Users/xah/web/wordyenglish_com",
	"/Users/xah/web/xaharts_org",
	"/Users/xah/web/xahlee_info",
	"/Users/xah/web/xahlee_org",
	"/Users/xah/web/xahmusic_org",
	"/Users/xah/web/xahsl_org",
}

// destFname is the file name of the generated sitemap. main joins it onto
// each directory of dirsToProcess to build the output path.
var destFname = "sitemap.xml"

// dirsToSkip lists directory base names that the walk does not descend into.
// A directory is skipped when its base name equals one of these exactly.
var dirsToSkip = []string{
	".git",
	"REC-SVG11-20110816",
	"clojure-doc-1.8",
	"css_2.1_spec",
	"css_transitions",
	"javascript_ecma-262_5.1_2011",
	"javascript_ecma-262_6_2015",
	"javascript_es2016",
	"javascript_es6",
	"node_api",
	"ocaml_doc",
}

// fnameRegex is the regular expression a file's base name must match for the
// file to be considered for the sitemap; all other files are ignored.
const fnameRegex = `\.html$`

// fnameRegexToSkip lists regular expressions tested against a file's base
// name. A file matching any of them is left out of the sitemap even when it
// matches fnameRegex.
// NOTE(review): the "." in the error-page patterns is unescaped (matches any
// character) and the patterns have no "$" anchor — confirm this is intended.
var fnameRegexToSkip = []string{
	`^xx`,
	`^403error.html`,
	`^404error.html`,
}

// dirRegexToSkip lists regular expressions tested against a directory's base
// name. A directory matching any of them is not descended into. This is
// checked in addition to the exact names in dirsToSkip (".git" appears in
// both lists).
var dirRegexToSkip = []string{
	`^\.git$`,
	`^xx`,
}

// dirPathToUrl maps a local root directory to the URL prefix that replaces it
// when a file path is turned into a sitemap URL. main looks up the entry for
// each directory of dirsToProcess through getMatched.
var dirPathToUrl = map[string]string{
	"/Users/xah/web/ergoemacs_org":    "http://ergoemacs.org",
	"/Users/xah/web/wordyenglish_com": "http://wordyenglish.com",
	"/Users/xah/web/xaharts_org":      "http://xaharts.org",
	"/Users/xah/web/xahlee_info":      "http://xahlee.info",
	"/Users/xah/web/xahlee_org":       "http://xahlee.org",
	"/Users/xah/web/xahmusic_org":     "http://xahmusic.org",
	"/Users/xah/web/xahsl_org":        "http://xahsl.org",
}

// getMatched returns the pair {key, value} from mm whose key is a prefix of ss.
// When several keys are prefixes of ss the longest key wins, so the result
// does not depend on Go's randomized map iteration order.
// It panics if no key of mm is a prefix of ss.
// version 2018-09-02
var getMatched = func(ss string, mm map[string]string) []string {
	var bestKey, bestVal string
	found := false
	for k, v := range mm {
		if !strings.HasPrefix(ss, k) {
			continue
		}
		// Two distinct prefixes of the same string never have equal length,
		// so comparing lengths picks a unique winner.
		if !found || len(k) > len(bestKey) {
			bestKey, bestVal, found = k, v, true
		}
	}
	if !found {
		panic("logic error. 83580")
	}
	return []string{bestKey, bestVal}
}

// equalAny reports whether x is identical to at least one element of y.
// A nil or empty y yields false.
// version 2018-09-02
func equalAny(x string, y []string) bool {
	found := false
	for i := 0; i < len(y) && !found; i++ {
		found = y[i] == x
	}
	return found
}

// matchAny reports whether at least one pattern in regexes matches ss.
// Patterns are tried in order and the scan stops at the first match.
// It panics with the compile error when a pattern reached during the scan is
// not a valid regular expression.
// version 2018-09-01
func matchAny(ss string, regexes []string) bool {
	for i := range regexes {
		compiled, compileErr := regexp.Compile(regexes[i])
		if compileErr != nil {
			panic(compileErr)
		}
		if compiled.MatchString(ss) {
			return true
		}
	}
	return false
}

// getHeadBytes returns up to the first n bytes of the file at path.
// A file shorter than n bytes, including an empty file, yields only the bytes
// it contains; that is not treated as an error.
// Any failure to open the file, or any read error other than reaching the end
// of the file, terminates the program through log.Fatal.
// version 2018-09-02
func getHeadBytes(path string, n int) []byte {
	file, err := os.Open(path) // For read access.
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()
	headBytes := make([]byte, n)
	// io.ReadFull keeps reading until the buffer is full or the file ends. It
	// reports io.EOF for an empty file and io.ErrUnexpectedEOF for a short
	// one; both simply mean fewer than n bytes were available.
	m, err := io.ReadFull(file, headBytes)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		log.Fatal(err)
	}
	return headBytes[:m]
}

// locEscaper entity-escapes the five characters that the sitemap protocol
// requires to be escaped inside data values such as <loc>.
var locEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	"'", "&apos;",
	`"`, "&quot;",
)

// doFile returns the sitemap <url> entry for the html file at path, or nil
// when the first 200 bytes of the file contain the marker "page_moved_64598".
//
// path2Url is a pair: the first occurrence of path2Url[0] in path is replaced
// by path2Url[1] to form the URL. The URL is entity-escaped before it is
// placed inside <loc>, so characters such as "&" cannot produce invalid XML.
func doFile(path string, path2Url []string) (output []byte) {
	var firstLinish = getHeadBytes(path, 200)
	// The marker is a plain literal, so a substring test is sufficient.
	if strings.Contains(string(firstLinish), "page_moved_64598") {
		return nil
	}
	var url = strings.Replace(path, path2Url[0], path2Url[1], 1)
	output = append(output, "<url><loc>"+locEscaper.Replace(url)+"</loc></url>\n"...)
	return output
}

// writeIt writes contentX to the file at pathX, creating the file or
// truncating an existing one.
// It panics if the file cannot be created, written, or closed. The close
// error is checked because a failed close can mean the data never reached
// the disk.
func writeIt(contentX []byte, pathX string) {
	fileH, err := os.Create(pathX)
	if err != nil {
		panic(err)
	}
	if _, errW := fileH.Write(contentX); errW != nil {
		// Release the handle before panicking; the write error is the one
		// worth reporting, so the close result is deliberately ignored here.
		_ = fileH.Close()
		panic(errW)
	}
	if errC := fileH.Close(); errC != nil {
		panic(errC)
	}
}

func sitemap(dirX string, path2Url []string) []byte {
	var output = []byte(`<?xml version="1.0" encoding="UTF-8"?>` + "\n")
	output = append(output, (`<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">` + "\n")...)
	var pWalker = func(pathX string, infoX os.FileInfo, errX error) error {
		if errX != nil {
			panic(fmt.Sprintf("error 「%v」 at a path 「%q」\n", errX, pathX))
		}
		if infoX.IsDir() {
			if equalAny(filepath.Base(pathX), dirsToSkip) || matchAny(filepath.Base(pathX), dirRegexToSkip) {
				return filepath.SkipDir
			}
		} else {
			var fname = filepath.Base(pathX)
			var goodExtension, err = regexp.MatchString(fnameRegex, fname)
			if err != nil {
				panic("stupid golang MatchString error")
			}
			if goodExtension && !matchAny(fname, fnameRegexToSkip) {
				output = append(output, doFile(pathX, path2Url)...)
			}
		}
		return nil
	}
	err := filepath.Walk(dirX, pWalker)
	if err != nil {
		fmt.Printf("error walking the path %q: %v\n", dirX, err)
	}
	output = append(output, (`</urlset>` + "\n")...)
	return output
}

// main builds a sitemap for every directory in dirsToProcess, saves it as
// destFname inside that directory, and prints the path of each saved file
// followed by a final "Done" line.
func main() {
	for _, rootDir := range dirsToProcess {
		urlPair := getMatched(rootDir, dirPathToUrl)
		saveToPath := filepath.Join(rootDir, destFname)
		writeIt(sitemap(rootDir, urlPair), saveToPath)
		fmt.Printf("file saved to: %v\n", saveToPath)
	}
	fmt.Println("Done")
}

See also: Elisp: Create Sitemap

If you have a question, put $5 at patreon and message me.

Golang

  1. Compile, Run
  2. Source Encoding
  3. Package, Import
  4. Comment
  5. Print
  6. String
  7. String Functions
  8. Print String
  9. String Backslash Escape
  10. Rune
  11. Variable
  12. Zero Value
  13. Constant
  14. If Then Else
  15. Switch/Case
  16. Loop
  17. Basic Types
  18. Array
  19. Slice
  20. Map
  21. Struct
  22. Function
  23. Closure
  24. Pointer
  25. String, Byte Slice, Rune Slice
  26. regexp
  27. Read File
  28. Write to File
  29. Walk Dir
  30. Check File Exist
  31. System Call
  32. Get Script Path
  33. Defer
  34. Random Number

Examples

  1. Match Any Regexp
  2. Find Replace
  3. Validate Links
  4. Generate Sitemap

Reference

  1. Go Spec