Perl: Validate Local Links

By Xah Lee. Date: .

Here's a script that validates all local links in all html files in a directory.

# -*- coding: utf-8 -*-
# perl
# 2004-09-21, 2018-08-30

# given a dir, check all local links and inline images in the html files there. Print a report.

# http://xahlee.info/perl/perl_validate_local_links.html

use strict;
use Data::Dumper;
use File::Find;
use File::Basename;

my $webRootPath = qq[/Users/xah/web];
my $inDirPath = qq[/Users/xah/web/xahlee_info/comp/];

$inDirPath = ($ARGV[0] ? $ARGV[0] : $inDirPath) ; # should give a full path; else the $File::Find::dir won't give full path.

die qq{dir $inDirPath doesn't exist! $!} unless -e $inDirPath;

##################################################
# subroutines

# get_links($file_full_path) returns a list of values in <a href="…">. Sample elements:
# http://xahlee.org
# ../image.png
# ab/some.html
# file.nb
# mailto:xah@xahlee.org
# #reference
# javascript:f('pop_me.html')

sub get_links ($) {
  my $full_file_path = $_[0];
  my @myLinks = ();
  open (FF, "<$full_file_path") or die qq[error: cannot open $full_file_path $!];

  # read each line. Split on char “<”. Then use regex on 「href=…」 or 「src=…」 to get the url. This assumes that a tag 「<…>」 is not broken into more than one line.
  while (my $fileContent = <FF>) {
    my @textSegments = ();
    @textSegments = split(m/</, $fileContent);
    for my $oneLine (@textSegments) {

        if ($oneLine !~ m{ -->$}i) { # the line isn't html comment
            if ($oneLine =~ m{href\s*=\s*"([^"]+)".*>}i) { push @myLinks, $1; }
            if ($oneLine =~ m{src\s*=\s*\"([^"]+)".*>}i) { push @myLinks, $1; }
        }

    } }
  close FF;
  return @myLinks;
}

sub process_file {
if (
      $File::Find::name =~ m[\.html$|\.xml$]

      && $File::Find::dir !~ m(ergoemacs_org/emacs_manual)
      && $File::Find::dir !~ m(wordyenglish_com/arabian_nights/xx_full_2017-05-13)
      && $File::Find::dir !~ m(xahlee_info/REC-SVG11-20110816)
      && $File::Find::dir !~ m(xahlee_info/clojure-doc-1.8)
      && $File::Find::dir !~ m(xahlee_info/css_2.1_spec)
      && $File::Find::dir !~ m(xahlee_info/css_transitions)
      && $File::Find::dir !~ m(xahlee_info/javascript_ecma-262_5.1_2011)
      && $File::Find::dir !~ m(xahlee_info/javascript_ecma-262_6_2015)
      && $File::Find::dir !~ m(xahlee_info/javascript_es2016)
      && $File::Find::dir !~ m(xahlee_info/javascript_es6)
      && $File::Find::dir !~ m(xahlee_info/jquery_doc)
      && $File::Find::dir !~ m(xahlee_info/node_api)
      && $File::Find::dir !~ m(xahlee_info/ocaml_doc)
      && $File::Find::dir !~ m(xahlee_info/python_doc_2.7.6)
      && $File::Find::dir !~ m(xahlee_info/python_doc_3.3.3)
      && $File::Find::dir !~ m(xahlee_info/w3c_ui_events)
      && $File::Find::dir !~ m(xahlee_info/godoc)
      && $File::Find::name !~ m(xahlee_org/wikipedia_links.html)

     ) {
    my @myLinks = get_links($File::Find::name);

    map {
      my $orig_link_value = $_;
      my $pathToCheck = $_;

      # report local links that goes outside domain
      if (
          $pathToCheck =~ m{/ergoemacs_org/}
          or $pathToCheck =~ m{/wordyenglish_com/}
          or $pathToCheck =~ m{/xaharts_org/}
          or $pathToCheck =~ m{/xahlee_info/}
          or $pathToCheck =~ m{/xahlee_org/}
          or $pathToCheck =~ m{/xahmusic_org/}
          or $pathToCheck =~ m{/xahsl_org/}
          ) {
          print qq[〈$File::Find::name〉 「$orig_link_value」];
      }

      # change xah inter domain links to file path
      $pathToCheck =~ s{^http://ergoemacs\.org/}{$webRootPath/ergoemacs_org/};
      $pathToCheck =~ s{^http://wordyenglish\.com/}{$webRootPath/wordyenglish_com/};
      $pathToCheck =~ s{^http://xaharts\.org/}{$webRootPath/xaharts_org/};
      $pathToCheck =~ s{^http://xahlee\.info/}{$webRootPath/xahlee_info/};
      $pathToCheck =~ s{^http://xahlee\.org/}{$webRootPath/xahlee_org/};
      $pathToCheck =~ s{^http://xahmusic\.org/}{$webRootPath/xahmusic_org/};
      $pathToCheck =~ s{^http://xahsl\.org/}{$webRootPath/xahsl_org/};

      if ( $pathToCheck !~ m[^//|^http:|^https:|^mailto:|^irc:|^ftp:|^javascript:]) {
          $pathToCheck =~ s/#.*//; # delete url fragment identifier e.g. 「http://example.com/index.html#a」
          $pathToCheck =~ s/%20/ /g; # decode percent encode url
          $pathToCheck =~ s/%27/'/g;

          # change it to full path
          if ( $pathToCheck !~ m{$webRootPath} ) {
              $pathToCheck = qq[$File::Find::dir/$pathToCheck]; # relative path. prepend dir
          }

          if (not -e $pathToCheck) {
              print qq[〈$File::Find::name〉 「$orig_link_value」\n];
          }

      }
    }
    @myLinks;
  } }

my $mytime = localtime();

print qq[-*- coding: utf-8; mode: xah-find-output -*-\n];
print qq[$mytime\n];
print qq[Broken links in “$inDirPath”\n];
print qq[\n];

find(\&process_file, $inDirPath);

print "\nDone checking. (any errors are printed above.)\n";

__END__

If you have a question, put $5 at patreon and message me.

Perl

  1. Perl Overview
  2. Version String
  3. Help System

Detail

  1. Quoting String
  2. Format String
  3. String Operations
  4. True, False
  5. if then else
  6. Loop
  7. List / Array
  8. Loop Thru List
  9. Map f to List
  10. List Comprehension
  11. Hash Table
  12. Function Optional Param
  13. regex

Text Processing

  1. Unicode 🐪
  2. Convert File Encoding
  3. Read Write File
  4. Traverse Dir
  5. Find Replace
  6. Validate Local Links
  7. Split Line by Regex

Advanced

  1. Sort List, Matrix, Object
  2. Sort Matrix
  3. List Modules, Search Paths
  4. Write a Module
  5. Complex Numbers
  6. System Call
  7. gzip
  8. Get Env Var
  9. GET Web Content
  10. Sort Unstable