Perl: Validate Local Links

By Xah Lee. Date: . Last updated: .

Here's a script that validates all local links in all html files in a directory.

# -*- coding: utf-8 -*-

# perl
# 2004-09-21, 2018-08-30, 2024-05-18

use utf8;

# given a dir, check all local links and inline images in the html files there.
# Print dead links.

# http://xahlee.info/perl/perl_validate_local_links.html

use strict;
use Data::Dumper;
use File::Find;
use File::Basename;

my $webRootPath = qq[c:/Users/xah/web/];
my $inDirPath = qq[c:/Users/xah/web/xahlee_info/];

$inDirPath = ($ARGV[0] ? $ARGV[0] : $inDirPath) ; # should give a full path; else the $File::Find::dir won't give full path.

die qq{dir $inDirPath doesn't exist! $!} unless -e $inDirPath;

##################################################
# subroutines

# get_links($file_full_path) returns a list of values in <a href="...">. Sample elements:
# http://xahlee.org
# ../image.png
# ab/some.html
# file.nb
# mailto:xah@xahlee.org
# #reference
# javascript:f('pop_me.html')

sub get_links ($) {
  my $full_file_path = $_[0];
  my @myLinks = ();
  open (FF, "<$full_file_path") or die qq[error: cannot open $full_file_path $!];

  # read each line. Split on char <. Then use regex on href=... or src=... to get the url. This assumes that a tag <...> is not broken into more than one line.
  while (my $fileContent = <FF>) {
    my @textSegments = ();
    @textSegments = split(m/</, $fileContent);
    for my $oneLine (@textSegments) {

        if ($oneLine !~ m{ -->$}i) { # the line isn't html comment
            if ($oneLine =~ m{href\s*=\s*"([^"]+)".*>}i) { push @myLinks, $1; }
            if ($oneLine =~ m{src\s*=\s*\"([^"]+)".*>}i) { push @myLinks, $1; }
        }

    } }
  close FF;
  return @myLinks;
}

sub process_file {
if (
      $File::Find::name =~ m[\.html$|\.xml$]

      && $File::Find::dir !~ m(ergoemacs_org/emacs_manual)
      && $File::Find::dir !~ m(wordyenglish_com/arabian_nights/xx_full_2017-05-13)
      && $File::Find::dir !~ m(xahlee_info/REC-SVG11-20110816)
      && $File::Find::dir !~ m(xahlee_info/clojure-doc-1.8)
      && $File::Find::dir !~ m(xahlee_info/css_2.1_spec)
      && $File::Find::dir !~ m(xahlee_info/css_transitions)
      && $File::Find::dir !~ m(xahlee_info/javascript_ecma-262_5.1_2011)
      && $File::Find::dir !~ m(xahlee_info/javascript_ecma-262_6_2015)
      && $File::Find::dir !~ m(xahlee_info/javascript_es2016)
      && $File::Find::dir !~ m(xahlee_info/javascript_es6)
      && $File::Find::dir !~ m(xahlee_info/jquery_doc)
      && $File::Find::dir !~ m(xahlee_info/node_api)
      && $File::Find::dir !~ m(xahlee_info/ocaml_doc)
      && $File::Find::dir !~ m(xahlee_info/python_doc_2.7.6)
      && $File::Find::dir !~ m(xahlee_info/python_doc_3.3.3)
      && $File::Find::dir !~ m(xahlee_info/w3c_ui_events)
      && $File::Find::dir !~ m(xahlee_info/godoc)
      && $File::Find::name !~ m(xahlee_org/wikipedia_links.html)

     ) {
    my @myLinks = get_links($File::Find::name);

    map {
      my $orig_link_value = $_;
      my $pathToCheck = $_;

      # report local links that goes outside domain
      if (
          $pathToCheck =~ m{/ergoemacs_org/}
          or $pathToCheck =~ m{/wordyenglish_com/}
          or $pathToCheck =~ m{/xaharts_org/}
          or $pathToCheck =~ m{/xahlee_info/}
          or $pathToCheck =~ m{/xahlee_org/}
          or $pathToCheck =~ m{/xahmusic_org/}
          or $pathToCheck =~ m{/xahporn_org/}
          or $pathToCheck =~ m{/xahsl_org/}
          ) {
          print qq[[$File::Find::name] [$orig_link_value]];
      }

      # change xah inter domain links to file path
      $pathToCheck =~ s{^http://xahlee\.info/}{$webRootPath/xahlee_info/};
      $pathToCheck =~ s{^http://xahlee\.org/}{$webRootPath/xahlee_org/};

      if ( $pathToCheck !~ m[^//|^http:|^https:|^mailto:|^irc:|^ftp:|^javascript:]) {
          $pathToCheck =~ s/#.*//; # delete url fragment identifier e.g. ❮http://example.com/index.html#a❯
          $pathToCheck =~ s/%20/ /g; # decode percent encode url
          $pathToCheck =~ s/%27/'/g;

          # change it to full path
          if ( $pathToCheck !~ m{$webRootPath} ) {
              $pathToCheck = qq[$File::Find::dir/$pathToCheck]; # relative path. prepend dir
          }

          if (not -e $pathToCheck) {
              print qq[$File::Find::name $orig_link_value\n];
          }

      }
    }
    @myLinks;
  } }

my $mytime = localtime();

print qq[-*- coding: utf-8; mode: xah-find-output -*-\n];
print qq[$mytime\n];
print qq[Broken links in $inDirPath\n];
print qq[\n];

find(\&process_file, $inDirPath);

print "\nDone checking. (any errors are printed above.)\n";

__END__

Validate HTML File Local Links