Python, Perl: Script to Check HTML File Size
The following script takes a directory and prints the size of every HTML file in it, counting the sizes of the file's inline images.
This script is useful for making sure that HTML files stay under a certain size. This matters because visitors with a slow connection may take a long time to load HTML files that have lots of inline images.
# Python 3
# given a dir, report all html file's size. (counting inline images)
# author XahLee.org
# url http://xahlee.info/python/check_html_size.html
# version 2005-10-05 2023-04-13

import os
import re
import sys

input_path = "c:/Users/xah/web/xahlee_info/python/"
file_ext = ".html"
# NOTE: sizelimit is defined but not consulted by the report below;
# every matching file is listed regardless of its size.
sizelimit = 800 * 1000

# get rid of trailing slashes; a path made only of slashes stays "/"
input_path = input_path.rstrip("/") or input_path[:1]

# Matches an <img ...> tag and captures its quoted src value.
# Attributes may precede src; "\s" before src keeps "data-src" from matching.
_IMG_SRC_RE = re.compile(r"""<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]+)"|'([^']+)')""")

# HHHH---------------------------------------------------
# subroutines


def getInlineImg(file_full_path):
    """Return the src values of the <img> tags in an HTML file.

    The file is read as UTF-8; undecodable bytes are replaced instead of
    raising, so one badly encoded page cannot abort a whole scan.
    Only non-empty src values in double or single quotes are recognized.

    e.g. it may return ['xx.jpg', '../image.png']
    """
    with open(file_full_path, "rb") as xfile:
        xtext = xfile.read().decode("utf-8", errors="replace")
    # exactly one of the two groups is set, depending on the quote style
    return [xm.group(1) or xm.group(2) for xm in _IMG_SRC_RE.finditer(xtext)]


def linkFullPath(dir, locallink):
    """Return the normalized path of locallink, taken relative to dir.

    e.g. linkFullPath('/Users/t/public_html/a/b', '../image/t.png')
    returns '/Users/t/public_html/a/image/t.png'.

    Repeated slashes are collapsed and 'name/..' pairs are resolved
    lexically by os.path.normpath (which on Windows also converts
    forward slashes to backslashes). A relative result that climbs above
    its starting point keeps its leading '../'. An empty dir means
    "relative to the current directory", not the filesystem root.
    """
    xjoined = dir + "/" + locallink if dir else locallink
    # collapse slash runs first so a leading "//" is not kept by normpath
    xjoined = re.sub(r"//+", "/", xjoined)
    return os.path.normpath(xjoined)


def fListInlineImg(htmlfile):
    """Return a list of paths, one per inline image of htmlfile.

    Each path is the image's src resolved against the directory that
    contains htmlfile.
    """
    xdir = os.path.dirname(htmlfile)
    return [linkFullPath(xdir, xpath) for xpath in getInlineImg(htmlfile)]


# HHHH---------------------------------------------------
# main

fileSizeList = []


def fAddSize(filepath):
    """Append [total size, filepath] to the module-level fileSizeList.

    The total is the size of the HTML file plus the size of every inline
    image that can be found on disk. Images whose size cannot be read
    (remote URLs, data: URIs, missing files) are reported on stderr and
    skipped, so they do not abort the scan.
    """
    xTotalSize = os.path.getsize(filepath)
    for xImgPath in fListInlineImg(filepath):
        try:
            xTotalSize += os.path.getsize(xImgPath)
        except OSError as xerr:
            print(
                "skipping image {} (in {}): {}".format(xImgPath, filepath, xerr),
                file=sys.stderr,
            )
    fileSizeList.append([xTotalSize, filepath])


def main():
    """Walk input_path and print [size, path] for each HTML file, largest first."""
    if not os.path.exists(input_path):
        print("dir " + input_path + " doesn't exist!")
        sys.exit(1)

    for xdir, _, fnames in os.walk(input_path):
        for fname in fnames:
            full_path = os.path.join(xdir, fname)
            if os.path.splitext(fname)[1] == file_ext and os.path.isfile(full_path):
                # print("doing: ", full_path)
                fAddSize(full_path)

    fileSizeList.sort(key=lambda x: x[0], reverse=True)

    for xx in fileSizeList:
        print(xx)


# Guard the scan so the helper functions can be imported without running it.
if __name__ == "__main__":
    main()
Perl
The following is a Perl version.
# -*- coding: utf-8 -*-
# perl
# 2005-10-04
# given a dir, report all HTML file's size. (counting inline images)
# XahLee.org

use strict;
use warnings;

use Data::Dumper;
use File::Find;
use File::Basename;

our $inpath = '/Users/t/web/';
while ($inpath =~ m{^(.+)/\z}) { $inpath = $1; } # get rid of trailing slash

# Collected [total size, file path] pairs, filled by checkLink.
our @fileSizeList = ();

##################################################
# subroutines

# getInlineImg($file_full_path) returns a array that is sources of inline
# images. e.g. it may return ('xx.jpg','../image.png')
# The whole file is searched at once, so an <img> tag may span several lines.
# Only non-empty src values in double or single quotes are recognized;
# attributes may come before src. Dies if the file cannot be opened.
sub getInlineImg ($) {
    my ($full_file_path) = @_;

    open(my $fh, '<', $full_file_path)
        or die "error: can not open $full_file_path $!";
    my $html = do { local $/; <$fh> };    # slurp the whole file
    close $fh;
    $html = '' unless defined $html;      # empty file reads as undef

    my @linx = ();
    # "\s" before src keeps attributes such as data-src from matching
    while ($html =~ m{<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]+)"|'([^']+)')}g) {
        push @linx, defined $1 ? $1 : $2;
    }
    return @linx;
}

# linkFullPath($dir,$locallink) returns a string that is the full path to the
# local link. e.g. linkFullPath('/Users/t/public_html/a/b', '../image/t.png')
# returns '/Users/t/public_html/a/image/t.png'.
# $dir may be given with or without a trailing slash. The returned result will
# not contain double slash, and every 'name/..' pair that follows a slash is
# removed. An empty $dir leaves $locallink relative.
sub linkFullPath ($$) {
    my ($dir, $locallink) = @_;

    my $result = $dir eq '' ? $locallink : $dir . '/' . $locallink;
    $result =~ s{/+}{/}g;
    # Remove "/name/.." only when "name" is a real component (not "..")
    # and ".." is a whole component (followed by "/" or end of string).
    while ($result =~ s{/(?!\.\.(?:/|\z))[^/]+/\.\.(?=/|\z)}{}) {}
    return $result;
}

# listInlineImg($html_file_full_path) returns a array where each element is a
# full path to inline images in the html.
sub listInlineImg ($) {
    my ($htmlfile) = @_;
    my ($name, $dir, $suffix) = fileparse($htmlfile, ('\.html'));
    return map { linkFullPath($dir, $_) } getInlineImg($htmlfile);
}

##################################################
# main

# File::Find callback: for each text file whose name ends in .html, push
# [total size, path] onto @fileSizeList. Images that are not plain files on
# disk (remote URLs, missing files) are reported on STDERR and not counted.
sub checkLink {
    my $file = $File::Find::name;
    return unless $file =~ m{\.html$} && -T $file;

    my $totalSize = (-s $file) || 0;
    for my $imgPath (listInlineImg($file)) {
        if (-f $imgPath) {
            $totalSize += (-s _) || 0;    # "_" reuses the stat done by -f
        }
        else {
            warn "skipping image $imgPath (in $file): not a file\n";
        }
    }
    push @fileSizeList, [$totalSize, $file];
    return;
}

# Scan $inpath and print the collected sizes, largest first.
sub main {
    die "dir $inpath doesn't exist! $!" unless -e $inpath;

    # no_chdir keeps $File::Find::name valid even when $inpath is relative
    find({ wanted => \&checkLink, no_chdir => 1 }, $inpath);

    @fileSizeList = sort { $b->[0] <=> $a->[0] } @fileSizeList;

    print Dumper(\@fileSizeList);
    print "done reporting.";
    return;
}

# Run the scan only when executed as a script, not when loaded with require.
main() unless caller;

1;
Note that some web browsers and web development tools can calculate the size of a web page. This script is still useful because it checks all the files in a directory at once, which is handy for static websites such as documentation.