Here's a Python script to do find/replace by regex, for all HTML files in a dir.
For example, suppose you are working in HTML and you want links of this form:
<a href="http://en.wikipedia.org/wiki/Artemis">Artemis</a>
to become:
<a href="http://en.wikipedia.org/wiki/Artemis">Artemis↗</a>
You need a text pattern to do the job, because the URL and link text are different for each link.
Here's the script. (Always back up the directory before use.)
# -*- coding: utf-8 -*-
# python
# Regex find/replace on every .html file under inputDir (recursive).
# Runs unchanged on Python 2.7 and Python 3.

import os
import re
import sys

# Root of the directory tree to process.
inputDir = "/home/xah/web/xahlee_info/xx-comp"

# Group 1 captures the URL, group 2 the link text.
patternStr = r'''<a href="(http[^"]+)">([^>]+)</a>'''
# A u'' literal (Python 3 rejects the ur'' prefix), hence the doubled
# backslashes in front of the group references.
repStr = u'''<a href="\\1">\\2↗</a>'''


def replaceStringInFile(filePath):
    """Rewrite the file at filePath with every patternStr match replaced by repStr.

    The file is decoded and re-encoded as UTF-8.  The new content is first
    written to a sibling temp file (filePath + '~~~') that then takes the
    place of the original, so a failure while writing never leaves the
    original half-written.  Prints "processed <path>" when done.

    Raises IOError/OSError when the file cannot be read or written, and
    UnicodeDecodeError when its content is not valid UTF-8.
    """
    tempName = filePath + '~~~'

    # Binary mode: decoding is explicit and line endings pass through untouched.
    with open(filePath, 'rb') as inputFile:
        fContent = inputFile.read().decode("utf-8")

    outText = re.sub(patternStr, repStr, fContent)

    with open(tempName, 'wb') as outputFile:
        outputFile.write(outText.encode("utf-8"))

    # os.replace (Python 3.3+) overwrites an existing target on every
    # platform; plain os.rename refuses to do so on Windows.
    moveOver = getattr(os, "replace", os.rename)
    moveOver(tempName, filePath)
    print("processed {}".format(filePath))


def fileFilter(dummyArg, thisDir, dirChildrenList):
    """Run replaceStringInFile on each regular '.html' file named in dirChildrenList.

    dummyArg is ignored; it is kept so the function retains its original
    three-argument callback signature.  thisDir is the directory that the
    names in dirChildrenList live in.
    """
    for thisChild in dirChildrenList:
        childPath = os.path.join(thisDir, thisChild)
        if os.path.splitext(thisChild)[1] == '.html' and os.path.isfile(childPath):
            replaceStringInFile(childPath)


# os.path.walk no longer exists in Python 3; os.walk visits the same tree.
for _dirPath, _subDirs, _fileNames in os.walk(inputDir):
    fileFilter(None, _dirPath, _fileNames)
The following version does multiple find/replace pairs by text pattern, on all HTML files in a dir. The files are assumed to be UTF-8 encoded. (It works fine if the files are ASCII, because ASCII is a subset of UTF-8.)
# -*- coding: utf-8 -*-
# Python
# change all files in a dir.
# using multiple regex/replace pairs
# Runs unchanged on Python 2.7 and Python 3.

import os
import re
import shutil
import sys

# Root of the directory tree to process.
input_dir = "c:/Users/h3/web/ergoemacs_org"

# (compiled pattern, replacement) pairs, applied to each file in this order.
# Literals that need both backslashes and non-ASCII text are u'' strings with
# doubled backslashes, because Python 3 rejects the ur'' prefix.
# NOTE(review): the whitespace inside these literals is reproduced exactly as
# received; if the original literals contained line breaks, restore them.
find_replace_list = [
    # Step 1 of 2: swap everything from <header> to </header> for a placeholder token.
    (re.compile(r"""<header>.+</header>""", re.U | re.M | re.DOTALL),
     u"""•8017015673"""),

    # Step 2 of 2: expand the placeholder token into the new header markup.
    (re.compile(u"""•8017015673""", re.U | re.M | re.DOTALL),
     u'<header> ∑ <a href="http://ergoemacs.org/index.html">ErgoEmacs</a>'
     u' ◆ <span id="e1α"><a href="http://ergoemacs.org/emacs/emacs.html">Emacs</a>'
     u' ◇ <a href="http://ergoemacs.org/emacs/elisp.html">Lisp</a>'
     u' ◆ <a href="http://ergoemacs.org/emacs/blog.html">Blog</a>'
     u' ◆ <span id="e2α"><a href="http://ergoemacs.org/emacs_manual/emacs/index.html">Emacs</a>'
     u' ◇ <a href="http://ergoemacs.org/emacs_manual/elisp/index.html">Lisp</a>'
     u' ◆ <a href="http://ergoemacs.org/emacs/buy_xah_emacs_tutorial.html">Buy Tutorial</a>'
     u' <form action="http://www.google.com" id="cse-search-box">'
     u' <div>'
     u' <input type="hidden" name="cx" value="partner-pub-5125343095650532:8381157956" />'
     u' <input type="hidden" name="ie" value="UTF-8" />'
     u' <input type="text" name="q" size="55" />'
     u' <input type="submit" name="sa" value="Search" />'
     u' </div>'
     u' </form><script src="http://www.google.com/coop/cse/brand?form=cse-search-box&lang=en"></script>'
     u' </header>'),

    # Self-close <img ...> tags that are followed by a <figcaption>.
    (re.compile(r'<img src="([^"]+?)" alt="([^"]+?)" width="([0-9]+)" height="([0-9]+)"> <figcaption>',
                re.U | re.M),
     r'<img src="\1" alt="\2" width="\3" height="\4" /> <figcaption>'),

    # title="640x480" -> title="640×480" (letter x becomes the multiplication sign).
    (re.compile(u'title="(\\d+)x(\\d+)">❐</a>', re.U | re.M),
     u'title="\\1×\\2">❐</a>'),
]


def replace_string_in_file(file_path):
    """Apply every pair in find_replace_list to the file at file_path.

    The file is read as UTF-8 and the pairs are applied in list order, each
    one to the output of the previous one.  When at least one replacement
    was made, the total count and the path are printed, the original is
    copied to file_path + "~re~", and the file is overwritten in place.
    Files without any match are left untouched and get no backup.

    Raises IOError/OSError when the file cannot be read or written, and
    UnicodeDecodeError when its content is not valid UTF-8.
    """
    backup_fname = file_path + "~re~"

    with open(file_path, "rb") as input_file:
        file_content = input_file.read().decode("utf-8")

    num_replaced = 0
    for pattern, replacement in find_replace_list:
        file_content, count = pattern.subn(replacement, file_content)
        num_replaced += count

    if num_replaced > 0:
        print("◆ %d %s" % (num_replaced, file_path.replace("/cygdrive/c/Users/h3", "~")))
        shutil.copy2(file_path, backup_fname)
        # Overwrite the existing file in place ("r+b") instead of creating a
        # new one; the original author does this to preserve the file
        # creation date.  truncate() drops leftover bytes when the new
        # content is shorter than the old.
        with open(file_path, "r+b") as output_file:
            output_file.write(file_content.encode("utf-8"))
            output_file.truncate()


def file_filter(dummy_arg, current_dir, file_list):
    """Run replace_string_in_file on each regular *.html file named in file_list.

    dummy_arg is ignored; it is kept so the function retains its original
    three-argument callback signature.  current_dir is the directory that
    the names in file_list live in.
    """
    for child in file_list:
        child_path = os.path.join(current_dir, child)
        if re.search(r".+\.html$", child, re.U) and os.path.isfile(child_path):
            replace_string_in_file(child_path)


# os.path.walk no longer exists in Python 3; os.walk visits the same tree.
for _dir_path, _sub_dirs, _file_names in os.walk(input_dir):
    file_filter(None, _dir_path, _file_names)

print("Done.")
See also: Perl: Find/Replace on Multiple Files.