Python: Find/Replace by Regex Text Pattern

, , …,

Here's a Python script to do find/replace by regex, for all HTML files in a dir.

For example, suppose you are working in HTML and you want links of this form:

<a href="http://en.wikipedia.org/wiki/Artemis">Artemis</a>

to become:

<a href="http://en.wikipedia.org/wiki/Artemis">Artemis↗</a>

You need a text pattern to do the job, because the URL and link text are different for each link.

Here's the script, written for Python 2. (Always back up the directory before use, because the files are modified in place.)

# -*- coding: utf-8 -*-
# python

import os, sys, re

# Root directory of the walk; every ".html" file found under it is processed.
inputDir = "/home/xah/web/xahlee_info/xx-comp"

# Regex for an anchor element whose href starts with "http".
# Group 1 captures the URL; group 2 captures the link text (a run of
# characters that does not contain ">").
patternStr = ur'''<a href="(http[^"]+)">([^>]+)</a>'''
# Replacement: the same link with "↗" appended to the link text.
# NOTE: the replaced text still matches patternStr, so running the script a
# second time on the same files appends another "↗".
repStr = ur'''<a href="\1">\2↗</a>'''

def replaceStringInFile(filePath, pattern=None, replacement=None):
    """Apply one regex substitution to a UTF-8 file, rewriting it in place.

    The whole file is read and decoded as UTF-8, every match of ``pattern``
    is replaced by ``replacement`` (``re.sub`` semantics, so group
    back-references in the replacement work), and the result is written to a
    temporary sibling file (``filePath`` + "~~~") which is then renamed over
    the original.

    Args:
        filePath: path of the file to modify.
        pattern: regex (string or compiled pattern) to search for.
            Defaults to the module-level ``patternStr``.
        replacement: replacement template.  Defaults to the module-level
            ``repStr``.

    Raises:
        IOError or OSError: if the file cannot be read, written or renamed.
        UnicodeDecodeError: if the file content is not valid UTF-8.
    """
    if pattern is None:
        pattern = patternStr
    if replacement is None:
        replacement = repStr

    tempName = filePath + '~~~'

    # Binary mode keeps the bytes (including line endings) exactly as stored;
    # decoding/encoding is done explicitly at the I/O boundary.
    with open(filePath, 'rb') as inputFile:
        fContent = inputFile.read().decode("utf-8")

    outText = re.sub(pattern, replacement, fContent)

    try:
        with open(tempName, 'wb') as outputFile:
            outputFile.write(outText.encode("utf-8"))
        os.rename(tempName, filePath)
    except Exception:
        # Do not leave a stray temp file next to the original on failure.
        if os.path.exists(tempName):
            os.remove(tempName)
        raise

    print("processed {}".format(filePath))

def fileFilter(dummyArg, thisDir, dirChildrenList):
    """Directory-walk callback: process every regular ".html" file in thisDir.

    dummyArg is unused.  thisDir is the directory being visited and
    dirChildrenList holds the names of its entries.  Each entry whose
    extension is exactly ".html" and that is a regular file is handed to
    replaceStringInFile.
    """
    for childName in dirChildrenList:
        extension = os.path.splitext(childName)[1]
        if extension != '.html':
            continue
        childPath = thisDir + '/' + childName
        if not os.path.isfile(childPath):
            continue
        replaceStringInFile(childPath)

# os.path.walk exists only in Python 2; os.walk performs the same recursive
# traversal and is available in both Python 2 and 3.  Sub-directory names are
# passed along with file names so the callback still receives every child of
# the directory, as it did under os.path.walk.
for dirPath, dirNames, fileNames in os.walk(inputDir):
    fileFilter(None, dirPath, dirNames + fileNames)

Multiple Pairs of Find Replace Regex

The following version applies multiple find/replace pairs, each using a regex text pattern, to all HTML files in a dir. The files are assumed to be UTF-8 encoded. (It works fine if the files are ASCII, because ASCII is a subset of UTF-8.)

# -*- coding: utf-8 -*-
# Python

# Change all HTML files in a dir (recursively),
# using multiple regex/replace pairs.

import os, sys, shutil, re

# Root directory of the walk; every "*.html" file found under it is processed.
input_dir = "c:/Users/h3/web/ergoemacs_org"

# List of (compiled regex, replacement string) pairs.  The pairs are applied
# in list order, each one to the text produced by the previous pair.
find_replace_list = [

# Replace everything from "<header>" through "</header>" (DOTALL lets "."
# span line breaks) with a placeholder token.
(re.compile(ur"""<header>.+</header>""", re.U|re.M|re.DOTALL), ur"""•8017015673"""),

# Replace the placeholder token with the new header markup.
(re.compile(ur"""•8017015673""", re.U|re.M|re.DOTALL), ur"""<header>∑ something</header>"""),


# An <img> tag (src, alt, width, height) followed by a line break, one space
# and "<figcaption>": rewrite the tag's closing ">" as " />".
(re.compile(ur"""<img src="([^"]+?)" alt="([^"]+?)" width="([0-9]+)" height="([0-9]+)">
 <figcaption>""", re.U|re.M),
 ur"""<img src="\1" alt="\2" width="\3" height="\4" />
 <figcaption>"""),

 # In title="<digits>x<digits>", replace the letter "x" with "×".
 (re.compile(ur"""title="(\d+)x(\d+)">❐</a>""",re.U|re.M),
 ur"""title="\1×\2">❐</a>"""),
]

def replace_string_in_file(file_path):
   "Replaces all strings by regex in find_replace_list at file_path."
   backup_fname = file_path + "~re~"

   # print "reading:", file_path
   input_file = open(file_path, "rb")
   file_content = unicode(input_file.read(), "utf-8")
   input_file.close()

   num_replaced = 0
   for a_pair in find_replace_list:
      tem_tuple = re.subn(a_pair[0], a_pair[1], file_content)
      output_text = tem_tuple[0]
      num_replaced += tem_tuple[1]
      file_content = output_text

   if (num_replaced > 0):
      print ("◆ %d %s" % (num_replaced, file_path.replace("/cygdrive/c/Users/h3", "~")) )

      shutil.copy2(file_path, backup_fname)
      output_file = open(file_path, "r+b")
      output_file.read() # we do this way to preserve file creation date
      output_file.seek(0)
      output_file.write(output_text.encode("utf-8"))
      output_file.truncate()
      output_file.close()

def file_filter(dummy_arg, current_dir, file_list):
    """Directory-walk callback: process every regular "*.html" file.

    dummy_arg is unused.  current_dir is the directory being visited and
    file_list holds the names of its entries.  Each entry whose name matches
    the ".html" suffix regex and that is a regular file is passed to
    replace_string_in_file.
    """
    for entry_name in file_list:
        if not re.search(r".+\.html$", entry_name, re.U):
            continue
        entry_path = current_dir + "/" + entry_name
        if os.path.isfile(entry_path):
            replace_string_in_file(entry_path)

# os.path.walk exists only in Python 2; os.walk performs the same recursive
# traversal and is available in both Python 2 and 3.  Sub-directory names are
# passed along with file names so the callback still receives every child of
# the directory, as it did under os.path.walk.
for dir_path, dir_names, file_names in os.walk(input_dir):
    file_filter(None, dir_path, dir_names + file_names)

# Parenthesised single-argument form prints the same text on Python 2 and 3.
print("Done.")

See also: Perl: Find/Replace on Multiple Files.

blog comments powered by Disqus