Python: Find Replace Strings in Unicode Files

By Xah Lee. Date: . Last updated: .

This page shows how to do find/replace on Unicode-encoded (UTF-8) text files.

# -*- coding: utf-8 -*-
# python

# Find and replace many pairs of strings in a UTF-8 file.
# The result is written to a new file (input path + ".new"); the input file
# itself is not modified.

filePath = "/Users/t/myfile.txt"

outPath = filePath + ".new"

# Each pair is (text to find, replacement text). Pairs are applied in order,
# so a later pair also sees text produced by an earlier one.
findReplacePairs = [
    (u"西游记", u"西游记 Monkey King"),

    # more pairs here
]

# Read raw bytes and decode explicitly: this works on Python 2 and 3 and
# leaves line endings untouched.
with open(filePath, "rb") as inF:
    fContent = inF.read().decode("utf-8")

for findStr, repStr in findReplacePairs:
    fContent = fContent.replace(findStr, repStr)

# Write fContent rather than a variable bound only inside the loop, so an
# empty findReplacePairs list still produces a copy instead of a NameError.
with open(outPath, "wb") as outF:
    outF.write(fContent.encode("utf-8"))

All Files in a Dir

Here's a script that does multi-pair find/replace on all HTML files in a directory and its subdirectories, assuming the encoding is UTF-8. Each original file is kept as a backup with “~~” appended to its name.

# -*- coding: utf-8 -*-
# python

# find/replace multiple pairs of string, in all .html files in a dir

import os,sys,shutil

# Root directory; .html files in it and in its subdirectories are processed.
mydir = '/Users/t/web'

# (find, replacement) pairs, applied in order to each file's decoded text.
# Unicode literals are used so that non-ASCII pairs also work on Python 2,
# where mixing byte strings with decoded text raises UnicodeDecodeError.
findreplace = [
    (u'find1', u'replace1'),
    (u'find2', u'replace2'),
]

def replaceStringInFile(filePath, pairs=None):
    """Apply find/replace pairs to the UTF-8 file at filePath, in place.

    The file is read as UTF-8, every (find, replacement) pair is applied in
    order, and the result is written to a temporary file (filePath + '~x~').
    The original is then copied to a backup (filePath + '~~') and replaced
    by the temporary file.

    Args:
        filePath: path of the file to rewrite.
        pairs: sequence of (find, replacement) text pairs. When None, the
            module-level ``findreplace`` list is used.

    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8. This happens
            before anything is written, so the file is left untouched.
        IOError/OSError: if reading, writing, copying or renaming fails.
    """
    if pairs is None:
        pairs = findreplace

    print(filePath)
    tempName = filePath + '~x~'
    backupName = filePath + '~~'

    # Binary read + explicit decode works on Python 2 and 3 and preserves
    # the file's line endings.
    with open(filePath, 'rb') as inF:
        s = inF.read().decode('utf-8')

    for findStr, repStr in pairs:
        s = s.replace(findStr, repStr)

    # Write s itself so an empty pair list rewrites the file unchanged
    # instead of failing on an unbound loop variable.
    with open(tempName, 'wb') as outF:
        outF.write(s.encode('utf-8'))

    shutil.copy2(filePath, backupName)
    # Remove first: os.rename does not overwrite an existing file on Windows.
    os.remove(filePath)
    os.rename(tempName, filePath)

def myfun(dummy, thisDir, dirChildrenList):
    """Run replaceStringInFile on every regular '.html' file in one directory.

    Args:
        dummy: unused; present so the function can serve as a directory-walk
            callback that receives an extra argument.
        thisDir: path of the directory being visited.
        dirChildrenList: names (not full paths) of the entries in thisDir.
    """
    for child in dirChildrenList:
        childPath = os.path.join(thisDir, child)
        # isfile() skips directories that happen to be named '*.html'.
        if os.path.splitext(child)[1] == '.html' and os.path.isfile(childPath):
            replaceStringInFile(childPath)
            print(child)

# Visit mydir and all of its subdirectories. os.path.walk exists only in
# Python 2 (it was removed in Python 3); os.walk is available in both.
# Directory and file names are passed together, matching what os.path.walk
# handed to its callback.
for thisDir, subDirNames, fileNames in os.walk(mydir):
    myfun('dummy', thisDir, subDirNames + fileNames)