Python: Convert File Encoding for All Files in a Dir

,

Here's a Python program that converts the character encoding of all files in a directory tree (as written, every ".html" file, from GB18030 to UTF-16).

# -*- coding: utf-8 -*-
# python

# a script to convert file encoding

import os

# Root directory to process; it is walked recursively and every ".html" file
# found under it is re-encoded in place.
inputDir = '/Users/t/web/p/monkey_king'

def changeEncoding(filePath):
    '''Re-encode the file at filePath in place, from gb18030 to utf-16.

    The converted text is written to a sibling temporary file
    (filePath + '~-~'), which is then renamed over the original. The
    original file is therefore left untouched when reading, decoding or
    encoding fails.

    filePath -- full path to a gb18030-encoded file.

    Raises UnicodeDecodeError if the content is not valid gb18030, and
    IOError/OSError if the file cannot be read, written or renamed.
    '''
    print(filePath)

    tempName = filePath + '~-~'

    # "with" closes the handle even when read() raises.
    with open(filePath, 'rb') as inputFile:
        rawBytes = inputFile.read()

    # bytes.decode() behaves identically on Python 2 and 3, unlike the
    # Python 2-only unicode() builtin. Encode before touching the disk so a
    # conversion error cannot leave a temporary file behind.
    encoded = rawBytes.decode('gb18030').encode('utf-16')

    try:
        with open(tempName, 'wb') as outputFile:
            outputFile.write(encoded)
        # NOTE: os.rename replaces an existing destination on POSIX; on
        # Windows it raises instead, in which case the original is kept.
        os.rename(tempName, filePath)
    except (IOError, OSError):
        # Do not leave a half-written temporary file next to the original.
        if os.path.exists(tempName):
            os.remove(tempName)
        raise


def fileFilter(dummyArg, thisDir, dirChildrenList):
    '''Convert every regular file named "*.html" directly inside thisDir.

    The (arg, dirname, names) parameter layout is that of an os.path.walk
    visit callback.

    dummyArg -- unused; present only to satisfy the callback signature.
    thisDir -- path of the directory being visited.
    dirChildrenList -- names (not full paths) of the entries in thisDir.
    '''
    for child in dirChildrenList:
        # The extension match is exact and case-sensitive.
        if os.path.splitext(child)[1] != '.html':
            continue
        # Build the path once, with the platform's separator.
        childPath = os.path.join(thisDir, child)
        # Skip directories (and vanished entries) that merely end in ".html".
        if os.path.isfile(childPath):
            changeEncoding(childPath)
# os.walk exists on both Python 2 and 3, whereas os.path.walk was removed in
# Python 3. Only the file names are handed to fileFilter; it skips anything
# that is not a regular file anyway, so directories need not be passed.
for dirPath, _dirNames, fileNames in os.walk(inputDir):
    fileFilter(None, dirPath, fileNames)

For sample Chinese text files in Unicode that you can use to experiment with encodings, see: 西游记 (Journey To The West).

blog comments powered by Disqus