Python: Count Word Frequency
Here's a script that computes the frequency of words in a file.
The input file is typically a novel, fiction, essay, etc.
# Count the occurrences of words in a file.
# Prints words with their counts, most frequent first.
# To use, save this file as word_freq.py, edit the line filePath,
# then run “python word_freq.py”.
# 2006-10-27 http://xahlee.info/python/python_word_frequency.html

import operator
import re
from collections import Counter

filePath = "/Users/xah/web/xahlee_info/python/python_word_frequency.html"


def count_words(text):
    r"""Return a Counter mapping each lowercased word in *text* to its count.

    A word is a maximal run of regex word characters (``\w+``). Unlike
    ``re.split(r'\W', ...)``, ``re.findall`` does not emit an empty string
    between adjacent non-word characters, so no count is recorded (or later
    printed) for the empty string.
    """
    return Counter(re.findall(r"\w+", text.lower()))


def main():
    """Read filePath as UTF-8 and print one "count → word" line per word."""
    # The context manager closes the file even if read() raises.
    with open(filePath, "r", encoding="utf-8") as inF:
        freqlist = count_words(inF.read())

    # Keys are words, values are occurrence counts. sorted() is stable, so
    # words with equal counts stay in order of first appearance.
    by_count = sorted(
        freqlist.items(), key=operator.itemgetter(1), reverse=True
    )
    for k, v in by_count:
        print(str(v) + " → " + k)


if __name__ == "__main__":
    main()
Output:
- 1575 →
- 152 → a
- 135 → li
- 76 → html
- 72 → href
- 24 → script
- 22 → class
Here's the Python 2 version:
# -*- coding: utf-8 -*-
# Python 2

# Count the occurrences of words in a file.
# Prints words with their counts, most frequent first.
# To use, save this file as word_freq.py, edit the line filePath,
# then run “python word_freq.py”.
# 2006-10-27 http://xahlee.info/python/python_word_frequency.html

import operator
import re

filePath = "/Users/xah/web/xahlee_info/python/python_word_frequency.html"


def count_words(text):
    r"""Return a dict mapping each lowercased word in *text* to its count.

    *text* is a unicode string. A word is a maximal run of word characters
    (``\w+``). re.UNICODE is passed because in Python 2 ``\w`` is ASCII-only
    by default, which would break non-ASCII words apart. Unlike
    ``re.split(r'\W', ...)``, ``re.findall`` does not emit empty strings,
    so no count is recorded for the empty string.
    """
    freqlist = {}
    for wd in re.findall(r'\w+', text.lower(), re.UNICODE):
        freqlist[wd] = freqlist.get(wd, 0) + 1
    return freqlist


def main():
    """Read filePath as UTF-8 and print one "count → word" line per word."""
    inF = open(filePath, 'rb')
    try:
        s = unicode(inF.read(), 'utf-8')
    finally:
        # Close the file even if reading or decoding fails.
        inF.close()

    # Keys are words, values are occurrence counts.
    freqlist = count_words(s)

    for k, v in sorted(freqlist.items(), key=operator.itemgetter(1),
                       reverse=True):
        line = u"%d → %s" % (v, k)
        # Encode the whole line once. Mixing an encoded word into a unicode
        # expression would implicitly decode it as ASCII again, and printing
        # a unicode object fails when stdout has no encoding (e.g. piped).
        print(line.encode('utf-8'))


if __name__ == "__main__":
    main()