import re
def addWord(token, frequencies):
    """Record one more occurrence of a word in ``frequencies``.

    Args:
        token: Iterable of string fragments (typically single characters)
            that are joined to form the word.
        frequencies: Dict mapping word -> count; it is updated in place.
    """
    word = ''.join(token)
    # dict.get supplies 0 for a word not seen before, so no separate
    # membership test (and second lookup) is needed.
    frequencies[word] = frequencies.get(word, 0) + 1
def getWordFrequencies(text):
    r"""Count how often each word occurs in ``text``.

    A word is a maximal run of characters matching the regex class ``\w``;
    every other character acts as a separator. Matching is case-sensitive.

    Args:
        text: The string to scan.

    Returns:
        A dict mapping each word to its number of occurrences. It is empty
        when ``text`` contains no word characters.
    """
    frequencies = {}
    # One findall pass yields the same maximal \w runs that testing every
    # character against the pattern and accumulating them would collect.
    # The raw string keeps the backslash from being read as a string escape.
    for word in re.findall(r'\w+', text):
        frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies
# Sample lyrics. Each line ends with an explicit "\n": a backslash-newline
# inside a string literal joins the lines with no separator at all, which
# fuses the last word of one line with the first word of the next
# (e.g. "againMamma").
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)
result = getWordFrequencies(mamaMia)
# items() exists on dicts in both Python 2 and 3; iteritems() is Python 2 only.
for word, freq in result.items():
    # Emits "<count> <TAB><word>", the same text the Python 2 statement
    # `print freq, "\t", word` produced; the single parenthesised argument
    # makes the line valid on Python 2 and 3 alike.
    print("%s \t%s" % (freq, word))
Refactorings
No refactoring yet !
Elij
December 8, 2007, December 08, 2007 08:58, permalink
The regex needs some work -- but this does the same as yours...
import re
def getWordFrequencies(text):
    r"""Count how often each word occurs in ``text``.

    The text is split on runs of non-word characters (regex ``\W+``).
    Matching is case-sensitive.

    Args:
        text: The string to scan.

    Returns:
        A dict mapping each word to its number of occurrences.
    """
    frequencies = {}
    for word in re.split(r'\W+', text):
        # re.split returns '' when the text is empty or starts/ends with a
        # separator; that is not a word and must not be counted.
        if not word:
            continue
        # dict.get replaces has_key(), which no longer exists in Python 3.
        frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies
# Sample lyrics. Each line ends with an explicit "\n": a backslash-newline
# inside a string literal joins the lines with no separator at all, which
# fuses the last word of one line with the first word of the next
# (e.g. "againMamma").
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)
result = getWordFrequencies(mamaMia)
# items() exists on dicts in both Python 2 and 3; iteritems() is Python 2 only.
for word, freq in result.items():
    # Emits "<count> <TAB><word>", the same text the Python 2 statement
    # `print freq, "\t", word` produced; the single parenthesised argument
    # makes the line valid on Python 2 and 3 alike.
    print("%s \t%s" % (freq, word))
lbolognini
December 8, 2007, December 08, 2007 14:02, permalink
adict.update() overwrites the values so there's no need to check if the w is already in the dict
# Sample lyrics. Each line ends with an explicit "\n": a backslash-newline
# inside a string literal joins the lines with no separator at all, which
# fuses the last word of one line with the first word of the next.
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)
# Tally whitespace-separated tokens (punctuation stays attached, e.g. "mia,").
# Counting tokens one by one is exact and linear; mamaMia.count(w) would
# rescan the whole text for every token and also count substring hits such
# as the "I" inside "I've".
adict = {}
for w in mamaMia.split():
    adict[w] = adict.get(w, 0) + 1
print(adict)
Netferret
February 25, 2008, February 25, 2008 15:40, permalink
This is the best solution I have found.
// s is the string in which to count occurrences of "YourWord".
// Splitting on the word yields one more piece than there are matches,
// hence the "- 1".
alert(s.split("YourWord").length - 1);
Tim
June 29, 2008, June 29, 2008 00:32, permalink
import re
from collections import defaultdict
text = """
Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you
""".strip()
# Case-insensitive word counts. findall(r"\w+") returns only the words, so
# no empty token can appear when the text starts or ends with punctuation
# (re.split would produce one); the raw string keeps the backslash literal.
histogram = defaultdict(int)
for word in re.findall(r"\w+", text):
    histogram[word.lower()] += 1
# Left-justify every word to the width of the longest one so the counts
# line up in a column.
template = "%-" + str(max(len(word) for word in histogram)) + "s %s"
# Most frequent words first. The single parenthesised argument makes the
# print valid on Python 2 and 3 alike.
print("\n".join(template % (word, freq) for word, freq in
                sorted(histogram.items(), key=lambda x: x[1], reverse=True)))
Tom
July 27, 2008, July 27, 2008 03:48, permalink
As per lbolognini, but no variable required.
from collections import Counter

# Sample lyrics, one explicit "\n" per line so that adjacent lines are not
# fused into a single word.
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)
# Counter tallies the whitespace-separated tokens in a single pass and
# counts whole tokens only; mamaMia.count(w) would also count substring
# hits (the "I" inside "I've") and rescan the text for every token.
result = dict(Counter(mamaMia.split()))
for pair in result.items():
    # pair is (token, count); print one "<token><TAB><count>" line.
    print("%s\t%d" % pair)
Walter Cruz
July 28, 2008, July 28, 2008 12:46, permalink
What about shlex and itertools?
import shlex
from itertools import groupby
def ilen(it):
    """Return the number of items produced by the iterable ``it``.

    The iterable is consumed. An empty iterable gives 0; counting with
    ``sum`` avoids relying on a loop variable that is never bound when
    there is nothing to iterate over.
    """
    return sum(1 for _ in it)
def getWordFrequencies(text):
    """Yield ``(count, word)`` pairs for the words in ``text``.

    The text is tokenised by a non-POSIX ``shlex`` lexer whose whitespace
    set is extended with ``?``, ``,`` and ``'`` so that those characters
    separate words. Pairs are produced in sorted word order.

    Args:
        text: The string to tokenise.

    Yields:
        Tuples ``(count, word)``, one per distinct token.
    """
    lexer = shlex.shlex(text, posix=False)
    lexer.whitespace = lexer.whitespace + "?,'"
    # groupby only merges adjacent equal items, so the tokens are sorted
    # first; sorted() consumes the lexer directly, no intermediate list.
    for word, group in groupby(sorted(lexer)):
        yield sum(1 for _ in group), word
# Sample lyrics. Each line ends with an explicit "\n": a backslash-newline
# inside a string literal joins the lines with no separator at all, which
# fuses the last word of one line with the first word of the next
# (e.g. "againMamma").
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)
for freq, word in getWordFrequencies(mamaMia):
    # Emits "<count> <TAB><word>", the same text the Python 2 statement
    # `print freq, "\t", word` produced; the single parenthesised argument
    # makes the line valid on Python 2 and 3 alike.
    print("%s \t%s" % (freq, word))
Leif Ryge
October 31, 2008, October 31, 2008 19:58, permalink
This prints not only the count but also the locations of each word.
def mk_index(seq):
    """Map each distinct item of a sequence to the positions where it occurs.

    >>> sorted(mk_index("abcba").items())
    [('a', [0, 4]), ('b', [1, 3]), ('c', [2])]
    """
    positions = {}
    for where, element in enumerate(seq):
        # Start an empty position list the first time an element is seen.
        if element not in positions:
            positions[element] = []
        positions[element].append(where)
    return positions
def printWordFrequencyAndLocationReport(text):
    """Print the count and locations of each word in ``text``.

    Words are the whitespace-separated tokens of the lower-cased text, and
    a location is a token's zero-based position among those tokens. Lines
    are ordered by descending count; words with equal counts are ordered
    by where they first occur.

    >>> printWordFrequencyAndLocationReport('Ob la di ob la da')
    ob occurred 2 times [0, 3]
    la occurred 2 times [1, 4]
    di occurred 1 time [2]
    da occurred 1 time [5]
    """
    index = mk_index(text.lower().split())
    # The key takes the whole (word, locations) pair: tuple-unpacking lambda
    # parameters were removed in Python 3. locations[0] is the first
    # occurrence, which makes the order of ties independent of dict ordering.
    ranked = sorted(index.items(),
                    key=lambda entry: (-len(entry[1]), entry[1][0]))
    for word, locs in ranked:
        plural = 's' if len(locs) > 1 else ''
        print("%s occurred %s time%s %s" % (word, len(locs), plural, locs))
import doctest
# Run the doctest examples found in this module's docstrings, with verbose
# reporting enabled.
doctest.testmod(verbose=True)
Leif Ryge
October 31, 2008, October 31, 2008 20:37, permalink
Here is another version which uses the handy itertools.groupby function.
from itertools import groupby
import doctest
def printWordFrequencies(text):
    """Print ``<count> <word>`` for each distinct word, in sorted word order.

    Words are the whitespace-separated tokens of the lower-cased text.

    >>> printWordFrequencies("Ob la di ob la da")
    1 da
    1 di
    2 la
    2 ob
    """
    # groupby only merges adjacent equal items, so the words are sorted first.
    for word, group in groupby(sorted(text.lower().split())):
        # Count the group lazily instead of building a list just for len().
        # The single parenthesised argument makes the print valid on
        # Python 2 and 3 alike.
        print("%s %s" % (sum(1 for _ in group), word))
# Run the doctest examples found in this module's docstrings, with verbose
# reporting enabled.
doctest.testmod(verbose=True)
Suganya
January 19, 2010, January 19, 2010 09:23, permalink
I wanted code to count the number of occurrences of each word in a text using JavaScript. Could anyone please help?
This code counts number of occurrences of each word in a string.
How can I improve it or make it more pythonic?