import sys
#input into this structure
words = {}
#strings generated go here
gen = []
#input word
word = ""
#final output
final = []
#first arg = word to parse, second arg = wordlist name, optional, third arg = (generated word) minimum length, optional
arglength = len(sys.argv)
if arglength > 1:
word = sys.argv[1]
else:
print "Please specify an input file"
quit()
if arglength > 2:
list = sys.argv[2]
else:
print "Using default wordlist.txt"
list = "wordlist.txt"
if arglength > 3:
cutoff = int(sys.argv[3])
else:
print "Using default cutoff of 1 letter"
cutoff = 1
#create output file based on the word we're generating
outFile = "output-" + word + ".txt"
out = open(outFile, 'w')
#create input file
f = open(list,'r')
#file is one word per line, strip out the endline character, and throw in a dictionary
for line in f:
line = line.strip()
if len(line) > cutoff:
words[line] = 1
#recurse calculates all the substrings forward, is exponential
#input: current string, base string
#example: spray => recurse("s","pray"), recurse("","pray")
def recurse(current, string):
if string == "":
return
gen.append(current)
gen.append(str(current + string[:1]))
recurse(current, string[1:])
recurse(str(current + string[:1]), string[1:])
#check the generated strings for words
#input: string list, word list
def checkOverlap(gen, words):
for item in gen:
#these are just strings, so lets check if they're words
if item in words:
#ignoring duplicates as we go
if item not in final:
final.append(item)
#start recursion on the input word
recurse("", word)
#given the generated strings, find the words
checkOverlap(gen, words)
#sort the output
final.sort()
#write the output
for item in final:
out.write(item)
out.write('\n')
print "Output written to " + outFile
Refactorings
No refactoring yet !
nosklo
January 20, 2009, January 20, 2009 14:28, permalink
Using optparse to parse options.
Using a set to store words.
Using generators and list comprehensions.
Usage example:
$ python williams.py doubt
do
dot
doubt
dub
out
$ python williams.py -h
Usage: williams.py [options] words...
Options:
-h, --help show this help message and exit
-w FILE, --wordlist=FILE
wordlist filename, default wordlist.txt
-m LENGTH, --minlenght=LENGTH
minimum length of generated word, default 1
-o FILE, --output=FILE
Output to FILE. By default output to stdout.
import sys
import optparse
# parse command line options:
parser = optparse.OptionParser('%prog [options] words...')
parser.add_option("-w", "--wordlist", dest="words", default='wordlist.txt',
help="wordlist filename, default wordlist.txt", metavar="FILE")
parser.add_option("-m", "--minlenght", dest="cutoff", default=1, type='int',
help="minimum length of generated word, default 1", metavar="LENGTH")
parser.add_option("-o", "--output", dest="outputfile",
help="Output to FILE. By default output to stdout.", metavar="FILE")
options, words = parser.parse_args()
# read wordlist:
wordlist = set(word.strip() for word in open(options.words)
if len(word.strip()) > options.cutoff)
# open output file
if options.outputfile:
out = open(options.outputfile, 'w')
else:
out = sys.stdout
def recurse(current, base):
"""
recurse calculates all the substrings forward, is exponential
input: current string, base string
example: spray => recurse("s","pray"), recurse("","pray")
"""
result = set()
if base:
result.add(current)
result.add(current + base[:1])
result.update(recurse(current, base[1:]))
result.update(recurse(current + base[:1], base[1:]))
return result
for word in words:
out.writelines(sorted('%s\n' % item for item in recurse('', word)
if item in wordlist))
cmurphycode.blogspot.com
February 20, 2009, February 20, 2009 03:21, permalink
For some reason this failed to notify me. Nice stuff! I hadn't seen optparse before, I like it a lot. Using a set is a good idea and generators are always grand. Thanks!
cmurphycode.blogspot.com
April 27, 2010, April 27, 2010 20:35, permalink
Oh My. A long journey took me to this post, which I realized I never updated. Here's what I had written a few days after the initial post to optimize for longer words...still awful code, but prefix pruning allows somewhat better than exponential behavior.
import sys
#input into this structure
words = {}
wordList = []
#strings generated go here
gen = []
#input word
word = ""
#final output
final = []
#output file
outFile = ""
def recurse(current, string, prefixList):
''' calculates all the substrings forward (cannot backtrack)
Input: current string, base string
Example: spray => recurse("s","pray"), recurse("","pray") (exponential)'''
#base case: no more base string to process
if string == "":
return
#if the current string is not a prefix for a word, might as well quit
if len(current) < len(prefixList):
if current not in prefixList[len(current)]:
return
#recursive case: we create two new threads of processing, one with the first character in the base string,
#and one without, thus we will hit all possible strings that can be generated forward
#note: this is different than just mixing all of the letters together however you please
#So here we add the two cases to our generated strings list
gen.append(current)
gen.append(str(current + string[:1]))
#then recurse on the two
recurse(current, string[1:],prefixList)
recurse(str(current + string[:1]), string[1:],prefixList)
def checkOverlap(gen, words):
'''check the generated strings for words
input: string list, word list'''
for item in gen:
#these are just strings, so lets check if they're words
if item in words:
#ignoring duplicates as we go
if item not in final:
final.append(item)
def generatePrefixes(words):
prefixList = [{},{},{},{},{},{},{}]
for word in words:
for i in range(7):
prefixList[i][word[:i]] = 1
return prefixList
if __name__ == "__main__":
#first arg = word to parse, second arg = wordlist name, optional, third arg = (generated word) minimum length, optional
arglength = len(sys.argv)
if arglength > 1:
word = sys.argv[1]
else:
print "Please specify an input word"
quit()
if arglength > 2:
list = sys.argv[2]
else:
print "Using default wordlist.txt"
list = "wordlist.txt"
if arglength > 3:
cutoff = int(sys.argv[3])
else:
print "Using default cutoff of 1 letter"
cutoff = 1
#create output file based on the word we're generating
outFile = "pruned-output-" + word + ".txt"
out = open(outFile, 'w')
#create input file
f = open(list,'r')
#file is one word per line, strip out the endline character, and throw in a dictionary
for line in f:
line = line.strip()
if len(line) > cutoff:
words[line] = 1
wordList.append(line)
prefixList = generatePrefixes(wordList)
#start recursion on the input word
recurse("", word, prefixList)
#given the generated strings, find the words
checkOverlap(gen, words)
#sort the output
final.sort()
#write the output
for item in final:
out.write(item)
out.write('\n')
print "Output written to " + outFile
For my blog: http://www.cmurphycode.blogspot.com
A little project via Kottke (http://www.kottke.org/08/11/williams-poems), that I thought would be easy and fun. No need to actually refactor unless you're truly bored- I'm just experimenting with using this as a place to share code.