"""Simple Text Generator for Python
This defines a class SimpleTextGenerator which allows you to
generate a text automatically from another text. Two helper classes,
Tokenizer and WordList, are also defined.
The standard method for using SimpleTextGenerator is just:
>>> g = SimpleTextGenerator(filename)
>>> g.generate()
where *filename* is a name of a file from which frequencies are
to be read.
The basic idea and the tokenization code was originally in Paul Graham's book,
ANSI Common Lisp.
"""
# Module metadata. The "$Revision: ...$" and "$Date: ...$" strings look like
# version-control keyword expansions -- presumably they were filled in by the
# VCS rather than by hand; confirm before editing them manually.
__author__ = "Will Fitzgerald (wfitzg@kzoo.edu)"
__version__ = "$Revision: 1.1$"
__date__ = "$Date: 2003/01/05 $"
__copyright__ = "Copyright (c) 2003 Will Fitzgerald"
__license__ = "Python"
import random
import string
import sys
def isPunct(ch):
    """
    Is the 'character' a standard punctuation character?

    ch -- the string to test. Only a string of length exactly one can
          qualify; the empty string and longer strings give False.

    Returns True when ch is one of the characters of string.punctuation,
    False otherwise.
    """
    # A membership test replaces the string.find() module function (which
    # no longer exists in Python 3) and the separate index-0 special case
    # that the old "find(...) > 0" comparison needed.
    return len(ch) == 1 and ch in string.punctuation
def isWhitespace(ch):
    """
    Is the 'character' a standard whitespace character?

    ch -- the string to test. Only a string of length exactly one can
          qualify; the empty string and longer strings give False.

    Returns True when ch is one of the characters of string.whitespace,
    False otherwise.
    """
    # A membership test replaces the string.find() module function (which
    # no longer exists in Python 3) and the separate index-0 special case
    # that the old "find(...) > 0" comparison needed.
    return len(ch) == 1 and ch in string.whitespace
def isAlpha(ch):
    """
    Is the 'character' an ASCII letter?

    ch -- the string to test. Only a string of length exactly one can
          qualify; the empty string and longer strings give False.

    Returns True when ch is one of the characters of string.ascii_letters,
    False otherwise.
    """
    # string.letters and string.find() do not exist in Python 3;
    # string.ascii_letters is available on both Python 2 and 3.
    return len(ch) == 1 and ch in string.ascii_letters
def isDigit(ch):
    """
    Is the 'character' a decimal digit?

    ch -- the string to test. Only a string of length exactly one can
          qualify; the empty string and longer strings give False.

    Returns True when ch is one of the characters of string.digits,
    False otherwise.
    """
    # A membership test replaces the string.find() module function (which
    # no longer exists in Python 3) and the separate index-0 special case
    # that the old "find(...) > 0" comparison needed.
    return len(ch) == 1 and ch in string.digits
def isAlphaNumeric(ch):
    """
    Is the 'character' a letter or a digit?

    ch -- the string to test.

    Returns the result of isAlpha(ch) if that is true, otherwise the
    result of isDigit(ch).
    """
    # Logical "or" (short-circuiting) instead of the bitwise "|" operator.
    return isAlpha(ch) or isDigit(ch)
class Tokenizer(object):
    """
    Tokenizer: tokenizes strings for English, with punctuation.

    A word is a maximal run of alphanumeric characters and apostrophes.
    Every other punctuation character becomes a one-character token of
    its own. Any remaining character (whitespace or anything else) only
    separates tokens and is discarded.
    """

    def tokenize(self, line):
        """
        Split a line into word and punctuation tokens.

        line -- the string to tokenize.

        Returns a list of non-empty strings, in the order in which they
        occur in line. Empty-string tokens are never produced.
        """
        tokens = []
        # Index at which the word currently being scanned began, or None
        # while the scan is between words.
        st = None
        for pos in range(len(line)):
            ch = line[pos]
            # The apostrophe is tested before isPunct so that contractions
            # such as "don't" stay a single word.
            if isAlphaNumeric(ch) or ch == "'":
                if st is None:
                    st = pos
                continue
            # Any other character ends the word in progress.
            if st is not None:
                tokens.append(line[st:pos])
                st = None
            if isPunct(ch):
                tokens.append(ch)
            # Only this one character is consumed, so the character that
            # follows punctuation is never skipped.
        if st is not None:
            # The line ended in the middle of a word.
            tokens.append(line[st:])
        return tokens
class WordList(object):
    """
    WordList: a 'list' of words that tracks word frequency.

    attributes:
    wordCount -- total number of words in list
    dictionary -- dictionary of word->frequency counts
    wordList -- the words (in sorted order by frequency); only brought
                up to date by sortList()
    tokenizer -- object whose tokenize(line) method splits a line into words
    """

    def __init__(self, tokenizer=None):
        """
        Creates an empty word list.

        Keyword arguments:
        tokenizer -- object with a tokenize(line) method; when omitted
                     (None) a new Tokenizer is created.
        """
        self.__wordCount = 0
        self.__dictionary = {}
        self.__wordList = []
        # The default is built here, per instance, rather than once in the
        # signature when the class body is executed.
        if tokenizer is None:
            tokenizer = Tokenizer()
        self.__tokenizer = tokenizer

    def getWordCount(self):
        return self.__wordCount

    def setWordCount(self, wordCount):
        self.__wordCount = wordCount

    wordCount = property(getWordCount, setWordCount)

    def getDictionary(self):
        return self.__dictionary

    def setDictionary(self, dictionary):
        self.__dictionary = dictionary

    dictionary = property(getDictionary, setDictionary)

    def getWordList(self):
        return self.__wordList

    def setWordList(self, wordList):
        self.__wordList = wordList

    wordList = property(getWordList, setWordList)

    def getTokenizer(self):
        return self.__tokenizer

    def setTokenizer(self, tokenizer):
        self.__tokenizer = tokenizer

    tokenizer = property(getTokenizer, setTokenizer)

    def addWord(self, word):
        """
        adds a word to the word List

        The empty string is ignored. wordList is not updated; call
        sortList() once all words have been added.
        """
        if word == '':
            return
        counts = self.dictionary
        counts[word] = counts.get(word, 0) + 1
        self.wordCount = self.wordCount + 1

    def tokenizeLine(self, line):
        """
        tokenizes a line

        Returns whatever the tokenizer's tokenize(line) returns.
        """
        return self.tokenizer.tokenize(line)

    def processLine(self, line):
        """
        adds all of the words in a line to the word list
        """
        for word in self.tokenizeLine(line):
            self.addWord(word)

    def processFile(self, filename):
        """
        adds all of the words in a file to the word list

        filename -- name of the text file to read.

        The list is sorted afterwards, so randomWord and
        randomWeightedWord can be used directly.
        """
        # "with" closes the file even when processing a line raises.
        with open(filename, "r") as in_file:
            for in_line in in_file:
                self.processLine(in_line)
        self.sortList()

    def sortList(self):
        """
        sorts the list by frequency *must* be called to
        have randomWeightedWord work.

        Most frequent words come first; words with equal frequency are
        ordered alphabetically (the same order compareWords defines).
        """
        counts = self.dictionary
        # A key function works on Python 2 and 3; list.sort() with a
        # comparison function does not exist in Python 3.
        self.wordList = sorted(counts, key=lambda word: (-counts[word], word))

    def compareWords(self, word1, word2):
        """
        compares two words. used for sorting.

        Returns -1 if word1 sorts before word2, 1 if it sorts after and 0
        if the two are the same word. The more frequent word sorts first;
        ties are broken alphabetically. Words missing from the dictionary
        count as frequency 0.
        """
        counts = self.dictionary
        c1 = counts.get(word1, 0)
        c2 = counts.get(word2, 0)
        if c1 != c2:
            # Higher frequency sorts first.
            if c1 > c2:
                return -1
            return 1
        if word1 > word2:
            return 1
        if word1 < word2:
            return -1
        return 0

    def dumpWords(self):
        """
        dumps words and word frequencies

        Prints one "word count" line per entry of wordList.
        """
        for word in self.wordList:
            # A single parenthesised argument prints the same text on
            # Python 2 and Python 3.
            print("%s %s" % (word, self.dictionary[word]))

    def randomWord(self):
        """
        returns a random word (without regard to frequency)

        Raises ValueError if no words have been added or wordList has not
        been filled by sortList().
        """
        if self.wordCount <= 0 or not self.wordList:
            raise ValueError("WordList is empty")
        return random.choice(self.wordList)

    def randomWeightedWord(self):
        """
        returns a random word, more frequent words more likely to be returned.

        Raises ValueError if no words have been added. Returns None when
        the counts of the words in wordList never add up to the number
        drawn, which happens if wordList is out of date because
        sortList() was not called after adding words.
        """
        if self.wordCount <= 0:
            raise ValueError("WordList is empty")
        # Draw a position among all word occurrences, then walk the
        # cumulative counts until that position is reached.
        n = random.randrange(1, self.wordCount + 1)
        sofar = 0
        for word in self.wordList:
            sofar = sofar + self.dictionary[word]
            if sofar >= n:
                return word
        return None
class SimpleTextGenerator(object):
    """
    A simple text generator, based just on the raw frequency
    of the words in the generator (which can be initialized from a file).

    Example use:
    g = SimpleTextGenerator('alice1.txt')
    g.generate()

    attributes:
    wordList -- a WordList object
    """

    def __init__(self, filename=None, tokenizer=None):
        """
        Creates a generator, optionally reading word frequencies from a file.

        Keyword arguments:
        filename -- name of a text file to read frequencies from; when
                    omitted (None) the word list starts empty.
        tokenizer -- object with a tokenize(line) method; when omitted
                     (None) a new Tokenizer is created.
        """
        # The default is built here, per instance, rather than once in the
        # signature when the class body is executed.
        if tokenizer is None:
            tokenizer = Tokenizer()
        self.__wordList = WordList(tokenizer)
        if filename is not None:
            self.wordList.processFile(filename)

    def getWordList(self):
        return self.__wordList

    def setWordList(self, wordList):
        self.__wordList = wordList

    wordList = property(getWordList, setWordList)

    def generate(self, n=1000, minParagraphLength=10, maxParagraphLength=20,
                 out=None):
        """
        Generates a random text.

        Keyword arguments:
        n -- total number of words to generate
        minParagraphLength -- the minimum number of sentences in a paragraph.
        maxParagraphLength -- the maximum number of sentences in a paragraph
                              (exclusive upper bound; when it is not
                              greater than minParagraphLength every
                              paragraph has minParagraphLength sentences)
        out -- object with a write(text) method that receives the text;
               when omitted (None) sys.stdout is used.

        Every punctuation token counts as the end of a sentence. Words are
        separated by one space, punctuation is written without a leading
        space, paragraphs are separated by a blank line and the text ends
        with a newline.
        """
        if out is None:
            # Looked up at call time so a redirected sys.stdout is honoured.
            out = sys.stdout

        def pickParagraphLength():
            # random.randrange() raises ValueError on an empty range, so a
            # degenerate (min >= max) setting falls back to the minimum.
            if maxParagraphLength > minParagraphLength:
                return random.randrange(minParagraphLength, maxParagraphLength)
            return minParagraphLength

        sentences = 0
        atParagraphStart = True
        paraLen = pickParagraphLength()
        for _ in range(n):
            word = self.wordList.randomWeightedWord()
            if isPunct(word):
                sentences += 1
            elif atParagraphStart:
                # First word of a paragraph: no separating space.
                atParagraphStart = False
            else:
                out.write(" ")
            out.write(word)
            if sentences >= paraLen:
                # Paragraph complete: end the line and leave a blank line.
                sentences = 0
                atParagraphStart = True
                paraLen = pickParagraphLength()
                out.write("\n\n")
        out.write("\n")