"""Simple Text Generator for Python This defines a class SimpleTextGenerator which allows you to create generate a text automatically from another text. Two helper classes, Tokenizer and WordList, are also defined. The standard method for using SimpleTextGenerator is just: >>> g = SimpleTextGenerator(filename) >>> g.generate() where *filename* is a name of a file from which frequencies are to be read. The basic idea and the tokenization code was originally in Paul Graham's book, ANSI Common Lisp. """ __author__ = "Will Fitzgerald (wfitzg@kzoo.edu)" __version__ = "$Revision: 1.1$" __date__ = "$Date: 2003/01/05 $" __copyright__ = "Copyright (c) 2003 Will Fitzgerald" __license__ = "Python" import string ## I think there is a bug in string.find. -- wf def isPunct(ch): """ Is the 'character' a standard punctuation character? """ return (len(ch)==1) and (ch==string.punctuation[0:1] or (string.find(string.punctuation,ch)>0)) def isWhitespace(ch): """ Is the 'character' a standard whitespace character? """ return (len(ch)==1) and (ch==string.whitespace[0:1] or (string.find(string.whitespace,ch)>0)) def isAlpha(ch): return (len(ch)==1) and (ch==string.letters[0:1] or (string.find(string.letters,ch)>0)) def isDigit(ch): return (len(ch)==1) and (ch==string.digits[0:1] or (string.find(string.digits,ch)>0)) def isAlphaNumeric(ch): return isAlpha(ch) | isDigit(ch) class Tokenizer(object): """ Tokenizer: tokenizes strings for English, with punctuation. """ def tokenize(self,line): tokens = [] st = 0 pos = 0 while pos<=len(line): ch = line[pos:pos+1] if isAlphaNumeric(ch) | (ch=='\''): pos += 1 else: word = string.strip(line[st:pos]) tokens.append(word) while isPunct(line[pos:pos+1]): tokens.append(line[pos:pos+1]) pos += 1 pos += 1 chhere = line[pos:pos+1] while (pos<=len(line)) and (isWhitespace(chhere)): pos += 1 chhere = line[pos:pos+1] st = pos return tokens class WordList(object): """ WordList: a 'list' of words that tracks word frequency. attributes: wordCount -- total number of words in list dictionary -- dictionary of word->frequency counts wordList -- the words (in sorted order by frequency) """ def __init__(self,tokenizer=Tokenizer()): self.__wordCount=0 self.__dictionary={} self.__wordList=[] self.__tokenizer=tokenizer # accessors for wordCount def getWordCount(self): return self.__wordCount def setWordCount(self,wordCount): self.__wordCount=wordCount wordCount=property(getWordCount,setWordCount) # accessors for dictionary def getDictionary(self): return self.__dictionary def setDictionary(self,dictionary): self.__dictionary=dictionary dictionary=property(getDictionary,setDictionary) # accessors for wordlist def getWordList(self): return self.__wordList def setWordList(self,wordList): self.__wordList=wordList wordList=property(getWordList,setWordList) # accessors for tokenizer def getTokenizer(self): return self.__tokenizer def setTokenizer(self,tokenizer): self.__tokenizer=tokenizer tokenizer=property(getTokenizer,setTokenizer) # add word to list def addWord(self,word): """ adds a word to the word List """ d = self.dictionary if word != '': d[word] = d.get(word,0)+1 self.wordCount = self.wordCount+1 # tokenize a line def tokenizeLine(self,line): """ tokenizes a line """ return self.tokenizer.tokenize(line) # process a line def processLine(self,line): """ adds all of the words in a line to the word list """ # tokenize and add words for word in self.tokenizeLine(line): self.addWord(word) # process a file def processFile(self,filename): """ adds all of the words in a file to the word list """ in_file = open(filename,"r") while 1: in_line = in_file.readline() if in_line == "": break self.processLine(in_line) in_file.close() self.sortList() def sortList(self): """ sorts the list by frequency *must* be called to have randomWeightedWord work. """ # get the list from the keys self.wordList = self.dictionary.keys() # order them by frequency (a little magic below) self.wordList.sort(lambda a,b:self.compareWords(a,b)) def compareWords(self,word1,word2): """ compares two words. used for sorting. """ d = self.dictionary c1 = d.get(word1,0) c2 = d.get(word2,0) if (c1==c2): # counts are same; base on word form if (word1>word2): return 1 elif (word1c2): # we want reverse sorting... return -1 else: return 1 def dumpWords(self): """ dumps words and word frequencies """ for word in self.wordList: print word,self.dictionary[word] def randomWord(self): """ returns a random word (without regard to frequency) """ if self.wordCount > 0: return self.wordList[random.randrange(0,len(self.wordlist))] else: raise "WordList is empty" def randomWeightedWord(self): """ returns a random word, more frequent words more likely to be returned. """ if self.wordCount <= 0: raise "WordList is empty" else: n = random.randrange(1,self.wordCount+1) sofar = 0 for word in self.wordList: sofar = sofar + self.dictionary[word] if sofar >= n: return word class SimpleTextGenerator(object): """ A simple text generator, based just on the raw frequency of the words in the generator (which can be intialized from a file). Example use: g = SimpleTextGenerator('alice1.txt') g.generate() attributes: wordList -- a WordList object """ def __init__(self,filename=None,tokenizer=Tokenizer()): self.__wordList = WordList(tokenizer) if filename != None: self.wordList.processFile(filename) # accessors for wordList def getWordList(self): return self.__wordList def setWordList(self,wordList): self.__wordList=wordList wordList=property(getWordList,setWordList) # generate! """ Generates a random text. Keyword arguments: n -- total number of words to generate minParagraphLength -- the minimum number of sentences in a paragraph. maxParagraphLength -- the maximum number of sentences in a paragraph. (must be at least one more than minParagraphLength) """ def generate(self,n=1000,minParagraphLength=10,maxParagraphLength=20): lines = 0 first = 1 paraLen = random.randrange(minParagraphLength,maxParagraphLength) for i in range(n): word = self.wordList.randomWeightedWord() if isPunct(word): lines += 1 sys.stdout.write(word) else: if (first==1): first=0 else: sys.stdout.write(" ") sys.stdout.write(word) if lines >= paraLen: lines = 0 first = 1 paraLen = random.randrange(minParagraphLength,maxParagraphLength) print print print