"""Simple Text Generator for Python

This defines a class SimpleTextGenerator which allows you to
generate a text automatically from another text.  Two helper classes,
Tokenizer and WordList, are also defined.

The standard method for using SimpleTextGenerator is just:
>>> g = SimpleTextGenerator(filename)
>>> g.generate()
where *filename* is the name of a file from which word frequencies are
to be read.

The basic idea and the tokenization code were originally in Paul Graham's book,
ANSI Common Lisp.

"""

# Module metadata: author, revision, date, copyright and license strings.
__author__ = "Will Fitzgerald (wfitzg@kzoo.edu)"
__version__ = "$Revision: 1.1$"
__date__ = "$Date: 2003/01/05 $"
__copyright__ = "Copyright (c) 2003 Will Fitzgerald"
__license__ = "Python"

import random
import string
import sys

## Note: string.find returns the index of the match (0 for the first
## character) and -1 when there is none, so "found" means ">= 0", not "> 0".
def isPunct(ch):
    """
    Is the 'character' a standard punctuation character?

    Returns True only when *ch* is a string of exactly one character that
    occurs in string.punctuation; the empty string and longer strings
    give False.
    """
    # The length test must come first: '' is "in" every string.
    return len(ch) == 1 and ch in string.punctuation

def isWhitespace(ch):
    """
    Is the 'character' a standard whitespace character?

    Returns True only when *ch* is a string of exactly one character that
    occurs in string.whitespace; the empty string and longer strings
    give False.
    """
    # The length test must come first: '' is "in" every string.
    return len(ch) == 1 and ch in string.whitespace

def isAlpha(ch):
    """
    Is the 'character' a letter?

    Returns True only when *ch* is a string of exactly one character that
    is a letter; the empty string and longer strings give False.
    """
    # Python 2 provides the locale-dependent string.letters; Python 3 only
    # has string.ascii_letters, so fall back to that when letters is absent.
    letters = getattr(string, "letters", string.ascii_letters)
    # The length test must come first: '' is "in" every string.
    return len(ch) == 1 and ch in letters

def isDigit(ch):
    """
    Is the 'character' a decimal digit?

    Returns True only when *ch* is a string of exactly one character that
    occurs in string.digits; the empty string and longer strings give False.
    """
    # The length test must come first: '' is "in" every string.
    return len(ch) == 1 and ch in string.digits

def isAlphaNumeric(ch):
    """
    Is the 'character' a letter or a digit?

    True when either isAlpha(ch) or isDigit(ch) is true.
    """
    # Logical 'or' (not bitwise '|') so the digit test is skipped for letters.
    return isAlpha(ch) or isDigit(ch)

class Tokenizer(object):
    """
    Tokenizer: tokenizes strings for English, with punctuation.

    A token is either a word (a run of letters, digits and apostrophes)
    or a single punctuation character.
    """
    def _isWordChar(self,ch):
        """
        Is the 'character' part of a word?  Apostrophes count, so that
        contractions such as "don't" stay in one token.
        """
        return isAlphaNumeric(ch) or ch == '\''

    def tokenize(self,line):
        """
        Splits *line* into a list of tokens, in order of appearance.

        Words are returned whole; every punctuation character is returned
        as a token of its own.  Whitespace and any character that is
        neither a word character nor punctuation only separate tokens and
        are not returned.  An apostrophe that immediately follows other
        punctuation is returned as a punctuation token rather than
        starting a word.  No empty tokens are returned; an empty line
        gives an empty list.
        """
        tokens = []
        end = len(line)
        pos = 0
        while pos < end:
            ch = line[pos]
            if self._isWordChar(ch):
                # consume the whole word
                start = pos
                pos += 1
                while pos < end and self._isWordChar(line[pos]):
                    pos += 1
                tokens.append(line[start:pos])
            elif isPunct(ch):
                # consume the punctuation run, one token per character;
                # stop exactly at the first non-punctuation character so
                # that a word directly after it (as in "(hello)") is kept
                while pos < end and isPunct(line[pos]):
                    tokens.append(line[pos])
                    pos += 1
            else:
                # whitespace or an unclassified character: just a separator
                pos += 1
        return tokens
 
     
class WordList(object):
    """
    WordList: a 'list' of words that tracks word frequency.

    attributes:
    wordCount -- total number of words in list
    dictionary -- dictionary of word->frequency counts
    wordList -- the words (in sorted order by frequency)
    tokenizer -- object whose tokenize(line) method splits a line into words
    """
    def __init__(self,tokenizer=None):
        """
        creates an empty word list.

        tokenizer -- the tokenizer to use; when omitted (None) a new
                     Tokenizer is created for this instance.
        """
        # Resolve the default here rather than in the signature, so it is
        # not evaluated once at definition time and shared by all instances.
        if tokenizer is None:
            tokenizer = Tokenizer()
        self.__wordCount=0
        self.__dictionary={}
        self.__wordList=[]
        self.__tokenizer=tokenizer

    # accessors for wordCount
    def getWordCount(self):
        return self.__wordCount
    def setWordCount(self,wordCount):
        self.__wordCount=wordCount
    wordCount=property(getWordCount,setWordCount)

    # accessors for dictionary
    def getDictionary(self):
        return self.__dictionary
    def setDictionary(self,dictionary):
        self.__dictionary=dictionary
    dictionary=property(getDictionary,setDictionary)

    # accessors for wordlist
    def getWordList(self):
        return self.__wordList
    def setWordList(self,wordList):
        self.__wordList=wordList
    wordList=property(getWordList,setWordList)

    # accessors for tokenizer
    def getTokenizer(self):
        return self.__tokenizer
    def setTokenizer(self,tokenizer):
        self.__tokenizer=tokenizer
    tokenizer=property(getTokenizer,setTokenizer)

    # add word to list
    def addWord(self,word):
        """
        adds a word to the word list; the empty string is ignored.
        """
        if word == '':
            return
        counts = self.dictionary
        counts[word] = counts.get(word,0)+1
        self.wordCount = self.wordCount+1

    # tokenize a line
    def tokenizeLine(self,line):
        """
        tokenizes a line with this list's tokenizer and returns the tokens
        """
        return self.tokenizer.tokenize(line)

    # process a line
    def processLine(self,line):
        """
        adds all of the words in a line to the word list
        """
        # tokenize and add words
        for word in self.tokenizeLine(line):
            self.addWord(word)

    # process a file
    def processFile(self,filename):
        """
        adds all of the words in a file to the word list, then sorts it.

        The file is closed even if processing a line raises.
        """
        with open(filename,"r") as in_file:
            for in_line in in_file:
                self.processLine(in_line)
        self.sortList()

    def sortList(self):
        """
        rebuilds wordList from the dictionary: most frequent words first,
        words with equal counts in ascending word order.

        processFile calls this itself; randomWord and randomWeightedWord
        call it when words have been added since the last sort.
        """
        counts = self.dictionary
        # same ordering as compareWords, expressed as a sort key
        self.wordList = sorted(counts, key=lambda word: (-counts[word], word))

    def _ensureSorted(self):
        """
        re-sorts when wordList does not hold as many words as dictionary,
        i.e. when new distinct words were added after the last sortList.
        """
        if len(self.wordList) != len(self.dictionary):
            self.sortList()

    def compareWords(self,word1,word2):
        """
        compares two words. used for sorting.

        Returns -1, 0 or 1.  The word with the higher count sorts first;
        words with equal counts are ordered by the words themselves.
        Words missing from the dictionary count as 0.
        """
        d = self.dictionary
        # negate the counts: we want reverse sorting on frequency
        key1 = (-d.get(word1,0), word1)
        key2 = (-d.get(word2,0), word2)
        return (key1 > key2) - (key1 < key2)

    def dumpWords(self):
        """
        dumps words and word frequencies to standard output, one
        "word count" pair per line, in wordList order
        """
        for word in self.wordList:
            sys.stdout.write("%s %s\n" % (word, self.dictionary[word]))

    def randomWord(self):
        """
        returns a random word (without regard to frequency)

        Raises ValueError if the word list is empty.
        """
        self._ensureSorted()
        if self.wordCount <= 0 or not self.wordList:
            raise ValueError("WordList is empty")
        return random.choice(self.wordList)

    def randomWeightedWord(self):
        """
        returns a random word, more frequent words more likely to be returned.

        Raises ValueError if the word list is empty.
        """
        if self.wordCount <= 0:
            raise ValueError("WordList is empty")
        self._ensureSorted()
        # pick a position 1..wordCount and walk the cumulative counts
        # until that position is reached
        target = random.randrange(1,self.wordCount+1)
        sofar = 0
        for word in self.wordList:
            sofar = sofar + self.dictionary[word]
            if sofar >= target:
                return word
        # Only reached if wordCount was set higher than the sum of the
        # counts of the words in wordList.
        return None


class SimpleTextGenerator(object):
    """
    A simple text generator, based just on the raw frequency
    of the words in the generator (which can be initialized from a file).
    Example use:
    g = SimpleTextGenerator('alice1.txt')
    g.generate()

    attributes:
    wordList -- a WordList object
    """

    def __init__(self,filename=None,tokenizer=None):
        """
        creates a generator.

        filename -- optional name of a file whose words are read into
                    the word list.
        tokenizer -- tokenizer handed to the WordList; when omitted (None)
                     a new Tokenizer is created.
        """
        # Resolve the default here rather than in the signature, so it is
        # not evaluated once at definition time.
        if tokenizer is None:
            tokenizer = Tokenizer()
        self.__wordList = WordList(tokenizer)
        if filename is not None:
            self.wordList.processFile(filename)

    # accessors for wordList
    def getWordList(self):
        return self.__wordList
    def setWordList(self,wordList):
        self.__wordList=wordList
    wordList=property(getWordList,setWordList)

    # generate!
    def generate(self,n=1000,minParagraphLength=10,maxParagraphLength=20):
        """
        Generates a random text and writes it to standard output.

        Keyword arguments:
        n -- total number of words to generate (punctuation tokens
          count as words)
        minParagraphLength -- the minimum number of sentences in a paragraph.
        maxParagraphLength -- the maximum number of sentences in a paragraph.
          (must be at least one more than minParagraphLength; the length
          actually drawn is always below maxParagraphLength)

        Every punctuation token is counted as ending a sentence.
        Raises ValueError (from random.randrange) when maxParagraphLength
        is not greater than minParagraphLength.
        """
        write = sys.stdout.write
        lines = 0
        first = True
        paraLen = random.randrange(minParagraphLength,maxParagraphLength)
        for i in range(n):
            word = self.wordList.randomWeightedWord()
            if isPunct(word):
                # punctuation attaches to the preceding word: no space
                lines += 1
            elif first:
                # first word of a paragraph: no leading space
                first = False
            else:
                write(" ")
            write(word)
            if lines >= paraLen:
                # paragraph is full: start the next one after a blank line
                lines = 0
                first = True
                paraLen = random.randrange(minParagraphLength,maxParagraphLength)
                write("\n\n")
        write("\n")