# Alexander Choe
# 11/27/2016
# CS 110
# Prof. Bill Campbell
# stylometrics.py -- Homework 5

def byFreq(pair):
    '''Sort key for (word, count) pairs: hands back the element at
    index 1 (the count) so pairs can be ordered by frequency.'''
    frequency = pair[1]
    return frequency

def freqWords():
    '''Interactively analyzes a text file and prints its word statistics.

    Prompts for a file name and for how many top words to show, then
    prints, in order: the total word count, the number of unique words,
    the requested number of most-used words with their counts, the top
    50 words with their ratios (count / total words), and finally the
    list of [word, ratio] pairs for those top 50 words.

    Words are lower-cased, punctuation is replaced by spaces, and
    apostrophes are removed before counting.  Nothing is returned.

    Raises IOError if the file cannot be opened and ValueError if the
    number of top words is not an integer.'''
    fname = raw_input("File to analyze: ")
    with open(fname, 'r') as holder:
        text = holder.read().lower()
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':
        text = text.replace(ch, ' ')
    # Apostrophes are dropped (not spaced) so "don't" counts as "dont".
    words = text.replace("'", "").split()

    counts = {}
    for w in words:
        counts[w] = counts.get(w, 0) + 1
    wordCount = len(words)

    # Python 2's input() evaluates whatever the user types as an
    # expression; read the raw text and convert it explicitly instead.
    n = int(raw_input("How many top words? "))

    # Most frequent first; ties are broken alphabetically.
    items = sorted(counts.items(), key=lambda item: (-item[1], item[0]))

    uniqueWords = "The number of unique words in " + str(fname) + " is " + str(len(counts)) + "."
    totalWords = "The total number of words in this text is " + str(wordCount) + "."
    print(totalWords)
    print(uniqueWords)

    # Slicing (rather than indexing up to n or 50) keeps short texts
    # from running off the end of the list.
    for word, count in items[:max(n, 0)]:
        print("{0:<15}{1:>5}".format(word, count))

    wordList = []
    for word, count in items[:50]:
        ratio = count / float(wordCount)
        wordList.append([word, ratio])
        print("{0:<15}{1:>5}".format(word, ratio))
    print(wordList)

# Shakespeare: Macbeth, Othello, All's Well that Ends Well
# Melville: Moby Dick, Bartleby, Omoo

def melville(filenames=('moby.txt', 'bartleby.txt', 'omoo.txt'), top=50):
    '''Builds a combined word-frequency profile from Melville's texts.

    filenames -- the text files to read; defaults to Moby Dick,
                 Bartleby, and Omoo.
    top       -- the maximum number of words to return (default 50).

    Every file is lower-cased, its punctuation is replaced by spaces,
    and apostrophes are removed before the words are counted together.

    Returns a list of [word, ratio] pairs, most frequent first (ties in
    alphabetical order), where ratio is the word's count divided by the
    total number of words read.  The list is shorter than top when the
    texts contain fewer distinct words, and empty when they contain no
    words at all.

    Raises IOError if one of the files cannot be opened.'''
    counts = {}
    wordCount = 0
    for fname in filenames:
        with open(fname, 'r') as holder:
            text = holder.read().lower()
        for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':
            text = text.replace(ch, ' ')
        # Apostrophes are dropped (not spaced) so "don't" counts as "dont".
        words = text.replace("'", "").split()
        wordCount += len(words)
        for w in words:
            counts[w] = counts.get(w, 0) + 1

    # Most frequent first; ties are broken alphabetically.
    items = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    total = float(wordCount)
    # Slicing keeps a small corpus from raising IndexError, and the
    # division is never reached when no words were read.
    return [[word, count / total] for word, count in items[:max(top, 0)]]
#        print("{0:<15}{1:>5}".format(word, count))
#    print wordList
#    uniqueWords = "The number of unique words in Melville's texts is " + str(len(counts)) + "."
#    totalWords = "The total number of words in Melville's texts is " +str(wordCount) + "."
#    print totalWords
#    print uniqueWords
#    for i in range(50):
#        word, count = items[i]
#        print("{0:<15}{1:>5}".format(word, count))

def shakespeare(filenames=('macbeth.txt', 'othello.txt', 'allswell.txt'), top=50):
    '''Builds a combined word-frequency profile from Shakespeare's texts.

    filenames -- the text files to read; defaults to Macbeth, Othello,
                 and All's Well that Ends Well.
    top       -- the maximum number of words to return (default 50).

    Every file is lower-cased, its punctuation is replaced by spaces,
    and apostrophes are removed before the words are counted together.

    Returns a list of [word, ratio] pairs, most frequent first (ties in
    alphabetical order), where ratio is the word's count divided by the
    total number of words read.  The list is shorter than top when the
    texts contain fewer distinct words, and empty when they contain no
    words at all.

    Raises IOError if one of the files cannot be opened.'''
    counts = {}
    wordCount = 0
    for fname in filenames:
        with open(fname, 'r') as holder:
            text = holder.read().lower()
        for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':
            text = text.replace(ch, ' ')
        # Apostrophes are dropped (not spaced) so "don't" counts as "dont".
        words = text.replace("'", "").split()
        wordCount += len(words)
        for w in words:
            counts[w] = counts.get(w, 0) + 1

    # Most frequent first; ties are broken alphabetically.
    items = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    total = float(wordCount)
    # Slicing keeps a small corpus from raising IndexError, and the
    # division is never reached when no words were read.
    return [[word, count / total] for word, count in items[:max(top, 0)]]
#        print("{0:<15}{1:>5}".format(word, count))
#    print wordList
#    uniqueWords = "The number of unique words in Shakespeare's texts is " + str(len(counts)) + "."
#    totalWords = "The total number of words in Shakespeare's texts is " +str(wordCount) + "."
#    print totalWords
#    print uniqueWords
#    for i in range(50):
#        word, count = items[i]
#        print("{0:<15}{1:>5}".format(word, count))

def identifyAuthor(filename):
    '''Prints whether the given text file was written by Shakespeare,
    Melville, or an unknown author.

    filename -- the name of the text file to classify, as a string.

    The file's top 50 words and their ratios (count / total words) are
    compared with the lists returned by melville() and shakespeare().
    An author's score is the sum, over the file's top words, of the
    absolute difference between the file's ratio and the author's ratio
    for that word; a word missing from the author's list contributes
    its full ratio.  The lower score is the closer match.

    An author is only reported when that author's name also appears as
    a word in the file; otherwise, or when the scores are equal, the
    author is reported as unknown.  Nothing is returned.

    Raises IOError if the file (or one of the author files) cannot be
    opened.'''
    with open(filename, 'r') as holder:
        text = holder.read().lower()
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':
        text = text.replace(ch, ' ')
    # Apostrophes are dropped (not spaced) so "don't" counts as "dont".
    words = text.replace("'", "").split()

    counts = {}
    for w in words:
        counts[w] = counts.get(w, 0) + 1
    total = float(len(words))

    # Most frequent first; ties are broken alphabetically.  Slicing
    # keeps a short text from raising IndexError.
    items = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    wordList = [[word, count / total] for word, count in items[:50]]

    def distance(authorWords):
        '''Scores how far the file's top words are from an author's.'''
        # One dictionary lookup per word replaces a scan of the whole
        # author list; a missing word counts as ratio 0.0, so it adds
        # the file's full ratio to the score.
        authorRatios = dict(authorWords)
        return sum(abs(ratio - authorRatios.get(word, 0.0))
                   for word, ratio in wordList)

    melvilleRatio = distance(melville())
    shakespeareRatio = distance(shakespeare())

    # counts holds every distinct word of the file, so membership here
    # is the same test as scanning the full word list.
    if melvilleRatio < shakespeareRatio and "melville" in counts:
        print("The author of " + str(filename) + " is Melville.")
    elif melvilleRatio > shakespeareRatio and "shakespeare" in counts:
        print("The author of " + str(filename) + " is Shakespeare.")
    else:
        print("The author of " + str(filename) + " is unknown.")