#!/usr/local/bin/python
# encoding: utf-8

"""
Linguist 278: Programming for Linguists
Stanford Linguistics, Fall 2009
Christopher Potts

Assignment 3 - Python Lists and Dictionaries

Distributed 2009-10-05
Due 2009-10-11

NOTE: Please submit a modified version of this file, including
comments.  Python should be able to run through your file without
errors.
"""

from collections import defaultdict
import re
import types
import math
# from operator import itemgetter
# import matplotlib.mlab as mlab
# import matplotlib.pyplot as plt
# import matplotlib.numerix as nx

"""===================================================================
1. Intro to comma-separated values.  The string csv below stores 11
lines of comma-separated values, with the first line giving the column
names and the other lines giving the data on 10 imaginary
subjects. Below are some questions that ask you to write functions for
working with this data.
"""

# Sample data set: a header line naming the three columns, followed by one
# line per subject (10 subjects).  Fields are separated by commas with no
# surrounding whitespace, and the string has no trailing newline.
# NOTE: the name csv is also the name of a standard-library module; none of
# the imports visible at the top of this file bring that module into scope.
csv ="""Subject,Height,Occupation
1,70,Psychologist
2,62,Psychologist
3,61,Psychologist
4,67,Psychologist
5,70,Linguist
6,66,Psychologist
7,76,Psychologist
8,67,Linguist
9,73,Psychologist
10,63,Linguist"""

"""--------------------------------------------------
Fun fact: The column of heights, presumably in inches, was generated with

import random

and then repeated runs of

random.randint(60,77)

and the column of occupations was generated with

d = {0:'Psychologist',1:'Linguist'}

and then repeated runs of

d[random.randint(0,1)]
--------------------------------------------------"""

"""===================================================================
1.1 Parser. Complete the following function for turning the string csv
into a 10x3 matrix of data.  I should emphasize that the top line of
csv is not part of the data.  It's just there to help us out.
"""

def csv_parser(string):
    """Parse comma-separated subject data into a list of rows.

    The first line of string is treated as the header (the column
    names) and is not part of the output.  Every remaining non-blank
    line must have the form Subject,Height,Occupation.

    Returns a list of lists [subject, height, occupation], one per data
    line, in input order.  Subject and height are converted to int;
    occupation is a string with surrounding whitespace removed.  An
    empty string, or a string holding only the header, yields [].

    Raises ValueError if a data line does not have three fields, or if
    its Subject or Height value cannot be converted to int.
    """
    # Data is our output. It will be a list of lists.
    data = []
    # Split the input into lines; splitlines() handles \n and \r\n alike
    # and does not produce a spurious empty last line.
    lines = string.splitlines()
    # Remove the header line, so that only the data lines are left.
    lines = lines[1:]
    for line in lines:
        # Tolerate blank lines rather than failing on them.
        if not line.strip():
            continue
        # Split on the first two commas only, so the last field is kept whole.
        fields = line.split(',', 2)
        if len(fields) != 3:
            raise ValueError("Expected 3 comma-separated fields, got: %r" % line)
        subject, height, occupation = fields
        # int() accepts surrounding whitespace; it raises ValueError otherwise.
        data.append([int(subject), int(height), occupation.strip()])
    return data

# Check to make sure your csv_parser is working right. First, get the data:
# (data is a module-level name; the checks further down this file reuse it.)
data = csv_parser(csv)
# Row 4 is subject 5 (rows are 0-indexed, header excluded); column 2 is Occupation.
print data[4][2] == 'Linguist' # Should be True.
# Row 0 is subject 1; column 1 is Height.
print data[0][1] == 70         # Should be True.
# The parser must have converted Height from a string to an int.
print type(data[0][1]) == int  # Should be True.

    
"""===================================================================
1.2 Average height. Complete the following function for computing the
average height of the subjects in this data set.
"""

def average_height(data):
    """Return the average numerical value of column 1 in a list of lists.

    data is a list of rows; the element at index 1 of each row is that
    row's height.  The result is always a float.

    Raises ValueError if data is empty, since the average of no values
    is undefined.
    """
    if not data:
        raise ValueError("Cannot compute the average height of an empty data set")
    # Add up every row's height value.
    total = sum(line[1] for line in data)
    # The number of height values is the length of data.  float() forces
    # true division; under Python 2, int / int would truncate.
    return float(total) / len(data)

# Check it out: prints the label followed by the value returned by
# average_height(data).  The ten heights in the sample csv string sum to 675,
# so a correct implementation should report 67.5.
print "Average height", average_height(data)


"""===================================================================
1.3 Occupations. Complete the following function so that it returns
the set of occupations represented in data.
"""

def occupations(data):
    """Return the list of distinct occupations given in column 2 of data.

    data is a list of rows; the element at index 2 of each row is that
    row's occupation.  Each occupation appears exactly once in the
    result, however many rows mention it.  The order of the result is
    whatever order the dictionary yields its keys in, so callers should
    not rely on it.  Empty data yields an empty list.
    """
    occupations_dictionary = {}
    for line in data:
        # Dictionary keys are unique, so recording each occupation as a
        # key discards duplicates.  The stored value is irrelevant.
        occupations_dictionary[line[2]] = True
    # list() guarantees a real list under Python 3 as well, where keys()
    # returns a view.
    return list(occupations_dictionary.keys())

# Check it out: prints the label followed by the value returned by
# occupations(data).  The sample csv string mentions two occupations,
# 'Psychologist' and 'Linguist'.
print "Occupation list:", occupations(data)


"""===================================================================
1.4 Restricted average height. Complete the following function for
computing the average height of a specified subpopulation, so that

restricted_average_height(data, 'Linguist')

returns the average height of linguists, whereas

restricted_average_height(data, 'Psychologist')

returns the average height of psychologists, and

restricted_average_height(data)

just returns what average_height(data) returns.
"""

def restricted_average_height(data, occupation=None):
    """Return the average numerical value of column 1 in a list of
    lists, optionally using values in column 2 as restrictions.

    data is a list of rows whose index 1 holds a height and whose
    index 2 holds an occupation.

    If occupation is None (the default), every row is averaged, which
    is the value average_height(data) is specified to return.
    Otherwise only the rows whose occupation equals the given one are
    averaged.  The result is always a float.

    Raises Exception if the requested occupation does not occur in
    data, and ValueError if no occupation is given and data is empty.
    """
    if occupation is None:
        # No restriction: use the height of every row.
        heights = [line[1] for line in data]
        if not heights:
            raise ValueError("Cannot compute the average height of an empty data set")
    else:
        # Keep only the heights of people with the requested occupation.
        heights = [line[1] for line in data if line[2] == occupation]
        # No matching row means the occupation is unknown in data; we
        # want to throw an exception in that case rather than divide by
        # zero.
        if not heights:
            raise Exception("Occupation %s not present in the data" % (occupation))
    # float() forces true division; under Python 2, int / int would truncate.
    return float(sum(heights)) / len(heights)

# Run these checks:
print "Average height of linguists", restricted_average_height(data, occupation="Linguist")
print "Average height of linguists",  restricted_average_height(data, occupation="Psychologist")
try:    
    print "Average height of philosophers (should raise an exception) ...", restricted_average_height(data, occupation="Philosopher")
except:
    print "Exception thrown! Good news!"
print restricted_average_height(data) == average_height(data) # These should be the same.

"""===================================================================
2. Download a novel of your choosing from Project Gutenberg. Please
submit this file with your completed assignment.
"""

"""===================================================================
3. Use wc to check that it is at least 10,000 words long. To simplify
things, have wc report just the word count (not character counts,
etc.).  Put your command and output here:
"""

"""===================================================================
4. Open your novel in a text editor and study its composition. Is
there text that should be removed in order to get accurate counts?
Are there special formatting issues that need to be addressed?
Describe the issues, if any, and modify your text accordingly.

--------------------------------------------------
Advanced option for 4: leave the text alone, but define a
preprocessor that takes the filename as input and returns the 
appropriately modified string.  This function should handle *just*
formatting, not the counting and other computations required in other
parts of this assignment. A start:

def file_preprocessor(filename):
    # This opens the file, reads it to a string with read(), and stores that string in str.
    str = open(filename).read()
    # Preprocessing steps here.  You'll need to read up on Python
    # regular expressions (see the website for links).  You might
    # consider a multi-line regular expression that removes the
    # Gutenberg boilerplate at the start and end.
    return str
--------------------------------------------------
"""

"""===================================================================
5. Finish the definition of the counts(string) function below so that it
returns a dictionary mapping word types to their counts in your novel.

-- Basic version: no need to modify 'tokenize', but please print
   counts(contents) and describe some ways in which the results are
   not optimal given that we want a word-types --> counts mapping.

-- Advanced version: improve tokenize so that the output of counts is
  a respectable word-types --> counts mapping.
"""

def counts(string):
    """Return a dictionary mapping word-types to their counts in string.

    string is the text to analyse.  It is split into words by
    tokenize(); each distinct word it produces becomes a key, and the
    associated value is the number of times tokenize() produced it.

    The result is a defaultdict(int), so looking up a word that never
    occurred gives 0 (and adds that word to the dictionary).
    """
    c = defaultdict(int)
    for w in tokenize(string):
        # A missing key starts at 0, so no existence check is needed.
        c[w] += 1
    return c
    
def tokenize(string):
    """Chunk string into a list of lower-cased words, in text order.

    A word is a run of letters and digits, optionally continued by an
    apostrophe or a single hyphen followed by more letters or digits,
    so "don't" and "well-known" each stay one word.  Everything else
    (whitespace, punctuation, underscores, double hyphens used as
    dashes) only separates words and is dropped.

    Lower-casing makes 'The' and 'the' the same word type, and dropping
    punctuation makes 'dog' and 'dog.' the same word type; a plain
    whitespace split treats both pairs as different types.

    Returns [] for an empty string.
    """
    # [^\W_] is "a word character other than underscore", i.e. a letter
    # or a digit.
    return re.findall(r"[^\W_]+(?:['-][^\W_]+)*", string.lower())

# Usage:
# NOTE: filename is a placeholder.  While it is still the empty string, the
# open() call below raises an error, so set it before running this file.
filename = "" # Add your file's name here.
contents = open(filename).read() # Gets your file's content into a string.
# Prints the label followed by the dictionary returned by counts(contents).
print "Word counts", counts(contents)

"""===================================================================
6. Baayen (Word Frequency Distributions, 2001) defines

  V(m, T): the number of word-types with frequency m in the text T

For example, in the text 'The fox jumped over the dog', we have

  V(1, 'The fox jumped over the dog') = 4 ('fox','jumped','over','dog')
  V(2, 'The fox jumped over the dog') = 1 ('the')

This defines the 'frequency spectrum' for the text.  Fill out the
following function so that it outputs the frequency spectrum for your
novel. The input c should be the output of counts(str).
"""

def frequency_spectrum(c):
    """Return the frequency spectrum of a word-count dictionary.

    c maps each word to the number of times it occurs, as returned by
    counts().  The result fs satisfies

    fs[m] = the number of words occurring m times in the text, i.e.,
    the number of words w such that c[w] = m

    fs is a defaultdict(int), so fs[m] is 0 for any frequency m that no
    word has.  An empty c yields an empty spectrum.
    """
    fs = defaultdict(int)
    # Only the counts matter here, not which word has which count.
    for m in c.values():
        fs[m] += 1
    return fs

# Usage:
# Reads the file again and tabulates its word counts, then derives the
# frequency spectrum from those counts and prints it after a label.
c = counts(open(filename).read()) # Uses the filename set in the usage code for part 5 above.
fs = frequency_spectrum(c)
print "Frequency spectrum", fs

"""===================================================================
7 (OPTIONAL). The frequency spectrum highlights an important sense in
which texts are unusual statistical objects. The best way to see this
is to graph the output.  To do this, comment-in the lines

from operator import itemgetter
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import matplotlib.numerix as nx

at the top of this script, and then call the following plotting
function (which calls sort_dictionary) and check out the results.
Check the documentation for matplotlib to see if you can improve the
display by changing the x and y values, etc.
"""

def mean(a):
    """Return the arithmetic mean of the numbers in a, as a float.

    a must support both sum() and len().  An empty a raises
    ZeroDivisionError.
    """
    numerator = float(sum(a))
    denominator = float(len(a))
    return numerator / denominator

def sd(a):
    """Return the standard deviation of the numbers in a, as a float.

    This is the population form: the summed squared deviations from the
    mean are divided by len(a), not len(a) - 1, before taking the
    square root.  An empty a raises ZeroDivisionError.
    """
    # The mean, computed as float(sum) / float(length).
    total = float(sum(a))
    size = float(len(a))
    center = total / size
    # Accumulate the squared distance of every value from the mean.
    squared_error = 0.0
    for value in a:
        squared_error += (value - center) ** 2
    return math.sqrt(squared_error / size)

def plot_frequency_spectrum(fs, best_fit=False):
    """Plots a frequency spectrum as red dots and shows the figure.

    fs is handed to sort_dictionary(fs, 0); the first element of each
    resulting pair is used as the x value (axis label 'spectrum rank')
    and the second element as the y value (axis label 'count').

    If best_fit is true, a dashed blue line is added whose y values are
    mlab.normpdf evaluated at the counts, using the mean and sd of
    those counts, and the axes are fixed to [0,20,0,500].

    Calls sort_dictionary(), mean() and sd().  Requires the names plt,
    mlab and nx, i.e. the matplotlib imports that are commented out at
    the top of this file.

    NOTE(review): no axis scale is set in this function, so it does not
    itself request a log-scale x-axis.
    """
    pairs = sort_dictionary(fs, 0)
    ranks = [pair[0] for pair in pairs]
    values = [pair[1] for pair in pairs]
    plt.plot(ranks, values, 'ro')
    plt.xlabel('spectrum rank')
    plt.ylabel('count')
    if best_fit:
        center = mean(values)
        spread = sd(values)
        fitted = mlab.normpdf(nx.asarray(values), center, spread)
        plt.plot(ranks, fitted, 'b--', linewidth=1)
        plt.axis([0,20,0,500])
    plt.show()

def sort_dictionary(d, keys_or_values):
    """Return the items of dictionary d as a sorted list of pairs.

    d is the dictionary to sort.  keys_or_values selects the sort key:
    0 sorts by key, 1 sorts by value.

    Returns a new list of (key, value) tuples in ascending order; d
    itself is not modified.  For any other keys_or_values the string
    "Error" is returned instead of a list, so callers passing a
    computed selector should check for it.
    """
    # Imported locally so that this function works even while the
    # module-level "from operator import itemgetter" is commented out.
    from operator import itemgetter
    if keys_or_values not in (0, 1):
        return "Error"
    # Each item is a (key, value) tuple, so index 0 picks the key and
    # index 1 picks the value.
    return sorted(d.items(), key=itemgetter(keys_or_values))
