#!/usr/bin/env python

from collections import defaultdict


"""
Linguist 278: Programming for Linguists
Stanford Linguistics, Fall 2009
Christopher Potts

Assignment 4 - Miscellaneous data structures and techniques

Distributed 2009-10-19
Due 2009-10-25
"""

"""===================================================================
1. For assignment 3, we counted a lot of words.  The simple
tokenize(str) function I offered just split on single whitespace
characters.  Rewrite this function using regular expressions so that
it does at least the things listed in the function's documentation.
"""

def tokenize(string):
    """Accepts a string as input and chunks into words according
    to the following principles:
    
      * All words are lowercase.
      * Newline characters are removed.
      * Punctuation at word boundaries is removed.  This includes
        quotations marks, single and double, but apostrophes must
        remain in place an intact, so that, e.g., don't is a word.
      * HTML tags, i.e., things inside angled brackets, are deleted.
      * Hyphenated words like 'self-contained' are treated as single
        words, but en and em dashes (-- and ---) are treated as
        word boundaries even if there is no space around them.
      * No empty strings are included. (You might want to use
        re.split(regex, string) for this.

    The return value is a list of strings.
    """



"""===================================================================
2. For assignment 3, we created a counts(str) function that mapped
words to their token counts, and then we created another function
frequency_spectrum(defaultdict) that mapped counts to the number of
words that had those counts.  Here are my versions:
"""

def counts(string):
    """Accepts a string as input. Return a defaultdict mapping
    word-types to the number of times those words occur in str."""
    c = defaultdict(int)
    for w in tokenize(string):
    	c[w] += 1
    return c

def frequency_spectrum(c):
    """Takes the output of counts(str) as input. Returns the
    corresponding frequency spectrum fs, where fs[m] = the number of
    words occuring m times in the text, i.e., the number of words such
    that c[w] = m."""
    fs = defaultdict(int)
    for m in c.values():
        fs[m] += 1
    return fs

"""
Your task now is to write a string-creating function
fs2csv(defaultdict) that takes the output of
frequency_spectrum(defaultdict) as input and return a string in the
following format:

"Frequency","FrequencyOfFrequency"
c1,cc1
c2,cc2
c3,cc3
...

The idea is that you can now load this into R or Excel and plot it
easily to see the graceful Reverse-J of the natural language frequency
distribution.
"""


# Some testing:
filename = "/Volumes/CHRIS/Documents/teaching/2009-F/278/data/literature/austen-emma.txt"
contents = open(filename).read()
fs = frequency_spectrum(counts(contents))                   


"""===================================================================
3. Give a commands (or, a command) for writing the output of fs2csv to
a csv file.
"""



"""===================================================================
4.  Bring in your old palindrome(str) function, the one we designed in
class, and modify it so that the 15,1239 word palindrome Peter Norvig
created is in fact judged to be a palindrome:

http://norvig.com/pal1txt.html

Additional requirements:

* Copy out the palindrome part of the webpage and submit that as a
  file along with your assignment.  Provide functionality for reading
  from a file and running palindrome() on the contents of that file.
  Your interface can be at the command-line or in this script (or
  both) --- your call.

  AND/OR

  provide a function that open's Norvig's page, sucks out the
  palidrome string using regex techniques, and runs palindrome
  on that.
  
* Use the error-checking methods (the __debug__ statements) to run a
  test on your function against this string.

NOTE: Be sure to look carefully at Norvig's HTML.  It contains some
gotchas!

If this is enjoyable to you, check out this page for more crazily long
palindromes to test your tester against:

http://norvig.com/palindrome.html
"""


"""===================================================================
5. The array whitespacers has a bunch of elements in it, some of which
have unfortunate whitespace at their edges.  Supply a command that
will map whitespacers to a new list whitespace_free in which this
surrounding whitespace has been removed. Medial whitespace should
remain untouched.

NO FOR OR WHILE LOOPS ALLOWED!
"""

whitespacers = ["Nevertheless  ", "an", " important ",  "property", " of  ", "these", "three",  "distinctive feature", "theories"]


# Testing.
print whitespace_free


"""===================================================================
6. The library dateutil.parser provides functions for heuristically
parsing date strings into different formats. Check out its documentation:

http://labix.org/python-dateutil#head-c0e81a473b647dfa787dc11e8c69557ec2c3ecd2

For many purposes, it is useful to have dates in the numerical format

YYYYMMDD

Dates in this format can be sorted with simple numerical comparison
methods.  The goal of this question is to use dateutil.parser.parse()
to do that.  Your task: Map the list raw_dates to one that is
equivalent to the list numerical_dates.

NO FOR OR WHILE LOOPS ALLOWED!
"""

raw_dates = ["October 19, 2009", "Aug 5, 1977", "1 Sep 2009", "1 Feb 1957"]

numerical_dates = [20091019, 19770805, 20090901, 19570201]

