#!/usr/bin/env python

"""
Linguist 278: Programming for Linguists
Stanford Linguistics, Fall 2009
Christopher Potts

Assignment 4 - Python regular expressions

Distributed 2009-10-12
Due 2009-10-18
"""

import urllib2
import re
import sys
import os
from collections import defaultdict


"""===================================================================
1. Defining a counter.  Example numbering in linguistics is a constant
hassle.  LaTeX has a nice solution to this.  Basically, you mark
things that you want labeled with \label{value}, where value is some
arbitrary string, and then you reference these labeled items with
\ref{value}.  The \ref{value} things can appear anywhere in the text
--- before or after the \label{} things, which can also appear
anywhere in the text.  In essence, the numbering is accomplished in
two steps:

  1. Run through the document, creating a dictionary mapping labels to
     numbers, where the numbering is determined by the order that the
     \label{} strings appear in the document. If \label{x} is the first
     label, map x to 1.  If \label{y} comes right after it, map y to 2,
     and so forth.

  2. Run through the document a second time substituting \ref{} and
     \label{} values for the appropriate numbers according to the
     dictionary you created in step 1.

It's often useful to be able to do example numbering in plain-text
files, so let's write a function that will do it for us.
"""     

"""-------------------------------------------------------------------
1.1
Complete the function numberer() below in such a way that

 numberer(test_string) == test_string_resolved

where test_string is defined as follows:
"""

# Sample input for numberer(): plain text containing LaTeX-style
# \label{} and \ref{} markers.  It is written as a raw string so that
# the backslashes are kept literally (in a normal string literal the
# "\r" of "\ref" would be read as a carriage return).
test_string = r"""In example \ref{ungramm}, we see that prepositions must come before
their head nouns.  Compare this with \ref{gramm}.

  \label{ungramm} The dog is the porch on.
  \label{gramm} The dog is on the porch.

But of course what can we *really* conclude from the contrast between
\ref{ungramm} and \ref{gramm}?  After all, there are pairs like \ref{PNP} and \ref{NPP}.

  \label{PNP} Notwithstanding your problem.
  \label{NPP} Your problem notwithstanding.

Are we actually looking at prepositions in \ref{PNP} and \ref{NPP}? This is the
question I address ..."""

def numberer(string):
    r"""Replace \label{x} and \ref{x} markers in string with example numbers.

    Numbering works in two passes:

      1. Every \label{x} is collected in document order and mapped to a
         counter value, starting at 1.  A label that occurs more than
         once keeps the number of its first occurrence.

      2. Every \label{x} and \ref{x} is replaced by the number assigned
         to x.  References may appear before or after their label.

    A \ref{} whose label is never defined is left in the text unchanged,
    so that the dangling reference stays visible to the author.

    Returns the resolved string; the input string is not modified.
    """
    # Pass 1: create a mapping from labels to their counter numbers.
    label_re = re.compile(r"\\label\{([^{}]+)\}")
    label_dict = {}  # Counter dictionary.
    c = 1  # Value for label_dict; incremented after each new label.
    for label in label_re.findall(string):
        if label not in label_dict:
            label_dict[label] = c
            c += 1

    # Pass 2: substitute numbers for the markers.  A single sub() call
    # with a callback is used (rather than one replacement per label) so
    # that each marker is looked up by its complete label name.
    marker_re = re.compile(r"\\(?:label|ref)\{([^{}]+)\}")

    def resolve(match):
        label = match.group(1)
        if label in label_dict:
            return str(label_dict[label])
        # Unknown label: keep the original marker text.
        return match.group(0)

    return marker_re.sub(resolve, string)

# Testing.  This is the output expected from numberer(test_string):
# every \label{} and \ref{} marker is replaced by the 1-based position
# of its label among the \label{} markers of test_string.
test_string_resolved = """In example 1, we see that prepositions must come before
their head nouns.  Compare this with 2.

  1 The dog is the porch on.
  2 The dog is on the porch.

But of course what can we *really* conclude from the contrast between
1 and 2?  After all, there are pairs like 3 and 4.

  3 Notwithstanding your problem.
  4 Your problem notwithstanding.

Are we actually looking at prepositions in 3 and 4? This is the
question I address ..."""

# Self-check for numberer().  It is active unless Python is started with
# -O (which sets __debug__ to False) and raises an exception as soon as
# numberer() fails to turn test_string into test_string_resolved.
if __debug__ and numberer(test_string) != test_string_resolved:
    raise AssertionError("numberer() isn't behaving in the desired way.")

"""-------------------------------------------------------------------
1.2

It would be useful to be able to call numberer() from the command-line
on a filename argument.  Python makes it easy to do this. Just
uncomment the following code and complete it.  Running this file as a
script makes the conditional statement true, so code in its scope will
execute. (Note: while this code is uncommented, your script will run
only if it is given at least one argument on the command line. So
you'll probably want to comment it out again after getting it working.)
"""

# if __name__ == '__main__':
#     filename = sys.argv[1]
    

"""===================================================================
2. Parsing structured data. For this series of subproblems, you
download a webpage and extract some information from it, relying on
the rich structure of its HTML markup. The goal is functionality for
quickly seeing what the plan is for a given day.

For those of you who don't know HTML, here are some notes:

* Line breaks are not meaningful, so it's best to avoid relying on
  them, and you should be prepared to encounter them anywhere.

* <tr> ... </tr> picks out a table row.

* <td> ... </td> picks out a table cell.

* HTML tags can have attributes.  For example <td class='day'> is a
table cell of the class 'day'.  CSS (Cascading Style Sheets) allows
the designer to style such cells specifically, perhaps by making them
a particular color or by putting text in them in a particular font.
(You'll shortly download our course's homepage.  Here's its CSS:
  http://www.stanford.edu/class/linguist278/css/styles.css
)
"""

"""-------------------------------------------------------------------
2.1. What does page_download do if no filename argument is specified?
Add in your description as documentation for this function.  Also add
any additional comments or documentation that you feel would be
useful.
"""

def page_download(link, output_filename=""):
    """Download the page at link and save its contents to a local file.

    If output_filename is not given (or is the empty string), the local
    file is named after the last path component of link, as computed by
    os.path.basename().  For example, a link ending in
    "/linguist278/index.html" is saved as "index.html" in the current
    working directory.

    If the download fails with a urllib2.URLError, the reason is printed
    and no file is written.  The function always returns None.
    """
    if output_filename == "":
        output_filename = os.path.basename(link)
    try:
        response = urllib2.urlopen(link)
        try:
            contents = response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()
    except urllib2.URLError as e:
        print(e.reason)
        # Nothing was downloaded, so there is nothing to write.
        return
    # Binary mode stores the bytes exactly as they were received, with
    # no platform-specific newline translation.
    with open(output_filename, "wb") as output:
        output.write(contents)

"""-------------------------------------------------------------------
2.2. Download the course's homepage (the value of the coursehome
string variable below), but name your local copy 'ling278home.html'.
Include your commands for doing this.

"""

# URL of the course homepage (the page to download in exercise 2.2).
coursehome = "http://www.stanford.edu/class/linguist278/index.html"

"""-------------------------------------------------------------------
2.3 The function get_table() takes a string and extracts all the
tables in it that are of the class 'schedule', returning these
substrings as a list of strings. Call this function on the homepage
file you created and inspect the results by printing them to the
screen.  At this point, you'll want to try to get a feel for the
structure of this string, so stare at it a while, and perhaps work
on getting it to print intuitively to your screen.
"""

# Compiled once at import time.  The pattern is prepared for either kind
# of quotation mark around the class name, and it keeps whitespace and
# newlines around the table contents out of the captured group.
# re.DOTALL lets "." match newlines, so a table may span several lines.
_SCHEDULE_TABLE_RE = re.compile(
    r"""<table class=["']schedule["']>\s*(.+?)\s*</table>""",
    re.DOTALL)


def get_table(string):
    """Accepts a string as input and returns a list of schedule-table
    substrings.

    Each element is the text between a <table class="schedule"> opening
    tag and the next </table>, with leading and trailing whitespace
    removed.  The opening tag must have exactly this form (no other
    attributes).  An empty list is returned if no such table is found.
    """
    return _SCHEDULE_TABLE_RE.findall(string)

# Insert code for calling get_table() and printing its output to the
# screen.  Note: there is only one schedule table on our page, so you
# might want to view and deal with just the first member of the
# (singleton) list.


"""
2.4-------------------------------------------------------------------
Our goal is to extract the scheduling information from the HTML
table. The first step is to create a list of dictionaries (one per
table row) containing the information.  Complete the function
parse_schedule() below, and then call the function and print its
output to the screen for inspection.
"""

def parse_schedule(string):
    """Parse the schedule table(s) in an HTML string into a list of
    dictionaries, one per table row.

    Each dictionary maps the class attribute of a cell to the contents
    of that cell (with surrounding whitespace removed).  For example,
    the row

      <tr>
      <th class="day">Sep&nbsp;21</th>
      <td class="plan">Overview</td>
      <td class="assignment" rowspan="2">1. Command-line basics</td>
      </tr>

    becomes

      {"day": "Sep&nbsp;21",
       "plan": "Overview",
       "assignment": "1. Command-line basics"}

    Cells without a class attribute are skipped, and rows that yield no
    classed cells are left out of the result.  Markup inside a cell
    (such as links) is kept as-is.  Nested tables are not supported.
    """
    # This is the output data structure.  It will be a list of dictionaries.
    schedule = []

    # Rows are delimited by <tr> ... </tr>; re.DOTALL is needed because
    # their contents usually contain line breaks.
    row_re = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.DOTALL)
    # Cells are <td ...> ... </td> or <th ...> ... </th>.  The closing
    # tag must match the opening one (\1).  \b keeps "<th" from matching
    # longer tag names such as "<thead".
    cell_re = re.compile(r"<(t[dh])\b([^>]*)>(.*?)</\1>", re.DOTALL)
    # The class may sit anywhere among the attributes, in either kind of
    # quotation mark.
    class_re = re.compile(r"""\bclass\s*=\s*(["'])(.*?)\1""")

    # Extract the schedule table(s) from the input string.
    for table in get_table(string):
        for row in row_re.findall(table):
            cells = {}
            for _tag, attributes, contents in cell_re.findall(row):
                class_match = class_re.search(attributes)
                if class_match:
                    cells[class_match.group(2)] = contents.strip()
            if cells:
                schedule.append(cells)
    return schedule


"""-------------------------------------------------------------------
2.5 What's happening in Linguist 278? Define a function
schedule_by_day() with the following behavior:

* It takes two arguments: a date string and our homepage link. It
processes the homepage contents so that we have the schedule, as
defined by parse_schedule().

* If you feed it a day in the form 'Sep 21', 'Oct 4', etc., it
returns the dictionary associated with that day.

* It tolerates capitalization variation in the month, and it will
allow you to write 'Oct 04', i.e., you can specify your dates as
two digits.

* If you feed it a day that isn't in the schedule (or one that is
formatted wrong), it gives you a helpful message.

Please feel free to improve on this basic design. For example, you
could have it distinguish ill-formed dates from dates that aren't in
the schedule.  You could allow a wider range of date formats. And so
forth.
"""


# Here are a few tests.  They are active unless Python is started with
# -O, and they need schedule_by_day() (exercise 2.5) to be defined
# before this point.
# NOTE(review): the exercise text describes schedule_by_day() as taking
# a date string and the homepage link, but these calls pass only the
# date -- presumably the link parameter is meant to have a default;
# confirm.
if __debug__:
    _assignment_4 = '<a href="assignments/04_regexs.py">4. Python regular expressions</a> [due Oct 18]'
    for _day, _field, _expected in (
            ("Sep 21", "plan", 'Overview and goals; the  shell: navigation, pipes, filters'),
            ("Oct 12", "assignment", _assignment_4),
            # Capitalization of the month must not matter.
            ("oCt 12", "assignment", _assignment_4),
    ):
        # Look the value up once and reuse it in the error message.
        _actual = schedule_by_day(_day)[_field]
        if _actual != _expected:
            raise AssertionError(
                "for argument %r, %r, current output is: %s"
                % (_day, _field, _actual))

"""===================================================================
3. What do you need to do?

3.1 Please briefly describe a project you'd like to undertake involving
Python programming. This can be as ambitious as you like, but I think
this task will be most rewarding if you define a project that is just
outside your current skill set and that can be done without a lot of
preparatory work.

3.2 What can you currently achieve?  Describe this, or perhaps start
writing some code.

3.3 What technical details seem to be preventing you from completing
the project?

It's my hope that I can nudge you in the right direction with my reply
to your write-up here, and that you will in turn undertake your
project and bring it to fruition.
"""
    
    
