#!/usr/bin/env python
# encoding: utf-8
"""
Linguist 278: Programming for Linguists
Stanford Linguistics, Fall 2013
Christopher Potts
Assignment 1
Distributed 2013-09-24
Due 2013-10-01
NOTE: Please submit a modified version of this file, including
comments. Python should be able to run through your file without
errors.
"""
import math
import pprint
"""===================================================================
1 [1 point]
Complete the function mean for calculating the mean (average) of a
list of numeric values. Your function should take a list vals as its
argument and return a float. Keep in mind that vals might contain int
values.
In class, we wrote this with a for-loop. Consider doing it instead with
the built-in function sum, which facilitates a fast and readable
one-line version of the function.
"""
def mean(vals):
    """Return the mean of the values in vals as a float.

    vals is a list of numeric values (float, int, or long). The
    denominator is converted to float so that a list of ints is not
    subject to integer division. An empty list raises
    ZeroDivisionError, because the mean of no values is undefined.
    """
    #<--
    # NOTE(review): the #<-- / #--> pair looks like a solution marker
    # used to produce the student handout, so the placeholder pass below
    # is left in place -- confirm before removing either.
    return sum(vals) / float(len(vals))
    #-->
    # Delete pass and fill in your function.
    pass
"""===================================================================
2 [1 point]
Complete the function sd for calculating the standard deviation of a
list of numeric values. Your function should take a list vals as its
argument and return a float. Keep in mind that vals might contain int
values.
For details on calculating the standard deviation, see
http://en.wikipedia.org/wiki/Standard_deviation
I suggest using float(len(vals)-1) for the denominator, but
float(len(vals)) is fine.
To get the square root of a float x, use math.sqrt(x).
"""
def sd(vals, sd_corr=0):
    """Return the standard deviation of the values in vals as a float.

    vals is a list of numeric values (float, int, or long). sd_corr is
    subtracted from len(vals) in the denominator: the default 0 divides
    by len(vals), and 1 divides by len(vals) - 1. sd_corr must be
    smaller than len(vals). A list with fewer than two values
    (including the empty list) yields 0.0.
    """
    #<--
    # NOTE(review): the #<-- / #--> pair looks like a solution marker
    # used to produce the student handout, so the placeholder pass below
    # is left in place -- confirm before removing either.
    # Guard before calling mean: mean divides by len(vals), so reaching
    # it with an empty list would raise instead of returning 0.0.
    if len(vals) < 2:
        return 0.0
    mu = mean(vals)
    total = sum((x - mu) ** 2 for x in vals)
    return math.sqrt(total / float(len(vals) - sd_corr))
    #-->
    # Delete pass and fill in your function.
    pass
"""===================================================================
3 [1 point]
Complete the function zscore for computing the z-score (normal score)
of a list of numeric values. Your function should take a list vals as
its argument and return a list of z-score normed values. Use mean and sd,
as defined above, for this calculation.
For details on calculating the z-score, see
http://en.wikipedia.org/wiki/Z_score
"""
def zscore(vals, sd_corr=0):
    """Return the z-scored version of vals as a list of floats.

    Each value x is mapped to (x - mu) / sigma, where mu is mean(vals)
    and sigma is sd(vals, sd_corr=sd_corr). The result always has the
    same length as vals. When the values have no spread (fewer than two
    values, or a standard deviation of zero) every score is 0.0, so the
    function never divides by zero and never returns a bare float.
    """
    #<--
    # NOTE(review): the #<-- / #--> pair looks like a solution marker
    # used to produce the student handout, so the placeholder pass below
    # is left in place -- confirm before removing either.
    # Guard before calling mean and sd so that an empty list is handled
    # here rather than raising inside mean.
    if len(vals) < 2:
        return [0.0] * len(vals)
    mu = mean(vals)
    sigma = sd(vals, sd_corr=sd_corr)
    # All values equal: every deviation is zero, so report zeros rather
    # than dividing by a zero standard deviation.
    if sigma == 0:
        return [0.0] * len(vals)
    return [(x - mu) / sigma for x in vals]
    #-->
    # Delete pass and fill in your function
    pass
"""===================================================================
4 [2 points]
Complete the function palindrome_detector for identifying
palindromes (words that are the same forward and backwards). Your
function should be case-insensitive (e.g., 'Wow' should count as a
palindrome) and it should ignore spaces (e.g., 'race car' is a
palindrome).
To test your function, call palindrome_detector_test, which will work
with no modifications.
"""
def palindrome_detector(s):
    """Decide whether the str s reads the same in both directions.

    Letter case and space characters are disregarded; every other
    character (a hyphen, for instance) takes part in the comparison.
    Returns True for a palindrome and False otherwise.
    """
    #<--
    normalized = s.lower()
    normalized = normalized.replace(' ', '')
    # A slice with step -1 yields the characters in reverse order.
    return normalized[::-1] == normalized
    #-->
def palindrome_detector_test():
    """Simple unit test for palindrome_detector.

    Runs palindrome_detector on a fixed sample of (string, expected
    result) pairs and prints one line for every string whose result
    differs from the expected value. Nothing is printed when every case
    passes. Returns None.
    """
    sample = (
        ('deleveled', True),
        ('Malayalam', True),
        ('detartrated', True),
        ('a', True),
        ('repaper', True),
        ('Al lets Della call Ed Stella', True),
        ('Lisa Bonet ate no basil', True),
        ('Linguistics', False),
        ('Python', False),
        ('palindrome', False),
        ('an', False),
        ('re-paper', False)
    )
    for s, val in sample:
        # Compare directly instead of catching a failed assert: assert
        # statements are removed under python -O, which would make every
        # case pass silently.
        if palindrome_detector(s) != val:
            # A parenthesized single-argument print behaves the same as
            # a Python 2 statement and as a Python 3 function call.
            print('palindrome_detector in error for %s' % s)
"""===================================================================
5
Intro to comma-separated values. The string myspreadsheet stores 11
lines of comma-separated values, with the first line giving the column
names and the other lines giving the data on 10 imaginary subjects.
Below are some questions that ask you to write functions for working
with this data.
"""
# CSV text passed to csv_parser: a header row naming the columns
# (Subject, Height, Occupation) followed by ten subject rows.
myspreadsheet ="""Subject,Height,Occupation
1,74.37000326528938,Psychologist
2,67.49686206937491,Psychologist
3,74.92356434760966,Psychologist
4,64.62372198999978,Psychologist
5,67.76787900026083,Linguist
6,61.50397707923559,Psychologist
7,62.73680961908566,Psychologist
8,68.60803984763902,Linguist
9,70.16090500135535,Psychologist
10,76.81144438287173,Linguist"""
"""--------------------------------------------------
Random facts: The column of heights, presumably in
inches, was generated with
import random
and then repeated runs of
random.uniform(60,77)
The column of occupations was generated with
d = {0:'Psychologist',1:'Linguist'}
and then repeated runs of
d[random.randint(0,1)]
--------------------------------------------------"""
"""===================================================================
5.1 [3 points]
Basic CSV parser. Complete the following function for turning the str
myspreadsheet into a 10x3 matrix of data. I should emphasize that the
top line of myspreadsheet is not part of the data. It's just there to
help us out.
Column 0 of your data should be int values.
Column 1 of your data should be float values.
To test your parser, call csv_parser_test, which will work with
no modifications.
"""
def csv_parser(s):
    """Parse the CSV text s into a list of rows.

    The first line of s is treated as the header and discarded. Every
    remaining non-blank line is split on commas; field 0 (Subject) is
    converted to int and field 1 (Height) to float, while later fields
    stay str. The output is a list of lists, one per data line. Text
    with no data lines, including the empty string, gives []. A
    non-numeric value in field 0 or 1 raises ValueError, and a line with
    fewer than two fields raises IndexError.
    """
    # Data is our output. It will be a list of lists.
    data = []
    # Split csv into lines and store them in a list called 'lines'.
    #<--
    lines = s.splitlines()
    #-->
    # Remove the first element from lines, so that you have only the data lines left.
    #<--
    # Slicing, unlike lines.pop(0), does not raise on empty input.
    lines = lines[1:]
    #-->
    # At this stage, we loop through the list called lines.
    # As you loop
    # i. split each line on the commas;
    # ii. convert the Subject variable to int.
    # iii. convert the Height variable to float.
    # iv. add to data a list consisting of this line's Subject, Height, and Occupation values
    for line in lines:
        #<--
        # Skip blank lines (e.g. from trailing newlines) rather than
        # failing in int('').
        if not line.strip():
            continue
        fields = line.split(',')
        fields[0] = int(fields[0])
        fields[1] = float(fields[1])
        data.append(fields)
        #-->
    return data
def csv_parser_test():
    """Display the output of csv_parser(myspreadsheet) and
    test it a little bit.

    Pretty-prints the parsed data, then checks that every row has three
    fields, that data[4][2] is 'Linguist', that data[0][0] is an int,
    and that data[6][1] is a float. Each failed check prints a message,
    and the total number of failures is printed at the end. Returns
    None.
    """
    # The checks below are plain conditionals rather than caught
    # asserts: assert statements are removed under python -O, which
    # would make every check pass silently. Each print takes a single
    # parenthesized argument, so it behaves the same as a Python 2
    # statement and as a Python 3 function call.
    data = csv_parser(myspreadsheet)
    print('Your data object:')
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(data)
    err_count = 0
    # Did your parser work?
    for row_num, row in enumerate(data):
        if len(row) != 3:
            print("Row %s seems to be misparsed; its length is %s" % (row_num, len(row)))
            err_count += 1
    # Check on one of the values:
    if data[4][2] != 'Linguist':
        print("Error: data[4][2] should equal 'Linguist'; actual value is %s" % data[4][2])
        err_count += 1
    # Did you remember your int conversions?
    if not isinstance(data[0][0], int):
        print("Error: data[0][0] should be an int")
        err_count += 1
    # Did you remember your float conversions?
    if not isinstance(data[6][1], float):
        print("Error: data[6][1] should be a float")
        err_count += 1
    print("%s errors for csv_parser" % err_count)
"""===================================================================
5.2 [1 point]
Complete the following function for computing the mean height of the
subjects in this data set, using your mean function from above.
"""
def mean_height(data):
    """Compute the average of the column-1 entries of data.

    data is a list of rows, such as the output of
    csv_parser(myspreadsheet). The entry at index 1 of every row is
    collected and the collection is passed to mean, whose result is
    returned.
    """
    #<--
    # Pull column 1 out of every row, then hand the column to mean.
    return mean([record[1] for record in data])
    #-->
"""===================================================================
5.3 [1 point]
Occupation distribution. Complete the following function so that it
returns a dictionary mapping occupation names into the number of times
they occur in the data.
"""
def occupation_distribution(data):
    """Count how often each occupation occurs in column 2 of data.

    data is a list of rows, such as the output of
    csv_parser(myspreadsheet). Returns a dict mapping each distinct
    value found at index 2 of a row to the number of rows carrying it.
    An empty data list gives an empty dict.
    """
    #<--
    counts = {}
    for row in data:
        # get supplies 0 the first time an occupation is seen.
        counts[row[2]] = counts.get(row[2], 0) + 1
    return counts
    #-->
#<--
if __name__ == '__main__':
    # Run the two self-tests, then print the mean height and the
    # occupation counts for the sample spreadsheet.
    palindrome_detector_test()
    csv_parser_test()
    # Parse once and reuse the rows for both reports. Each print takes a
    # single parenthesized argument, so it behaves the same as a
    # Python 2 statement and as a Python 3 function call.
    data = csv_parser(myspreadsheet)
    print(mean_height(data))
    print(occupation_distribution(data))
#-->