#!/usr/bin/env python

from collections import defaultdict
import re

"""
These classes are meant to be used with the files 

http://www.stanford.edu/class/linguist278/restricted/data/107-most-beautiful-words.txt
http://www.stanford.edu/class/linguist278/restricted/data/107-random-words.txt

which each consist of 107 words as follows:

orthography  CMU dictionary phonology  celex parse  Google n-gram frequency

where the fields are separated by two spaces.

Created: Linguist 278, 2009-10-21
Last update: 2009-10-25
"""
######################################################################

class WordList:
    """Collection of Word objects built from the text of a word file.

    The constructor takes the file contents as one string and creates a
    Word from every data line.  Lines beginning with ";;;" are treated as
    comments and skipped, as are blank (empty or whitespace-only) lines.
    The resulting Word objects are stored, in file order, in ``words``.
    """

    def __init__(self, string):
        """Parse ``string`` (the whole file contents) into Word objects.

        Any ValueError raised by Word for a malformed data line is
        propagated to the caller.
        """
        self.words = [
            Word(line)
            for line in string.splitlines()
            # Blank lines carry no fields and would make Word fail on
            # unpacking, so they are dropped along with ";;;" comments.
            if line.strip() and not line.startswith(";;;")
        ]

    def __len__(self):
        """Return the number of words in the list."""
        return len(self.words)

######################################################################

class Word:
    """One parsed line of a word file.

    The line is split on two-space separators into exactly four fields,
    stored as the attributes ``orthography``, ``phonology``,
    ``morphology`` and ``frequency`` (the last converted to ``int``).
    The unmodified line is kept in ``string``.  ``len(word)`` is the
    number of phonemes, not the number of letters.
    """

    # Matches the final square-bracketed tag of the morphology field.  The
    # tag body excludes brackets: with a permissive body such as ".+?" the
    # leftmost "[" would win and the match would run across every
    # bracketed group up to the end of the field.
    _POS_RE = re.compile(r"\[([^\[\]]+)\]\s*$")

    def __init__(self, string):
        """Parse ``string``, a single line of four two-space-separated fields.

        Raises ValueError if the line does not contain exactly four
        fields or if the frequency field is not an integer.
        """
        self.string = string
        fields = string.split("  ")
        if len(fields) != 4:
            raise ValueError(
                "expected 4 fields separated by two spaces, got %d: %r"
                % (len(fields), string))
        self.orthography, self.phonology, self.morphology = fields[:3]
        self.frequency = int(fields[3])

    def phonemes(self):
        """Return the phonology field as a tuple of space-separated symbols."""
        return tuple(self.phonology.split(" "))

    def phoneme_dict(self):
        """Return a defaultdict(int) mapping each phoneme to its count."""
        counts = defaultdict(int)
        for phoneme in self.phonemes():
            counts[phoneme] += 1
        return counts

    def is_longer_than(self, otherword):
        """Return True if this word has strictly more phonemes than
        ``otherword`` (compared via ``len``), else False."""
        return len(otherword) < len(self)

    def phoneme_set(self):
        """Return the set of distinct phonemes in the word."""
        return set(self.phonemes())

    def orthography_len(self):
        """Return the number of characters in the orthography field."""
        return len(self.orthography)

    def pos(self):
        """Return the contents of the last bracketed tag in the morphology.

        For a morphology ending in "...[A]" this returns "A".  Raises
        AttributeError if the field does not end with a bracketed tag
        (optionally followed by whitespace).
        """
        return self._POS_RE.search(self.morphology).group(1)

    def __len__(self):
        """Return the number of phonemes."""
        return len(self.phonemes())

    def __str__(self):
        """Return the original, unparsed line."""
        return self.string

######################################################################

