NLP

nltk notes

text1.concordance('monstrous')
to show every occurrence of a word in a text, together with its surrounding context
text1.similar('monstrous')
to find the words with similar context
text2.common_contexts(["monstrous", 'very'])
to find the common context of a list of words
text4.dispersion_plot(['citizens', 'democracy', 'freedom', 'duties', 'American'])
lexical dispersion plot
fdist1 = FreqDist(text1)
frequency distribution
fdist1.plot(50, cumulative=True)
generate a cumulative frequency plot
fdist1.N()
total number of samples
fdist1.tabulate()
tabulate the frequency of the distribution
bigrams('more is said than done'.split())
get the bigrams (pairs of adjacent words) of a list of words
text4.collocations()
In corpus linguistics, collocation defines a sequence of words or terms that co-occur more often than would be expected by chance
ConditionalFreqDist
# Genres (conditions) and modal verbs (samples) to show in the table.
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']

# Feed (genre, word) pairs from the Brown corpus into a conditional
# frequency distribution: one word-count distribution per genre.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

# Print the counts restricted to the chosen genres and modal verbs.
cfd.tabulate(conditions=genres, samples=modals)
                 can could  may might must will
           news   93   86   66   38   50  389
       religion   82   59   78   12   54   71
        hobbies  268   58  131   22   83  264
science_fiction   16   49    4   12    8   16
        romance   74  193   11   51   45   43
          humor   16   30    8    8    9   13
gender classification
>>> import random
>>> from nltk.corpus import names
>>> names = ([(name, 'male') for name in names.words('male.txt')]+[(name, 'female') for name in names.words('female.txt')])
>>> def gender_features(word):
...     return {'last_letter': word[-1]}
...
>>> featuresets = [(gender_features(n), g) for (n, g) in names]
>>> train_set, test_set = featuresets[500:], featuresets[:500]
>>> import nltk
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> classifier.classify(gender_features('Neo'))
'male'
>>> classifier.classify(gender_features('Neadfasdfo'))
'male'
>>> classifier.classify(gender_features('Trinity'))
'female'
>>> print nltk.classify.accuracy(classifier, test_set)
0.602
>>> classifier.show_most_informative_features(5)
Most Informative Features
             last_letter = 'a'            female : male   =     35.5 : 1.0
             last_letter = 'k'              male : female =     34.1 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     13.5 : 1.0
             last_letter = 'v'              male : female =     12.7 : 1.0
>>> from nltk.classify import apply_features
>>> train_set = apply_features(gender_features, names[500:])
>>> test_set = apply_features(gender_features, names[:500])
document classification
>>> from nltk.corpus import movie_reviews
>>> documents = [(list(movie_reviews.words(fileid)), category)
...              for category in movie_reviews.categories()
...              for fileid in movie_reviews.fileids(category)]
>>> random.shuffle(documents)
# Frequency distribution over every lower-cased token in the movie_reviews
# corpus.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())

# Use the first 2000 keys of the distribution as the feature vocabulary.
# NOTE(review): slicing keys() assumes it returns a list ordered by decreasing
# frequency (older NLTK on Python 2) -- confirm for the installed version; on
# NLTK 3 / Python 3 this would need something like all_words.most_common(2000).
word_features = all_words.keys()[:2000]

def document_features(document):
    """Build the word-presence feature dict for a single document.

    For each entry of the module-level ``word_features`` the result contains a
    ``'contains(<word>)'`` key whose value is True when that word occurs in
    ``document`` and False otherwise.

    document -- an iterable of (hashable) tokens.
    Returns a dict mapping feature names to booleans.
    """
    # Deduplicate once so every membership test below is a set lookup.
    present = set(document)
    return dict(
        ('contains(%s)' % candidate, candidate in present)
        for candidate in word_features
    )

# Pair each document's feature dict with its category label.
featuresets = [(document_features(d), c) for (d, c) in documents]

# Hold out the first 100 labelled examples for testing; train on the rest.
test_set = featuresets[:100]
train_set = featuresets[100:]

# Fit a Naive Bayes classifier on the training portion.
classifier = nltk.NaiveBayesClassifier.train(train_set)

>>> print nltk.classify.accuracy(classifier, test_set)
0.81
>>> classifier.show_most_informative_features(5)
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
        contains(seagal) = True              neg : pos    =      7.7 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
         contains(damon) = True              pos : neg    =      5.9 : 1.0
        contains(wasted) = True              neg : pos    =      5.8 : 1.0