In [83]:
import pprint
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import heapq
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import cross_validation
In [36]:
def get_max_score(doc, pipe, n=10):
    """Print the n highest-weighted features of `doc` under the fitted pipeline `pipe`."""
    docmat = pipe.transform([doc])
    for (score, wid) in heapq.nlargest(n, zip(docmat.data, docmat.indices)):
        print("%s %s" % (pipe.steps[0][1].get_feature_names()[wid], score))

The 20 newsgroups dataset

In [33]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train')
twenty_test = fetch_20newsgroups(subset='test')
In [93]:
docid = 1
doc = twenty_train.data[docid]
cat = twenty_train.target[docid]
print(30*"*"+" Document:")
print("\n".join(doc.split("\n")))
print(30*"*"+" Label for this document:")
print(twenty_train.target_names[cat])
****************************** Document:
From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>

****************************** Label for this document:
comp.sys.mac.hardware

In [24]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print("(number of documents, number of features) = (%s, %s) " % X_train_counts.shape)
(number of documents, number of features) = (11314, 130107) 

In [25]:
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print("(number of documents, number of features) = (%s, %s) " % X_train_counts.shape)
(number of documents, number of features) = (11314, 130107) 

Bag of words

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(twenty_train.data)
document_matrix = vect.transform(["Summer school in computer science"])
print(document_matrix)
  (0, 41614)	1
  (0, 66608)	1
  (0, 105181)	1
  (0, 105252)	1
  (0, 111697)	1

In [46]:
vect.get_feature_names()[105181]
Out[46]:
'school'
In [43]:
from sklearn.pipeline import make_pipeline
vect = make_pipeline(CountVectorizer())
vect.fit(twenty_train.data)
get_max_score(doc, vect)
clock 5
washington 4
the 4
si 4
poll 3
of 3
edu 3
and 3
with 2
upgrade 2

In [47]:
CountVectorizer()
Out[47]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
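In the raw-count ranking above, function words like "the", "of" and "and" score high, and the default CountVectorizer keeps them (stop_words=None). A minimal sketch, not part of the original notebook, of the same top-terms check with English stop words removed (the name vect_nostop is made up):

vect_nostop = make_pipeline(CountVectorizer(stop_words='english'))
vect_nostop.fit(twenty_train.data)
get_max_score(doc, vect_nostop)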

Tfidf

In [9]:
text_tfidf = make_pipeline(CountVectorizer(), TfidfTransformer())
text_tfidf.fit_transform(twenty_train.data)
get_max_score(doc, text_tfidf)
clock 0.359502780329
si 0.341838991844
poll 0.290731950786
washington 0.2409454883
guykuo 0.228664180923
kuo 0.21038955503
carson 0.178128395616
experiences 0.161989154189
upgrade 0.156077323173
1qvfo9innc3s 0.137243750249

In [19]:
X_train_counts = vect.fit_transform(twenty_train.data)
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
Out[19]:
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
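The fitted transformer exposes its per-term weights as idf_. A hedged consistency check, not in the original notebook: with smooth_idf=True, sklearn computes idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1, where df(t) is the number of documents containing term t.

import numpy as np
n_docs, n_feats = X_train_counts.shape
df = np.bincount(X_train_counts.indices, minlength=n_feats)  # document frequency of each term
manual_idf = np.log((1 + n_docs) / (1 + df)) + 1
print(np.allclose(manual_idf, tfidf_transformer.idf_))       # expected: True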
In [11]:
vect
Out[11]:
Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None))])
In [17]:
vect.steps[0][1].get_feature_names()[27366]
Out[17]:
'algorithm'
In [16]:
vect.steps[0][1].vocabulary_.get('algorithm')
Out[16]:
27366
In [50]:
text_tfidf = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ])
In [51]:
text_tfidf.fit_transform(twenty_train.data, twenty_train.target)
Out[51]:
<11314x130107 sparse matrix of type '<class 'numpy.float64'>'
	with 1787565 stored elements in Compressed Sparse Row format>
In [52]:
print(text_tfidf.transform(["Summer school in computer science"]))
  (0, 111697)	0.647749683577
  (0, 105252)	0.419897493869
  (0, 105181)	0.505725350036
  (0, 66608)	0.140224558136
  (0, 41614)	0.358727453221

In [54]:
get_max_score("Summer school in computer science", text_tfidf)
summer 0.647749683577
school 0.505725350036
science 0.419897493869
computer 0.358727453221
in 0.140224558136
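The ordering mirrors the idf weights: "in" appears in most posts, so its idf is small, while "summer" is rare. A short hedged look at the two idf values (not from the original notebook):

cv = text_tfidf.steps[0][1]   # the CountVectorizer step
tt = text_tfidf.steps[1][1]   # the TfidfTransformer step
for w in ('in', 'summer'):
    print(w, tt.idf_[cv.vocabulary_[w]])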

In [53]:
doc = "Summer school in computer science"
print(text_tfidf.transform([doc])*text_tfidf.transform([doc]).T)
  (0, 0)	1.0
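The product is 1.0 because each tf-idf row is l2-normalized (norm='l2'). The same trick gives the cosine similarity between two different documents; a minimal sketch, with two made-up strings for illustration:

a = text_tfidf.transform(["summer school"])
b = text_tfidf.transform(["computer science school"])
print((a * b.T).toarray()[0, 0])   # cosine similarity of the two tf-idf vectors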

Naive Bayes

In [55]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from math import log, exp
In [56]:
train_documents = ["magic algorithm",
                   "meeting tomorrow",
                   "summer school",
                   "viagra magic viagra",
                   "viagra tomorrow",
                   "other"]
train_targets = ["ham", "ham", "ham", "spam", "spam", "other"]
test_documents = ["tomorrow best viagra tomorrow"]
test_targets = ["spam"]
In [57]:
vect_multinomialNB = make_pipeline(CountVectorizer(), MultinomialNB())
vect_multinomialNB.fit(train_documents, train_targets)
Out[57]:
Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
In [58]:
vect = vect_multinomialNB.steps[0][1]
classifier = vect_multinomialNB.steps[1][1]
In [35]:
print(sorted(list(vect.vocabulary_.keys())))
['algorithm', 'magic', 'meeting', 'other', 'school', 'summer', 'tomorrow', 'viagra']

In [94]:
len(sorted(list(vect.vocabulary_.keys())))
Out[94]:
8
In [59]:
print(classifier.classes_)
['ham' 'other' 'spam']

In [60]:
classifier.feature_count_
Out[60]:
array([[ 1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  1.,  3.]])
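Rows of feature_count_ follow classifier.classes_ and columns follow the vocabulary order. A small hedged sketch, not in the original notebook, that prints the counts next to their words for readability:

words = sorted(vect.vocabulary_, key=vect.vocabulary_.get)   # features in column order
for cls, counts in zip(classifier.classes_, classifier.feature_count_):
    print(cls, dict(zip(words, counts.astype(int))))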
In [61]:
classifier.class_count_
Out[61]:
array([ 3.,  1.,  2.])
In [62]:
classifier.class_log_prior_
Out[62]:
array([-0.69314718, -1.79175947, -1.09861229])
In [77]:
# log(P(spam)) = log(number of spam documents / number of documents)
log(2/6)
Out[77]:
-1.0986122886681098
In [78]:
# smoothing parameter alpha = 1
alpha = 1
# number of features = size of the vocabulary
V = len(vect.vocabulary_)
# log(P(w='tomorrow'|class='spam'))
#   = log((count of 'tomorrow' in spam documents + alpha) / (total word count in spam documents + alpha * V))
log((1+alpha)/(5+alpha*V))
Out[78]:
-1.8718021769015913
In [76]:
classifier.classes_[2]
Out[76]:
'spam'
In [79]:
classifier.feature_log_prob_[2,vect.vocabulary_.get('tomorrow')]
Out[79]:
-1.8718021769015913
In [44]:
test_documents
Out[44]:
['tomorrow best viagra tomorrow']
In [43]:
vect.transform(test_documents).todense()
Out[43]:
matrix([[0, 0, 0, 0, 0, 0, 2, 1]])
In [47]:
# joint log likelihood: log(P(spam)) + sum_i x_i * log(P(w_i|spam))
# (this is log(P(spam|doc)) up to the normalizing constant)
log(2/6) + 2 * log((1+alpha)/(5+alpha*V)) + 1 * log((3+alpha)/(5+alpha*V))
Out[47]:
-6.020871638812939
In [45]:
classifier._joint_log_likelihood(vect.transform(test_documents))
Out[45]:
array([[-7.22402481, -8.3834332 , -6.02087164]])
In [95]:
vect_multinomialNB.predict(test_documents)
Out[95]:
array(['spam'], 
      dtype='<U5')
In [48]:
exp(-6.02087164) / (exp(-7.22402481) + exp(-8.3834332) + exp( -6.02087164))
Out[48]:
0.7171416485661984
In [49]:
vect_multinomialNB.predict_proba(test_documents)
Out[49]:
array([[ 0.21531891,  0.06753945,  0.71714165]])

Naive Bayes on 20 newsgroups

In [51]:
vect_multinomialNB = make_pipeline(CountVectorizer(), MultinomialNB())
vect_multinomialNB.fit(twenty_train.data, twenty_train.target)
Out[51]:
Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
In [60]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
[twenty_train.target_names[k] for k in vect_multinomialNB.predict(docs_new)]
Out[60]:
['soc.religion.christian', 'comp.graphics']
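A hedged peek, not in the original notebook, at how confident the model is about these two sentences:

probs = vect_multinomialNB.predict_proba(docs_new)
for text, p in zip(docs_new, probs):
    print(text, '->', twenty_train.target_names[p.argmax()], p.max())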
In [64]:
np.mean(vect_multinomialNB.predict(twenty_test.data) == twenty_test.target)
Out[64]:
0.77283590015932024
In [73]:
tfidf_multinomialNB = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
tfidf_multinomialNB.fit(twenty_train.data, twenty_train.target)
np.mean(tfidf_multinomialNB.predict(twenty_test.data) == twenty_test.target)
Out[73]:
0.7738980350504514
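Test accuracy barely moves between raw counts and tf-idf here. A hedged sketch, using the cross_validation module imported at the top, that compares smoothing values on the training data instead of peeking at the test set (the alpha grid is arbitrary):

for alpha in (0.01, 0.1, 1.0):
    pipe = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB(alpha=alpha))
    scores = cross_validation.cross_val_score(pipe, twenty_train.data, twenty_train.target, cv=3)
    print(alpha, scores.mean())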

SVM

In [14]:
from sklearn import svm
In [82]:
svm.LinearSVC()
Out[82]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
In [83]:
svm_clf = make_pipeline(CountVectorizer(), TfidfTransformer(), svm.LinearSVC())
In [84]:
svm_clf.fit(twenty_train.data, twenty_train.target)
Out[84]:
Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])
In [85]:
np.mean(svm_clf.predict(twenty_test.data) == twenty_test.target)
Out[85]:
0.85315985130111527
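A single accuracy number hides which newsgroups are hard. A hedged sketch, using sklearn.metrics (not used in the original notebook), for a per-class breakdown:

from sklearn import metrics
predicted = svm_clf.predict(twenty_test.data)
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))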

KNN

In [97]:
from sklearn import neighbors
In [99]:
neighbors.KNeighborsClassifier()
Out[99]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
In [100]:
knn_clf = make_pipeline(CountVectorizer(), TfidfTransformer(), neighbors.KNeighborsClassifier())
In [101]:
knn_clf.fit(twenty_train.data, twenty_train.target)
Out[101]:
Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform'))])
In [102]:
np.mean(knn_clf.predict(twenty_test.data) == twenty_test.target)
Out[102]:
0.65918746680828466
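A hedged recap, recomputing everything from scratch, that fits the three classifiers on the same tf-idf features in one loop; the numbers should roughly match the individual cells above:

for name, clf in [('MultinomialNB', MultinomialNB()),
                  ('LinearSVC', svm.LinearSVC()),
                  ('KNeighborsClassifier', neighbors.KNeighborsClassifier())]:
    pipe = make_pipeline(CountVectorizer(), TfidfTransformer(), clf)
    pipe.fit(twenty_train.data, twenty_train.target)
    print(name, np.mean(pipe.predict(twenty_test.data) == twenty_test.target))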