In [16]:
import numpy as np
import pprint
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer
import heapq
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import cross_validation
In [5]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train')
twenty_test = fetch_20newsgroups(subset='test')
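
A quick look at what fetch_20newsgroups returns makes the later numbers easier to read. A small sketch (not part of the original run) that prints the category names and split sizes:

print(twenty_train.target_names)                      # the 20 category labels
print(len(twenty_train.data), len(twenty_test.data))  # roughly 11k train / 7.5k test documents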

Cross validation

Baseline: a CountVectorizer -> TfidfTransformer -> MultinomialNB pipeline, scored first on the held-out test set and then with 5-fold cross-validation on the training data.

In [12]:
# bag-of-words counts -> tf-idf weighting -> multinomial naive Bayes
tfidf_multinomialNB = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
tfidf_multinomialNB.fit(twenty_train.data, twenty_train.target)
# accuracy on the held-out test split
score_test = np.mean(tfidf_multinomialNB.predict(twenty_test.data) == twenty_test.target)
print("Accuracy on test data = %s" % score_test)
Accuracy on test data = 0.77389803505
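
As an aside, the CountVectorizer + TfidfTransformer pair can be collapsed into a single TfidfVectorizer. A minimal sketch of the same baseline written that way (not run here, but it should give the same score), reusing twenty_train/twenty_test from above and sklearn.metrics.accuracy_score instead of the manual mean:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer
baseline = make_pipeline(TfidfVectorizer(), MultinomialNB())
baseline.fit(twenty_train.data, twenty_train.target)
print("Accuracy on test data = %s"
      % accuracy_score(twenty_test.target, baseline.predict(twenty_test.data)))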

In [11]:
from sklearn import cross_validation
# 5-fold cross-validation on the training set, one worker per fold
scores = cross_validation.cross_val_score(tfidf_multinomialNB, twenty_train.data, twenty_train.target, cv=5, n_jobs=5)
print(scores)
print("cross validation score = %s" % np.mean(scores))
[ 0.84883208  0.84245366  0.84452297  0.84159292  0.84345898]
cross validation score = 0.844172123211
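
Note that the sklearn.cross_validation module used above was deprecated in scikit-learn 0.18 and removed in 0.20. On a recent scikit-learn the same experiment would go through sklearn.model_selection; a sketch of the equivalent call:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tfidf_multinomialNB, twenty_train.data,
                         twenty_train.target, cv=5, n_jobs=5)
print(scores)
print("cross validation score = %s" % np.mean(scores))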

Grid Search Results

Grid search over the vectorizer and classifier hyperparameters, first for multinomial naive Bayes and then for a linear SVM.

Naive Bayes

In [13]:
from sklearn.grid_search import GridSearchCV
# same vectorize -> tf-idf -> classify pipeline, with named steps so the grid
# search can address each step's parameters
bayes_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),])
# grid: unigrams vs. unigrams+bigrams, idf weighting on/off, NB smoothing alpha
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}

gs_bayes_clf = GridSearchCV(bayes_clf, parameters, n_jobs=-1)

gs_bayes_clf = gs_bayes_clf.fit(twenty_train.data, twenty_train.target)
print("prediction for doc 'God is love' is %s" % twenty_train.target_names[gs_bayes_clf.predict(['God is love'])])

print("best params are:")
print(gs_bayes_clf.best_params_)

print("all scores:")
pprint.pprint(gs_bayes_clf.grid_scores_)

print("best cross validation score is : ")
print([e.mean_validation_score for e in gs_bayes_clf.grid_scores_ \
     if e.parameters==gs_bayes_clf.best_params_])

print("Accuracy on test data for the best classifier = %s"% \
      (np.mean(gs_bayes_clf.predict(twenty_test.data) == twenty_test.target)))
prediction for doc 'God is love' is soc.religion.christian
best params are:
{'tfidf__use_idf': True, 'clf__alpha': 0.01, 'vect__ngram_range': (1, 2)}
all scores:
[mean: 0.90428, std: 0.00150, params: {'tfidf__use_idf': True, 'clf__alpha': 0.01, 'vect__ngram_range': (1, 1)},
 mean: 0.90675, std: 0.00193, params: {'tfidf__use_idf': True, 'clf__alpha': 0.01, 'vect__ngram_range': (1, 2)},
 mean: 0.89500, std: 0.00297, params: {'tfidf__use_idf': False, 'clf__alpha': 0.01, 'vect__ngram_range': (1, 1)},
 mean: 0.89694, std: 0.00373, params: {'tfidf__use_idf': False, 'clf__alpha': 0.01, 'vect__ngram_range': (1, 2)},
 mean: 0.89915, std: 0.00417, params: {'tfidf__use_idf': True, 'clf__alpha': 0.001, 'vect__ngram_range': (1, 1)},
 mean: 0.90622, std: 0.00100, params: {'tfidf__use_idf': True, 'clf__alpha': 0.001, 'vect__ngram_range': (1, 2)},
 mean: 0.90092, std: 0.00080, params: {'tfidf__use_idf': False, 'clf__alpha': 0.001, 'vect__ngram_range': (1, 1)},
 mean: 0.90631, std: 0.00125, params: {'tfidf__use_idf': False, 'clf__alpha': 0.001, 'vect__ngram_range': (1, 2)}]
best cross validation score is:
[0.90675269577514583]
Accuracy on test data for the best classifier = 0.834439723845
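
The sklearn.grid_search module and the grid_scores_ attribute were likewise deprecated in scikit-learn 0.18 and removed in 0.20, in favour of sklearn.model_selection.GridSearchCV and its cv_results_ dict. A sketch of the same search and inspection with the newer API (not re-run here; cv=3 mirrors the old default number of folds, so scores may still differ slightly across versions):

from sklearn.model_selection import GridSearchCV

gs_bayes_clf = GridSearchCV(bayes_clf, parameters, n_jobs=-1, cv=3)
gs_bayes_clf.fit(twenty_train.data, twenty_train.target)

print("best params are:")
print(gs_bayes_clf.best_params_)
print("best cross validation score is:")
print(gs_bayes_clf.best_score_)

# mean validation score for every parameter combination
for params, mean in zip(gs_bayes_clf.cv_results_['params'],
                        gs_bayes_clf.cv_results_['mean_test_score']):
    print(params, mean)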

SVM

In [18]:
from sklearn.grid_search import GridSearchCV
# keep the best NB preprocessing fixed (bigrams + idf) and tune the SVM itself:
# regularization strength C and the loss function
parameters = {'clf__C': (0.1, 1, 2),
              'clf__loss': ('hinge', 'squared_hinge')}

svm_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer(use_idf=True)),
                    ('clf', svm.LinearSVC()),])

gs_svm_clf = GridSearchCV(svm_clf, parameters, n_jobs=-1)

gs_svm_clf = gs_svm_clf.fit(twenty_train.data, twenty_train.target)

print("best params are:")
print(gs_svm_clf.best_params_)

print("all scores:")
pprint.pprint(gs_svm_clf.grid_scores_)

print("best score is : ")
print([e.mean_validation_score for e in gs_svm_clf.grid_scores_ \
     if e.parameters==gs_svm_clf.best_params_])
best params are:
{'clf__C': 2, 'clf__loss': 'hinge'}
all scores:
[mean: 0.89951, std: 0.00426, params: {'clf__C': 0.1, 'clf__loss': 'hinge'},
 mean: 0.89650, std: 0.00153, params: {'clf__C': 0.1, 'clf__loss': 'squared_hinge'},
 mean: 0.92098, std: 0.00244, params: {'clf__C': 1, 'clf__loss': 'hinge'},
 mean: 0.92081, std: 0.00218, params: {'clf__C': 1, 'clf__loss': 'squared_hinge'},
 mean: 0.92408, std: 0.00114, params: {'clf__C': 2, 'clf__loss': 'hinge'},
 mean: 0.92284, std: 0.00110, params: {'clf__C': 2, 'clf__loss': 'squared_hinge'}]
best score is:
[0.92407636556478701]

In [20]:
print("Accuracy= %s"% (np.mean(gs_svm_clf.predict(twenty_test.data) == twenty_test.target)))
Accuracy= 0.860993096123
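
Accuracy alone hides per-class behaviour. A sketch of a fuller evaluation of the tuned SVM on the test set (classification_report and confusion_matrix are standard sklearn.metrics utilities, not part of the original run):

from sklearn.metrics import classification_report, confusion_matrix

predicted = gs_svm_clf.predict(twenty_test.data)

# per-class precision / recall / F1 over the 20 newsgroups
print(classification_report(twenty_test.target, predicted,
                            target_names=twenty_test.target_names))

# rows are true classes, columns are predicted classes
print(confusion_matrix(twenty_test.target, predicted))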
