-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
complete spam test classification in Naive Bayes
- Loading branch information
yinchuandong
committed
Feb 12, 2016
1 parent
c12ba39
commit 9b09301
Showing
1 changed file
with
107 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,111 @@ | ||
__author__ = 'wangjiewen' | ||
# coding=utf-8 | ||
|
||
import numpy | ||
import numpy as np | ||
|
||
class NativeBayes(object):
    """Placeholder class that only exposes a smoke-test method."""

    def test(self):
        """Print the literal string 'test' and return None."""
        # Single-argument print(...) emits the same text on Python 2 and
        # Python 3; the bare print statement is a SyntaxError on Python 3.
        print('test')
def createVocabList(dataSet):
    """Build the vocabulary: every distinct token found in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once. The order is
        the iteration order of a set and is therefore not guaranteed.
    """
    vocabSet = set()
    for document in dataSet:
        # Update in place: `vocabSet | set(document)` rebuilt a full copy of
        # the vocabulary on every iteration.
        vocabSet.update(document)
    return list(vocabSet)
|
||
|
||
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector.

    Args:
        vocabList: list of known (hashable) tokens; position i of the result
            counts occurrences of vocabList[i].
        inputSet: iterable of tokens making up one document.

    Returns:
        A list of ints with len(vocabList) entries. Tokens that are not in
        the vocabulary are ignored.
    """
    # Build the token -> position map once. The previous version ran
    # `word in vocabList` and `vocabList.index(word)` (two linear scans) for
    # every token of the document.
    position = {}
    for idx, token in enumerate(vocabList):
        # setdefault keeps the FIRST position of a repeated token, which is
        # what list.index() returned.
        position.setdefault(token, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec
|
||
|
||
def trainNB(trainMat, labelMat):
    """Estimate per-class log word likelihoods and the class-1 prior.

    Args:
        trainMat: sequence of word-count vectors, one per training document,
            all of the same length.
        labelMat: sequence of labels aligned with trainMat; 1 marks an
            abusive document, any other value is counted as normal.

    Returns:
        (p0Vec, p1Vec, pClass1): the log word likelihoods for the normal
        class, the same for the abusive class, and the fraction given by
        sum(labelMat) / number of documents.
    """
    docTotal = len(trainMat)
    vocabSize = len(trainMat[0])
    pClass1 = sum(labelMat) / float(docTotal)

    # Slot 0 accumulates the normal class, slot 1 the abusive class. Word
    # counts start at one and token totals at 2.0, so a word never seen in a
    # class still gets a non-zero likelihood.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    tokenTotals = [2.0, 2.0]
    for idx, rowCounts in enumerate(trainMat):
        slot = 1 if labelMat[idx] == 1 else 0
        wordCounts[slot] += rowCounts
        tokenTotals[slot] += sum(rowCounts)

    # Work in log space: multiplying many small probabilities later would
    # underflow floating point, summing their logs does not.
    p0Vec = np.log(wordCounts[0] / tokenTotals[0])
    p1Vec = np.log(wordCounts[1] / tokenTotals[1])
    return p0Vec, p1Vec, pClass1
|
||
|
||
def classifyBN(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for one word-count vector.

    Args:
        vec2Classify: word-count vector of the document to classify.
        p0Vec: log word likelihoods for class 0.
        p1Vec: log word likelihoods for class 1.
        pClass1: prior probability of class 1.

    Returns:
        0 when the class-0 log score is strictly greater, otherwise 1
        (so ties resolve to class 1).
    """
    logPrior0 = np.log(1 - pClass1)
    logPrior1 = np.log(pClass1)
    # Log-space score: count-weighted log likelihoods plus the log prior.
    score0 = sum(vec2Classify * p0Vec) + logPrior0
    score1 = sum(vec2Classify * p1Vec) + logPrior1
    return 0 if score0 > score1 else 1
|
||
|
||
def textParse(bigString):
    """Split raw text into lowercase tokens, dropping very short ones.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of lowercased tokens, split on runs of non-word characters,
        keeping only tokens longer than two characters.
    """
    import re
    # `\W+`, not `\W*`: a pattern that can match the empty string makes
    # re.split cut between every character on Python 3.7+, so every token
    # would be one character long and the length filter would drop them all.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
|
||
|
||
def spamTest():
    """Run one hold-out evaluation of the Naive Bayes spam classifier.

    Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt .. 25.txt
    (label 0), randomly holds out 10 of the 50 documents as a test set,
    trains on the remaining 40, then prints every misclassified document
    followed by the error rate on the held-out documents.

    Raises:
        IOError/OSError: if one of the expected e-mail files cannot be read.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Same order as before: spam i first, then ham i.
        for pattern, label in (('email/spam/%d.txt', 1),
                               ('email/ham/%d.txt', 0)):
            # `with` closes the handle; open(...).read() left it to the GC.
            with open(pattern % i) as handle:
                docList.append(textParse(handle.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # A real list is required: entries are deleted below and a Python 3
    # range object does not support item deletion.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # Draw a random remaining document and move it to the test set.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = [bagOfWords2VecMN(vocabList, docList[docId])
                for docId in trainingSet]
    labelMat = [classList[docId] for docId in trainingSet]

    p0Vec, p1Vec, pSpam = trainNB(np.array(trainMat), np.array(labelMat))
    errorCount = 0
    for docId in testSet:
        wordVec = bagOfWords2VecMN(vocabList, docList[docId])
        if classifyBN(wordVec, p0Vec, p1Vec, pSpam) != classList[docId]:
            errorCount += 1
            # Single pre-formatted argument: identical output on Python 2
            # and Python 3 (the print statement is a SyntaxError on 3).
            print('classfiy wrongly %s' % (docList[docId],))

    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: run one spam-classification experiment.
    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print('begin')
    spamTest()
    print('end')
|
||
def __init__(self):
    """Print the literal string 'init'.

    NOTE(review): presumably a leftover NativeBayes initializer from the
    pre-commit version of the file -- confirm before relying on it.
    """
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('init')
|
||
def loadData(self):
    """Print the literal string 'load data'; no data is actually loaded.

    NOTE(review): presumably a leftover NativeBayes method from the
    pre-commit version of the file -- confirm before relying on it.
    """
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('load data')