# 朴素贝叶斯算法的python实现

# Functions defined in this file:
#
# createvocablist(dataset)
#
# setofwords2vec(vocablist, inputset)
#
# bagofwords2vecmn(vocablist, inputset)
#
# trainnb0(trainmatrix,traincatergory)
#
# classifynb(vec2classify, p0vec, p1vec, pclass1)

#coding=utf-8
from numpy import *
def loaddataset():
    """Return the toy training corpus and its class labels.

    Returns:
        A ``(postinglist, classvec)`` tuple. ``postinglist`` is a list of six
        tokenized posts (each a list of lowercase words) and ``classvec`` is
        the parallel list of labels, where 1 marks an abusive post and 0 a
        non-abusive one.
    """
    postinglist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classvec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postinglist, classvec


# testingnb() reads `listoposts` and `listclasses` as module-level names but
# never assigns them, so bind them here from the sample data.
listoposts, listclasses = loaddataset()
# Build a list containing every distinct word that appears in the data set.
def createvocablist(dataset):
    """Return the vocabulary of ``dataset`` as a list of unique words.

    Args:
        dataset: an iterable of documents, each document being an iterable of
            hashable tokens.

    Returns:
        A list holding each distinct token once. The list is built from a set,
        so its order is arbitrary and should not be relied upon.
    """
    # One union over all documents replaces the repeated `vocab | set(doc)`.
    return list(set().union(*dataset))
def setofwords2vec(vocablist, inputset):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocablist: list of vocabulary words; position defines the vector slot.
        inputset: iterable of words making up the document.

    Returns:
        A list of ``len(vocablist)`` ints where slot ``i`` is 1 if
        ``vocablist[i]`` occurs in ``inputset`` and 0 otherwise. Words that are
        not in the vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its first position, matching list.index() semantics,
    # so every lookup is O(1) instead of a linear scan of the vocabulary.
    position = {}
    for idx, vocabword in enumerate(vocablist):
        position.setdefault(vocabword, idx)

    retvocablist = [0] * len(vocablist)
    for word in inputset:
        idx = position.get(word)
        if idx is None:
            # Same text the original Python 2 print statement produced;
            # %-formatting keeps it valid under both Python 2 and 3.
            print('word  %s not in dict' % (word,))
        else:
            retvocablist[idx] = 1
    return retvocablist
# Alternative model: bag of words (counts occurrences instead of presence).
def bagofwords2vecmn(vocablist, inputset):
    """Encode a document as a word-count vector over the vocabulary.

    Args:
        vocablist: list of vocabulary words; position defines the vector slot.
        inputset: iterable of words making up the document.

    Returns:
        A list of ``len(vocablist)`` ints where slot ``i`` is the number of
        times ``vocablist[i]`` occurs in ``inputset``. Words that are not in
        the vocabulary are silently skipped.
    """
    # Map each word to its first position, matching list.index() semantics,
    # so every lookup is O(1) instead of a linear scan of the vocabulary.
    position = {}
    for idx, vocabword in enumerate(vocablist):
        position.setdefault(vocabword, idx)

    returnvec = [0] * len(vocablist)
    for word in inputset:
        idx = position.get(word)
        if idx is not None:
            returnvec[idx] += 1
    return returnvec
def trainnb0(trainmatrix, traincatergory):
    """Estimate the parameters of a two-class naive Bayes model.

    Args:
        trainmatrix: non-empty sequence of per-document word vectors, all of
            the same length (one slot per vocabulary word).
        traincatergory: sequence of labels parallel to ``trainmatrix``; a
            document counts towards class 1 when its label equals 1 and
            towards class 0 otherwise.

    Returns:
        A ``(p0vect, p1vect, pabusive)`` tuple: the per-word log conditional
        probabilities for class 0 and class 1, and ``sum(traincatergory)``
        divided by the number of documents (the class-1 prior when labels
        are 0/1).
    """
    numtraindoc = len(trainmatrix)
    numwords = len(trainmatrix[0])
    pabusive = sum(traincatergory) / float(numtraindoc)

    # Start counts at 1 and denominators at 2 so that no single word
    # probability is 0, which would zero out the whole product.
    p0num = ones(numwords)
    p1num = ones(numwords)
    p0denom = 2.0
    p1denom = 2.0

    for i, wordvec in enumerate(trainmatrix):
        if traincatergory[i] == 1:
            p1num += wordvec
            p1denom += sum(wordvec)
        else:
            p0num += wordvec
            p0denom += sum(wordvec)

    # Work in log space for numerical precision: multiplying many small
    # probabilities would otherwise underflow to zero.
    p1vect = log(p1num / p1denom)
    p0vect = log(p0num / p0denom)
    return p0vect, p1vect, pabusive
def classifynb(vec2classify, p0vec, p1vec, pclass1):
    """Classify a word vector with a trained two-class naive Bayes model.

    Args:
        vec2classify: numeric array with one slot per vocabulary word.
        p0vec: array of per-word log probabilities for class 0.
        p1vec: array of per-word log probabilities for class 1.
        pclass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0 (ties go to class 0).
    """
    # Element-wise multiply picks out (and weights) the log probabilities of
    # the words present; adding the log prior completes the log posterior
    # up to a constant shared by both classes.
    p1 = sum(vec2classify * p1vec) + log(pclass1)
    p0 = sum(vec2classify * p0vec) + log(1.0 - pclass1)
    return 1 if p1 > p0 else 0
def testingnb():
    """Train on the sample posts and print the class of two test entries.

    Reads the module-level names ``listoposts`` (tokenized posts) and
    ``listclasses`` (their labels); neither is assigned inside this function,
    so both must already be defined at module level when it is called.
    """
    myvocablist = createvocablist(listoposts)
    trainmat = [setofwords2vec(myvocablist, postindoc)
                for postindoc in listoposts]
    p0v, p1v, pab = trainnb0(array(trainmat), array(listclasses))

    for testentry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisdoc = array(setofwords2vec(myvocablist, testentry))
        # Same text the original Python 2 print statement produced;
        # %-formatting keeps it valid under both Python 2 and 3.
        print('%s classified as:  %s'
              % (testentry, classifynb(thisdoc, p0v, p1v, pab)))
def main():
    """Script entry point: run the naive Bayes demonstration."""
    testingnb()


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

# Posted in 未分类 (Uncategorized)