-
博客代码:180927
-
作业代码:181007
-
贝叶斯概率
朴素贝叶斯概率计算过程
-
邮件分类器
邮件的区分方法
-
处理的主要流程
处理的流程
-
具体的代码实现
"""Naive Bayes spam filter: train on spam_train.txt, score spam_test.txt.

Each dataset line is expected to be a label followed by whitespace-separated
words.  Following the original script's convention, label "0" is treated as
spam and label "1" as a normal (ham) email.
"""
import math
from collections import Counter
from typing import Iterable, List, NamedTuple, Optional, Tuple

SPAM_LABEL = "0"
HAM_LABEL = "1"


class SpamModel(NamedTuple):
    """Per-class document frequencies gathered from the training set."""

    spam_docs: Counter  # word -> number of spam emails containing it
    ham_docs: Counter   # word -> number of ham emails containing it
    n_spam: int         # number of spam emails seen
    n_ham: int          # number of ham emails seen


def _parse_email(line: str) -> Optional[Tuple[str, List[str]]]:
    """Split a dataset line into (label, words); return None for a blank line."""
    # split() with no argument also strips the trailing newline, so the last
    # word of an email is the same token as that word anywhere else.
    tokens = line.split()
    if not tokens:
        return None
    return tokens[0], tokens[1:]


def train(lines: Iterable[str]) -> SpamModel:
    """Count, for each class, how many emails contain each word.

    A word is counted at most once per email, however often it occurs in it.
    Lines whose label is neither "0" nor "1" are ignored.

    Raises:
        ValueError: if the data contains no spam emails or no ham emails,
            because the class prior ratio is undefined in that case.
    """
    spam_docs = Counter()
    ham_docs = Counter()
    n_spam = 0
    n_ham = 0
    for line in lines:
        parsed = _parse_email(line)
        if parsed is None:
            continue
        label, words = parsed
        if label == SPAM_LABEL:
            n_spam += 1
            spam_docs.update(set(words))  # set(): one count per email
        elif label == HAM_LABEL:
            n_ham += 1
            ham_docs.update(set(words))
    if n_spam == 0 or n_ham == 0:
        raise ValueError(
            "training data must contain at least one email of each class"
        )
    return SpamModel(spam_docs, ham_docs, n_spam, n_ham)


def spam_log_odds(words: Iterable[str], model: SpamModel) -> float:
    """Return log(P(spam | words) / P(ham | words)) under naive Bayes.

    A positive value means spam is more likely, a negative value means ham.
    Each word likelihood is smoothed as (count + 1) / (class_size + 1), so
    words never seen in training still get a non-zero probability.
    """
    # Summing logarithms instead of multiplying raw ratios keeps the score
    # finite for long emails, where the plain product overflows to inf or
    # underflows to 0 and can no longer change sign.
    log_odds = math.log(model.n_spam / model.n_ham)
    for word in words:
        log_odds += math.log((model.spam_docs[word] + 1) / (model.n_spam + 1))
        log_odds -= math.log((model.ham_docs[word] + 1) / (model.n_ham + 1))
    return log_odds


def accuracy(lines: Iterable[str], model: SpamModel) -> float:
    """Return the fraction of labelled emails in *lines* classified correctly.

    An email whose score is exactly 0 (a tie) is counted as misclassified.
    Returns 0.0 when *lines* contains no emails.
    """
    correct = 0
    total = 0
    for line in lines:
        parsed = _parse_email(line)
        if parsed is None:
            continue
        label, words = parsed
        total += 1
        score = spam_log_odds(words, model)
        if (score > 0 and label == SPAM_LABEL) or (score < 0 and label == HAM_LABEL):
            correct += 1
    return correct / total if total else 0.0


def main() -> None:
    """Train on spam_train.txt and print the accuracy on spam_test.txt."""
    with open("spam_train.txt") as ftrain:
        model = train(ftrain)
    with open("spam_test.txt") as ftest:
        print(accuracy(ftest, model))


if __name__ == "__main__":
    main()