天天看点

闲来无事,用C++写个朴素贝叶斯

#include <cmath>
#include <cstring>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// The rest of this file refers to standard-library names without the std:: prefix.
using namespace std;
// Capacity of the fixed-size per-word arrays declared in this file
// (wordVec, p0Num, p1Num, p0Vect, p1Vect).
const int MaxVocabNum = 50;

// Training corpus: one space-separated posting per entry.
// Entry i is labelled by Category[i].
// The elements point at string literals, which are immutable, so the pointee
// type is const (binding a literal to a plain char* is ill-formed since C++11).
const char *postingList[] = {
	"my dog has flea problem help please",
	"maybe not take him to dog park stupid",
	"my dalmation is so cute I love him",
	"stop posting stupid worthless garbage",
	"mr licks ate my steak how to stop him",
	"quit buying worthless dog food stupid"
};

// Class label of each posting, in the same order as postingList.
// true marks a posting as abusive (class 1); false marks it as normal (class 0).
bool Category[] = {
	false,
	true,
	false,
	true,
	false,
	true
};

// Unique words gathered from the postings (filled by createVocabList).
set<string> st;
// The vocabulary as an indexable list; position i corresponds to slot i of
// the per-word arrays below.
vector<string> v;
// Word-count vector of the most recently converted text (written by setOfWord2Vec).
int wordVec[MaxVocabNum];

// p0Num / p1Num: per-word accumulators for class 0 (Category false) and
// class 1 (Category true); trainNaiveBayes divides them by the denominators below.
// p0Vect / p1Vect: additional per-word buffers for class 0 / class 1 --
// presumably meant to hold the final (log-)probability vectors; verify against
// trainNaiveBayes / classify.
float p0Num[MaxVocabNum], p1Num[MaxVocabNum], p0Vect[MaxVocabNum], p1Vect[MaxVocabNum];
// p0Denom / p1Denom: per-class normalisation totals accumulated during training.
// pAbusive: fraction of training postings whose Category is true
// (set by trainNaiveBayes).
float p0Denom = 0.0, p1Denom = 0.0, pAbusive = 0.0;

// Builds the vocabulary from every entry of postingList.
//
// Each posting is split on single spaces (runs of spaces are skipped), the
// distinct words are collected in the global set `st`, and then copied in
// sorted order into the global vector `v`.
//
// Both containers are cleared first, so calling this function more than once
// does not append duplicate words to `v`.
//
// The per-word arrays of this file hold MaxVocabNum entries, so a larger
// vocabulary is truncated to its first MaxVocabNum words (a warning is
// written to std::cerr) rather than letting later code index out of bounds.
//
// Returns the number of words stored in `v`.
int createVocabList()
{
	const int num = sizeof(postingList) / sizeof(*postingList);

	st.clear();
	v.clear();

	for(int i = 0; i < num; ++i)
	{
		// Tokenise a std::string copy: no fixed-size buffer to overflow and
		// no dependence on strtok's hidden global state.
		const std::string post(postingList[i]);
		std::string::size_type begin = post.find_first_not_of(' ');
		while(begin != std::string::npos)
		{
			const std::string::size_type end = post.find_first_of(' ', begin);
			const std::string::size_type len =
				(end == std::string::npos) ? std::string::npos : end - begin;
			st.insert(post.substr(begin, len));
			// find_first_not_of returns npos when `end` is npos, ending the loop.
			begin = post.find_first_not_of(' ', end);
		}
	}

	v.assign(st.begin(), st.end());

	const std::vector<std::string>::size_type capacity =
		static_cast<std::vector<std::string>::size_type>(MaxVocabNum);
	if(v.size() > capacity)
	{
		std::cerr << "createVocabList: vocabulary has " << v.size()
		          << " words, truncating to " << MaxVocabNum << std::endl;
		v.resize(capacity);
	}

	return static_cast<int>(v.size());
}

void setOfWord2Vec(char str[], const int vocabNum)   //转为词向量 
{
	memset(wordVec, 0, sizeof(wordVec));
	char *p = strtok(str, " ");
	while(p)
	{
		string t(p);
		vector<string>::iterator iter;
		for(int i = 0; i < v.size(); ++i)
		{
			if(t == v[i])
			{
				wordVec[i]++;
			}	
		}
		p = strtok(NULL, " ");
	}	
}

int trainNaiveBayes()
{
	int vocabNum = createVocabList();
	cout << "字典为:" << endl;
	for(int i = 0; i != vocabNum; ++i)
		cout << v[i] << " ";
	cout << endl;
	
	int num = sizeof(postingList) / sizeof(*postingList);
	int positive = 0, nagetive = 0;
	
	for(int i = 0; i < num; ++i)
	{
		char tmp[MaxVocabNum];
		strcpy(tmp, postingList[i]);
		setOfWord2Vec(tmp, vocabNum);
		
		for(int i = 0; i != vocabNum; ++i)
			cout << wordVec[i] << " ";
		cout << endl;
		
		if(Category[i])
		{
			nagetive++;
			for(int j = 0; j < vocabNum; ++j)
			{
				p1Num[j] += wordVec[j];
				p1Denom += wordVec[j];
			}
		}
		else
		{
			positive++;
			for(int j = 0; j < vocabNum; ++j)
			{
				p0Num[j] += wordVec[j];
				p0Denom += wordVec[j];
			}
		}
	}
	
	for(int j = 0; j < vocabNum; ++j)
	{
		p0Num[j] /= p0Denom;
		p1Num[j] /= p1Denom;			
	}
	
	cout << endl << "p0Denom " << p0Denom << endl;
	for(int j = 0; j < vocabNum; ++j)
		cout << p0Num[j] << " ";			
	cout << endl << endl;
	
	cout << "p1Denom " << p1Denom << endl;
	for(int j = 0; j < vocabNum; ++j)
		cout << p1Num[j] << " ";			
	cout << endl << endl;
	
	pAbusive = (float)nagetive / num;
	cout << "pAbusive: " << pAbusive << endl;
	
	return vocabNum;
}

bool classify(char *str, int vocabNum)
{
	setOfWord2Vec(str, vocabNum);
	float p1 = 0.0, p0 = 0.0;
	
	for(int i = 0; i < vocabNum; ++i)
	{
		p0 += (wordVec[i] * p0Num[i]);
		p1 += (wordVec[i] * p1Num[i]);
	}
	p1 *= pAbusive;
	p0 *= (1 - pAbusive);
	
	cout << "p0: " << p0 << "    " << "p1: " << p1 << endl;
	return p1 > p0;
}


// Entry point: trains the model, classifies one sample sentence and prints
// whether it was judged abusive.
int main(void)
{
	const int vocabSize = trainNaiveBayes();

	// Mutable array because classify() takes a non-const char pointer.
	char sentence[] = "I love you";

	const bool abusive = classify(sentence, vocabSize);
	cout << (abusive ? "是脏话" : "不是脏话") << endl;

	return 0;
}
           

继续阅读