#include <cstring>
#include <iostream>
#include <set>
#include <string>
#include <vector>
using namespace std;

// Number of slots in every fixed-size model array below (wordVec, p0Num, ...).
// The vocabulary must not contain more distinct words than this.
const int MaxVocabNum = 50;

// Training corpus: one post per entry, words separated by single spaces.
// The entries are string literals, so they are exposed as pointers to const:
// writing through them is undefined behaviour, and the implicit
// literal -> `char *` conversion no longer exists since C++11.
const char *postingList[] = {
    "my dog has flea problem help please",
    "maybe not take him to dog park stupid",
    "my dalmation is so cute I love him",
    "stop posting stupid worthless garbage",
    "mr licks ate my steak how to stop him",
    "quit buying worthless dog food stupid"
};

// Label of each post, index-aligned with postingList.
// `true` puts the post in class 1 (counted as abusive by trainNaiveBayes).
bool Category[] = {
    false, true, false, true, false, true
};

// Distinct words seen in the corpus (filled by createVocabList).
std::set<std::string> st;

// The vocabulary as an indexable list; a word's position here is its slot in
// wordVec / p0Num / p1Num.
std::vector<std::string> v;

// Per-word occurrence counts of the most recently vectorised text
// (overwritten by every call to setOfWord2Vec).
int wordVec[MaxVocabNum];

// p0Num / p1Num: per-word totals for class 0 / class 1; trainNaiveBayes divides
// them by the matching denominator, leaving relative frequencies.
// p0Vect / p1Vect are not referenced by the functions visible in this file.
float p0Num[MaxVocabNum], p1Num[MaxVocabNum], p0Vect[MaxVocabNum], p1Vect[MaxVocabNum];

// p0Denom / p1Denom: total word count of class 0 / class 1.
// pAbusive: fraction of training posts labelled `true`.
float p0Denom = 0.0, p1Denom = 0.0, pAbusive = 0.0;
// Builds the vocabulary from every post in postingList.
//
// Each post is split on spaces (runs of spaces produce no empty words) and the
// words are inserted into the global set `st`. The global vector `v` is then
// rebuilt from `st`, so it holds each distinct word once, in the set's
// ascending order. Calling the function again rebuilds `v` instead of
// appending a second copy of the vocabulary.
//
// The model arrays only have MaxVocabNum slots, so `v` is truncated to that
// many words (with a warning on stderr) if the corpus contains more.
//
// Returns the number of words in `v`.
int createVocabList()
{
    const int postCount = sizeof(postingList) / sizeof(*postingList);
    for (int p = 0; p < postCount; ++p)
    {
        // Tokenise on a std::string copy: no fixed-size scratch buffer that a
        // long post could overflow, and no dependence on strtok's hidden state.
        const std::string post(postingList[p]);
        std::string::size_type wordBegin = post.find_first_not_of(' ');
        while (wordBegin != std::string::npos)
        {
            const std::string::size_type wordEnd = post.find(' ', wordBegin);
            if (wordEnd == std::string::npos)
            {
                // Last word of the post: runs to the end of the string.
                st.insert(post.substr(wordBegin));
                break;
            }
            st.insert(post.substr(wordBegin, wordEnd - wordBegin));
            wordBegin = post.find_first_not_of(' ', wordEnd);
        }
    }

    // Rebuild rather than append so repeated calls stay idempotent.
    v.assign(st.begin(), st.end());

    const std::vector<std::string>::size_type capacity =
        static_cast<std::vector<std::string>::size_type>(MaxVocabNum);
    if (v.size() > capacity)
    {
        std::cerr << "createVocabList: vocabulary has " << v.size()
                  << " words, keeping only the first " << MaxVocabNum << std::endl;
        v.resize(capacity);
    }

    return static_cast<int>(v.size());
}
void setOfWord2Vec(char str[], const int vocabNum) //转为词向量
{
memset(wordVec, 0, sizeof(wordVec));
char *p = strtok(str, " ");
while(p)
{
string t(p);
vector<string>::iterator iter;
for(int i = 0; i < v.size(); ++i)
{
if(t == v[i])
{
wordVec[i]++;
}
}
p = strtok(NULL, " ");
}
}
// Trains the classifier on postingList / Category and prints the intermediate
// results to stdout.
//
// Steps:
//   1. Build the vocabulary with createVocabList and print it.
//   2. Vectorise every post with setOfWord2Vec, print the vector, and add it to
//      the per-word totals of the post's class (p1Num / p1Denom when
//      Category[i] is true, p0Num / p0Denom otherwise).
//   3. Divide each per-word total by its class's word total, leaving relative
//      word frequencies in p0Num / p1Num.
//   4. Set pAbusive to the fraction of posts labelled true.
//
// The accumulators are reset first, so the function can be called repeatedly.
//
// NOTE(review): the frequencies are unsmoothed, so a word never seen in a class
// gets frequency 0 for that class.
//
// Returns the number of vocabulary slots covered by the model (the value
// callers pass on to classify); never more than MaxVocabNum.
int trainNaiveBayes()
{
    const int dictSize = createVocabList();
    std::cout << "字典为:" << std::endl; // label text means "The dictionary is:"
    for (int w = 0; w != dictSize; ++w)
        std::cout << v[w] << " ";
    std::cout << std::endl;

    // The model arrays hold MaxVocabNum entries; never index past them.
    const int vocabNum = dictSize < MaxVocabNum ? dictSize : MaxVocabNum;

    // Start from a clean slate so retraining does not add new counts on top of
    // the previous run's normalised frequencies.
    for (int j = 0; j < MaxVocabNum; ++j)
    {
        p0Num[j] = 0.0f;
        p1Num[j] = 0.0f;
    }
    p0Denom = 0.0f;
    p1Denom = 0.0f;

    const int num = sizeof(postingList) / sizeof(*postingList);
    int abusiveCount = 0;
    for (int i = 0; i < num; ++i)
    {
        // setOfWord2Vec tokenises in place, so hand it a private, writable copy
        // sized to the post itself (including the terminating NUL).
        const char *post = postingList[i];
        std::vector<char> scratch(post, post + std::strlen(post) + 1);
        setOfWord2Vec(&scratch[0], vocabNum);

        for (int j = 0; j != vocabNum; ++j)
            std::cout << wordVec[j] << " ";
        std::cout << std::endl;

        const bool abusive = Category[i];
        if (abusive)
            ++abusiveCount;

        // Accumulate into the totals of the class this post belongs to.
        float *classCounts = abusive ? p1Num : p0Num;
        float &classTotal = abusive ? p1Denom : p0Denom;
        for (int j = 0; j < vocabNum; ++j)
        {
            classCounts[j] += wordVec[j];
            classTotal += wordVec[j];
        }
    }

    // Turn counts into relative frequencies. A class without any words keeps
    // all-zero frequencies instead of becoming NaN through 0/0.
    for (int j = 0; j < vocabNum; ++j)
    {
        if (p0Denom > 0.0f)
            p0Num[j] /= p0Denom;
        if (p1Denom > 0.0f)
            p1Num[j] /= p1Denom;
    }

    std::cout << std::endl << "p0Denom " << p0Denom << std::endl;
    for (int j = 0; j < vocabNum; ++j)
        std::cout << p0Num[j] << " ";
    std::cout << std::endl << std::endl;

    std::cout << "p1Denom " << p1Denom << std::endl;
    for (int j = 0; j < vocabNum; ++j)
        std::cout << p1Num[j] << " ";
    std::cout << std::endl << std::endl;

    pAbusive = static_cast<float>(abusiveCount) / num;
    std::cout << "pAbusive: " << pAbusive << std::endl;
    return vocabNum;
}
// Scores a text against both classes and reports whether class 1 wins.
//
// The text is first vectorised into the global `wordVec` via setOfWord2Vec
// (which tokenises `str` in place). Each class score is the sum, over the first
// `vocabNum` vocabulary slots, of word count times that class's trained word
// frequency, scaled by the class prior (pAbusive for class 1, 1 - pAbusive for
// class 0). Both scores are printed to stdout.
//
// NOTE(review): the per-word terms are added, not multiplied - confirm this is
// the intended scoring rule.
//
// Returns true when the class-1 score is strictly greater than the class-0
// score.
bool classify(char *str, int vocabNum)
{
    setOfWord2Vec(str, vocabNum);

    float class0Score = 0.0;
    float class1Score = 0.0;
    int slot = 0;
    while (slot < vocabNum)
    {
        class0Score += (wordVec[slot] * p0Num[slot]);
        class1Score += (wordVec[slot] * p1Num[slot]);
        ++slot;
    }

    // Weight each likelihood-style sum by its class prior.
    class1Score *= pAbusive;
    class0Score *= (1 - pAbusive);

    std::cout << "p0: " << class0Score << " " << "p1: " << class1Score << std::endl;
    return class0Score < class1Score;
}
// Entry point: trains the model, classifies one sample sentence and prints the
// verdict ("是脏话" = "is abusive language", "不是脏话" = "is not abusive
// language").
int main(void)
{
    const int vocabNum = trainNaiveBayes();

    // Must be a writable array: classification tokenises the text in place.
    char sample[] = "I love you";
    const bool abusive = classify(sample, vocabNum);

    std::cout << (abusive ? "是脏话" : "不是脏话") << std::endl;
    return 0;
}