字元串比對算法（Trie樹）

1. Trie樹概念

Trie樹，也叫字典樹，它是一個樹形結構。是一種專門處理字元串比對的資料結構，用來解決在一組字元串集合中快速查找某個字元串。
Trie樹本質，利用字元串之間的公共字首，将重複的字首合并在一起。

2. Trie樹操作

2.1 存儲

Trie樹是一個多叉樹；二叉樹的資料結構裡存放着左右子節點的指針；

Trie樹采用的一種經典的存儲方式是散清單。

class TrieNode//Trie樹節點類,假設隻有26個字母的資料集
{
public:
    char data;
    TrieNode *children[charNum];
    size_t count;//記錄這個節點被多少個單詞占用
    bool isEndOfWord;//是否是一個單詞的結束字元
    size_t freq;    //單詞插入的頻次
    TrieNode(char ch = '/'):data(ch), isEndOfWord(false), count(0), freq(0)
    {
        memset(children,0,sizeof(TrieNode*) * charNum);
    }
    ~TrieNode(){}
};

複制

Trie樹比較浪費記憶體，children數組存放指針；

犧牲點效率的話，可以将數組改成，有序數組，跳表，散清單，紅黑樹等

2.2 查找

TrieNode* find_private(const string &text) const//查找某個字元串,傳回最後一個字元節點的指針
{
    TrieNode *p = root;
    int index;
    for(int i = 0; i < text.size(); ++i)
    {
        index = text[i] - 'a';
        if(p->children[index] == NULL)
            return NULL;//還沒比對完
        p = p->children[index];
    }
    if(p->isEndOfWord == false)//比對完，但是隻是字首
        return NULL;
    else
    {
        return p;//私有find無輸出資訊
    }
}

複制

時間複雜度O（k），k為要查找的字元串長度

2.3 插入

void insert(const string &text)//插入一個字元串
{
    TrieNode *p = find_private(text);
    if(p)//找到了字元串，不用插入，頻次加1
    {
        p->freq++;
        return;
    }
    p = root;
    int index;
    for(int i = 0; i < text.size(); ++i)
    {
        index = text[i] - 'a';
        if(p->children[index] == NULL)
        {
            TrieNode *newNode = new TrieNode(text[i]);
            p->children[index] = newNode;
        }
        p->count++;
        p = p->children[index];
    }
    p->count++;
    p->freq++;
    p->isEndOfWord = true;
}

複制

時間複雜度O（n），n為所有字元串長度和

2.4 删除

bool delString(const string &text)
{
    TrieNode *p = root;
    stack<TrieNode*> nodeStack;
    nodeStack.push(root);
    int index;
    for(int i = 0; i < text.size(); ++i)
    {
        index = text[i] - 'a';
        if(p->children[index] == NULL)
            return false;//還沒比對完
        p = p->children[index];
        nodeStack.push(p);
    }
    if(p->isEndOfWord == false)//比對完，但是隻是字首
        return false;
    else
    {
        while(nodeStack.top()->count == 1)//删除單詞隻要自己包含的部分
        {
            index = nodeStack.top()->data - 'a';
//            cout << "del char: " << nodeStack.top()->data << endl;//(調試代碼)
            delete nodeStack.top();
            nodeStack.pop();
        }
        nodeStack.top()->children[index] = NULL;//斷開已删除的部分
        while(!nodeStack.empty())
        {
            nodeStack.top()->count--;//單詞占用記錄減1
            nodeStack.pop();
        }
        return true;
    }
}

複制

析構函數

void destory(TrieNode* proot)//樹不再使用，結束前，釋放資源
{
    if(proot == NULL)
    {
        return;
    }
    for(int i = 0; i < charNum; ++i)
    {
        destory(proot->children[i]);
    }
    delete proot;
    proot = NULL;
}

複制

2.5 列印

void printStrWithPre(const string prefix) const//列印有指定字首的單詞
    {
        if(prefix.size() == 0)
            return;
        TrieNode *p = root;
        int index,printID = 0;
        for(int i = 0; i < prefix.size(); ++i)
        {
            index = prefix[i] - 'a';
            if(p->children[index] == NULL)//字首還沒比對成功
            {
                cout << "-------------------------" << endl;
                cout << "no string with prefix: " << prefix << " can be found!" << endl;
                return;
            }
            else
                p = p->children[index];
        }//比對完了，p指向字首最後一個字元節點
        cout << "-------------------------" << endl;
        cout << p->count << " string(s) with prefix: " << prefix << " , as following:" << endl;
        printWordsOfNode(p,prefix,printID);
        cout << "-----------end-----------" << endl;
    }
    void printDict() const//字典序輸出全部單詞
    {
        string word("");
        int printID = 0;
        cout << "-------------------------" << endl;
        cout << "all " << itemCount() << " words as following:" << endl;
        printWordsOfNode(root,word,printID);
        cout << "-----------end-----------" << endl;
    }
private:
    void printWordsOfNode(TrieNode* p, string prefix, int &order) const
    {//遞歸列印字首最後一個字元對應節點下面所有的字元
        if(p != NULL)
        {
            if(p->isEndOfWord)//是終止字元，prefix是不斷+出來的，是整個字元串
                cout << ++order << " " << prefix << ", frequency: " << p->freq << endl;
            for(int i = 0; i < charNum; ++i)
            {
                if(p->children[i] != NULL)
                    printWordsOfNode(p->children[i],prefix+(p->children[i]->data),order);
            }
        }
    }

複制

3. 完整代碼

https://github.com/hitskyer/course/blob/master/dataAlgorithm/chenmingming/string_matching/trie.cpp

/**
 * @description: trie樹，字典樹
 * @author: michael ming
 * @date: 2019/6/24 19:00
 * @modified by: 
 */
#include <iostream>
#include <cstring>
#include <stack>
#define charNum 26
using namespace std;
class TrieNode//Trie樹節點類,假設隻有26個字母的資料集
{
public:
    char data;
    TrieNode *children[charNum];
    size_t count;//記錄這個節點被多少個單詞占用
    bool isEndOfWord;//是否是一個單詞的結束字元
    size_t freq;    //單詞插入的頻次
    TrieNode(char ch = '/'):data(ch), isEndOfWord(false), count(0), freq(0)
    {
        memset(children,0,sizeof(TrieNode*) * charNum);
    }
    ~TrieNode(){}
};
class Trie
{
public:
    TrieNode* root;
    Trie()
    {
        root = new TrieNode;
    }
    ~Trie()
    {
        destory(root);
    }
    void insert(const string &text)//插入一個字元串
    {
        TrieNode *p = find_private(text);
        if(p)//找到了字元串，不用插入，頻次加1
        {
            p->freq++;
            return;
        }
        p = root;
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)
            {
                TrieNode *newNode = new TrieNode(text[i]);
                p->children[index] = newNode;
            }
            p->count++;
            p = p->children[index];
        }
        p->count++;
        p->freq++;
        p->isEndOfWord = true;
    }
    void find(const string &text) const//查找某個字元串
    {
        TrieNode *p = root;
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)//還沒比對完
            {
                cout << "can not find string: " << text << endl;
                return;
            }
            p = p->children[index];
        }
        if(p->isEndOfWord == false)//比對完，但是隻是字首
        {
            cout << "can not find string: " << text << endl;
            return;
        }
        else
        {
            cout << text << " occurs " << p->freq << " time(s)." << endl;
            return;
        }
    }

private:
    TrieNode* find_private(const string &text) const//查找某個字元串,傳回最後一個字元節點的指針
    {
        TrieNode *p = root;
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)
                return NULL;//還沒比對完
            p = p->children[index];
        }
        if(p->isEndOfWord == false)//比對完，但是隻是字首
            return NULL;
        else
        {
            return p;//私有find無輸出資訊
        }
    }

public:
    void destory(TrieNode* proot)//樹不再使用，結束前，釋放資源
    {
        if(proot == NULL)
        {
            return;
        }
        for(int i = 0; i < charNum; ++i)
        {
            destory(proot->children[i]);
        }
        delete proot;
        proot = NULL;
    }
    bool delString(const string &text)
    {
        TrieNode *p = root;
        stack<TrieNode*> nodeStack;
        nodeStack.push(root);
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)
                return false;//還沒比對完
            p = p->children[index];
            nodeStack.push(p);
        }
        if(p->isEndOfWord == false)//比對完，但是隻是字首
            return false;
        else
        {
            while(nodeStack.top()->count == 1)//删除單詞隻要自己包含的部分
            {
                index = nodeStack.top()->data - 'a';
//                cout << "del char: " << nodeStack.top()->data << endl;//(調試代碼)
                delete nodeStack.top();
                nodeStack.pop();
            }
            nodeStack.top()->children[index] = NULL;//斷開已删除的部分
            while(!nodeStack.empty())
            {
                nodeStack.top()->count--;//單詞占用記錄減1
                nodeStack.pop();
            }
            return true;
        }
    }
    size_t itemCount() const//字典中單詞種數
    {
        return root->count;
    }
    void printStrWithPre(const string prefix) const//列印有指定字首的單詞
    {
        if(prefix.size() == 0)
            return;
        TrieNode *p = root;
        int index,printID = 0;
        for(int i = 0; i < prefix.size(); ++i)
        {
            index = prefix[i] - 'a';
            if(p->children[index] == NULL)//字首還沒比對成功
            {
                cout << "-------------------------" << endl;
                cout << "no string with prefix: " << prefix << " can be found!" << endl;
                return;
            }
            else
                p = p->children[index];
        }//比對完了，p指向字首最後一個字元節點
        cout << "-------------------------" << endl;
        cout << p->count << " string(s) with prefix: " << prefix << " , as following:" << endl;
        printWordsOfNode(p,prefix,printID);
        cout << "-----------end-----------" << endl;
    }
    void printDict() const//字典序輸出全部單詞
    {
        string word("");
        int printID = 0;
        cout << "-------------------------" << endl;
        cout << "all " << itemCount() << " words as following:" << endl;
        printWordsOfNode(root,word,printID);
        cout << "-----------end-----------" << endl;
    }
private:
    void printWordsOfNode(TrieNode* p, string prefix, int &order) const
    {//遞歸列印字首最後一個字元對應節點下面所有的字元
        if(p != NULL)
        {
            if(p->isEndOfWord)//是終止字元，prefix是不斷+出來的，是整個字元串
                cout << ++order << " " << prefix << ", frequency: " << p->freq << endl;
            for(int i = 0; i < charNum; ++i)
            {
                if(p->children[i] != NULL)
                    printWordsOfNode(p->children[i],prefix+(p->children[i]->data),order);
            }
        }
    }
};
int main()
{
    Trie textlib;
    string a("hello"), b("her"), c("so"), d("hi"), e("how"), f("see");
    textlib.insert(a);
    textlib.insert(a);
    textlib.insert(b);
    textlib.insert(c);
    textlib.insert(d);
    textlib.insert(e);
    textlib.insert(f);
    textlib.find(a);
    textlib.find(b);
    textlib.find(d);
    textlib.printStrWithPre("h");
    textlib.printDict();
    textlib.delString("hello");
    textlib.find(a);
    textlib.printStrWithPre("h");
    textlib.printDict();
    cout << "total kind(s) of word: " << textlib.itemCount() << endl;
    return 0;
}

複制

4. Trie樹與散清單、紅黑樹的比較

Trie樹對要處理的字元串有及其嚴苛的要求。

第一，字元串中包含的字元集不能太大。如果字元集太大，那存儲空間可能就會浪費很多。即便可以優化，也要付出犧牲查詢、插入效率的代價。
第二，要求字元串的字首重合比較多，不然空間消耗會變大很多。
第三，如果要用Trie樹解決問題，那我們就要自己從零開始實作一個Trie樹，還要保證沒有bug，這個在工程上是将簡單問題複雜化，除非必須，一般不建議這樣做。
第四，通過指針串起來的資料塊是不連續的，而Trie樹中用到了指針，是以，對緩存并不友好，性能上會打個折扣。
綜合這幾點，針對在一組字元串中查找字元串的問題，工程中，更傾向于用散清單或者紅黑樹。因為這兩種資料結構，我們都不需要自己去實作，直接利用程式設計語言中提供的現成類庫就行了。
Trie 樹隻是不适合精确比對查找，這種問題更适合用散清單或者紅黑樹來解決。
Trie樹比較适合的是查找字首比對的字元串，例如搜尋引擎智能比對輸入，給出候選提示（如果有多個候選，可以按搜尋熱度排序，上面代碼裡面的 frequency）。

Trie樹還可以應用于自動輸入補全（輸入法，代碼編輯器，浏覽器網址輸入）

4.1 思考題

上面針對英文的搜尋關鍵詞，對于更加複雜的中文來說，詞庫中的資料又該如何建構成Trie 樹呢？
如果詞庫中有很多關鍵詞，在搜尋提示的時候，使用者輸入關鍵詞，作為字首在Trie 樹中可以比對的關鍵詞也有很多，如何選擇展示哪些内容呢？（按搜尋熱度或者機率）
像Google 這樣的搜尋引擎，使用者單詞拼寫錯誤的情況下，Google還是可以使用正确的拼寫來做關鍵詞提示，這個又是怎麼做到的呢？

參考文章

https://www.cnblogs.com/xujian2014/p/5614724.html