字符串匹配算法（Trie树）

1. Trie树概念

Trie树，也叫字典树，它是一个树形结构。是一种专门处理字符串匹配的数据结构，用来解决在一组字符串集合中快速查找某个字符串。
Trie树本质，利用字符串之间的公共前缀，将重复的前缀合并在一起。

2. Trie树操作

2.1 存储

Trie树是一个多叉树；二叉树的数据结构里存放着左右子节点的指针；

Trie树采用的一种经典的存储方式是散列表。

class TrieNode//Trie树节点类,假设只有26个字母的数据集
{
public:
    char data;
    TrieNode *children[charNum];
    size_t count;//记录这个节点被多少个单词占用
    bool isEndOfWord;//是否是一个单词的结束字符
    size_t freq;    //单词插入的频次
    TrieNode(char ch = '/'):data(ch), isEndOfWord(false), count(0), freq(0)
    {
        memset(children,0,sizeof(TrieNode*) * charNum);
    }
    ~TrieNode(){}
};

复制

Trie树比较浪费内存，children数组存放指针；

牺牲点效率的话，可以将数组改成，有序数组，跳表，散列表，红黑树等

2.2 查找

TrieNode* find_private(const string &text) const//查找某个字符串,返回最后一个字符节点的指针
{
    TrieNode *p = root;
    int index;
    for(int i = 0; i < text.size(); ++i)
    {
        index = text[i] - 'a';
        if(p->children[index] == NULL)
            return NULL;//还没匹配完
        p = p->children[index];
    }
    if(p->isEndOfWord == false)//匹配完，但是只是前缀
        return NULL;
    else
    {
        return p;//私有find无输出信息
    }
}

复制

时间复杂度O（k），k为要查找的字符串长度

2.3 插入

void insert(const string &text)//插入一个字符串
{
    TrieNode *p = find_private(text);
    if(p)//找到了字符串，不用插入，频次加1
    {
        p->freq++;
        return;
    }
    p = root;
    int index;
    for(int i = 0; i < text.size(); ++i)
    {
        index = text[i] - 'a';
        if(p->children[index] == NULL)
        {
            TrieNode *newNode = new TrieNode(text[i]);
            p->children[index] = newNode;
        }
        p->count++;
        p = p->children[index];
    }
    p->count++;
    p->freq++;
    p->isEndOfWord = true;
}

复制

时间复杂度O（n），n为所有字符串长度和

2.4 删除

bool delString(const string &text)
{
    TrieNode *p = root;
    stack<TrieNode*> nodeStack;
    nodeStack.push(root);
    int index;
    for(int i = 0; i < text.size(); ++i)
    {
        index = text[i] - 'a';
        if(p->children[index] == NULL)
            return false;//还没匹配完
        p = p->children[index];
        nodeStack.push(p);
    }
    if(p->isEndOfWord == false)//匹配完，但是只是前缀
        return false;
    else
    {
        while(nodeStack.top()->count == 1)//删除单词只要自己包含的部分
        {
            index = nodeStack.top()->data - 'a';
//            cout << "del char: " << nodeStack.top()->data << endl;//(调试代码)
            delete nodeStack.top();
            nodeStack.pop();
        }
        nodeStack.top()->children[index] = NULL;//断开已删除的部分
        while(!nodeStack.empty())
        {
            nodeStack.top()->count--;//单词占用记录减1
            nodeStack.pop();
        }
        return true;
    }
}

复制

析构函数

void destory(TrieNode* proot)//树不再使用，结束前，释放资源
{
    if(proot == NULL)
    {
        return;
    }
    for(int i = 0; i < charNum; ++i)
    {
        destory(proot->children[i]);
    }
    delete proot;
    proot = NULL;
}

复制

2.5 打印

void printStrWithPre(const string prefix) const//打印有指定前缀的单词
    {
        if(prefix.size() == 0)
            return;
        TrieNode *p = root;
        int index,printID = 0;
        for(int i = 0; i < prefix.size(); ++i)
        {
            index = prefix[i] - 'a';
            if(p->children[index] == NULL)//前缀还没匹配成功
            {
                cout << "-------------------------" << endl;
                cout << "no string with prefix: " << prefix << " can be found!" << endl;
                return;
            }
            else
                p = p->children[index];
        }//匹配完了，p指向前缀最后一个字符节点
        cout << "-------------------------" << endl;
        cout << p->count << " string(s) with prefix: " << prefix << " , as following:" << endl;
        printWordsOfNode(p,prefix,printID);
        cout << "-----------end-----------" << endl;
    }
    void printDict() const//字典序输出全部单词
    {
        string word("");
        int printID = 0;
        cout << "-------------------------" << endl;
        cout << "all " << itemCount() << " words as following:" << endl;
        printWordsOfNode(root,word,printID);
        cout << "-----------end-----------" << endl;
    }
private:
    void printWordsOfNode(TrieNode* p, string prefix, int &order) const
    {//递归打印前缀最后一个字符对应节点下面所有的字符
        if(p != NULL)
        {
            if(p->isEndOfWord)//是终止字符，prefix是不断+出来的，是整个字符串
                cout << ++order << " " << prefix << ", frequency: " << p->freq << endl;
            for(int i = 0; i < charNum; ++i)
            {
                if(p->children[i] != NULL)
                    printWordsOfNode(p->children[i],prefix+(p->children[i]->data),order);
            }
        }
    }

复制

3. 完整代码

https://github.com/hitskyer/course/blob/master/dataAlgorithm/chenmingming/string_matching/trie.cpp

/**
 * @description: trie树，字典树
 * @author: michael ming
 * @date: 2019/6/24 19:00
 * @modified by: 
 */
#include <iostream>
#include <cstring>
#include <stack>
#define charNum 26
using namespace std;
class TrieNode//Trie树节点类,假设只有26个字母的数据集
{
public:
    char data;
    TrieNode *children[charNum];
    size_t count;//记录这个节点被多少个单词占用
    bool isEndOfWord;//是否是一个单词的结束字符
    size_t freq;    //单词插入的频次
    TrieNode(char ch = '/'):data(ch), isEndOfWord(false), count(0), freq(0)
    {
        memset(children,0,sizeof(TrieNode*) * charNum);
    }
    ~TrieNode(){}
};
class Trie
{
public:
    TrieNode* root;
    Trie()
    {
        root = new TrieNode;
    }
    ~Trie()
    {
        destory(root);
    }
    void insert(const string &text)//插入一个字符串
    {
        TrieNode *p = find_private(text);
        if(p)//找到了字符串，不用插入，频次加1
        {
            p->freq++;
            return;
        }
        p = root;
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)
            {
                TrieNode *newNode = new TrieNode(text[i]);
                p->children[index] = newNode;
            }
            p->count++;
            p = p->children[index];
        }
        p->count++;
        p->freq++;
        p->isEndOfWord = true;
    }
    void find(const string &text) const//查找某个字符串
    {
        TrieNode *p = root;
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)//还没匹配完
            {
                cout << "can not find string: " << text << endl;
                return;
            }
            p = p->children[index];
        }
        if(p->isEndOfWord == false)//匹配完，但是只是前缀
        {
            cout << "can not find string: " << text << endl;
            return;
        }
        else
        {
            cout << text << " occurs " << p->freq << " time(s)." << endl;
            return;
        }
    }

private:
    TrieNode* find_private(const string &text) const//查找某个字符串,返回最后一个字符节点的指针
    {
        TrieNode *p = root;
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)
                return NULL;//还没匹配完
            p = p->children[index];
        }
        if(p->isEndOfWord == false)//匹配完，但是只是前缀
            return NULL;
        else
        {
            return p;//私有find无输出信息
        }
    }

public:
    void destory(TrieNode* proot)//树不再使用，结束前，释放资源
    {
        if(proot == NULL)
        {
            return;
        }
        for(int i = 0; i < charNum; ++i)
        {
            destory(proot->children[i]);
        }
        delete proot;
        proot = NULL;
    }
    bool delString(const string &text)
    {
        TrieNode *p = root;
        stack<TrieNode*> nodeStack;
        nodeStack.push(root);
        int index;
        for(int i = 0; i < text.size(); ++i)
        {
            index = text[i] - 'a';
            if(p->children[index] == NULL)
                return false;//还没匹配完
            p = p->children[index];
            nodeStack.push(p);
        }
        if(p->isEndOfWord == false)//匹配完，但是只是前缀
            return false;
        else
        {
            while(nodeStack.top()->count == 1)//删除单词只要自己包含的部分
            {
                index = nodeStack.top()->data - 'a';
//                cout << "del char: " << nodeStack.top()->data << endl;//(调试代码)
                delete nodeStack.top();
                nodeStack.pop();
            }
            nodeStack.top()->children[index] = NULL;//断开已删除的部分
            while(!nodeStack.empty())
            {
                nodeStack.top()->count--;//单词占用记录减1
                nodeStack.pop();
            }
            return true;
        }
    }
    size_t itemCount() const//字典中单词种数
    {
        return root->count;
    }
    void printStrWithPre(const string prefix) const//打印有指定前缀的单词
    {
        if(prefix.size() == 0)
            return;
        TrieNode *p = root;
        int index,printID = 0;
        for(int i = 0; i < prefix.size(); ++i)
        {
            index = prefix[i] - 'a';
            if(p->children[index] == NULL)//前缀还没匹配成功
            {
                cout << "-------------------------" << endl;
                cout << "no string with prefix: " << prefix << " can be found!" << endl;
                return;
            }
            else
                p = p->children[index];
        }//匹配完了，p指向前缀最后一个字符节点
        cout << "-------------------------" << endl;
        cout << p->count << " string(s) with prefix: " << prefix << " , as following:" << endl;
        printWordsOfNode(p,prefix,printID);
        cout << "-----------end-----------" << endl;
    }
    void printDict() const//字典序输出全部单词
    {
        string word("");
        int printID = 0;
        cout << "-------------------------" << endl;
        cout << "all " << itemCount() << " words as following:" << endl;
        printWordsOfNode(root,word,printID);
        cout << "-----------end-----------" << endl;
    }
private:
    void printWordsOfNode(TrieNode* p, string prefix, int &order) const
    {//递归打印前缀最后一个字符对应节点下面所有的字符
        if(p != NULL)
        {
            if(p->isEndOfWord)//是终止字符，prefix是不断+出来的，是整个字符串
                cout << ++order << " " << prefix << ", frequency: " << p->freq << endl;
            for(int i = 0; i < charNum; ++i)
            {
                if(p->children[i] != NULL)
                    printWordsOfNode(p->children[i],prefix+(p->children[i]->data),order);
            }
        }
    }
};
int main()
{
    Trie textlib;
    string a("hello"), b("her"), c("so"), d("hi"), e("how"), f("see");
    textlib.insert(a);
    textlib.insert(a);
    textlib.insert(b);
    textlib.insert(c);
    textlib.insert(d);
    textlib.insert(e);
    textlib.insert(f);
    textlib.find(a);
    textlib.find(b);
    textlib.find(d);
    textlib.printStrWithPre("h");
    textlib.printDict();
    textlib.delString("hello");
    textlib.find(a);
    textlib.printStrWithPre("h");
    textlib.printDict();
    cout << "total kind(s) of word: " << textlib.itemCount() << endl;
    return 0;
}

复制

4. Trie树与散列表、红黑树的比较

Trie树对要处理的字符串有及其严苛的要求。

第一，字符串中包含的字符集不能太大。如果字符集太大，那存储空间可能就会浪费很多。即便可以优化，也要付出牺牲查询、插入效率的代价。
第二，要求字符串的前缀重合比较多，不然空间消耗会变大很多。
第三，如果要用Trie树解决问题，那我们就要自己从零开始实现一个Trie树，还要保证没有bug，这个在工程上是将简单问题复杂化，除非必须，一般不建议这样做。
第四，通过指针串起来的数据块是不连续的，而Trie树中用到了指针，所以，对缓存并不友好，性能上会打个折扣。
综合这几点，针对在一组字符串中查找字符串的问题，工程中，更倾向于用散列表或者红黑树。因为这两种数据结构，我们都不需要自己去实现，直接利用编程语言中提供的现成类库就行了。
Trie 树只是不适合精确匹配查找，这种问题更适合用散列表或者红黑树来解决。
Trie树比较适合的是查找前缀匹配的字符串，例如搜索引擎智能匹配输入，给出候选提示（如果有多个候选，可以按搜索热度排序，上面代码里面的 frequency）。

Trie树还可以应用于自动输入补全（输入法，代码编辑器，浏览器网址输入）

4.1 思考题

上面针对英文的搜索关键词，对于更加复杂的中文来说，词库中的数据又该如何构建成Trie 树呢？
如果词库中有很多关键词，在搜索提示的时候，用户输入关键词，作为前缀在Trie 树中可以匹配的关键词也有很多，如何选择展示哪些内容呢？（按搜索热度或者概率）
像Google 这样的搜索引擎，用户单词拼写错误的情况下，Google还是可以使用正确的拼写来做关键词提示，这个又是怎么做到的呢？

参考文章

https://www.cnblogs.com/xujian2014/p/5614724.html