天天看點

ElasticSearch ik分詞器基礎知識analyzer

概述

IK分詞器提供兩種analyzer: ik_max_word 和 ik_smart,可以根據自己的需要選擇,但一般選用ik_max_word

如果是ik_max_word的話: 會將文本做最細粒度的拆分,比如會將“中華人民共和國國歌”拆分為“中華人民共和國,中華人民,中華,華人,人民共和國,人民,人,民,共和國,共和,和,國國,國歌”,會窮盡各種可能的組合;

如果使用ik_smart的話,會做最粗粒度的拆分,比如會將“中華人民共和國國歌”拆分為“中華人民共和國,國歌”。

如果使用ik_smart,我們搜尋"共和國"能搜到嗎? 搜不到,因為ik_smart只拆出了"中華人民共和國"和"國歌"這兩個詞條,其中沒有"共和國".

結論

一般我們是使用ik_max_word,因為拆分比較細粒度,這樣搜尋效果會更好.

ik分詞器的使用

指定my_type中text欄位的analyzer 為 ik_max_word

PUT /my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "text": {
          "type": "text",
          "analyzer": "ik_max_word"
        }
      }
    }
  }
}      

添加資料

POST /my_index/my_type/_bulk
{ "index": { "_id": "1"} }
{ "text": "男子偷上萬元發紅包求交女友 被抓獲時仍然單身" }
{ "index": { "_id": "2"} }
{ "text": "16歲少女為結婚“變”22歲 7年後想離婚被法院拒絕" }
{ "index": { "_id": "3"} }
{ "text": "深圳女孩騎車逆行撞奔馳 遭索賠被吓哭(圖)" }
{ "index": { "_id": "4"} }
{ "text": "女人對護膚品比對男票好?網友神怼" }
{ "index": { "_id": "5"} }
{ "text": "為什麼國内的街道招牌用的都是紅黃配?" }      

測試一下搜尋效果

檢視分詞效果

GET /my_index/_analyze
{
"text": "男子偷上萬元發紅包求交女友 被抓獲時仍然單身",
"analyzer": "ik_max_word"
}      

結果: 可以看到ik_max_word分出了很多詞條,如下

{
  "tokens": [
    {
      "token": "男子",
      "start_offset": 0,
      "end_offset": 2,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "偷上",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "上萬",
      "start_offset": 3,
      "end_offset": 5,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "萬元",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "萬",
      "start_offset": 4,
      "end_offset": 5,
      "type": "TYPE_CNUM",
      "position": 4
    },
    {
      "token": "元",
      "start_offset": 5,
      "end_offset": 6,
      "type": "COUNT",
      "position": 5
    },
    {
      "token": "發紅包",
      "start_offset": 6,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 6
    },
    {
      "token": "發紅",
      "start_offset": 6,
      "end_offset": 8,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "紅包",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 8
    },
    {
      "token": "求",
      "start_offset": 9,
      "end_offset": 10,
      "type": "CN_CHAR",
      "position": 9
    },
    {
      "token": "交",
      "start_offset": 10,
      "end_offset": 11,
      "type": "CN_CHAR",
      "position": 10
    },
    {
      "token": "女友",
      "start_offset": 11,
      "end_offset": 13,
      "type": "CN_WORD",
      "position": 11
    },
    {
      "token": "被",
      "start_offset": 14,
      "end_offset": 15,
      "type": "CN_CHAR",
      "position": 12
    },
    {
      "token": "抓獲",
      "start_offset": 15,
      "end_offset": 17,
      "type": "CN_WORD",
      "position": 13
    },
    {
      "token": "時",
      "start_offset": 17,
      "end_offset": 18,
      "type": "CN_CHAR",
      "position": 14
    },
    {
      "token": "仍然",
      "start_offset": 18,
      "end_offset": 20,
      "type": "CN_WORD",
      "position": 15
    },
    {
      "token": "單身",
      "start_offset": 20,
      "end_offset": 22,
      "type": "CN_WORD",
      "position": 16
    }
  ]
}      

測試查詢資料

GET /my_index/my_type/_search
{
  "query": {
    "match": {
      "text": "16歲少女結婚好還是單身好?"
    }
  }
}      
{
  "took": 13,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 2.6093416,
    "hits": [
      {
        "_index": "my_index",
        "_type": "my_type",
        "_id": "2",
        "_score": 2.6093416,
        "_source": {
          "text": "16歲少女為結婚“變”22歲 7年後想離婚被法院拒絕"
        }
      },
      {
        "_index": "my_index",
        "_type": "my_type",
        "_id": "4",
        "_score": 1.3300087,
        "_source": {
          "text": "女人對護膚品比對男票好?網友神怼"
        }
      },
      {
        "_index": "my_index",
        "_type": "my_type",
        "_id": "1",
        "_score": 0.26301134,
        "_source": {
          "text": "男子偷上萬元發紅包求交女友 被抓獲時仍然單身"
        }
      }
    ]
  }
}      

繼續閱讀