zabbix 监控zookeeper篇
安装依赖包
<pre style="margin: 0px; padding: 0px; white-space: pre-wrap; overflow-wrap: break-word; font-family: "Courier New" !important; font-size: 12px !important;">yum install -y nc
yum install -y zabbix-sender</pre>
nc 命令
[
复制代码
](javascript:void(0); "复制代码")
<pre style="margin: 0px; padding: 0px; white-space: pre-wrap; overflow-wrap: break-word; font-family: "Courier New" !important; font-size: 12px !important;">echo ruok|nc 127.0.0.1 2181
imok
echo mntr|nc 127.0.0.1 2181
zk_version 3.4.6-1569965, built on 02/20/2014 09:09 GMT
zk_avg_latency 0
zk_max_latency 6
zk_min_latency 0
zk_packets_received 93114
zk_packets_sent 93113
zk_num_alive_connections 4
zk_outstanding_requests 0
zk_server_state leader
zk_znode_count 29
zk_watch_count 0
zk_ephemerals_count 14
zk_approximate_data_size 1087
zk_open_file_descriptor_count 39
zk_max_file_descriptor_count 1000000
zk_followers 4
zk_synced_followers 4
zk_pending_syncs 0
echo srvr|nc 127.0.0.1 2181
Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
Latency min/avg/max: 0/0/6
Received: 93121
Sent: 93120
Connections: 4
Outstanding: 0
Zxid: 0x900000020
Mode: leader
Node count: 29</pre>
ZooKeeper监控项
<pre style="margin: 0px; padding: 0px; white-space: pre-wrap; overflow-wrap: break-word; font-family: "Courier New" !important; font-size: 12px !important;">zk_avg/min/max_latency 响应一个客户端请求的时间,建议这个时间大于10个Tick就报警
zk_outstanding_requests 排队请求的数量,当ZooKeeper超过了它的处理能力时,这个值会增大,建议设置报警阈值为10
zk_packets_received 接收到客户端请求的包数量
zk_packets_sent 发送给客户端的包数量,主要是响应和通知
zk_max_file_descriptor_count 最大允许打开的文件数,由ulimit控制
zk_open_file_descriptor_count 打开文件数量,当这个值大于允许值的85%时报警
Mode 运行的角色,如果没有加入集群就是standalone,加入集群则是follower或者leader
zk_followers leader角色才会有这个输出,集合中follower的个数。正常的值应该是集合成员的数量减1
zk_pending_syncs leader角色才会有这个输出,pending syncs的数量
zk_znode_count znodes的数量
zk_watch_count watches的数量
Java Heap Size ZooKeeper Java进程的堆内存大小</pre>
编写Zabbix监控ZooKeeper的脚本和配置文件
将这些监控数据一次性使用zabbix_sender全部发送给zabbix。采用zabbix_sender一次性发送全部监控数据的脚本,首先想办法将监控项目汇集成一个字典,然后遍历这个字典,将字典中的key:value对通过zabbix_sender的-k和-o参数指定发送出去
vim /etc/zabbix/scripts/zookeeper.py
<pre style="margin: 0px; padding: 0px; white-space: pre-wrap; overflow-wrap: break-word; font-family: "Courier New" !important; font-size: 12px !important;">#!/usr/bin/python
"""{'zk_followers': 0,
'zk_outstanding_requests': 0,
'zk_approximate_data_size': 890971,
'zk_packets_sent': 5818488,
'zk_pending_syncs': 0,
'zk_avg_latency': 0,
'zk_version': '3.4.6-1569965, built on 02/20/2014 09:09 GMT',
'zk_watch_count': 1364,
'zk_packets_received': 5797681,
'zk_open_file_descriptor_count': 46,
'zk_server_ruok': 'imok',
'zk_server_state': 'follower',
'zk_synced_followers': 0,
'zk_max_latency': 400,
'zk_num_alive_connections': 18,
'zk_min_latency': 0,
'zk_ephemerals_count': 1112,
'zk_znode_count': 2207,
'zk_max_file_descriptor_count': 4096}
31022 """
import os
import re
import socket
import subprocess
import sys

try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

# Path of the zabbix_sender binary used to push each metric.
zabbix_sender = '/usr/bin/zabbix_sender'
# Agent configuration file handed to zabbix_sender with -c.
zabbix_conf = '/etc/zabbix/zabbix_agentd.conf'
# Values > 0 really run zabbix_sender; otherwise the command is only printed.
send_to_zabbix = 1

# get zookeeper server status
class ZooKeeperServer(object):
    """Collect status metrics from one ZooKeeper server and relay them to Zabbix.

    Metrics are read with the ``mntr`` and ``ruok`` four-letter-word commands
    over a TCP socket and cached in ``self._result`` by get_stats().
    """

    # Keys that get_stats() back-fills with 0 when the server reports none of
    # them (they are only exposed by a server in the leader role).
    _LEADER_ONLY_KEYS = ('zk_followers', 'zk_synced_followers', 'zk_pending_syncs')

    def __init__(self, host='localhost', port='2181', timeout=1):
        """Remember the server address and the socket timeout.

        :param host: host name or IP address of the ZooKeeper server
        :param port: client port; passed through int(), so '2181' and 2181 both work
        :param timeout: value handed to socket.settimeout()
        """
        self._address = (host, int(port))
        self._timeout = timeout
        self._result = {}

    def _create_socket(self):
        """Return a new, unconnected socket."""
        return socket.socket()

    def _send_cmd(self, cmd):
        """Send a 4letter word command to the server and return the reply as text.

        :param cmd: command word, e.g. 'mntr' or 'ruok'
        :return: the reply read until the peer closes the connection (or until
                 the timeout expires after some data has already arrived)
        :raises socket.error: on connection problems, or on a timeout before
                 any data was received
        """
        s = self._create_socket()
        try:
            s.settimeout(self._timeout)
            s.connect(self._address)
            # Sockets carry bytes; on Python 2 a str already is bytes.
            s.sendall(cmd if isinstance(cmd, bytes) else cmd.encode('ascii'))
            chunks = []
            while True:
                # A single recv() may return only part of the reply, so keep
                # reading until end-of-stream.
                try:
                    chunk = s.recv(4096)
                except socket.timeout:
                    if chunks:
                        break  # keep what we have rather than failing late
                    raise
                if not chunk:
                    break
                chunks.append(chunk)
        finally:
            # Release the socket even when connect/send/recv raised.
            s.close()
        data = b''.join(chunks)
        if not isinstance(data, str):
            data = data.decode('utf-8', 'replace')
        return data

    def get_stats(self):
        """Get ZooKeeper server stats as a map.

        Runs 'mntr' and 'ruok', merges both parsed replies and stores the
        merged dict in ``self._result``. An empty reply simply contributes no
        keys. When none of zk_followers / zk_synced_followers /
        zk_pending_syncs is present, all three are set to 0.

        Example result (abridged)::

            {'zk_avg_latency': 0, 'zk_server_state': 'follower',
             'zk_server_ruok': 'imok', 'zk_followers': 0, ...}

        :return: the merged metrics dict (also kept in ``self._result``)
        """
        data_mntr = self._send_cmd('mntr')
        data_ruok = self._send_cmd('ruok')

        result = {}
        if data_mntr:
            result.update(self._parse(data_mntr))
        if data_ruok:
            # {'zk_server_ruok': 'imok'}
            result.update(self._parse_ruok(data_ruok))

        # the three metrics only exposed on leader role zookeeper server,
        # we just set the followers' to 0
        if not any(key in result for key in self._LEADER_ONLY_KEYS):
            for key in self._LEADER_ONLY_KEYS:
                result[key] = 0

        self._result = result
        return self._result

    def _parse(self, data):
        """Parse the output from the 'mntr' 4letter word command.

        :param data: tab-separated "key<TAB>value" lines
        :return: e.g. {'zk_outstanding_requests': 0, 'zk_version': '3.4.6-...'}
        """
        result = {}
        for line in data.split('\n'):
            try:
                key, value = self._parse_line(line)
            except ValueError:
                continue  # ignore broken lines
            result[key] = value
        return result

    def _parse_ruok(self, data):
        """Parse the output from the 'ruok' 4letter word command.

        :param data: reply text, e.g. 'imok'
        :return: {'zk_server_ruok': <first line, surrounding whitespace removed>},
                 or {} when that line is empty
        """
        ruok = data.split('\n', 1)[0].strip()
        result = {}
        if ruok:
            result['zk_server_ruok'] = ruok
        return result

    def _parse_line(self, line):
        """Split one "key<TAB>value" line into a (key, value) tuple.

        The value is converted to int when possible and left as a string
        otherwise.

        :raises ValueError: if the line does not have exactly two
                 tab-separated fields, or the key is empty
        """
        # zk_watch_count 1482
        # zk_max_file_descriptor_count 65535
        parts = [part.strip() for part in line.split('\t')]
        if len(parts) != 2:
            raise ValueError('Found invalid line: %s' % line)
        key, value = parts
        if not key:
            raise ValueError('The key is mandatory and should not be empty')
        try:
            value = int(value)
        except (TypeError, ValueError):
            pass
        return key, value

    def get_pid(self):
        """Return the first PID printed by the ps/grep pipeline, or '' if none."""
        # ps -ef|grep java|grep zookeeper|awk '{print $2}'
        pidarg = '''ps -ef|grep java|grep zookeeper|grep -v grep|awk '{print $2}' '''
        # 31022
        pidout = subprocess.Popen(pidarg, shell=True, stdout=subprocess.PIPE,
                                  universal_newlines=True)
        # communicate() drains the pipe and waits for the child to exit.
        output, _ = pidout.communicate()
        return output.split('\n', 1)[0]

    def send_to_zabbix(self, metric):
        """Push one cached metric to Zabbix with zabbix_sender.

        Uses the module-level settings ``zabbix_sender``, ``zabbix_conf`` and
        ``send_to_zabbix``; when the latter is not > 0 the command is only
        printed.

        :param metric: key in ``self._result``, e.g. 'zk_max_file_descriptor_count'
        :raises KeyError: if ``metric`` is not in ``self._result``
        """
        # key = zookeeper.status[zk_max_file_descriptor_count]
        key = "zookeeper.status[" + metric + "]"
        if send_to_zabbix > 0:
            try:
                # Discard zabbix_sender's own output.
                with open(os.devnull, 'w') as devnull:
                    subprocess.call([zabbix_sender, "-c", zabbix_conf, "-k", key,
                                     "-o", str(self._result[metric])],
                                    stdout=devnull, stderr=devnull, shell=False)
            except OSError as detail:
                print("Something went wrong while exectuting zabbix_sender :  %s" % detail)
        else:
            print("Simulation: the following command would be execucted :\n%s -c %s -k %s -o %s \n"
                  % (zabbix_sender, zabbix_conf, key, self._result[metric]))
def usage():
    """Display program usage and exit with status 1."""
    # Spacing reproduces what the original comma-separated print statement emitted.
    print("\nUsage :  %s  alive|all" % sys.argv[0])
    print("Modes : \n\talive : Return pid of running zookeeper\n\tall : Send zookeeper stats as well")
    sys.exit(1)
accepted_modes = ['alive', 'all'] if len(sys.argv) == 2 and sys.argv[1] in accepted_modes:
mode = sys.argv[1] else:
usage()
zk = ZooKeeperServer() # print zk.get_stats()
pid = zk.get_pid() if pid != "" and mode == 'all':
zk.get_stats() print zk._result
FNULL = open(os.devnull, 'w') for key in zk._result:
zk.send_to_zabbix(key)
FNULL.close() print pid elif pid != "" and mode == "alive": print pid else: print 0</pre>
增加脚本可执行权限
chmod +x /etc/zabbix/scripts/zookeeper.py
zabbix配置文件
vim /etc/zabbix/zabbix_agentd.d/check_zookeeper.conf
<pre style="margin: 0px; padding: 0px; white-space: pre-wrap; overflow-wrap: break-word; font-family: "Courier New" !important; font-size: 12px !important;">UserParameter=zookeeper.status[*],/usr/bin/python /etc/zabbix/scripts/zookeeper.py $1</pre>
重新启动zabbix-agent服务
service zabbix-agent restart
制作Zabbix监控ZooKeeper的模板并设置报警阀值
zookeeper.xml(监控项类型必须是"Zabbix采集器",即 Zabbix trapper,这样才能接收zabbix_sender发送的数据)
<pre style="margin: 0px; padding: 0px; white-space: pre-wrap; overflow-wrap: break-word; font-family: "Courier New" !important; font-size: 12px !important;"><?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
<version>3.0</version>
<date>2017-12-11T08:02:58Z</date>
<groups>
<group>
<name>Zabbix servers</name>
</group>
</groups>
<templates>
<template>
<template>Zookeeper</template>
<name>Zookeeper</name>
<description/>
<applications>
<application>
<name>ZooKeeper Status</name>
</application>
</applications>
<items>
<item>
<name>zookeeper pid</name>
<type>2</type>
<snmp_community/>
<multiplier>0</multiplier>
<snmp_oid/>
<key>zookeeper.status[alive]</key>
<delay>10</delay>
<history>90</history>
<trends>365</trends>
<status>0</status>
<value_type>3</value_type>
<allowed_hosts/>
<units/>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<inventory_link>0</inventory_link>
<valuemap/>
<logtimefmt/>
</item>
<name>zookeeper approximate data size</name>
<key>zookeeper.status[zk_approximate_data_size]</key>
<delay>0</delay>
<name>zookeeper average latency</name>
<key>zookeeper.status[zk_avg_latency]</key>
<name>zookeeper ephemerals count</name>
<key>zookeeper.status[zk_ephemerals_count]</key>
<name>zookeeper leader's followers</name>
<key>zookeeper.status[zk_followers]</key>
<name>zookeeper max file descriptor count</name>
<key>zookeeper.status[zk_max_file_descriptor_count]</key>
<name>zookeeper max latency</name>
<key>zookeeper.status[zk_max_latency]</key>
<name>zookeeper min latency</name>
<key>zookeeper.status[zk_min_latency]</key>
<name>zookeeper alive connections</name>
<key>zookeeper.status[zk_num_alive_connections]</key>
<name>zookeeper opened file descriptor count</name>
<key>zookeeper.status[zk_open_file_descriptor_count]</key>
<name>zookeeper outstanding requests</name>
<key>zookeeper.status[zk_outstanding_requests]</key>
<name>zookeeper packages received</name>
<key>zookeeper.status[zk_packets_received]</key>
<description>收包数量</description>
<name>zookeeper packages sent</name>
<key>zookeeper.status[zk_packets_sent]</key>
<description>发包数据量</description>
<name>zookeeper leader's pending syncs</name>
<key>zookeeper.status[zk_pending_syncs]</key>
<name>zookeeper response checking</name>
<key>zookeeper.status[zk_server_ruok]</key>
<trends>0</trends>
<value_type>1</value_type>
<name>zookeeper state role</name>
<key>zookeeper.status[zk_server_state]</key>
<name>zookeeper leader's synced followers</name>
<key>zookeeper.status[zk_synced_followers]</key>
<name>zookeeper version</name>
<key>zookeeper.status[zk_version]</key>
<name>zookeeper watches count</name>
<key>zookeeper.status[zk_watch_count]</key>
<name>zookeeper znodes count</name>
<key>zookeeper.status[zk_znode_count]</key>
</items>
<discovery_rules/>
<macros/>
<templates/>
<screens/>
</template>
</templates>
<triggers>
<trigger>
<expression>{Zookeeper:zookeeper.status[zk_outstanding_requests].last()}>10</expression>
<name>big outstanding requests number</name>
<url/>
<priority>0</priority>
<type>0</type>
<dependencies/>
</trigger>
<expression>{Zookeeper:zookeeper.status[zk_pending_syncs].last()}>10</expression>
<name>big pending syncs</name>
<expression>{Zookeeper:zookeeper.status[zk_avg_latency].last()}>10</expression>
<name>large average latency</name>
<expression>{Zookeeper:zookeeper.status[zk_open_file_descriptor_count].last()} > {Zookeeper:zookeeper.status[zk_max_file_descriptor_count].last()}*0.85</expression>
<name>large file descriptor used</name>
<expression>{Zookeeper:zookeeper.status[zk_server_ruok].str(imok)}<>1</expression>
<name>zookeeper is abnormal</name>
<priority>4</priority>
<expression>{Zookeeper:zookeeper.status[alive].last()}=0</expression>
<name>zookeeper is not running</name>
<expression>{Zookeeper:zookeeper.status[zk_server_state].abschange()}>0</expression>
<name>zookeeper state role has been changed</name>
<priority>1</priority>
</triggers>
<graphs>
<graph>
<name>ZooKeeper Alive Connections</name>
<width>900</width>
<height>200</height>
<yaxismin>0.0000</yaxismin>
<yaxismax>100.0000</yaxismax>
<show_work_period>1</show_work_period>
<show_triggers>1</show_triggers>
<show_legend>1</show_legend>
<show_3d>0</show_3d>
<percent_left>0.0000</percent_left>
<percent_right>0.0000</percent_right>
<ymin_type_1>0</ymin_type_1>
<ymax_type_1>0</ymax_type_1>
<ymin_item_1>0</ymin_item_1>
<ymax_item_1>0</ymax_item_1>
<graph_items>
<graph_item>
<sortorder>0</sortorder>
<drawtype>0</drawtype>
<color>1A7C11</color>
<yaxisside>0</yaxisside>
<calc_fnc>2</calc_fnc>
<host>Zookeeper</host>
</graph_item>
</graph_items>
</graph>
<name>ZooKeeper Latency</name>
</graphs>
</zabbix_export></pre>
导入zabbix
image