天天看点

python 监控系统负载 cpu 内存 并实现发送邮件或杀死爬虫脚本

背景

部署在远程服务器上的爬虫, 经常会因为系统负载、CPU 或内存占用过高, 导致本地无法通过 ssh 连接服务器

废话就不多说, 直接上代码, 主要注释我就写在代码里

主要用的是 psutil 模块

# -*- coding: utf-8 -*-
# @Author: Mehaei
# @Date:   2019-08-27 16:57:58
# @Last Modified by:   Mehaei
# @Last Modified time: 2019-08-30 17:00:04
import os
import sys

# Make the script startable from any working directory: switch into the
# directory that holds this file, then add its parent directory to the import
# path so that the project's own modules can be imported.
work_dir, file_name = os.path.split(__file__)
os.chdir(work_dir or "./")
sys.path.append("../")

import time
# 主要使用的模块
import psutil
from submit_data.to_email import ToEmail

# Project name; used in the subject line of notification emails.
PROJECT_NAME = "Test"
# Seconds to sleep between two checks (one check every 10 minutes).
_CHECK_TIME_INTERVAL = 60 * 10


# Memory threshold; compared against psutil's memory usage percentage.
_MEMORY_NORMAL = 90

# Load-average threshold: the number of CPU cores.
# psutil.cpu_count() returns None when the count cannot be determined; fall
# back to 1 so the later numeric comparison never sees None.
_LOADAVG_NORMAL = psutil.cpu_count() or 1

# CPU threshold; compared against psutil's CPU usage percentage.
_CPU_NORMAL = 90

# Paths whose disk usage is checked.
_DISK_MONITOR_LIST = ["/home"]

# Disk threshold; compared against psutil's disk usage percentage.
_DISK_NORMAL = 90

# Crawler script file names used to recognise crawler processes to kill.
_CRAWL_SPIDER_FILE = ["spider.py"]

# Full crawler launch commands used to recognise crawler processes to kill.
_KILL_PROCESS_COMMAND = ["python3 ./spider.py"]

CPU = "cpu"
MEMORY = "memory"
DISK = "disk"
SYS_LOAD = "sys_load"

NORMAL = "Normal"

# Resources to monitor.
MONITOR_LIST = [CPU, MEMORY, SYS_LOAD]

# Either "inform" or "kill".
# "inform" sends a notification email; "kill" is meant to terminate the
# crawler processes.
EXCEPTION_HANDLING_METHOD = "inform"

# Minimum number of seconds between two notification emails; while a problem
# persists inside this window no further email is sent.
SEND_EMAIL_INTERVAL = 60 * 60


class ResourceMonitor(object):
    """Periodically check system resources and react when one is overloaded.

    Creating an instance starts the monitoring loop immediately; the loop
    never returns.  The resources to check, their thresholds and the reaction
    ("inform" by email or "kill" the crawler processes) are all read from the
    module-level configuration constants.
    """

    def __init__(self):
        # NOTE: blocks forever -- the constructor does not return.
        self._keep_check()

    def _keep_check(self) -> None:
        """Run the check / react / sleep cycle forever."""
        send_time = 0
        while True:
            check_result = self.check_hardware_status()
            # React once per cycle, no matter how many resources are abnormal.
            if any(status != NORMAL for status in check_result.values()):
                if EXCEPTION_HANDLING_METHOD == "inform":
                    # Rate limit: at most one email per SEND_EMAIL_INTERVAL.
                    if (time.time() - send_time) >= SEND_EMAIL_INTERVAL:
                        ToEmail(SUBJECT="%s ResourceMonitor Exception" % PROJECT_NAME).send(["%s: %s" % (h, s) for h, s in check_result.items()])
                        send_time = time.time()
                elif EXCEPTION_HANDLING_METHOD == "kill":
                    self.kill_crawl_process()
                # Any other configured method is ignored on purpose.
            time.sleep(_CHECK_TIME_INTERVAL)

    def check_hardware_status(self) -> dict:
        """Check every resource listed in MONITOR_LIST.

        Returns a dict that maps a resource name to NORMAL, or to a message
        describing the excessive usage.  Disk entries (only produced when
        DISK is in MONITOR_LIST) are keyed "disk <path>" and carry the status
        string returned by disk_monitor().
        """
        result = {}
        error_msg = "%s exception, usage rate: %s"

        if MEMORY in MONITOR_LIST:
            memory_usage_rate = self.memory_monitor()
            if memory_usage_rate > _MEMORY_NORMAL:
                result[MEMORY] = error_msg % (MEMORY, memory_usage_rate)
            else:
                result[MEMORY] = NORMAL

        if CPU in MONITOR_LIST:
            cpu_usage_rate = self.cpu_monitor()
            if cpu_usage_rate > _CPU_NORMAL:
                result[CPU] = error_msg % (CPU, cpu_usage_rate)
            else:
                result[CPU] = NORMAL

        if SYS_LOAD in MONITOR_LIST:
            # Only the first element of the load-average tuple is compared.
            loadavg = self.loadavg_monitor()
            if loadavg[0] > _LOADAVG_NORMAL:
                result[SYS_LOAD] = error_msg % (SYS_LOAD, loadavg[0])
            else:
                result[SYS_LOAD] = NORMAL

        if DISK in MONITOR_LIST:
            for path, status in self.disk_monitor().items():
                result["%s %s" % (DISK, path)] = status

        return result

    def memory_monitor(self) -> float:
        """Return the memory usage percentage reported by psutil."""
        memory = psutil.virtual_memory()
        return memory.percent

    def cpu_monitor(self):
        """Return the system-wide CPU usage percentage reported by psutil."""
        # A short blocking interval is used so that the very first call does
        # not return a meaningless 0.0 or 100.0.
        return psutil.cpu_percent(interval=0.01, percpu=False)

    def disk_monitor(self) -> dict:
        """Check every path in _DISK_MONITOR_LIST.

        Returns a dict mapping each path to "Normal" when its usage is below
        _DISK_NORMAL and to "Error" otherwise.
        """
        check_result = {}
        for path in _DISK_MONITOR_LIST:
            usage_rate = self.disk_status(path)
            if usage_rate < _DISK_NORMAL:
                check_result[path] = "Normal"
            else:
                check_result[path] = "Error"
        return check_result

    def disk_status(self, path: str) -> float:
        """Return the disk usage percentage for *path* (for example "/home")."""
        disk_usage_rate = psutil.disk_usage(path)
        return disk_usage_rate.percent

    def loadavg_monitor(self) -> tuple:
        """Return the system load-average tuple reported by psutil."""
        return psutil.getloadavg()

    @staticmethod
    def _is_crawl_cmdline(cmdline, kill_commands, spider_files) -> bool:
        """Tell whether a process command line belongs to a crawler.

        cmdline is the argument list of a process.  It matches when the
        space-joined command is one of kill_commands, or when it has the
        form ["python3", <path>] and the file name of <path> is one of
        spider_files.  An empty command line never matches.
        """
        if " ".join(cmdline) in kill_commands:
            return True
        # Short-circuit evaluation keeps cmdline[0] safe for an empty list.
        return (
            len(cmdline) == 2
            and cmdline[0] == "python3"
            and cmdline[-1].split("/")[-1] in spider_files
        )

    def kill_crawl_process(self) -> None:
        """Terminate every running process recognised as a crawler.

        Processes are matched against _KILL_PROCESS_COMMAND and
        _CRAWL_SPIDER_FILE.  Processes that cannot be inspected or terminated
        are skipped.
        """
        for pid in psutil.pids():
            try:
                proc = psutil.Process(pid)
                if self._is_crawl_cmdline(proc.cmdline(), _KILL_PROCESS_COMMAND, _CRAWL_SPIDER_FILE):
                    # terminate() belongs to the Process object, not to the
                    # command-line list.
                    proc.terminate()
            except (psutil.Error, OSError):
                # Best effort: the process may have exited already or may be
                # owned by another user.
                continue


if __name__ == "__main__":
    # Start monitoring: ResourceMonitor.__init__ calls _keep_check(), whose
    # `while True` loop never returns.
    ResourceMonitor()
           

# 写在最后

模块还有很多要修改的地方, 如有错误欢迎交流

公众号刚开通, 请各路大神多多指教

python 监控系统负载 cpu 内存 并实现发送邮件或杀死爬虫脚本

感谢关注