夜莺(Nightingale)是一个企业级监控解决方案。旨在满足云原生时代企业级的监控需求。Nightingale 在产品完成度、系统高可用、以及用户体验方面,达到了企业级的要求,可满足不同规模用户的场景,小到几台服务,大到数十万都可以完美支撑。兼顾云原生和裸金属,支持应用监控和系统监控,插件机制灵活,插件丰富完善,具有高度的灵活性和可扩展性。
Nightingale 在 Open-Falcon 的基础上,结合滴滴内部的最佳实践,在性能、可维护性、易用性方面做了大量的改进,作为集团统一的监控解决方案,支撑了滴滴内部数十亿监控指标,覆盖了从系统、容器、到应用等各层面的监控需求,周活跃用户数千。
1、以下是整个安装步骤及过程:
[root@localhost ~]# cd ~ [root@localhost ~]# pwd /root [root@localhost ~]#wget https://studygolang.com/dl/golang/go1.14.4.linux-amd64.tar.gz #下载go环境 [root@localhost ~]#tar -zxvf go1.14.4.linux-amd64.tar.gz #解压 [root@localhost ~]#vim .bash_profile #配置go环境变量 export GOROOT=/root/go export GOPATH=/root/gopath PATH=$GOROOT/bin:$GOPATH/bin:$PATH:$HOME/bin [root@localhost ~]#source .bash_profile #配置文件立马生效 [root@localhost ~]#mkdir -p gopath/src/github.com/didi #创建目录 [root@localhost ~]#cd gopath/src/github.com/didi #进入新创建的目录下 [root@localhost didi]# git clone https://github.com/didi/nightingale.git #git克隆Nightingale源代码 [root@localhost didi]# cd nightingale/ #进入到Nightingale目录下 [root@localhost nightingale]# ll #源项目目录结构 -rwxr-xr-x 1 root root 3721 7月 1 17:17 control drwxr-xr-x 3 root root 18 7月 1 17:56 data drwxr-xr-x 2 root root 20 7月 1 17:17 doc drwxr-xr-x 3 root root 19 7月 1 17:17 docker -rw-r--r-- 1 root root 1551 7月 1 17:17 docker-compose.yml -rw-r--r-- 1 root root 215 7月 1 17:17 Dockerfile drwxr-xr-x 5 root root 207 7月 1 17:50 etc -rw-r--r-- 1 root root 1169 7月 1 17:17 go.mod -rw-r--r-- 1 root root 59578 7月 1 17:17 go.sum -rw-r--r-- 1 root root 11401 7月 1 17:17 LICENSE drwxr-xr-x 8 root root 91 7月 1 17:56 logs drwxr-xr-x 2 root root 53 7月 1 17:17 plugin drwxr-xr-x 3 root root 212 7月 1 17:17 pub -rw-r--r-- 1 root root 2549 7月 1 17:17 README.md -rw-r--r-- 1 root root 2092 7月 1 17:17 README_ZH.md drwxr-xr-x 2 root root 88 7月 1 17:17 sql drwxr-xr-x 6 root root 65 7月 1 17:17 src drwxr-xr-x 9 root root 153 7月 1 17:17 vendor drwxr-xr-x 5 root root 265 7月 1 17:17 web [root@localhost nightingale]# ./control build #构建Nightingale [root@localhost nightingale]# ll #编译后的目录结构,编译后生成:n9e-collector、n9e-index、n9e-judge、n9e-monapi、n9e-transfer、n9e-tsdb可执行文件 -rwxr-xr-x 1 root root 3721 7月 1 17:17 control drwxr-xr-x 3 root root 18 7月 1 17:56 data drwxr-xr-x 2 root root 20 7月 1 17:17 doc drwxr-xr-x 3 root root 19 7月 1 17:17 docker -rw-r--r-- 1 root root 1551 7月 1 17:17 docker-compose.yml 
-rw-r--r-- 1 root root 215 7月 1 17:17 Dockerfile drwxr-xr-x 5 root root 207 7月 1 17:50 etc -rw-r--r-- 1 root root 1169 7月 1 17:17 go.mod -rw-r--r-- 1 root root 59578 7月 1 17:17 go.sum -rw-r--r-- 1 root root 11401 7月 1 17:17 LICENSE drwxr-xr-x 8 root root 91 7月 1 17:56 logs -rwxr-xr-x 1 root root 22598923 7月 1 17:18 n9e-collector -rwxr-xr-x 1 root root 21947066 7月 1 17:18 n9e-index -rwxr-xr-x 1 root root 21932629 7月 1 17:18 n9e-judge -rwxr-xr-x 1 root root 22550169 7月 1 17:18 n9e-monapi -rwxr-xr-x 1 root root 25309584 7月 1 17:18 n9e-transfer -rwxr-xr-x 1 root root 20324072 7月 1 17:18 n9e-tsdb drwxr-xr-x 2 root root 53 7月 1 17:17 plugin drwxr-xr-x 3 root root 212 7月 1 17:17 pub -rw-r--r-- 1 root root 2549 7月 1 17:17 README.md -rw-r--r-- 1 root root 2092 7月 1 17:17 README_ZH.md drwxr-xr-x 2 root root 88 7月 1 17:17 sql drwxr-xr-x 6 root root 65 7月 1 17:17 src drwxr-xr-x 9 root root 153 7月 1 17:17 vendor drwxr-xr-x 5 root root 265 7月 1 17:17 web [root@localhost nightingale]# yum install -y mariadb mariadb-server redis #安装MySQL和Redis [root@localhost nightingale]# vim /etc/my.cnf #修改MySQL配置文件 bind-address=127.0.0.1 #配置好配置文件后重启MySQL [root@localhost nightingale]# systemctl start mariadb.service #启动MySQL [root@localhost nightingale]# systemctl enable mariadb.service #设置MySQL开机自启动 [root@localhost nightingale]# systemctl status mariadb.service #查看MySQL状态 [root@localhost nightingale]# mysql_secure_installation #设置数据库用户名、密码以及其他选项 ... Enter current password for root (enter for none): #直接回车 ... Set root password?[Y/n] Y #配置文件默认密码是1234 New password:1234 Re-enter new password:1234 ... Remove anonymous users?[Y/n]Y ... Disallow root login remotely?[Y/n]n ... Reload privilege tables now?[Y/n]Y ... Thanks for using MariaDB! 
[root@localhost nightingale]# systemctl stop mariadb.service #关闭MySQL [root@localhost nightingale]# systemctl start mariadb.service #启动MySQL [root@localhost nightingale]# systemctl status mariadb.service #查看MySQL状态 [root@localhost nightingale]# systemctl start redis #启动Redis [root@localhost nightingale]# systemctl enable redis #设置Redis开机自启动 [root@localhost nightingale]# systemctl status redis #查看Redis状态 [root@localhost nightingale]# vim /etc/redis.conf #修改Redis配置文件 # requirepass foobared #去掉注释#可设置自定义密码,本次实验不进行设置 [root@localhost etc]# ss -tlnp State Recv-Q Send-Q Local Address:Port Peer Address:Port ... LISTEN 0 50 127.0.0.1:3306 : users:(("mysqld",pid=30391,fd=14)) LISTEN 0 128 127.0.0.1:6379 : users:(("redis-server",pid=24126,fd=4)) ... [root@localhost nightingale]# cd etc/ [root@localhost etc]# vim mysql.yml #设置mysql用户名和密码 --- uic: addr: "root:1234@tcp(127.0.0.1:3306)/n9e_uic?charset=utf8&parseTime=True&loc=Asia%2FShanghai" max: 16 idle: 4 debug: false mon: addr: "root:1234@tcp(127.0.0.1:3306)/n9e_mon?charset=utf8&parseTime=True&loc=Asia%2FShanghai" max: 16 idle: 4 debug: false hbs: addr: "root:1234@tcp(127.0.0.1:3306)/n9e_hbs?charset=utf8&parseTime=True&loc=Asia%2FShanghai" max: 16 idle: 4 debug: false [root@localhost etc]# cd ../sql/ #进入到sql目录下,导入sql文件 [root@localhost sql]# ll -rw-r--r-- 1 root root 480 7月 1 17:17 n9e_hbs.sql -rw-r--r-- 1 root root 13862 7月 1 17:17 n9e_mon.sql -rw-r--r-- 1 root root 1346 7月 1 17:17 n9e_uic.sql -rw-r--r-- 1 root root 122 7月 1 17:17 upgrade_2.3.0.sql [root@localhost sql]#mysql -uroot -p1234 < n9e_hbs.sql #导入n9e_hbs.sql文件 [root@localhost sql]#mysql -uroot -p1234 < n9e_mon.sql #导入n9e_mon.sql文件 [root@localhost sql]#mysql -uroot -p1234 < n9e_uic.sql #导入n9e_uic.sql文件 [root@localhost sql]# cd ../ [root@localhost nightingale]# yum install -y nginx #安装NGINX [root@localhost nightingale]# cd etc/ [root@localhost etc]# ll -rw-r--r-- 1 root root 324 7月 1 17:17 address.yml -rw-r--r-- 1 root root 827 7月 1 17:17 collector.yml -rw-r--r-- 1 root 
root 204 7月 1 17:17 index.yml -rw-r--r-- 1 root root 415 7月 1 17:17 judge.yml drwxr-xr-x 2 root root 30 7月 1 17:17 log -rw-r--r-- 1 root root 1240 7月 1 17:17 monapi.yml -rw-r--r-- 1 root root 412 7月 1 17:44 mysql.yml -rw-r--r-- 1 root root 2352 7月 1 17:17 nginx.conf drwxr-xr-x 2 root root 19 7月 1 17:17 port drwxr-xr-x 2 root root 163 7月 1 17:17 service -rw-r--r-- 1 root root 497 7月 1 17:17 transfer.yml -rw-r--r-- 1 root root 108 7月 1 17:17 tsdb.yml [root@localhost etc]# cp nginx.conf /etc/nginx/nginx.conf [root@localhost etc]# cd ../pub/ [root@localhost pub]# pwd /root/gopath/src/github.com/didi/nightingale/pub [root@localhost nightingale]# vim /etc/nginx/nginx.conf #修改配置文件 location / { root /root/gopath/src/github.com/didi/nightingale/pub; } #[root@localhost nightingale]# vim /etc/nginx.conf #修改配置文件 #location / { # root /root/gopath/src/github.com/didi/nightingale/pub; #} [root@localhost nightingale]# systemctl start nginx.service #启动NGINX [root@localhost nightingale]# systemctl enable nginx.service #设置开机自启动NGINX [root@localhost nightingale]# systemctl status nginx.service #查看NGINX启动状态 [root@localhost nightingale]# ps -ef | grep nginx root 40157 1 0 17:53 ? 00:00:00 nginx: master process /usr/sbin/nginx root 40158 40157 1 17:53 ? 00:00:00 nginx: worker process root 40159 40157 2 17:53 ? 00:00:00 nginx: worker process root 40160 40157 2 17:53 ? 00:00:00 nginx: worker process root 40161 40157 3 17:53 ? 00:00:01 nginx: worker process root 40609 2346 0 17:54 pts/1 00:00:00 grep --color=auto nginx [root@localhost etc]# ss -tlnp State Recv-Q Send-Q Local Address:Port Peer Address:Port ... LISTEN 0 50 127.0.0.1:3306 : users:(("mysqld",pid=30391,fd=14)) LISTEN 0 128 127.0.0.1:6379 : users:(("redis-server",pid=24126,fd=4)) LISTEN 0 128 :80 :* users:(("nginx",pid=40161,fd=6),("nginx",pid=40160,fd=6),("nginx",pid=40159,fd=6),("nginx",pid=40158,fd=6),("nginx",pid=40157,fd=6)) ... 
[root@localhost nightingale]# ./control start all #启动Nightingale全部模块(共六个) monapi started transfer started tsdb started index started judge started collector started [root@localhost etc]# ss -tlnp State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 128 :22 :* users:(("sshd",pid=1108,fd=3)) LISTEN 0 128 127.0.0.1:631 : users:(("cupsd",pid=1105,fd=12)) LISTEN 0 100 127.0.0.1:25 : users:(("master",pid=1450,fd=13)) LISTEN 0 128 127.0.0.1:6010 : users:(("sshd",pid=73544,fd=9)) LISTEN 0 128 127.0.0.1:6011 : users:(("sshd",pid=2344,fd=9)) LISTEN 0 50 127.0.0.1:3306 : users:(("mysqld",pid=30391,fd=14)) LISTEN 0 128 127.0.0.1:6379 : users:(("redis-server",pid=24126,fd=4)) LISTEN 0 128 :111 :* users:(("rpcbind",pid=754,fd=8)) LISTEN 0 128 :80 :* users:(("nginx",pid=40161,fd=6),("nginx",pid=40160,fd=6),("nginx",pid=40159,fd=6),("nginx",pid=40158,fd=6),("nginx",pid=40157,fd=6)) LISTEN 0 128 [::]:22 [::]:* users:(("sshd",pid=1108,fd=4)) LISTEN 0 128 [::1]:631 [::]:* users:(("cupsd",pid=1105,fd=11)) LISTEN 0 100 [::1]:25 [::]:* users:(("master",pid=1450,fd=14)) LISTEN 0 128 [::1]:6010 [::]:* users:(("sshd",pid=73544,fd=8)) LISTEN 0 128 [::1]:6011 [::]:* users:(("sshd",pid=2344,fd=8)) LISTEN 0 128 [::]:5820 [::]:* users:(("n9e-tsdb",pid=42357,fd=13)) LISTEN 0 128 [::]:5821 [::]:* users:(("n9e-tsdb",pid=42357,fd=11)) LISTEN 0 128 [::]:5830 [::]:* users:(("n9e-index",pid=42377,fd=12)) LISTEN 0 128 [::]:5831 [::]:* users:(("n9e-index",pid=42377,fd=13)) LISTEN 0 128 [::]:5800 [::]:* users:(("n9e-monapi",pid=42313,fd=18)) LISTEN 0 128 [::]:2058 [::]:* users:(("n9e-collector",pid=42423,fd=12)) LISTEN 0 128 [::]:111 [::]:* users:(("rpcbind",pid=754,fd=11)) LISTEN 0 128 [::]:5840 [::]:* users:(("n9e-judge",pid=42406,fd=14)) LISTEN 0 128 [::]:5841 [::]:* users:(("n9e-judge",pid=42406,fd=12)) LISTEN 0 128 [::]:5810 [::]:* users:(("n9e-transfer",pid=42337,fd=12)) LISTEN 0 128 [::]:5811 [::]:* users:(("n9e-transfer",pid=42337,fd=14)) [root@localhost nightingale]# ./control 
status #查看模块启动状态 root 42313 0.6 0.2 923028 11532 pts/1 Sl 17:56 0:00 /root/gopath/src/github.com/didi/nightingale/n9e-monapi root 42337 1.5 0.2 990472 10668 pts/1 Sl 17:56 0:00 /root/gopath/src/github.com/didi/nightingale/n9e-transfer root 42357 0.3 0.2 990160 9700 pts/1 Sl 17:56 0:00 /root/gopath/src/github.com/didi/nightingale/n9e-tsdb root 42377 0.3 0.2 988036 8748 pts/1 Sl 17:56 0:00 /root/gopath/src/github.com/didi/nightingale/n9e-index root 42406 0.3 0.2 1053580 8956 pts/1 Sl 17:56 0:00 /root/gopath/src/github.com/didi/nightingale/n9e-judge root 42423 0.9 0.7 857508 29668 pts/1 Sl 17:56 0:00 /root/gopath/src/github.com/didi/nightingale/n9e-collector [root@localhost nightingale]# ss -tlnp | grep 58 LISTEN 0 128 :80 :* users:(("nginx",pid=40161,fd=6),("nginx",pid=40160,fd=6),("nginx",pid=40159,fd=6),("nginx",pid=40158,fd=6),("nginx",pid=40157,fd=6)) LISTEN 0 128 [::]:5820 [::]:* users:(("n9e-tsdb",pid=42357,fd=13)) LISTEN 0 128 [::]:5821 [::]:* users:(("n9e-tsdb",pid=42357,fd=11)) LISTEN 0 128 [::]:5830 [::]:* users:(("n9e-index",pid=42377,fd=12)) LISTEN 0 128 [::]:5831 [::]:* users:(("n9e-index",pid=42377,fd=13)) LISTEN 0 128 [::]:5800 [::]:* users:(("n9e-monapi",pid=42313,fd=18)) LISTEN 0 128 [::]:2058 [::]:* users:(("n9e-collector",pid=42423,fd=12)) LISTEN 0 128 [::]:5840 [::]:* users:(("n9e-judge",pid=42406,fd=14)) LISTEN 0 128 [::]:5841 [::]:* users:(("n9e-judge",pid=42406,fd=12)) LISTEN 0 128 [::]:5810 [::]:* users:(("n9e-transfer",pid=42337,fd=12)) LISTEN 0 128 [::]:5811 [::]:* users:(("n9e-transfer",pid=42337,fd=14))
2、最后访问IP即可:http://IP
eg:http://192.168.21.145/
3、登录名和密码都为root
端口监控支持两种配置方式,一个是在目标机器的指定目录创建元信息文件,一个是在页面上配置采集策略。
效果图:
进程监控支持两种配置方式,一个是在目标机器的指定目录创建元信息文件,一个是在页面上配置采集策略。
配置方式同端口配置,此处不再赘述。
[root@nightingale proc]# pwd
/root/gopath/src/github.com/didi/nightingale/etc/proc
[root@nightingale proc]# ll
总用量 8
-rw-r--r-- 1 root root 10 7月 7 22:22 10_cmdline_n9e-judge
-rw-r--r-- 1 root root 9 7月 7 22:19 10_name_n9e-transfer
[root@nightingale proc]# cat 10_cmdline_n9e-judge
n9e-judge
[root@nightingale proc]# cat 10_name_n9e-transfer
transfer
[root@nightingale proc]#
效果图:
日志监控支持两种配置方式,一种是在目标机器的指定目录创建元信息配置文件,一种是在页面上配置采集策略。
如此,上面的这条json配置代表什么意思呢?表示会每隔10s采集一次 /var/log/messages 文件,匹配 “Out of Memory” 这个关键字,看总共可以匹配到多少行(即cnt函数),把行数作为监控指标的值上报。这样服务端就可以配置告警策略,当log.sys.oom的值大于0就报警,说明系统出现了OOM。
效果图:
对于插件,有如下几个要求:
- 插件脚本必须具有可执行权限,部署完了脚本记得chmod +x一下。
- 插件脚本可以是sh、py、pl、rb,甚至可以是二进制,只要机器上有runtime环境。
- 插件脚本的命名:${step}_xx.xx,比如20_uptime.sh,${step}是在告诉collector多久跑一次。
- plugin目录下非${step}_xx.xx命名格式的文件或者目录可以存在没关系,collector不会识别为插件。
- 插件执行之后要在stdout输出一个json array,collector会截获这个输出,解析为监控指标上报。
- 如果插件执行报错了,报错消息要打印到stderr,不要打印到stdout。

官方给出的两个参考例子:
下面给一个shell编写的插件例子20_uptime.sh:
#!/bin/bash

duration=$(cat /proc/uptime | awk '{print $1}')
localip=$(ifconfig `route|grep '^default'|awk '{print $NF}'`|grep inet|awk '{print $2}'|awk -F ':' '{print $NF}'|head -n 1)
step=$(basename $0|awk -F'_' '{print $1}')
echo '[
    {
        "endpoint": "'${localip}'",
        "tags": "",
        "timestamp": '$(date +%s)',
        "metric": "sys.uptime.duration",
        "value": '${duration}',
        "step": '${step}'
    }
]'

下面给一个python编写的插件例子20_plugin_status.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import time
import commands
import json
import sys
import os

items = []


def collect_myself_status():
    item = {}
    item["metric"] = "plugin.myself.status"
    item["value"] = 1
    item["tags"] = ""
    items.append(item)


def main():
    code, endpoint = commands.getstatusoutput(
        "timeout 1 ifconfig `route|grep '^default'|awk '{print $NF}'`|grep inet|awk '{print $2}'|awk -F ':' '{print $NF}'|head -n 1")
    if code != 0:
        sys.stderr.write('cannot get local ip')
        return

    timestamp = int("%d" % time.time())
    plugin_name = os.path.basename(sys.argv[0])
    step = int(plugin_name.split("_", 1)[0])

    collect_myself_status()

    for item in items:
        item["endpoint"] = endpoint
        item["timestamp"] = timestamp
        item["step"] = step

    print json.dumps(items)


if __name__ == "__main__":
    main()

自定义的插件脚本:
[root@nightingale plugin]# pwd
/root/gopath/src/github.com/didi/nightingale/plugin
[root@nightingale plugin]# ll
总用量 12
-rwxr-xr-x 1 root root 1717 7月 8 00:30 10_ss.sh
-rwxr-xr-x 1 root root 999 7月 8 22:23 60_plugin_status.py
-rwxr-xr-x 1 root root 466 7月 1 17:17 60_uptime.sh
[root@nightingale plugin]# cat 10_ss.sh
#!/bin/bash
# author: ulric.qin@gmail.com

#TCP: 140 (estab 82, closed 33, orphaned 0, synrecv 0, timewait 32/0), ports 0
output=$(ss -s | grep TCP:)
ss_estab=$(echo $output | grep -Po "estab (\d+)" | awk '{print $2}')
ss_closed=$(echo $output | grep -Po "closed (\d+)" | awk '{print $2}')
ss_orphaned=$(echo $output | grep -Po "orphaned (\d+)" | awk '{print $2}')
ss_synrecv=$(echo $output | grep -Po "synrecv (\d+)" | awk '{print $2}')
ss_timewait=$(echo $output | grep -Po "timewait (\d+)" | awk '{print $2}')

localip=$(/usr/sbin/ifconfig `/usr/sbin/route|grep '^default'|awk '{print $NF}'`|grep inet|awk '{print $2}'|head -n 1)
step=$(basename $0|awk -F'_' '{print $1}')
timestamp=$(date +%s)

echo '[
    {
        "endpoint": "'${localip}'",
        "tags": "",
        "timestamp": '${timestamp}',
        "metric": "net.ss.estab",
        "value": '${ss_estab}',
        "step": '${step}'
    },
    {
        "endpoint": "'${localip}'",
        "tags": "",
        "timestamp": '${timestamp}',
        "metric": "net.ss.closed",
        "value": '${ss_closed}',
        "step": '${step}'
    },
    {
        "endpoint": "'${localip}'",
        "tags": "",
        "timestamp": '${timestamp}',
        "metric": "net.ss.orphaned",
        "value": '${ss_orphaned}',
        "step": '${step}'
    },
    {
        "endpoint": "'${localip}'",
        "tags": "",
        "timestamp": '${timestamp}',
        "metric": "net.ss.synrecv",
        "value": '${ss_synrecv}',
        "step": '${step}'
    },
    {
        "endpoint": "'${localip}'",
        "tags": "",
        "timestamp": '${timestamp}',
        "metric": "net.ss.timewait",
        "value": '${ss_timewait}',
        "step": '${step}'
    }
]'
[root@nightingale plugin]#

最后把自定义的插件配置到文件路径下,如下图:
效果图: