第9章PrometheusAlert告警聚合
1.介绍
项目地址:
https://github.com/feiyu563/PrometheusAlert
PrometheusAlert是开源的运维告警中心消息转发系统,支持接收主流监控系统Prometheus、Zabbix,日志系统Graylog2、Graylog3,数据可视化系统Grafana,以及SonarQube、阿里云-云监控和所有支持WebHook接口的系统发出的预警消息,并支持将收到的这些消息发送到钉钉、微信、email、飞书、腾讯短信、腾讯电话、阿里云短信、阿里云电话、华为短信、百度云短信、容联云电话、七陌短信、七陌语音、Telegram、百度Hi(如流)、Kafka等。
2.安装部署
如果是k8s环境,可以直接使用官方提供的yaml资源配置清单,按照需求修改并填写相关信息后,执行 kubectl apply 应用即可。
cat > PrometheusAlert-Deployment.yaml << 'EOF'
# apiVersion: v1
# kind: Namespace
# metadata:
#   name: monitoring
---
# ConfigMap with the two files the Deployment mounts into the PrometheusAlert
# container: app.conf (mounted at /app/conf/app.conf) and user.csv (mounted at
# /app/user.csv).
# NOTE(review): app.conf keeps the web login and every channel credential in
# plain text. Replace the placeholders (xxxxx) and the default login before
# exposing the service, and consider moving real values into a Secret.
apiVersion: v1
data:
  # PrometheusAlert settings in key=value form. The '#' lines inside the block
  # scalar are part of the file content, so they are kept in their original
  # language and wording.
  app.conf: |
    #---------------------↓全局配置-----------------------
    appname = PrometheusAlert
    #登录用户名
    login_user=prometheusalert
    #登录密码
    login_password=prometheusalert
    #监听地址
    httpaddr = "0.0.0.0"
    #监听端口
    httpport = 8080
    runmode = dev
    #设置代理 proxy = http://123.123.123.123:8080
    proxy =
    #开启JSON请求
    copyrequestbody = true
    #告警消息标题
    title=PrometheusAlert
    #链接到告警平台地址
    GraylogAlerturl=http://graylog.org
    #钉钉告警 告警logo图标地址
    logourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
    #钉钉告警 恢复logo图标地址
    rlogourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
    #短信告警级别(等于3就进行短信告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
    messagelevel=3
    #电话告警级别(等于4就进行语音告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
    phonecalllevel=4
    #默认拨打号码(页面测试短信和电话功能需要配置此项)
    defaultphone=xxxxxxxx
    #故障恢复是否启用电话通知0为关闭,1为开启
    phonecallresolved=0
    #是否前台输出file or console
    logtype=file
    #日志文件路径
    logpath=logs/prometheusalertcenter.log
    #转换Prometheus,graylog告警消息的时区为CST时区(如默认已经是CST时区,请勿开启)
    prometheus_cst_time=0
    #数据库驱动,支持sqlite3,mysql,postgres如使用mysql或postgres,请开启db_host,db_port,db_user,db_password,db_name的注释
    db_driver=sqlite3
    #db_host=127.0.0.1
    #db_port=3306
    #db_user=root
    #db_password=root
    #db_name=prometheusalert
    #是否开启告警记录 0为关闭,1为开启
    AlertRecord=0
    #是否开启告警记录定时删除 0为关闭,1为开启
    RecordLive=0
    #告警记录定时删除周期,单位天
    RecordLiveDay=7
    # 是否将告警记录写入es7,0为关闭,1为开启
    alert_to_es=0
    # es地址,是[]string
    # beego.Appconfig.Strings读取配置为[]string,使用";"而不是","
    to_es_url=http://localhost:9200
    # to_es_url=http://es1:9200;http://es2:9200;http://es3:9200
    # es用户和密码
    # to_es_user=username
    # to_es_pwd=password
    # 长连接最大空闲数
    maxIdleConns=100
    # 热更新配置文件
    open-hotreload=0
    #---------------------↓webhook-----------------------
    #是否开启钉钉告警通道,可同时开始多个通道0为关闭,1为开启
    open-dingding=1
    #默认钉钉机器人地址
    ddurl=https://oapi.dingtalk.com/robot/send?access_token=xxxxx
    #是否开启 @所有人(0为关闭,1为开启)
    dd_isatall=1
    #是否开启钉钉机器人加签,0为关闭,1为开启
    # 使用方法:https://oapi.dingtalk.com/robot/send?access_token=XXXXXX&secret=mysecret
    open-dingding-secret=0
    #是否开启微信告警通道,可同时开始多个通道0为关闭,1为开启
    open-weixin=1
    #默认企业微信机器人地址
    wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxx
    #是否开启飞书告警通道,可同时开始多个通道0为关闭,1为开启
    open-feishu=1
    #默认飞书机器人地址
    fsurl=https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx
    # webhook 发送 http 请求的 contentType, 如 application/json, application/x-www-form-urlencoded,不配置默认 application/json
    wh_contenttype=application/json
    #---------------------↓腾讯云接口-----------------------
    #是否开启腾讯云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-txdx=0
    #腾讯云短信接口key
    TXY_DX_appkey=xxxxx
    #腾讯云短信模版ID 腾讯云短信模版配置可参考 prometheus告警:{1}
    TXY_DX_tpl_id=xxxxx
    #腾讯云短信sdk app id
    TXY_DX_sdkappid=xxxxx
    #腾讯云短信签名 根据自己审核通过的签名来填写
    TXY_DX_sign=腾讯云
    #是否开启腾讯云电话告警通道,可同时开始多个通道0为关闭,1为开启
    open-txdh=0
    #腾讯云电话接口key
    TXY_DH_phonecallappkey=xxxxx
    #腾讯云电话模版ID
    TXY_DH_phonecalltpl_id=xxxxx
    #腾讯云电话sdk app id
    TXY_DH_phonecallsdkappid=xxxxx
    #---------------------↓华为云接口-----------------------
    #是否开启华为云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-hwdx=0
    #华为云短信接口key
    HWY_DX_APP_Key=xxxxxxxxxxxxxxxxxxxxxx
    #华为云短信接口Secret
    HWY_DX_APP_Secret=xxxxxxxxxxxxxxxxxxxxxx
    #华为云APP接入地址(端口接口地址)
    HWY_DX_APP_Url=https://rtcsms.cn-north-1.myhuaweicloud.com:10743
    #华为云短信模板ID
    HWY_DX_Templateid=xxxxxxxxxxxxxxxxxxxxxx
    #华为云签名名称,必须是已审核通过的,与模板类型一致的签名名称,按照自己的实际签名填写
    HWY_DX_Signature=华为云
    #华为云签名通道号
    HWY_DX_Sender=xxxxxxxxxx
    #---------------------↓阿里云接口-----------------------
    #是否开启阿里云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-alydx=0
    #阿里云短信主账号AccessKey的ID
    ALY_DX_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云短信接口密钥
    ALY_DX_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云短信签名名称
    ALY_DX_SignName=阿里云
    #阿里云短信模板ID
    ALY_DX_Template=xxxxxxxxxxxxxxxxxxxxxx
    #是否开启阿里云电话告警通道,可同时开始多个通道0为关闭,1为开启
    open-alydh=0
    #阿里云电话主账号AccessKey的ID
    ALY_DH_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云电话接口密钥
    ALY_DH_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云电话被叫显号,必须是已购买的号码
    ALY_DX_CalledShowNumber=xxxxxxxxx
    #阿里云电话文本转语音(TTS)模板ID
    ALY_DH_TtsCode=xxxxxxxx
    #---------------------↓容联云接口-----------------------
    #是否开启容联云电话告警通道,可同时开始多个通道0为关闭,1为开启
    open-rlydh=0
    #容联云基础接口地址
    RLY_URL=https://app.cloopen.com:8883/2013-12-26/Accounts/
    #容联云后台SID
    RLY_ACCOUNT_SID=xxxxxxxxxxx
    #容联云api-token
    RLY_ACCOUNT_TOKEN=xxxxxxxxxx
    #容联云app_id
    RLY_APP_ID=xxxxxxxxxxxxx
    #---------------------↓邮件配置-----------------------
    #是否开启邮件
    open-email=1
    #邮件发件服务器地址
    Email_host=smtp.qq.com
    #邮件发件服务器端口
    Email_port=465
    #邮件帐号
    Email_user=邮件接收者@qq.com
    #邮件密码
    Email_password=xxxxx
    #邮件标题
    Email_title=PrometheusAlert
    #默认发送邮箱
    Default_emails=邮件接收者@qq.com
    #---------------------↓七陌云接口-----------------------
    #是否开启七陌短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-7moordx=0
    #七陌账户ID
    7MOOR_ACCOUNT_ID=Nxxx
    #七陌账户APISecret
    7MOOR_ACCOUNT_APISECRET=xxx
    #七陌账户短信模板编号
    7MOOR_DX_TEMPLATENUM=n
    #注意:七陌短信变量这里只用一个var1,在代码里写死了。
    #-----------
    #是否开启七陌webcall语音通知告警通道,可同时开始多个通道0为关闭,1为开启
    open-7moordh=0
    #请在七陌平台添加虚拟服务号、文本节点
    #七陌账户webcall的虚拟服务号
    7MOOR_WEBCALL_SERVICENO=xxx
    # 文本节点里被替换的变量,我配置的是text。如果被替换的变量不是text,请修改此配置
    7MOOR_WEBCALL_VOICE_VAR=text
    #---------------------↓telegram接口-----------------------
    #是否开启telegram告警通道,可同时开始多个通道0为关闭,1为开启
    open-tg=0
    #tg机器人token
    TG_TOKEN=xxxxx
    #tg消息模式 个人消息或者频道消息 0为关闭(推送给个人),1为开启(推送给频道)
    TG_MODE_CHAN=0
    #tg用户ID
    TG_USERID=xxxxx
    #tg频道name或者id, 频道name需要以@开始
    TG_CHANNAME=xxxxx
    #tg api地址, 可以配置为代理地址
    #TG_API_PROXY="https://api.telegram.org/bot%s/%s"
    #---------------------↓workwechat接口-----------------------
    #是否开启workwechat告警通道,可同时开始多个通道0为关闭,1为开启
    open-workwechat=0
    # 企业ID
    WorkWechat_CropID=xxxxx
    # 应用ID
    WorkWechat_AgentID=xxxx
    # 应用secret
    WorkWechat_AgentSecret=xxxx
    # 接受用户
    WorkWechat_ToUser="zhangsan|lisi"
    # 接受部门
    WorkWechat_ToParty="ops|dev"
    # 接受标签
    WorkWechat_ToTag=""
    # 消息类型, 暂时只支持markdown
    # WorkWechat_Msgtype = "markdown"
    #---------------------↓百度云接口-----------------------
    #是否开启百度云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-baidudx=0
    #百度云短信接口AK(ACCESS_KEY_ID)
    BDY_DX_AK=xxxxx
    #百度云短信接口SK(SECRET_ACCESS_KEY)
    BDY_DX_SK=xxxxx
    #百度云短信ENDPOINT(ENDPOINT参数需要用指定区域的域名来进行定义,如服务所在区域为北京,则为)
    BDY_DX_ENDPOINT=http://smsv3.bj.baidubce.com
    #百度云短信模版ID,根据自己审核通过的模版来填写(模版支持一个参数code:如prometheus告警:{code})
    BDY_DX_TEMPLATE_ID=xxxxx
    #百度云短信签名ID,根据自己审核通过的签名来填写
    TXY_DX_SIGNATURE_ID=xxxxx
    #---------------------↓百度Hi(如流)-----------------------
    #是否开启百度Hi(如流)告警通道,可同时开始多个通道0为关闭,1为开启
    open-ruliu=0
    #默认百度Hi(如流)机器人地址
    BDRL_URL=https://api.im.baidu.com/api/msg/groupmsgsend?access_token=xxxxxxxxxxxxxx
    #百度Hi(如流)群ID
    BDRL_ID=123456
    #---------------------↓bark接口-----------------------
    #是否开启bark告警通道,可同时开始多个通道0为关闭,1为开启
    open-bark=0
    #bark默认地址, 建议自行部署bark-server
    BARK_URL=https://api.day.app
    #bark key, 多个key使用分割
    BARK_KEYS=xxxxx
    # 复制, 推荐开启
    BARK_COPY=1
    # 历史记录保存,推荐开启
    BARK_ARCHIVE=1
    # 消息分组
    BARK_GROUP=PrometheusAlert
    #---------------------↓语音播报-----------------------
    #语音播报需要配合语音播报插件才能使用
    #是否开启语音播报通道,0为关闭,1为开启
    open-voice=1
    VOICE_IP=127.0.0.1
    VOICE_PORT=9999
    #---------------------↓飞书机器人应用-----------------------
    #是否开启feishuapp告警通道,可同时开始多个通道0为关闭,1为开启
    open-feishuapp=1
    # APPID
    FEISHU_APPID=cli_xxxxxxxxxxxxx
    # APPSECRET
    FEISHU_APPSECRET=xxxxxxxxxxxxxxxxxxxxxx
    # 可填飞书 用户open_id、user_id、union_ids、部门open_department_id
    AT_USER_ID="xxxxxxxx"
    #---------------------↓告警组-----------------------
    # 有其他新增的配置段,请放在告警组的上面
    # 暂时仅针对 PrometheusContronller 中的 /prometheus/alert 路由
    # 告警组如果放在了 wx, dd... 那部分的上分,beego section 取 url 值不太对。
    # 所以这里使用 include 来包含另告警组配置
    # 是否启用告警组功能
    open-alertgroup=0
    # 自定义的告警组既可以写在这里,也可以写在单独的文件里。
    # 写在单独的告警组配置里更便于修改。
    # include "alertgroup.conf"
    #---------------------↓kafka地址-----------------------
    # kafka服务器的地址
    open-kafka=1
    kafka_server = 127.0.0.1:9092
    # 写入消息的kafka topic
    kafka_topic = devops
    # 用户标记该消息是来自PrometheusAlert,一般无需修改
    kafka_key = PrometheusAlert
  # One row per day: date, phone, name, followed by backup phone/name pairs.
  # Presumably the on-call roster used for SMS/phone alerts - TODO confirm how
  # PrometheusAlert consumes it.
  user.csv: |
    2019年4月10日,15888888881,小张,15999999999,备用联系人小陈,15999999998,备用联系人小赵
    2019年4月11日,15888888882,小李,15999999999,备用联系人小陈,15999999998,备用联系人小赵
    2019年4月12日,15888888883,小王,15999999999,备用联系人小陈,15999999998,备用联系人小赵
    2019年4月13日,15888888884,小宋,15999999999,备用联系人小陈,15999999998,备用联系人小赵
kind: ConfigMap
metadata:
  name: prometheus-alert-center-conf
  namespace: monitoring
---
# Deployment running one PrometheusAlert replica. Its two configuration files
# (app.conf, user.csv) are projected from the prometheus-alert-center-conf
# ConfigMap and mounted as individual files.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-alert-center
  namespace: monitoring
  labels:
    alertname: prometheus-alert-center
    app: prometheus-alert-center
spec:
  replicas: 1
  selector:
    matchLabels:
      alertname: prometheus-alert-center
      app: prometheus-alert-center
  template:
    metadata:
      labels:
        alertname: prometheus-alert-center
        app: prometheus-alert-center
    spec:
      volumes:
      # Single ConfigMap volume that backs both file mounts below.
      - name: prometheus-alert-center-conf-map
        configMap:
          name: prometheus-alert-center-conf
          items:
          - key: app.conf
            path: app.conf
          - key: user.csv
            path: user.csv
      containers:
      - name: prometheus-alert-center
        image: feiyu563/prometheus-alert:v4.9.1
        env:
        # Container time zone.
        - name: TZ
          value: 'Asia/Shanghai'
        ports:
        # Matches httpport in app.conf; the Service targets this port by name.
        - name: http
          containerPort: 8080
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
          limits:
            cpu: 200m
            memory: 200Mi
        volumeMounts:
        # subPath mounts drop exactly one file into an existing directory.
        # Note that subPath mounts are not refreshed when the ConfigMap
        # changes, so the pod must be restarted after editing the ConfigMap.
        - name: prometheus-alert-center-conf-map
          subPath: app.conf
          mountPath: /app/conf/app.conf
        - name: prometheus-alert-center-conf-map
          subPath: user.csv
          mountPath: /app/user.csv
---
# Service (default type, ClusterIP) exposing PrometheusAlert on port 8080
# inside the cluster; Alertmanager reaches it by this name.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-alert-center
  namespace: monitoring
  labels:
    alertname: prometheus-alert-center
  annotations:
    # Annotation-based scrape hints; they only take effect if the Prometheus
    # scrape configuration honours these annotations.
    prometheus.io/port: "8080"
    prometheus.io/scrape: "true"
spec:
  selector:
    app: prometheus-alert-center
  ports:
  # targetPort refers to the container port named "http" in the Deployment.
  - name: http
    targetPort: http
    port: 8080
---
# Ingress publishing the PrometheusAlert web UI at http://alert-center.local/.
# It is named after the Service so that the verification command
# `kubectl -n monitoring get ingress | grep "prometheus-alert-center"` finds
# it. A generic name such as "prometheus-ingress" would not match that grep and
# risks colliding with an Ingress created for Prometheus itself in the same
# namespace.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-alert-center
  namespace: monitoring
spec:
  rules:
  - host: alert-center.local
    http:
      paths:
      # Route everything under "/" to the PrometheusAlert Service.
      - path: /
        pathType: ImplementationSpecific
        backend:
          service:
            name: prometheus-alert-center
            port:
              number: 8080
EOF
应用后查看资源创建状态:
[root@master-01 ~]# kubectl -n monitoring get pod|grep "prometheus-alert-center"
prometheus-alert-center-69c9d5d7fd-mp8s8   1/1     Running   0          149m
[root@master-01 ~]# kubectl -n monitoring get svc|grep "prometheus-alert-center"
prometheus-alert-center   ClusterIP   10.98.227.20     <none>        8080/TCP                     152m
[root@master-01 ~]# kubectl -n monitoring get ingress|grep "prometheus-alert-center"
prometheus-alert-center   <none>   alert-center.local                                  localhost   80      153m
打开web界面:

3.prometheus-operator接入配置
如果是prometheus-operator的项目,那么只需要修改 alertmanager-secret.yaml 配置,将webhook的地址填写为PrometheusAlert的service名称(即上面创建的 prometheus-alert-center)即可:
cat > alertmanager-secret.yaml << 'EOF'
# Alertmanager configuration Secret (kube-prometheus layout). Every alert goes
# to one webhook receiver, which forwards it to PrometheusAlert's e-mail
# channel using the prometheus-email template.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: alertmanager-main
  namespace: monitoring
  labels:
    alertmanager: main
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: "0.21.0"
# NOTE(review): group_wait (10m) delays the first notification of a new alert
# group by ten minutes, while group_interval (10s) re-notifies a changed group
# almost immediately. The usual setup is the other way round - confirm the two
# values are not swapped.
stringData:
  alertmanager.yaml: |-
    global:
      resolve_timeout: 5m
    route:
      group_by: ['instance']
      group_wait: 10m
      group_interval: 10s
      repeat_interval: 10m
      receiver: 'web.hook.prometheusalert'
    receivers:
    - name: 'web.hook.prometheusalert'
      webhook_configs:
      - url: 'http://prometheus-alert-center:8080/prometheusalert?type=email&tpl=prometheus-email&email=vim27@qq.com'
EOF
生效后的配置查看:
[root@master-01 ~]# kubectl get secret alertmanager-main-generated -n monitoring -o json | jq -r '.data."alertmanager.yaml"' | base64 --decode
global:
  resolve_timeout: 5m
route:
  group_by: ['instance']
  group_wait: 10m
  group_interval: 10s
  repeat_interval: 10m
  receiver: 'web.hook.prometheusalert'
receivers:
- name: 'web.hook.prometheusalert'
  webhook_configs:
  - url: 'http://prometheus-alert-center:8080/prometheusalert?type=email&tpl=prometheus-email&email=vim27@qq.com'
alertmanager的web页面可以查看是否生效:

4.对接邮件告警配置
在PrometheusAlert的web页面可以方便的调试和测试告警模版是否正确生效,点击Markdown测试可以看到有两个方框可以用来填写内容,一个是报警的原始json文本,一个是我们自定义的模版

报警的原始json文本可以通过查看prometheus-alert-center的日志获取到:
[root@master-01 ~]# kubectl -n monitoring logs -f prometheus-alert-center-69c9d5d7fd-mp8s8
这里随便摘抄一段简短的报警信息:
{"receiver":"web\\.hook\\.prometheusalert","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"KubeDeploymentReplicasMismatch","container":"kube-rbac-proxy-main","deployment":"mysql-dp","instance":"10.244.2.19:8443","job":"kube-state-metrics","namespace":"default","prometheus":"monitoring/k8s","severity":"warning"},"annotations":{"description":"Deployment default/mysql-dp has not matched the expected number of replicas for longer than 15 minutes.","runbook_url":"https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch","summary":"Deployment has not matched the expected number of replicas."},"startsAt":"2024-09-20T13:09:56.729Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":"http://prometheus-k8s-1:9090/graph?g0.expr=%28kube_deployment_spec_replicas%7Bjob%3D%22kube-state-metrics%22%7D+%21%3D+kube_deployment_status_replicas_available%7Bjob%3D%22kube-state-metrics%22%7D%29+and+%28changes%28kube_deployment_status_replicas_updated%7Bjob%3D%22kube-state-metrics%22%7D%5B10m%5D%29+%3D%3D+0%29\u0026g0.tab=1","fingerprint":"d3c4dfbd62a8075a"}],"groupLabels":{"instance":"10.244.2.19:8443"},"commonLabels":{"alertname":"KubeDeploymentReplicasMismatch","container":"kube-rbac-proxy-main","deployment":"mysql-dp","instance":"10.244.2.19:8443","job":"kube-state-metrics","namespace":"default","prometheus":"monitoring/k8s","severity":"warning"},"commonAnnotations":{"description":"Deployment default/mysql-dp has not matched the expected number of replicas for longer than 15 minutes.","runbook_url":"https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch","summary":"Deployment has not matched the expected number of replicas."},"externalURL":"http://alertmanager-main-0:9093","version":"4","groupKey":"{}:{instance=\"10.244.2.19:8443\"}","truncatedAlerts":0}
经过json格式化后的内容:
{
	"receiver": "web\\.hook\\.prometheusalert",
	"status": "firing",
	"alerts": [{
		"status": "firing",
		"labels": {
			"alertname": "KubeDeploymentReplicasMismatch",
			"container": "kube-rbac-proxy-main",
			"deployment": "mysql-dp",
			"instance": "10.244.2.19:8443",
			"job": "kube-state-metrics",
			"namespace": "default",
			"prometheus": "monitoring/k8s",
			"severity": "warning"
		},
		"annotations": {
			"description": "Deployment default/mysql-dp has not matched the expected number of replicas for longer than 15 minutes.",
			"runbook_url": "https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch",
			"summary": "Deployment has not matched the expected number of replicas."
		},
		"startsAt": "2024-09-20T13:09:56.729Z",
		"endsAt": "0001-01-01T00:00:00Z",
		"generatorURL": "http://prometheus-k8s-1:9090/graph?g0.expr=%28kube_deployment_spec_replicas%7Bjob%3D%22kube-state-metrics%22%7D+%21%3D+kube_deployment_status_replicas_available%7Bjob%3D%22kube-state-metrics%22%7D%29+and+%28changes%28kube_deployment_status_replicas_updated%7Bjob%3D%22kube-state-metrics%22%7D%5B10m%5D%29+%3D%3D+0%29\u0026g0.tab=1",
		"fingerprint": "d3c4dfbd62a8075a"
	}],
	"groupLabels": {
		"instance": "10.244.2.19:8443"
	},
	"commonLabels": {
		"alertname": "KubeDeploymentReplicasMismatch",
		"container": "kube-rbac-proxy-main",
		"deployment": "mysql-dp",
		"instance": "10.244.2.19:8443",
		"job": "kube-state-metrics",
		"namespace": "default",
		"prometheus": "monitoring/k8s",
		"severity": "warning"
	},
	"commonAnnotations": {
		"description": "Deployment default/mysql-dp has not matched the expected number of replicas for longer than 15 minutes.",
		"runbook_url": "https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch",
		"summary": "Deployment has not matched the expected number of replicas."
	},
	"externalURL": "http://alertmanager-main-0:9093",
	"version": "4",
	"groupKey": "{}:{instance=\"10.244.2.19:8443\"}",
	"truncatedAlerts": 0
}
默认的邮件报警模版:
{{/* Default e-mail template: renders one HTML section per entry in .alerts. Resolved alerts get a recovery heading plus the end time; all others get the alert heading. Headings link to the alert's generatorURL and to the payload's externalURL. */ -}}
{{ $var := .externalURL}}{{ range $k,$v:=.alerts }}
{{if eq $v.status "resolved"}}
<h1><a href ={{$v.generatorURL}}>Prometheus恢复信息</a></h1>
<h2><a href ={{$var}}>{{$v.labels.alertname}}</a></h2>
<h5>告警级别:{{$v.labels.severity}}</h5>
<h5>开始时间:{{$v.startsAt}}</h5>
<h5>结束时间:{{$v.endsAt}}</h5>
<h5>故障主机IP:{{$v.labels.instance}}</h5>
<h3>{{$v.annotations.description}}</h3>
<img src=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png />
{{else}}
<h1><a href ={{$v.generatorURL}}>Prometheus告警信息</a></h1>
<h2><a href ={{$var}}>{{$v.labels.alertname}}</a></h2>
<h5>告警级别:{{$v.labels.severity}}</h5>
<h5>开始时间:{{$v.startsAt}}</h5>
<h5>故障主机IP:{{$v.labels.instance}}</h5>
<h3>{{$v.annotations.description}}</h3>
<img src=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png />
{{end}}
{{ end }}
默认邮件报警展示效果:

我们可以看到其实展示效果并不好,我们精简并调整一下报警模版:
{{/* Compact e-mail template: one <h3> line per field for every entry in .alerts. Resolved alerts additionally show the end time. The alert level is read from the "severity" label, which is the label present in the sample Alertmanager payload (there is no "level" label). */ -}}
{{ range $k,$v:=.alerts }}
{{if eq $v.status "resolved"}}
  <h3>告警名称:{{$v.labels.alertname}}</h3>
  <h3>告警级别:{{$v.labels.severity}}</h3>
  <h3>告警实例:{{$v.labels.instance}}</h3>
  <h3>告警详情:{{$v.annotations.description}}</h3>
  <h3>开始时间:{{$v.startsAt}}</h3>
  <h3>结束时间:{{$v.endsAt}}</h3>
{{else}}
  <h3>告警名称:{{$v.labels.alertname}}</h3>
  <h3>告警级别:{{$v.labels.severity}}</h3>
  <h3>开始时间:{{$v.startsAt}}</h3>
  <h3>告警实例:{{$v.labels.instance}}</h3>
  <h3>告警详情:{{$v.annotations.description}}</h3>
{{ end }}
{{ end }}
再次测试渲染效果:

现在的效果已经好很多了,如果测试没有问题,我们就可以真正的去替换相应的email模版了,点击模版管理,选择email


邮件效果:

5.对接钉钉告警配置
第一步:钉钉添加机器人




https://oapi.dingtalk.com/robot/send?access_token=xxxxx
(注意:access_token 属于敏感凭据,请替换为自己机器人的实际值,不要公开到文档或代码仓库中。)
第二步:在PrometheusAlert图形界面修改

默认的钉钉告警模版:
{{/* Default DingTalk template: renders one Markdown section per entry in .alerts (recovery heading plus end time when resolved, alert heading otherwise). The last line builds a link to the Alertmanager "new silence" page from .commonLabels. SplitString is a helper not defined here; it presumably strips the trailing "%2C" separator - TODO confirm. */ -}}
{{ $var := .externalURL}}{{ range $k,$v:=.alerts }}
{{if eq $v.status "resolved"}}
## [Prometheus恢复信息]({{$v.generatorURL}})
#### [{{$v.labels.alertname}}]({{$var}})
###### 告警级别:{{$v.labels.level}}
###### 开始时间:{{$v.startsAt}}
###### 结束时间:{{$v.endsAt}}
###### 故障主机IP:{{$v.labels.instance}}
##### {{$v.annotations.description}}

{{else}}
## [Prometheus告警信息]({{$v.generatorURL}})
#### [{{$v.labels.alertname}}]({{$var}})
###### 告警级别:{{$v.labels.level}}
###### 开始时间:{{$v.startsAt}}
###### 故障主机IP:{{$v.labels.instance}}
##### {{$v.annotations.description}}

{{end}}
{{ end }}
{{ $urimsg:=""}}{{ range $key,$value:=.commonLabels }}{{$urimsg =  print $urimsg $key "%3D%22" $value "%22%2C" }}{{end}}[*** 点我屏蔽该告警]({{$var}}/#/silences/new?filter=%7B{{SplitString $urimsg 0 -3}}%7D)
修改后的钉钉告警模版:
{{/* Compact DingTalk template: one Markdown heading line per field for every entry in .alerts. Resolved alerts additionally show the end time. The alert level is read from the "severity" label, which is the label present in the sample Alertmanager payload (there is no "level" label). */ -}}
{{ range $k,$v:=.alerts }}
{{if eq $v.status "resolved"}}
##### 告警名称:{{$v.labels.alertname}}
##### 告警级别:{{$v.labels.severity}}
##### 告警实例:{{$v.labels.instance}}
##### 告警详情:{{$v.annotations.description}}
##### 开始时间:{{$v.startsAt}}
##### 结束时间:{{$v.endsAt}}
{{else}}
##### 告警名称:{{$v.labels.alertname}}
##### 告警级别:{{$v.labels.severity}}
##### 开始时间:{{$v.startsAt}}
##### 告警实例:{{$v.labels.instance}}
##### 告警详情:{{$v.annotations.description}}
{{ end }}
{{ end }}
钉钉接收的效果:

第三步:修改alertmanager-secret.yaml
增加钉钉的webhook
cat > alertmanager-secret.yaml << 'EOF'
# Alertmanager configuration Secret (kube-prometheus layout) with two webhook
# receivers that forward to PrometheusAlert: one for e-mail, one for DingTalk.
#
# The DingTalk access_token is a credential: put your own robot token in place
# of the "xxxxx" placeholder and never publish the real value.
#
# NOTE(review): the route names only 'dingding.webhook.prometheusalert' and has
# no child routes, so 'email.webhook.prometheusalert' is defined but receives
# nothing. If both channels should fire, list both webhook_configs under one
# receiver or add a child route - confirm the intent.
# NOTE(review): group_wait (10m) delays the first notification of a new alert
# group by ten minutes while group_interval is only 10s - confirm the two
# values are not swapped.
apiVersion: v1
kind: Secret
metadata:
  labels:
    alertmanager: main
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.21.0
  name: alertmanager-main
  namespace: monitoring
type: Opaque
stringData:
  alertmanager.yaml: |-
    global:
      resolve_timeout: 5m
    route:
      group_by: ['instance']
      group_wait: 10m
      group_interval: 10s
      repeat_interval: 10m
      receiver: 'dingding.webhook.prometheusalert'
    receivers:
    - name: 'email.webhook.prometheusalert'
      webhook_configs:
      - url: 'http://prometheus-alert-center:8080/prometheusalert?type=email&tpl=prometheus-email&email=vim27@qq.com'
    - name: 'dingding.webhook.prometheusalert'
      webhook_configs:
      - url: 'http://prometheus-alert-center:8080/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=https://oapi.dingtalk.com/robot/send?access_token=xxxxx'
EOF
查看告警效果:

更新: 2024-09-23 17:26:56