# Grant Prometheus the cluster-level permissions it needs (ServiceAccount, ClusterRole, ClusterRoleBinding).
---
# Identity in kube-system that the ClusterRoleBinding below grants
# permissions to.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: kube-system
---
# Cluster-wide permissions: get/list/watch on nodes, services, endpoints,
# pods and nodes/proxy; get on configmaps and nodes/metrics; get on the
# non-resource /metrics URL.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - services
      - endpoints
      - pods
      - nodes/proxy
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - configmaps
      - nodes/metrics
    verbs:
      - get
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Binds the ClusterRole above to the prometheus ServiceAccount.
# Uses rbac.authorization.k8s.io/v1 (v1beta1 is no longer served as of
# Kubernetes 1.22) so it matches the ClusterRole's API version.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: kube-system
# DaemonSet resource: runs one container on every node to collect node
# resource information (the Prometheus client-side tool).
---
# node-exporter DaemonSet in kube-system.
# apps/v1 replaces extensions/v1beta1 (no longer served as of Kubernetes
# 1.16) and requires an explicit spec.selector matching the pod labels.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: kube-system
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      # The pod shares the host PID, IPC and network namespaces.
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
        - name: node-exporter
          image: prom/node-exporter:v0.16.0
          ports:
            - containerPort: 9100
          resources:
            requests:
              # Same quantity as the original 0.15, in milli-CPU form.
              cpu: 150m
          securityContext:
            privileged: true
          args:
            # procfs and sysfs are read from the host mounts declared below.
            - --path.procfs
            - /host/proc
            - --path.sysfs
            - /host/sys
            - --collector.filesystem.ignored-mount-points
            # Container args are not shell-processed, so the regex must not
            # carry embedded double quotes; with them the pattern would
            # require a literal '"' and never match any mount point.
            - '^/(sys|proc|dev|host|etc)($|/)'
          volumeMounts:
            - name: dev
              mountPath: /host/dev
            - name: proc
              mountPath: /host/proc
            - name: sys
              mountPath: /host/sys
            - name: rootfs
              mountPath: /rootfs
      tolerations:
        # Tolerate the master NoSchedule taint so master nodes also run
        # the exporter.
        - key: "node-role.kubernetes.io/master"
          operator: "Exists"
          effect: "NoSchedule"
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: dev
          hostPath:
            path: /dev
        - name: sys
          hostPath:
            path: /sys
        - name: rootfs
          hostPath:
            path: /
# Alerting configuration for the Prometheus stack: defines the alert
# routing, which account sends the notifications, and who receives them.
---
# ConfigMap holding the Alertmanager configuration file (config.yml).
# NOTE(review): smtp_auth_password is stored in a ConfigMap in plain text;
# consider moving the credentials into a Secret.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-config
  namespace: kube-system
data:
  # Routing: alerts default to the 'default' receiver; alerts labelled
  # team=pod are routed to the 'email' receiver.
  config.yml: |-
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.*****.com:25'
      smtp_from: '*****@**.com'
      smtp_auth_username: '*****@**.com'
      smtp_auth_password: '*******'
      smtp_hello: '163.com'
      smtp_require_tls: false
    route:
      group_by: ['alertname', 'cluster']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 5m
      receiver: default
      routes:
        - receiver: email
          group_wait: 10s
          match:
            team: pod
    receivers:
      - name: 'default'
        email_configs:
          - to: '*****@***.com'
            send_resolved: true
      - name: 'email'
        email_configs:
          - to: '*****@****.com'
            send_resolved: true
# Equivalent of the Prometheus configuration file: defines the monitored
# targets and the alerting rules.
---
# ConfigMap with two files: prometheus.yml (scrape jobs, Alertmanager
# target, rule file path) and rules.yml (pod memory / CPU alert rules).
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-system
data:
  # Scrape jobs: Prometheus itself, nodes (address rewritten from port
  # 10250 to 9100), cAdvisor through the API server proxy, annotated
  # service endpoints, and annotated services probed via "blackbox".
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 15s
    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
      - job_name: 'kubernetes-node'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9100'
            target_label: __address__
            action: replace
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      - job_name: kubernetes-service-endpoints
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - action: keep
            regex: true
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scrape
          - action: replace
            regex: (https?)
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scheme
            target_label: __scheme__
          - action: replace
            regex: (.+)
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_path
            target_label: __metrics_path__
          - action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            source_labels:
              - __address__
              - __meta_kubernetes_service_annotation_prometheus_io_port
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - action: replace
            source_labels:
              - __meta_kubernetes_service_name
            target_label: kubernetes_name
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["localhost:9093"]
    rule_files:
      - /etc/prometheus/rules.yml
  # Alert rules: both fire when the computed percentage stays above 85 for
  # 1m and carry the label team=pod. The long expressions are folded with
  # ">-", which joins the lines with single spaces (same string as before).
  rules.yml: |
    groups:
      - name: Pod-Memory
        rules:
          - alert: Pod-Memory
            expr: >-
              sum(container_memory_rss{container!="POD",container!="alermanager",image!="",pod!=""})by(pod)
              / sum(container_spec_memory_limit_bytes{container!="",container!="POD"})by(pod)
              * 100 != +inf > 85
            for: 1m
            labels:
              team: pod
            annotations:
              summary: "{{ $labels.pod }}:Memory is too hight!"
              description: "{{ $labels.pod }}: \n(Now value is : {{ $value }}%)"
      - name: Pod-CPU
        rules:
          - alert: Pod-CPU
            expr: >-
              sum(rate(container_cpu_usage_seconds_total{image!="",container!="POD",container!=""}[1m]))
              by (pod,namespace)
              / (sum(container_spec_cpu_quota{image!="",container!="POD",container!=""}/100000) by (pod,namespace))
              * 100 > 85
            for: 1m
            labels:
              team: pod
            annotations:
              summary: "{{ $labels.pod }}:CPU is too hight!"
              description: "{{ $labels.pod }}: \n(Now value is : {{ $value }}%)"
# Create the Deployment resource.
# Note: an NFS export must already exist to provide persistent storage.
---
# Deployment running Prometheus and Alertmanager as two containers of one
# pod in kube-system.
# apps/v1 replaces extensions/v1beta1 (no longer served as of Kubernetes
# 1.16) and requires an explicit spec.selector matching the pod labels.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: kube-system
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        - image: prom/prometheus:v2.4.3
          name: prometheus
          command:
            - "/bin/prometheus"
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention=7d"
            # Enables access to the admin HTTP API, which includes
            # operations such as deleting time series.
            - "--web.enable-admin-api"
            # Enables hot reload: calling localhost:9090/-/reload applies
            # configuration changes immediately.
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
              protocol: TCP
              name: http
          volumeMounts:
            - mountPath: "/prometheus"
              # subPath: prometheus
              name: prometheus
            - mountPath: "/etc/prometheus"
              name: config-volume
          resources:
            requests:
              cpu: 400m
              memory: 500Mi
            limits:
              cpu: 500m
              memory: 2Gi
        # The container name keeps its original spelling on purpose: it is
        # a runtime identifier that metric label matchers may refer to.
        - name: alermanager
          image: prom/alertmanager:v0.15.3
          imagePullPolicy: Always
          args:
            - "--config.file=/etc/alertmanager/config.yml"
            - "--storage.path=/alertmanager/data"
          ports:
            - containerPort: 9093
              # Distinct from the Prometheus container's "http" port so a
              # named targetPort resolves unambiguously within the pod.
              name: alertmanager
          volumeMounts:
            - mountPath: "/etc/alertmanager"
              name: alertcfg
      # NOTE(review): the flattened source does not show whether this
      # securityContext was pod-level or belonged to the second container;
      # pod-level is assumed here - confirm.
      securityContext:
        runAsUser: 0
      volumes:
        - name: prometheus
          nfs:
            path: /data/docker/kubernetes-data/prometheus-data
            server: 172.18.180.5
        - configMap:
            name: prometheus-config
          name: config-volume
        - name: alertcfg
          configMap:
            name: alert-config
# prometheus-svc.yaml
# Expose the ports for external access: 9090 is Prometheus, 9093 is Alertmanager.
---
# NodePort Service selecting pods labelled app=prometheus.
# Node port 32001 forwards to the pod port named "http" (service port
# 9090); node port 32002 forwards to pod port 9093.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: kube-system
  labels:
    app: prometheus
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
    - name: web
      nodePort: 32001
      port: 9090
      targetPort: http
    - name: alertmanager
      port: 9093
      targetPort: 9093
      nodePort: 32002