Characteristics of cloud-native applications
Configuration is passed into the container through environment variables (the -e flag of docker run); a minimal example follows below.
Configuration takes effect through command-line arguments passed at startup.
Configuration is managed centrally in a configuration center (a ConfigMap, or an integrated configuration platform such as Apollo).
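A minimal sketch of the environment-variable approach (the image name and the variables are only placeholders, not part of this deployment):

docker run -d -e APP_ENV=prod -e LOG_LEVEL=info harbor.od.com/public/demo:v1.0   # configuration injected as environment variables

Inside the container the application reads APP_ENV and LOG_LEVEL from its environment instead of from a config file baked into the image.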


Exporter plugins needed to monitor Kubernetes

  • kube-state-metrics – collects basic state metrics for cluster objects (deployments, nodes, pods, etc.) via the apiserver
  • node-exporter – collects node (host) metrics
  • cadvisor – collects resource-usage metrics from inside Docker containers
  • blackbox-exporter – probes whether containerized services in Kubernetes are alive

kube-state-metrics (collects basic state metrics for cluster objects)


Node 130


Pull the image

[root@ceshi-132 ~]# docker pull quay.io/coreos/kube-state-metrics:v1.5.0
[root@ceshi-132 ~]# docker tag 91599517197a harbor.od.com/public/kube-state-metrics:v1.5.0
[root@ceshi-132 ~]# docker push harbor.od.com/public/kube-state-metrics:v1.5.0

Create the resource manifests
[root@ceshi-132 k8s-yaml]# mkdir kube-state-metrics

[root@ceshi-132 kube-state-metrics]# vi rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: kube-state-metrics
rules:
- apiGroups:
  - ""
  resources:
  - configmaps
  - secrets
  - nodes
  - pods
  - services
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs:
  - list
  - watch
- apiGroups:
  - policy
  resources:
  - poddisruptionbudgets
  verbs:
  - list
  - watch
- apiGroups:
  - extensions
  resources:
  - daemonsets
  - deployments
  - replicasets
  verbs:
  - list
  - watch
- apiGroups:
  - apps
  resources:
  - statefulsets
  verbs:
  - list
  - watch
- apiGroups:
  - batch
  resources:
  - cronjobs
  - jobs
  verbs:
  - list
  - watch
- apiGroups:
  - autoscaling
  resources:
  - horizontalpodautoscalers
  verbs:
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system
[root@ceshi-132 kube-state-metrics]# vi dp.yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  annotations:
    deployment.kubernetes.io/revision: "2"
  labels:
    grafanak8sapp: "true"
    app: kube-state-metrics
  name: kube-state-metrics
  namespace: kube-system
spec:
  selector:
    matchLabels:
      grafanak8sapp: "true"
      app: kube-state-metrics
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        grafanak8sapp: "true"
        app: kube-state-metrics
    spec:
      containers:
      - name: kube-state-metrics
        image: harbor.od.com/public/kube-state-metrics:v1.5.0
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 8080
          name: http-metrics
          protocol: TCP
        readinessProbe:           # readiness probe
          failureThreshold: 3
          httpGet:
            path: /healthz
            port: 8080
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 5
      serviceAccountName: kube-state-metrics

Deploy

[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/kube-state-metrics/rbac.yaml
serviceaccount/kube-state-metrics created
clusterrole.rbac.authorization.k8s.io/kube-state-metrics created
clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/kube-state-metrics/dp.yaml
deployment.extensions/kube-state-metrics created

A curl probe that returns "ok" means the pod is healthy

[root@ceshi-130 ~]# kubectl get pods -n kube-system -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
kube-state-metrics-8669f776c6-sv24d 1/1 Running 0 42s 172.7.21.2 ceshi-130.host.com <none> <none>
[root@ceshi-130 ~]# curl 172.7.21.2:8080/healthz
ok
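Besides /healthz, the metrics themselves are served on /metrics on the same port; a quick look using the pod IP from the output above:

curl -s 172.7.21.2:8080/metrics | head    # first few lines of the exposed kube_* metrics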

node-exporter (collects node metrics)

Pull the image

[root@ceshi-132 ~]# docker pull prom/node-exporter:v0.15.0
[root@ceshi-132 ~]# docker tag 12d51ffa2b22 harbor.od.com/public/node-exporter:v0.15.0
[root@ceshi-132 ~]# docker push harbor.od.com/public/node-exporter:v0.15.0
The push refers to repository [harbor.od.com/public/node-exporter]

Create the resource manifests

[root@ceshi-132 k8s-yaml]# mkdir node-exporter
[root@ceshi-132 k8s-yaml]# cd node-exporter/

node-exporter must run on every node, so it is deployed as a DaemonSet

[root@ceshi-132 node-exporter]# vi ds.yaml 
kind: DaemonSet
apiVersion: extensions/v1beta1
metadata:
  name: node-exporter
  namespace: kube-system
  labels:
    daemon: "node-exporter"
    grafanak8sapp: "true"
spec:
  selector:
    matchLabels:
      daemon: "node-exporter"
      grafanak8sapp: "true"
  template:
    metadata:
      name: node-exporter
      labels:
        daemon: "node-exporter"
        grafanak8sapp: "true"
    spec:
      volumes:
      - name: proc
        hostPath:
          path: /proc
          type: ""
      - name: sys
        hostPath:
          path: /sys
          type: ""
      containers:
      - name: node-exporter
        image: harbor.od.com/public/node-exporter:v0.15.0
        imagePullPolicy: IfNotPresent
        args:
        - --path.procfs=/host_proc
        - --path.sysfs=/host_sys
        ports:
        - name: node-exporter
          hostPort: 9100
          containerPort: 9100
          protocol: TCP
        volumeMounts:
        - name: sys
          readOnly: true
          mountPath: /host_sys
        - name: proc
          readOnly: true
          mountPath: /host_proc
      hostNetwork: true            # share the host's network namespace

Deploy

[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/node-exporter/ds.yaml
daemonset.extensions/node-exporter created

After deployment the pod IPs shown are the host IPs, because the DaemonSet sets hostNetwork: true and the pods share the host's network namespace

[root@ceshi-130 ~]# netstat -tnlp | grep 9100
tcp 0 0 0.0.0.0:9100 0.0.0.0:* LISTEN 127117/node_exporte

[root@ceshi-130 ~]# kubectl get pods -n kube-system -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
kube-state-metrics-8669f776c6-sv24d 1/1 Running 0 14m 172.7.21.2 ceshi-130.host.com <none> <none>
node-exporter-22zzl 1/1 Running 0 57s 10.1.133.96 ceshi-131.host.com <none> <none>
node-exporter-lbh9f 1/1 Running 0 57s 10.1.133.95 ceshi-130.host.com <none> <none>
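Since hostNetwork is enabled, every node should now answer on port 9100; a quick spot check from any node:

curl -s localhost:9100/metrics | grep -c '^node_'    # count of node_* metric samples exposed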

cadvisor (collects resource-usage metrics from inside Docker containers)

Pull the image

[root@ceshi-132 ~]# docker pull google/cadvisor:v0.28.3
[root@ceshi-132 ~]# docker tag 75f88e3ec333 harbor.od.com/public/cadvisor:v0.28.3
[root@ceshi-132 ~]# docker push harbor.od.com/public/cadvisor:v0.28.3

Create the resource manifests

[root@ceshi-132 cadvisor]# vi ds.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: cadvisor
  namespace: kube-system
  labels:
    app: cadvisor
spec:
  selector:
    matchLabels:
      name: cadvisor
  template:
    metadata:
      labels:
        name: cadvisor
    spec:
      hostNetwork: true                        # share the host's network namespace
      tolerations:                             # tolerations for node taints
      - key: node-role.kubernetes.io/master    # the taint key set on master nodes
        effect: NoSchedule                     # matches the NoSchedule effect
      containers:
      - name: cadvisor
        image: harbor.od.com/public/cadvisor:v0.28.3
        imagePullPolicy: IfNotPresent
        volumeMounts:
        - name: rootfs
          mountPath: /rootfs
          readOnly: true
        - name: var-run
          mountPath: /var/run
        - name: sys
          mountPath: /sys
          readOnly: true
        - name: docker
          mountPath: /var/lib/docker
          readOnly: true
        ports:
        - name: http
          containerPort: 4194
          protocol: TCP
        readinessProbe:
          tcpSocket:
            port: 4194
          initialDelaySeconds: 5
          periodSeconds: 10
        args:
        - --housekeeping_interval=10s
        - --port=4194
      terminationGracePeriodSeconds: 30
      volumes:
      - name: rootfs
        hostPath:
          path: /
      - name: var-run
        hostPath:
          path: /var/run
      - name: sys
        hostPath:
          path: /sys
      - name: docker
        hostPath:
          path: /data/docker

Deploy

[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/cadvisor/ds.yaml
daemonset.apps/cadvisor created

===============================================================================

Taint types (a taint works like a label attached to a node):
kubectl taint nodes node key=value:==NoSchedule== – adds a taint with the NoSchedule effect; new pods will not be scheduled onto the node, ==existing pods are not affected==
kubectl taint nodes node key=value:==NoExecute== – adds a taint with the NoExecute effect; new pods will not be scheduled onto the node, and ==existing pods are evicted==
kubectl taint nodes node key=value:PreferNoSchedule – the scheduler tries to avoid placing new pods on the node, but may still do so if no other node fits

Add a taint

[root@ceshi-130 ~]# kubectl taint nodes ceshi-130.host.com node-role.kubernetes.io/master:NoSchedule

Remove a taint

[root@ceshi-130 ~]# kubectl taint nodes ceshi-130.host.com node-role.kubernetes.io/master-
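To see which taints are currently set on a node, kubectl describe shows a Taints field:

kubectl describe node ceshi-130.host.com | grep -i taints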

Tolerations are declared in the YAML, as in the cadvisor manifest above:

tolerations:            # same level as containers
- key: "key1"           # the taint key this pod can tolerate
  value: "value1"       # the taint value
  effect: "NoExecute"   # the effect to match, see the table above

Three ways to influence Kubernetes scheduling manually:

Taints and tolerations – (1) taint: a taint set on a worker node; (2) toleration: whether the pod can tolerate that taint
nodeName – run the pod on one specific, named node
nodeSelector – run the pod on a class of nodes selected by label (see the example below)
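A quick nodeSelector sketch (the label key and value here are arbitrary examples, not used elsewhere in this setup): label a node, then reference that label from the pod spec.

kubectl label nodes ceshi-131.host.com disktype=ssd      # attach an arbitrary label to the node
kubectl get nodes --show-labels | grep disktype          # confirm the label is present

A pod whose spec contains nodeSelector: {disktype: ssd} will then only be scheduled onto nodes carrying that label.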

blackbox-exporter (probes whether services running in Kubernetes containers are alive)

Pull the image

[root@ceshi-132 ~]# docker pull prom/blackbox-exporter:v0.15.1
[root@ceshi-132 ~]# docker tag 81b70b6158be harbor.od.com/public/blackbox-exporter:v0.15.1
[root@ceshi-132 ~]# docker push harbor.od.com/public/blackbox-exporter:v0.15.1

Create the resource manifests

[root@ceshi-132 ~]# cd /data/k8s-yaml/
[root@ceshi-132 k8s-yaml]# mkdir blackbox-exporter
[root@ceshi-132 blackbox-exporter]# cat cm.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: blackbox-exporter
  name: blackbox-exporter
  namespace: kube-system
data:
  blackbox.yml: |-
    modules:
      http_2xx:
        prober: http
        timeout: 2s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2"]
          valid_status_codes: [200,301,302]
          method: GET
          preferred_ip_protocol: "ip4"
      tcp_connect:
        prober: tcp
        timeout: 2s
[root@ceshi-132 blackbox-exporter]# cat dp.yaml 
kind: Deployment
apiVersion: extensions/v1beta1
metadata:
  name: blackbox-exporter
  namespace: kube-system
  labels:
    app: blackbox-exporter
  annotations:
    deployment.kubernetes.io/revision: 1
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      volumes:
      - name: config
        configMap:
          name: blackbox-exporter
          defaultMode: 420
      containers:
      - name: blackbox-exporter
        image: harbor.od.com/public/blackbox-exporter:v0.15.1
        imagePullPolicy: IfNotPresent
        args:
        - --config.file=/etc/blackbox_exporter/blackbox.yml
        - --log.level=info
        - --web.listen-address=:9115
        ports:
        - name: blackbox-port
          containerPort: 9115
          protocol: TCP
        resources:
          limits:
            cpu: 200m
            memory: 256Mi
          requests:
            cpu: 100m
            memory: 50Mi
        volumeMounts:
        - name: config
          mountPath: /etc/blackbox_exporter
        readinessProbe:
          tcpSocket:
            port: 9115
          initialDelaySeconds: 5
          timeoutSeconds: 5
          periodSeconds: 10
          successThreshold: 1
          failureThreshold: 3
[root@ceshi-132 blackbox-exporter]# cat svc.yaml 
kind: Service
apiVersion: v1
metadata:
  name: blackbox-exporter
  namespace: kube-system
spec:
  selector:
    app: blackbox-exporter
  ports:
  - name: blackbox-port
    protocol: TCP
    port: 9115

The host in the Ingress below (blackbox.od.com) must be resolvable in DNS

[root@ceshi-132 blackbox-exporter]# cat ingress.yaml 
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: blackbox-exporter
  namespace: kube-system
spec:
  rules:
  - host: blackbox.od.com
    http:
      paths:
      - path: /
        backend:
          serviceName: blackbox-exporter
          servicePort: blackbox-port

Deploy

[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/blackbox-exporter/cm.yaml
configmap/blackbox-exporter created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/blackbox-exporter/dp.yaml
deployment.extensions/blackbox-exporter created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/blackbox-exporter/svc.yaml
service/blackbox-exporter created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/blackbox-exporter/ingress.yaml
ingress.extensions/blackbox-exporter created
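Once blackbox.od.com resolves, the probe endpoint can be exercised by hand: blackbox-exporter exposes /probe with target and module parameters. The target below is only an example, probing the exporter's own port through the tcp_connect module defined in the ConfigMap:

curl -s "http://blackbox.od.com/probe?module=tcp_connect&target=127.0.0.1:9115" | grep probe_success

A probe_success value of 1 means the target was reachable.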

prometheus-server

Pull the image

[root@ceshi-132 ~]# docker pull prom/prometheus:v2.14.0
[root@ceshi-132 ~]# docker tag 7317640d555e harbor.od.com/public/prometheus:v2.14.0
[root@ceshi-132 ~]# docker push harbor.od.com/public/prometheus:v2.14.0

Create the resource manifests

[root@ceshi-132 data]# cd /data/k8s-yaml/
[root@ceshi-132 k8s-yaml]# mkdir prometheus
[root@ceshi-132 prometheus]# cat rbac.yaml 
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: prometheus
  namespace: infra
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: infra
[root@ceshi-132 prometheus]# cat dp.yaml 
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  annotations:
    deployment.kubernetes.io/revision: "5"
  labels:
    name: prometheus
  name: prometheus
  namespace: infra
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 7
  selector:
    matchLabels:
      app: prometheus
  strategy:
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      nodeName: ceshi-130.host.com       # pin the pod to this node
      containers:
      - name: prometheus
        image: harbor.od.com/public/prometheus:v2.14.0
        imagePullPolicy: IfNotPresent
        command:
        - /bin/prometheus
        args:
        - --config.file=/data/etc/prometheus.yml
        - --storage.tsdb.path=/data/prom-db
        - --storage.tsdb.min-block-duration=10m
        - --storage.tsdb.retention=72h
        - --web.enable-lifecycle
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: /data
          name: data
        resources:                       # container resource limits
          requests:                      # requested when the container starts
            cpu: "1000m"                 # 1000m (millicores) = 1 CPU core
            memory: "1.5Gi"
          limits:                        # the container is throttled/killed once usage hits these
            cpu: "2000m"                 # CPU usage may not exceed 2 cores
            memory: "3Gi"                # memory may not exceed 3Gi
      imagePullSecrets:
      - name: harbor
      securityContext:
        runAsUser: 0
      serviceAccountName: prometheus
      volumes:
      - name: data
        nfs:
          server: 10.1.133.97
          path: /data/nfsvolume/prometheus

Create the mount directories

[root@ceshi-132 prometheus]# cd /data/nfsvolume/
[root@ceshi-132 nfsvolume]# mkdir prometheus/{etc,prom-db}

Copy the certificates: Prometheus auto-discovers its monitoring targets, so it must interact with the Kubernetes apiserver

[root@ceshi-132 prometheus]# cd etc/
[root@ceshi-132 etc]# cp /opt/certs/ca.pem .
[root@ceshi-132 etc]# cp /opt/certs/client.pem .
[root@ceshi-132 etc]# cp /opt/certs/client-key.pem .

Create the Prometheus configuration file prometheus.yml

[root@ceshi-132 etc]# vi prometheus.yml 
global:
  scrape_interval: 15s
  evaluation_interval: 15s
scrape_configs:
- job_name: 'etcd'
  tls_config:
    ca_file: /data/etc/ca.pem
    cert_file: /data/etc/client.pem
    key_file: /data/etc/client-key.pem
  scheme: https
  static_configs:
  - targets:
    - '10.1.133.93:2379'
    - '10.1.133.95:2379'
    - '10.1.133.96:2379'
- job_name: 'kubernetes-apiservers'
  kubernetes_sd_configs:
  - role: endpoints
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    action: keep
    regex: default;kubernetes;https
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    action: keep
    regex: true
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
    target_label: __address__
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
- job_name: 'kubernetes-kubelet'
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __address__
    replacement: ${1}:10255
- job_name: 'kubernetes-cadvisor'
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __address__
    replacement: ${1}:4194
- job_name: 'kubernetes-kube-state'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
  - source_labels: [__meta_kubernetes_pod_label_grafanak8sapp]
    regex: .*true.*
    action: keep
  - source_labels: ['__meta_kubernetes_pod_label_daemon', '__meta_kubernetes_pod_node_name']
    regex: 'node-exporter;(.*)'
    action: replace
    target_label: nodename
- job_name: 'blackbox_http_pod_probe'
  metrics_path: /probe
  kubernetes_sd_configs:
  - role: pod
  params:
    module: [http_2xx]
  relabel_configs:
  - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
    action: keep
    regex: http
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port, __meta_kubernetes_pod_annotation_blackbox_path]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+);(.+)
    replacement: $1:$2$3
    target_label: __param_target
  - action: replace
    target_label: __address__
    replacement: blackbox-exporter.kube-system:9115
  - source_labels: [__param_target]
    target_label: instance
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
- job_name: 'blackbox_tcp_pod_probe'
  metrics_path: /probe
  kubernetes_sd_configs:
  - role: pod
  params:
    module: [tcp_connect]
  relabel_configs:
  - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
    action: keep
    regex: tcp
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
    target_label: __param_target
  - action: replace
    target_label: __address__
    replacement: blackbox-exporter.kube-system:9115
  - source_labels: [__param_target]
    target_label: instance
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
- job_name: 'traefik'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
    action: keep
    regex: traefik
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
    target_label: __address__
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
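If promtool (shipped with the Prometheus release tarball) is available on the NFS host, the configuration can be sanity-checked before the server loads it — a quick check, assuming the path created above:

promtool check config /data/nfsvolume/prometheus/etc/prometheus.yml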
[root@ceshi-132 prometheus]# cat svc.yaml 
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: infra
spec:
  ports:
  - port: 9090
    protocol: TCP
    targetPort: 9090
  selector:
    app: prometheus
[root@ceshi-132 prometheus]# cat ingress.yaml 
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  annotations:
    kubernetes.io/ingress.class: traefik
  name: prometheus
  namespace: infra
spec:
  rules:
  - host: prometheus.od.com
    http:
      paths:
      - path: /
        backend:
          serviceName: prometheus
          servicePort: 9090

Deploy

[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/prometheus/rbac.yaml
serviceaccount/prometheus created
clusterrole.rbac.authorization.k8s.io/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/prometheus/dp.yaml
deployment.extensions/prometheus created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/prometheus/svc.yaml
service/prometheus created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/prometheus/ingress.yaml
ingress.extensions/prometheus created
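After prometheus.od.com resolves, the discovered targets can be reviewed in the web UI under Status –> Targets, or counted through the HTTP API:

curl -s http://prometheus.od.com/api/v1/targets | grep -o '"health":"up"' | wc -l    # number of targets currently up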

To add traefik as a scrape target, add the annotations below to the traefik pod template (under template, at the same level as labels), then restart the pods

"annotations": {
"prometheus_io_scheme": "traefik",
"prometheus_io_path": "/metrics",
"prometheus_io_port": "8080"
}

blackbox probes service liveness
Prometheus checks whether a business service is alive through blackbox-exporter rather than scraping the service directly

==TCP==

"annotations": {
"blackbox_port": "80",
"blackbox_scheme": "tcp"
}

==HTTP==

"annotations": {
"blackbox_path": "/", (绝对路径后缀)
"blackbox_port": "8080",
"blackbox_scheme": "http"
}

==JVM==

"annotations": {
"prometheus.io.scrape": "true",
"prometheus.io.port": "12346",
"prometheus.io.path": "/"
}


grafana

Pull the image

[root@ceshi-132 ~]# docker pull grafana/grafana:5.4.2
[root@ceshi-132 ~]# docker tag 6f18ddf9e552 harbor.od.com/public/grafana:v5.4.2
[root@ceshi-132 ~]# docker push harbor.od.com/public/grafana:v5.4.2

Create the resource manifests

[root@ceshi-132 k8s-yaml]# mkdir /data/k8s-yaml/grafana/
[root@ceshi-132 grafana]# cat rbac.yaml 
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: grafana
rules:
- apiGroups:
  - "*"
  resources:
  - namespaces
  - deployments
  - pods
  verbs:
  - get
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: grafana
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: grafana
subjects:
- kind: User
  name: k8s-node
[root@ceshi-132 grafana]# cat dp.yaml 
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    app: grafana
    name: grafana
  name: grafana
  namespace: infra
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 7
  selector:
    matchLabels:
      name: grafana
  strategy:
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: grafana
        name: grafana
    spec:
      containers:
      - name: grafana
        image: harbor.od.com/public/grafana:v5.4.2
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 3000
          protocol: TCP
        volumeMounts:
        - mountPath: /var/lib/grafana
          name: data
      imagePullSecrets:
      - name: harbor
      securityContext:
        runAsUser: 0
      volumes:
      - nfs:
          server: 10.1.133.97
          path: /data/nfsvolume/grafana
        name: data

Create the mount directory

[root@ceshi-132 grafana]# mkdir /data/nfsvolume/grafana
[root@ceshi-132 grafana]# cat svc.yaml 
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: infra
spec:
  ports:
  - port: 3000
    protocol: TCP
    targetPort: 3000
  selector:
    app: grafana
[root@ceshi-132 grafana]# cat ingress.yaml 
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: grafana
  namespace: infra
spec:
  rules:
  - host: grafana.od.com        # must be resolvable in DNS
    http:
      paths:
      - path: /
        backend:
          serviceName: grafana
          servicePort: 3000

Deploy

[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/grafana/rbac.yaml
clusterrole.rbac.authorization.k8s.io/grafana created
clusterrolebinding.rbac.authorization.k8s.io/grafana created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/grafana/dp.yaml
deployment.extensions/grafana created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/grafana/svc.yaml
service/grafana created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/grafana/ingress.yaml
ingress.extensions/grafana created

After deployment, the default username/password is admin/admin.
Install the Grafana plugins

[root@ceshi-130 ~]# kubectl exec -it grafana-5c6f59bcb7-9bc44 -n infra bash
root@grafana-5c6f59bcb7-9bc44:/usr/share/grafana# grafana-cli plugins install grafana-kubernetes-app
root@grafana-5c6f59bcb7-9bc44:/usr/share/grafana# grafana-cli plugins install grafana-clock-panel
root@grafana-5c6f59bcb7-9bc44:/usr/share/grafana# grafana-cli plugins install grafana-piechart-panel
root@grafana-5c6f59bcb7-9bc44:/usr/share/grafana# grafana-cli plugins install briangann-gauge-panel
root@grafana-5c6f59bcb7-9bc44:/usr/share/grafana# grafana-cli plugins install natel-discrete-panel
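Before restarting the pod, the installed plugins can be listed from the same shell inside the container to confirm they were downloaded:

grafana-cli plugins ls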

Restart the pod

[root@ceshi-131 ~]# kubectl delete pod grafana-5c6f59bcb7-9bc44 -n infra
pod "grafana-5c6f59bcb7-9bc44" deleted

add data source –> Prometheus

Enable the Kubernetes plugin and add the cluster:
plugins –> kubernetes –> enable –> new cluster


alertmanager (alerting component)

Pull the image

[root@ceshi-132 ~]# docker pull docker.io/prom/alertmanager:v0.14.0
[root@ceshi-132 ~]# docker tag 30594e96cbe8 harbor.od.com/public/alertmanager:v0.14.0
[root@ceshi-132 ~]# docker push harbor.od.com/public/alertmanager:v0.14.0

Create the resource manifests

[root@ceshi-132 alertmanager]# cat cm.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: infra
data:
  config.yml: |-
    global:
      # how long to wait with no further notifications before declaring an alert resolved
      resolve_timeout: 5m
      # email (SMTP) settings
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: 'xxx@163.com'
      smtp_auth_username: 'xxxx@163.com'
      smtp_auth_password: 'xxxx'
      smtp_require_tls: false
    # the root route that every incoming alert enters; it defines how alerts are dispatched
    route:
      # labels used to regroup incoming alerts; e.g. alerts that share cluster=A and
      # alertname=LatencyHigh are aggregated into a single group
      group_by: ['alertname', 'cluster']
      # after a new alert group is created, wait group_wait before sending the first
      # notification, so that several alerts for the same group can be sent together
      group_wait: 30s

      # after the first notification, wait group_interval before sending notifications
      # for new alerts added to the group
      group_interval: 5m

      # if a notification has already been sent successfully, wait repeat_interval
      # before re-sending it
      repeat_interval: 5m

      # default receiver: used for any alert that does not match another route
      receiver: default

    receivers:
    - name: 'default'
      email_configs:
      - to: 'xxx.com'
        send_resolved: true
[root@ceshi-132 alertmanager]# cat dp.yaml 
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: alertmanager
  namespace: infra
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
      - name: alertmanager
        image: harbor.od.com/public/alertmanager:v0.14.0
        args:
        - "--config.file=/etc/alertmanager/config.yml"
        - "--storage.path=/alertmanager"
        ports:
        - name: alertmanager
          containerPort: 9093
        volumeMounts:
        - name: alertmanager-cm
          mountPath: /etc/alertmanager
      volumes:
      - name: alertmanager-cm
        configMap:
          name: alertmanager-config
      imagePullSecrets:
      - name: harbor
[root@ceshi-132 alertmanager]# cat svc.yaml 
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: infra
spec:
  selector:
    app: alertmanager
  ports:
  - port: 80
    targetPort: 9093

Deploy

[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/dubbo-demo-service/dp.yaml
deployment.extensions/dubbo-demo-service created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/alertmanager/cm.yaml
configmap/alertmanager-config created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/alertmanager/dp.yaml
deployment.extensions/alertmanager created
[root@ceshi-130 ~]# kubectl apply -f http://k8s-yaml.od.com/alertmanager/svc.yaml
service/alertmanager created
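To confirm Alertmanager accepts alerts before Prometheus is pointed at it, a hand-made test alert can be posted to its v1 API from inside the cluster (the alert labels below are made up purely for the test; alertmanager.infra is the Service created above):

curl -s -XPOST -H 'Content-Type: application/json' \
  http://alertmanager.infra/api/v1/alerts \
  -d '[{"labels":{"alertname":"TestAlert","severity":"warning"}}]'

The test alert should then appear in the Alertmanager UI on port 9093 and, after group_wait, trigger an email to the default receiver.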

Alert rules

[root@ceshi-132 etc]# vi /data/nfsvolume/prometheus/etc/rules.yml
groups:
- name: hostStatsAlert
  rules:
  - alert: hostCpuUsageAlert
    expr: sum(avg without (cpu)(irate(node_cpu{mode!='idle'}[5m]))) by (instance) > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }}%)"
  - alert: hostMemUsageAlert
    expr: (node_memory_MemTotal - node_memory_MemAvailable)/node_memory_MemTotal > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }}%)"
  - alert: OutOfInodes
    expr: node_filesystem_free{fstype="overlay",mountpoint ="/"} / node_filesystem_size{fstype="overlay",mountpoint ="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of inodes (instance {{ $labels.instance }})"
      description: "Disk is almost running out of available inodes (< 10% left) (current value: {{ $value }})"
  - alert: OutOfDiskSpace
    expr: node_filesystem_free{fstype="overlay",mountpoint ="/rootfs"} / node_filesystem_size{fstype="overlay",mountpoint ="/rootfs"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left) (current value: {{ $value }})"
  - alert: UnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s) (current value: {{ $value }})"
  - alert: UnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s) (current value: {{ $value }})"
  - alert: UnusualDiskReadRate
    expr: sum by (instance) (irate(node_disk_bytes_read[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read rate (instance {{ $labels.instance }})"
      description: "Disk is probably reading too much data (> 50 MB/s) (current value: {{ $value }})"
  - alert: UnusualDiskWriteRate
    expr: sum by (instance) (irate(node_disk_bytes_written[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write rate (instance {{ $labels.instance }})"
      description: "Disk is probably writing too much data (> 50 MB/s) (current value: {{ $value }})"
  - alert: UnusualDiskReadLatency
    expr: rate(node_disk_read_time_ms[1m]) / rate(node_disk_reads_completed[1m]) > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (read operations > 100ms) (current value: {{ $value }})"
  - alert: UnusualDiskWriteLatency
    expr: rate(node_disk_write_time_ms[1m]) / rate(node_disk_writes_completed[1m]) > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (write operations > 100ms) (current value: {{ $value }})"
- name: http_status
  rules:
  - alert: ProbeFailed
    expr: probe_success == 0
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "Probe failed (instance {{ $labels.instance }})"
      description: "Probe failed (current value: {{ $value }})"
  - alert: StatusCode
    expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "Status Code (instance {{ $labels.instance }})"
      description: "HTTP status code is not 200-399 (current value: {{ $value }})"
  - alert: SslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "SSL certificate will expire soon (instance {{ $labels.instance }})"
      description: "SSL certificate expires in 30 days (current value: {{ $value }})"
  - alert: SslCertificateHasExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "SSL certificate has expired (instance {{ $labels.instance }})"
      description: "SSL certificate has expired already (current value: {{ $value }})"
  - alert: BlackboxSlowPing
    expr: probe_icmp_duration_seconds > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow ping (instance {{ $labels.instance }})"
      description: "Blackbox ping took more than 2s (current value: {{ $value }})"
  - alert: BlackboxSlowRequests
    expr: probe_http_duration_seconds > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow requests (instance {{ $labels.instance }})"
      description: "Blackbox request took more than 2s (current value: {{ $value }})"
  - alert: PodCpuUsagePercent
    expr: sum(sum(label_replace(irate(container_cpu_usage_seconds_total[1m]),"pod","$1","container_label_io_kubernetes_pod_name", "(.*)"))by(pod) / on(pod) group_right kube_pod_container_resource_limits_cpu_cores *100 )by(container,namespace,node,pod,severity) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Pod cpu usage percent has exceeded 80% (current value: {{ $value }}%)"

Add the alerting configuration to prometheus.yml, then reload Prometheus

[root@ceshi-132 etc]# vi /data/nfsvolume/prometheus/etc/prometheus.yml
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager"]
rule_files:
- "/data/etc/rules.yml"

Reload the configuration gracefully: some services are very large, and a full stop-and-restart can drag down the whole cluster

[root@ceshi-130 ~]# kill -SIGHUP 113990
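Because the Deployment starts Prometheus with --web.enable-lifecycle, an alternative to sending SIGHUP is to trigger the reload over HTTP, using the Ingress host defined above:

curl -X POST http://prometheus.od.com/-/reload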
