# PrometheusRule template (Jinja2) defining container-level and PostgreSQL-level
# alerts for one cluster.
# Template variables used: ansible_operator_meta.name,
# ansible_operator_meta.namespace, item.name and item.instances.
#
# Annotation texts that must reach Prometheus with their own double-brace
# $labels placeholders intact are wrapped in a Jinja string literal, so Jinja
# emits them verbatim instead of trying to evaluate them.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: "{{ ansible_operator_meta.name }}-{{ item.name }}-alertrules"
  # Quoted so that a namespace that looks like a number, boolean or null
  # after rendering is still parsed as a string.
  namespace: "{{ ansible_operator_meta.namespace }}"
  labels:
    app: prometheus-postgres-exporter-alertrules
    name: "{{ ansible_operator_meta.name }}-{{ item.name }}-alertrules"
spec:
  groups:
    # Alerts on the fep-patroni container and on the cluster's PVCs.
    - name: fep-container
      rules:
        # Fires when the container has not been seen for more than 60 seconds.
        - alert: ContainerDisappeared
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been disappeared' }}"
            summary: Container Pod disappeared.
          expr: >-
            time() -
            container_last_seen{
            container="fep-patroni",
            namespace="{{ ansible_operator_meta.namespace }}",
            pod=~"^{{ item.name }}-sts-.*"
            } > 60
          labels:
            severity: warning
        # CPU usage as a percentage of the CPU limit, above 80 for 5 minutes.
        # NOTE(review): kube_pod_container_resource_limits_cpu_cores was removed
        # in kube-state-metrics v2 (replaced by kube_pod_container_resource_limits
        # with resource="cpu"); confirm which version the target clusters run.
        - alert: ContainerHighCPUUsage
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been high on CPU usage(>80%) for 5 mins' }}"
            summary: High Container CPU usage.
          expr: >-
            (sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"{{ item.name }}-sts.*",
            namespace="{{ ansible_operator_meta.namespace }}",
            container="fep-patroni"})
            by (pod,namespace,container)/sum(kube_pod_container_resource_limits_cpu_cores)
            by (pod,namespace,container))*100 > 80
          for: 5m
          labels:
            severity: warning
        # Working-set memory as a percentage of the memory limit, above 80 for
        # 30 minutes.
        - alert: ContainerHighRAMUsage
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been high on RAM usage(>80%) since 30 mins' }}"
            summary: High container memory usage.
          expr: >-
            sum(container_memory_working_set_bytes{pod=~"{{ item.name }}-sts.*",
            namespace="{{ ansible_operator_meta.namespace }}",
            container="fep-patroni"}
            / container_spec_memory_limit_bytes * 100)
            by (pod, container, instance) > 80
          for: 30m
          labels:
            severity: warning
        # Available PVC space below 10 percent of capacity for 5 minutes.
        - alert: PVCLowDiskSpace
          annotations:
            description: "{{ 'Found low disk space on {{$labels.persistentvolumeclaim}} in {{$labels.namespace}} namespace.' }}"
            summary: "{{ 'Found low disk space on {{$labels.persistentvolumeclaim}} in {{$labels.namespace}} namespace.' }}"
          expr: >-
            kubelet_volume_stats_available_bytes{namespace="{{ ansible_operator_meta.namespace }}",
            persistentvolumeclaim=~"fep.*{{ item.name }}.*"}/
            (kubelet_volume_stats_capacity_bytes) * 100 < 10
          for: 5m
          labels:
            severity: warning
    # Alerts computed from the postgres exporter metrics of this cluster.
    - name: postgres
      rules:
        # Fires when fewer pg_static series are present than the number of
        # configured instances. count() over an empty selection returns no
        # sample at all, so "or vector(0)" supplies an explicit 0; without it
        # the alert would stay silent when every instance is down.
        - alert: PostgresqlDown
          annotations:
            description: "Postgresql one or more instances are down in FEPCluster {{ item.name }} in {{ ansible_operator_meta.namespace }} namespace. Please check the FEP pods in this cluster"
            summary: "Postgresql FEPCluster {{ item.name }} in {{ ansible_operator_meta.namespace }} namespace is degraded"
          expr: >-
            (count(pg_static{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{item.name}}-sts.*"
            }) or vector(0)) < {{item.instances | length}}
          labels:
            severity: error
        # Connections in use exceed 90 percent of max_connections.
        - alert: PostgresqlTooManyConnections
          annotations:
            description: "{{ 'PostgreSQL instance has too many connections on server {{ $labels.server }} in {{ $labels.namespace }} namespace.' }}"
            summary: "{{ 'Postgresql too many connections (FEPCluster server {{ $labels.server }})' }}"
          expr: >-
            pg_capacity_connection_total{namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*"}/pg_settings_max_connections > 0.9
          labels:
            severity: warning
        # The alert name is misspelled ("Expierd") but is kept unchanged because
        # Alertmanager routes or silences may match on it.
        # NOTE(review): the threshold is "< 8" while the text says "less than
        # 7 days"; confirm which one is intended.
        - alert: PostgresqlRolePasswordCloseExpierd
          annotations:
            description: "The Postgresql role's password expires in less than 7 days. Please update the password."
            summary: "Postgresql Role Password expires in less than 7 days."
          expr: >-
            count(pg_password_valid_days{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*",
            rolname=~".*"
            } < 8) > 0
          labels:
            severity: warning
        # At least one role reports a negative remaining password validity.
        - alert: PostgresqlRolePasswordExpired
          annotations:
            description: "The Postgresql role's password has already expired. Please update the password."
            summary: "Postgresql Role Password has already expired. "
          expr: >-
            count(pg_password_valid_seconds{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*",
            rolname=~".*"
            } < 0) > 0
          labels:
            severity: warning
        # grace_time_roles is above zero.
        - alert: PasswordIsGraceTimeByUserProfile
          annotations:
            description: The password for the role in the grace time exists. Please change your password.
            summary: There is a password in the grace time
          expr: >-
            grace_time_roles{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*"
            } > 0
          labels:
            severity: warning
        # expired_roles is above zero.
        - alert: PasswordExpiredByUserProfile
          annotations:
            description: Expired role password exists. Please change your password.
            summary: Expired role password exists
          expr: >-
            expired_roles{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*"
            } > 0
          labels:
            severity: warning
        # locked_roles is above zero.
        - alert: PasswordLockedByUserProfile
          annotations:
            description: There is a role with a password lock. Please confirm the role.
            summary: Password locked role exists
          expr: >-
            locked_roles{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*"
            } > 0
          labels:
            severity: warning
        # Transaction ID usage has stayed above autovacuum_freeze_max_age for
        # 24 hours.
        - alert: PostgresqlTooManyTxidUsage
          annotations:
            description: "Transaction ID usage has exceeded the value of autovacuum_freeze_max_age for more than 24 hours. Consider periodic aggressive vacuuming."
            summary: "Transaction ID usage exceeds autovacuum_freeze_max_age"
          expr: >-
            pg_txid_usage{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*"
            } > pg_settings_autovacuum_freeze_max_age
          for: 24h
          labels:
            severity: warning