-
一、容器化部署1.1 Docker镜像构建Dockerfiledockerfile# 多阶段构建优化# 第一阶段:构建应用FROM maven:3.8.4-openjdk-17 AS builderWORKDIR /app# 复制依赖文件COPY pom.xml .COPY mvnw .COPY .mvn .mvn# 下载依赖(利用缓存层)RUN mvn dependency:go-offline -B# 复制源代码COPY src src# 构建应用RUN mvn clean package -DskipTests -DfinalName=sso-service# 第二阶段:运行环境FROM eclipse-temurin:17-jre-alpine# 安装必要的工具RUN apk add --no-cache tzdata curl bash && \ cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ echo "Asia/Shanghai" > /etc/timezone# 创建应用用户RUN addgroup -S appgroup && \ adduser -S appuser -G appgroup && \ mkdir -p /app && \ chown -R appuser:appgroup /appWORKDIR /app# 从构建阶段复制jar文件COPY --from=builder /app/target/sso-service.jar /app/app.jarCOPY --from=builder /app/target/classes/application.yml /app/config/application.ymlCOPY --from=builder /app/src/main/resources/logback-spring.xml /app/config/# 健康检查脚本COPY docker/healthcheck.sh /app/healthcheck.shRUN chmod +x /app/healthcheck.sh# 切换用户USER appuser# 暴露端口EXPOSE 8080# 健康检查HEALTHCHECK --interval=30s --timeout=3s --start-period=60s --retries=3 \ CMD curl -f http://localhost:8080/actuator/health || exit 1# 启动应用ENTRYPOINT ["java", \ "-Djava.security.egd=file:/dev/./urandom", \ "-Dspring.profiles.active=${SPRING_PROFILES_ACTIVE:-prod}", \ "-Dspring.config.location=file:/app/config/", \ "-jar", "/app/app.jar"]docker-compose.ymlyamlversion: '3.8'services: sso-service: build: context: . dockerfile: Dockerfile image: sso-service:${TAG:-latest} container_name: sso-service restart: unless-stopped environment: - SPRING_PROFILES_ACTIVE=prod - JAVA_OPTS=-Xmx1g -Xms512m - TZ=Asia/Shanghai env_file: - .env volumes: - ./logs:/app/logs - ./config:/app/config:ro - sso-data:/app/data ports: - "8080:8080" networks: - sso-network depends_on: - redis - postgres healthcheck: test: ["CMD", "/app/healthcheck.sh"] interval: 30s timeout: 10s retries: 3 start_period: 40s redis: image: redis:7-alpine container_name: sso-redis restart: unless-stopped command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD} volumes: - redis-data:/data ports: - "6379:6379" networks: - sso-network healthcheck: test: ["CMD", "redis-cli", "ping"] interval: 10s timeout: 5s retries: 3 postgres: image: postgres:15-alpine container_name: sso-postgres restart: unless-stopped environment: POSTGRES_DB: ${DB_NAME} POSTGRES_USER: ${DB_USER} POSTGRES_PASSWORD: ${DB_PASSWORD} volumes: - postgres-data:/var/lib/postgresql/data - ./initdb:/docker-entrypoint-initdb.d:ro ports: - "5432:5432" networks: - sso-network healthcheck: test: ["CMD-SHELL", "pg_isready -U ${DB_USER}"] interval: 10s timeout: 5s retries: 3 nginx: image: nginx:1.23-alpine container_name: sso-nginx restart: unless-stopped volumes: - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro - ./nginx/conf.d:/etc/nginx/conf.d:ro - ./ssl:/etc/nginx/ssl:ro - ./logs/nginx:/var/log/nginx ports: - "80:80" - "443:443" networks: - sso-network depends_on: - sso-servicenetworks: sso-network: driver: bridgevolumes: redis-data: driver: local postgres-data: driver: local sso-data: driver: localNginx配置nginx# nginx/nginx.confuser nginx;worker_processes auto;error_log /var/log/nginx/error.log warn;pid /var/run/nginx.pid;events { worker_connections 1024; use epoll; multi_accept on;}http { include /etc/nginx/mime.types; default_type application/octet-stream; log_format main '$remote_addr - $remote_user [$time_local] "$request" ' '$status $body_bytes_sent "$http_referer" ' '"$http_user_agent" "$http_x_forwarded_for"'; access_log /var/log/nginx/access.log main; # 优化参数 sendfile on; tcp_nopush on; tcp_nodelay on; 
keepalive_timeout 65; types_hash_max_size 2048; client_max_body_size 10m; # Gzip压缩 gzip on; gzip_vary on; gzip_min_length 1024; gzip_types text/plain text/css text/xml text/javascript application/json application/javascript application/xml+rss; # 安全头 add_header X-Frame-Options "SAMEORIGIN" always; add_header X-Content-Type-Options "nosniff" always; add_header X-XSS-Protection "1; mode=block" always; add_header Referrer-Policy "strict-origin-when-cross-origin" always; # SSL配置 ssl_protocols TLSv1.2 TLSv1.3; ssl_ciphers ECDHE-RSA-AES256-GCM-SHA512:DHE-RSA-AES256-GCM-SHA512; ssl_prefer_server_ciphers off; ssl_session_cache shared:SSL:10m; ssl_session_timeout 10m; include /etc/nginx/conf.d/*.conf;}nginx# nginx/conf.d/sso.confupstream sso_backend { least_conn; server sso-service:8080 max_fails=3 fail_timeout=30s; # 会话保持(如果需要) # sticky cookie sso_session expires=1h domain=.example.com path=/;}server { listen 80; server_name sso.example.com; # 重定向到HTTPS return 301 https://$server_name$request_uri;}server { listen 443 ssl http2; server_name sso.example.com; # SSL证书 ssl_certificate /etc/nginx/ssl/sso.example.com.crt; ssl_certificate_key /etc/nginx/ssl/sso.example.com.key; # SSL优化 ssl_session_timeout 1d; ssl_session_cache shared:MozSSL:10m; ssl_session_tickets off; # OCSP Stapling ssl_stapling on; ssl_stapling_verify on; # HSTS add_header Strict-Transport-Security "max-age=63072000" always; # 安全头 add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:;" always; add_header Permissions-Policy "geolocation=(), microphone=(), camera=()" always; # 访问日志 access_log /var/log/nginx/sso.access.log main buffer=32k flush=5s; location / { proxy_pass http://sso_backend; proxy_http_version 1.1; # 代理头设置 proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; proxy_set_header X-Forwarded-Host $host; proxy_set_header X-Forwarded-Port $server_port; # 超时设置 proxy_connect_timeout 30s; proxy_send_timeout 60s; proxy_read_timeout 60s; # 缓冲区设置 proxy_buffer_size 4k; proxy_buffers 8 4k; proxy_busy_buffers_size 8k; # WebSocket支持 proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; } # 健康检查端点 location /health { access_log off; proxy_pass http://sso_backend/actuator/health; proxy_set_header Host $host; } # 静态资源缓存 location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { expires 1y; add_header Cache-Control "public, immutable"; proxy_pass http://sso_backend; } # 阻止敏感文件访问 location ~ /\. 
{ deny all; access_log off; log_not_found off; } location ~ /(\.git|\.svn|\.env|\.htaccess|\.htpasswd) { deny all; access_log off; log_not_found off; }}二、Kubernetes部署配置2.1 部署文件namespace.yamlyamlapiVersion: v1kind: Namespacemetadata: name: sso-system labels: name: sso-system environment: productionconfigmap.yamlyamlapiVersion: v1kind: ConfigMapmetadata: name: sso-config namespace: sso-systemdata: application.yml: | spring: application: name: sso-service profiles: active: prod datasource: url: jdbc:postgresql://${DB_HOST:postgres-svc}:${DB_PORT:5432}/${DB_NAME:sso_db} username: ${DB_USER} password: ${DB_PASSWORD} hikari: maximum-pool-size: 10 minimum-idle: 5 connection-timeout: 30000 idle-timeout: 600000 max-lifetime: 1800000 redis: host: ${REDIS_HOST:redis-svc} port: ${REDIS_PORT:6379} password: ${REDIS_PASSWORD} timeout: 2000ms lettuce: pool: max-active: 8 max-idle: 8 min-idle: 0 security: oauth2: jwt: secret: ${JWT_SECRET} logging: level: com.example.sso: DEBUG org.springframework.security: INFO file: name: /app/logs/sso-service.log max-size: 10MB max-history: 30 server: port: 8080 servlet: context-path: / compression: enabled: true mime-types: application/json,application/xml,text/html,text/xml,text/plain ssl: enabled: false management: endpoints: web: exposure: include: health,info,metrics,prometheus endpoint: health: show-details: always probes: enabled: true health: livenessState: enabled: true readinessState: enabled: true sso: token: expiration: 3600 refresh-expiration: 86400 security: max-login-attempts: 5 lock-duration-minutes: 15secret.yamlyamlapiVersion: v1kind: Secretmetadata: name: sso-secrets namespace: sso-systemtype: OpaquestringData: db-password: "${DB_PASSWORD}" redis-password: "${REDIS_PASSWORD}" jwt-secret: "${JWT_SECRET}" admin-password: "${ADMIN_PASSWORD}"deployment.yamlyamlapiVersion: apps/v1kind: Deploymentmetadata: name: sso-service namespace: sso-system labels: app: sso-service version: v1.0.0spec: replicas: 3 revisionHistoryLimit: 3 strategy: type: RollingUpdate rollingUpdate: maxSurge: 1 maxUnavailable: 0 selector: matchLabels: app: sso-service template: metadata: labels: app: sso-service version: v1.0.0 annotations: prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/actuator/prometheus" spec: serviceAccountName: sso-service-account securityContext: runAsUser: 1000 runAsGroup: 1000 fsGroup: 1000 containers: - name: sso-service image: sso-service:${IMAGE_TAG} imagePullPolicy: IfNotPresent ports: - containerPort: 8080 name: http protocol: TCP env: - name: DB_HOST value: "postgres-svc" - name: DB_PORT value: "5432" - name: DB_NAME value: "sso_db" - name: DB_USER value: "sso_user" - name: REDIS_HOST value: "redis-svc" - name: REDIS_PORT value: "6379" envFrom: - secretRef: name: sso-secrets resources: requests: memory: "512Mi" cpu: "250m" limits: memory: "1Gi" cpu: "500m" volumeMounts: - name: config-volume mountPath: /app/config readOnly: true - name: logs-volume mountPath: /app/logs - name: tmp-volume mountPath: /tmp livenessProbe: httpGet: path: /actuator/health/liveness port: 8080 scheme: HTTP initialDelaySeconds: 60 periodSeconds: 10 timeoutSeconds: 5 successThreshold: 1 failureThreshold: 3 readinessProbe: httpGet: path: /actuator/health/readiness port: 8080 scheme: HTTP initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 successThreshold: 1 failureThreshold: 3 startupProbe: httpGet: path: /actuator/health/readiness port: 8080 initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 30 lifecycle: preStop: 
exec: command: ["sh", "-c", "sleep 30"] volumes: - name: config-volume configMap: name: sso-config - name: logs-volume emptyDir: {} - name: tmp-volume emptyDir: {} affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 100 podAffinityTerm: labelSelector: matchExpressions: - key: app operator: In values: - sso-service topologyKey: kubernetes.io/hostname tolerations: - key: "node.kubernetes.io/unreachable" operator: "Exists" effect: "NoExecute" tolerationSeconds: 600 - key: "node.kubernetes.io/not-ready" operator: "Exists" effect: "NoExecute" tolerationSeconds: 600service.yamlyamlapiVersion: v1kind: Servicemetadata: name: sso-service namespace: sso-system labels: app: sso-service annotations: prometheus.io/scrape: "true" prometheus.io/port: "8080"spec: selector: app: sso-service ports: - port: 80 targetPort: 8080 protocol: TCP name: http - port: 8080 targetPort: 8080 protocol: TCP name: metrics type: ClusterIP sessionAffinity: ClientIP sessionAffinityConfig: clientIP: timeoutSeconds: 10800ingress.yamlyamlapiVersion: networking.k8s.io/v1kind: Ingressmetadata: name: sso-ingress namespace: sso-system annotations: nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" nginx.ingress.kubernetes.io/rewrite-target: / nginx.ingress.kubernetes.io/affinity: "cookie" nginx.ingress.kubernetes.io/session-cookie-name: "sso-session" nginx.ingress.kubernetes.io/session-cookie-expires: "172800" nginx.ingress.kubernetes.io/session-cookie-max-age: "172800" nginx.ingress.kubernetes.io/proxy-body-size: "10m" nginx.ingress.kubernetes.io/proxy-connect-timeout: "30" nginx.ingress.kubernetes.io/proxy-send-timeout: "60" nginx.ingress.kubernetes.io/proxy-read-timeout: "60" nginx.ingress.kubernetes.io/enable-cors: "true" nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS" nginx.ingress.kubernetes.io/cors-allow-headers: "*" cert-manager.io/cluster-issuer: "letsencrypt-prod"spec: ingressClassName: nginx tls: - hosts: - sso.example.com secretName: sso-tls-secret rules: - host: sso.example.com http: paths: - path: / pathType: Prefix backend: service: name: sso-service port: number: 80hpa.yamlyamlapiVersion: autoscaling/v2kind: HorizontalPodAutoscalermetadata: name: sso-hpa namespace: sso-systemspec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: sso-service minReplicas: 3 maxReplicas: 10 metrics: - type: Resource resource: name: cpu target: type: Utilization averageUtilization: 70 - type: Resource resource: name: memory target: type: Utilization averageUtilization: 80 - type: Pods pods: metric: name: custom_metric_qps target: type: AverageValue averageValue: 1000 behavior: scaleDown: stabilizationWindowSeconds: 300 policies: - type: Percent value: 10 periodSeconds: 60 - type: Pods value: 1 periodSeconds: 60 selectPolicy: Min scaleUp: stabilizationWindowSeconds: 60 policies: - type: Percent value: 20 periodSeconds: 60 - type: Pods value: 2 periodSeconds: 60 selectPolicy: Max三、CI/CD流水线3.1 Jenkinsfilegroovypipeline { agent any environment { DOCKER_REGISTRY = 'registry.example.com' K8S_NAMESPACE = 'sso-system' VERSION = "${env.BUILD_ID}" IMAGE_NAME = "${DOCKER_REGISTRY}/sso-service:${VERSION}" } options { buildDiscarder(logRotator(numToKeepStr: '10')) timeout(time: 30, unit: 'MINUTES') disableConcurrentBuilds() } stages { stage('代码检查') { steps { script { // SonarQube代码扫描 withSonarQubeEnv('sonar-server') { sh 'mvn sonar:sonar -Dsonar.projectKey=sso-service' } // 单元测试 sh 'mvn test' // 集成测试 sh 'mvn verify 
-Pintegration-test' } } post { success { echo '代码检查通过' } failure { error '代码检查失败' } } } stage('构建镜像') { steps { script { // 构建Docker镜像 sh """ docker build \ --build-arg VERSION=${VERSION} \ -t ${IMAGE_NAME} \ -f Dockerfile . """ // 扫描镜像安全漏洞 sh "trivy image --severity HIGH,CRITICAL ${IMAGE_NAME}" // 推送镜像到仓库 withCredentials([usernamePassword( credentialsId: 'docker-registry-creds', usernameVariable: 'DOCKER_USER', passwordVariable: 'DOCKER_PASS' )]) { sh """ docker login ${DOCKER_REGISTRY} \ -u ${DOCKER_USER} \ -p ${DOCKER_PASS} docker push ${IMAGE_NAME} """ } } } } stage('部署到测试环境') { when { branch 'develop' } steps { script { // 更新K8S部署文件 sh """ sed -i 's|image:.*|image: ${IMAGE_NAME}|' k8s/deployment.yaml """ // 部署到测试集群 withKubeConfig([ credentialsId: 'k8s-test-cluster', serverUrl: 'https://k8s-test.example.com' ]) { sh """ kubectl apply -f k8s/ -n test kubectl rollout status deployment/sso-service -n test --timeout=300s """ } // 运行自动化测试 sh "mvn test -Psmoke-test -Dbase.url=https://sso-test.example.com" } } } stage('部署到生产环境') { when { branch 'main' } steps { input message: '确认部署到生产环境?', ok: '确认部署' script { // 备份当前版本 sh """ kubectl get deployment sso-service -n ${K8S_NAMESPACE} -o yaml > backup/deployment-backup-${VERSION}.yaml """ // 金丝雀发布 sh """ kubectl set image deployment/sso-service sso-service=${IMAGE_NAME} -n ${K8S_NAMESPACE} kubectl rollout pause deployment/sso-service -n ${K8S_NAMESPACE} # 先更新一个Pod kubectl patch deployment sso-service -n ${K8S_NAMESPACE} \ -p '{"spec":{"replicas": 1}}' # 等待新Pod就绪 kubectl rollout status deployment/sso-service -n ${K8S_NAMESPACE} --timeout=120s # 运行金丝雀测试 ./scripts/canary-test.sh https://sso.example.com # 继续滚动更新 kubectl rollout resume deployment/sso-service -n ${K8S_NAMESPACE} kubectl rollout status deployment/sso-service -n ${K8S_NAMESPACE} --timeout=300s """ } } } } post { success { script { // 发送成功通知 emailext( subject: "SSO服务部署成功 - ${env.JOB_NAME} #${env.BUILD_NUMBER}", body: "部署版本: ${VERSION}\n构建链接: ${env.BUILD_URL}", to: 'devops@example.com' ) } } failure { script { // 发送失败通知 emailext( subject: "SSO服务部署失败 - ${env.JOB_NAME} #${env.BUILD_NUMBER}", body: "请检查构建日志: ${env.BUILD_URL}", to: 'devops@example.com', attachLog: true ) } } cleanup { // 清理工作空间 cleanWs() } }}3.2 GitLab CI配置yaml# .gitlab-ci.ymlstages: - test - build - security-scan - deploy-test - deploy-prodvariables: DOCKER_IMAGE: registry.example.com/sso-service:$CI_COMMIT_SHORT_SHA K8S_NAMESPACE: sso-system# 镜像构建规则.build_rules: &build_rules rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' changes: - Dockerfile - pom.xml - src/**/* - if: '$CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "develop"'unit-test: stage: test image: maven:3.8.4-openjdk-17 script: - mvn clean test -DskipITs artifacts: reports: junit: target/surefire-reports/TEST-*.xml cache: key: "${CI_COMMIT_REF_SLUG}" paths: - .m2/repositoryintegration-test: stage: test image: maven:3.8.4-openjdk-17 services: - postgres:15-alpine - redis:7-alpine variables: POSTGRES_DB: sso_test POSTGRES_USER: sso_user POSTGRES_PASSWORD: test_password REDIS_PASSWORD: test_password script: - mvn verify -Pintegration-test artifacts: reports: junit: target/failsafe-reports/TEST-*.xmlbuild: stage: build image: docker:20.10 services: - docker:20.10-dind rules: *build_rules script: - docker build -t $DOCKER_IMAGE . 
- docker push $DOCKER_IMAGEsonar-scan: stage: security-scan image: maven:3.8.4-openjdk-17 variables: SONAR_USER_HOME: "${CI_PROJECT_DIR}/.sonar" cache: key: "${CI_COMMIT_REF_SLUG}" paths: - .sonar/cache script: - mvn sonar:sonar -Dsonar.projectKey=sso-service rules: - if: '$CI_COMMIT_BRANCH == "main"'trivy-scan: stage: security-scan image: aquasec/trivy:latest script: - trivy image --severity HIGH,CRITICAL --exit-code 1 $DOCKER_IMAGE - trivy image --format template --template "@contrib/gitlab.tpl" --output "gl-dependency-scanning-report.json" $DOCKER_IMAGE artifacts: reports: dependency_scanning: gl-dependency-scanning-report.jsondeploy-test: stage: deploy-test image: bitnami/kubectl:latest rules: - if: '$CI_COMMIT_BRANCH == "develop"' script: - kubectl config use-context test-cluster - kubectl set image deployment/sso-service sso-service=$DOCKER_IMAGE -n $K8S_NAMESPACE - kubectl rollout status deployment/sso-service -n $K8S_NAMESPACE --timeout=300s - ./scripts/run-smoke-tests.sh https://sso-test.example.comdeploy-prod: stage: deploy-prod image: bitnami/kubectl:latest rules: - if: '$CI_COMMIT_BRANCH == "main"' when: manual script: - kubectl config use-context prod-cluster - | # 蓝绿部署策略 CURRENT_VERSION=$(kubectl get deployment sso-service -n $K8S_NAMESPACE -o jsonpath='{.spec.template.spec.containers[0].image}') # 创建新版本部署 kubectl apply -f k8s/blue-green/green-deployment.yaml sed -i "s|IMAGE_PLACEHOLDER|$DOCKER_IMAGE|" k8s/blue-green/green-deployment.yaml kubectl apply -f k8s/blue-green/green-deployment.yaml # 等待新版本就绪 kubectl rollout status deployment/sso-service-green -n $K8S_NAMESPACE --timeout=300s # 切换流量 kubectl apply -f k8s/blue-green/green-service.yaml # 验证新版本 ./scripts/validate-deployment.sh https://sso.example.com # 清理旧版本 kubectl delete deployment sso-service-blue -n $K8S_NAMESPACE environment: name: production url: https://sso.example.com四、配置管理4.1 配置中心集成java@SpringBootApplication@EnableConfigServerpublic class ConfigServerApplication { public static void main(String[] args) { SpringApplication.run(ConfigServerApplication.class, args); }}@Configurationpublic class ConfigClientConfig { @Bean public ConfigServicePropertySourceLocator configServicePropertySourceLocator( ConfigClientProperties properties) { ConfigServicePropertySourceLocator locator = new ConfigServicePropertySourceLocator(properties); return locator; }}application.yml (配置客户端)yamlspring: application: name: sso-service cloud: config: uri: http://config-server:8888 fail-fast: true retry: initial-interval: 1000 max-interval: 2000 max-attempts: 6 name: ${spring.application.name} profile: ${spring.profiles.active} label: ${spring.cloud.config.label:main} management: endpoints: web: exposure: include: refresh,configprops配置自动刷新java@RestController@RefreshScopepublic class ConfigRefreshController { @Value("${sso.token.expiration}") private Integer tokenExpiration; @Value("${sso.security.max-login-attempts}") private Integer maxLoginAttempts; @PostMapping("/refresh-config") public ResponseEntity<?> refreshConfig() { // 配置已通过@RefreshScope自动刷新 Map<String, Object> config = new HashMap<>(); config.put("token_expiration", tokenExpiration); config.put("max_login_attempts", maxLoginAttempts); config.put("refresh_time", Instant.now()); return ResponseEntity.ok(config); }}@Componentpublic class ConfigChangeListener { private static final Logger log = LoggerFactory.getLogger(ConfigChangeListener.class); @EventListener public void handleRefreshScopeRefreshed(ContextRefreshedEvent event) { log.info("配置已刷新,应用重新加载配置"); // 重新初始化相关组件 
tokenService.reloadConfig(); securityService.reloadConfig(); }}五、备份与恢复5.1 数据库备份脚本bash#!/bin/bash# backup-database.shset -e# 配置参数BACKUP_DIR="/backup/postgres"DATE=$(date +%Y%m%d_%H%M%S)RETENTION_DAYS=30DB_HOST="postgres-svc"DB_PORT="5432"DB_NAME="sso_db"DB_USER="sso_user"# 创建备份目录mkdir -p $BACKUP_DIRecho "开始备份数据库: $DB_NAME"# 执行备份PGPASSWORD=$DB_PASSWORD pg_dump \ -h $DB_HOST \ -p $DB_PORT \ -U $DB_USER \ -d $DB_NAME \ -F c \ -f $BACKUP_DIR/${DB_NAME}_${DATE}.dump# 验证备份文件if [ -s $BACKUP_DIR/${DB_NAME}_${DATE}.dump ]; then echo "备份成功: $BACKUP_DIR/${DB_NAME}_${DATE}.dump" # 计算备份文件大小 BACKUP_SIZE=$(du -h $BACKUP_DIR/${DB_NAME}_${DATE}.dump | cut -f1) echo "备份大小: $BACKUP_SIZE" # 加密备份文件(可选) # gpg --symmetric --cipher-algo AES256 --passphrase "$ENCRYPTION_KEY" $BACKUP_DIR/${DB_NAME}_${DATE}.dump else echo "备份失败: 备份文件为空" exit 1fi# 清理旧备份find $BACKUP_DIR -name "${DB_NAME}_*.dump" -mtime +$RETENTION_DAYS -deleteecho "已清理超过 $RETENTION_DAYS 天的旧备份"# 上传到云存储(可选)# aws s3 cp $BACKUP_DIR/${DB_NAME}_${DATE}.dump s3://backup-bucket/sso-db/echo "数据库备份完成"5.2 Redis备份脚本bash#!/bin/bash# backup-redis.shset -eBACKUP_DIR="/backup/redis"DATE=$(date +%Y%m%d_%H%M%S)RETENTION_DAYS=7REDIS_HOST="redis-svc"REDIS_PORT="6379"mkdir -p $BACKUP_DIRecho "开始备份Redis数据"# 执行备份redis-cli -h $REDIS_HOST -p $REDIS_PORT -a "$REDIS_PASSWORD" --rdb $BACKUP_DIR/dump_${DATE}.rdb# 验证备份if [ -s $BACKUP_DIR/dump_${DATE}.rdb ]; then echo "Redis备份成功: $BACKUP_DIR/dump_${DATE}.rdb" # 生成AOF备份 redis-cli -h $REDIS_HOST -p $REDIS_PORT -a "$REDIS_PASSWORD" BGREWRITEAOF else echo "Redis备份失败" exit 1fi# 清理旧备份find $BACKUP_DIR -name "dump_*.rdb" -mtime +$RETENTION_DAYS -deleteecho "Redis备份完成"5.3 恢复脚本bash#!/bin/bash# restore-database.shset -eBACKUP_FILE=$1DB_HOST="postgres-svc"DB_PORT="5432"DB_NAME="sso_db"DB_USER="sso_user"if [ -z "$BACKUP_FILE" ]; then echo "请指定备份文件" exit 1fiif [ ! 
-f "$BACKUP_FILE" ]; then echo "备份文件不存在: $BACKUP_FILE" exit 1fiecho "开始恢复数据库: $DB_NAME"echo "使用备份文件: $BACKUP_FILE"# 停止应用(可选)# kubectl scale deployment sso-service --replicas=0 -n sso-system# 等待所有连接关闭sleep 30# 删除现有数据库并重建PGPASSWORD=$DB_PASSWORD psql \ -h $DB_HOST \ -p $DB_PORT \ -U $DB_USER \ -d postgres \ -c "DROP DATABASE IF EXISTS $DB_NAME;"PGPASSWORD=$DB_PASSWORD psql \ -h $DB_HOST \ -p $DB_PORT \ -U $DB_USER \ -d postgres \ -c "CREATE DATABASE $DB_NAME;"# 恢复数据PGPASSWORD=$DB_PASSWORD pg_restore \ -h $DB_HOST \ -p $DB_PORT \ -U $DB_USER \ -d $DB_NAME \ -c \ $BACKUP_FILEecho "数据库恢复完成"# 启动应用# kubectl scale deployment sso-service --replicas=3 -n sso-system# 验证恢复echo "验证恢复结果..."PGPASSWORD=$DB_PASSWORD psql \ -h $DB_HOST \ -p $DB_PORT \ -U $DB_USER \ -d $DB_NAME \ -c "SELECT COUNT(*) FROM sso_user;"六、故障排除手册6.1 常见问题排查问题1:应用启动失败bash# 检查日志kubectl logs deployment/sso-service -n sso-system --tail=100# 检查Pod状态kubectl get pods -n sso-system -l app=sso-service# 检查事件kubectl get events -n sso-system --sort-by='.lastTimestamp'# 进入Pod调试kubectl exec -it deployment/sso-service -n sso-system -- /bin/bash问题2:数据库连接问题bash# 测试数据库连接kubectl exec deployment/sso-service -n sso-system -- \ curl -v postgres-svc:5432# 检查数据库状态kubectl exec deployment/postgres -n sso-system -- \ psql -U sso_user -d sso_db -c "SELECT version();"# 检查连接池kubectl exec deployment/sso-service -n sso-system -- \ curl http://localhost:8080/actuator/hikari问题3:Redis连接问题bash# 测试Redis连接kubectl exec deployment/sso-service -n sso-system -- \ redis-cli -h redis-svc -p 6379 -a "$REDIS_PASSWORD" ping# 检查Redis内存使用kubectl exec deployment/redis -n sso-system -- \ redis-cli info memory# 查看Redis慢查询kubectl exec deployment/redis -n sso-system -- \ redis-cli slowlog get 10问题4:内存泄漏排查bash# 查看JVM内存状态kubectl exec deployment/sso-service -n sso-system -- \ curl http://localhost:8080/actuator/metrics/jvm.memory.used# 生成堆转储kubectl exec deployment/sso-service -n sso-system -- \ jmap -dump:live,format=b,file=/tmp/heap.hprof 1# 分析GC日志kubectl logs deployment/sso-service -n sso-system | grep GC问题5:性能问题排查bash# 查看应用指标kubectl exec deployment/sso-service -n sso-system -- \ curl http://localhost:8080/actuator/metrics# CPU使用率kubectl top pods -n sso-system# 网络连接数kubectl exec deployment/sso-service -n sso-system -- \ netstat -an | grep ESTABLISHED | wc -l# 数据库查询性能kubectl exec deployment/postgres -n sso-system -- \ psql -U sso_user -d sso_db -c "SELECT * FROM pg_stat_statements ORDER BY total_time DESC LIMIT 10;"6.2 监控告警规则Prometheus告警规则yaml# prometheus/alerts.yamlgroups:- name: sso-alerts rules: - alert: HighErrorRate expr: rate(http_server_requests_seconds_count{status=~"5..", uri!~".*actuator.*"}[5m]) / rate(http_server_requests_seconds_count{uri!~".*actuator.*"}[5m]) * 100 > 5 for: 5m labels: severity: critical annotations: summary: "SSO服务高错误率" description: "5分钟内错误率超过5%,当前值: {{ $value }}%" - alert: HighLatency expr: histogram_quantile(0.95, rate(http_server_requests_seconds_bucket[5m])) > 2 for: 5m labels: severity: warning annotations: summary: "SSO服务高延迟" description: "95%请求延迟超过2秒,当前值: {{ $value }}秒" - alert: HighMemoryUsage expr: jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"} * 100 > 85 for: 10m labels: severity: warning annotations: summary: "SSO服务高内存使用率" description: "堆内存使用率超过85%,当前值: {{ $value }}%" - alert: PodCrashLooping expr: rate(kube_pod_container_status_restarts_total{namespace="sso-system", container="sso-service"}[15m]) > 0 for: 5m labels: severity: critical annotations: summary: "SSO服务Pod频繁重启" description: "Pod在15分钟内重启{{ $value }}次" - alert: 
```yaml
  - alert: DatabaseConnectionHigh
    expr: hikaricp_connections_active > hikaricp_connections_max * 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Database connection usage is high"
      description: "Active connections exceed 80% of the maximum pool size"

  - alert: HighLoginFailureRate
    expr: rate(sso_login_failure_total[10m]) / rate(sso_login_total[10m]) * 100 > 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Login failure rate is too high"
      description: "Login failure rate has exceeded 20% over the last 10 minutes"
```

### 6.3 Emergency Response Procedures

```bash
#!/bin/bash
# emergency-response.sh
set -e

ACTION=$1
VERSION=$2

case $ACTION in
  "rollback")
    echo "Rolling back to version: $VERSION"
    # Roll back the deployment
    kubectl rollout undo deployment/sso-service -n sso-system
    # Wait for the rollback to finish
    kubectl rollout status deployment/sso-service -n sso-system
    # Validate the rollback
    ./scripts/validate-deployment.sh https://sso.example.com
    echo "Rollback complete"
    ;;
  "scale-up")
    echo "Scaling out"
    kubectl scale deployment/sso-service --replicas=10 -n sso-system
    kubectl rollout status deployment/sso-service -n sso-system
    ;;
  "scale-down")
    echo "Scaling in"
    kubectl scale deployment/sso-service --replicas=2 -n sso-system
    ;;
  "restart")
    echo "Restarting deployment"
    kubectl rollout restart deployment/sso-service -n sso-system
    kubectl rollout status deployment/sso-service -n sso-system
    ;;
  "drain")
    echo "Draining node"
    NODE=$2
    kubectl drain $NODE --ignore-daemonsets --delete-emptydir-data
    ;;
  *)
    echo "Usage: $0 {rollback|scale-up|scale-down|restart|drain}"
    exit 1
    ;;
esac
```

## 七、Operations Best Practices

- Capacity planning: monitor resource usage trends, define autoscaling policies, and run load tests regularly.
- Change management: route every change through the CI/CD pipeline, require approval for significant changes, and back up critical data before changing anything.
- Security operations: apply security patches on a regular schedule, watch for abnormal access patterns, and audit permission configuration periodically.
- Disaster recovery: rehearse recovery procedures regularly, deploy across multiple regions, and combine hot and cold backup strategies.
- Documentation: keep deployment documentation current, record how incidents were handled, and maintain the emergency runbooks.
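The backup scripts in section 五 are written to be run on a schedule, but no scheduler is shown. A minimal sketch of wiring `backup-database.sh` into a Kubernetes CronJob; the image, ConfigMap and PVC names here are assumptions, not part of the original manifests:

```yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: sso-db-backup
  namespace: sso-system
spec:
  schedule: "0 2 * * *"            # daily at 02:00
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - name: db-backup
            image: postgres:15-alpine          # assumed image; provides pg_dump
            command: ["/bin/sh", "/scripts/backup-database.sh"]
            env:
            - name: DB_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: sso-secrets
                  key: db-password
            volumeMounts:
            - name: backup-scripts
              mountPath: /scripts
              readOnly: true
            - name: backup-storage
              mountPath: /backup/postgres
          volumes:
          - name: backup-scripts
            configMap:
              name: sso-backup-scripts         # assumed ConfigMap holding backup-database.sh
              defaultMode: 0755
          - name: backup-storage
            persistentVolumeClaim:
              claimName: sso-backup-pvc        # assumed PVC that retains backup dumps
```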
-
1. 与Prometheus新特性集成远程写特性增强yaml# prometheus-remote-write-optimized.yamlremote_write:- url: http://thanos-receive:10908/api/v1/receive remote_timeout: 30s queue_config: capacity: 10000 max_shards: 200 min_shards: 1 max_samples_per_send: 2000 batch_send_deadline: 5s min_backoff: 100ms max_backoff: 10s write_relabel_configs: - source_labels: [__name__] regex: 'ALERTS|up' action: drop metadata_config: send: true send_interval: 1mExemplars与追踪集成yaml# exemplars配置# Prometheus配置global: exemplars_storage_max_exemplars: 100000# Thanos查询支持- query- --query.enable-exemplars- --exemplars.config-file=/etc/thanos/exemplars.yaml2. 与云原生生态集成ServiceMonitor自动发现yaml# thanos-service-monitor-full.yamlapiVersion: monitoring.coreos.com/v1kind: ServiceMonitormetadata: name: thanos-components namespace: thanosspec: selector: matchLabels: app.kubernetes.io/part-of: thanos namespaceSelector: matchNames: - thanos endpoints: - port: http interval: 30s path: /metrics relabelings: - sourceLabels: [__meta_kubernetes_pod_name] targetLabel: pod - sourceLabels: [__meta_kubernetes_namespace] targetLabel: namespaceGrafana数据源配置yaml# grafana-datasource.yamlapiVersion: v1kind: ConfigMapmetadata: name: grafana-datasourcesdata: prometheus.yaml: | apiVersion: 1 datasources: - name: Thanos type: prometheus url: http://thanos-query-frontend:10902 access: proxy isDefault: true jsonData: timeInterval: 30s queryTimeout: 2m httpMethod: POST exemplarTraceIdDestinations: - name: trace_id datasourceUid: tempo3. 安全与多租户TLS与认证配置yaml# thanos-tls-config.yamlapiVersion: v1kind: Secretmetadata: name: thanos-tls namespace: thanostype: Opaquedata: tls.crt: ${TLS_CERT} tls.key: ${TLS_KEY} ca.crt: ${CA_CERT}---# Thanos组件TLS配置- query- --grpc-address=0.0.0.0:10901- --grpc-server-tls-cert=/etc/tls/tls.crt- --grpc-server-tls-key=/etc/tls/tls.key- --grpc-server-tls-client-ca=/etc/tls/ca.crt- --http-address=0.0.0.0:10902- --http-config.tls-cert=/etc/tls/tls.crt- --http-config.tls-key=/etc/tls/tls.key基于OAuth的认证yaml# thanos-oauth-proxy.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: thanos-query-oauthspec: template: spec: containers: - name: oauth-proxy image: openshift/oauth-proxy:latest args: - --provider=openshift - --https-address=:10902 - --http-address= - --upstream=http://thanos-query:10902 - --email-domain=* - --openshift-service-account=thanos-query - --cookie-secret=${COOKIE_SECRET} ports: - containerPort: 10902
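The exemplar-storage settings sketched above vary between versions; what is stable is that Prometheus only retains exemplars at all when the `exemplar-storage` feature flag is enabled on the server. A minimal sketch of where that flag goes (the image tag and surrounding args are assumptions):

```yaml
# Excerpt from a Prometheus container spec; only the last arg matters here
containers:
- name: prometheus
  image: prom/prometheus:v2.47.0
  args:
  - --config.file=/etc/prometheus/prometheus.yml
  - --storage.tsdb.path=/prometheus
  - --enable-feature=exemplar-storage   # required before exemplars are stored and exposed to queries
```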
-
1. 查询性能优化查询前端缓存配置yaml# thanos-query-frontend-cache.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: thanos-query-frontendspec: template: spec: containers: - name: thanos-query-frontend image: thanosio/thanos:v0.32.0 args: - query-frontend - --http-address=0.0.0.0:10902 - --query-frontend.downstream-url=thanos-query:10902 - --query-range.split-interval=24h - --query-range.response-cache-config-file=/etc/thanos/cache.yaml - --query-range.max-retries-per-request=5 - --query-range.response-cache-max-freshness=1m volumeMounts: - name: cache-config mountPath: /etc/thanos volumes: - name: cache-config configMap: name: thanos-cache-config---apiVersion: v1kind: ConfigMapmetadata: name: thanos-cache-configdata: cache.yaml: | type: REDIS config: addresses: - redis-1:6379 - redis-2:6379 password: "${REDIS_PASSWORD}" db: 0 pool_size: 100 timeout: 1s查询并行化优化yaml# thanos-query优化参数- query- --query.max-concurrent=20- --query.timeout=5m- --query.max-concurrent-select=10- --selector-label=cluster=production- --store-response-timeout=2m- --query.auto-downsampling- --query.partial-response2. 存储层优化Store Gateway调优yaml# thanos-store-optimized.yamlapiVersion: apps/v1kind: StatefulSetmetadata: name: thanos-store-gatewayspec: replicas: 3 template: spec: containers: - name: thanos-store-gateway image: thanosio/thanos:v0.32.0 args: - store - --grpc-address=0.0.0.0:10901 - --http-address=0.0.0.0:10902 - --data-dir=/data - --objstore.config-file=/etc/thanos/objectstore.yaml - --index-cache-size=2GB - --chunk-pool-size=2GB - --store.grpc.series-sample-limit=0 - --store.grpc.series-max-concurrency=20 resources: requests: memory: "4Gi" cpu: "1" limits: memory: "8Gi" cpu: "2"索引缓存优化yaml# 索引缓存配置- store- --index-cache.config-file=/etc/thanos/index-cache.yaml- --index-cache-size=4GB---apiVersion: v1kind: ConfigMapmetadata: name: thanos-index-cachedata: index-cache.yaml: | type: MEMCACHED config: addresses: - memcached-1:11211 - memcached-2:11211 timeout: 1s max_idle_connections: 100 max_async_concurrency: 203. 大规模集群运维资源配额与限制yaml# thanos-resource-quotas.yamlapiVersion: v1kind: ResourceQuotametadata: name: thanos-resources namespace: thanosspec: hard: requests.cpu: "8" requests.memory: 32Gi limits.cpu: "16" limits.memory: 64Gi pods: "20"监控与告警规则yaml# thanos-alerts.yamlgroups:- name: thanos-alerts rules: - alert: ThanosStoreGatewayDown expr: absent(up{job="thanos-store"}) for: 5m labels: severity: critical annotations: summary: "Thanos Store Gateway is down" - alert: ThanosHighQueryLatency expr: histogram_quantile(0.95, rate(thanos_query_api_queries_duration_seconds_bucket[5m])) > 10 for: 5m labels: severity: warning annotations: summary: "Thanos query latency is high" - alert: ThanosCompactBehind expr: thanos_compactor_blocks_seconds > 3600 for: 1h labels: severity: warning annotations: summary: "Thanos compaction is behind schedule"
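Alongside the ResourceQuota above, large clusters usually also protect the multi-replica Store Gateway from voluntary disruptions during node drains. A sketch of a PodDisruptionBudget, assuming the StatefulSet's pods carry an `app: thanos-store-gateway` label (the original manifest does not show its selector, so this label is an assumption):

```yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: thanos-store-gateway
  namespace: thanos
spec:
  minAvailable: 2                 # keep at least 2 of the 3 replicas available
  selector:
    matchLabels:
      app: thanos-store-gateway   # assumed pod label
```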
-
1. 多集群监控架构全局查询联邦yaml# thanos-query-global.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: thanos-query-globalspec: template: spec: containers: - name: thanos-query-global image: thanosio/thanos:v0.32.0 args: - query - --http-address=0.0.0.0:10902 - --store=thanos-query-cluster-a:10901 - --store=thanos-query-cluster-b:10901 - --store=thanos-query-cluster-c:10901 - --store=dnssrv+_grpc._tcp.thanos-store-global.thanos.svc.cluster.local - --query.replica-label=cluster - --query.replica-label=replica - --query.auto-downsampling跨集群服务发现yaml# external-store-config.yamlapiVersion: v1kind: ConfigMapmetadata: name: thanos-external-storesdata: stores.json: | [ { "name": "cluster-a", "address": "thanos-query.cluster-a.svc:10901" }, { "name": "cluster-b", "address": "thanos-query.cluster-b.svc:10901" }, { "name": "global-store", "address": "thanos-store-gateway.thanos.svc.cluster.local:10901" } ]2. 多租户监控架构基于标签的租户隔离yaml# 租户特定的Prometheus配置global: external_labels: tenant: "team-a" environment: "production"# Thanos查询前端多租户配置- query-frontend- --http-address=0.0.0.0:10902- --query-frontend.tenant-header=X-Tenant-ID- --query-frontend.tenant-certificate-field=tenant租户查询路由yaml# nginx租户路由配置server { listen 10902; location / { set $tenant $http_x_tenant_id; if ($tenant = "team-a") { proxy_pass http://thanos-query-team-a:10902; } if ($tenant = "team-b") { proxy_pass http://thanos-query-team-b:10902; } # 默认路由 proxy_pass http://thanos-query-default:10902; }}3. 数据备份与灾难恢复跨区域复制策略yaml# thanos-backup.yamlapiVersion: batch/v1kind: CronJobmetadata: name: thanos-backup namespace: thanosspec: schedule: "0 2 * * *" # 每天凌晨2点 jobTemplate: spec: template: spec: containers: - name: thanos-backup image: thanosio/thanos:v0.32.0 command: - /bin/sh - -c - | thanos tools bucket replicate \ --objstore.config-file=/etc/thanos/objectstore-primary.yaml \ --objstore-to.config-file=/etc/thanos/objectstore-backup.yaml \ --selector-label="cluster=production" volumeMounts: - name: objectstore-config mountPath: /etc/thanos restartPolicy: OnFailure数据恢复流程bash#!/bin/bash# thanos-restore.sh# 1. 停止写入kubectl scale deployment thanos-receive --replicas=0# 2. 从备份恢复数据thanos tools bucket restore \ --objstore.config-file=objectstore-backup.yaml \ --objstore-to.config-file=objectstore-primary.yaml# 3. 重启服务kubectl scale deployment thanos-receive --replicas=3# 4. 验证数据完整性thanos tools bucket verify \ --objstore.config-file=objectstore-primary.yaml
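One possible way to wire the external-store ConfigMap above into the global Query layer is Thanos' file-based service discovery (`--store.sd-files`). Note this is a sketch under the assumption that the file follows Prometheus file-SD format (a list of `targets`), which differs from the name/address records shown above:

```yaml
# Assumed file-SD-shaped store list for thanos-query-global
apiVersion: v1
kind: ConfigMap
metadata:
  name: thanos-store-sd
  namespace: thanos
data:
  stores.yaml: |
    - targets:
      - thanos-query.cluster-a.svc:10901
      - thanos-query.cluster-b.svc:10901
      - thanos-store-gateway.thanos.svc.cluster.local:10901
```

The global Query container would then mount this ConfigMap and add `--store.sd-files=/etc/thanos/stores/stores.yaml` (optionally with `--store.sd-interval=5m`) alongside its static `--store=` flags.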
-
1. 基础设施准备Kubernetes命名空间与配置yaml# thanos-namespace.yamlapiVersion: v1kind: Namespacemetadata: name: thanos labels: name: thanos---# thanos-configmap.yamlapiVersion: v1kind: ConfigMapmetadata: name: thanos-config namespace: thanosdata: objectstore.yaml: | type: S3 config: bucket: "thanos-monitoring" endpoint: "s3.amazonaws.com" region: "us-west-1" access_key: "${AWS_ACCESS_KEY}" secret_key: "${AWS_SECRET_KEY}"存储类与持久化卷yaml# thanos-storage.yamlapiVersion: storage.k8s.io/v1kind: StorageClassmetadata: name: thanos-storageprovisioner: kubernetes.io/aws-ebsparameters: type: gp3 fsType: ext4---apiVersion: v1kind: PersistentVolumeClaimmetadata: name: thanos-compact-pvc namespace: thanosspec: accessModes: - ReadWriteOnce storageClassName: thanos-storage resources: requests: storage: 100Gi2. 完整Thanos集群部署部署脚本与清单bash#!/bin/bash# deploy-thanos.sh# 创建命名空间kubectl apply -f thanos-namespace.yaml# 创建配置kubectl create secret generic thanos-objectstorage \ --namespace thanos \ --from-file=objectstore.yaml=objectstore.yaml# 部署组件kubectl apply -f thanos-store.yamlkubectl apply -f thanos-compactor.yamlkubectl apply -f thanos-query.yamlkubectl apply -f thanos-query-frontend.yaml# 验证部署kubectl get pods -n thanos完整的部署清单yaml# thanos-full-deployment.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: thanos-query namespace: thanosspec: replicas: 2 selector: matchLabels: app: thanos-query template: metadata: labels: app: thanos-query spec: containers: - name: thanos-query image: thanosio/thanos:v0.32.0 ports: - name: grpc containerPort: 10901 - name: http containerPort: 10902 args: - query - --grpc-address=0.0.0.0:10901 - --http-address=0.0.0.0:10902 - --store=thanos-store-gateway:10901 - --store=dnssrv+_grpc._tcp.thanos-store-gateway.thanos.svc.cluster.local - --query.replica-label=replica - --query.auto-downsampling resources: requests: memory: "512Mi" cpu: "500m" limits: memory: "1Gi" cpu: "1"---apiVersion: v1kind: Servicemetadata: name: thanos-query namespace: thanosspec: ports: - name: http port: 10902 targetPort: 10902 - name: grpc port: 10901 targetPort: 10901 selector: app: thanos-query type: LoadBalancer3. Prometheus与Thanos集成修改Prometheus配置yaml# prometheus.yml - 适应Thanos的配置global: scrape_interval: 15s external_labels: cluster: "production" replica: "A" __replica__: "prometheus-a" # Thanos专用标签# 远程写配置到Thanos Receive(可选)remote_write:- url: http://thanos-receive:10908/api/v1/receive queue_config: capacity: 10000 max_shards: 200 min_shards: 1 max_samples_per_send: 1000# 标准抓取配置scrape_configs:- job_name: 'prometheus' static_configs: - targets: ['localhost:9090']Thanos Receive模式yaml# thanos-receive.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: thanos-receive namespace: thanosspec: replicas: 3 template: spec: containers: - name: thanos-receive image: thanosio/thanos:v0.32.0 args: - receive - --grpc-address=0.0.0.0:10901 - --http-address=0.0.0.0:10902 - --remote-write.address=0.0.0.0:10908 - --tsdb.path=/var/thanos/receive - --objstore.config-file=/etc/thanos/objectstore.yaml - --label=receive="true" - --label=cluster="production" ports: - containerPort: 10901 - containerPort: 10902 - containerPort: 10908 volumeMounts: - name: objectstore-config mountPath: /etc/thanos - name: receive-data mountPath: /var/thanos/receive4. 
## 4. Monitoring and Operations

### Monitoring Thanos itself

```yaml
# thanos-service-monitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: thanos
  namespace: thanos
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: thanos
  endpoints:
  - port: http        # metrics are served on the HTTP port only
    interval: 30s
    path: /metrics
```

### Liveness and readiness probes

```yaml
# Add to each Thanos container spec
livenessProbe:
  httpGet:
    path: /-/healthy
    port: http
  initialDelaySeconds: 30
  periodSeconds: 10
readinessProbe:
  httpGet:
    path: /-/ready
    port: http
  initialDelaySeconds: 30
  periodSeconds: 5
```

### Key performance metrics

```promql
# Thanos query throughput
rate(thanos_query_api_queries_total[5m])
# Store Gateway gRPC connections
thanos_store_nodes_grpc_connections
# Compaction progress: blocks marked for deletion
thanos_compactor_blocks_marked_for_deletion
# Object storage operations
rate(thanos_objstore_bucket_operations_total[5m])
```
-
1. Thanos解决的问题域Prometheus的原始限制bash# Prometheus单机版的典型问题- 存储限制:本地TSDB有限容量- 数据丢失:Pod重启导致2小时数据丢失- 全局视图:多集群查询复杂- 长期存储:默认15天保留策略- 高可用性:数据一致性挑战Thanos的解决方案架构text┌─────────────┐ ┌─────────────┐ ┌─────────────┐│ Prometheus │ │ Prometheus │ │ Prometheus ││ Cluster A │ │ Cluster B │ │ Cluster C │└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ │ │ └──────────────────┼──────────────────┘ │┌─────────────────────────────────────────────────┐│ Thanos ││ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ││ │ Sidecar │ │ Sidecar │ │ Sidecar │ │ Query │ ││ └─────────┘ └─────────┘ └─────────┘ └─────────┘ ││ ┌─────────┐ ┌─────────┐ ┌─────────┐ ││ │ Store │ │ Compact │ │ Rule │ ││ └─────────┘ └─────────┘ └─────────┘ │└─────────────────────────────────────────────────┘2. Thanos核心组件详解Sidecar模式部署yaml# prometheus-with-thanos-sidecar.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: prometheusspec: replicas: 2 template: spec: containers: - name: prometheus image: prom/prometheus:v2.47.0 args: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' - '--web.enable-lifecycle' - '--storage.tsdb.retention.time=2h' # 本地只保留2小时 ports: - containerPort: 9090 - name: thanos-sidecar image: thanosio/thanos:v0.32.0 args: - sidecar - --grpc-address=0.0.0.0:10901 - --http-address=0.0.0.0:10902 - --prometheus.url=http://localhost:9090 - --tsdb.path=/prometheus - --reloader.config-file=/etc/prometheus/prometheus.yml - --reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yml ports: - containerPort: 10901 - containerPort: 10902 volumeMounts: - name: prometheus-data mountPath: /prometheusStore Gateway组件yaml# thanos-store.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: thanos-storespec: replicas: 2 template: spec: containers: - name: thanos-store image: thanosio/thanos:v0.32.0 args: - store - --grpc-address=0.0.0.0:10901 - --http-address=0.0.0.0:10902 - --data-dir=/data - --objstore.config-file=/etc/thanos/objectstore.yaml ports: - containerPort: 10901 - containerPort: 10902 volumeMounts: - name: objectstore-config mountPath: /etc/thanosQuery组件 - 统一查询入口yaml# thanos-query.yamlapiVersion: apps/v1kind: Deploymentmetadata: name: thanos-queryspec: replicas: 2 template: spec: containers: - name: thanos-query image: thanosio/thanos:v0.32.0 args: - query - --grpc-address=0.0.0.0:10901 - --http-address=0.0.0.0:10902 - --store=thanos-store-gateway:10901 - --store=prometheus-a-sidecar:10901 - --store=prometheus-b-sidecar:10901 - --query.replica-label=replica ports: - containerPort: 10901 - containerPort: 109023. 对象存储配置S3对象存储配置yaml# objectstore.yamltype: S3config: bucket: "thanos-monitoring" endpoint: "s3.amazonaws.com" region: "us-west-1" access_key: "${AWS_ACCESS_KEY}" secret_key: "${AWS_SECRET_KEY}" insecure: false signature_version2: false put_user_metadata: {} http_config: idle_conn_timeout: 90s response_header_timeout: 2m trace: enable: true多存储后端支持yaml# 支持多种对象存储## Google Cloud Storagetype: GCSconfig: bucket: "thanos-monitoring" ## Azure Blob Storage type: AZUREconfig: storage_account: "thanosstorage" storage_account_key: "${AZURE_KEY}" container: "thanos" ## 本地文件系统(测试用)type: FILESYSTEMconfig: directory: "/thanos/data"4. 
## 4. Compaction and Downsampling

### Compactor configuration

```yaml
# thanos-compactor.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: thanos-compactor
spec:
  replicas: 1
  template:
    spec:
      containers:
      - name: thanos-compactor
        image: thanosio/thanos:v0.32.0
        args:
        - compact
        - --grpc-address=0.0.0.0:10901
        - --http-address=0.0.0.0:10902
        - --data-dir=/var/thanos/compact
        - --objstore.config-file=/etc/thanos/objectstore.yaml
        - --wait
        - --retention.resolution-raw=30d
        - --retention.resolution-5m=90d
        - --retention.resolution-1h=1y
        volumeMounts:
        - name: objectstore-config
          mountPath: /etc/thanos
        - name: compact-data
          mountPath: /var/thanos/compact
```

## 5. Global Query and Deduplication

### Cross-cluster query configuration

```yaml
# thanos-query-frontend.yaml - query-frontend optimisation
apiVersion: apps/v1
kind: Deployment
metadata:
  name: thanos-query-frontend
spec:
  template:
    spec:
      containers:
      - name: thanos-query-frontend
        image: thanosio/thanos:v0.32.0
        args:
        - query-frontend
        - --http-address=0.0.0.0:10902
        - --query-frontend.compress-responses
        - --query-frontend.downstream-url=thanos-query:10902
        - --query-range.split-interval=24h
        - --query-range.max-retries-per-request=5
        - --query-range.response-cache-max-freshness=1m
```

### Deduplication strategy

```yaml
# Replica labels used for deduplication at query time (Query flags)
--query.replica-label=replica
--query.replica-label=cluster_id
# Replica label used for offline deduplication (Compactor flag)
--deduplication.replica-label=replica
```

```yaml
# External labels configured on each Prometheus
global:
  external_labels:
    cluster: us-west-1
    replica: A
    tenant: team-a
```
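One caveat about the Sidecar deployment in section 2: the sidecar only uploads finished TSDB blocks to the bucket when it is given the object-store configuration. A sketch of the missing args and mount, assuming the same `objectstore.yaml` secret used by the Store Gateway:

```yaml
# Extra sidecar args/volumeMounts (excerpt); objectstore-config is assumed to be
# the same secret/ConfigMap that provides /etc/thanos/objectstore.yaml elsewhere
- name: thanos-sidecar
  image: thanosio/thanos:v0.32.0
  args:
  - sidecar
  - --prometheus.url=http://localhost:9090
  - --tsdb.path=/prometheus
  - --objstore.config-file=/etc/thanos/objectstore.yaml   # without this, no blocks are shipped to object storage
  volumeMounts:
  - name: prometheus-data
    mountPath: /prometheus
  - name: objectstore-config
    mountPath: /etc/thanos
```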
-
## 1. Deep Integration with Grafana

### Dashboard templating

```json
{
  "dashboard": {
    "title": "Kubernetes Cluster Monitoring",
    "templating": {
      "list": [
        {
          "name": "namespace",
          "type": "query",
          "query": "label_values(kube_pod_info, namespace)"
        }
      ]
    },
    "panels": [
      {
        "title": "CPU Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m])) by (pod)",
            "legendFormat": "{{pod}}"
          }
        ]
      }
    ]
  }
}
```

## 2. Integration with Logs and Tracing

### Exemplar-based trace linking

```promql
# A histogram sample that carries trace information as an exemplar
http_request_duration_seconds_bucket{le="0.1"} # {trace_id="abc123"}
```

```yaml
# Exemplar storage sizing
exemplar_storage:
  max_exemplars: 100000
```

## 3. Long-Term Storage Options

### Remote read/write configuration

```yaml
remote_write:
- url: "http://victoriametrics:8428/api/v1/write"
  queue_config:
    capacity: 10000
    max_samples_per_send: 1000
  write_relabel_configs:
  - source_labels: [__name__]
    regex: '.*_(total|sum|count)'
    action: keep

remote_read:
- url: "http://victoriametrics:8428/api/v1/read"
  read_recent: true
```
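The dashboard JSON above still has to reach Grafana somehow. A minimal sketch of Grafana's file-based dashboard provisioning, which pairs with the datasource provisioning shown earlier in this series; the provider name and mount path are assumptions:

```yaml
# Grafana dashboards provisioning file, e.g. /etc/grafana/provisioning/dashboards/default.yaml
apiVersion: 1
providers:
- name: default
  orgId: 1
  folder: "Kubernetes"
  type: file
  disableDeletion: false
  updateIntervalSeconds: 30
  options:
    path: /var/lib/grafana/dashboards   # directory containing the dashboard JSON files
```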
-
## 1. High-Availability Deployment Patterns

### Active-active redundancy

```yaml
# prometheus-ha.yml
global:
  external_labels:
    replica: A   # distinguishes the two replicas

scrape_configs:
- job_name: 'prometheus'
  static_configs:
  - targets: ['localhost:9090']
  metric_relabel_configs:
  - source_labels: [__name__]
    regex: '.*_total'
    action: keep

remote_write:
- url: http://thanos-receive:10908/api/v1/receive
  queue_config:
    capacity: 10000
    max_shards: 200
    min_shards: 1
```

### Load-balancer configuration

```nginx
# nginx load balancing in front of the replica pair
upstream prometheus {
    hash $consistent_hash consistent;
    server prometheus-a:9090;
    server prometheus-b:9090;
}

server {
    listen 9090;
    location / {
        set $consistent_hash $arg_query;   # hash on the query parameter
        proxy_pass http://prometheus;
    }
}
```

## 2. Federated Cluster Architecture

### Hierarchical federation

```yaml
# Global Prometheus configuration; the match[] selectors below expect
# pre-aggregated job:* series (a recording-rule sketch follows this section)
scrape_configs:
- job_name: 'federation'
  scrape_interval: 1m
  honor_labels: true
  metrics_path: '/federate'
  params:
    'match[]':
    - 'up{job=~".*"}'
    - '{__name__=~"job:.*"}'
  static_configs:
  - targets:
    - 'prometheus-eu1:9090'
    - 'prometheus-us1:9090'
    - 'prometheus-ap1:9090'
```

### Sharded scrape strategy

```yaml
# Hash-based sharding: each shard keeps only the targets whose address
# hashes to its own shard number
scrape_configs:
- job_name: 'node-shard-A'
  relabel_configs:
  - source_labels: [__address__]
    modulus: 2
    target_label: __tmp_hash
    action: hashmod
  - source_labels: [__tmp_hash]
    regex: '0'
    action: keep
- job_name: 'node-shard-B'
  relabel_configs:
  - source_labels: [__address__]
    modulus: 2
    target_label: __tmp_hash
    action: hashmod
  - source_labels: [__tmp_hash]
    regex: '1'
    action: keep
```
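The federation job above only pulls series whose names match `job:.*`; those series exist only if each regional Prometheus runs recording rules that produce them. A sketch of such a rule file on the downstream side (the rule names are assumptions):

```yaml
groups:
- name: federation_aggregates
  interval: 30s
  rules:
  - record: job:http_requests:rate5m
    expr: sum by (job) (rate(http_requests_total[5m]))
  - record: job:up:avg
    expr: avg by (job) (up)
```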
-
## 1. TSDB Architecture

### Storage hierarchy

```text
┌───────────────────────┐
│ Head Block            │ ← newest data, writable
├───────────────────────┤
│ Memory Series         │ ← metadata for active series
├───────────────────────┤
│ WAL (Write-Ahead Log) │ ← protects against data loss
├───────────────────────┤
│ mmap chunks           │ ← memory-mapped chunk files
├───────────────────────┤
│ Block (2h chunks)     │ ← immutable data blocks
├───────────────────────┤
│ Compacted Block       │ ← merged, compacted blocks
└───────────────────────┘
```

### Write path

```go
// Simplified sketch of the TSDB write path, not the actual prometheus/tsdb code
func (h *Head) Append() error {
    // 1. Append to the WAL so the sample survives a restart
    wal.Write(seriesRef, timestamp, value)

    // 2. Append to the in-memory series in the head block
    series.Append(timestamp, value)

    // 3. Periodically flush full in-memory chunks to mmapped chunk files
    if time.Since(lastMMap) > 2*time.Hour {
        h.mmapChunks()
    }
    return nil
}
```

## 2. Compaction and Retention Policy

### Block compaction behaviour

```bash
# Startup flags that control compaction behaviour
--storage.tsdb.min-block-duration=2h
--storage.tsdb.max-block-duration=24h
--storage.tsdb.retention.time=15d
--storage.tsdb.retention.size=512GB
```

### Retention tuning by data class

```bash
# Use different retention windows depending on how important the data is

## Infrastructure metrics: 30 days
--storage.tsdb.retention.time=30d

## Business metrics: 7 days
--storage.tsdb.retention.time=7d

## Verbose, log-like metrics: 2 days
--storage.tsdb.retention.time=2d
```

## 3. Memory Management and Performance Tuning

### Analysing memory usage

```promql
# Monitor TSDB memory usage
process_resident_memory_bytes{job="prometheus"}
go_memstats_alloc_bytes
prometheus_tsdb_head_series
```

These self-metrics also feed alerting (see the alert-rule sketch after this section).

### Performance tuning flags

```bash
# Flags for large deployments
--storage.tsdb.max-block-chunk-segment-size=512MB
--storage.tsdb.wal-compression=true
--query.max-concurrency=20
--query.timeout=2m
```
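The memory metrics listed above are natural inputs for alerting on TSDB health. A sketch of two rules built only on standard Prometheus self-metrics; the thresholds are assumptions to tune per environment:

```yaml
groups:
- name: tsdb_health
  rules:
  - alert: TsdbHeadSeriesHigh
    expr: prometheus_tsdb_head_series > 2000000
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "TSDB head series count is high"
      description: "Active series: {{ $value }}. Consider relabeling or sharding before memory becomes a problem."
  - alert: TsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[1h]) > 0
    labels:
      severity: critical
    annotations:
      summary: "WAL corruptions detected"
```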
-
1. Kubernetes服务发现机制原生服务发现类型yaml# prometheus.yml配置scrape_configs: - job_name: 'kubernetes-pods' kubernetes_sd_configs: - role: pod - job_name: 'kubernetes-services' kubernetes_sd_configs: - role: service - job_name: 'kubernetes-nodes' kubernetes_sd_configs: - role: node - job_name: 'kubernetes-endpoints' kubernetes_sd_configs: - role: endpointsPod发现与注解驱动配置yamlapiVersion: v1kind: Podmetadata: name: example-app annotations: prometheus.io/scrape: "true" prometheus.io/path: "/metrics" prometheus.io/port: "8080" prometheus.io/scheme: "https"spec: containers: - name: app image: myapp:latest ports: - containerPort: 80802. Prometheus Operator架构解析Custom Resource Definitionsyaml# Prometheus实例定义apiVersion: monitoring.coreos.com/v1kind: Prometheusmetadata: name: main namespace: monitoringspec: replicas: 2 serviceAccountName: prometheus serviceMonitorSelector: {} podMonitorSelector: {} resources: requests: memory: 400Mi limits: memory: 2GiServiceMonitor配置详解yamlapiVersion: monitoring.coreos.com/v1kind: ServiceMonitormetadata: name: web-service namespace: monitoringspec: selector: matchLabels: app: web-server endpoints: - port: web-metrics interval: 30s path: /metrics scheme: https tlsConfig: insecureSkipVerify: true namespaceSelector: any: true3. 高级服务发现配置多集群监控架构yaml# 联邦集群配置scrape_configs: - job_name: 'federate' scrape_interval: 15s honor_labels: true metrics_path: '/federate' params: 'match[]': - '{job="prometheus"}' - '{__name__=~"job:.*"}' static_configs: - targets: - 'prometheus-cluster-1:9090' - 'prometheus-cluster-2:9090'动态标签管理yamlrelabel_configs:# 基于Pod标签添加业务标签- source_labels: [__meta_kubernetes_pod_label_app] target_label: app- source_labels: [__meta_kubernetes_pod_label_version] target_label: version# 基于命名空间添加环境标签 - source_labels: [__meta_kubernetes_namespace] target_label: environment replacement: "production"# 过滤开发环境Pod- source_labels: [__meta_kubernetes_namespace] regex: "dev-.*" action: drop4. Kubernetes特定指标监控容器资源监控promql# 容器CPU使用率sum(rate(container_cpu_usage_seconds_total[1m])) by (pod, container)# 容器内存使用率container_memory_usage_bytes / container_spec_memory_limit_bytes * 100# 容器重启次数rate(kube_pod_container_status_restarts_total[1h])集群级别监控promql# 节点资源预留sum(kube_pod_container_resource_requests_cpu_cores) by (node)/sum(kube_node_status_capacity_cpu_cores) by (node) * 100# Pod调度状态sum(kube_pod_status_phase{phase="Pending"}) by (namespace)# PVC使用情况kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100
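The `prometheus.io/*` annotations shown in section 1 are only metadata until the pod scrape job translates them into scrape behaviour. A sketch of the conventional relabel rules for the `kubernetes-pods` job; this convention comes from the community Helm charts rather than from Kubernetes itself:

```yaml
scrape_configs:
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # only scrape pods that opt in via the annotation
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    action: keep
    regex: "true"
  # honour a custom metrics path
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  # rewrite the target address to the annotated port
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
    target_label: __address__
  # carry pod metadata through as labels
  - source_labels: [__meta_kubernetes_namespace]
    target_label: namespace
  - source_labels: [__meta_kubernetes_pod_name]
    target_label: pod
```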
-
1. 数据模型设计哲学时间序列数据的本质Prometheus将所有数据存储为时间序列,每个时间序列由以下部分组成:指标名称:描述监控指标的类型标签集合:多维度的键值对,用于标识和过滤时间戳:数据点的时间样本值:该时间点的测量值指标命名最佳实践promql# 好的命名规范http_requests_totalnode_memory_usage_bytesdatabase_connections_active# 避免的命名方式httpReqTotal # 不一致的大小写node_memory_usage # 缺少单位metric1 # 无意义的名称2. 四种指标类型深度剖析Counter(计数器)的进阶用法promql# 计算QPS(每秒查询率)rate(http_requests_total[5m])# 计算增长率increase(http_requests_total[1h])# 业务场景:计算订单增长率rate(order_created_total[1h]) * 3600Gauge(仪表盘)的实际应用promql# 当前系统负载node_load1# 内存使用率(node_memory_total_bytes - node_memory_available_bytes) / node_memory_total_bytes * 100# 连接池使用情况database_connections_active / database_connections_max * 100Histogram(直方图)的分位数计算promql# 计算95分位响应时间histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))# 计算平均响应时间rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])Summary与Histogram的选择策略go// Summary在客户端计算分位数,适合:// - 不需要跨实例聚合的场景// - 对精度要求高的应用级指标// Histogram在服务端计算分位数,适合:// - 需要跨实例聚合的场景 // - 网络延迟等基础设施指标3. PromQL高级查询技巧向量匹配操作promql# 一对一匹配node_disk_read_bytes_total * on(instance, device) group_left(node_disk_read_time_seconds_total)# 多对一/一对多匹配sum by (job) (rate(http_requests_total[5m])) * on(job) group_left(team) job_team_info子查询与嵌套查询promql# 计算每小时最大QPSmax_over_time( rate(http_requests_total[5m])[1h:1m])# 预测磁盘空间耗尽时间predict_linear(node_filesystem_free_bytes[6h], 3600*24)时间窗口函数实战promql# 移动平均计算avg_over_time(node_memory_usage_bytes[1h])# 变化率检测delta(node_network_receive_bytes_total[2m])# 离群值检测avg_over_time(temperature_celsius[10m]) + 2 * stddev_over_time(temperature_celsius[10m])4. 记录规则与优化策略性能优化规则yamlgroups:- name: optimized_rules interval: 30s # 降低评估频率 rules: - record: job:http_requests:rate5m expr: rate(http_requests_total[5m]) - record: job:http_errors:percentage expr: | rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) * 100业务指标聚合yaml- record: business:revenue:hourly expr: | sum by (product_type) ( increase(order_revenue_total[1h]) ) - record: system:resource:utilization expr: | avg by (instance) ( node_cpu_usage + node_memory_usage + node_disk_usage ) / 3
-
1. Alertmanager架构解析核心组件与工作流程Alertmanager架构:text┌─────────────────┐ ┌──────────────────┐ ┌──────────────┐│ Prometheus │───▶│ Alertmanager │───▶│ 通知渠道 ││ Server │ │ │ │ (Email/Webhook) └─────────────────┘ └──────────────────┘ └──────────────┘ │ │ │ │ ▼ ▼┌─────────────────┐ ┌──────────────────┐│ 告警规则 │ │ 静默/抑制 ││ Rules │ │ Silences │└─────────────────┘ └──────────────────┘数据处理流程:接收告警:从Prometheus接收告警分组分组:按标签对告警进行分组抑制处理:处理告警抑制关系静默检查:检查是否处于静默期路由分发:根据路由树发送到不同接收器重试机制:失败告警的重试处理高可用部署方案yaml# docker-compose.ha.ymlversion: '3.8'services: alertmanager-1: image: prom/alertmanager:latest ports: - "9093:9093" command: - '--config.file=/etc/alertmanager/alertmanager.yml' - '--storage.path=/alertmanager' - '--cluster.advertise-address=alertmanager-1:9093' - '--cluster.peer=alertmanager-1:9093' - '--cluster.peer=alertmanager-2:9093' volumes: - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - alertmanager_data_1:/alertmanager alertmanager-2: image: prom/alertmanager:latest ports: - "9094:9093" command: - '--config.file=/etc/alertmanager/alertmanager.yml' - '--storage.path=/alertmanager' - '--cluster.advertise-address=alertmanager-2:9093' - '--cluster.peer=alertmanager-1:9093' - '--cluster.peer=alertmanager-2:9093' volumes: - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - alertmanager_data_2:/alertmanagervolumes: alertmanager_data_1: alertmanager_data_2:2. 告警规则配置实战基础告警规则yaml# rules/node_alerts.ymlgroups:- name: node_alerts rules: # 节点宕机告警 - alert: NodeDown expr: up{job="node"} == 0 for: 2m labels: severity: critical team: infrastructure annotations: summary: "Node {{ $labels.instance }} is down" description: "Node {{ $labels.instance }} has been down for more than 2 minutes." runbook: "https://wiki.company.com/runbooks/node-down" # CPU使用率告警 - alert: HighCPUUsage expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 for: 5m labels: severity: warning annotations: summary: "High CPU usage on {{ $labels.instance }}" description: "CPU usage is at {{ $value }}% for more than 5 minutes" # 内存使用率告警 - alert: HighMemoryUsage expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90 for: 5m labels: severity: warning annotations: summary: "High memory usage on {{ $labels.instance }}" description: "Memory usage is at {{ $value }}%" # 磁盘空间告警 - alert: DiskSpaceCritical expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes)) * 100 > 95 for: 2m labels: severity: critical annotations: summary: "Disk space critical on {{ $labels.instance }} ({{ $labels.mountpoint }})" description: "Disk usage is at {{ $value }}%"业务应用告警规则yaml# rules/business_alerts.ymlgroups:- name: business_alerts rules: # HTTP错误率告警 - alert: HighHTTPErrorRate expr: | ( rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) ) * 100 > 5 for: 3m labels: severity: critical team: backend annotations: summary: "High HTTP error rate on {{ $labels.service }}" description: "HTTP error rate is {{ $value }}% (threshold: 5%)" # 应用响应时间告警 - alert: HighResponseTime expr: | histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]) ) > 1 for: 5m labels: severity: warning annotations: summary: "High response time on {{ $labels.service }}" description: "95th percentile response time is {{ $value }}s" # 业务交易量突降告警 - alert: BusinessTransactionDrop expr: | ( avg_over_time(business_transactions_total[1h]) / avg_over_time(business_transactions_total[1h] offset 1d) ) < 0.5 for: 10m labels: severity: critical team: business 
annotations: summary: "Business transactions dropped significantly" description: "Current transactions are {{ $value }} of normal levels"高级告警规则技巧预测性告警:yaml- alert: DiskSpacePrediction expr: | predict_linear(node_filesystem_free_bytes{fstype!="tmpfs"}[6h], 4*60*60) < 0 for: 1h labels: severity: warning annotations: summary: "Disk space predicted to run out in 4 hours" description: "Based on current usage trends, disk will be full soon"关联性告警:yaml- alert: ServiceDependencyIssue expr: | ( up{service="order-service"} == 1 and up{service="payment-service"} == 0 ) or ( rate(http_requests_total{service="order-service",status=~"5.."}[2m]) > 10 and up{service="payment-service"} == 0 ) labels: severity: critical annotations: summary: "Service dependency issue detected" description: "Order service is affected by payment service outage"3. Alertmanager配置详解主配置文件结构yaml# alertmanager.ymlglobal: # SMTP配置 smtp_smarthost: 'smtp.company.com:587' smtp_from: 'alertmanager@company.com' smtp_auth_username: 'alertmanager' smtp_auth_password: '${SMTP_PASSWORD}' # Slack配置 slack_api_url: 'https://hooks.slack.com/services/XXX' # 其他全局配置 resolve_timeout: 5m# 路由树配置route: group_by: ['alertname', 'cluster', 'service'] group_wait: 30s group_interval: 5m repeat_interval: 12h receiver: 'default-receiver' # 子路由 routes: - match: severity: critical receiver: 'critical-alerts' group_interval: 1m repeat_interval: 5m - match: service: database receiver: 'dba-team' group_wait: 10s - match_re: service: ^(frontend|backend).* receiver: 'web-team'# 接收器配置receivers: - name: 'default-receiver' email_configs: - to: 'alerts@company.com' subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}' - name: 'critical-alerts' slack_configs: - channel: '#critical-alerts' title: '🚨 {{ .GroupLabels.alertname }}' text: | {{ range .Alerts }} *Alert:* {{ .Annotations.summary }} *Description:* {{ .Annotations.description }} *Runbook:* {{ .Annotations.runbook }} *Time:* {{ .StartsAt }} {{ end }} pagerduty_configs: - service_key: '${PAGERDUTY_KEY}' description: '{{ .CommonAnnotations.summary }}' - name: 'dba-team' webhook_configs: - url: 'http://dba-tool:8080/alerts' send_resolved: true - name: 'web-team' email_configs: - to: 'web-team@company.com' slack_configs: - channel: '#web-team-alerts'# 抑制规则inhibit_rules: - source_match: severity: 'critical' target_match: severity: 'warning' equal: ['alertname', 'cluster', 'service']多通道通知配置yamlreceivers: - name: 'multi-channel' # Email通知 email_configs: - to: 'team@company.com' headers: Priority: '1' # High priority # Slack通知 slack_configs: - channel: '#alerts' color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' actions: - type: button text: 'View Dashboard' url: 'http://grafana.company.com/dashboard' - type: button text: 'Runbook' url: '{{ .CommonAnnotations.runbook }}' # Webhook集成 webhook_configs: - url: 'http://chatbot:8080/alert' max_alerts: 10 # PagerDuty集成 pagerduty_configs: - service_key: '${PAGERDUTY_KEY}' severity: '{{ .CommonLabels.severity }}' details: alertname: '{{ .CommonLabels.alertname }}' summary: '{{ .CommonAnnotations.summary }}' # OpsGenie集成 opsgenie_configs: - api_key: '${OPSGENIE_KEY}' message: '{{ .CommonAnnotations.summary }}' tags: ['{{ .CommonLabels.team }}', '{{ .CommonLabels.severity }}']4. 
4. Silences and Inhibition

Managing silences

Creating a silence through the Web UI:

1. Open the Alertmanager UI (port 9093 by default)
2. Click "New Silence"
3. Set the matching labels and the time range
4. Add a comment explaining why the silence exists

Creating silences through the API:

```bash
# Create a silence
curl -X POST http://alertmanager:9093/api/v2/silences \
  -H 'Content-Type: application/json' \
  -d '{
    "matchers": [
      { "name": "alertname", "value": "NodeDown", "isRegex": false },
      { "name": "instance", "value": "web-01.*", "isRegex": true }
    ],
    "startsAt": "2023-01-01T00:00:00.000Z",
    "endsAt": "2023-01-01T02:00:00.000Z",
    "createdBy": "api-user",
    "comment": "Planned maintenance window"
  }'

# List all silences
curl http://alertmanager:9093/api/v2/silences

# Delete a silence
curl -X DELETE http://alertmanager:9093/api/v2/silence/<silence-id>
```

Tuning inhibition rules

```yaml
inhibit_rules:
  # A node outage suppresses every application alert from that node
  - source_match:
      alertname: NodeDown
    target_match_re:
      instance: '.*'
    equal: ['instance']

  # A cluster-level outage suppresses service-level warnings
  - source_match:
      alertname: ClusterUnavailable
    target_match:
      severity: warning
    equal: ['cluster']

  # A failed database primary suppresses replica lag alerts
  - source_match:
      alertname: MySQLPrimaryDown
      role: primary
    target_match:
      alertname: MySQLReplicationLag
      role: replica
    equal: ['cluster']
```

5. Custom Templates

Customizing alert templates

```go
{{/* templates/custom.tmpl */}}
{{ define "slack.custom.title" }}{{ if eq .Status "firing" }}🔥{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}{{ end }}

{{ define "slack.custom.text" }}
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Severity:* {{ .Labels.severity }}
*Instance:* {{ .Labels.instance }}
*Started:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ if .GeneratorURL }}*Grafana:* <{{ .GeneratorURL }}|View Dashboard>{{ end }}
{{ if .Annotations.runbook }}*Runbook:* <{{ .Annotations.runbook }}|Documentation>{{ end }}
---
{{ end }}
{{ end }}
```

Referencing the templates in the configuration:

```yaml
global:
  # ... other settings

# Template file paths
templates:
  - '/etc/alertmanager/templates/*.tmpl'

receivers:
  - name: 'slack-with-template'
    slack_configs:
      - channel: '#alerts'
        title: '{{ template "slack.custom.title" . }}'
        text: '{{ template "slack.custom.text" . }}'
        color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}'
```

Conditional templates

```go
{{ define "email.subject" }}
{{- if eq .Status "firing" -}}
[FIRING] {{ len .Alerts }} alert(s) for {{ .GroupLabels.alertname }}
{{- else -}}
[RESOLVED] {{ .GroupLabels.alertname }} resolved
{{- end }}
{{ end }}

{{ define "email.html" }}
<html>
<body>
<h2>Alert Summary</h2>
{{ range .Alerts }}
<div style="border: 1px solid {{ if eq .Status "firing" }}red{{ else }}green{{ end }}; padding: 10px; margin: 5px;">
  <h3>{{ .Annotations.summary }}</h3>
  <p><strong>Status:</strong> {{ .Status }}</p>
  <p><strong>Description:</strong> {{ .Annotations.description }}</p>
  <p><strong>Labels:</strong></p>
  <ul>
    {{ range .Labels.SortedPairs }}
    <li><strong>{{ .Name }}:</strong> {{ .Value }}</li>
    {{ end }}
  </ul>
</div>
{{ end }}
</body>
</html>
{{ end }}
```
6. Alerting Best Practices

Severity tiers

```yaml
# Severity level definitions
groups:
- name: severity_critical
  rules:
  - alert: ServiceDown
    expr: up == 0
    labels:
      severity: critical
      priority: P0

- name: severity_warning
  rules:
  - alert: HighResourceUsage
    expr: cpu_usage > 80
    labels:
      severity: warning
      priority: P1

- name: severity_info
  rules:
  - alert: DiskSpaceWarning
    expr: disk_usage > 85
    labels:
      severity: info
      priority: P2
```

Avoiding alert storms

Grouping and inhibition strategy:

```yaml
route:
  group_by: ['alertname', 'cluster']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  routes:
    - match:
        severity: critical
      group_interval: 1m
      repeat_interval: 10m
      receiver: 'critical-pager'

inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal: ['cluster', 'service']
```

Measuring alert effectiveness

Monitor the alerting pipeline itself:

```yaml
- alert: AlertmanagerClusterDown
  expr: count(up{job="alertmanager"}) < 2
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Alertmanager cluster degraded"

- alert: HighAlertVolume
  expr: |
    rate(alertmanager_alerts_received_total[5m]) > 100
  labels:
    severity: warning
  annotations:
    summary: "High volume of alerts received"
```

Alert quality metrics:

```promql
# Alert firing rate
rate(ALERTS{alertstate="firing"}[1h])

# Average number of active alerts over the past hour
avg_over_time(alertmanager_alerts{state="active"}[1h])

# Rough false-positive ratio (share of firing alerts that are only informational)
rate(ALERTS{alertstate="firing", severity="info"}[24h]) / rate(ALERTS{alertstate="firing"}[24h])
```

7. A Complete Production Alerting Stack

Full prometheus.yml:

```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager-1:9093
            - alertmanager-2:9093
      timeout: 10s
      api_version: v2

scrape_configs:
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager-1:9093', 'alertmanager-2:9093']
```

Full alertmanager.yml:

```yaml
global:
  smtp_smarthost: 'smtp.company.com:587'
  smtp_from: 'alertmanager@company.com'
  smtp_auth_username: 'alertmanager'
  smtp_auth_password: '${SMTP_PASSWORD}'

route:
  receiver: 'default'
  group_by: ['alertname', 'cluster']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 3h
  routes:
    - match:
        severity: critical
      receiver: 'critical'
      repeat_interval: 10m
    - match:
        team: database
      receiver: 'dba-team'

receivers:
  - name: 'default'
    email_configs:
      - to: 'alerts@company.com'
  - name: 'critical'
    slack_configs:
      - channel: '#critical-alerts'
    pagerduty_configs:
      - service_key: '${PAGERDUTY_KEY}'
  - name: 'dba-team'
    webhook_configs:
      - url: 'http://dba-portal:8080/alerts'

inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal: ['cluster', 'alertname']
```

With this complete alerting stack in place you have a reliable, scalable and maintainable alerting system that surfaces problems early and supports a fast response.
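The dba-team receiver above delivers alerts to an HTTP webhook. For reference, a minimal sketch of such a receiver follows; it assumes Flask is installed, and the endpoint path simply mirrors the dba-portal URL in the example config. Alertmanager POSTs a JSON document (payload version 4) whose "alerts" array carries the labels and annotations of each alert.

```python
# Minimal Alertmanager webhook receiver (illustrative sketch, not a production service)
from flask import Flask, request

app = Flask(__name__)

@app.route("/alerts", methods=["POST"])
def alerts():
    payload = request.get_json(force=True)
    # Each entry carries status ("firing"/"resolved"), labels and annotations
    for alert in payload.get("alerts", []):
        print(alert["status"],
              alert["labels"].get("alertname", ""),
              alert["annotations"].get("summary", ""))
    return "", 200

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080)
```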
-
1. How Exporters Work

Data collection mechanics

A typical exporter structure:

```go
// Typical exporter structure
type Exporter struct {
    registry *prometheus.Registry
    gauges   map[string]*prometheus.GaugeVec
    counters map[string]*prometheus.CounterVec
}

// Collection interface
type Collector interface {
    Describe(ch chan<- *prometheus.Desc)
    Collect(ch chan<- prometheus.Metric)
}
```

Workflow:

1. Initialization: register metric descriptors
2. Collection: fetch raw data from the target system
3. Conversion: turn the data into Prometheus metric types
4. HTTP service: expose the result on the /metrics endpoint

Exposition format

```text
# HELP node_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 12345.67
node_cpu_seconds_total{cpu="0",mode="system"} 123.45
node_cpu_seconds_total{cpu="0",mode="user"} 456.78

# HELP node_memory_MemTotal_bytes Memory information field MemTotal.
# TYPE node_memory_MemTotal_bytes gauge
node_memory_MemTotal_bytes 8.253952e+09
```

2. Deploying the Core Exporters

Node Exporter: host-level monitoring

Docker deployment:

```bash
docker run -d \
  --name=node-exporter \
  --net="host" \
  --pid="host" \
  -v "/:/host:ro,rslave" \
  quay.io/prometheus/node-exporter:latest \
  --path.rootfs=/host \
  --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
```

systemd service:

```ini
[Unit]
Description=Node Exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
Wants=network-online.target
After=network-online.target

[Service]
User=node_exporter
Group=node_exporter
Type=simple
Restart=on-failure
ExecStart=/usr/local/bin/node_exporter \
    --web.listen-address=:9100 \
    --collector.systemd \
    --collector.systemd.unit-whitelist=(docker|ssh|nginx).service \
    --collector.textfile.directory=/var/lib/node_exporter/textfile_collector

[Install]
WantedBy=multi-user.target
```

Scrape configuration:

```yaml
# Prometheus configuration
scrape_configs:
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
    scrape_interval: 15s
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        regex: '(.*):.*'
        replacement: '${1}'
```

Blackbox Exporter: network probing

Configuration example:

```yaml
# blackbox.yml
modules:
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      valid_status_codes: [200, 301, 302]
      method: GET
      headers:
        User-Agent: "Blackbox Exporter/1.0"
  tcp_connect:
    prober: tcp
    timeout: 5s
    tcp:
      preferred_ip_protocol: "ip4"
  icmp:
    prober: icmp
    timeout: 5s
    icmp:
      preferred_ip_protocol: "ip4"
```

Prometheus integration:

```yaml
scrape_configs:
  - job_name: 'blackbox-http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://example.com
          - https://google.com
          - http://192.168.1.1
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115
```

MySQL Exporter: database monitoring

Deployment:

```bash
docker run -d \
  --name=mysql-exporter \
  -p 9104:9104 \
  -e DATA_SOURCE_NAME="exporter:password@(mysql-server:3306)/" \
  prom/mysqld-exporter:latest
```

Advanced configuration (note: mysqld_exporter only exposes its built-in collectors; custom SQL queries like the one sketched here are usually served through a generic SQL exporter instead):

```yaml
# my.cnf for the exporter user
[client]
user=exporter
password=your_password
host=mysql-server
port=3306

# Custom query configuration
custom-queries: |
  # Query for long running transactions
  long_running_transactions:
    query: "SELECT COUNT(*) as count FROM information_schema.processlist WHERE command != 'Sleep' AND time > 60"
    metrics:
      - metric_name: mysql_long_running_transactions
        type: gauge
        help: "Number of long running transactions"
        values: [count]
```

Key metrics to watch:

```sql
-- Connections
SHOW STATUS LIKE 'Threads_connected';
-- Query performance
SHOW STATUS LIKE 'Slow_queries';
-- Replication status
SHOW SLAVE STATUS;
-- Buffer pool usage
SHOW STATUS LIKE 'Innodb_buffer_pool%';
```

SNMP Exporter: network device monitoring

Generate the configuration:

```bash
# Generate the SNMP configuration
./snmp_exporter generate
```

Custom OID module:

```yaml
modules:
  if_mib:
    walk:
      - 1.3.6.1.2.1.2        # interfaces
      - 1.3.6.1.2.1.31.1.1   # ifXTable
    version: 2
    auth:
      community: public
```
```yaml
    walk_params:
      max_repetitions: 25
```

Deployment example:

```yaml
# docker-compose.yml
version: '3'
services:
  snmp-exporter:
    image: prom/snmp-exporter:latest
    ports:
      - "9116:9116"
    volumes:
      - ./snmp.yml:/etc/snmp_exporter/snmp.yml
```

3. Application Exporters

Java applications (JMX Exporter)

Configuration example:

```yaml
# jmx_config.yml
rules:
  - pattern: 'java.lang<type=Memory><>(.*):'
    name: java_memory_$1
  - pattern: 'java.lang<type=Threading><>ThreadCount'
    name: java_threads_count
    type: gauge
  - pattern: 'Catalina<type=GlobalRequestProcessor, name=\"(\w+-\w+)-(\d+)\"><>requestCount:'
    name: tomcat_request_count
    labels:
      port: "$2"
      protocol: "$1"
```

Java startup parameters:

```bash
java -javaagent:./jmx_prometheus_javaagent-0.17.0.jar=8080:jmx_config.yml \
     -jar your-application.jar
```

Nginx monitoring

Status module configuration:

```nginx
# nginx.conf
server {
    listen 8080;
    location /nginx_status {
        stub_status on;
        access_log off;
        allow 127.0.0.1;
        deny all;
    }
}
```

Nginx exporter scrape configuration (the status URI is normally passed to the exporter itself, e.g. via its -nginx.scrape-uri flag, rather than as Prometheus params):

```yaml
- job_name: 'nginx'
  static_configs:
    - targets: ['nginx-exporter:9113']
  metrics_path: /metrics
  params:
    'nginx.scrape_uri': ['http://nginx:8080/nginx_status']
```

Redis monitoring

Redis exporter deployment:

```bash
docker run -d \
  --name=redis-exporter \
  -p 9121:9121 \
  -e REDIS_ADDR=redis://redis-server:6379 \
  oliver006/redis_exporter:latest
```

Key metrics:

```promql
# Memory usage percentage
redis_memory_used_bytes / redis_memory_max_bytes * 100

# Connected clients
redis_connected_clients

# Cache hit ratio
rate(redis_keyspace_hits_total[5m]) / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m])) * 100
```

4. Writing a Custom Exporter

Python custom exporter

```python
#!/usr/bin/env python3
from prometheus_client import start_http_server, Gauge, Counter
import time
import requests

class CustomExporter:
    def __init__(self):
        # Define the metrics
        self.service_health = Gauge('service_health', 'Service health status', ['service_name'])
        self.request_count = Counter('http_requests_total', 'Total HTTP requests', ['method', 'endpoint'])
        self.response_time = Gauge('http_response_time_seconds', 'HTTP response time')

    def collect_metrics(self):
        # Collect business metrics
        services = ['user-service', 'order-service', 'payment-service']
        for service in services:
            try:
                response = requests.get(f'http://{service}:8080/health', timeout=5)
                health_status = 1 if response.status_code == 200 else 0
                self.service_health.labels(service_name=service).set(health_status)
            except Exception:
                self.service_health.labels(service_name=service).set(0)

    def run(self):
        start_http_server(8000)
        while True:
            self.collect_metrics()
            time.sleep(15)

if __name__ == '__main__':
    exporter = CustomExporter()
    exporter.run()
```
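To have Prometheus collect from this exporter, a plain static scrape job pointing at its port is enough; a sketch, where the host name custom-exporter is illustrative:

```yaml
scrape_configs:
  - job_name: 'custom-business-exporter'
    static_configs:
      - targets: ['custom-exporter:8000']   # the Python exporter above listens on port 8000
    scrape_interval: 15s
```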
Go custom exporter

```go
package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

type BusinessExporter struct {
    orderCount *prometheus.CounterVec
    revenue    *prometheus.GaugeVec
    errorRate  *prometheus.GaugeVec
}

func NewBusinessExporter() *BusinessExporter {
    return &BusinessExporter{
        orderCount: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "business_orders_total",
                Help: "Total number of orders",
            },
            []string{"product_type", "region"},
        ),
        revenue: prometheus.NewGaugeVec(
            prometheus.GaugeOpts{
                Name: "business_revenue_usd",
                Help: "Current revenue in USD",
            },
            []string{"product_type"},
        ),
        errorRate: prometheus.NewGaugeVec(
            prometheus.GaugeOpts{
                Name: "business_error_rate",
                Help: "Error rate percentage",
            },
            []string{"service"},
        ),
    }
}

func (e *BusinessExporter) Describe(ch chan<- *prometheus.Desc) {
    e.orderCount.Describe(ch)
    e.revenue.Describe(ch)
    e.errorRate.Describe(ch)
}

func (e *BusinessExporter) Collect(ch chan<- prometheus.Metric) {
    // Simulated business data collection
    e.orderCount.WithLabelValues("electronics", "us-west").Add(42)
    e.revenue.WithLabelValues("electronics").Set(15000.50)
    e.errorRate.WithLabelValues("payment-service").Set(0.02)

    e.orderCount.Collect(ch)
    e.revenue.Collect(ch)
    e.errorRate.Collect(ch)
}

func main() {
    exporter := NewBusinessExporter()
    prometheus.MustRegister(exporter)

    http.Handle("/metrics", promhttp.Handler())
    log.Fatal(http.ListenAndServe(":8080", nil))
}
```

5. Exporter Best Practices

Covering every layer with scrape jobs:

```yaml
# Combined infrastructure monitoring
scrape_configs:
  # Host level
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

  # Database layer
  - job_name: 'mysql'
    static_configs:
      - targets: ['mysql-exporter:9104']

  # Cache layer
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

  # Application layer
  - job_name: 'java-apps'
    static_configs:
      - targets: ['app1:8080', 'app2:8080']
    metrics_path: /actuator/prometheus

  # Network probing
  - job_name: 'blackbox-http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://api.company.com/health
          - https://web.company.com/health
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115
```

Highly available exporter deployment:

```yaml
# docker-compose.high-availability.yml
version: '3.8'
services:
  node-exporter-primary:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    deploy:
      replicas: 2

  node-exporter-backup:
    image: prom/node-exporter:latest
    ports:
      - "9101:9100"
    command:
      - '--web.listen-address=:9100'
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'

  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-lifecycle'
```

6. Validating and Debugging Monitoring Data

Data quality checks:

```bash
# Inspect the metrics endpoint
curl -s http://node-exporter:9100/metrics | head -20

# Validate the exposition format
curl -s http://exporter:8080/metrics | promtool check metrics

# Count exposed series
curl -s http://exporter:8080/metrics | grep -v '^#' | wc -l
```

Troubleshooting missing metrics:

```bash
# Check target status
curl -s http://prometheus:9090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job=="node")'

# Check scrape errors
curl -s http://prometheus:9090/api/v1/targets | jq '.data.activeTargets[] | select(.lastError != "")'

# Verify network connectivity
telnet node-exporter 9100
```

Troubleshooting performance problems:

```bash
# Check scrape durations
curl -s http://prometheus:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, scrapeDuration: .lastScrapeDuration}'

# Watch the exporter's resource usage
docker stats node-exporter
```

With a well-rounded exporter setup you get monitoring coverage across infrastructure, middleware, applications and business logic.
-
1. Configuration File Deep Dive

The global block: tuning global parameters

```yaml
global:
  # Default scrape interval
  scrape_interval: 15s
  # Rule evaluation interval
  evaluation_interval: 15s
  # External labels used to identify this cluster
  external_labels:
    cluster: 'production'
    region: 'us-west-1'
    environment: 'prod'
```

Key parameters:

- scrape_interval: controls how often data is collected, trading resolution against system load
- evaluation_interval: how often alerting and recording rules are evaluated
- external_labels: labels attached to data when aggregating or federating across clusters

scrape_configs in detail

```yaml
scrape_configs:
  - job_name: 'api-server'
    # Scrape parameters
    scrape_interval: 30s
    scrape_timeout: 10s

    # Metrics path and parameters
    metrics_path: '/metrics'
    params:
      'format': ['prometheus']

    # Service discovery
    kubernetes_sd_configs:
      - role: endpoints
        namespaces:
          names: ['default', 'monitoring']

    # Relabeling rules
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_name]
        target_label: pod_name
      - source_labels: [__meta_kubernetes_namespace]
        target_label: namespace
```

2. Static Configuration in Practice

Basic static targets

```yaml
scrape_configs:
  - job_name: 'frontend-services'
    static_configs:
      - targets:
          - 'web-01:8080'
          - 'web-02:8080'
          - 'web-03:8080'
        labels:
          tier: 'frontend'
          environment: 'production'

    # Scrape tuning
    scrape_interval: 30s
    scrape_timeout: 5s
    metrics_path: '/actuator/prometheus'

    # HTTP settings
    scheme: https
    basic_auth:
      username: 'monitor'
      password: '${METRICS_PASSWORD}'

    # TLS settings
    tls_config:
      insecure_skip_verify: false
      ca_file: '/etc/ssl/certs/ca.crt'
```

Multi-environment static configuration

```yaml
scrape_configs:
  # Development
  - job_name: 'dev-services'
    static_configs:
      - targets: ['dev-web:8080', 'dev-api:8080']
        labels:
          environment: 'development'
    scrape_interval: 60s   # lower frequency in development

  # Production
  - job_name: 'prod-services'
    static_configs:
      - targets: ['prod-web-01:8080', 'prod-web-02:8080']
        labels:
          environment: 'production'
    scrape_interval: 15s   # higher frequency in production

  # Infrastructure
  - job_name: 'infrastructure'
    static_configs:
      - targets:
          - 'node-exporter:9100'
          - 'redis-exporter:9121'
          - 'postgres-exporter:9187'
        labels:
          component: 'infrastructure'
```

3. Dynamic Service Discovery

File-based service discovery

Target files:

```yaml
# targets/frontend.yml
- targets:
    - 'web-01.company.com:8080'
    - 'web-02.company.com:8080'
  labels:
    service: 'frontend'
    tier: 'web'
    environment: 'production'

# targets/backend.yml
- targets:
    - 'api-01.company.com:8080'
    - 'api-02.company.com:8080'
  labels:
    service: 'backend'
    tier: 'api'
    environment: 'production'
```

Prometheus configuration:

```yaml
scrape_configs:
  - job_name: 'file-sd-frontend'
    file_sd_configs:
      - files:
          - 'targets/frontend.yml'
        refresh_interval: 5m

  - job_name: 'file-sd-backend'
    file_sd_configs:
      - files:
          - 'targets/backend.yml'
        refresh_interval: 5m
```

Kubernetes service discovery

```yaml
scrape_configs:
  # Discover all pods
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Only scrape annotated pods
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Metrics path from the annotation
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Build the scrape address from the pod address plus the annotated port
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      # Standard labels
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        target_label: kubernetes_pod_name
      - source_labels: [__meta_kubernetes_pod_container_name]
        target_label: kubernetes_container_name
```

Consul service discovery

```yaml
scrape_configs:
  - job_name: 'consul-services'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        datacenter: 'dc1'
        services:
          - 'web-service'
          - 'api-service'
          - 'database-service'
    relabel_configs:
      - source_labels: [__meta_consul_service]
        target_label: consul_service
      - source_labels: [__meta_consul_tags]
        separator: ','
        regex: 'environment:(.+)'
        target_label: environment
      - source_labels: [__meta_consul_tags]
        separator: ','
        regex: 'version:(.+)'
        target_label: version
```
4. Label Management in Practice

relabel_configs in depth

```yaml
scrape_configs:
  - job_name: 'advanced-relabeling'
    static_configs:
      - targets: ['host1:8080', 'host2:8080']
    relabel_configs:
      # 1. Rename a label
      - source_labels: [__address__]
        regex: '(.*):(.*)'
        replacement: '${1}'
        target_label: instance

      # 2. Map a label value
      - source_labels: [__meta_environment]
        regex: 'prod'
        replacement: 'production'
        target_label: environment

      # 3. Drop labels
      - regex: 'temp_.*'
        action: labeldrop

      # 4. Add a label conditionally
      - source_labels: [__meta_critical_service]
        regex: 'true'
        target_label: priority
        replacement: 'high'

      # 5. Default label
      - target_label: datacenter
        replacement: 'us-east-1'

      # 6. Hash-based sharding
      - source_labels: [__address__]
        modulus: 4
        target_label: __tmp_hash
        action: hashmod
      - source_labels: [__tmp_hash]
        regex: '0'
        action: keep
```

Working with meta labels

Meta labels from Kubernetes service discovery:

```yaml
relabel_configs:
  # Pod metadata
  - source_labels: [__meta_kubernetes_pod_label_app]
    target_label: app
  - source_labels: [__meta_kubernetes_pod_label_version]
    target_label: version

  # Node information
  - source_labels: [__meta_kubernetes_pod_node_name]
    target_label: kubernetes_node
    replacement: '${1}'

  # Service discovery metadata
  - source_labels: [__meta_kubernetes_service_name]
    target_label: kubernetes_service
```

5. Remote Read and Write

Remote write:

```yaml
# prometheus.yml
remote_write:
  - url: "http://thanos-receive:10908/api/v1/receive"

    # Queue tuning
    queue_config:
      capacity: 2500
      max_shards: 200
      min_shards: 1
      max_samples_per_send: 500
      batch_send_deadline: 5s
      min_backoff: 30ms
      max_backoff: 100ms

    # Filter what is written remotely
    write_relabel_configs:
      - source_labels: [__name__]
        regex: 'up|process_.*|go_.*'
        action: drop

    # Metadata
    metadata_config:
      send: true
      send_interval: 1m

    # AWS SigV4 authentication
    sigv4:
      region: us-east-1
```

Remote read:

```yaml
remote_read:
  - url: "http://thanos-query:10902/api/v1/read"
    read_recent: true

    # Only use the remote endpoint for matching queries
    required_matchers:
      cluster: "production"

    remote_timeout: 30s
```

6. Advanced Configuration Techniques

Config templating

```yaml
# Base configuration template
- job_name: '{{.ServiceName}}'
  metrics_path: '{{.MetricsPath | default "/metrics"}}'
  static_configs:
    - targets: [{{range $i, $e := .Targets}}{{if $i}},{{end}}{{$e}}{{end}}]
      labels:
        service: '{{.ServiceName}}'
        environment: '{{.Environment}}'

# Values fed into the Go template
__targets:
  - service_name: 'user-service'
    metrics_path: '/actuator/metrics'
    targets: ['user-01:8080', 'user-02:8080']
    environment: 'production'
  - service_name: 'order-service'
    metrics_path: '/metrics'
    targets: ['order-01:8080', 'order-02:8080']
    environment: 'production'
```

Environment variables in the configuration

Note that Prometheus does not expand ${VAR} references in prometheus.yml by itself (only external_labels can be expanded, behind the --enable-feature=expand-external-labels flag), so a file like the one below has to be rendered by an external tool before Prometheus reads it, as shown in the sketch that follows.

```yaml
global:
  scrape_interval: ${SCRAPE_INTERVAL:-15s}

scrape_configs:
  - job_name: 'database'
    static_configs:
      - targets: ['${DB_HOST}:${DB_METRICS_PORT}']
    basic_auth:
      username: '${METRICS_USER}'
      password: '${METRICS_PASSWORD}'
```
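A minimal sketch of that pre-processing step, assuming envsubst is available and using an illustrative template path:

```bash
# Render the template with the current environment, validate it, then reload Prometheus
envsubst < /etc/prometheus/prometheus.yml.tmpl > /etc/prometheus/prometheus.yml
promtool check config /etc/prometheus/prometheus.yml
curl -X POST http://localhost:9090/-/reload   # requires --web.enable-lifecycle
```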
7. Validating and Optimizing the Configuration

Validation tools:

```bash
# Validate configuration syntax
promtool check config prometheus.yml

# Validate rule files
promtool check rules rules/*.yml

# Unit-test rules
promtool test rules test.yml

# List metric names known to the server
curl -s http://localhost:9090/api/v1/label/__name__/values | jq .
```

Performance tuning. Note that in Prometheus 2.x most of these knobs are command-line flags (for example --storage.tsdb.retention.time=30d, --query.timeout=2m, --query.max-concurrency=20) rather than prometheus.yml keys, so read the block below as a checklist of settings to tune rather than literal configuration syntax:

```yaml
# Storage tuning
storage:
  tsdb:
    # Memory
    memory_size: 4GB
    # Block settings
    min_block_duration: 2h
    max_block_duration: 24h
    # Retention
    retention: 30d

# Query tuning
query:
  # Query timeout
  timeout: 2m
  # Concurrent queries
  max_concurrency: 20
  # Query logging
  log_queries: true
```

8. A Complete Production Configuration

```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    region: 'us-west-2'
    environment: 'prod'

# Alerting rules
rule_files:
  - "rules/node_alerts.yml"
  - "rules/service_alerts.yml"
  - "rules/business_alerts.yml"

# Remote write
remote_write:
  - url: "http://longterm-storage:8480/insert/0/prometheus"
    queue_config:
      capacity: 10000
      max_shards: 200

# Scrape configuration
scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 30s

  # Nodes
  - job_name: 'node'
    file_sd_configs:
      - files: ['targets/nodes.yml']
    metrics_path: /metrics
    scrape_interval: 15s

  # Kubernetes services
  - job_name: 'kubernetes-services'
    kubernetes_sd_configs:
      - role: service
    relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
```

This configuration guide covers scenarios from basic static targets to advanced service discovery and should help you build a stable, reliable data collection layer for your monitoring stack.
-
1. Environment Preparation and Planning

System requirements

Hardware guidance (for a typical load of roughly 100,000 samples per second):

- CPU: 4 cores or more
- Memory: 16 GB or more
- Disk: SSD, sized according to the retention period
- Network: gigabit NIC

Capacity planning formula:

```text
required disk space = retention time x scrape frequency x number of series x bytes per sample
```

For example, ingesting about 100,000 samples per second at roughly 2 bytes per sample with 30 days of retention needs about 100,000 x 2 B x 2,592,000 s ≈ 520 GB.

Deployment architectures

Single-node deployment (reconstructed from the original diagram):

```text
Exporters / Applications ──▶ Prometheus Server (embedded TSDB)
```

High-availability deployment:

```text
Prometheus A ─┐
              ├── both instances scrape the same targets; reads go through a load balancer
Prometheus B ─┘
```

2. Installation Methods

Binary installation (Linux)

Download and unpack:

```bash
# Download the release
VERSION="2.47.0"
ARCH="linux-amd64"
wget https://github.com/prometheus/prometheus/releases/download/v$VERSION/prometheus-$VERSION.$ARCH.tar.gz

# Unpack
tar xvfz prometheus-$VERSION.$ARCH.tar.gz
cd prometheus-$VERSION.$ARCH
```

Directory layout:

```text
prometheus-2.47.0.linux-amd64/
├── prometheus       # main binary
├── promtool         # admin tool
├── prometheus.yml   # main configuration file
└── consoles/        # console files
```

Docker deployment

Run with Docker:

```bash
# Create configuration directories
mkdir -p /opt/prometheus/{data,conf}

# Copy the configuration file
cp prometheus.yml /opt/prometheus/conf/

# Run the container
docker run -d \
  --name=prometheus \
  -p 9090:9090 \
  -v /opt/prometheus/conf/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v /opt/prometheus/data:/prometheus \
  prom/prometheus:v2.47.0
```

Docker Compose:

```yaml
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:v2.47.0
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'

volumes:
  prometheus_data:
```

Kubernetes Operator deployment

Install the Prometheus Operator:

```bash
kubectl create namespace monitoring
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install prometheus prometheus-community/kube-prometheus-stack -n monitoring
```

Custom Prometheus instance:

```yaml
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: main
  namespace: monitoring
spec:
  serviceMonitorSelector: {}
  resources:
    requests:
      memory: 400Mi
    limits:
      memory: 2Gi
  retention: 30d
```

Building from source

```bash
# Requires Go 1.19+
git clone https://github.com/prometheus/prometheus.git
cd prometheus
make build

# Verify the build
./prometheus --version
```

3. Platform-Specific Deployment

Windows

Install as a Windows service:

```powershell
# Download the Windows build
Invoke-WebRequest https://github.com/prometheus/prometheus/releases/download/v2.47.0/prometheus-2.47.0.windows-amd64.zip -OutFile prometheus.zip

# Unpack and register the service
Expand-Archive prometheus.zip -DestinationPath C:\Prometheus
New-Service -Name "Prometheus" -BinaryPathName "C:\Prometheus\prometheus.exe --config.file=C:\Prometheus\prometheus.yml" -StartupType Automatic
```

macOS development setup

Install with Homebrew:

```bash
brew install prometheus
brew services start prometheus
```

4. Directory Layout and File Permissions

Recommended layout:

```text
/opt/prometheus/
├── bin/     # binaries
├── conf/    # configuration files
├── data/    # data directory
├── logs/    # log files
└── rules/   # alerting rule files
```

Permissions:

```bash
# Create a dedicated user
useradd --no-create-home --shell /bin/false prometheus

# Create directories and set ownership
mkdir -p /opt/prometheus/{bin,conf,data,logs,rules}
chown -R prometheus:prometheus /opt/prometheus
chmod 755 /opt/prometheus/{bin,conf,logs,rules}
```
5. systemd Service Configuration

Create the systemd unit:

```ini
# /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus Time Series Collection and Processing Server
Documentation=https://prometheus.io/docs/
Wants=network-online.target
After=network-online.target

[Service]
User=prometheus
Group=prometheus
Type=simple
Restart=on-failure
ExecStart=/opt/prometheus/bin/prometheus \
    --config.file=/opt/prometheus/conf/prometheus.yml \
    --storage.tsdb.path=/opt/prometheus/data \
    --web.console.libraries=/opt/prometheus/console_libraries \
    --web.console.templates=/opt/prometheus/consoles \
    --web.listen-address=0.0.0.0:9090 \
    --storage.tsdb.retention.time=30d \
    --web.enable-lifecycle
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
SendSIGKILL=no
LimitNOFILE=8192

[Install]
WantedBy=multi-user.target
```

Manage the service:

```bash
# Reload systemd
systemctl daemon-reload

# Start the service
systemctl start prometheus

# Enable start on boot
systemctl enable prometheus

# Check status
systemctl status prometheus
```

6. Initial Configuration File

A basic prometheus.yml:

```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerting rule files
rule_files:
  - "rules/*.yml"

# Scrape configuration
scrape_configs:
  # Monitor Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    metrics_path: /metrics
    scrape_interval: 30s

  # Node Exporter example
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['localhost:9100']
    scrape_interval: 15s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: []
```

7. Verifying the Installation

Check the service:

```bash
# Check the process
ps aux | grep prometheus

# Check the listening port
netstat -tlnp | grep 9090

# Check the logs
journalctl -u prometheus -f
```

Access the Web UI:

1. Open http://localhost:9090 in a browser
2. Use Status → Targets to inspect scrape target status
3. Use Graph to run queries; the expression up shows which targets are reachable

Basic health checks:

```bash
# Validate the configuration with promtool
/opt/prometheus/bin/promtool check config /opt/prometheus/conf/prometheus.yml

# API health check
curl http://localhost:9090/-/healthy
```

At this point Prometheus is installed and running.