DevOps 实用工具与最佳实践 🚀
DevOps是开发和运维的完美结合,通过自动化、监控和协作来加速软件交付。
📊 目录
🔧 Git 版本控制
基本操作
# 设置文件为可执行
git update-index --chmod=+x <file>
# 查看文件权限变更
git ls-files --stage <file>
# 强制推送(谨慎使用)
git push --force-with-lease origin main
高级Git技巧
# 交互式变基,整理提交历史
git rebase -i HEAD~3
# 暂存部分文件内容
git add -p <file>
# 查找引入bug的提交
git bisect start
git bisect bad HEAD
git bisect good v1.0
# 查看文件每行的最后修改
git blame <file>
# 恢复删除的分支
git reflog
git checkout -b <branch-name> <commit-hash>
Git工作流
# Gitflow工作流示例
git flow init
git flow feature start new-feature
git flow feature finish new-feature
# 创建发布分支
git flow release start 1.0.0
git flow release finish 1.0.0
🐳 Docker 容器化
Dockerfile 最佳实践
# 多阶段构建示例
FROM node:16-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci --only=production
FROM node:16-alpine AS runtime
RUN addgroup -g 1001 -S nodejs
RUN adduser -S nodejs -u 1001
WORKDIR /app
COPY --from=builder --chown=nodejs:nodejs /app/node_modules ./node_modules
COPY --chown=nodejs:nodejs . .
USER nodejs
EXPOSE 3000
CMD ["node", "server.js"]
Docker Compose 服务编排
version: '3.8'
services:
app:
build: .
ports:
- "3000:3000"
environment:
- NODE_ENV=production
depends_on:
- db
restart: unless-stopped
db:
image: postgres:13-alpine
environment:
POSTGRES_DB: myapp
POSTGRES_USER: ${DB_USER}
POSTGRES_PASSWORD: ${DB_PASSWORD}
volumes:
- postgres_data:/var/lib/postgresql/data
restart: unless-stopped
volumes:
postgres_data:
常用Docker命令
# 清理未使用的资源
docker system prune -af
# 进入运行中的容器
docker exec -it <container_id> /bin/bash
# 查看容器资源使用情况
docker stats
# 导出/导入镜像
docker save -o app.tar myapp:latest
docker load -i app.tar
# 多平台构建
docker buildx create --use
docker buildx build --platform linux/amd64,linux/arm64 -t myapp:latest --push .
⚙️ CI/CD 自动化
GitHub Actions工作流
name: CI/CD Pipeline
on:
push:
branches: [ main, develop ]
pull_request:
branches: [ main ]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Setup Node.js
uses: actions/setup-node@v3
with:
node-version: '16'
cache: 'npm'
- name: Install dependencies
run: npm ci
- name: Run tests
run: npm test
- name: Run security audit
run: npm audit
deploy:
needs: test
runs-on: ubuntu-latest
if: github.ref == 'refs/heads/main'
steps:
- name: Deploy to production
run: |
echo "Deploying to production..."
# 部署脚本
Jenkins Pipeline 示例
pipeline {
agent any
environment {
DOCKER_REGISTRY = 'registry.example.com'
IMAGE_NAME = 'myapp'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Test') {
steps {
sh 'npm test'
publishTestResults testResultsPattern: 'test-results.xml'
}
}
stage('Build') {
steps {
script {
def image = docker.build("${IMAGE_NAME}:${BUILD_NUMBER}")
docker.withRegistry("https://${DOCKER_REGISTRY}", 'docker-registry-credentials') {
image.push()
image.push('latest')
}
}
}
}
stage('Deploy') {
steps {
sh """
kubectl set image deployment/myapp \
myapp=${DOCKER_REGISTRY}/${IMAGE_NAME}:${BUILD_NUMBER}
"""
}
}
}
post {
always {
cleanWs()
}
}
}
📈 监控与日志
Prometheus 配置
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'node-exporter'
static_configs:
- targets: ['localhost:9100']
- job_name: 'application'
static_configs:
- targets: ['app:3000']
metrics_path: '/metrics'
日志聚合配置
# docker-compose.yml for ELK stack
version: '3.8'
services:
elasticsearch:
image: elasticsearch:7.14.0
environment:
- discovery.type=single-node
ports:
- "9200:9200"
logstash:
image: logstash:7.14.0
volumes:
- ./logstash.conf:/usr/share/logstash/pipeline/logstash.conf
ports:
- "5044:5044"
kibana:
image: kibana:7.14.0
ports:
- "5601:5601"
environment:
- ELASTICSEARCH_HOSTS=http://elasticsearch:9200
🏗️ 基础设施即代码
Terraform 示例
# AWS EC2 实例配置
provider "aws" {
region = var.aws_region
}
resource "aws_security_group" "web" {
name_prefix = "web-sg"
ingress {
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
resource "aws_instance" "web" {
ami = var.ami_id
instance_type = var.instance_type
vpc_security_group_ids = [aws_security_group.web.id]
user_data = file("${path.module}/user-data.sh")
tags = {
Name = "web-server"
Environment = var.environment
}
}
Ansible Playbook
---
- name: Configure web servers
hosts: webservers
become: yes
vars:
nginx_port: 80
app_user: webapp
tasks:
- name: Install Nginx
package:
name: nginx
state: present
- name: Start and enable Nginx
systemd:
name: nginx
state: started
enabled: yes
- name: Deploy application
copy:
src: ""
dest: /var/www/html/
owner: ""
group: ""
with_fileglob:
- "dist/*"
notify: restart nginx
handlers:
- name: restart nginx
systemd:
name: nginx
state: restarted
📋 最佳实践
🔒 安全实践
# 扫描Docker镜像漏洞
docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \
aquasec/trivy image myapp:latest
# 密钥管理
kubectl create secret generic mysecret \
--from-literal=username=admin \
--from-literal=password='S3cur3P@ssw0rd'
# 网络策略
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: deny-all
spec:
podSelector: {}
policyTypes:
- Ingress
- Egress
🚀 性能优化
# 资源限制
resources:
requests:
memory: "64Mi"
cpu: "250m"
limits:
memory: "128Mi"
cpu: "500m"
# 健康检查
livenessProbe:
httpGet:
path: /health
port: 3000
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /ready
port: 3000
initialDelaySeconds: 5
periodSeconds: 5
📊 指标收集
// Node.js Prometheus指标示例
const prometheus = require('prom-client');
const httpRequestDuration = new prometheus.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'status_code'],
buckets: [0.1, 0.5, 1, 2, 5]
});
const httpRequestsTotal = new prometheus.Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'status_code']
});
// 中间件使用
app.use((req, res, next) => {
const start = Date.now();
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
httpRequestDuration
.labels(req.method, res.statusCode)
.observe(duration);
httpRequestsTotal
.labels(req.method, res.statusCode)
.inc();
});
next();
});
🛠️ 有用的脚本
健康检查脚本
#!/bin/bash
# health-check.sh
SERVICE_URL="http://localhost:3000/health"
MAX_RETRIES=3
RETRY_INTERVAL=5
for i in $(seq 1 $MAX_RETRIES); do
if curl -f -s "$SERVICE_URL" > /dev/null; then
echo "Service is healthy"
exit 0
else
echo "Health check failed (attempt $i/$MAX_RETRIES)"
if [ $i -lt $MAX_RETRIES ]; then
sleep $RETRY_INTERVAL
fi
fi
done
echo "Service is unhealthy after $MAX_RETRIES attempts"
exit 1
备份脚本
#!/bin/bash
# backup.sh
BACKUP_DIR="/backups"
DATE=$(date +%Y%m%d_%H%M%S)
DB_NAME="myapp"
# 数据库备份
docker exec postgres pg_dump -U postgres $DB_NAME > "$BACKUP_DIR/db_$DATE.sql"
# 压缩备份
gzip "$BACKUP_DIR/db_$DATE.sql"
# 清理7天前的备份
find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
echo "Backup completed: db_$DATE.sql.gz"
📚 推荐工具
类别 | 工具 | 用途 |
---|---|---|
容器 | Docker, Podman | 容器化应用 |
编排 | Kubernetes, Docker Swarm | 容器编排 |
CI/CD | Jenkins, GitLab CI, GitHub Actions | 持续集成/部署 |
监控 | Prometheus, Grafana, ELK | 监控和日志 |
IaC | Terraform, Ansible, Pulumi | 基础设施即代码 |
安全 | Vault, Trivy, Falco | 安全扫描和管理 |
💡 小贴士: DevOps是一个持续学习的过程,关键是自动化一切可以自动化的流程,提高开发和运维效率。