Docker · Kubernetes · Grafana · ArgoCD · AWS

📁 Recommended Folder Structure

C:\devops\  (or /opt/devops on Linux)
├── monitoring/
│   ├── docker-compose.yml
│   ├── prometheus/
│   │   └── prometheus.yml
│   ├── alertmanager/
│   │   └── alertmanager.yml
│   └── data/
│       ├── prometheus/
│       └── grafana/
│
├── logging/
│   ├── docker-compose.yml
│   ├── loki/
│   │   └── loki-config.yml
│   └── promtail/
│       └── promtail-config.yml
│
├── tracing/
│   └── docker-compose.yml
│
├── cicd/
│   ├── docker-compose.yml
│   └── data/
│       ├── gitea/
│       └── jenkins/
│
└── security/
    ├── docker-compose.yml
    └── vault/
💡
Data persistence

All stacks below use bind mounts (./data/...), so your data survives container restarts and rebuilds. On Linux, the Prometheus data directory must be writable by UID 65534 and the Grafana data directory by UID 472: sudo chown 65534 ./data/prometheus && sudo chown 472 ./data/grafana

📊 Stack 1 — Monitoring (Prometheus + Grafana)

Full observability: metrics collection, dashboards, alerts, and node-level system metrics.

Services: Prometheus · Grafana · AlertManager · Node Exporter · cAdvisor

# monitoring/docker-compose.yml
# Metrics stack: Prometheus scrapes itself, Node Exporter, cAdvisor and
# AlertManager; Grafana reads from Prometheus. Persistent state lives in bind
# mounts under ./data so it survives container re-creation.
services:

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - ./data/prometheus:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      # Allows `curl -X POST http://localhost:9090/-/reload` to re-read the config.
      - '--web.enable-lifecycle'
      - '--web.cors.origin=.*'
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    # Run as the Grafana UID so files in ./data/grafana keep a stable owner.
    user: "472"
    environment:
      # Default credentials - change them for anything beyond a local lab.
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/overview.json
      # Legacy alerting (GF_ALERTING_ENABLED / [alerting].enabled) was removed in
      # Grafana 11, and setting it to true makes Grafana refuse to start.
      # Unified alerting is its replacement.
      - GF_UNIFIED_ALERTING_ENABLED=true
    volumes:
      - ./data/grafana:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      - prometheus
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - ./data/alertmanager:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"
    # Host /proc, /sys and / are mounted read-only so the exporter reports
    # host metrics instead of the container's own view.
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # `$$` is Compose's escape for a literal `$` (regex end-of-string anchor).
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    privileged: true
    devices:
      - /dev/kmsg
    restart: unless-stopped
# prometheus/prometheus.yml
# Scrape and rule-evaluation cadence shared by every job below.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerts are sent to the AlertManager container.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Rule files are picked up from every .yml file in the rules directory.
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  # Prometheus scraping its own endpoint.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Host-level metrics from Node Exporter.
  - job_name: "node"
    static_configs:
      - targets:
          - "node-exporter:9100"

  # Per-container metrics from cAdvisor.
  - job_name: "cadvisor"
    static_configs:
      - targets:
          - "cadvisor:8080"

  # AlertManager's own metrics.
  - job_name: "alertmanager"
    static_configs:
      - targets:
          - "alertmanager:9093"
# grafana/provisioning/datasources/prometheus.yml
# Provisions Prometheus as the default data source; `editable: false` keeps it
# from being changed in the Grafana UI.
apiVersion: 1
datasources:
  - name: "Prometheus"
    type: "prometheus"
    url: "http://prometheus:9090"
    isDefault: true
    editable: false
# Start the monitoring stack
cd monitoring
# Create the bind-mount target directories before the first start.
for dir in prometheus grafana alertmanager; do
  mkdir -p "data/$dir"
done
docker compose up -d

# Access points:
#   Grafana:      http://localhost:3000  (admin/admin)
#   Prometheus:   http://localhost:9090
#   AlertManager: http://localhost:9093
#   cAdvisor:     http://localhost:8080

📋 Stack 2 — Logging (Loki + Promtail + Grafana)

Centralised log aggregation. Promtail ships logs from all containers to Loki. Query with LogQL in Grafana.

# logging/docker-compose.yml
# Log stack: Promtail discovers containers and tails host logs, pushes them to
# Loki, and a dedicated Grafana instance (host port 3001) queries Loki.
services:

  loki:
    image: grafana/loki:latest
    container_name: loki
    ports:
      - "3100:3100"
    volumes:
      - ./loki/loki-config.yml:/etc/loki/local-config.yaml:ro
      # NOTE(review): the Loki image runs as a non-root user (UID 10001), so
      # ./data/loki must be writable by that UID on Linux - confirm for your tag.
      - ./data/loki:/loki
    command: -config.file=/etc/loki/local-config.yaml
    restart: unless-stopped

  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    volumes:
      - ./promtail/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      # The Promtail config uses docker_sd_configs with
      # host: unix:///var/run/docker.sock. Without this mount the socket does
      # not exist inside the container and container discovery fails.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yml
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    # Distinct name and host port so it does not clash with another Grafana
    # container named `grafana` publishing port 3000.
    container_name: grafana-logs
    ports:
      - "3001:3000"
    user: "472"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - ./data/grafana:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    restart: unless-stopped
# loki/loki-config.yml
# Single-node Loki storing index and chunks on the local filesystem under /loki.
auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

schema_config:
  configs:
    # Loki 3.x enables structured metadata by default, which requires the TSDB
    # index with schema v13. The previous boltdb-shipper/v11 combination is
    # rejected at startup by current `latest` images.
    # NOTE(review): if data already exists under the old schema, add a new
    # period with a future `from` date instead of editing this one.
    - from: "2020-10-24"
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  retention_period: 168h   # 7 days

compactor:
  working_directory: /loki/compactor
  retention_enabled: true
  # Required by Loki 3.x whenever retention is enabled.
  delete_request_store: filesystem
# promtail/promtail-config.yml
# Ships Docker container logs and host /var/log/*.log files to Loki.
server:
  http_listen_port: 9080

# Read offsets. /tmp lives inside the container, so they are lost when the
# container is re-created and files are then read again from the start.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: containers
    # Discovers running containers through the Docker API; the Docker socket
    # must be mounted into the Promtail container at this path.
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Docker reports names as "/name"; capture the part after the slash so
      # the label reads `container="name"` (default replacement is `$1`).
      - source_labels: [__meta_docker_container_name]
        regex: '/(.*)'
        target_label: container
      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
        target_label: service

  - job_name: system
    static_configs:
      - targets: [localhost]
        labels:
          job: syslog
          # Glob of host log files to tail.
          __path__: /var/log/*.log
# grafana/provisioning/datasources/loki.yml
# Provisions Loki as a non-default data source; `editable: false` keeps it from
# being changed in the Grafana UI.
apiVersion: 1
datasources:
  - name: "Loki"
    type: "loki"
    url: "http://loki:3100"
    isDefault: false
    editable: false

🔍 Stack 3 — Tracing (Jaeger + OpenTelemetry)

# tracing/docker-compose.yml
# Applications send OTLP to the OpenTelemetry Collector on host ports
# 4317/4318; the collector forwards traces to Jaeger over the Compose network.
services:

  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: jaeger
    ports:
      - "16686:16686"   # UI
      - "14268:14268"   # HTTP collector
      - "6831:6831/udp" # UDP agent
      # OTLP ports 4317/4318 are deliberately NOT published here: the
      # otel-collector service publishes the same host ports, and two
      # containers cannot bind one host port. Jaeger still listens on them
      # inside the Compose network.
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      # Traces are kept in memory only and are lost when the container restarts.
      - SPAN_STORAGE_TYPE=memory
    restart: unless-stopped

  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    container_name: otel-collector
    ports:
      - "4317:4317"   # OTLP gRPC receiver
      - "4318:4318"   # OTLP HTTP receiver
      - "8888:8888"   # collector metrics
    volumes:
      - ./otel-collector-config.yml:/etc/otelcol-contrib/config.yaml:ro
    command: --config /etc/otelcol-contrib/config.yaml
    depends_on:
      - jaeger
    restart: unless-stopped
# otel-collector-config.yml
# Receives OTLP over gRPC and HTTP, then exports traces to Jaeger and to the
# collector's own console output.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

exporters:
  # The dedicated `jaeger` exporter was removed from the collector, so
  # current images reject it. Jaeger accepts OTLP natively when
  # COLLECTOR_OTLP_ENABLED=true, so send OTLP gRPC to its port 4317.
  otlp/jaeger:
    endpoint: jaeger:4317
    tls:
      insecure: true
  # `debug` replaces the removed `logging` exporter; `verbosity: normal` is
  # the counterpart of the former `loglevel: info`.
  debug:
    verbosity: normal

service:
  pipelines:
    traces:
      receivers: [otlp]
      exporters: [otlp/jaeger, debug]
# Start tracing stack
cd tracing && docker compose up -d

# Access:
#   Jaeger UI: http://localhost:16686

# Instrument your app with OpenTelemetry SDK
# Python example (options given as OTEL_* environment variables, the
# equivalent of the matching opentelemetry-instrument command-line flags):
pip install opentelemetry-distro opentelemetry-exporter-otlp
OTEL_TRACES_EXPORTER=otlp \
OTEL_SERVICE_NAME=myapp \
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
  opentelemetry-instrument python app.py

⚙️ Stack 4 — CI/CD (Gitea + Jenkins)

Self-hosted Git + CI/CD pipeline — no GitHub or cloud services required.

# cicd/docker-compose.yml
# Self-hosted Git (Gitea) plus Jenkins controller and an SSH build agent.
# NOTE: host ports 3000 and 8080 are also used by Grafana and cAdvisor in the
# monitoring stack; remap one side if both stacks run on the same host.
services:

  gitea:
    image: gitea/gitea:latest
    container_name: gitea
    ports:
      - "3000:3000"
      - "2222:22"       # SSH for git clone/push
    environment:
      - GITEA__database__DB_TYPE=sqlite3
      - GITEA__server__ROOT_URL=http://localhost:3000
      # Must match the published host SSH port above.
      - GITEA__server__SSH_PORT=2222
      - GITEA__log__LEVEL=Warn
    volumes:
      - ./data/gitea:/data
    restart: unless-stopped

  jenkins:
    image: jenkins/jenkins:lts
    container_name: jenkins
    ports:
      - "8080:8080"
      - "50000:50000"   # inbound agent port
    # The setup wizard is intentionally left enabled. With
    # -Djenkins.install.runSetupWizard=false Jenkins starts without security,
    # and /var/jenkins_home/secrets/initialAdminPassword is never generated.
    # That is dangerous for a root container holding the Docker socket.
    volumes:
      - ./data/jenkins:/var/jenkins_home
      - /var/run/docker.sock:/var/run/docker.sock
    user: root                    # needed for Docker socket access
    restart: unless-stopped

  jenkins-agent:
    image: jenkins/ssh-agent:latest
    container_name: jenkins-agent
    environment:
      # Taken from the shell environment or an .env file next to this file.
      - JENKINS_AGENT_SSH_PUBKEY=${JENKINS_AGENT_SSH_PUBKEY}
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    restart: unless-stopped
# Start CI/CD stack
cd cicd
# Create the bind-mount target directories before the first start.
for dir in gitea jenkins; do
  mkdir -p "data/$dir"
done
docker compose up -d

# Get Jenkins initial admin password
docker exec jenkins cat /var/jenkins_home/secrets/initialAdminPassword

# Access:
#   Gitea:   http://localhost:3000
#   Jenkins: http://localhost:8080

🔐 Stack 5 — Security (Vault + Trivy)

# security/docker-compose.yml
# Vault in dev mode plus a Trivy scan server.
services:

  vault:
    image: "hashicorp/vault:latest"
    container_name: vault
    ports:
      - "8200:8200"
    environment:
      # dev mode — NOT for prod
      VAULT_DEV_ROOT_TOKEN_ID: "root"
      VAULT_DEV_LISTEN_ADDRESS: "0.0.0.0:8200"
    cap_add: [IPC_LOCK]
    volumes:
      - "./vault/config:/vault/config:ro"
      # NOTE(review): a dev-mode server presumably keeps its data in memory,
      # in which case this mount stays empty - confirm.
      - "./data/vault:/vault/data"
    command: ["server", "-dev"]
    restart: unless-stopped

  trivy-server:
    image: "aquasec/trivy:latest"
    container_name: trivy
    ports:
      - "8090:8090"
    volumes:
      # Cache directory for the Trivy server, kept on the host.
      - "./data/trivy:/root/.cache/trivy"
    command: ["server", "--listen", "0.0.0.0:8090"]
    restart: unless-stopped
# Start security stack
cd security
mkdir -p data/vault data/trivy
docker compose up -d

# Configure Vault
export VAULT_ADDR='http://localhost:8200'
export VAULT_TOKEN='root'

# Enable KV secrets engine
# A dev-mode server already mounts KV v2 at secret/, and enabling it a second
# time fails with "path is already in use". Enable it only when it is missing.
vault secrets list | grep -q '^secret/' || vault secrets enable -path=secret kv-v2

# Store a secret
vault kv put secret/myapp/db password="Super@1234" user="app"

# Read it back
vault kv get secret/myapp/db

# Scan image via Trivy server
# (the standalone `trivy client` subcommand no longer exists in current Trivy;
# client mode is selected with --server on the normal scan commands)
trivy image --server http://localhost:8090 nginx:latest

# Access:
#   Vault UI: http://localhost:8200 (token: root)

🚀 Stack 6 — Full Observability (Metrics + Logs + Traces)

The complete Grafana observability stack — Prometheus, Loki, Tempo, and Grafana all wired together. This is the modern alternative to the ELK stack.

# full-stack/docker-compose.yml
# Metrics (Prometheus), logs (Loki + Promtail) and traces (Tempo) behind one
# Grafana. Persistent state lives in bind mounts under ./data.
services:

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./data/prometheus:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
    restart: unless-stopped

  loki:
    image: grafana/loki:latest
    container_name: loki
    ports:
      - "3100:3100"
    volumes:
      - ./loki/loki-config.yml:/etc/loki/local-config.yaml:ro
      - ./data/loki:/loki
    command: -config.file=/etc/loki/local-config.yaml
    restart: unless-stopped

  tempo:
    image: grafana/tempo:latest
    container_name: tempo
    ports:
      - "3200:3200"   # HTTP
      - "4317:4317"   # OTLP gRPC
      - "4318:4318"   # OTLP HTTP
    volumes:
      - ./tempo/tempo-config.yml:/etc/tempo.yaml:ro
      - ./data/tempo:/tmp/tempo
    command: -config.file=/etc/tempo.yaml
    restart: unless-stopped

  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    volumes:
      - ./promtail/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      # Needed when the Promtail config discovers containers through
      # docker_sd_configs (host: unix:///var/run/docker.sock). Without this
      # mount the socket does not exist inside the container.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yml
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"
    # Host /proc and /sys are mounted read-only so host metrics are reported.
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    ports:
      # Published on host port 8090; cAdvisor itself listens on 8080.
      - "8090:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
    privileged: true
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    user: "472"
    environment:
      # Default credentials - change them for anything beyond a local lab.
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
    volumes:
      - ./data/grafana:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    depends_on:
      - prometheus
      - loki
      - tempo
    restart: unless-stopped
# grafana/provisioning/datasources/all.yml
# Provisions Prometheus, Loki and Tempo and cross-links logs and traces.
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    isDefault: true

  - name: Loki
    # Explicit uid: the Tempo data source refers to this data source as
    # `datasourceUid: loki`. Without it Grafana assigns a generated uid and
    # the trace-to-logs link points at nothing.
    uid: loki
    type: loki
    url: http://loki:3100
    jsonData:
      derivedFields:
        # Extracts the trace id from JSON log lines and links it to Tempo.
        - datasourceUid: tempo
          matcherRegex: '"trace_id":\s*"(\w+)"'
          name: TraceID
          # `$$` escapes `$` in provisioning files, which otherwise expand
          # `$VAR`/`${VAR}` as environment variables.
          url: "$${__value.raw}"    # links log lines to traces

  - name: Tempo
    uid: tempo
    type: tempo
    url: http://tempo:3200
    jsonData:
      tracesToLogs:
        datasourceUid: loki        # links traces to logs
# tempo/tempo-config.yml
# Single-binary Tempo: receives OTLP traces and stores them on local disk.
server:
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

# Small, short-lived blocks so traces become queryable quickly in a demo setup.
ingester:
  trace_idle_period: 10s
  max_block_bytes: 1_000_000
  max_block_duration: 5m

compactor:
  compaction:
    compaction_window: 1h

storage:
  trace:
    backend: local
    # Keep the write-ahead log next to the blocks under /tmp/tempo. The
    # default WAL location is outside that directory, so with only /tmp/tempo
    # persisted, traces not yet flushed to a block would be lost when the
    # container is re-created.
    wal:
      path: /tmp/tempo/wal
    local:
      path: /tmp/tempo/blocks
# Start the full observability stack
cd full-stack
mkdir -p data/{prometheus,grafana,loki,tempo}

# Linux: fix permissions - each bind-mounted data dir must be writable by the
# UID its container runs as.
sudo chown 65534:65534 data/prometheus
sudo chown 472:472 data/grafana
# NOTE(review): current Loki and Tempo images run as non-root UID 10001;
# without this they cannot write to the root-owned bind mounts. Confirm the
# UID if you pin older image tags.
sudo chown 10001:10001 data/loki data/tempo

docker compose up -d
docker compose ps    # verify all running

# Access Grafana: http://localhost:3000 (admin/admin)
# All 3 data sources are pre-wired — Explore → select Loki/Prometheus/Tempo

📋 All Stacks — Ports Reference

| Service | Port | URL | Default Credentials |
|---|---|---|---|
| Grafana | 3000 | http://localhost:3000 | admin / admin |
| Prometheus | 9090 | http://localhost:9090 | |
| AlertManager | 9093 | http://localhost:9093 | |
| Node Exporter | 9100 | http://localhost:9100/metrics | |
| cAdvisor | 8090 (8080 in Stack 1) | http://localhost:8090 | |
| Loki | 3100 | http://localhost:3100 | |
| Tempo | 3200 | http://localhost:3200 | |
| Jaeger UI | 16686 | http://localhost:16686 | |
| Gitea | 3000 | http://localhost:3000 | set on first run |
| Jenkins | 8080 | http://localhost:8080 | admin / (from log) |
| Vault | 8200 | http://localhost:8200 | token: root (dev mode) |
| Trivy Server | 8090 | http://localhost:8090 | |

🛠️ Tips & Troubleshooting

# Check logs for any container
docker logs --follow grafana
docker logs --follow prometheus

# Check all containers status
docker compose ps

# Reload Prometheus config without restart
curl --request POST http://localhost:9090/-/reload

# Check what Prometheus is scraping
curl http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].scrapeUrl'

# Test Loki is receiving logs
curl http://localhost:3100/loki/api/v1/labels

# Push a test log to Loki manually
# (the timestamp is the current time in nanoseconds, spliced in via the
# '"$(...)"' quoting break)
curl --request POST http://localhost:3100/loki/api/v1/push \
  --header "Content-Type: application/json" \
  --data '{"streams":[{"stream":{"app":"test"},"values":[["'"$(date +%s%N)"'","hello from curl"]]}]}'

# Stop a stack cleanly
docker compose down

# Stop and wipe all data (fresh start)
docker compose down --volumes
rm -rf ./data/*
⚠️
Windows users — Linux containers

On Docker Desktop (Windows), the containers run inside WSL2. Paths like /proc and /var/lib/docker refer to the WSL2 VM, not Windows. Node Exporter will show WSL2 metrics, not Windows host metrics. For Windows host monitoring use windows_exporter instead.