Database Setup¶
This guide covers database setup for an enterprise Temporal.io deployment: PostgreSQL installation and configuration, performance optimization, backup and recovery, and maintenance procedures.
Overview¶
The database setup includes:

- PostgreSQL cluster deployment and configuration
- High availability setup with replication
- Performance tuning and optimization
- Backup and recovery procedures
- Monitoring and maintenance
- Migration and schema management
PostgreSQL Deployment¶
High Availability PostgreSQL Cluster¶
PostgreSQL Helm Chart Configuration¶
# helm/values/database/postgresql-ha.yaml
postgresql:
  image:
    tag: "15.4"

  # Authentication
  auth:
    postgresPassword: ""  # Set via secret
    username: "temporal"
    password: ""  # Set via secret
    database: "temporal"
    existingSecret: "postgresql-credentials"
    secretKeys:
      adminPasswordKey: "postgres-password"
      userPasswordKey: "password"

  # Architecture
  architecture: replication

  # Primary configuration
  primary:
    name: primary
    persistence:
      enabled: true
      storageClass: "gp3"
      size: 100Gi
    resources:
      limits:
        memory: 4Gi
        cpu: 2000m
      requests:
        memory: 2Gi
        cpu: 1000m
    configuration: |
      # PostgreSQL configuration
      max_connections = 200
      shared_buffers = 1GB
      effective_cache_size = 3GB
      maintenance_work_mem = 256MB
      checkpoint_completion_target = 0.9
      wal_buffers = 16MB
      default_statistics_target = 100
      random_page_cost = 1.1
      effective_io_concurrency = 200
      work_mem = 4MB
      min_wal_size = 1GB
      max_wal_size = 4GB

      # Logging
      log_destination = 'stderr'
      logging_collector = on
      log_directory = 'log'
      log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
      log_rotation_age = 1d
      log_rotation_size = 100MB
      log_min_duration_statement = 1000
      log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
      log_checkpoints = on
      log_connections = on
      log_disconnections = on
      log_lock_waits = on
      log_temp_files = 10MB

      # SSL
      ssl = on
      ssl_cert_file = '/etc/ssl/certs/tls.crt'
      ssl_key_file = '/etc/ssl/private/tls.key'
      ssl_ca_file = '/etc/ssl/certs/ca.crt'
    initdb:
      scripts:
        01_temporal_setup.sql: |
          -- Create temporal databases
          CREATE DATABASE temporal;
          CREATE DATABASE temporal_visibility;

          -- Create temporal user with proper permissions
          CREATE USER temporal WITH PASSWORD '${TEMPORAL_PASSWORD}';
          GRANT ALL PRIVILEGES ON DATABASE temporal TO temporal;
          GRANT ALL PRIVILEGES ON DATABASE temporal_visibility TO temporal;

          -- Create read-only user for monitoring
          CREATE USER temporal_monitor WITH PASSWORD '${MONITOR_PASSWORD}';
          GRANT CONNECT ON DATABASE temporal TO temporal_monitor;
          GRANT CONNECT ON DATABASE temporal_visibility TO temporal_monitor;
          GRANT USAGE ON SCHEMA public TO temporal_monitor;
          GRANT SELECT ON ALL TABLES IN SCHEMA public TO temporal_monitor;
          ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO temporal_monitor;

  # Read replica configuration
  readReplicas:
    replicaCount: 2
    persistence:
      enabled: true
      storageClass: "gp3"
      size: 100Gi
    resources:
      limits:
        memory: 2Gi
        cpu: 1000m
      requests:
        memory: 1Gi
        cpu: 500m

  # Backup configuration
  backup:
    enabled: true
    cronjob:
      schedule: "0 2 * * *"  # Daily at 2 AM
      restartPolicy: OnFailure
      storage:
        storageClass: "gp3"
        size: 500Gi
    retention:
      days: 30
    s3:
      enabled: true
      bucket: "temporal-backups"
      region: "us-west-2"
      endpoint: ""
      accessKey: ""  # Set via secret
      secretKey: ""  # Set via secret

  # Metrics
  metrics:
    enabled: true
    image:
      tag: "0.11.1"
    serviceMonitor:
      enabled: true
      namespace: "monitoring"
      interval: "30s"
    resources:
      limits:
        memory: 256Mi
        cpu: 250m
      requests:
        memory: 128Mi
        cpu: 100m

  # Network policy
  networkPolicy:
    enabled: true
    allowExternal: false
    explicitNamespacesSelector:
      matchLabels:
        name: "temporal-system"
Database Secrets Management¶
PostgreSQL Credentials Secret¶
# k8s/database/secrets/postgresql-credentials.yaml
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: postgresql-credentials
  namespace: temporal-system
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: SecretStore
  target:
    name: postgresql-credentials
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        postgres-password: "{{ .postgres_password }}"
        password: "{{ .temporal_password }}"
        monitor-password: "{{ .monitor_password }}"
        replication-password: "{{ .replication_password }}"
  data:
    - secretKey: postgres_password
      remoteRef:
        key: temporal/database
        property: postgres_password
    - secretKey: temporal_password
      remoteRef:
        key: temporal/database
        property: temporal_password
    - secretKey: monitor_password
      remoteRef:
        key: temporal/database
        property: monitor_password
    - secretKey: replication_password
      remoteRef:
        key: temporal/database
        property: replication_password
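After applying the manifest, confirm that the External Secrets operator has synced the Vault material into a native Secret before anything depends on it. A quick check, using the names above:

kubectl apply -f k8s/database/secrets/postgresql-credentials.yaml
# The ExternalSecret status should report SecretSynced
kubectl describe externalsecret postgresql-credentials -n temporal-system
# The derived native Secret should now exist with the four templated keys
kubectl get secret postgresql-credentials -n temporal-system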
Database Schema Management¶
Temporal Schema Setup Job¶
# k8s/database/jobs/schema-setup.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: temporal-schema-setup
  namespace: temporal-system
  annotations:
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "1"
    "helm.sh/hook-delete-policy": before-hook-creation
spec:
  template:
    metadata:
      name: temporal-schema-setup
    spec:
      restartPolicy: OnFailure
      # fsGroup is a pod-level field, so the security context lives here
      securityContext:
        runAsNonRoot: true
        runAsUser: 10001
        fsGroup: 10001
      containers:
        - name: temporal-admin-tools
          image: temporalio/admin-tools:1.20.0
          command:
            - /bin/bash
            - -c
            - |
              set -euo pipefail
              echo "Setting up Temporal database schema..."

              # Wait for database to be ready
              until pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER"; do
                echo "Waiting for database to be ready..."
                sleep 5
              done

              # Setup default database
              temporal-sql-tool \
                --plugin postgres \
                --ep "$DB_HOST" \
                --port "$DB_PORT" \
                --user "$DB_USER" \
                --password "$DB_PASSWORD" \
                --database "$DB_NAME" \
                setup-schema -v 0.0
              temporal-sql-tool \
                --plugin postgres \
                --ep "$DB_HOST" \
                --port "$DB_PORT" \
                --user "$DB_USER" \
                --password "$DB_PASSWORD" \
                --database "$DB_NAME" \
                update-schema -d /etc/temporal/schema/postgresql/v96/temporal/versioned

              # Setup visibility database
              temporal-sql-tool \
                --plugin postgres \
                --ep "$DB_HOST" \
                --port "$DB_PORT" \
                --user "$DB_USER" \
                --password "$DB_PASSWORD" \
                --database "$DB_VISIBILITY_NAME" \
                setup-schema -v 0.0
              temporal-sql-tool \
                --plugin postgres \
                --ep "$DB_HOST" \
                --port "$DB_PORT" \
                --user "$DB_USER" \
                --password "$DB_PASSWORD" \
                --database "$DB_VISIBILITY_NAME" \
                update-schema -d /etc/temporal/schema/postgresql/v96/visibility/versioned

              echo "Temporal database schema setup completed successfully"
          env:
            - name: DB_HOST
              value: "postgresql-primary"
            - name: DB_PORT
              value: "5432"
            - name: DB_USER
              value: "temporal"
            - name: DB_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: postgresql-credentials
                  key: password
            - name: DB_NAME
              value: "temporal"
            - name: DB_VISIBILITY_NAME
              value: "temporal_visibility"
          resources:
            limits:
              memory: 512Mi
              cpu: 500m
            requests:
              memory: 256Mi
              cpu: 250m
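Once the Job has run, the applied schema version can be verified from its logs or directly against the database. A sketch, assuming the job name above and a TEMPORAL_PASSWORD variable exported locally:

kubectl logs job/temporal-schema-setup -n temporal-system
# temporal-sql-tool records the applied version in the schema_version table
kubectl run schema-check --image=postgres:15.4 --rm -i --restart=Never -n temporal-system \
  --env=PGPASSWORD="$TEMPORAL_PASSWORD" -- \
  psql -h postgresql-primary -U temporal -d temporal -c "SELECT * FROM schema_version;"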
Performance Optimization¶
PostgreSQL Tuning Configuration¶
Performance Tuning ConfigMap¶
# k8s/database/config/postgresql-performance.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgresql-performance-config
  namespace: temporal-system
data:
  postgresql.conf: |
    # Connection Settings
    max_connections = 200
    superuser_reserved_connections = 3

    # Memory Settings
    shared_buffers = 1GB            # 25% of RAM
    effective_cache_size = 3GB      # 75% of RAM
    maintenance_work_mem = 256MB
    work_mem = 4MB

    # Checkpoint Settings
    checkpoint_completion_target = 0.9
    checkpoint_timeout = 10min
    max_wal_size = 4GB
    min_wal_size = 1GB
    wal_buffers = 16MB

    # Query Planning
    default_statistics_target = 100
    constraint_exclusion = partition
    cursor_tuple_fraction = 0.1

    # Disk I/O Settings
    random_page_cost = 1.1          # For SSD storage
    effective_io_concurrency = 200
    seq_page_cost = 1

    # Background Writer
    bgwriter_delay = 200ms
    bgwriter_lru_maxpages = 100
    bgwriter_lru_multiplier = 2.0
    bgwriter_flush_after = 512kB

    # Autovacuum Settings
    autovacuum = on
    autovacuum_max_workers = 3
    autovacuum_naptime = 1min
    autovacuum_vacuum_threshold = 50
    autovacuum_analyze_threshold = 50
    autovacuum_vacuum_scale_factor = 0.2
    autovacuum_analyze_scale_factor = 0.1
    autovacuum_freeze_max_age = 200000000
    autovacuum_multixact_freeze_max_age = 400000000
    autovacuum_vacuum_cost_delay = 20ms
    autovacuum_vacuum_cost_limit = 200

    # Logging Settings
    log_destination = 'stderr'
    logging_collector = on
    log_directory = 'log'
    log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
    log_rotation_age = 1d
    log_rotation_size = 100MB
    log_min_duration_statement = 1000
    log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
    log_checkpoints = on
    log_connections = on
    log_disconnections = on
    log_lock_waits = on
    log_temp_files = 10MB
    log_autovacuum_min_duration = 0
    log_error_verbosity = default

    # Replication Settings
    wal_level = replica
    max_wal_senders = 10
    max_replication_slots = 10
    hot_standby = on
    hot_standby_feedback = off

    # SSL Settings
    ssl = on
    ssl_cert_file = '/etc/ssl/certs/tls.crt'
    ssl_key_file = '/etc/ssl/private/tls.key'
    ssl_ca_file = '/etc/ssl/certs/ca.crt'
    ssl_ciphers = 'ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384'
    ssl_prefer_server_ciphers = on
    # ssl_protocols is not a PostgreSQL parameter; the minimum TLS version
    # is set with ssl_min_protocol_version (PostgreSQL 12+)
    ssl_min_protocol_version = 'TLSv1.2'
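A setting in the file only matters once the server has actually loaded it, so after mounting the ConfigMap and restarting PostgreSQL it is worth spot-checking the live values with SHOW. A sketch, assuming the temporal user, the primary service above, and a TEMPORAL_PASSWORD variable exported locally:

kubectl run tuning-check --image=postgres:15.4 --rm -i --restart=Never -n temporal-system \
  --env=PGPASSWORD="$TEMPORAL_PASSWORD" -- \
  psql -h postgresql-primary -U temporal -d temporal \
  -c "SHOW shared_buffers;" -c "SHOW max_connections;" -c "SHOW ssl_min_protocol_version;"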
Database Indexes for Temporal¶
Temporal Optimization Script¶
-- k8s/database/sql/temporal-indexes.sql
-- Additional indexes for Temporal performance optimization

-- Indexes for executions table
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_namespace_id
    ON executions (namespace_id);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_workflow_id_run_id
    ON executions (workflow_id, run_id);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_state_created_time
    ON executions (state, created_time);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_start_time
    ON executions (start_time) WHERE start_time IS NOT NULL;

-- Indexes for history_events table
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_history_events_workflow_id_run_id_event_id
    ON history_events (workflow_id, run_id, event_id);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_history_events_created_time
    ON history_events (created_time);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_history_events_event_type
    ON history_events (event_type);

-- Indexes for tasks table
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_tasks_task_queue_name_state
    ON tasks (task_queue_name, state);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_tasks_created_time
    ON tasks (created_time);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_tasks_visibility_timestamp
    ON tasks (visibility_timestamp) WHERE visibility_timestamp IS NOT NULL;

-- Indexes for activity_info_maps table
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_activity_info_maps_workflow_id_run_id
    ON activity_info_maps (workflow_id, run_id);

-- Indexes for timer_info_maps table
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_timer_info_maps_workflow_id_run_id
    ON timer_info_maps (workflow_id, run_id);

-- Indexes for child_execution_info_maps table
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_child_execution_info_maps_workflow_id_run_id
    ON child_execution_info_maps (workflow_id, run_id);

-- Indexes for visibility tables
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_visibility_namespace_id_start_time
    ON executions_visibility (namespace_id, start_time);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_visibility_workflow_type_start_time
    ON executions_visibility (workflow_type, start_time);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_visibility_status_start_time
    ON executions_visibility (status, start_time);

-- Composite indexes for common queries
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_visibility_namespace_status_start_time
    ON executions_visibility (namespace_id, status, start_time);
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_visibility_namespace_type_start_time
    ON executions_visibility (namespace_id, workflow_type, start_time);

-- Update table statistics
ANALYZE executions;
ANALYZE history_events;
ANALYZE tasks;
ANALYZE activity_info_maps;
ANALYZE timer_info_maps;
ANALYZE child_execution_info_maps;
ANALYZE executions_visibility;
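CREATE INDEX CONCURRENTLY cannot run inside a transaction block, so the script must be applied with plain psql rather than a wrapper that opens one transaction per file. A sketch using a throwaway client pod, with TEMPORAL_PASSWORD exported locally:

kubectl run index-setup --image=postgres:15.4 --rm -i --restart=Never -n temporal-system \
  --env=PGPASSWORD="$TEMPORAL_PASSWORD" -- \
  psql -h postgresql-primary -U temporal -d temporal \
  -v ON_ERROR_STOP=1 -f - < k8s/database/sql/temporal-indexes.sql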
Backup and Recovery¶
Automated Backup System¶
Backup CronJob¶
# k8s/database/backup/backup-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgresql-backup
  namespace: temporal-system
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  concurrencyPolicy: Forbid
  failedJobsHistoryLimit: 3
  successfulJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        metadata:
          annotations:
            prometheus.io/scrape: "false"
        spec:
          restartPolicy: OnFailure
          serviceAccountName: postgresql-backup
          securityContext:
            runAsNonRoot: true
            runAsUser: 999
            fsGroup: 999
          containers:
            - name: postgresql-backup
              # NOTE: the stock postgres image does not ship the aws CLI;
              # use an image that bundles both pg_dump and awscli
              image: postgres:15.4
              command:
                - /bin/bash
                - -c
                - |
                  set -euo pipefail
                  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
                  BACKUP_DIR="/backups"
                  echo "Starting PostgreSQL backup at $(date)"

                  # Create backup directory
                  mkdir -p "$BACKUP_DIR"

                  # Backup main database
                  echo "Backing up temporal database..."
                  PGPASSWORD="$DB_PASSWORD" pg_dump \
                    -h "$DB_HOST" \
                    -U "$DB_USER" \
                    -d temporal \
                    --verbose \
                    --no-owner \
                    --no-privileges \
                    --clean \
                    --if-exists \
                    --format=custom \
                    -f "$BACKUP_DIR/temporal_${TIMESTAMP}.dump"

                  # Backup visibility database
                  echo "Backing up temporal_visibility database..."
                  PGPASSWORD="$DB_PASSWORD" pg_dump \
                    -h "$DB_HOST" \
                    -U "$DB_USER" \
                    -d temporal_visibility \
                    --verbose \
                    --no-owner \
                    --no-privileges \
                    --clean \
                    --if-exists \
                    --format=custom \
                    -f "$BACKUP_DIR/temporal_visibility_${TIMESTAMP}.dump"

                  # Compress backups
                  echo "Compressing backup files..."
                  gzip "$BACKUP_DIR/temporal_${TIMESTAMP}.dump"
                  gzip "$BACKUP_DIR/temporal_visibility_${TIMESTAMP}.dump"

                  # Upload to S3
                  echo "Uploading backups to S3..."
                  aws s3 cp "$BACKUP_DIR/temporal_${TIMESTAMP}.dump.gz" \
                    "s3://${S3_BUCKET}/database/temporal_${TIMESTAMP}.dump.gz" \
                    --storage-class STANDARD_IA
                  aws s3 cp "$BACKUP_DIR/temporal_visibility_${TIMESTAMP}.dump.gz" \
                    "s3://${S3_BUCKET}/database/temporal_visibility_${TIMESTAMP}.dump.gz" \
                    --storage-class STANDARD_IA

                  # Clean up local files
                  rm -f "$BACKUP_DIR"/*.dump.gz

                  # Clean up old backups: keep the last 30 days
                  # (2 dumps per day = 60 objects)
                  aws s3 ls "s3://${S3_BUCKET}/database/" | \
                    grep "temporal_" | \
                    sort | \
                    head -n -60 | \
                    awk '{print $4}' | \
                    xargs -I {} aws s3 rm "s3://${S3_BUCKET}/database/{}" || true

                  echo "Backup completed successfully at $(date)"
              env:
                - name: DB_HOST
                  value: "postgresql-primary"
                - name: DB_USER
                  value: "temporal"
                - name: DB_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: postgresql-credentials
                      key: password
                - name: S3_BUCKET
                  value: "temporal-backups"
                - name: AWS_REGION
                  value: "us-west-2"
              volumeMounts:
                - name: backup-storage
                  mountPath: /backups
              resources:
                limits:
                  memory: 1Gi
                  cpu: 500m
                requests:
                  memory: 512Mi
                  cpu: 250m
          volumes:
            - name: backup-storage
              emptyDir:
                sizeLimit: 10Gi
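Rather than waiting for the first scheduled run, the CronJob can be exercised immediately by spawning a one-off Job from it:

# Trigger a manual backup and follow its output
kubectl create job --from=cronjob/postgresql-backup postgresql-backup-manual -n temporal-system
kubectl logs -f job/postgresql-backup-manual -n temporal-system
# Confirm the dumps landed in S3
aws s3 ls s3://temporal-backups/database/ | tail -n 5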
Point-in-Time Recovery Setup¶
WAL-E Configuration for Continuous Archiving¶
# k8s/database/backup/wal-e-config.yaml
# NOTE: WAL-E is in maintenance mode; WAL-G is its actively maintained successor
apiVersion: v1
kind: ConfigMap
metadata:
  name: wal-e-config
  namespace: temporal-system
data:
  wal-e.conf: |
    [wal-e]
    s3_prefix = s3://temporal-backups/wal-e
    aws_region = us-west-2

    [postgresql]
    archive_mode = on
    archive_command = 'wal-e wal-push %p'
    archive_timeout = 60
    max_wal_senders = 10
    wal_keep_size = 1GB
    wal_level = replica
  # recovery.conf was removed in PostgreSQL 12. On 12+ (including the 15.x used
  # in this guide), put these settings in postgresql.conf and create an empty
  # standby.signal file in the data directory instead of setting standby_mode.
  recovery.conf.template: |
    standby_mode = 'on'  # PostgreSQL 11 and earlier only
    primary_conninfo = 'host=${PRIMARY_HOST} port=5432 user=replicator password=${REPLICATION_PASSWORD}'
    restore_command = 'wal-e wal-fetch %f %p'
    recovery_target_time = '${RECOVERY_TARGET_TIME}'
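Continuous archiving is only as good as its last successful wal-push, so verify it after enabling. The built-in pg_stat_archiver view reports archive success and failure counts; a sketch, assuming the primary pod is named postgresql-primary-0:

# archived_count should climb and failed_count stay flat once archiving works
kubectl exec postgresql-primary-0 -n temporal-system -- \
  psql -U postgres -c "SELECT archived_count, failed_count, last_archived_wal, last_failed_wal FROM pg_stat_archiver;"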
Disaster Recovery Procedures¶
Recovery Script¶
#!/bin/bash
# scripts/database/disaster-recovery.sh
set -euo pipefail

BACKUP_DATE=${1:-latest}
RECOVERY_TYPE=${2:-full}   # full, point-in-time
TARGET_TIME=${3:-}
NAMESPACE="temporal-system"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

log() {
    echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

warn() {
    echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
}

error() {
    echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}"
    exit 1
}

log "Starting disaster recovery process..."
log "Backup date: $BACKUP_DATE"
log "Recovery type: $RECOVERY_TYPE"

# Verify prerequisites
if ! command -v kubectl &> /dev/null; then
    error "kubectl is required but not installed"
fi

if ! command -v aws &> /dev/null; then
    error "aws CLI is required but not installed"
fi

# The psql/createdb/pg_restore calls below authenticate via PGPASSWORD,
# so both database passwords must be exported before running this script
: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be exported}"
: "${TEMPORAL_PASSWORD:?TEMPORAL_PASSWORD must be exported}"

# Get backup file
if [[ "$BACKUP_DATE" == "latest" ]]; then
    BACKUP_FILE=$(aws s3 ls s3://temporal-backups/database/ | sort | tail -n 1 | awk '{print $4}')
    if [[ -z "$BACKUP_FILE" ]]; then
        error "No backup files found"
    fi
    log "Using latest backup: $BACKUP_FILE"
else
    BACKUP_FILE="temporal_${BACKUP_DATE}.dump.gz"
fi

# Download backup
log "Downloading backup file..."
aws s3 cp "s3://temporal-backups/database/$BACKUP_FILE" "./backup.dump.gz"
gunzip backup.dump.gz

# Stop Temporal services
log "Stopping Temporal services..."
kubectl scale deployment temporal-frontend temporal-history temporal-matching temporal-worker --replicas=0 -n "$NAMESPACE"

# Create recovery database
log "Creating recovery database..."
kubectl run recovery-db --image=postgres:15.4 --rm -i --restart=Never -n "$NAMESPACE" \
    --env=PGPASSWORD="$POSTGRES_PASSWORD" -- \
    createdb -h postgresql-primary -U postgres -O temporal temporal_recovery

# Restore backup (pg_restore reads the custom-format dump from stdin)
log "Restoring database backup..."
kubectl run restore-job --image=postgres:15.4 --rm -i --restart=Never -n "$NAMESPACE" \
    --env=PGPASSWORD="$TEMPORAL_PASSWORD" -- \
    pg_restore -h postgresql-primary -U temporal -d temporal_recovery \
    --verbose --clean --if-exists < backup.dump

if [[ "$RECOVERY_TYPE" == "point-in-time" && -n "$TARGET_TIME" ]]; then
    log "Configuring point-in-time recovery to $TARGET_TIME..."
    # PostgreSQL 12+ has no recovery.conf: set the recovery target via
    # ALTER SYSTEM, create recovery.signal, and restart the instance.
    # Assumes the primary pod is named postgresql-primary-0.
    kubectl exec postgresql-primary-0 -n "$NAMESPACE" -- bash -c "
        psql -U postgres -c \"ALTER SYSTEM SET recovery_target_time = '$TARGET_TIME';\"
        psql -U postgres -c \"ALTER SYSTEM SET recovery_target_action = 'promote';\"
        touch /var/lib/postgresql/data/recovery.signal
    "
fi

# Validate recovery
log "Validating database recovery..."
RECORD_COUNT=$(kubectl run validate-job --image=postgres:15.4 --rm -i --restart=Never -n "$NAMESPACE" \
    --env=PGPASSWORD="$TEMPORAL_PASSWORD" -- \
    psql -h postgresql-primary -U temporal -d temporal_recovery -t -c "SELECT COUNT(*) FROM executions;")
log "Database recovery validation: $RECORD_COUNT records found"

# Switch to recovered database
log "Switching to recovered database..."
kubectl run switch-db --image=postgres:15.4 --rm -i --restart=Never -n "$NAMESPACE" \
    --env=PGPASSWORD="$POSTGRES_PASSWORD" -- \
    bash -c "
        psql -h postgresql-primary -U postgres -c 'ALTER DATABASE temporal RENAME TO temporal_old;'
        psql -h postgresql-primary -U postgres -c 'ALTER DATABASE temporal_recovery RENAME TO temporal;'
    "

# Restart Temporal services (adjust replica counts to match your deployment)
log "Restarting Temporal services..."
kubectl scale deployment temporal-frontend temporal-history temporal-matching temporal-worker --replicas=1 -n "$NAMESPACE"

# Wait for services to be ready
kubectl wait --for=condition=available deployment/temporal-frontend -n "$NAMESPACE" --timeout=300s

log "Disaster recovery completed successfully!"
log "Please verify system functionality before proceeding with normal operations"

# Clean up
rm -f backup.dump
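Typical invocations, assuming the script is executable and AWS credentials plus the two database passwords are in the environment (the timestamps below are illustrative):

# Restore the most recent nightly dump
./scripts/database/disaster-recovery.sh latest full
# Restore a specific dump, then roll forward to just before an incident
./scripts/database/disaster-recovery.sh 20240115_020000 point-in-time '2024-01-15 01:30:00'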
Monitoring and Maintenance¶
Database Monitoring Setup¶
PostgreSQL Exporter Configuration¶
# k8s/database/monitoring/postgres-exporter.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: postgres-exporter
  namespace: temporal-system
  labels:
    app: postgres-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: postgres-exporter
  template:
    metadata:
      labels:
        app: postgres-exporter
    spec:
      serviceAccountName: postgres-exporter
      containers:
        - name: postgres-exporter
          image: quay.io/prometheuscommunity/postgres-exporter:v0.11.1
          ports:
            - containerPort: 9187
              name: metrics
          env:
            - name: DATA_SOURCE_NAME
              valueFrom:
                secretKeyRef:
                  name: postgres-exporter-secret
                  key: connection-string
            - name: PG_EXPORTER_QUERIES_PATH
              value: "/etc/postgres_exporter/queries.yaml"
          volumeMounts:
            - name: queries-config
              mountPath: /etc/postgres_exporter
          resources:
            limits:
              memory: 256Mi
              cpu: 250m
            requests:
              memory: 128Mi
              cpu: 100m
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            runAsUser: 65534
      volumes:
        - name: queries-config
          configMap:
            name: postgres-exporter-queries
---
apiVersion: v1
kind: Service
metadata:
  name: postgres-exporter
  namespace: temporal-system
  labels:
    app: postgres-exporter
spec:
  ports:
    - port: 9187
      targetPort: 9187
      name: metrics
  selector:
    app: postgres-exporter
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: postgres-exporter
  namespace: temporal-system
spec:
  selector:
    matchLabels:
      app: postgres-exporter
  endpoints:
    - port: metrics
      interval: 30s
      path: /metrics
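The exporter reads its connection string from postgres-exporter-secret, which is referenced above but not defined anywhere in this guide. A sketch of creating it with the read-only temporal_monitor user from the initdb script (replace the password placeholder):

kubectl create secret generic postgres-exporter-secret -n temporal-system \
  --from-literal=connection-string='postgresql://temporal_monitor:<MONITOR_PASSWORD>@postgresql-primary:5432/temporal?sslmode=require'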
Custom PostgreSQL Queries for Monitoring¶
# k8s/database/monitoring/postgres-queries.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgres-exporter-queries
  namespace: temporal-system
data:
  queries.yaml: |
    pg_replication:
      query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
      master: true
      metrics:
        - lag:
            usage: "GAUGE"
            description: "Replication lag behind master in seconds"

    pg_postmaster:
      query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
      master: true
      metrics:
        - start_time_seconds:
            usage: "GAUGE"
            description: "Time at which postmaster started"

    pg_stat_user_tables:
      query: |
        SELECT
          current_database() datname,
          schemaname,
          relname,
          seq_scan,
          seq_tup_read,
          idx_scan,
          idx_tup_fetch,
          n_tup_ins,
          n_tup_upd,
          n_tup_del,
          n_tup_hot_upd,
          n_live_tup,
          n_dead_tup,
          n_mod_since_analyze,
          COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
          COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
          COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
          COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
          vacuum_count,
          autovacuum_count,
          analyze_count,
          autoanalyze_count
        FROM pg_stat_user_tables
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of current database"
        - schemaname:
            usage: "LABEL"
            description: "Name of the schema that this table is in"
        - relname:
            usage: "LABEL"
            description: "Name of this table"
        - seq_scan:
            usage: "COUNTER"
            description: "Number of sequential scans initiated on this table"
        - seq_tup_read:
            usage: "COUNTER"
            description: "Number of live rows fetched by sequential scans"
        - idx_scan:
            usage: "COUNTER"
            description: "Number of index scans initiated on this table"
        - idx_tup_fetch:
            usage: "COUNTER"
            description: "Number of live rows fetched by index scans"
        - n_tup_ins:
            usage: "COUNTER"
            description: "Number of rows inserted"
        - n_tup_upd:
            usage: "COUNTER"
            description: "Number of rows updated"
        - n_tup_del:
            usage: "COUNTER"
            description: "Number of rows deleted"
        - n_tup_hot_upd:
            usage: "COUNTER"
            description: "Number of rows HOT updated"
        - n_live_tup:
            usage: "GAUGE"
            description: "Estimated number of live rows"
        - n_dead_tup:
            usage: "GAUGE"
            description: "Estimated number of dead rows"
        - n_mod_since_analyze:
            usage: "GAUGE"
            description: "Estimated number of rows modified since this table was last analyzed"
        - last_vacuum:
            usage: "GAUGE"
            description: "Last time at which this table was manually vacuumed"
        - last_autovacuum:
            usage: "GAUGE"
            description: "Last time at which this table was vacuumed by the autovacuum daemon"
        - last_analyze:
            usage: "GAUGE"
            description: "Last time at which this table was manually analyzed"
        - last_autoanalyze:
            usage: "GAUGE"
            description: "Last time at which this table was analyzed by the autovacuum daemon"
        - vacuum_count:
            usage: "COUNTER"
            description: "Number of times this table has been manually vacuumed"
        - autovacuum_count:
            usage: "COUNTER"
            description: "Number of times this table has been vacuumed by the autovacuum daemon"
        - analyze_count:
            usage: "COUNTER"
            description: "Number of times this table has been manually analyzed"
        - autoanalyze_count:
            usage: "COUNTER"
            description: "Number of times this table has been analyzed by the autovacuum daemon"

    temporal_executions:
      query: |
        SELECT
          namespace_id,
          COUNT(*) as total_executions,
          COUNT(*) FILTER (WHERE state = 1) as running_executions,
          COUNT(*) FILTER (WHERE state = 2) as completed_executions,
          COUNT(*) FILTER (WHERE state = 3) as failed_executions,
          COUNT(*) FILTER (WHERE state = 4) as cancelled_executions,
          COUNT(*) FILTER (WHERE state = 5) as terminated_executions
        FROM executions
        GROUP BY namespace_id
      metrics:
        - namespace_id:
            usage: "LABEL"
            description: "Temporal namespace ID"
        - total_executions:
            usage: "GAUGE"
            description: "Total number of workflow executions"
        - running_executions:
            usage: "GAUGE"
            description: "Number of running workflow executions"
        - completed_executions:
            usage: "GAUGE"
            description: "Number of completed workflow executions"
        - failed_executions:
            usage: "GAUGE"
            description: "Number of failed workflow executions"
        - cancelled_executions:
            usage: "GAUGE"
            description: "Number of cancelled workflow executions"
        - terminated_executions:
            usage: "GAUGE"
            description: "Number of terminated workflow executions"
Database Maintenance Scripts¶
Maintenance CronJob¶
# k8s/database/maintenance/maintenance-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgresql-maintenance
  namespace: temporal-system
spec:
  schedule: "0 3 * * 0"  # Weekly on Sunday at 3 AM
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: maintenance
              image: postgres:15.4
              command:
                - /bin/bash
                - -c
                - |
                  set -euo pipefail
                  echo "Starting database maintenance at $(date)"

                  # Vacuum and analyze all databases
                  for db in temporal temporal_visibility; do
                    echo "Maintaining database: $db"

                    # Vacuum analyze (this also refreshes planner statistics,
                    # so no separate ANALYZE pass is needed)
                    PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -U "$DB_USER" -d "$db" -c "VACUUM ANALYZE;"

                    # Reindex. REINDEX DATABASE takes heavy locks; on PostgreSQL 12+
                    # consider REINDEX DATABASE CONCURRENTLY to avoid blocking writes.
                    PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -U "$DB_USER" -d "$db" -c "REINDEX DATABASE $db;"
                  done

                  echo "Database maintenance completed at $(date)"
              env:
                - name: DB_HOST
                  value: "postgresql-primary"
                - name: DB_USER
                  value: "temporal"
                - name: DB_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: postgresql-credentials
                      key: password
              resources:
                limits:
                  memory: 512Mi
                  cpu: 500m
                requests:
                  memory: 256Mi
                  cpu: 250m
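Because VACUUM and REINDEX are expensive on large, busy tables, it helps to check dead-tuple counts before scheduling extra maintenance runs. A quick look at the statistics collector, assuming TEMPORAL_PASSWORD is exported locally:

# Tables with the most dead tuples are the best vacuum candidates
kubectl run bloat-check --image=postgres:15.4 --rm -i --restart=Never -n temporal-system \
  --env=PGPASSWORD="$TEMPORAL_PASSWORD" -- \
  psql -h postgresql-primary -U temporal -d temporal -c \
  "SELECT relname, n_live_tup, n_dead_tup FROM pg_stat_user_tables ORDER BY n_dead_tup DESC LIMIT 10;"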
Together, these manifests and scripts provide an enterprise-grade PostgreSQL deployment with high availability, performance optimization, backup and recovery procedures, and monitoring, all tuned for Temporal.io workloads.