Database Design¶

This document provides comprehensive coverage of the database architecture for the Temporal.io enterprise deployment, including data models, storage strategies, performance optimization, and operational procedures.

Database Architecture Overview¶

The database design implements a multi-store approach with PostgreSQL as the primary database for persistence and visibility, complemented by Elasticsearch for advanced search capabilities and Redis for caching layers.

graph TB
    subgraph "Application Layer"
        APP1[Temporal Frontend]
        APP2[Temporal History]
        APP3[Temporal Matching]
        APP4[Temporal Worker]
        APP5[Temporal Web UI]
    end

    subgraph "Database Layer"
        subgraph "Primary Storage"
            DB1[PostgreSQL Primary<br/>Read/Write Operations]
            DB2[PostgreSQL Replica 1<br/>Read Operations]
            DB3[PostgreSQL Replica 2<br/>Read Operations]
        end

        subgraph "Search Layer"
            ES1[Elasticsearch Master 1]
            ES2[Elasticsearch Master 2]
            ES3[Elasticsearch Master 3]
            ES4[Elasticsearch Data 1]
            ES5[Elasticsearch Data 2]
            ES6[Elasticsearch Data 3]
        end

        subgraph "Cache Layer"
            CACHE1[Redis Primary<br/>Session Cache]
            CACHE2[Redis Replica<br/>Read Cache]
            CACHE3[Redis Sentinel<br/>Failover]
        end

        subgraph "Backup Storage"
            BACKUP1[WAL Streaming<br/>Continuous Backup]
            BACKUP2[Point-in-Time Recovery<br/>PITR]
            BACKUP3[Object Storage<br/>S3/GCS/Azure]
        end
    end

    subgraph "Monitoring & Management"
        MON1[PostgreSQL Exporter<br/>Metrics]
        MON2[Elasticsearch Exporter<br/>Metrics]
        MON3[Redis Exporter<br/>Metrics]
        MON4[Backup Monitoring<br/>Health Checks]
    end

    APP1 --> DB1
    APP2 --> DB1
    APP3 --> DB1
    APP4 --> DB1
    APP5 --> DB2

    APP1 --> ES1
    APP5 --> ES1

    APP1 --> CACHE1
    APP5 --> CACHE1

    DB1 --> DB2
    DB1 --> DB3
    DB1 --> BACKUP1

    MON1 --> DB1
    MON2 --> ES1
    MON3 --> CACHE1

PostgreSQL Database Design¶

Primary Database Configuration¶

High Availability Setup¶

# PostgreSQL High Availability Configuration
postgresql_ha:
  deployment_type: "primary_replica"
  replication_mode: "streaming"

  primary:
    name: "postgresql-primary"
    resources:
      cpu: "4000m"
      memory: "16Gi"
      storage: "500Gi"
      storage_class: "fast-ssd"

  replicas:
    count: 2
    resources:
      cpu: "2000m"
      memory: "8Gi"
      storage: "500Gi"
      storage_class: "fast-ssd"
    lag_threshold: "1MB"

  connection_pooling:
    enabled: true
    tool: "pgbouncer"
    max_connections: 1000
    default_pool_size: 50
    min_pool_size: 10

  backup:
    type: "continuous"
    tool: "pgbackrest"
    retention: "30_days"
    compression: "lz4"

PostgreSQL Configuration¶

-- postgresql.conf optimizations for Temporal
-- Memory Settings
shared_buffers = '4GB'                    -- 25% of available RAM
effective_cache_size = '12GB'             -- 75% of available RAM
work_mem = '64MB'                         -- For complex queries
maintenance_work_mem = '1GB'              -- For VACUUM, INDEX operations

-- Write-Ahead Logging (WAL)
wal_level = 'replica'                     -- Enable streaming replication
max_wal_size = '2GB'                      -- Maximum WAL size
min_wal_size = '1GB'                      -- Minimum WAL size
wal_compression = on                      -- Compress WAL records
wal_log_hints = on                        -- Enable WAL hints for pg_rewind

-- Checkpoints
checkpoint_completion_target = 0.9        -- Spread checkpoint I/O
checkpoint_timeout = '15min'              -- Checkpoint frequency

-- Connection Settings
max_connections = 1000                    -- Maximum concurrent connections
shared_preload_libraries = 'pg_stat_statements,auto_explain,pg_cron'

-- Query Optimization
random_page_cost = 1.1                    -- SSD optimization
effective_io_concurrency = 200            -- Concurrent I/O operations
default_statistics_target = 500           -- Query planner statistics

-- Logging
log_statement = 'mod'                     -- Log modifications
log_min_duration_statement = 1000         -- Log slow queries (>1s)
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_lock_waits = on                       -- Log lock waits
log_temp_files = 10MB                     -- Log temp files > 10MB

-- Auto-vacuum
autovacuum = on
autovacuum_max_workers = 6
autovacuum_naptime = 15s
autovacuum_vacuum_threshold = 100
autovacuum_analyze_threshold = 50
autovacuum_vacuum_scale_factor = 0.05
autovacuum_analyze_scale_factor = 0.02

Database Schema Design¶

Temporal Default Store Schema¶

-- Core Temporal tables for workflow execution
CREATE SCHEMA temporal;

-- Executions table - stores workflow execution state
CREATE TABLE temporal.executions (
    shard_id INTEGER NOT NULL,
    namespace_id CHAR(64) NOT NULL,
    workflow_id VARCHAR(255) NOT NULL,
    run_id CHAR(64) NOT NULL,
    next_event_id BIGINT NOT NULL,
    last_write_version BIGINT NOT NULL,
    data BYTEA NOT NULL,
    data_encoding VARCHAR(16) NOT NULL,
    state BYTEA NOT NULL,
    state_encoding VARCHAR(16) NOT NULL,
    db_record_version BIGINT NOT NULL DEFAULT 0,
    PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id)
);

-- History tree table - stores workflow history events
CREATE TABLE temporal.history_tree (
    shard_id INTEGER NOT NULL,
    tree_id CHAR(64) NOT NULL,
    branch_id CHAR(64) NOT NULL,
    data BYTEA NOT NULL,
    data_encoding VARCHAR(16) NOT NULL,
    PRIMARY KEY (shard_id, tree_id, branch_id)
);

-- Current executions table - fast lookup for current workflow state
CREATE TABLE temporal.current_executions (
    shard_id INTEGER NOT NULL,
    namespace_id CHAR(64) NOT NULL,
    workflow_id VARCHAR(255) NOT NULL,
    run_id CHAR(64) NOT NULL,
    create_request_id VARCHAR(64) NOT NULL,
    state INTEGER NOT NULL,
    status INTEGER NOT NULL,
    start_version BIGINT NOT NULL,
    last_write_version BIGINT NOT NULL,
    PRIMARY KEY (shard_id, namespace_id, workflow_id)
);

-- Tasks table - stores task queue items
CREATE TABLE temporal.tasks (
    range_hash BIGINT NOT NULL,
    task_queue_id BYTEA NOT NULL,
    task_id BIGINT NOT NULL,
    data BYTEA NOT NULL,
    data_encoding VARCHAR(16) NOT NULL,
    PRIMARY KEY (range_hash, task_queue_id, task_id)
);

-- Activity info table - tracks activity execution state
CREATE TABLE temporal.activity_info_maps (
    shard_id INTEGER NOT NULL,
    namespace_id CHAR(64) NOT NULL,
    workflow_id VARCHAR(255) NOT NULL,
    run_id CHAR(64) NOT NULL,
    schedule_id BIGINT NOT NULL,
    data BYTEA NOT NULL,
    data_encoding VARCHAR(16) NOT NULL,
    last_heartbeat_updated_time TIMESTAMP NOT NULL,
    PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, schedule_id)
);

-- Timer info table - manages workflow timers
CREATE TABLE temporal.timer_info_maps (
    shard_id INTEGER NOT NULL,
    namespace_id CHAR(64) NOT NULL,
    workflow_id VARCHAR(255) NOT NULL,
    run_id CHAR(64) NOT NULL,
    timer_id VARCHAR(255) NOT NULL,
    data BYTEA NOT NULL,
    data_encoding VARCHAR(16) NOT NULL,
    PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, timer_id)
);

Temporal Visibility Store Schema¶

-- Visibility schema for search and filtering
CREATE SCHEMA temporal_visibility;

-- Executions visibility table - searchable execution data
CREATE TABLE temporal_visibility.executions (
    namespace_id CHAR(64) NOT NULL,
    run_id CHAR(64) NOT NULL,
    start_time TIMESTAMP NOT NULL,
    execution_time TIMESTAMP NOT NULL,
    workflow_id VARCHAR(255) NOT NULL,
    workflow_type_name VARCHAR(255) NOT NULL,
    status INTEGER NOT NULL,
    close_time TIMESTAMP,
    history_length BIGINT,
    memo BYTEA,
    encoding VARCHAR(16),
    task_queue VARCHAR(255),
    search_attributes JSON,
    PRIMARY KEY (namespace_id, run_id)
);

-- Indexes for efficient querying
CREATE INDEX idx_executions_start_time ON temporal_visibility.executions(namespace_id, start_time DESC);
CREATE INDEX idx_executions_close_time ON temporal_visibility.executions(namespace_id, close_time DESC);
CREATE INDEX idx_executions_workflow_id ON temporal_visibility.executions(namespace_id, workflow_id);
CREATE INDEX idx_executions_workflow_type ON temporal_visibility.executions(namespace_id, workflow_type_name);
CREATE INDEX idx_executions_status ON temporal_visibility.executions(namespace_id, status);
CREATE INDEX idx_executions_task_queue ON temporal_visibility.executions(namespace_id, task_queue);

-- GIN index for JSON search attributes
CREATE INDEX idx_executions_search_attributes ON temporal_visibility.executions USING GIN(search_attributes);

-- Workflow search attributes table
CREATE TABLE temporal_visibility.custom_search_attributes (
    namespace_id CHAR(64) NOT NULL,
    run_id CHAR(64) NOT NULL,
    search_attributes JSON NOT NULL,
    PRIMARY KEY (namespace_id, run_id)
);

Partitioning Strategy¶

Time-based Partitioning¶

-- Partition executions table by month
CREATE TABLE temporal_visibility.executions_template (
    LIKE temporal_visibility.executions INCLUDING ALL
) PARTITION BY RANGE (start_time);

-- Create monthly partitions
CREATE TABLE temporal_visibility.executions_2024_01 
    PARTITION OF temporal_visibility.executions_template
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');

CREATE TABLE temporal_visibility.executions_2024_02 
    PARTITION OF temporal_visibility.executions_template
    FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');

-- Automated partition management
CREATE OR REPLACE FUNCTION create_monthly_partition() RETURNS void AS $$
DECLARE
    partition_date date;
    partition_name text;
    start_date text;
    end_date text;
BEGIN
    partition_date := date_trunc('month', CURRENT_DATE + interval '1 month');
    partition_name := 'executions_' || to_char(partition_date, 'YYYY_MM');
    start_date := partition_date::text;
    end_date := (partition_date + interval '1 month')::text;

    EXECUTE format('CREATE TABLE IF NOT EXISTS temporal_visibility.%I 
                    PARTITION OF temporal_visibility.executions_template
                    FOR VALUES FROM (%L) TO (%L)',
                   partition_name, start_date, end_date);
END;
$$ LANGUAGE plpgsql;

-- Schedule partition creation
SELECT cron.schedule('create-partition', '0 0 1 * *', 'SELECT create_monthly_partition()');

Performance Optimization¶

Index Strategy¶

-- Composite indexes for common query patterns
CREATE INDEX CONCURRENTLY idx_executions_composite_status 
    ON temporal_visibility.executions(namespace_id, status, start_time DESC);

CREATE INDEX CONCURRENTLY idx_executions_composite_type 
    ON temporal_visibility.executions(namespace_id, workflow_type_name, start_time DESC);

CREATE INDEX CONCURRENTLY idx_executions_composite_queue 
    ON temporal_visibility.executions(namespace_id, task_queue, start_time DESC);

-- Partial indexes for active workflows
CREATE INDEX CONCURRENTLY idx_executions_active 
    ON temporal_visibility.executions(namespace_id, start_time DESC)
    WHERE status IN (1, 2); -- Running, ContinuedAsNew

-- Functional indexes for complex queries
CREATE INDEX CONCURRENTLY idx_executions_duration 
    ON temporal_visibility.executions(namespace_id, (close_time - start_time))
    WHERE close_time IS NOT NULL;

Query Optimization¶

-- Materialized view for dashboard metrics
CREATE MATERIALIZED VIEW temporal_visibility.workflow_metrics AS
SELECT 
    namespace_id,
    workflow_type_name,
    DATE(start_time) as execution_date,
    COUNT(*) as total_executions,
    COUNT(*) FILTER (WHERE status = 3) as completed_executions,
    COUNT(*) FILTER (WHERE status = 4) as failed_executions,
    COUNT(*) FILTER (WHERE status = 5) as canceled_executions,
    AVG(EXTRACT(EPOCH FROM (close_time - start_time))) as avg_duration_seconds,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY EXTRACT(EPOCH FROM (close_time - start_time))) as median_duration_seconds,
    PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY EXTRACT(EPOCH FROM (close_time - start_time))) as p95_duration_seconds
FROM temporal_visibility.executions
WHERE start_time >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY namespace_id, workflow_type_name, DATE(start_time);

-- Refresh materialized view hourly
CREATE UNIQUE INDEX ON temporal_visibility.workflow_metrics(namespace_id, workflow_type_name, execution_date);
SELECT cron.schedule('refresh-metrics', '0 * * * *', 'REFRESH MATERIALIZED VIEW CONCURRENTLY temporal_visibility.workflow_metrics');

Elasticsearch Configuration¶

Cluster Setup¶

Elasticsearch Cluster Configuration¶

# Elasticsearch cluster for advanced visibility
elasticsearch:
  cluster_name: "temporal-es-cluster"

  master_nodes:
    count: 3
    resources:
      cpu: "1000m"
      memory: "2Gi"
      storage: "50Gi"
    jvm_heap: "1g"

  data_nodes:
    count: 3
    resources:
      cpu: "2000m"
      memory: "8Gi"
      storage: "500Gi"
    jvm_heap: "4g"

  client_nodes:
    count: 2
    resources:
      cpu: "500m"
      memory: "2Gi"
    jvm_heap: "1g"

  settings:
    indices.memory.index_buffer_size: "20%"
    indices.memory.min_index_buffer_size: "48mb"
    indices.fielddata.cache.size: "30%"
    indices.breaker.fielddata.limit: "40%"
    indices.breaker.request.limit: "30%"
    cluster.routing.allocation.disk.watermark.low: "85%"
    cluster.routing.allocation.disk.watermark.high: "90%"
    cluster.routing.allocation.disk.watermark.flood_stage: "95%"

Index Templates and Mappings¶

Temporal Visibility Index Template¶

{
  "index_patterns": ["temporal_visibility_v1_*"],
  "template": {
    "settings": {
      "number_of_shards": 3,
      "number_of_replicas": 1,
      "refresh_interval": "5s",
      "index.codec": "best_compression",
      "index.mapping.total_fields.limit": 2000,
      "index.max_result_window": 10000,
      "index.lifecycle.name": "temporal_visibility_policy",
      "index.lifecycle.rollover_alias": "temporal_visibility_v1"
    },
    "mappings": {
      "dynamic": "strict",
      "properties": {
        "NamespaceId": {
          "type": "keyword"
        },
        "WorkflowId": {
          "type": "keyword"
        },
        "RunId": {
          "type": "keyword"
        },
        "WorkflowType": {
          "type": "keyword"
        },
        "StartTime": {
          "type": "date",
          "format": "strict_date_optional_time_nanos"
        },
        "ExecutionTime": {
          "type": "date",
          "format": "strict_date_optional_time_nanos"
        },
        "CloseTime": {
          "type": "date",
          "format": "strict_date_optional_time_nanos"
        },
        "ExecutionStatus": {
          "type": "integer"
        },
        "ExecutionDuration": {
          "type": "long"
        },
        "HistoryLength": {
          "type": "long"
        },
        "TaskQueue": {
          "type": "keyword"
        },
        "Memo": {
          "type": "object",
          "enabled": false
        },
        "SearchAttributes": {
          "type": "object",
          "properties": {
            "CustomKeywordField": {
              "type": "keyword"
            },
            "CustomIntField": {
              "type": "integer"
            },
            "CustomDoubleField": {
              "type": "double"
            },
            "CustomBoolField": {
              "type": "boolean"
            },
            "CustomDatetimeField": {
              "type": "date",
              "format": "strict_date_optional_time_nanos"
            },
            "CustomStringField": {
              "type": "text",
              "analyzer": "standard"
            }
          }
        }
      }
    }
  }
}

Index Lifecycle Management¶

{
  "policy": {
    "phases": {
      "hot": {
        "actions": {
          "rollover": {
            "max_primary_shard_size": "5GB",
            "max_age": "7d"
          },
          "set_priority": {
            "priority": 100
          }
        }
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "set_priority": {
            "priority": 50
          },
          "allocate": {
            "number_of_replicas": 0
          },
          "forcemerge": {
            "max_num_segments": 1
          },
          "shrink": {
            "number_of_shards": 1
          }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "set_priority": {
            "priority": 0
          },
          "allocate": {
            "number_of_replicas": 0
          }
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}

Redis Configuration¶

Caching Strategy¶

Redis Cluster Setup¶

# Redis configuration for caching
redis:
  deployment_mode: "sentinel"

  sentinel:
    count: 3
    resources:
      cpu: "100m"
      memory: "128Mi"

  master:
    resources:
      cpu: "500m"
      memory: "2Gi"
    persistence:
      enabled: true
      size: "10Gi"

  replica:
    count: 2
    resources:
      cpu: "250m"
      memory: "1Gi"

  configuration:
    maxmemory: "1536mb"
    maxmemory-policy: "allkeys-lru"
    timeout: 300
    tcp-keepalive: 60
    save: "900 1 300 10 60 10000"
    stop-writes-on-bgsave-error: "yes"
    rdbcompression: "yes"
    rdbchecksum: "yes"

  cache_patterns:
    user_sessions:
      ttl: 3600  # 1 hour
      namespace: "session:"

    workflow_metadata:
      ttl: 600   # 10 minutes
      namespace: "wf_meta:"

    task_queue_info:
      ttl: 300   # 5 minutes
      namespace: "tq_info:"

Data Management Strategies¶

Backup and Recovery¶

Continuous Backup with pgBackRest¶

# pgBackRest configuration
pgbackrest:
  global:
    repo1-type: "s3"
    repo1-s3-bucket: "temporal-backups"
    repo1-s3-region: "us-east-1"
    repo1-s3-endpoint: "s3.amazonaws.com"
    repo1-retention-full: 30
    repo1-retention-diff: 7
    repo1-retention-incr: 3

  stanza:
    temporal:
      pg1-path: "/var/lib/postgresql/data"
      pg1-host: "postgresql-primary"
      pg1-host-user: "postgres"
      recovery-option: "recovery_target_action=promote"

  backup_schedules:
    full_backup: "0 2 * * 0"  # Weekly full backup
    differential_backup: "0 2 * * 1-6"  # Daily differential backup
    incremental_backup: "0 */6 * * *"  # Every 6 hours incremental backup

Point-in-Time Recovery¶

#!/bin/bash
# Point-in-time recovery script
recover_to_timestamp() {
    local target_timestamp=$1
    local recovery_dir="/var/lib/postgresql/recovery"

    # Stop PostgreSQL
    kubectl scale statefulset postgresql-primary --replicas=0

    # Restore from backup
    pgbackrest --stanza=temporal --type=time \
               --target="$target_timestamp" \
               --target-action=promote \
               restore

    # Start PostgreSQL
    kubectl scale statefulset postgresql-primary --replicas=1

    echo "Recovery to $target_timestamp completed"
}

Data Retention Policies¶

Automated Data Cleanup¶

-- Data retention stored procedures
CREATE OR REPLACE FUNCTION cleanup_old_executions(retention_days INTEGER DEFAULT 90)
RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
    cutoff_date TIMESTAMP;
BEGIN
    cutoff_date := CURRENT_TIMESTAMP - (retention_days || ' days')::INTERVAL;

    -- Delete old completed executions
    DELETE FROM temporal_visibility.executions 
    WHERE close_time < cutoff_date 
    AND status IN (3, 4, 5, 6, 7, 8); -- Completed, Failed, Canceled, etc.

    GET DIAGNOSTICS deleted_count = ROW_COUNT;

    -- Log cleanup operation
    INSERT INTO temporal.cleanup_log (cleanup_date, table_name, deleted_rows, retention_days)
    VALUES (CURRENT_TIMESTAMP, 'executions', deleted_count, retention_days);

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;

-- Schedule automated cleanup
SELECT cron.schedule('cleanup-executions', '0 3 * * *', 'SELECT cleanup_old_executions(90)');

Data Migration and Upgrades¶

Schema Migration Framework¶

-- Migration tracking table
CREATE TABLE temporal.schema_migrations (
    version VARCHAR(255) PRIMARY KEY,
    applied_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    checksum VARCHAR(64) NOT NULL,
    description TEXT
);

-- Example migration: Add new search attribute
CREATE OR REPLACE FUNCTION migrate_add_search_attribute()
RETURNS VOID AS $$
BEGIN
    -- Check if migration already applied
    IF EXISTS (SELECT 1 FROM temporal.schema_migrations WHERE version = '1.20.0_add_customer_id') THEN
        RAISE NOTICE 'Migration 1.20.0_add_customer_id already applied';
        RETURN;
    END IF;

    -- Add new column
    ALTER TABLE temporal_visibility.executions 
    ADD COLUMN IF NOT EXISTS customer_id VARCHAR(255);

    -- Add index
    CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_executions_customer_id 
        ON temporal_visibility.executions(namespace_id, customer_id);

    -- Record migration
    INSERT INTO temporal.schema_migrations (version, description, checksum)
    VALUES ('1.20.0_add_customer_id', 'Add customer_id search attribute', 'abc123def456');

    RAISE NOTICE 'Migration 1.20.0_add_customer_id completed successfully';
END;
$$ LANGUAGE plpgsql;

Performance Monitoring and Optimization¶

Database Metrics Collection¶

PostgreSQL Monitoring¶

# PostgreSQL Exporter configuration
postgresql_exporter:
  enabled: true
  datasource:
    host: "postgresql-primary"
    port: 5432
    database: "temporal"
    user: "postgres_exporter"

  custom_queries:
    temporal_metrics:
      query: |
        SELECT 
          schemaname,
          tablename,
          n_tup_ins as inserts,
          n_tup_upd as updates,
          n_tup_del as deletes,
          n_live_tup as live_tuples,
          n_dead_tup as dead_tuples,
          last_vacuum,
          last_autovacuum,
          last_analyze,
          last_autoanalyze
        FROM pg_stat_user_tables 
        WHERE schemaname IN ('temporal', 'temporal_visibility')
      metrics:
        - schemaname:
            usage: "LABEL"
            description: "Schema name"
        - tablename:
            usage: "LABEL"
            description: "Table name"
        - inserts:
            usage: "COUNTER"
            description: "Number of rows inserted"
        - updates:
            usage: "COUNTER"
            description: "Number of rows updated"
        - deletes:
            usage: "COUNTER"
            description: "Number of rows deleted"
        - live_tuples:
            usage: "GAUGE"
            description: "Estimated number of live rows"
        - dead_tuples:
            usage: "GAUGE"
            description: "Estimated number of dead rows"

Performance Tuning¶

Slow Query Analysis¶

-- Enable pg_stat_statements for query analysis
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;

-- Query to find slow operations
SELECT 
    query,
    calls,
    total_time,
    mean_time,
    stddev_time,
    rows,
    100.0 * shared_blks_hit / nullif(shared_blks_hit + shared_blks_read, 0) AS hit_percent
FROM pg_stat_statements
WHERE mean_time > 1000  -- Queries taking more than 1 second on average
ORDER BY mean_time DESC
LIMIT 20;

-- Query to find table bloat
SELECT 
    schemaname,
    tablename,
    attname,
    n_distinct,
    correlation,
    most_common_vals,
    histogram_bounds
FROM pg_stats
WHERE schemaname IN ('temporal', 'temporal_visibility')
AND n_distinct < 100  -- Low cardinality columns that might benefit from partial indexes
ORDER BY schemaname, tablename, attname;

Index Usage Analysis¶

-- Analyze index usage patterns
SELECT 
    schemaname,
    tablename,
    indexname,
    idx_scan,
    idx_tup_read,
    idx_tup_fetch,
    pg_size_pretty(pg_relation_size(indexrelid)) as index_size
FROM pg_stat_user_indexes
WHERE schemaname IN ('temporal', 'temporal_visibility')
ORDER BY idx_scan DESC;

-- Find unused indexes
SELECT 
    schemaname,
    tablename,
    indexname,
    pg_size_pretty(pg_relation_size(indexrelid)) as index_size
FROM pg_stat_user_indexes
WHERE schemaname IN ('temporal', 'temporal_visibility')
AND idx_scan = 0
ORDER BY pg_relation_size(indexrelid) DESC;

Disaster Recovery and High Availability¶

Database Failover Procedures¶

Automated Failover with Patroni¶

# Patroni configuration for PostgreSQL HA
patroni:
  scope: temporal-cluster
  namespace: temporal-backend

  bootstrap:
    dcs:
      ttl: 30
      loop_wait: 10
      retry_timeout: 30
      maximum_lag_on_failover: 1048576  # 1MB
      master_start_timeout: 300
      synchronous_mode: true
      synchronous_mode_strict: false

    initdb:
      - encoding: UTF8
      - data-checksums

  postgresql:
    use_pg_rewind: true
    use_slots: true
    parameters:
      wal_level: replica
      hot_standby: "on"
      max_connections: 1000
      max_wal_senders: 10
      max_replication_slots: 10
      wal_keep_segments: 8

  watchdog:
    mode: required
    device: /dev/watchdog
    safety_margin: 5

Cross-Region Data Replication¶

Logical Replication Setup¶

-- Set up logical replication for disaster recovery
-- On primary cluster
CREATE PUBLICATION temporal_pub FOR ALL TABLES IN SCHEMA temporal, temporal_visibility;

-- On secondary cluster (different region)
CREATE SUBSCRIPTION temporal_sub 
CONNECTION 'host=primary-cluster.example.com port=5432 user=replicator dbname=temporal'
PUBLICATION temporal_pub
WITH (copy_data = true, create_slot = true, enabled = true);

-- Monitor replication lag
SELECT 
    application_name,
    client_addr,
    state,
    sent_lsn,
    write_lsn,
    flush_lsn,
    replay_lsn,
    write_lag,
    flush_lag,
    replay_lag
FROM pg_stat_replication;

Recovery Testing¶

Automated Recovery Testing¶

#!/bin/bash
# Disaster recovery testing script
test_recovery() {
    local test_type=$1
    local test_timestamp=$(date +%Y%m%d_%H%M%S)
    local test_namespace="temporal-dr-test-$test_timestamp"

    echo "Starting DR test: $test_type at $test_timestamp"

    case $test_type in
        "point_in_time")
            # Test point-in-time recovery
            test_pitr_recovery "$test_namespace"
            ;;
        "full_restore")
            # Test full backup restore
            test_full_restore "$test_namespace"
            ;;
        "failover")
            # Test automatic failover
            test_automatic_failover "$test_namespace"
            ;;
        *)
            echo "Unknown test type: $test_type"
            exit 1
            ;;
    esac

    # Cleanup test resources
    cleanup_test_environment "$test_namespace"

    echo "DR test completed: $test_type"
}

# Schedule monthly DR tests
echo "0 3 1 * * /path/to/test_recovery.sh point_in_time" | crontab -

This comprehensive database design ensures that the Temporal.io enterprise deployment has a robust, scalable, and highly available data layer that can handle enterprise workloads while maintaining data integrity, performance, and recoverability.