Infrastructure Setup

This guide provides step-by-step instructions for setting up the infrastructure required for a Temporal.io enterprise deployment, including the Kubernetes cluster, networking, storage, and foundational services.

Overview

The infrastructure setup includes:

- Kubernetes cluster provisioning
- Network configuration and security groups
- Storage provisioning and configuration
- Load balancer setup
- DNS and certificate management
- Monitoring infrastructure

Prerequisites

Required Tools

# Install required tools on macOS
brew install kubectl
brew install helm
brew install terraform
brew install awscli
brew install eksctl  # for AWS EKS
brew install k9s     # optional but recommended

# Verify installations
kubectl version --client
helm version
terraform version
aws --version

Required Permissions

AWS IAM Permissions

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "eks:*",
                "ec2:*",
                "iam:*",
                "cloudformation:*",
                "autoscaling:*",
                "elasticloadbalancing:*",
                "route53:*",
                "acm:*",
                "rds:*",
                "elasticache:*"
            ],
            "Resource": "*"
        }
    ]
}
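
The policy above is intentionally broad to keep initial provisioning simple; scope it down before granting it to long-lived deployer identities. If you prefer to manage the deployer permissions with Terraform as well, a minimal sketch is shown below (the policy file path and the temporal-deployers group name are assumptions for illustration):

# Hypothetical example: manage the deployer policy with Terraform
resource "aws_iam_policy" "temporal_deployer" {
  name        = "temporal-deployer"
  description = "Permissions for provisioning the Temporal infrastructure"
  policy      = file("${path.module}/temporal-deployer-policy.json") # the JSON document above
}

resource "aws_iam_group_policy_attachment" "temporal_deployer" {
  group      = "temporal-deployers" # assumed pre-existing IAM group
  policy_arn = aws_iam_policy.temporal_deployer.arn
}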

GCP IAM Roles

# Required roles for GKE deployment
gcloud projects add-iam-policy-binding PROJECT_ID \
    --member="user:YOUR_EMAIL" \
    --role="roles/container.admin"

gcloud projects add-iam-policy-binding PROJECT_ID \
    --member="user:YOUR_EMAIL" \
    --role="roles/compute.admin"

gcloud projects add-iam-policy-binding PROJECT_ID \
    --member="user:YOUR_EMAIL" \
    --role="roles/iam.serviceAccountAdmin"

Terraform Infrastructure

Directory Structure

infrastructure/
├── terraform/
│   ├── modules/
│   │   ├── eks/
│   │   ├── vpc/
│   │   ├── rds/
│   │   └── elasticache/
│   └── environments/
│       ├── development/
│       ├── staging/
│       └── production/
├── scripts/
└── docs/
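
Each environment directory should pin the Terraform and AWS provider versions and store state in a shared remote backend so that teammates and CI pipelines operate on the same state. A minimal sketch, assuming an S3 state bucket and DynamoDB lock table that you create separately (both names below are placeholders):

# terraform/environments/production/versions.tf (sketch)
terraform {
  required_version = ">= 1.5.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }

  backend "s3" {
    bucket         = "temporal-terraform-state" # placeholder: pre-created state bucket
    key            = "production/terraform.tfstate"
    region         = "us-west-2"
    dynamodb_table = "temporal-terraform-locks" # placeholder: pre-created lock table
    encrypt        = true
  }
}

provider "aws" {
  region = "us-west-2"
}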

VPC and Networking

VPC Module

# terraform/modules/vpc/main.tf
resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = {
    Name        = "${var.cluster_name}-vpc"
    Environment = var.environment
    Project     = "temporal"
  }
}

resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id

  tags = {
    Name = "${var.cluster_name}-igw"
  }
}

# Public subnets
resource "aws_subnet" "public" {
  count = length(var.availability_zones)

  vpc_id                  = aws_vpc.main.id
  cidr_block              = var.public_subnet_cidrs[count.index]
  availability_zone       = var.availability_zones[count.index]
  map_public_ip_on_launch = true

  tags = {
    Name                     = "${var.cluster_name}-public-${count.index + 1}"
    "kubernetes.io/role/elb" = "1"
    Environment              = var.environment
  }
}

# Private subnets
resource "aws_subnet" "private" {
  count = length(var.availability_zones)

  vpc_id            = aws_vpc.main.id
  cidr_block        = var.private_subnet_cidrs[count.index]
  availability_zone = var.availability_zones[count.index]

  tags = {
    Name                              = "${var.cluster_name}-private-${count.index + 1}"
    "kubernetes.io/role/internal-elb" = "1"
    Environment                       = var.environment
  }
}

# NAT Gateway
resource "aws_eip" "nat" {
  count = length(var.availability_zones)
  domain = "vpc"

  tags = {
    Name = "${var.cluster_name}-nat-eip-${count.index + 1}"
  }
}

resource "aws_nat_gateway" "main" {
  count = length(var.availability_zones)

  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = {
    Name = "${var.cluster_name}-nat-${count.index + 1}"
  }

  depends_on = [aws_internet_gateway.main]
}

# Route tables
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.main.id
  }

  tags = {
    Name = "${var.cluster_name}-public-rt"
  }
}

resource "aws_route_table" "private" {
  count  = length(var.availability_zones)
  vpc_id = aws_vpc.main.id

  route {
    cidr_block     = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main[count.index].id
  }

  tags = {
    Name = "${var.cluster_name}-private-rt-${count.index + 1}"
  }
}

# Route table associations
resource "aws_route_table_association" "public" {
  count = length(var.availability_zones)

  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

resource "aws_route_table_association" "private" {
  count = length(var.availability_zones)

  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}

VPC Variables

# terraform/modules/vpc/variables.tf
variable "cluster_name" {
  description = "Name of the EKS cluster"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
}

variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type        = string
  default     = "10.0.0.0/16"
}

variable "availability_zones" {
  description = "List of availability zones"
  type        = list(string)
}

variable "public_subnet_cidrs" {
  description = "CIDR blocks for public subnets"
  type        = list(string)
}

variable "private_subnet_cidrs" {
  description = "CIDR blocks for private subnets"
  type        = list(string)
}
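
The environment configurations later in this guide read the VPC ID, CIDR, and subnet IDs from this module, so it needs matching outputs. A minimal sketch:

# terraform/modules/vpc/outputs.tf
output "vpc_id" {
  description = "ID of the VPC"
  value       = aws_vpc.main.id
}

output "vpc_cidr" {
  description = "CIDR block of the VPC"
  value       = aws_vpc.main.cidr_block
}

output "public_subnet_ids" {
  description = "IDs of the public subnets"
  value       = aws_subnet.public[*].id
}

output "private_subnet_ids" {
  description = "IDs of the private subnets"
  value       = aws_subnet.private[*].id
}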

EKS Cluster Setup

EKS Module

# terraform/modules/eks/main.tf
resource "aws_eks_cluster" "main" {
  name     = var.cluster_name
  role_arn = aws_iam_role.cluster.arn
  version  = var.kubernetes_version

  vpc_config {
    subnet_ids              = concat(var.public_subnet_ids, var.private_subnet_ids)
    endpoint_private_access = true
    endpoint_public_access  = true
    public_access_cidrs     = var.public_access_cidrs
  }

  encryption_config {
    provider {
      key_arn = aws_kms_key.eks.arn
    }
    resources = ["secrets"]
  }

  enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]

  depends_on = [
    aws_iam_role_policy_attachment.cluster_AmazonEKSClusterPolicy,
    aws_cloudwatch_log_group.cluster,
  ]

  tags = {
    Environment = var.environment
    Project     = "temporal"
  }
}

resource "aws_cloudwatch_log_group" "cluster" {
  name              = "/aws/eks/${var.cluster_name}/cluster"
  retention_in_days = 7
}

# EKS Node Group
resource "aws_eks_node_group" "main" {
  cluster_name    = aws_eks_cluster.main.name
  node_group_name = "${var.cluster_name}-nodes"
  node_role_arn   = aws_iam_role.node.arn
  subnet_ids      = var.private_subnet_ids

  capacity_type  = var.capacity_type
  instance_types = var.instance_types

  scaling_config {
    desired_size = var.desired_capacity
    max_size     = var.max_capacity
    min_size     = var.min_capacity
  }

  update_config {
    max_unavailable = 1
  }

  # Ensure that IAM Role permissions are created before and deleted after EKS Node Group handling.
  depends_on = [
    aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
    aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
    aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
  ]

  tags = {
    Environment = var.environment
    Project     = "temporal"
  }
}

# IAM Role for EKS Cluster
resource "aws_iam_role" "cluster" {
  name = "${var.cluster_name}-cluster-role"

  assume_role_policy = jsonencode({
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "eks.amazonaws.com"
      }
    }]
    Version = "2012-10-17"
  })
}

resource "aws_iam_role_policy_attachment" "cluster_AmazonEKSClusterPolicy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
  role       = aws_iam_role.cluster.name
}

# IAM Role for EKS Node Group
resource "aws_iam_role" "node" {
  name = "${var.cluster_name}-node-role"

  assume_role_policy = jsonencode({
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "ec2.amazonaws.com"
      }
    }]
    Version = "2012-10-17"
  })
}

resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
  role       = aws_iam_role.node.name
}

resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
  role       = aws_iam_role.node.name
}

resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
  role       = aws_iam_role.node.name
}

# KMS Key for EKS encryption
resource "aws_kms_key" "eks" {
  description = "EKS Secret Encryption Key"

  tags = {
    Name        = "${var.cluster_name}-eks-key"
    Environment = var.environment
  }
}

resource "aws_kms_alias" "eks" {
  name          = "alias/${var.cluster_name}-eks"
  target_key_id = aws_kms_key.eks.key_id
}
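
The environment outputs and the deployment script later in this guide rely on the cluster name and endpoint, so expose them from the module. A minimal sketch:

# terraform/modules/eks/outputs.tf
output "cluster_name" {
  description = "Name of the EKS cluster"
  value       = aws_eks_cluster.main.name
}

output "cluster_endpoint" {
  description = "Endpoint of the EKS API server"
  value       = aws_eks_cluster.main.endpoint
}

output "cluster_certificate_authority_data" {
  description = "Base64-encoded cluster CA certificate"
  value       = aws_eks_cluster.main.certificate_authority[0].data
}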

RDS PostgreSQL Setup

RDS Module

# terraform/modules/rds/main.tf
resource "aws_db_subnet_group" "main" {
  name       = "${var.cluster_name}-db-subnet-group"
  subnet_ids = var.private_subnet_ids

  tags = {
    Name        = "${var.cluster_name}-db-subnet-group"
    Environment = var.environment
  }
}

resource "aws_security_group" "rds" {
  name_prefix = "${var.cluster_name}-rds-"
  vpc_id      = var.vpc_id

  ingress {
    from_port   = 5432
    to_port     = 5432
    protocol    = "tcp"
    cidr_blocks = var.allowed_cidr_blocks
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name        = "${var.cluster_name}-rds-sg"
    Environment = var.environment
  }
}

resource "aws_db_instance" "main" {
  allocated_storage           = var.allocated_storage
  max_allocated_storage       = var.max_allocated_storage
  storage_type                = "gp3"
  storage_encrypted           = true
  kms_key_id                  = aws_kms_key.rds.arn

  db_name  = var.database_name
  engine   = "postgres"
  engine_version = var.postgres_version
  instance_class = var.instance_class

  username = var.username
  password = var.password

  vpc_security_group_ids = [aws_security_group.rds.id]
  db_subnet_group_name   = aws_db_subnet_group.main.name

  backup_retention_period = var.backup_retention_period
  backup_window          = var.backup_window
  maintenance_window     = var.maintenance_window

  skip_final_snapshot = var.environment != "production"
  deletion_protection = var.environment == "production"

  performance_insights_enabled = true
  monitoring_interval         = 60
  monitoring_role_arn         = aws_iam_role.rds_monitoring.arn

  tags = {
    Name        = "${var.cluster_name}-postgres"
    Environment = var.environment
  }
}

# RDS Read Replica for production
resource "aws_db_instance" "replica" {
  count = var.environment == "production" ? 1 : 0

  identifier             = "${var.cluster_name}-postgres-replica"
  replicate_source_db    = aws_db_instance.main.identifier
  instance_class         = var.replica_instance_class
  publicly_accessible    = false
  auto_minor_version_upgrade = false

  tags = {
    Name        = "${var.cluster_name}-postgres-replica"
    Environment = var.environment
  }
}

# KMS Key for RDS encryption
resource "aws_kms_key" "rds" {
  description = "RDS encryption key"

  tags = {
    Name        = "${var.cluster_name}-rds-key"
    Environment = var.environment
  }
}

resource "aws_kms_alias" "rds" {
  name          = "alias/${var.cluster_name}-rds"
  target_key_id = aws_kms_key.rds.key_id
}

# IAM Role for RDS Enhanced Monitoring
resource "aws_iam_role" "rds_monitoring" {
  name = "${var.cluster_name}-rds-monitoring-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "monitoring.rds.amazonaws.com"
        }
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "rds_monitoring" {
  role       = aws_iam_role.rds_monitoring.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole"
}
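
Downstream configuration and the validation script need the database endpoint, so expose it from the module. A minimal sketch:

# terraform/modules/rds/outputs.tf
output "endpoint" {
  description = "Connection endpoint of the primary instance (host:port)"
  value       = aws_db_instance.main.endpoint
}

output "address" {
  description = "Host name of the primary instance"
  value       = aws_db_instance.main.address
}

output "replica_endpoint" {
  description = "Connection endpoint of the read replica (null when no replica exists)"
  value       = try(aws_db_instance.replica[0].endpoint, null)
}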

ElastiCache Redis Setup

ElastiCache Module

# terraform/modules/elasticache/main.tf
resource "aws_elasticache_subnet_group" "main" {
  name       = "${var.cluster_name}-cache-subnet"
  subnet_ids = var.private_subnet_ids
}

resource "aws_security_group" "elasticache" {
  name_prefix = "${var.cluster_name}-cache-"
  vpc_id      = var.vpc_id

  ingress {
    from_port   = 6379
    to_port     = 6379
    protocol    = "tcp"
    cidr_blocks = var.allowed_cidr_blocks
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name        = "${var.cluster_name}-cache-sg"
    Environment = var.environment
  }
}

resource "aws_elasticache_replication_group" "main" {
  replication_group_id         = "${var.cluster_name}-redis"
  description                  = "Redis cluster for ${var.cluster_name}"

  node_type                    = var.node_type
  port                         = 6379
  parameter_group_name         = aws_elasticache_parameter_group.main.name
  subnet_group_name            = aws_elasticache_subnet_group.main.name
  security_group_ids           = [aws_security_group.elasticache.id]

  num_cache_clusters           = var.num_cache_clusters
  at_rest_encryption_enabled   = true
  transit_encryption_enabled   = true
  auth_token                   = var.auth_token

  maintenance_window           = var.maintenance_window
  snapshot_retention_limit     = var.snapshot_retention_limit
  snapshot_window              = var.snapshot_window

  tags = {
    Name        = "${var.cluster_name}-redis"
    Environment = var.environment
  }
}

resource "aws_elasticache_parameter_group" "main" {
  family = "redis7"
  name   = "${var.cluster_name}-redis-params"

  parameter {
    name  = "maxmemory-policy"
    value = "allkeys-lru"
  }
}
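
Expose the Redis primary endpoint so it can be wired into Temporal's configuration and the validation script. A minimal sketch:

# terraform/modules/elasticache/outputs.tf
output "primary_endpoint_address" {
  description = "Primary endpoint of the Redis replication group"
  value       = aws_elasticache_replication_group.main.primary_endpoint_address
}

output "port" {
  description = "Port the Redis replication group listens on"
  value       = aws_elasticache_replication_group.main.port
}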

Environment-Specific Configurations

Development Environment

# terraform/environments/development/main.tf
module "vpc" {
  source = "../../modules/vpc"

  cluster_name             = "temporal-dev"
  environment             = "development"
  vpc_cidr                = "10.0.0.0/16"
  availability_zones      = ["us-west-2a", "us-west-2b"]
  public_subnet_cidrs     = ["10.0.1.0/24", "10.0.2.0/24"]
  private_subnet_cidrs    = ["10.0.10.0/24", "10.0.20.0/24"]
}

module "eks" {
  source = "../../modules/eks"

  cluster_name         = "temporal-dev"
  environment         = "development"
  kubernetes_version  = "1.28"
  public_subnet_ids   = module.vpc.public_subnet_ids
  private_subnet_ids  = module.vpc.private_subnet_ids

  instance_types      = ["t3.medium"]
  capacity_type       = "ON_DEMAND"
  desired_capacity    = 2
  min_capacity        = 1
  max_capacity        = 4

  public_access_cidrs = ["0.0.0.0/0"]
}

module "rds" {
  source = "../../modules/rds"

  cluster_name            = "temporal-dev"
  environment            = "development"
  vpc_id                 = module.vpc.vpc_id
  private_subnet_ids     = module.vpc.private_subnet_ids
  allowed_cidr_blocks    = [module.vpc.vpc_cidr]

  instance_class         = "db.t3.micro"
  allocated_storage      = 20
  max_allocated_storage  = 100
  postgres_version       = "15.4"

  database_name          = "temporal"
  username               = "temporal"
  password               = var.db_password

  backup_retention_period = 7
  backup_window          = "03:00-04:00"
  maintenance_window     = "sun:04:00-sun:05:00"
}

module "elasticache" {
  source = "../../modules/elasticache"

  cluster_name           = "temporal-dev"
  environment           = "development"
  vpc_id                = module.vpc.vpc_id
  private_subnet_ids    = module.vpc.private_subnet_ids
  allowed_cidr_blocks   = [module.vpc.vpc_cidr]

  node_type             = "cache.t3.micro"
  num_cache_clusters    = 1
  auth_token            = var.redis_auth_token

  maintenance_window         = "sun:05:00-sun:06:00"
  snapshot_retention_limit   = 1
  snapshot_window           = "03:00-05:00"
}
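
Both environments reference var.db_password and var.redis_auth_token. Declare them as sensitive variables in each environment and supply the values outside version control, for example through TF_VAR_db_password and TF_VAR_redis_auth_token environment variables or a secrets manager. A minimal sketch:

# terraform/environments/development/variables.tf (same pattern for staging and production)
variable "db_password" {
  description = "Master password for the Temporal PostgreSQL instance"
  type        = string
  sensitive   = true
}

variable "redis_auth_token" {
  description = "Auth token for the Redis replication group"
  type        = string
  sensitive   = true
}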

Production Environment

# terraform/environments/production/main.tf
module "vpc" {
  source = "../../modules/vpc"

  cluster_name             = "temporal-prod"
  environment             = "production"
  vpc_cidr                = "10.1.0.0/16"
  availability_zones      = ["us-west-2a", "us-west-2b", "us-west-2c"]
  public_subnet_cidrs     = ["10.1.1.0/24", "10.1.2.0/24", "10.1.3.0/24"]
  private_subnet_cidrs    = ["10.1.10.0/24", "10.1.20.0/24", "10.1.30.0/24"]
}

module "eks" {
  source = "../../modules/eks"

  cluster_name         = "temporal-prod"
  environment         = "production"
  kubernetes_version  = "1.28"
  public_subnet_ids   = module.vpc.public_subnet_ids
  private_subnet_ids  = module.vpc.private_subnet_ids

  instance_types      = ["m5.large", "m5.xlarge"]
  capacity_type       = "ON_DEMAND"
  desired_capacity    = 6
  min_capacity        = 3
  max_capacity        = 12

  public_access_cidrs = ["203.0.113.0/24"] # Your office IP range
}

module "rds" {
  source = "../../modules/rds"

  cluster_name            = "temporal-prod"
  environment            = "production"
  vpc_id                 = module.vpc.vpc_id
  private_subnet_ids     = module.vpc.private_subnet_ids
  allowed_cidr_blocks    = [module.vpc.vpc_cidr]

  instance_class         = "db.r5.xlarge"
  replica_instance_class = "db.r5.large"
  allocated_storage      = 100
  max_allocated_storage  = 1000
  postgres_version       = "15.4"

  database_name          = "temporal"
  username               = "temporal"
  password               = var.db_password

  backup_retention_period = 30
  backup_window          = "03:00-04:00"
  maintenance_window     = "sun:04:00-sun:05:00"
}

module "elasticache" {
  source = "../../modules/elasticache"

  cluster_name           = "temporal-prod"
  environment           = "production"
  vpc_id                = module.vpc.vpc_id
  private_subnet_ids    = module.vpc.private_subnet_ids
  allowed_cidr_blocks   = [module.vpc.vpc_cidr]

  node_type             = "cache.r6g.large"
  num_cache_clusters    = 3
  auth_token            = var.redis_auth_token

  maintenance_window         = "sun:05:00-sun:06:00"
  snapshot_retention_limit   = 7
  snapshot_window           = "03:00-05:00"
}
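
The deployment script below reads cluster_name, region, cluster_endpoint, and database_endpoint with terraform output, so each environment also needs root-level outputs that forward the module values (the module output names follow the sketches shown earlier):

# terraform/environments/production/outputs.tf (sketch; same pattern for the other environments)
output "cluster_name" {
  value = module.eks.cluster_name
}

output "cluster_endpoint" {
  value = module.eks.cluster_endpoint
}

output "database_endpoint" {
  value = module.rds.endpoint
}

output "region" {
  value = "us-west-2"
}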

Deployment Scripts

Infrastructure Deployment Script

#!/bin/bash
# scripts/deploy-infrastructure.sh

set -euo pipefail

ENVIRONMENT=${1:-development}
ACTION=${2:-plan}

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

log() {
    echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

warn() {
    echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
}

error() {
    echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}"
    exit 1
}

# Validate environment
if [[ ! "$ENVIRONMENT" =~ ^(development|staging|production)$ ]]; then
    error "Invalid environment. Must be one of: development, staging, production"
fi

# Validate action
if [[ ! "$ACTION" =~ ^(plan|apply|destroy)$ ]]; then
    error "Invalid action. Must be one of: plan, apply, destroy"
fi

TERRAFORM_DIR="terraform/environments/$ENVIRONMENT"

# Check if Terraform directory exists
if [[ ! -d "$TERRAFORM_DIR" ]]; then
    error "Terraform directory not found: $TERRAFORM_DIR"
fi

log "Deploying infrastructure for environment: $ENVIRONMENT"
log "Action: $ACTION"

cd "$TERRAFORM_DIR"

# Initialize Terraform
log "Initializing Terraform..."
terraform init

# Validate configuration
log "Validating Terraform configuration..."
terraform validate

# Plan or apply
case "$ACTION" in
    plan)
        log "Creating Terraform plan..."
        terraform plan -out=tfplan
        ;;
    apply)
        log "Applying Terraform configuration..."
        if [[ -f "tfplan" ]]; then
            terraform apply tfplan
        else
            terraform apply -auto-approve
        fi

        # Refresh kubeconfig so kubectl points at the newly provisioned cluster
        log "Updating kubeconfig..."
        CLUSTER_NAME=$(terraform output -raw cluster_name)
        REGION=$(terraform output -raw region)
        aws eks update-kubeconfig --region "$REGION" --name "$CLUSTER_NAME"

        log "Infrastructure deployment completed successfully!"
        log "Cluster endpoint: $(terraform output -raw cluster_endpoint)"
        log "Database endpoint: $(terraform output -raw database_endpoint)"
        ;;
    destroy)
        warn "This will destroy all infrastructure in $ENVIRONMENT environment!"
        read -p "Are you sure? Type 'yes' to confirm: " -r
        if [[ $REPLY == "yes" ]]; then
            terraform destroy -auto-approve
            log "Infrastructure destroyed successfully"
        else
            log "Destroy cancelled"
        fi
        ;;
esac
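
For example, scripts/deploy-infrastructure.sh production plan writes a plan file (tfplan) for review, and scripts/deploy-infrastructure.sh production apply applies that plan and refreshes your kubeconfig for the cluster.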

Post-Deployment Setup Script

#!/bin/bash
# scripts/post-deployment-setup.sh

set -euo pipefail

ENVIRONMENT=${1:-development}

log() {
    echo -e "\033[0;32m[$(date +'%Y-%m-%d %H:%M:%S')] $1\033[0m"
}

error() {
    echo -e "\033[0;31m[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1\033[0m"
    exit 1
}

log "Running post-deployment setup for environment: $ENVIRONMENT"

# Verify cluster connectivity
log "Verifying cluster connectivity..."
if ! kubectl cluster-info > /dev/null 2>&1; then
    error "Cannot connect to Kubernetes cluster"
fi

# Install cert-manager
log "Installing cert-manager..."
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml
kubectl wait --for=condition=available --timeout=300s deployment/cert-manager -n cert-manager
kubectl wait --for=condition=available --timeout=300s deployment/cert-manager-cainjector -n cert-manager
kubectl wait --for=condition=available --timeout=300s deployment/cert-manager-webhook -n cert-manager

# Install AWS Load Balancer Controller
log "Installing AWS Load Balancer Controller..."
helm repo add eks https://aws.github.io/eks-charts
helm repo update

CLUSTER_NAME=$(kubectl config current-context | cut -d'/' -f2)
VPC_ID=$(aws eks describe-cluster --name "$CLUSTER_NAME" --query "cluster.resourcesVpcConfig.vpcId" --output text)

# Note: serviceAccount.create=false assumes the aws-load-balancer-controller
# service account (with its IAM role attached via IRSA) already exists in kube-system.
helm upgrade --install aws-load-balancer-controller eks/aws-load-balancer-controller \
    -n kube-system \
    --set clusterName="$CLUSTER_NAME" \
    --set serviceAccount.create=false \
    --set serviceAccount.name=aws-load-balancer-controller \
    --set region=us-west-2 \
    --set vpcId="$VPC_ID"

# Install external-secrets operator
log "Installing external-secrets operator..."
helm repo add external-secrets https://charts.external-secrets.io
helm upgrade --install external-secrets external-secrets/external-secrets \
    -n external-secrets-system \
    --create-namespace

# Create namespaces
log "Creating namespaces..."
kubectl apply -f - <<EOF
apiVersion: v1
kind: Namespace
metadata:
  name: temporal-system
  labels:
    istio-injection: enabled
---
apiVersion: v1
kind: Namespace
metadata:
  name: temporal-app
  labels:
    istio-injection: enabled
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    istio-injection: enabled
EOF

# Install Prometheus Operator
log "Installing Prometheus Operator..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm upgrade --install prometheus-operator prometheus-community/kube-prometheus-stack \
    -n monitoring \
    --set grafana.adminPassword=admin123 \
    --set prometheus.prometheusSpec.retention=30d

log "Post-deployment setup completed successfully!"
log "Next steps:"
log "1. Configure DNS and certificates"
log "2. Set up external secrets"
log "3. Deploy Temporal cluster"

Validation and Testing

Infrastructure Validation Script

#!/bin/bash
# scripts/validate-infrastructure.sh

set -euo pipefail

ENVIRONMENT=${1:-development}

log() {
    echo -e "\033[0;32m[$(date +'%Y-%m-%d %H:%M:%S')] $1\033[0m"
}

error() {
    echo -e "\033[0;31m[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1\033[0m"
    exit 1
}

warn() {
    echo -e "\033[1;33m[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1\033[0m"
}

log "Validating infrastructure for environment: $ENVIRONMENT"

# Check cluster connectivity
log "Checking cluster connectivity..."
if kubectl cluster-info > /dev/null 2>&1; then
    log "✓ Cluster connectivity OK"
else
    error "✗ Cannot connect to cluster"
fi

# Check node status
log "Checking node status..."
# Count nodes whose STATUS column is exactly "Ready" (a plain grep would also match "NotReady")
READY_NODES=$(kubectl get nodes --no-headers | awk '$2 == "Ready"' | wc -l)
TOTAL_NODES=$(kubectl get nodes --no-headers | wc -l)
if [[ $READY_NODES -eq $TOTAL_NODES ]] && [[ $TOTAL_NODES -gt 0 ]]; then
    log "✓ All $TOTAL_NODES nodes are ready"
else
    warn "✗ Only $READY_NODES out of $TOTAL_NODES nodes are ready"
fi

# Check essential services
log "Checking essential services..."
SERVICES=("kube-dns" "aws-load-balancer-controller" "cert-manager")
for service in "${SERVICES[@]}"; do
    if kubectl get pods -A | grep -q "$service.*Running"; then
        log "✓ $service is running"
    else
        warn "✗ $service is not running"
    fi
done

# Check database connectivity
# DB_ENDPOINT (database host name) and PGPASSWORD (temporal user password) must be
# exported before running this script, e.g. from the Terraform outputs and your secret store.
log "Checking database connectivity..."
: "${DB_ENDPOINT:?DB_ENDPOINT must be set to the PostgreSQL host name}"
: "${PGPASSWORD:?PGPASSWORD must be set to the temporal database password}"
if kubectl run db-test --image=postgres:15 --rm -i --restart=Never \
    --env="PGPASSWORD=$PGPASSWORD" -- \
    psql -h "$DB_ENDPOINT" -U temporal -d temporal -c "SELECT 1" > /dev/null 2>&1; then
    log "✓ Database connectivity OK"
else
    error "✗ Cannot connect to database"
fi

# Check Redis connectivity
# REDIS_ENDPOINT and REDIS_AUTH_TOKEN must be exported; --tls and the auth token are
# required because the replication group enables transit encryption and AUTH.
log "Checking Redis connectivity..."
: "${REDIS_ENDPOINT:?REDIS_ENDPOINT must be set to the Redis primary endpoint}"
: "${REDIS_AUTH_TOKEN:?REDIS_AUTH_TOKEN must be set to the Redis auth token}"
if kubectl run redis-test --image=redis:7 --rm -i --restart=Never -- \
    redis-cli -h "$REDIS_ENDPOINT" --tls -a "$REDIS_AUTH_TOKEN" ping > /dev/null 2>&1; then
    log "✓ Redis connectivity OK"
else
    error "✗ Cannot connect to Redis"
fi

log "Infrastructure validation completed"

This infrastructure setup guide provides a comprehensive foundation for deploying Temporal.io in a production-ready environment with proper networking, security, and scalability considerations.