AWSTemplateFormatVersion: '2010-09-09'
Description: 'Kscope Kubernetes Crawler setup. PREREQUISITE: EKS clusters must have API authentication mode (EKS API and ConfigMap) enabled.'

Parameters:
  # Prepended to the IAM role/policy names, the Lambda function name, the
  # secret path and the Kubernetes RBAC object names created by this template.
  ResourcePrefix:
    Type: String
    Default: kscope
    Description: Prefix for all resource names

  # Account trusted (together with the ExternalId) to assume the crawler role.
  # Validated as 12 digits so a typo is rejected at parameter validation
  # instead of surfacing as a malformed principal ARN during role creation.
  KscopeAccountId:
    Type: String
    Description: AWS Account ID of the Kscope platform
    Default: '123456789012'
    AllowedPattern: '^[0-9]{12}$'
    ConstraintDescription: must be a 12-digit AWS account ID

  # Passed to the RBAC setup custom resource as ClusterNames. It selects which
  # clusters the setup Lambda configures; the IAM statements in this template
  # are scoped to cluster/* regardless of this value.
  EKSClusterNames:
    Type: CommaDelimitedList
    Description: Comma-separated list of EKS cluster names to grant access to (e.g., my-cluster-1,my-cluster-2). Use * to grant access to all clusters in the region.
    Default: '*'

Resources:

  # ── IAM: crawler role and policy ─────────────────────────────────────────

  # Read-only permissions attached to the crawler role: EKS metadata, the EC2
  # and IAM describe calls listed below, EKS log groups, and permission for the
  # role to assume itself.
  KubernetsCrawlerPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      ManagedPolicyName: !Sub '${ResourcePrefix}-kubernetes-crawler-policy-${AWS::Region}'
      Description: IAM policy for Kscope Kubernetes Crawler
      PolicyDocument:
        Version: '2012-10-17'
        Statement:
          # eks:ListClusters has no resource type, so it is only granted by a
          # statement whose Resource is '*' (same as in the setup role).
          - Sid: EKSListClusters
            Effect: Allow
            Action: eks:ListClusters
            Resource: '*'
          # Actions that are authorised against the cluster ARN.
          - Sid: EKSClusterAccess
            Effect: Allow
            Action:
              - eks:DescribeCluster
              - eks:ListNodegroups
              - eks:ListAddons
              - eks:ListFargateProfiles
              - eks:ListUpdates
              - eks:DescribeUpdate
            Resource: !Sub 'arn:${AWS::Partition}:eks:${AWS::Region}:${AWS::AccountId}:cluster/*'
          # Describe* calls on child resources are authorised against the
          # nodegroup / addon / fargateprofile ARN rather than the cluster ARN.
          - Sid: EKSClusterChildResources
            Effect: Allow
            Action:
              - eks:DescribeNodegroup
              - eks:DescribeAddon
              - eks:DescribeFargateProfile
              - eks:DescribeUpdate
            Resource:
              - !Sub 'arn:${AWS::Partition}:eks:${AWS::Region}:${AWS::AccountId}:nodegroup/*'
              - !Sub 'arn:${AWS::Partition}:eks:${AWS::Region}:${AWS::AccountId}:addon/*'
              - !Sub 'arn:${AWS::Partition}:eks:${AWS::Region}:${AWS::AccountId}:fargateprofile/*'
          - Sid: EC2ReadAccess
            Effect: Allow
            Action:
              - ec2:DescribeInstances
              - ec2:DescribeSecurityGroups
              - ec2:DescribeSubnets
              - ec2:DescribeVpcs
              - ec2:DescribeVolumes
              - ec2:DescribeNetworkInterfaces
            Resource: '*'
          - Sid: IAMReadAccess
            Effect: Allow
            Action:
              - iam:GetRole
              - iam:ListAttachedRolePolicies
              - iam:ListRolePolicies
            Resource: '*'
          - Sid: CloudWatchLogsAccess
            Effect: Allow
            Action:
              - logs:DescribeLogGroups
              - logs:DescribeLogStreams
              - logs:GetLogEvents
              - logs:FilterLogEvents
            Resource: !Sub 'arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/eks/*'
          # Lets the crawler role re-assume itself; the role's trust policy has
          # the matching same-account statement.
          - Sid: AssumeOwnRole
            Effect: Allow
            Action: sts:AssumeRole
            Resource: !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${ResourcePrefix}-kubernetes-crawler-role-${AWS::Region}'

  # The ExternalId demanded from the Kscope account is the UUID segment of this
  # stack's ID (element 2 when AWS::StackId is split on '/'), so it differs for
  # every deployment and needs no additional resource to generate it.
  KubernetsCrawlerRole:
    Type: AWS::IAM::Role
    Properties:
      Description: IAM role for Kscope Kubernetes Crawler to access EKS clusters
      RoleName: !Sub '${ResourcePrefix}-kubernetes-crawler-role-${AWS::Region}'
      ManagedPolicyArns:
        - !Ref KubernetsCrawlerPolicy
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          # Cross-account trust: principals of the Kscope account that present
          # the ExternalId.
          - Effect: Allow
            Action: sts:AssumeRole
            Principal:
              AWS: !Sub 'arn:${AWS::Partition}:iam::${KscopeAccountId}:root'
            Condition:
              StringEquals:
                'sts:ExternalId': !Select
                  - 2
                  - !Split
                    - '/'
                    - !Ref 'AWS::StackId'
          # Same-account trust, narrowed by aws:PrincipalArn to this role itself.
          - Effect: Allow
            Action: sts:AssumeRole
            Principal:
              AWS: !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:root'
            Condition:
              ArnEquals:
                'aws:PrincipalArn': !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${ResourcePrefix}-kubernetes-crawler-role-${AWS::Region}'
      Tags:
        - Key: ManagedBy
          Value: Kscope
        - Key: Purpose
          Value: KubernetesCrawler

  # ── Lambda: Kubernetes RBAC setup ────────────────────────────────────────
  # Iterates over all specified clusters, creates temporary access entries for
  # itself, applies ClusterRole + ClusterRoleBinding, creates permanent access
  # entries for the crawler role, then removes its own temporary access entries.
  # Runs on create/update/delete.

  # Execution role of the setup Lambda: basic Lambda logging plus the EKS calls
  # the function makes (describe/list clusters, create/delete access entries,
  # associate an access policy).
  SetupRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub '${ResourcePrefix}-k8s-setup-role-${AWS::Region}'
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: lambda.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        # Built with the partition pseudo parameter, like every other ARN in
        # this template, instead of a hard-coded 'aws' partition.
        - !Sub 'arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
      Policies:
        - PolicyName: EKSSetup
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              # Authorised against the cluster ARN.
              - Effect: Allow
                Action:
                  - eks:DescribeCluster
                  - eks:CreateAccessEntry
                Resource: !Sub 'arn:${AWS::Partition}:eks:${AWS::Region}:${AWS::AccountId}:cluster/*'
              # Authorised against the access-entry ARN.
              - Effect: Allow
                Action:
                  - eks:AssociateAccessPolicy
                  - eks:DeleteAccessEntry
                Resource: !Sub 'arn:${AWS::Partition}:eks:${AWS::Region}:${AWS::AccountId}:access-entry/*'
              # Needed when ClusterNames is '*' and the function enumerates clusters.
              - Effect: Allow
                Action: eks:ListClusters
                Resource: '*'

  # Lambda behind the Custom::KubernetesRBAC resource. The 600 s timeout leaves
  # room for the per-cluster wait loop in the code (up to 120 s per cluster).
  SetupFunction:
    Type: AWS::Lambda::Function
    Properties:
      FunctionName: !Sub '${ResourcePrefix}-k8s-setup-${AWS::Region}'
      Role: !GetAtt SetupRole.Arn
      Handler: index.handler
      Runtime: python3.12
      Timeout: 600
      Code:
        # Inline source. It is passed through Sub, so dollar-brace placeholders
        # inside the Python text are replaced by CloudFormation at deploy time.
        ZipFile: !Sub |
          import boto3, base64, json, ssl, time, urllib.request, urllib.error
          from botocore.signers import RequestSigner

          # Inlined cfnresponse — never import it; CloudFormation's auto-injection
          # is unreliable across runtimes and fails silently, blocking stack operations.
          #
          # PUTs the outcome of a custom-resource request to the event's ResponseURL.
          #   status: 'SUCCESS' or 'FAILED'
          #   data:   optional dict sent back as 'Data' (defaults to an empty dict)
          #   reason: failure text; defaults to a pointer at the current log stream
          # Raises whatever urlopen raises if the PUT itself fails.
          def cfn_send(event, context, status, data=None, reason=''):
            body = json.dumps({
              'Status': status,
              'Reason': reason or f'See CloudWatch log stream: {context.log_stream_name}',
              # Reuse the existing physical id on update/delete so the resource is not replaced.
              'PhysicalResourceId': event.get('PhysicalResourceId') or context.log_stream_name,
              'StackId': event['StackId'], 'RequestId': event['RequestId'],
              'LogicalResourceId': event['LogicalResourceId'], 'Data': data or {},
            }).encode()
            req = urllib.request.Request(
              event['ResponseURL'], data=body, method='PUT',
              headers={'Content-Type': '', 'Content-Length': str(len(body))}
            )
            # Bounded wait so a stalled PUT cannot eat the rest of the Lambda's time.
            urllib.request.urlopen(req, timeout=30)

          # Kubernetes user name given to the crawler's access entry; also the name
          # of the ClusterRole and ClusterRoleBinding defined below.
          USER  = '${ResourcePrefix}-kubernetes-crawler'
          # Access policy associated with the Lambda role while it configures a cluster.
          ADMIN = 'arn:${AWS::Partition}:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy'

          # ClusterRole manifest: get/list on core objects; get/list/patch on apps
          # workloads, RBAC objects and batch jobs; patch on pods.
          CR = {
            'apiVersion': 'rbac.authorization.k8s.io/v1',
            'kind': 'ClusterRole',
            'metadata': {'name': USER},
            'rules': [
              {
                'apiGroups': [''],
                'resources': [
                  'namespaces', 'services', 'serviceaccounts', 'pods', 'nodes',
                  'persistentvolumes', 'persistentvolumeclaims', 'configmaps', 'events',
                ],
                'verbs': ['get', 'list'],
              },
              {
                'apiGroups': ['apps'],
                'resources': ['replicasets', 'deployments', 'daemonsets', 'statefulsets'],
                'verbs': ['get', 'list', 'patch'],
              },
              {
                'apiGroups': ['rbac.authorization.k8s.io'],
                'resources': ['clusterroles', 'clusterrolebindings', 'roles', 'rolebindings'],
                'verbs': ['get', 'list', 'patch'],
              },
              {
                'apiGroups': ['batch'],
                'resources': ['jobs', 'cronjobs'],
                'verbs': ['get', 'list', 'patch'],
              },
              {
                'apiGroups': [''],
                'resources': ['pods'],
                'verbs': ['patch'],
              },
            ],
          }

          # Binds the ClusterRole above to the Kubernetes user USER.
          CRB = {
            'apiVersion': 'rbac.authorization.k8s.io/v1',
            'kind': 'ClusterRoleBinding',
            'metadata': {'name': USER},
            'roleRef': {
              'apiGroup': 'rbac.authorization.k8s.io',
              'kind': 'ClusterRole',
              'name': USER,
            },
            'subjects': [
              {'kind': 'User', 'name': USER, 'apiGroup': 'rbac.authorization.k8s.io'},
            ],
          }

          # Builds a bearer token for `cluster`: a presigned STS GetCallerIdentity
          # URL (60 s expiry, with the x-k8s-aws-id header carrying the cluster
          # name) that is base64url-encoded without padding and prefixed with
          # 'k8s-aws-v1.'. Signs with the Lambda's own session credentials.
          def get_token(cluster, region):
            s = boto3.session.Session()
            c = s.client('sts', region_name=region)
            sg = RequestSigner(c.meta.service_model.service_id, region, 'sts', 'v4',
                               s.get_credentials(), s.events)
            # The DNS suffix placeholder is filled in by the template, so the
            # regional STS host matches the partition like the ARNs do.
            host = 'sts.' + region + '.${AWS::URLSuffix}'
            params = {
              'method': 'GET',
              'url': f'https://{host}/?Action=GetCallerIdentity&Version=2011-06-15',
              'body': {}, 'context': {}, 'headers': {'x-k8s-aws-id': cluster},
            }
            signed = sg.generate_presigned_url(params, region_name=region, expires_in=60, operation_name='')
            return 'k8s-aws-v1.' + base64.urlsafe_b64encode(signed.encode()).decode().rstrip('=')

          # Sends one request to the Kubernetes API server.
          #   method/path: HTTP verb and API path appended to endpoint `ep`
          #   tok: bearer token; ca: PEM text of the cluster CA; body: optional JSON object
          # Returns (status, parsed_json). HTTP error statuses are returned, not
          # raised, together with the error body when it is a JSON object (else {}).
          # Connection failures and timeouts still raise.
          def kube(method, ep, path, tok, ca, body=None):
            data = json.dumps(body).encode() if body else None
            r = urllib.request.Request(ep + path, data=data, method=method)
            r.add_header('Authorization', f'Bearer {tok}')
            r.add_header('Content-Type', 'application/json')
            ctx = ssl.create_default_context(cadata=ca)
            try:
              # Bounded wait: an unreachable endpoint must fail fast instead of
              # blocking until the Lambda itself is killed.
              with urllib.request.urlopen(r, context=ctx, timeout=15) as resp:
                return resp.status, json.loads(resp.read() or b'{}')
            except urllib.error.HTTPError as e:
              try:
                detail = json.loads(e.read() or b'{}')
              except ValueError:
                detail = {}
              return e.code, detail if isinstance(detail, dict) else {}

          # Creates or replaces the ClusterRole and ClusterRoleBinding named USER.
          #   ep/tok/ca: API endpoint, bearer token and CA PEM passed on to kube()
          # Raises Exception when the lookup or the write returns an unexpected
          # HTTP status, so a failed apply is not reported as success.
          def apply_rbac(ep, tok, ca):
            base = '/apis/rbac.authorization.k8s.io/v1'
            for kind_path, list_path, manifest in [
              (f'{base}/clusterroles/{USER}',        f'{base}/clusterroles',        CR),
              (f'{base}/clusterrolebindings/{USER}', f'{base}/clusterrolebindings', CRB),
            ]:
              status, existing = kube('GET', ep, kind_path, tok, ca)
              if status == 404:
                verb, path, obj = 'POST', list_path, manifest
              elif status == 200:
                rv = existing.get('metadata', {}).get('resourceVersion', '')
                # Work on a copy: the manifests are module-level and reused for
                # every cluster, and a create must not carry a resourceVersion
                # left over from another cluster.
                obj = dict(manifest, metadata=dict(manifest['metadata'], resourceVersion=rv))
                verb, path = 'PUT', kind_path
              else:
                raise Exception(f'GET {kind_path} returned HTTP {status}')
              status, _ = kube(verb, ep, path, tok, ca, obj)
              if not 200 <= status < 300:
                raise Exception(f'{verb} {path} returned HTTP {status}')

          # Resolves the ClusterNames property to concrete cluster names.
          #   eks:   client providing get_paginator('list_clusters')
          #   names: list of names, or exactly ['*'] meaning every listed cluster
          # Returns `names` itself unless it is ['*'], in which case all pages of
          # list_clusters are flattened into one list.
          def get_clusters(eks, names):
            if names != ['*']:
              return names
            pages = eks.get_paginator('list_clusters').paginate()
            return [name for page in pages for name in page['clusters']]

          # Polls the Kubernetes API until the caller's token is accepted.
          #   Makes timeout // 3 attempts, sleeping 3 s after each failed one; every
          #   attempt uses a fresh token and lists /api/v1/namespaces.
          # Returns a newly generated token once a probe answers 200.
          # Raises Exception when no probe succeeded.
          def wait_for_k8s_access(ep, cluster, region, ca, timeout=120):
            attempts_left = timeout // 3
            while attempts_left > 0:
              attempts_left -= 1
              probe_token = get_token(cluster, region)
              code = kube('GET', ep, '/api/v1/namespaces', probe_token, ca)[0]
              if code == 200:
                return get_token(cluster, region)
              time.sleep(3)
            raise Exception(f'K8s API not accessible for {cluster} after {timeout}s')

          # Configures one cluster for the crawler:
          #   1. gives the Lambda role a temporary access entry with the ADMIN policy,
          #   2. waits for Kubernetes access and applies the RBAC manifests,
          #   3. creates the crawler role's access entry mapped to user USER,
          #   4. removes the Lambda role's temporary access entry.
          # Raises on any failure of steps 2-3 (and of describe_cluster); step 4
          # runs even then, so the temporary admin access never outlives the call.
          def setup(eks, cluster, region, crawler_arn, lambda_arn):
            info = eks.describe_cluster(name=cluster)['cluster']
            ep   = info['endpoint']
            ca   = base64.b64decode(info['certificateAuthority']['data']).decode()
            try:
              # Failures here are only logged: the entry may already exist, and a
              # real problem surfaces as a timeout in wait_for_k8s_access below.
              try:
                eks.create_access_entry(clusterName=cluster, principalArn=lambda_arn)
              except Exception as e:
                print(f'create_access_entry: {e}')
              try:
                eks.associate_access_policy(clusterName=cluster, principalArn=lambda_arn,
                  policyArn=ADMIN, accessScope={'type': 'cluster'})
                print(f'Access entry ready for {cluster}')
              except Exception as e:
                print(f'associate_access_policy: {e}')
              tok = wait_for_k8s_access(ep, cluster, region, ca)
              apply_rbac(ep, tok, ca)
              try:
                eks.create_access_entry(clusterName=cluster, principalArn=crawler_arn, username=USER)
              except eks.exceptions.ResourceInUseException:
                # Entry left by an earlier run; any other error propagates.
                print(f'Crawler access entry already exists on {cluster}')
            finally:
              try:
                eks.delete_access_entry(clusterName=cluster, principalArn=lambda_arn)
              except Exception as e:
                print(f'delete_access_entry ({cluster}): {e}')

          # Best-effort removal of what setup() created on one cluster: the
          # ClusterRole/ClusterRoleBinding named USER, then the access entries of
          # the crawler role and of the Lambda role. Never raises; every failure
          # is printed to the function log instead of being dropped silently.
          def teardown(eks, cluster, region, crawler_arn, lambda_arn):
            # Kubernetes RBAC cleanup, with a short 30 s wait for API access.
            try:
              info = eks.describe_cluster(name=cluster)['cluster']
              ep   = info['endpoint']
              ca   = base64.b64decode(info['certificateAuthority']['data']).decode()
              try:
                eks.create_access_entry(clusterName=cluster, principalArn=lambda_arn)
              except Exception as e:
                print(f'teardown {cluster}: create_access_entry: {e}')
              eks.associate_access_policy(clusterName=cluster, principalArn=lambda_arn,
                policyArn=ADMIN, accessScope={'type': 'cluster'})
              tok  = wait_for_k8s_access(ep, cluster, region, ca, timeout=30)
              base = '/apis/rbac.authorization.k8s.io/v1'
              for path in (f'{base}/clusterroles/{USER}', f'{base}/clusterrolebindings/{USER}'):
                status, _ = kube('DELETE', ep, path, tok, ca)
                print(f'teardown {cluster}: DELETE {path} -> {status}')
            except Exception as e:
              print(f'teardown {cluster}: RBAC cleanup skipped: {e}')
            # Always attempt to remove the access entries, whatever happened above.
            for arn in [crawler_arn, lambda_arn]:
              try:
                eks.delete_access_entry(clusterName=cluster, principalArn=arn)
              except Exception as e:
                print(f'teardown {cluster}: delete_access_entry {arn}: {e}')

          # Lambda entry point for the Custom::KubernetesRBAC resource.
          #   Delete:        best-effort teardown of every cluster, always SUCCESS
          #                  so stack deletion is never blocked.
          #   Create/Update: setup of every cluster; FAILED with the collected
          #                  per-cluster errors if any setup raised.
          # Every path ends in exactly one cfn_send call: an unexpected exception
          # (bad properties, list_clusters failure, ...) is converted into a
          # response instead of leaving the stack waiting for one.
          def handler(event, context):
            is_delete = event.get('RequestType') == 'Delete'
            errors = []
            try:
              props       = event['ResourceProperties']
              names       = props['ClusterNames']
              region      = props['Region']
              crawler_arn = props['CrawlerRoleArn']
              lambda_arn  = props['LambdaRoleArn']
              eks         = boto3.client('eks', region_name=region)
              clusters    = get_clusters(eks, names)

              for c in clusters:
                if is_delete:
                  teardown(eks, c, region, crawler_arn, lambda_arn)
                  continue
                try:
                  setup(eks, c, region, crawler_arn, lambda_arn)
                except Exception as e:
                  errors.append(f'{c}: {e}')
            except Exception as e:
              print(f'handler: {e}')
              if not is_delete:
                errors.append(f'setup aborted: {e}')

            if errors:
              # Capped because the whole response body is limited to 4096 bytes.
              cfn_send(event, context, 'FAILED', reason='; '.join(errors)[:1000])
            else:
              cfn_send(event, context, 'SUCCESS')

  # ── Secrets Manager: ExternalId storage ─────────────────────────────────

  # Stores the ExternalId (the UUID segment of AWS::StackId, the same value the
  # crawler role's trust policy checks) as a small JSON document.
  ExternalIdSecret:
    Type: AWS::SecretsManager::Secret
    Properties:
      Description: External ID for Kscope Kubernetes Crawler cross-account role assumption
      Name: !Sub '/${ResourcePrefix}/kubernetes-crawler/role/external-id'
      SecretString: !Join
        - ''
        - - '{"externalId":"'
          - !Select
            - 2
            - !Split
              - '/'
              - !Ref 'AWS::StackId'
          - '"}'

  # ── RBAC setup custom resource ────────────────────────────────────────────
  # Lambda manages its own access entries per cluster internally.

  # Invokes the setup Lambda on stack create/update/delete. The properties
  # below arrive in the function as event['ResourceProperties'].
  KubernetesRBACSetup:
    Type: Custom::KubernetesRBAC
    Properties:
      ServiceToken: !GetAtt SetupFunction.Arn
      Region: !Ref 'AWS::Region'
      ClusterNames: !Ref EKSClusterNames
      LambdaRoleArn: !GetAtt SetupRole.Arn
      CrawlerRoleArn: !GetAtt KubernetsCrawlerRole.Arn

Outputs:
  # Each output is also exported under "<stack name>-<output name>".
  RoleArn:
    Value: !GetAtt KubernetsCrawlerRole.Arn
    Description: ARN of the IAM role for Kscope Kubernetes Crawler
    Export:
      Name: !Join ['-', [!Ref 'AWS::StackName', RoleArn]]

  RoleName:
    Value: !Ref KubernetsCrawlerRole
    Description: Name of the IAM role
    Export:
      Name: !Join ['-', [!Ref 'AWS::StackName', RoleName]]

  ExternalIdSecretArn:
    Value: !Ref ExternalIdSecret
    Description: Secrets Manager ARN containing the External ID for cross-account role assumption
    Export:
      Name: !Join ['-', [!Ref 'AWS::StackName', ExternalIdSecretArn]]
