From 73eef48cb3a55a3148ae6ed5f01925c70f3f31c6 Mon Sep 17 00:00:00 2001
From: tutorial <robinvp3@gmail.com>
Date: Wed, 13 May 2026 22:22:49 +0200
Subject: [PATCH] Add cloud infrastructure provisioning API

---
 .env.example                                  |  15 ++
 .github/workflows/ci-cd.yaml                  |  41 ++++
 .gitignore                                    |   7 +
 Dockerfile                                    |  18 ++
 README.md                                     | 225 ++++++++++++++++--
 api/routes/auth.py                            |  33 +++
 api/routes/deployments.py                     |  59 +++++
 api/routes/infrastructure.py                  |  69 ++++++
 api/routes/kubernetes.py                      |  59 +++++
 api/routes/monitoring.py                      |  19 ++
 api/schemas.py                                | 105 ++++++++
 app/config.py                                 |  34 +++
 app/logger.py                                 |   9 +
 app/main.py                                   |  57 +++++
 auth/jwt_utils.py                             |  30 +++
 auth/rate_limit.py                            |  28 +++
 auth/rbac.py                                  |  31 +++
 database/models.py                            |  42 ++++
 database/session.py                           |  22 ++
 helm/charts/README.md                         |  14 ++
 helm/charts/idp-api/Chart.yaml                |   4 +
 helm/charts/idp-api/templates/_helpers.tpl    |  11 +
 helm/charts/idp-api/templates/configmap.yaml  |  10 +
 helm/charts/idp-api/templates/deployment.yaml |  44 ++++
 helm/charts/idp-api/templates/hpa.yaml        |  20 ++
 helm/charts/idp-api/templates/ingress.yaml    |  27 +++
 helm/charts/idp-api/templates/secret.yaml     |   9 +
 helm/charts/idp-api/templates/service.yaml    |  13 +
 .../idp-api/templates/serviceaccount.yaml     |   4 +
 helm/charts/idp-api/values.yaml               |  39 +++
 kubernetes/client.py                          |  18 ++
 kubernetes/network-policy.yaml                |  25 ++
 kubernetes/rbac.yaml                          |  29 +++
 monitoring/grafana-dashboard.json             |  24 ++
 monitoring/prometheus-scrape-config.yaml      |  15 ++
 requirements.txt                              |  16 ++
 scripts/bootstrap.sh                          |  15 ++
 scripts/migrate_db.sh                         |  11 +
 scripts/prod_checklist.md                     |  44 ++++
 scripts/setup_env.sh                          |   4 +
 services/autoscaling_service.py               |  40 ++++
 services/deployment_service.py                | 198 +++++++++++++++
 services/infra_service.py                     |  68 ++++++
 services/ingress_service.py                   |  51 ++++
 services/k8s_service.py                       |  29 +++
 services/kubernetes_client.py                 |  20 ++
 services/monitoring_service.py                |  31 +++
 services/service_service.py                   |  35 +++
 setup.cfg                                     |   4 +
 terraform/main.tf.j2                          |  43 ++++
 tests/conftest.py                             |  14 ++
 tests/unit/test_auth.py                       |  12 +
 tests/unit/test_health.py                     |   4 +
 tests/unit/test_namespace.py                  |  10 +
 54 files changed, 1837 insertions(+), 21 deletions(-)
 create mode 100644 .env.example
 create mode 100644 .github/workflows/ci-cd.yaml
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 api/routes/auth.py
 create mode 100644 api/routes/deployments.py
 create mode 100644 api/routes/infrastructure.py
 create mode 100644 api/routes/kubernetes.py
 create mode 100644 api/routes/monitoring.py
 create mode 100644 api/schemas.py
 create mode 100644 app/config.py
 create mode 100644 app/logger.py
 create mode 100644 app/main.py
 create mode 100644 auth/jwt_utils.py
 create mode 100644 auth/rate_limit.py
 create mode 100644 auth/rbac.py
 create mode 100644 database/models.py
 create mode 100644 database/session.py
 create mode 100644 helm/charts/README.md
 create mode 100644 helm/charts/idp-api/Chart.yaml
 create mode 100644 helm/charts/idp-api/templates/_helpers.tpl
 create mode 100644 helm/charts/idp-api/templates/configmap.yaml
 create mode 100644 helm/charts/idp-api/templates/deployment.yaml
 create mode 100644 helm/charts/idp-api/templates/hpa.yaml
 create mode 100644 helm/charts/idp-api/templates/ingress.yaml
 create mode 100644 helm/charts/idp-api/templates/secret.yaml
 create mode 100644 helm/charts/idp-api/templates/service.yaml
 create mode 100644 helm/charts/idp-api/templates/serviceaccount.yaml
 create mode 100644 helm/charts/idp-api/values.yaml
 create mode 100644 kubernetes/client.py
 create mode 100644 kubernetes/network-policy.yaml
 create mode 100644 kubernetes/rbac.yaml
 create mode 100644 monitoring/grafana-dashboard.json
 create mode 100644 monitoring/prometheus-scrape-config.yaml
 create mode 100644 requirements.txt
 create mode 100644 scripts/bootstrap.sh
 create mode 100644 scripts/migrate_db.sh
 create mode 100644 scripts/prod_checklist.md
 create mode 100644 scripts/setup_env.sh
 create mode 100644 services/autoscaling_service.py
 create mode 100644 services/deployment_service.py
 create mode 100644 services/infra_service.py
 create mode 100644 services/ingress_service.py
 create mode 100644 services/k8s_service.py
 create mode 100644 services/kubernetes_client.py
 create mode 100644 services/monitoring_service.py
 create mode 100644 services/service_service.py
 create mode 100644 setup.cfg
 create mode 100644 terraform/main.tf.j2
 create mode 100644 tests/conftest.py
 create mode 100644 tests/unit/test_auth.py
 create mode 100644 tests/unit/test_health.py
 create mode 100644 tests/unit/test_namespace.py

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..f820780
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,15 @@
+ENVIRONMENT=local
+APP_DEBUG=true
+DATABASE_URL=sqlite:///./idp.db
+REDIS_URL=redis://localhost:6379/0
+SECRET_KEY=replace-with-a-long-random-secret
+JWT_ALGORITHM=HS256
+ACCESS_TOKEN_EXPIRE_MINUTES=60
+AWS_REGION=us-east-1
+DEFAULT_INGRESS_DOMAIN=apps.local
+KUBERNETES_NAMESPACE_PREFIX=tenant
+KUBERNETES_DRY_RUN=true
+TERRAFORM_DRY_RUN=true
+TERRAFORM_STATE_BUCKET=replace-me-terraform-state
+TERRAFORM_LOCK_TABLE=replace-me-terraform-locks
+RATE_LIMIT_REQUESTS_PER_HOUR=100
diff --git a/.github/workflows/ci-cd.yaml b/.github/workflows/ci-cd.yaml
new file mode 100644
index 0000000..2394d86
--- /dev/null
+++ b/.github/workflows/ci-cd.yaml
@@ -0,0 +1,41 @@
+name: CI/CD Pipeline
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    env:
+      DATABASE_URL: sqlite:///./test-idp.db
+      SECRET_KEY: ci-test-secret
+      KUBERNETES_DRY_RUN: "true"
+      TERRAFORM_DRY_RUN: "true"
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Lint
+        run: |
+          pip install flake8
+          flake8 app api auth database services tests --max-line-length=120
+      - name: Run tests
+        run: |
+          pytest
+      - name: Build Docker image
+        run: |
+          docker build -t idp-api:${{ github.sha }} .
+      - name: Push Docker image (optional)
+        if: github.ref == 'refs/heads/main'
+        run: echo "Configure registry login and docker push here"
+      - name: Deploy to Kubernetes (optional)
+        run: echo "Deploy with Helm or ArgoCD after registry configuration"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6b5af3f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.venv/
+__pycache__/
+*.py[cod]
+*.db
+.pytest_cache/
+.env
+.history/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..8596f91
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.11-slim
+
+# Create non-root user
+RUN useradd -m appuser
+WORKDIR /app
+
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt \
+	&& rm -rf /root/.cache
+
+COPY . .
+
+# Give the non-root user ownership of the application files
+RUN chown -R appuser:appuser /app
+USER appuser
+
+ENV PYTHONUNBUFFERED=1
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
index b07f75b..72b9911 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,225 @@
-# Single Application Cluster Setup in Kubernetes
+# Cloud Infrastructure Provisioning API
 
-This guide outlines the process for deploying a single application within a Kubernetes cluster, from cluster creation to application access.
+This project is an Internal Developer Platform API built on FastAPI, Kubernetes, Terraform, PostgreSQL/SQLite, Redis, Helm, Prometheus, Grafana, and JWT authentication. It extends the original Kubernetes manifests into a mini Heroku/Render/Railway-style platform for provisioning infrastructure and deploying containerized applications through REST APIs.
 
+## Architecture
 
-## Step 1: Write a Kubernetes Deployment YAML File
+The API receives authenticated platform requests, validates input, stores metadata in the database, and orchestrates Kubernetes or Terraform operations through service-layer modules.
 
-Define the application's deployment on Kubernetes:
+Request flow for application deployment:
 
-- Draft a YAML file (`deployment-file.yaml`) that specifies the deployment details, including the Docker image, ports, environment variables, and other configurations.
+1. User authenticates with JWT.
+2. The API validates the Docker image, namespace, port, replica count, ingress host, and autoscaling inputs.
+3. A deployment row is created in the database.
+4. Kubernetes service layer creates namespace, Deployment, Service, Ingress, and HPA.
+5. Deployment status, URL, autoscaling settings, and errors are persisted.
+6. Users query deployment status, logs, metrics, and cluster health through API endpoints.
 
-## Step 2: Create a Kubernetes Deployment
+## Project Structure
 
-Deploy the application using `kubectl`:
+```text
+app/              FastAPI app, configuration, logging
+api/              Route handlers and Pydantic schemas
+auth/             JWT, RBAC, rate limiting
+database/         SQLAlchemy models and session lifecycle
+services/         Kubernetes, Terraform, deployment, monitoring logic
+kubernetes/       Cluster RBAC and network policy examples
+terraform/        AWS Terraform templates
+helm/             Helm chart for the API itself
+monitoring/       Prometheus and Grafana examples
+scripts/          Bootstrap, migration, production checklist helpers
+tests/            Unit tests
+```
 
-- Run `kubectl apply -f <deployment-file.yaml>` to deploy the application, replacing `<deployment-file.yaml>` with the path to the YAML file.
+## API Endpoints
 
-## Step 3: Expose The Deployment as a Service
+Auth:
 
-Make the deployment accessible:
+- `POST /auth/register`
+- `POST /auth/login`
+- `GET /auth/me`
 
-- Expose the deployment by executing:
+Infrastructure:
 
-Replace `<deployment-name>` and `<container-port>` with specific details to create a LoadBalancer service for external traffic.
+- `POST /infrastructure/create`
+- `GET /infrastructure/{id}`
+- `DELETE /infrastructure/{id}`
 
-## Step 4: Verify the Deployment
+Deployments:
 
-Check the status of the deployment:
+- `POST /deployments`
+- `GET /deployments/{id}`
+- `DELETE /deployments/{id}`
 
-- Confirm the application's pod is running with `kubectl get pods`.
-- Check the service creation and the external IP assignment with `kubectl get services`.
+Kubernetes:
 
-## Step 5: Access The Application
+- `POST /namespace/create`
+- `POST /service/expose`
+- `POST /autoscaling/create`
+- `POST /kubernetes/ingress/create`
 
-Visit the deployed application:
+Monitoring:
 
-- Use a web browser to navigate to the external IP address assigned to the service and access the application.
+- `GET /cluster/health`
+- `GET /metrics`
+- `GET /logs/{pod}?namespace=default`
 
-Following these steps will get the application deployed and accessible within a Kubernetes cluster.
+Swagger/OpenAPI is available at `/docs`.
 
+## Local Development
 
-<img width="1164" alt="Screenshot 2023-04-02 at 20 16 24" src="https://user-images.githubusercontent.com/73405591/229371245-1692b2ac-d4c5-456b-b983-6fa1bfd115ab.png">
+Create an environment file:
+
+```bash
+cp .env.example .env
+```
+
+For local development without a Kubernetes cluster or Terraform credentials, keep:
+
+```bash
+KUBERNETES_DRY_RUN=true
+TERRAFORM_DRY_RUN=true
+DATABASE_URL=sqlite:///./idp.db
+```
+
+Install dependencies and run the API:
+
+```bash
+python3 -m venv .venv
+./.venv/bin/pip install -r requirements.txt
+./.venv/bin/uvicorn app.main:app --reload
+```
+
+Register and log in:
+
+```bash
+curl -X POST http://localhost:8000/auth/register \
+  -H "Content-Type: application/json" \
+  -d '{"username":"platform-user","password":"change-me-123"}'
+
+TOKEN=$(curl -s -X POST http://localhost:8000/auth/login \
+  -H "Content-Type: application/json" \
+  -d '{"username":"platform-user","password":"change-me-123"}' | jq -r .access_token)
+```
+
+Deploy an application:
+
+```bash
+curl -X POST http://localhost:8000/deployments \
+  -H "Authorization: Bearer $TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "name": "demo-api",
+    "image": "nginx:1.25",
+    "port": 80,
+    "replicas": 2,
+    "min_replicas": 1,
+    "max_replicas": 5,
+    "cpu_threshold": 70
+  }'
+```
+
+## Terraform
+
+The infrastructure API renders [terraform/main.tf.j2](terraform/main.tf.j2) and can create or destroy AWS resources. It is dry-run by default. Before enabling real execution:
+
+- Create an encrypted S3 backend bucket.
+- Create a DynamoDB lock table.
+- Replace `TERRAFORM_STATE_BUCKET` and `TERRAFORM_LOCK_TABLE`.
+- Use IAM roles with least privilege.
+- Review generated plans before production use.
+
+Example:
+
+```bash
+curl -X POST http://localhost:8000/infrastructure/create \
+  -H "Authorization: Bearer $TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "name": "platform-dev",
+    "cloud_provider": "aws",
+    "config": {
+      "aws_region": "us-east-1",
+      "eks_role_arn": "arn:aws:iam::123456789012:role/EKSRole"
+    }
+  }'
+```
+
+## Helm Deployment
+
+Render or install the API chart:
+
+```bash
+helm template idp-api helm/charts/idp-api \
+  --set secrets.databaseUrl='postgresql://user:pass@postgres:5432/idp' \
+  --set secrets.secretKey='replace-with-long-random-secret'
+
+helm upgrade --install idp-api helm/charts/idp-api \
+  --set image.repository=registry.example.com/idp-api \
+  --set image.tag=v1 \
+  --set secrets.databaseUrl='postgresql://user:pass@postgres:5432/idp' \
+  --set secrets.secretKey='replace-with-long-random-secret'
+```
+
+## Security
+
+Implemented:
+
+- JWT authentication
+- Role-aware user model
+- Protected infrastructure, deployment, Kubernetes, and monitoring APIs
+- Redis-backed rate limiting with local fallback
+- Non-root Docker container
+- Kubernetes RBAC and network-policy examples
+- No production secrets hardcoded in the Helm chart; they are supplied at install time
+
+Recommended before production:
+
+- Use AWS Secrets Manager, External Secrets Operator, or sealed-secrets.
+- Replace SQLite with managed PostgreSQL.
+- Use Alembic migrations.
+- Run Terraform through a job queue or workflow engine rather than synchronous HTTP requests.
+- Enforce tenant-aware namespace ownership.
+- Add admission policies with Kyverno or OPA Gatekeeper.
+- Use image allowlists and vulnerability scanning.
+- Require immutable image digests for production deployments.
+
+## Observability
+
+The API exposes Prometheus metrics at `/metrics`. Example scrape configuration and a Grafana dashboard starter live in `monitoring/`.
+
+Recommended production stack:
+
+- Prometheus Operator
+- Grafana dashboards for API latency, error rate, Kubernetes deployment state, and Terraform failures
+- Loki or OpenSearch for structured logs
+- Alertmanager alerts for failed provisions, high error rate, and unhealthy clusters
+
+## CI/CD
+
+The GitHub Actions workflow installs dependencies, runs linting/tests, and builds the Docker image. Registry push and Kubernetes deployment are intentionally placeholders until registry, cluster, and secret strategy are configured.
+
+## Implementation Phases
+
+Phase 1: Architecture and folder structure are represented by the layered app layout.
+
+Phase 2: FastAPI backend includes auth, validation, database models, OpenAPI, health checks, and rate limiting.
+
+Phase 3: Kubernetes integration creates namespaces, deployments, services, ingress, HPA, status, logs, and safe deletes.
+
+Phase 4: Terraform automation renders AWS templates and supports apply/destroy with remote-state configuration.
+
+Phase 5: Monitoring exposes Prometheus metrics, cluster health, pod logs, and dashboard examples.
+
+Phase 6: CI/CD builds, lints, tests, and prepares image/deployment stages.
+
+Phase 7: Production hardening is documented in `scripts/prod_checklist.md` and should be completed before real cloud use.
+
+## Scaling Recommendations
+
+- Move long-running deploy/provision tasks to Celery, RQ, Temporal, or Argo Workflows.
+- Add per-tenant quotas for namespaces, replicas, CPU, memory, and load balancers.
+- Use GitOps with ArgoCD for reconciliation and auditability.
+- Split API, worker, scheduler, and webhook receiver into separate deployments.
+- Use PostgreSQL row-level ownership checks and explicit tenant IDs.
+- Add blue/green and canary deployment strategies with Argo Rollouts or Flagger.
diff --git a/api/routes/auth.py b/api/routes/auth.py
new file mode 100644
index 0000000..55f9143
--- /dev/null
+++ b/api/routes/auth.py
@@ -0,0 +1,33 @@
+from fastapi import APIRouter, HTTPException, Depends
+from sqlalchemy.orm import Session
+from database.models import User
+from auth.jwt_utils import get_password_hash, verify_password, create_access_token
+from auth.rbac import get_current_user
+from database.session import get_db
+from api.schemas import LoginRequest, RegisterRequest, TokenResponse
+
+router = APIRouter()
+
+@router.post("/register")
+def register(request: RegisterRequest, db: Session = Depends(get_db)):
+    """Create a user with a hashed password; respond 400 if the username is already taken."""
+    # Uniqueness is checked up front so a duplicate yields a clean 400 response.
+    if db.query(User).filter(User.username == request.username).first():
+        raise HTTPException(status_code=400, detail="Username already registered")
+    account = User(username=request.username, hashed_password=get_password_hash(request.password))
+    db.add(account)
+    db.commit()
+    db.refresh(account)
+    return {"id": account.id, "username": account.username, "role": account.role}
+
+@router.post("/login", response_model=TokenResponse)
+def login(request: LoginRequest, db: Session = Depends(get_db)):
+    """Return a bearer access token for valid credentials; respond 401 otherwise."""
+    acct = db.query(User).filter(User.username == request.username).first()
+    if not (acct and verify_password(request.password, acct.hashed_password)):
+        raise HTTPException(status_code=401, detail="Invalid credentials")
+    return {"access_token": create_access_token({"sub": str(acct.id), "role": acct.role}), "token_type": "bearer"}
+
+@router.get("/me")
+def get_me(current_user=Depends(get_current_user)):  # Echo the authenticated user's id, username and role.
+    return {field: getattr(current_user, field) for field in ("id", "username", "role")}
diff --git a/api/routes/deployments.py b/api/routes/deployments.py
new file mode 100644
index 0000000..5fe5917
--- /dev/null
+++ b/api/routes/deployments.py
@@ -0,0 +1,59 @@
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+
+from api.schemas import DeploymentCreateRequest, DeploymentResponse
+from auth.rbac import get_current_user
+from database.models import Deployment, User
+from database.session import get_db
+from services.deployment_service import (
+    delete_kubernetes_deployment,
+    get_kubernetes_deployment_status,
+    provision_application,
+)
+
+router = APIRouter()
+
+@router.post("", response_model=DeploymentResponse, status_code=201)
+def create_deployment_route(
+    request: DeploymentCreateRequest, db: Session = Depends(get_db), current_user: User = Depends(get_current_user)
+):
+    """Hand the request to provision_application for the authenticated user and return its result."""
+    record = provision_application(db, current_user, request)
+    return record
+
+@router.get("/{id}", response_model=DeploymentResponse)
+def get_deployment(
+    id: int,
+    db: Session = Depends(get_db),
+    current_user: User = Depends(get_current_user),
+):
+    """Return the caller's deployment; 404 when absent. Running ones get a live status refresh."""
+    deployment = db.query(Deployment).filter(Deployment.id == id, Deployment.owner_id == current_user.id).first()
+    if not deployment:
+        raise HTTPException(status_code=404, detail="Deployment not found")
+    if deployment.status == "running":
+        try:
+            live_status = get_kubernetes_deployment_status(deployment.namespace, deployment.name)
+            deployment.metadata_json = {**(deployment.metadata_json or {}), "kubernetes_status": live_status}
+            db.commit()
+            db.refresh(deployment)
+        except Exception as exc:
+            # Best effort: reset the session so the row stays readable, then report the error unsaved.
+            db.rollback()
+            deployment.metadata_json = {**(deployment.metadata_json or {}), "status_error": str(exc)}
+    return deployment
+
+@router.delete("/{id}")
+def delete_deployment(
+    id: int, db: Session = Depends(get_db), current_user: User = Depends(get_current_user)
+):
+    """Delete the caller's deployment via the service layer and mark its row "deleted"; 404 when absent."""
+    target = db.query(Deployment).filter(Deployment.id == id, Deployment.owner_id == current_user.id).first()
+    if not target:
+        raise HTTPException(status_code=404, detail="Deployment not found")
+    delete_kubernetes_deployment(target.namespace, target.name)
+    # Soft delete: the row stays in the database and only its status changes.
+    target.status = "deleted"
+    db.add(target)
+    db.commit()
+    return {"id": id, "status": "deleted"}
diff --git a/api/routes/infrastructure.py b/api/routes/infrastructure.py
new file mode 100644
index 0000000..b367be2
--- /dev/null
+++ b/api/routes/infrastructure.py
@@ -0,0 +1,69 @@
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+
+from api.schemas import InfrastructureCreateRequest, InfrastructureResponse
+from auth.rbac import get_current_user
+from database.models import Infrastructure, User
+from database.session import get_db
+from services.infra_service import provision_infrastructure, destroy_infrastructure
+
+router = APIRouter()
+
+@router.post("/create", response_model=InfrastructureResponse, status_code=201)
+def create_infrastructure(
+    request: InfrastructureCreateRequest,
+    db: Session = Depends(get_db),
+    current_user: User = Depends(get_current_user),
+):
+    """Record the request, provision it synchronously, and return the stored row.
+
+    The returned row has status "ready", or "failed" with ``last_error`` populated.
+    """
+    fields = {"name": request.name, "cloud_provider": request.cloud_provider, "config": request.config}
+    record = Infrastructure(owner_id=current_user.id, status="provisioning", **fields)
+    db.add(record)
+    db.commit()
+    db.refresh(record)
+    outcome = provision_infrastructure(request.name, request.cloud_provider, request.config)
+    # Only the literal True counts as success; any other return value is stored as the error text.
+    if outcome is not True:
+        record.status = "failed"
+        record.last_error = str(outcome)
+    else:
+        record.status = "ready"
+    db.add(record)
+    db.commit()
+    db.refresh(record)
+    return record
+
+@router.delete("/{id}")
+def delete_infrastructure(
+    id: int,
+    db: Session = Depends(get_db),
+    current_user: User = Depends(get_current_user),
+):
+    """Destroy the caller's infrastructure; 404 when absent, 500 (status "delete_failed") on failure."""
+    row = db.query(Infrastructure).filter(Infrastructure.id == id, Infrastructure.owner_id == current_user.id).first()
+    if not row:
+        raise HTTPException(status_code=404, detail="Infrastructure not found")
+    outcome = destroy_infrastructure(row.name, row.cloud_provider, row.config or {})
+    succeeded = outcome is True
+    row.status = "deleted" if succeeded else "delete_failed"
+    if not succeeded:
+        row.last_error = str(outcome)
+    db.add(row)
+    db.commit()
+    if not succeeded:
+        raise HTTPException(status_code=500, detail=str(outcome))
+    return {"id": id, "status": "deleted"}
+
+@router.get("/{id}", response_model=InfrastructureResponse)
+def get_infrastructure(
+    id: int, db: Session = Depends(get_db), current_user: User = Depends(get_current_user)
+):
+    """Return the infrastructure row with this id if the caller owns it; otherwise respond 404."""
+    mine = db.query(Infrastructure).filter(Infrastructure.owner_id == current_user.id)
+    row = mine.filter(Infrastructure.id == id).first()
+    if row:
+        return row
+    raise HTTPException(status_code=404, detail="Infrastructure not found")
diff --git a/api/routes/kubernetes.py b/api/routes/kubernetes.py
new file mode 100644
index 0000000..203b809
--- /dev/null
+++ b/api/routes/kubernetes.py
@@ -0,0 +1,59 @@
+from fastapi import APIRouter, Depends, HTTPException
+
+from api.schemas import AutoscalingRequest, IngressRequest, NamespaceRequest, ServiceExposeRequest
+from auth.rbac import get_current_user
+from services.autoscaling_service import create_hpa
+from services.ingress_service import create_ingress
+from services.k8s_service import create_namespace
+from services.service_service import expose_service
+
+router = APIRouter()
+
+@router.post("/namespace/create")
+def create_namespace_route(request: NamespaceRequest, current_user=Depends(get_current_user)):
+    """Create the requested namespace; respond 500 when the service layer reports failure."""
+    if not create_namespace(request.name):
+        raise HTTPException(status_code=500, detail="Failed to create namespace.")
+    return {"msg": f"Namespace '{request.name}' created."}
+
+@router.post("/service/expose")
+def expose_service_route(request: ServiceExposeRequest, current_user=Depends(get_current_user)):
+    """Expose a service via the service layer; respond 500 when it reports failure."""
+    exposed = expose_service(
+        namespace=request.namespace,
+        name=request.name,
+        port=request.port, target_port=request.target_port,
+        type_=request.type,  # schema field ``type`` maps onto the ``type_`` keyword
+    )
+    if not exposed:
+        raise HTTPException(status_code=500, detail="Failed to expose service.")
+    return {"msg": f"Service '{request.name}' exposed in namespace '{request.namespace}'."}
+
+@router.post("/autoscaling/create")
+def create_autoscaling_route(request: AutoscalingRequest, current_user=Depends(get_current_user)):
+    """Configure an HPA; respond 422 for an inverted replica range and 500 when creation fails."""
+    if request.max_replicas < request.min_replicas:
+        raise HTTPException(status_code=422, detail="max_replicas must be greater than or equal to min_replicas")
+    success = create_hpa(
+        namespace=request.namespace, deployment=request.deployment, min_replicas=request.min_replicas,
+        max_replicas=request.max_replicas, cpu_threshold=request.cpu_threshold,
+    )
+    if success:
+        return {"msg": f"Autoscaling for {request.namespace}/{request.deployment} configured."}
+    raise HTTPException(status_code=500, detail="Failed to configure autoscaling.")
+
+
+@router.post("/ingress/create")
+def create_ingress_route(request: IngressRequest, current_user=Depends(get_current_user)):
+    """Create an ingress via the service layer; respond 500 when it reports failure."""
+    created = create_ingress(
+        namespace=request.namespace,
+        name=request.name,
+        service_name=request.service_name,
+        service_port=request.service_port,
+        host=request.host,
+    )
+    if not created:
+        raise HTTPException(status_code=500, detail="Failed to create ingress.")
+    summary = f"Ingress '{request.name}' created for service '{request.service_name}' on host '{request.host}'."
+    return {"msg": summary}
diff --git a/api/routes/monitoring.py b/api/routes/monitoring.py
new file mode 100644
index 0000000..ca2b4ed
--- /dev/null
+++ b/api/routes/monitoring.py
@@ -0,0 +1,19 @@
+from fastapi import APIRouter, Depends, Query
+from services.monitoring_service import get_cluster_health, get_pod_logs
+from fastapi import Response
+from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
+from auth.rbac import get_current_user
+
+router = APIRouter()
+
+@router.get("/cluster/health")
+def cluster_health(current_user=Depends(get_current_user)):  # Authenticated pass-through to get_cluster_health.
+    return get_cluster_health()
+
+@router.get("/metrics")
+def metrics(current_user=Depends(get_current_user)):  # Prometheus exposition; callers must authenticate.
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
+
+@router.get("/logs/{pod}")
+def logs(pod: str, namespace: str = Query("default"), current_user=Depends(get_current_user)):  # Pod log lookup.
+    return get_pod_logs(namespace, pod)
diff --git a/api/schemas.py b/api/schemas.py
new file mode 100644
index 0000000..882e19c
--- /dev/null
+++ b/api/schemas.py
@@ -0,0 +1,105 @@
+from datetime import datetime
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field, validator
+
+
+class TokenResponse(BaseModel):  # Response body of the login route.
+    access_token: str
+    token_type: str = "bearer"
+
+
+class RegisterRequest(BaseModel):  # Registration payload; both fields are length-checked.
+    username: str = Field(..., min_length=3, max_length=64)
+    password: str = Field(..., min_length=8, max_length=128)
+
+
+class LoginRequest(BaseModel):  # Login payload; no length limits are applied here.
+    username: str
+    password: str
+
+
+class InfrastructureCreateRequest(BaseModel):  # Payload for the infrastructure creation route.
+    name: str = Field(..., min_length=3, max_length=63, regex=r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$")
+    cloud_provider: str = Field(default="aws", regex=r"^aws$")  # "aws" is the only accepted value
+    config: Dict[str, Any] = Field(default_factory=dict)  # free-form; not validated by this schema
+
+
+class InfrastructureResponse(BaseModel):  # Serialized view of an Infrastructure row.
+    id: int
+    name: str
+    cloud_provider: str
+    status: str  # set by the routes: "provisioning", "ready", "failed", "deleted", "delete_failed"
+    config: Dict[str, Any]
+    last_error: Optional[str] = None  # stringified provision/destroy result on failure
+    created_at: datetime
+
+    class Config:
+        orm_mode = True  # allow building the response directly from the ORM object
+
+
+class DeploymentCreateRequest(BaseModel):
+    """Payload for deploying a container image, including replica and autoscaling bounds."""
+    image: str = Field(..., min_length=3, max_length=255)
+    name: str = Field(..., min_length=3, max_length=63, regex=r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$")
+    namespace: Optional[str] = Field(default=None, max_length=63)
+    port: int = Field(default=80, ge=1, le=65535)
+    replicas: int = Field(default=1, ge=1, le=20)
+    min_replicas: int = Field(default=1, ge=1, le=20)
+    max_replicas: int = Field(default=3, ge=1, le=100)
+    cpu_threshold: int = Field(default=70, ge=10, le=95)
+    ingress_host: Optional[str] = Field(default=None, max_length=253)
+    env: Dict[str, str] = Field(default_factory=dict)
+
+    @validator("max_replicas")
+    def max_gte_min(cls, value, values):
+        if value >= values.get("min_replicas", 1):  # key is absent when min_replicas itself failed validation
+            return value
+        raise ValueError("max_replicas must be greater than or equal to min_replicas")
+
+
+class DeploymentResponse(BaseModel):  # Serialized view of a Deployment row.
+    id: int
+    owner_id: int
+    name: str
+    namespace: str
+    image: str
+    port: int
+    replicas: int
+    ingress_host: Optional[str] = None
+    url: Optional[str] = None
+    status: str
+    metadata_json: Dict[str, Any] = {}  # NOTE(review): None from the ORM would fail validation; confirm
+    last_error: Optional[str] = None
+    created_at: datetime
+
+    class Config:
+        orm_mode = True  # allow building the response directly from the ORM object
+
+
+class NamespaceRequest(BaseModel):  # Payload for the namespace creation route.
+    name: str = Field(..., min_length=3, max_length=63, regex=r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$")
+
+
+class ServiceExposeRequest(BaseModel):  # Payload for the service expose route.
+    namespace: str  # not pattern-checked, unlike NamespaceRequest.name
+    name: str
+    port: int = Field(..., ge=1, le=65535)
+    target_port: int = Field(..., ge=1, le=65535)
+    type: str = Field(default="ClusterIP", regex=r"^(ClusterIP|NodePort|LoadBalancer)$")
+
+
+class IngressRequest(BaseModel):  # Payload for the ingress creation route.
+    namespace: str
+    name: str
+    service_name: str
+    service_port: int = Field(..., ge=1, le=65535)
+    host: str  # passed through as-is; no hostname validation in this schema
+
+
+class AutoscalingRequest(BaseModel):  # Payload for the autoscaling creation route.
+    namespace: str = "default"
+    deployment: str
+    min_replicas: int = Field(..., ge=1)
+    max_replicas: int = Field(..., ge=1)  # NOTE(review): not compared with min_replicas in this schema
+    cpu_threshold: int = Field(..., ge=10, le=95)
diff --git a/app/config.py b/app/config.py
new file mode 100644
index 0000000..b531bb3
--- /dev/null
+++ b/app/config.py
@@ -0,0 +1,54 @@
+"""Application settings loaded from environment variables and an optional .env file."""
+from functools import lru_cache
+from pydantic import BaseSettings, Field, validator
+
+# Placeholder signing key shipped as the default; tokens signed with it are forgeable.
+_PLACEHOLDER_SECRET_KEY = "change-me-in-production"
+# Environments in which running with the placeholder key is tolerated.
+_PLACEHOLDER_KEY_ENVIRONMENTS = frozenset({"local", "dev", "development", "test", "testing", "ci"})
+
+
+class Settings(BaseSettings):
+    """Typed runtime configuration; environment variable names are case-sensitive."""
+
+    PROJECT_NAME: str = "Cloud Infrastructure Provisioning API"
+    VERSION: str = "1.0.0"
+    ENVIRONMENT: str = Field(default="local", env="ENVIRONMENT")
+    DEBUG: bool = Field(default=True, env="APP_DEBUG")
+    DATABASE_URL: str = Field(default="sqlite:///./idp.db", env="DATABASE_URL")
+    REDIS_URL: str = Field(default="redis://localhost:6379/0", env="REDIS_URL")
+    SECRET_KEY: str = Field(default=_PLACEHOLDER_SECRET_KEY, env="SECRET_KEY")
+    JWT_ALGORITHM: str = "HS256"
+    ACCESS_TOKEN_EXPIRE_MINUTES: int = Field(default=60, env="ACCESS_TOKEN_EXPIRE_MINUTES")
+    AWS_REGION: str = Field(default="us-east-1", env="AWS_REGION")
+    DEFAULT_INGRESS_DOMAIN: str = Field(default="apps.local", env="DEFAULT_INGRESS_DOMAIN")
+    KUBERNETES_NAMESPACE_PREFIX: str = Field(default="tenant", env="KUBERNETES_NAMESPACE_PREFIX")
+    KUBERNETES_DRY_RUN: bool = Field(default=False, env="KUBERNETES_DRY_RUN")
+    TERRAFORM_DRY_RUN: bool = Field(default=True, env="TERRAFORM_DRY_RUN")
+    TERRAFORM_STATE_BUCKET: str = Field(default="replace-me-terraform-state", env="TERRAFORM_STATE_BUCKET")
+    TERRAFORM_LOCK_TABLE: str = Field(default="replace-me-terraform-locks", env="TERRAFORM_LOCK_TABLE")
+    RATE_LIMIT_REQUESTS_PER_HOUR: int = Field(default=100, env="RATE_LIMIT_REQUESTS_PER_HOUR")
+    REQUIRE_AUTH_FOR_PLATFORM_APIS: bool = Field(default=True, env="REQUIRE_AUTH_FOR_PLATFORM_APIS")
+
+    @validator("SECRET_KEY", always=True)
+    def secret_key_must_be_overridden(cls, value, values):
+        """Reject the placeholder JWT signing key outside local/dev/test environments."""
+        # always=True: the check must also run when the default value is used.
+        # ENVIRONMENT is declared above SECRET_KEY, so it is already in values.
+        environment = str(values.get("ENVIRONMENT", "local")).lower()
+        if value == _PLACEHOLDER_SECRET_KEY and environment not in _PLACEHOLDER_KEY_ENVIRONMENTS:
+            raise ValueError("SECRET_KEY must be set to a private value outside local/test environments")
+        return value
+
+    class Config:
+        env_file = ".env"
+        case_sensitive = True
+
+
+@lru_cache()
+def get_settings() -> Settings:
+    """Return the Settings instance, built on first call and cached afterwards."""
+    return Settings()
+
+
+settings = get_settings()
diff --git a/app/logger.py b/app/logger.py
new file mode 100644
index 0000000..8861b01
--- /dev/null
+++ b/app/logger.py
@@ -0,0 +1,29 @@
+"""Logging setup: one JSON object per line on stdout."""
+import json
+import logging
+import sys
+
+
+class JsonFormatter(logging.Formatter):
+    """Render log records as JSON with time, level, logger and message keys."""
+
+    def format(self, record: logging.LogRecord) -> str:
+        """Return the record serialised as a single-line JSON object."""
+        payload = {
+            "time": self.formatTime(record, self.datefmt),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+        if record.exc_info:
+            payload["exception"] = self.formatException(record.exc_info)
+        # json.dumps escapes quotes, backslashes and newlines; interpolating the
+        # message into a JSON-looking format string would emit them verbatim.
+        return json.dumps(payload, separators=(",", ":"))
+
+
+def setup_logging():
+    """Configure the root logger to emit INFO and above as JSON on stdout."""
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(JsonFormatter())
+    logging.basicConfig(level=logging.INFO, handlers=[handler])
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..e0da8b9
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,54 @@
+"""FastAPI entry point: builds the app, mounts the routers and exposes probe endpoints."""
+from fastapi import Depends, FastAPI
+from api.routes import auth, infrastructure, deployments, kubernetes, monitoring
+from auth.rate_limit import rate_limiter
+from app.config import settings
+from app.logger import setup_logging
+from database.session import init_db
+
+setup_logging()
+
+_DESCRIPTION = (
+    "Internal Developer Platform API for infrastructure provisioning "
+    "and Kubernetes application deployment."
+)
+
+app = FastAPI(title=settings.PROJECT_NAME, version=settings.VERSION, description=_DESCRIPTION)
+
+
+@app.on_event("startup")
+def on_startup():
+    """Run database initialisation when the application starts."""
+    init_db()
+
+
+# (router, URL prefix, OpenAPI tag) in registration order. The kubernetes and
+# monitoring routers are mounted twice: under their prefix and at the root.
+_ROUTER_MOUNTS = (
+    (auth.router, "/auth", "auth"),
+    (infrastructure.router, "/infrastructure", "infrastructure"),
+    (deployments.router, "/deployments", "deployments"),
+    (kubernetes.router, "/kubernetes", "kubernetes"),
+    (monitoring.router, "/monitoring", "monitoring"),
+    (kubernetes.router, "", "kubernetes"),
+    (monitoring.router, "", "monitoring"),
+)
+
+# Every mounted route carries the rate-limiter dependency.
+for _router, _prefix, _tag in _ROUTER_MOUNTS:
+    app.include_router(_router, prefix=_prefix, tags=[_tag], dependencies=[Depends(rate_limiter)])
+
+
+@app.get("/healthz")
+def health_check():
+    """Health endpoint: returns a static OK payload."""
+    return {"status": "ok"}
+
+
+@app.get("/readyz")
+def readiness_check():
+    """Readiness endpoint: reports the environment and the dry-run switches."""
+    report = {"status": "ready", "environment": settings.ENVIRONMENT}
+    report["kubernetes_dry_run"] = settings.KUBERNETES_DRY_RUN
+    report["terraform_dry_run"] = settings.TERRAFORM_DRY_RUN
+    return report
diff --git a/auth/jwt_utils.py b/auth/jwt_utils.py
new file mode 100644
index 0000000..41e838e
--- /dev/null
+++ b/auth/jwt_utils.py
@@ -0,0 +1,42 @@
+"""Password hashing and JWT access-token helpers."""
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+from jose import JWTError, jwt
+from passlib.context import CryptContext
+from app.config import settings
+
+pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
+
+SECRET_KEY = settings.SECRET_KEY
+ALGORITHM = settings.JWT_ALGORITHM
+ACCESS_TOKEN_EXPIRE_MINUTES = settings.ACCESS_TOKEN_EXPIRE_MINUTES
+
+def verify_password(plain_password, hashed_password):
+    """Return whether plain_password matches hashed_password."""
+    return pwd_context.verify(plain_password, hashed_password)
+
+def get_password_hash(password):
+    """Hash password with the module's passlib context."""
+    return pwd_context.hash(password)
+
+def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
+    """Return a signed JWT carrying data plus an "exp" claim.
+
+    Args:
+        data: Claims to embed; the dict is copied, not mutated.
+        expires_delta: Token lifetime. None selects ACCESS_TOKEN_EXPIRE_MINUTES.
+    """
+    # Test against None explicitly: timedelta(0) is falsy, so "or" would
+    # silently replace a requested zero lifetime with the default one.
+    if expires_delta is None:
+        expires_delta = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
+    to_encode = data.copy()
+    to_encode["exp"] = datetime.now(timezone.utc) + expires_delta
+    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
+
+def decode_access_token(token: str):
+    """Return the token's claims, or None if decoding raises JWTError."""
+    try:
+        return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+    except JWTError:
+        return None
diff --git a/auth/rate_limit.py b/auth/rate_limit.py
new file mode 100644
index 0000000..61dd7ac
--- /dev/null
+++ b/auth/rate_limit.py
@@ -0,0 +1,48 @@
+"""Redis-backed request rate limiting keyed by client IP."""
+import logging
+
+import redis
+from fastapi import Request, HTTPException
+from app.config import settings
+
+logger = logging.getLogger(__name__)
+
+# Length of one counting window; the limit setting is expressed per hour.
+RATE_LIMIT_WINDOW_SECONDS = 3600
+
+try:
+    r = redis.Redis.from_url(settings.REDIS_URL, socket_connect_timeout=0.2, socket_timeout=0.2)
+except Exception:
+    r = None
+
+
+def rate_limiter(request: Request):
+    """FastAPI dependency enforcing RATE_LIMIT_REQUESTS_PER_HOUR per client IP.
+
+    The limiter fails open: if Redis cannot be used the request is allowed.
+
+    Raises:
+        HTTPException: 429 once the client exceeds the limit within a window.
+    """
+    if r is None:
+        return
+    # request.client is None when the server provides no peer address.
+    ip = request.client.host if request.client else "unknown"
+    key = f"rate_limit:{ip}"
+    try:
+        # INCR and TTL run in one transaction so concurrent requests cannot
+        # both read a stale count and slip past the limit.
+        pipe = r.pipeline()
+        pipe.incr(key, 1)
+        pipe.ttl(key)
+        count, ttl = pipe.execute()
+        # Arm the expiry only when the key has none. Refreshing it on every
+        # request would never let an active client's counter reset.
+        if ttl is None or ttl < 0:
+            r.expire(key, RATE_LIMIT_WINDOW_SECONDS)
+    except Exception:
+        # Keep local development and CI usable if Redis is not available.
+        logger.debug("Rate limiting skipped: Redis unavailable", exc_info=True)
+        return
+    if int(count) > settings.RATE_LIMIT_REQUESTS_PER_HOUR:
+        raise HTTPException(status_code=429, detail="Rate limit exceeded")
diff --git a/auth/rbac.py b/auth/rbac.py
new file mode 100644
index 0000000..0ef0ed5
--- /dev/null
+++ b/auth/rbac.py
@@ -0,0 +1,36 @@
+"""Authentication and role-check dependencies for FastAPI routes."""
+from fastapi import Depends, HTTPException
+from fastapi.security import OAuth2PasswordBearer
+from jose import JWTError, jwt
+from sqlalchemy.orm import Session
+from app.config import settings
+from database.models import User
+from database.session import get_db
+
+oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/auth/login")
+
+def get_current_user(token: str = Depends(oauth2_scheme), db: Session = Depends(get_db)):
+    """Resolve the bearer token to a User row.
+
+    Raises:
+        HTTPException: 401 if the token cannot be decoded, has no usable
+            "sub" claim, or names a user that does not exist.
+    """
+    try:
+        claims = jwt.decode(token, settings.SECRET_KEY, algorithms=[settings.JWT_ALGORITHM])
+        subject = claims.get("sub")
+        # "sub" is looked up as User.id; int() raises ValueError on junk.
+        account = None if subject is None else db.query(User).filter(User.id == int(subject)).first()
+    except (JWTError, ValueError):
+        raise HTTPException(status_code=401, detail="Invalid credentials")
+    if not account:
+        raise HTTPException(status_code=401, detail="Invalid credentials")
+    return account
+
+def require_role(required_role: str):
+    """Build a dependency that admits only users whose role equals required_role."""
+    def role_checker(user=Depends(get_current_user)):
+        if user.role == required_role:
+            return user
+        raise HTTPException(status_code=403, detail="Insufficient permissions")
+    return role_checker
diff --git a/database/models.py b/database/models.py
new file mode 100644
index 0000000..4bf890a
--- /dev/null
+++ b/database/models.py
@@ -0,0 +1,53 @@
+"""SQLAlchemy ORM models for users, deployments and infrastructure records."""
+from sqlalchemy import Column, Integer, String, DateTime, JSON, Text
+from sqlalchemy.orm import declarative_base
+import datetime
+
+Base = declarative_base()
+
+
+def _utcnow():
+    """Return the current UTC time as a naive datetime, matching the DateTime columns."""
+    # datetime.utcnow() is deprecated since Python 3.12; this yields the same value.
+    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
+
+
+class User(Base):
+    """An API account; role defaults to 'user'."""
+    __tablename__ = 'users'
+    id = Column(Integer, primary_key=True, index=True)
+    username = Column(String, unique=True, index=True, nullable=False)
+    hashed_password = Column(String, nullable=False)
+    role = Column(String, default='user', nullable=False)
+    created_at = Column(DateTime, default=_utcnow)
+
+class Deployment(Base):
+    """A deployment record tied to owner_id, with its status and last error."""
+    __tablename__ = 'deployments'
+    id = Column(Integer, primary_key=True, index=True)
+    owner_id = Column(Integer, index=True, nullable=False)
+    name = Column(String, index=True, nullable=False)
+    namespace = Column(String, index=True, nullable=False)
+    image = Column(String, nullable=False)
+    port = Column(Integer, default=80)
+    replicas = Column(Integer, default=1)
+    ingress_host = Column(String)
+    url = Column(String)
+    status = Column(String, default='pending')
+    metadata_json = Column(JSON, default=dict)
+    last_error = Column(Text)
+    created_at = Column(DateTime, default=_utcnow)
+    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)
+
+class Infrastructure(Base):
+    """An infrastructure record tied to owner_id, with its config and status."""
+    __tablename__ = 'infrastructure'
+    id = Column(Integer, primary_key=True, index=True)
+    owner_id = Column(Integer, index=True, nullable=False)
+    name = Column(String, index=True, nullable=False)
+    cloud_provider = Column(String, default='aws')
+    config = Column(JSON, default=dict)
+    status = Column(String, default='provisioning')
+    last_error = Column(Text)
+    created_at = Column(DateTime, default=_utcnow)
+    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)
diff --git a/database/session.py b/database/session.py
new file mode 100644
index 0000000..e268be1
--- /dev/null
+++ b/database/session.py
@@ -0,0 +1,25 @@
+"""Database engine, session factory and the request-scoped session generator."""
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from app.config import settings
+from database.models import Base
+
+
+# check_same_thread is only passed when the URL points at SQLite.
+connect_args = {}
+if settings.DATABASE_URL.startswith("sqlite"):
+    connect_args["check_same_thread"] = False
+engine = create_engine(settings.DATABASE_URL, pool_pre_ping=True, connect_args=connect_args)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+
+def init_db() -> None:
+    """Create the tables registered on Base.metadata using the module engine."""
+    Base.metadata.create_all(bind=engine)
+
+
+def get_db():
+    """Yield a session and close it once the consumer is done, even on error."""
+    with SessionLocal() as db:
+        yield db
diff --git a/helm/charts/README.md b/helm/charts/README.md
new file mode 100644
index 0000000..1e6f06b
--- /dev/null
+++ b/helm/charts/README.md
@@ -0,0 +1,14 @@
+# Helm Charts
+
+Place your Helm charts here for application deployments, ingress, monitoring, etc.
+
+Example structure:
+
+charts/
+  my-app/
+    Chart.yaml
+    values.yaml
+    templates/
+      deployment.yaml
+      service.yaml
+      ingress.yaml
diff --git a/helm/charts/idp-api/Chart.yaml b/helm/charts/idp-api/Chart.yaml
new file mode 100644
index 0000000..47a5193
--- /dev/null
+++ b/helm/charts/idp-api/Chart.yaml
@@ -0,0 +1,4 @@
+apiVersion: v2
+name: idp-api
+version: 0.1.0
+description: A Helm chart for the Cloud Infrastructure Provisioning API
diff --git a/helm/charts/idp-api/templates/_helpers.tpl b/helm/charts/idp-api/templates/_helpers.tpl
new file mode 100644
index 0000000..2d42b28
--- /dev/null
+++ b/helm/charts/idp-api/templates/_helpers.tpl
@@ -0,0 +1,11 @@
+{{- define "idp-api.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{- define "idp-api.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name (include "idp-api.name" .) | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
diff --git a/helm/charts/idp-api/templates/configmap.yaml b/helm/charts/idp-api/templates/configmap.yaml
new file mode 100644
index 0000000..ae96ac7
--- /dev/null
+++ b/helm/charts/idp-api/templates/configmap.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "idp-api.fullname" . }}-config
+data:
+  ENVIRONMENT: {{ .Values.config.environment | quote }}
+  AWS_REGION: {{ .Values.config.awsRegion | quote }}
+  DEFAULT_INGRESS_DOMAIN: {{ .Values.config.defaultIngressDomain | quote }}
+  KUBERNETES_DRY_RUN: {{ .Values.config.kubernetesDryRun | quote }}
+  TERRAFORM_DRY_RUN: {{ .Values.config.terraformDryRun | quote }}
diff --git a/helm/charts/idp-api/templates/deployment.yaml b/helm/charts/idp-api/templates/deployment.yaml
new file mode 100644
index 0000000..03bb6ea
--- /dev/null
+++ b/helm/charts/idp-api/templates/deployment.yaml
@@ -0,0 +1,44 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "idp-api.fullname" . }}
+  labels:
+    app: {{ include "idp-api.name" . }}
+spec:
+  {{- if not .Values.autoscaling.enabled }}
+  replicas: {{ .Values.replicaCount }}
+  {{- end }}
+  selector:
+    matchLabels:
+      app: {{ include "idp-api.name" . }}
+  template:
+    metadata:
+      labels:
+        app: {{ include "idp-api.name" . }}
+    spec:
+      serviceAccountName: {{ include "idp-api.fullname" . }}
+      containers:
+        - name: idp-api
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - containerPort: 8000
+          envFrom:
+            - configMapRef:
+                name: {{ include "idp-api.fullname" . }}-config
+            - secretRef:
+                name: {{ include "idp-api.fullname" . }}-secrets
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 8000
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /readyz
+              port: 8000
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
diff --git a/helm/charts/idp-api/templates/hpa.yaml b/helm/charts/idp-api/templates/hpa.yaml
new file mode 100644
index 0000000..44f0eb8
--- /dev/null
+++ b/helm/charts/idp-api/templates/hpa.yaml
@@ -0,0 +1,20 @@
+{{- if .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "idp-api.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "idp-api.fullname" . }}
+  minReplicas: {{ .Values.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
+{{- end }}
diff --git a/helm/charts/idp-api/templates/ingress.yaml b/helm/charts/idp-api/templates/ingress.yaml
new file mode 100644
index 0000000..b1080ef
--- /dev/null
+++ b/helm/charts/idp-api/templates/ingress.yaml
@@ -0,0 +1,27 @@
+{{- if .Values.ingress.enabled }}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "idp-api.fullname" . }}
+  annotations:
+    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
+spec:
+  ingressClassName: {{ .Values.ingress.className }}
+  rules:
+    {{- range .Values.ingress.hosts }}
+    - host: {{ .host }}
+      http:
+        paths:
+          {{- range .paths }}
+          - path: {{ .path }}
+            pathType: {{ .pathType }}
+            backend:
+              service:
+                name: {{ include "idp-api.fullname" $ }}
+                port:
+                  number: 8000
+          {{- end }}
+    {{- end }}
+  tls:
+    {{- toYaml .Values.ingress.tls | nindent 4 }}
+{{- end }}
diff --git a/helm/charts/idp-api/templates/secret.yaml b/helm/charts/idp-api/templates/secret.yaml
new file mode 100644
index 0000000..942efda
--- /dev/null
+++ b/helm/charts/idp-api/templates/secret.yaml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ include "idp-api.fullname" . }}-secrets
+type: Opaque
+stringData:
+  DATABASE_URL: {{ required "secrets.databaseUrl is required" .Values.secrets.databaseUrl | quote }}
+  REDIS_URL: {{ .Values.secrets.redisUrl | quote }}
+  SECRET_KEY: {{ required "secrets.secretKey is required" .Values.secrets.secretKey | quote }}
diff --git a/helm/charts/idp-api/templates/service.yaml b/helm/charts/idp-api/templates/service.yaml
new file mode 100644
index 0000000..7b809a7
--- /dev/null
+++ b/helm/charts/idp-api/templates/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "idp-api.fullname" . }}
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: 8000
+      protocol: TCP
+      name: http
+  selector:
+    app: {{ include "idp-api.name" . }}
diff --git a/helm/charts/idp-api/templates/serviceaccount.yaml b/helm/charts/idp-api/templates/serviceaccount.yaml
new file mode 100644
index 0000000..869c773
--- /dev/null
+++ b/helm/charts/idp-api/templates/serviceaccount.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "idp-api.fullname" . }}
diff --git a/helm/charts/idp-api/values.yaml b/helm/charts/idp-api/values.yaml
new file mode 100644
index 0000000..6910829
--- /dev/null
+++ b/helm/charts/idp-api/values.yaml
@@ -0,0 +1,39 @@
+replicaCount: 2
+image:
+  repository: your-docker-repo/idp-api
+  tag: latest
+  pullPolicy: IfNotPresent
+service:
+  type: ClusterIP
+  port: 8000
+config:
+  environment: production
+  awsRegion: us-east-1
+  defaultIngressDomain: apps.example.com
+  kubernetesDryRun: "false"
+  terraformDryRun: "false"
+secrets:
+  databaseUrl: ""
+  redisUrl: "redis://redis-master:6379/0"
+  secretKey: ""
+ingress:
+  enabled: true
+  className: "nginx"
+  hosts:
+    - host: idp.local
+      paths:
+        - path: /
+          pathType: Prefix
+  tls: []
+resources:
+  requests:
+    cpu: 100m
+    memory: 256Mi
+  limits:
+    cpu: 500m
+    memory: 512Mi
+autoscaling:
+  enabled: true
+  minReplicas: 2
+  maxReplicas: 10
+  targetCPUUtilizationPercentage: 70
diff --git a/kubernetes/client.py b/kubernetes/client.py
new file mode 100644
index 0000000..d3b1c7f
--- /dev/null
+++ b/kubernetes/client.py
@@ -0,0 +1,29 @@
+"""Kubernetes API client factory."""
+from kubernetes import client, config
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+
+def get_k8s_client():
+    """Load cluster credentials and return the API clients.
+
+    Uses the in-cluster config when KUBERNETES_SERVICE_HOST is set, else the
+    local kubeconfig.
+
+    Returns:
+        Tuple of (CoreV1Api, AppsV1Api, AutoscalingV1Api, NetworkingV1Api).
+
+    Raises:
+        Exception: whatever loading or client construction raised, after logging it.
+    """
+    try:
+        running_in_cluster = bool(os.getenv("KUBERNETES_SERVICE_HOST"))
+        loader = config.load_incluster_config if running_in_cluster else config.load_kube_config
+        loader()
+        logger.info("Loaded in-cluster Kubernetes config" if running_in_cluster else "Loaded local kubeconfig")
+        api_classes = (client.CoreV1Api, client.AppsV1Api, client.AutoscalingV1Api, client.NetworkingV1Api)
+        return tuple(api_class() for api_class in api_classes)
+    except Exception as e:
+        logger.error(f"Failed to load Kubernetes config: {e}")
+        raise
diff --git a/kubernetes/network-policy.yaml b/kubernetes/network-policy.yaml
new file mode 100644
index 0000000..4be378c
--- /dev/null
+++ b/kubernetes/network-policy.yaml
@@ -0,0 +1,39 @@
+# Traffic rules for the idp-api pods.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-idp-api
+  namespace: default
+spec:
+  podSelector:
+    matchLabels:
+      app: idp-api
+  policyTypes:
+    - Ingress
+    - Egress
+  ingress:
+    - from:
+        - podSelector: {}
+      ports:
+        - protocol: TCP
+          port: 8000
+  egress:
+    # DNS: UDP is the primary transport, TCP the fallback for large answers.
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+      ports:
+        - protocol: UDP
+          port: 53
+        - protocol: TCP
+          port: 53
+    # Backends the API depends on; adjust the ports if yours differ.
+    - ports:
+        - protocol: TCP
+          port: 443   # Kubernetes API service / HTTPS endpoints
+        - protocol: TCP
+          port: 6443  # Kubernetes API server when reached on its direct port
+        - protocol: TCP
+          port: 5432  # PostgreSQL (DATABASE_URL)
+        - protocol: TCP
+          port: 6379  # Redis (REDIS_URL)
diff --git a/kubernetes/rbac.yaml b/kubernetes/rbac.yaml
new file mode 100644
index 0000000..dd8216f
--- /dev/null
+++ b/kubernetes/rbac.yaml
@@ -0,0 +1,42 @@
+# ServiceAccount and cluster-wide permissions for the IDP API.
+# The API creates namespaces (cluster-scoped) and manages workloads inside
+# tenant namespaces, so a namespaced Role in "default" cannot authorize it.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: idp-api
+  namespace: default
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: idp-api-role
+rules:
+- apiGroups: [""]
+  resources: ["pods", "services", "namespaces"]
+  verbs: ["get", "list", "create", "delete"]
+- apiGroups: [""]
+  resources: ["secrets"]
+  verbs: ["create", "delete"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["get", "list", "create", "delete"]
+- apiGroups: ["networking.k8s.io"]
+  resources: ["ingresses"]
+  verbs: ["get", "list", "create", "delete"]
+- apiGroups: ["autoscaling"]
+  resources: ["horizontalpodautoscalers"]
+  verbs: ["get", "list", "create", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: idp-api-rolebinding
+subjects:
+- kind: ServiceAccount
+  name: idp-api
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: idp-api-role
+  apiGroup: rbac.authorization.k8s.io
diff --git a/monitoring/grafana-dashboard.json b/monitoring/grafana-dashboard.json
new file mode 100644
index 0000000..0831900
--- /dev/null
+++ b/monitoring/grafana-dashboard.json
@@ -0,0 +1,24 @@
+{
+  "dashboard": {
+    "id": null,
+    "title": "IDP API Overview",
+    "panels": [
+      {
+        "type": "graph",
+        "title": "Request Rate",
+        "targets": [
+          { "expr": "rate(http_requests_total[5m])", "format": "time_series" }
+        ],
+        "datasource": "Prometheus"
+      },
+      {
+        "type": "graph",
+        "title": "Error Rate",
+        "targets": [
+          { "expr": "rate(http_requests_total{status=\"500\"}[5m])", "format": "time_series" }
+        ],
+        "datasource": "Prometheus"
+      }
+    ]
+  }
+}
diff --git a/monitoring/prometheus-scrape-config.yaml b/monitoring/prometheus-scrape-config.yaml
new file mode 100644
index 0000000..a0f273e
--- /dev/null
+++ b/monitoring/prometheus-scrape-config.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-scrape-config
+  namespace: monitoring
+  labels:
+    app: prometheus
+  annotations:
+    prometheus.io/scrape: 'true'
+data:
+  prometheus.yml: |
+    scrape_configs:
+      - job_name: 'idp-api'
+        static_configs:
+          - targets: ['idp-api.default.svc.cluster.local:8000']
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..73261cc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+fastapi>=0.110,<0.111
+uvicorn[standard]>=0.27,<1.0
+pydantic>=1.10,<2.0
+kubernetes>=29,<31
+python-jose[cryptography]>=3.3,<4.0
+passlib[bcrypt]>=1.7,<2.0
+bcrypt>=4.0,<5.0
+psycopg2-binary>=2.9,<3.0
+sqlalchemy>=2.0,<3.0
+redis>=5.0,<6.0
+httpx>=0.27,<1.0
+jinja2>=3.1,<4.0
+python-dotenv>=1.0,<2.0
+prometheus-client>=0.20,<1.0
+pytest>=8.0,<9.0
+pytest-asyncio>=0.23,<1.0
diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh
new file mode 100644
index 0000000..58b847e
--- /dev/null
+++ b/scripts/bootstrap.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+echo "[+] Bootstrapping local dev environment..."
+
+# Create Python venv
+echo "[+] Creating Python virtual environment..."
+python3 -m venv .venv
+source .venv/bin/activate
+
+# Install dependencies
+pip install --upgrade pip
+pip install -r requirements.txt
+
+echo "[+] Done."
diff --git a/scripts/migrate_db.sh b/scripts/migrate_db.sh
new file mode 100644
index 0000000..a404bf3
--- /dev/null
+++ b/scripts/migrate_db.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+# Example migration script using Alembic (if used)
+
+if [ ! -f alembic.ini ]; then
+  echo "[!] alembic.ini not found. Please set up Alembic."
+  exit 1
+fi
+
+alembic upgrade head
diff --git a/scripts/prod_checklist.md b/scripts/prod_checklist.md
new file mode 100644
index 0000000..09b5d68
--- /dev/null
+++ b/scripts/prod_checklist.md
@@ -0,0 +1,44 @@
+# Production Readiness Checklist for IDP API
+
+## Infrastructure
+- [ ] Terraform S3 backend configured
+- [ ] AWS resources provisioned (VPC, EKS, IAM)
+- [ ] Terraform state bucket secured
+
+## Kubernetes
+- [ ] Ingress controller deployed (NGINX/ALB)
+- [ ] Prometheus and Grafana installed
+- [ ] RBAC and ServiceAccount applied
+- [ ] NetworkPolicy applied
+- [ ] Secrets created and referenced
+- [ ] App deployed via Helm
+- [ ] HTTPS enforced at ingress
+
+## Application
+- [ ] All secrets from env/K8s Secrets
+- [ ] JWT, RBAC, rate limiting enabled
+- [ ] Liveness/readiness probes configured
+- [ ] Structured logging enabled
+- [ ] Prometheus metrics exposed
+
+## CI/CD
+- [ ] GitHub Actions for build/test/deploy
+- [ ] Security scanning (Trivy/Snyk)
+- [ ] Automated DB migrations
+
+## Observability
+- [ ] Prometheus scraping `/monitoring/metrics`
+- [ ] Grafana dashboards imported
+- [ ] Alerting configured
+
+## Security
+- [ ] HTTPS everywhere
+- [ ] RBAC and network policies
+- [ ] Non-root containers
+- [ ] Regular secret rotation
+
+## Advanced
+- [ ] Multi-tenancy (namespaces, RBAC)
+- [ ] GitOps (ArgoCD/Flux)
+- [ ] Cost estimation
+- [ ] Canary/blue-green deployments
diff --git a/scripts/setup_env.sh b/scripts/setup_env.sh
new file mode 100644
index 0000000..3407bbf
--- /dev/null
+++ b/scripts/setup_env.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Create a local .env from the committed template without clobbering an existing one.
+set -e
+
+if [ -f .env ]; then
+  echo "[!] .env already exists; leaving it untouched."
+else
+  cp .env.example .env
+fi
diff --git a/services/autoscaling_service.py b/services/autoscaling_service.py
new file mode 100644
index 0000000..faf4358
--- /dev/null
+++ b/services/autoscaling_service.py
@@ -0,0 +1,46 @@
+"""Horizontal Pod Autoscaler provisioning."""
+from kubernetes import client
+import logging
+from app.config import settings
+from services.kubernetes_client import get_k8s_client
+
+logger = logging.getLogger(__name__)
+
+def create_hpa(namespace: str, deployment: str, min_replicas: int, max_replicas: int, cpu_threshold: int):
+    """Create an autoscaling/v1 HPA named after, and targeting, a Deployment.
+
+    Returns:
+        True if the HPA was created, already existed, or dry-run mode is on;
+        False if client setup or the API call failed (the error is logged).
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        logger.info("Dry run: hpa %s/%s would be created", namespace, deployment)
+        return True
+    try:
+        _core_v1, _apps_v1, autoscaling_v1, _networking_v1 = get_k8s_client()
+        target = client.V1CrossVersionObjectReference(api_version="apps/v1", kind="Deployment", name=deployment)
+        hpa_spec = client.V1HorizontalPodAutoscalerSpec(
+            scale_target_ref=target,
+            min_replicas=min_replicas,
+            max_replicas=max_replicas,
+            target_cpu_utilization_percentage=cpu_threshold,
+        )
+        hpa_body = client.V1HorizontalPodAutoscaler(
+            api_version="autoscaling/v1",
+            kind="HorizontalPodAutoscaler",
+            metadata=client.V1ObjectMeta(name=deployment, namespace=namespace),
+            spec=hpa_spec,
+        )
+        autoscaling_v1.create_namespaced_horizontal_pod_autoscaler(namespace=namespace, body=hpa_body)
+    except client.rest.ApiException as e:
+        # 409 means the HPA is already there; treat it as success.
+        if e.status == 409:
+            logger.info("HPA for '%s' already exists.", deployment)
+            return True
+        logger.error("Error creating HPA: %s", e)
+        return False
+    except Exception as e:
+        logger.error("Unexpected error: %s", e)
+        return False
+    logger.info("HPA for deployment '%s' created in namespace '%s'.", deployment, namespace)
+    return True
diff --git a/services/deployment_service.py b/services/deployment_service.py
new file mode 100644
index 0000000..010ec59
--- /dev/null
+++ b/services/deployment_service.py
@@ -0,0 +1,249 @@
+"""Provisioning of user applications on Kubernetes and their database records."""
+import logging
+from typing import Dict, Optional
+
+from kubernetes import client
+from kubernetes.client.rest import ApiException
+from sqlalchemy.orm import Session
+
+from app.config import settings
+from api.schemas import DeploymentCreateRequest
+from database.models import Deployment, User
+from services.kubernetes_client import get_k8s_client
+from services.autoscaling_service import create_hpa
+from services.ingress_service import create_ingress
+from services.k8s_service import create_namespace
+from services.service_service import expose_service
+
+logger = logging.getLogger(__name__)
+
+
+def _default_namespace(user: User, name: str) -> str:
+    """Derive "<prefix>-<user id>-<name>", cut to 63 characters with no trailing dash."""
+    return f"{settings.KUBERNETES_NAMESPACE_PREFIX}-{user.id}-{name}"[:63].rstrip("-")
+
+
+def _default_host(user: User, name: str) -> str:
+    """Derive "<name>-<user id>.<DEFAULT_INGRESS_DOMAIN>"."""
+    return f"{name}-{user.id}.{settings.DEFAULT_INGRESS_DOMAIN}"
+
+
+def validate_docker_image(image: str) -> None:
+    """Reject obviously malformed image references.
+
+    Raises:
+        ValueError: if image is empty, contains whitespace, or starts or ends with ":".
+    """
+    if not image or any(ch.isspace() for ch in image) or image.startswith(":") or image.endswith(":"):
+        raise ValueError("Invalid Docker image reference")
+    if "/" not in image and ":" not in image:
+        # Still allow official images, but force explicit tags in production.
+        logger.warning("Image '%s' has no registry or tag; use immutable tags in production", image)
+
+
+def create_secret(namespace: str, name: str, data: Dict[str, str]) -> bool:
+    """Create an Opaque Secret holding data.
+
+    Returns:
+        True if data is empty (nothing to create), dry-run mode is on, or the
+        Secret was created or already existed; False on any other API error.
+    """
+    if not data:
+        return True
+    if settings.KUBERNETES_DRY_RUN:
+        logger.info("Dry run: secret %s/%s would be created", namespace, name)
+        return True
+    try:
+        v1, _, _, _ = get_k8s_client()
+        secret = client.V1Secret(
+            api_version="v1",
+            kind="Secret",
+            metadata=client.V1ObjectMeta(name=name, namespace=namespace),
+            type="Opaque",
+            string_data=data,
+        )
+        v1.create_namespaced_secret(namespace=namespace, body=secret)
+        return True
+    except ApiException as exc:
+        if exc.status == 409:
+            logger.info("Secret '%s' already exists.", name)
+            return True
+        logger.error("Error creating secret: %s", exc)
+        return False
+
+
+def create_deployment(
+    namespace: str,
+    name: str,
+    image: str,
+    port: int = 80,
+    replicas: int = 1,
+    secret_name: Optional[str] = None,
+):
+    """Create an apps/v1 Deployment running one container of image.
+
+    Pods are labelled app=<name>; when secret_name is given, that Secret is
+    attached to the container through env_from.
+
+    Returns:
+        True if created, already present, or in dry-run mode; False on failure.
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        logger.info("Dry run: deployment %s/%s with image %s would be created", namespace, name, image)
+        return True
+    try:
+        _, apps_v1, _, _ = get_k8s_client()
+        container = client.V1Container(
+            name=name,
+            image=image,
+            ports=[client.V1ContainerPort(container_port=port)],
+            env_from=[
+                client.V1EnvFromSource(secret_ref=client.V1SecretEnvSource(name=secret_name))
+            ] if secret_name else None,
+        )
+        template = client.V1PodTemplateSpec(
+            metadata=client.V1ObjectMeta(labels={"app": name}),
+            spec=client.V1PodSpec(containers=[container])
+        )
+        spec = client.V1DeploymentSpec(
+            replicas=replicas,
+            template=template,
+            selector={'matchLabels': {'app': name}}
+        )
+        deployment = client.V1Deployment(
+            api_version="apps/v1",
+            kind="Deployment",
+            metadata=client.V1ObjectMeta(name=name, namespace=namespace),
+            spec=spec
+        )
+        apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
+        logger.info("Deployment '%s' created in namespace '%s'.", name, namespace)
+        return True
+    except ApiException as e:
+        if e.status == 409:
+            logger.info("Deployment '%s' already exists.", name)
+            return True
+        logger.error("Error creating deployment: %s", e)
+        return False
+    except Exception as e:
+        logger.error("Unexpected error: %s", e)
+        return False
+
+
+def delete_kubernetes_deployment(namespace: str, name: str) -> None:
+    """Delete the ingress, HPA, service, deployment and env secret of an app.
+
+    Resources that are already gone (404) are skipped.
+
+    Raises:
+        ApiException: for any API error other than 404.
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        logger.info("Dry run: Kubernetes resources for %s/%s would be deleted", namespace, name)
+        return
+    v1, apps_v1, autoscaling_v1, networking_v1 = get_k8s_client()
+    for delete_call, kwargs in [
+        (networking_v1.delete_namespaced_ingress, {"name": name, "namespace": namespace}),
+        (autoscaling_v1.delete_namespaced_horizontal_pod_autoscaler, {"name": name, "namespace": namespace}),
+        (v1.delete_namespaced_service, {"name": name, "namespace": namespace}),
+        (apps_v1.delete_namespaced_deployment, {"name": name, "namespace": namespace}),
+        (v1.delete_namespaced_secret, {"name": f"{name}-env", "namespace": namespace}),
+    ]:
+        try:
+            delete_call(**kwargs)
+        except ApiException as exc:
+            if exc.status != 404:
+                raise
+
+
+def get_kubernetes_deployment_status(namespace: str, name: str) -> Dict[str, Optional[int]]:
+    """Return replica counts read from the Deployment (fixed values in dry-run mode)."""
+    if settings.KUBERNETES_DRY_RUN:
+        return {"available_replicas": 1, "ready_replicas": 1, "replicas": 1}
+    _, apps_v1, _, _ = get_k8s_client()
+    deployment = apps_v1.read_namespaced_deployment(name=name, namespace=namespace)
+    status = deployment.status
+    return {
+        "available_replicas": status.available_replicas or 0,
+        "ready_replicas": status.ready_replicas or 0,
+        "replicas": status.replicas or 0,
+    }
+
+
+def provision_application(db: Session, user: User, request: DeploymentCreateRequest) -> Deployment:
+    """Record a deployment and apply its Kubernetes resources in order.
+
+    The row is committed with status "provisioning" first, then updated to
+    "running" or "failed" (with last_error set) depending on the outcome.
+
+    Raises:
+        ValueError: if request.image fails validate_docker_image.
+    """
+    validate_docker_image(request.image)
+    namespace = request.namespace or _default_namespace(user, request.name)
+    ingress_host = request.ingress_host or _default_host(user, request.name)
+    secret_name = f"{request.name}-env"
+    deployment = Deployment(
+        owner_id=user.id,
+        name=request.name,
+        namespace=namespace,
+        image=request.image,
+        port=request.port,
+        replicas=request.replicas,
+        ingress_host=ingress_host,
+        url=f"https://{ingress_host}",
+        status="provisioning",
+        metadata_json={"env_keys": sorted(request.env.keys())},
+    )
+    db.add(deployment)
+    db.commit()
+    db.refresh(deployment)
+
+    # Steps are deferred callables so that a failure stops the sequence; an
+    # eagerly built list would keep creating resources after an earlier step
+    # (for example the namespace) had already failed.
+    steps = [
+        ("namespace", lambda: create_namespace(namespace)),
+        ("secret", lambda: create_secret(namespace, secret_name, request.env)),
+        (
+            "deployment",
+            lambda: create_deployment(
+                namespace,
+                request.name,
+                request.image,
+                request.port,
+                request.replicas,
+                secret_name=secret_name if request.env else None,
+            ),
+        ),
+        ("service", lambda: expose_service(namespace, request.name, 80, request.port)),
+        ("ingress", lambda: create_ingress(namespace, request.name, request.name, 80, ingress_host)),
+        (
+            "autoscaler",
+            lambda: create_hpa(
+                namespace, request.name, request.min_replicas, request.max_replicas, request.cpu_threshold
+            ),
+        ),
+    ]
+    try:
+        for step_name, apply_step in steps:
+            if not apply_step():
+                raise RuntimeError(f"One or more Kubernetes resources failed to apply (failed step: {step_name})")
+        deployment.status = "running"
+        deployment.metadata_json = {
+            **(deployment.metadata_json or {}),
+            "autoscaling": {
+                "min_replicas": request.min_replicas,
+                "max_replicas": request.max_replicas,
+                "cpu_threshold": request.cpu_threshold,
+            },
+            "dry_run": settings.KUBERNETES_DRY_RUN,
+        }
+    except Exception as exc:
+        logger.exception("Deployment provisioning failed")
+        deployment.status = "failed"
+        deployment.last_error = str(exc)
+    db.add(deployment)
+    db.commit()
+    db.refresh(deployment)
+    return deployment
diff --git a/services/infra_service.py b/services/infra_service.py
new file mode 100644
index 0000000..2410fac
--- /dev/null
+++ b/services/infra_service.py
@@ -0,0 +1,148 @@
+import logging
+import os
+import subprocess
+import tempfile
+
+from jinja2 import Template
+
+from app.config import settings
+
+logger = logging.getLogger(__name__)
+
+TERRAFORM_TEMPLATE = os.path.join(os.path.dirname(__file__), '../terraform/main.tf.j2')
+
+# Terraform sub-commands run, in order, for each supported action. '-input=false'
+# makes Terraform fail instead of waiting on an interactive prompt.
+_TERRAFORM_COMMANDS = {
+    'apply': [
+        ['terraform', 'init', '-input=false'],
+        ['terraform', 'plan', '-input=false', '-out=tfplan'],
+        ['terraform', 'apply', '-input=false', '-auto-approve', 'tfplan'],
+    ],
+    'destroy': [
+        ['terraform', 'init', '-input=false'],
+        ['terraform', 'destroy', '-input=false', '-auto-approve'],
+    ],
+}
+
+# Fragments that would let a value escape the quoted HCL string it is rendered into.
+_UNSAFE_HCL_FRAGMENTS = ('"', '\\', '\n', '\r', '${', '%{')
+
+
+def render_terraform_config(context: dict) -> str:
+    """Render the Terraform Jinja2 template with ``context`` and return the result.
+
+    Raises:
+        OSError: if the template file cannot be read.
+    """
+    with open(TERRAFORM_TEMPLATE, encoding='utf-8') as f:
+        template = Template(f.read())
+    return template.render(**context)
+
+
+def run_terraform(directory: str, action: str = 'apply'):
+    """Run the Terraform command sequence for ``action`` inside ``directory``.
+
+    Args:
+        directory: Working directory containing the rendered configuration.
+        action: ``'apply'`` (init, plan, apply) or ``'destroy'`` (init, destroy).
+
+    Returns:
+        True when every command exits with status 0, or immediately (without
+        running anything) when ``settings.TERRAFORM_DRY_RUN`` is set.
+
+    Raises:
+        ValueError: if ``action`` is not a supported action.
+        RuntimeError: if a Terraform command exits with a non-zero status.
+        subprocess.TimeoutExpired: if a command runs longer than 1800 seconds.
+    """
+    # Reject unknown actions explicitly: silently falling back to 'destroy'
+    # would turn a typo into the deletion of real infrastructure.
+    if action not in _TERRAFORM_COMMANDS:
+        raise ValueError(f"Unsupported Terraform action: {action!r}")
+    if settings.TERRAFORM_DRY_RUN:
+        return True
+    for cmd in _TERRAFORM_COMMANDS[action]:
+        proc = subprocess.run(cmd, cwd=directory, capture_output=True, text=True, timeout=1800)
+        if proc.returncode != 0:
+            raise RuntimeError(f"Terraform {cmd[1]} failed: {proc.stderr}")
+    return True
+
+
+def _build_context(name: str, config: dict) -> dict:
+    """Build the template variables for cluster ``name``; ``config`` overrides defaults.
+
+    Raises:
+        ValueError: if a value could break out of the quoted HCL string it is
+            rendered into.
+    """
+    config = config or {}
+    context = {
+        'aws_region': config.get('aws_region', settings.AWS_REGION),
+        'cluster_name': name,
+        # NOTE(review): the default ARN uses a placeholder account id; real
+        # deployments presumably need to pass 'eks_role_arn' explicitly.
+        'eks_role_arn': config.get('eks_role_arn', 'arn:aws:iam::123456789012:role/EKSRole'),
+        'state_bucket': config.get('state_bucket', settings.TERRAFORM_STATE_BUCKET),
+        'lock_table': config.get('lock_table', settings.TERRAFORM_LOCK_TABLE),
+    }
+    for key, value in context.items():
+        text = str(value)
+        if any(fragment in text for fragment in _UNSAFE_HCL_FRAGMENTS):
+            raise ValueError(f"Unsafe characters in Terraform variable '{key}'")
+    return context
+
+
+def _run_action(name: str, config: dict, action: str):
+    """Render the configuration into a temporary directory and run ``action`` on it.
+
+    Returns True on success, otherwise the error message as a string.
+    """
+    try:
+        context = _build_context(name, config)
+    except ValueError as exc:
+        logger.warning("Rejected Terraform %s for %r: %s", action, name, exc)
+        return str(exc)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tf_path = os.path.join(tmpdir, 'main.tf')
+        with open(tf_path, 'w', encoding='utf-8') as f:
+            f.write(render_terraform_config(context))
+        try:
+            run_terraform(tmpdir, action)
+            return True
+        except Exception as exc:
+            logger.exception("Terraform %s failed for %r", action, name)
+            # A failure must never be reported as an empty (falsy) string.
+            return str(exc) or repr(exc)
+
+
+def provision_infrastructure(name: str, cloud_provider: str, config: dict):
+    """Provision the infrastructure for cluster ``name`` with ``terraform apply``.
+
+    Args:
+        name: Cluster name; the template also uses it in the state key.
+        cloud_provider: Accepted for interface compatibility; currently unused.
+        config: Optional overrides for ``aws_region``, ``eks_role_arn``,
+            ``state_bucket`` and ``lock_table``.
+
+    Returns:
+        True on success, otherwise the error message as a string. A non-empty
+        string is truthy, so callers must test ``result is True``.
+    """
+    return _run_action(name, config, 'apply')
+
+
+def destroy_infrastructure(name: str, cloud_provider: str, config: dict):
+    """Destroy the infrastructure of cluster ``name`` with ``terraform destroy``.
+
+    Args:
+        name: Cluster name used when the infrastructure was provisioned.
+        cloud_provider: Accepted for interface compatibility; currently unused.
+        config: Optional overrides; they must match the values used to
+            provision so that the same remote state is addressed.
+
+    Returns:
+        True on success, otherwise the error message as a string. A non-empty
+        string is truthy, so callers must test ``result is True``.
+    """
+    return _run_action(name, config, 'destroy')
diff --git a/services/ingress_service.py b/services/ingress_service.py
new file mode 100644
index 0000000..6a53815
--- /dev/null
+++ b/services/ingress_service.py
@@ -0,0 +1,52 @@
+import logging
+
+from kubernetes import client
+
+from app.config import settings
+from services.kubernetes_client import get_k8s_client
+
+logger = logging.getLogger(__name__)
+
+
+def create_ingress(namespace: str, name: str, service_name: str, service_port: int, host: str):
+    """Create an Ingress that routes ``host`` (path prefix ``/``) to a Service port.
+
+    Returns True when the Ingress was created, when it already exists (HTTP
+    409), or when ``settings.KUBERNETES_DRY_RUN`` is set (nothing is created).
+    Any other failure is logged and reported as False instead of being raised.
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        logger.info("Dry run: ingress %s/%s for host %s would be created", namespace, name, host)
+        return True
+    try:
+        _, _, _, networking_api = get_k8s_client()
+        backend = client.V1IngressBackend(
+            service=client.V1IngressServiceBackend(
+                name=service_name,
+                port=client.V1ServiceBackendPort(number=service_port),
+            )
+        )
+        root_path = client.V1HTTPIngressPath(path="/", path_type="Prefix", backend=backend)
+        rule = client.V1IngressRule(
+            host=host,
+            http=client.V1HTTPIngressRuleValue(paths=[root_path]),
+        )
+        manifest = client.V1Ingress(
+            api_version="networking.k8s.io/v1",
+            kind="Ingress",
+            metadata=client.V1ObjectMeta(name=name, namespace=namespace),
+            spec=client.V1IngressSpec(rules=[rule]),
+        )
+        networking_api.create_namespaced_ingress(namespace=namespace, body=manifest)
+        logger.info("Ingress '%s' created for service '%s' on host '%s'.", name, service_name, host)
+        return True
+    except client.rest.ApiException as api_error:
+        # 409 Conflict: the Ingress is already there, which counts as success.
+        if api_error.status != 409:
+            logger.error("Error creating ingress: %s", api_error)
+            return False
+        logger.info("Ingress '%s' already exists.", name)
+        return True
+    except Exception as error:
+        logger.error("Unexpected error: %s", error)
+        return False
diff --git a/services/k8s_service.py b/services/k8s_service.py
new file mode 100644
index 0000000..d9d2f40
--- /dev/null
+++ b/services/k8s_service.py
@@ -0,0 +1,36 @@
+import logging
+
+from kubernetes.client import V1Namespace, V1ObjectMeta
+from kubernetes.client.rest import ApiException
+
+from app.config import settings
+from services.kubernetes_client import get_k8s_client
+
+logger = logging.getLogger(__name__)
+
+
+def create_namespace(name: str):
+    """Create the namespace ``name`` in the cluster.
+
+    Returns True when the namespace was created, when it already exists (HTTP
+    409), or when ``settings.KUBERNETES_DRY_RUN`` is set (nothing is created).
+    Any other failure is logged and reported as False instead of being raised.
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        logger.info("Dry run: namespace %s would be created", name)
+        return True
+    try:
+        core_api, _, _, _ = get_k8s_client()
+        core_api.create_namespace(V1Namespace(metadata=V1ObjectMeta(name=name)))
+        logger.info("Namespace '%s' created.", name)
+        return True
+    except ApiException as api_error:
+        # 409 Conflict: the namespace is already there, which counts as success.
+        if api_error.status != 409:
+            logger.error("Error creating namespace: %s", api_error)
+            return False
+        logger.info("Namespace '%s' already exists.", name)
+        return True
+    except Exception as error:
+        logger.error("Unexpected error: %s", error)
+        return False
diff --git a/services/kubernetes_client.py b/services/kubernetes_client.py
new file mode 100644
index 0000000..53b28c1
--- /dev/null
+++ b/services/kubernetes_client.py
@@ -0,0 +1,36 @@
+import logging
+import os
+
+from kubernetes import client, config
+
+logger = logging.getLogger(__name__)
+
+
+def get_k8s_client():
+    """Load the Kubernetes configuration and return the API clients.
+
+    The in-cluster configuration is used when ``KUBERNETES_SERVICE_HOST`` is set
+    in the environment; otherwise the local kubeconfig is loaded.
+
+    Returns:
+        A ``(CoreV1Api, AppsV1Api, AutoscalingV1Api, NetworkingV1Api)`` tuple.
+
+    Raises:
+        Exception: whatever loading the configuration or building the clients
+            raised, re-raised after being logged.
+    """
+    try:
+        if os.getenv("KUBERNETES_SERVICE_HOST"):
+            load_config, message = config.load_incluster_config, "Loaded in-cluster Kubernetes config"
+        else:
+            load_config, message = config.load_kube_config, "Loaded local kubeconfig"
+        load_config()
+        logger.info(message)
+        core_api = client.CoreV1Api()
+        apps_api = client.AppsV1Api()
+        autoscaling_api = client.AutoscalingV1Api()
+        networking_api = client.NetworkingV1Api()
+        return core_api, apps_api, autoscaling_api, networking_api
+    except Exception:
+        logger.exception("Failed to load Kubernetes config")
+        raise
diff --git a/services/monitoring_service.py b/services/monitoring_service.py
new file mode 100644
index 0000000..759350a
--- /dev/null
+++ b/services/monitoring_service.py
@@ -0,0 +1,55 @@
+import logging
+
+from app.config import settings
+from services.kubernetes_client import get_k8s_client
+
+logger = logging.getLogger(__name__)
+
+
+def _node_is_ready(node) -> bool:
+    """Return True when ``node`` reports a ``Ready`` condition with status ``"True"``."""
+    # status/conditions may be None when the API omits them; treat that as not ready.
+    conditions = getattr(node.status, "conditions", None) or []
+    return any(cond.type == "Ready" and cond.status == "True" for cond in conditions)
+
+
+def get_cluster_health():
+    """Report cluster health based on the readiness of its nodes.
+
+    Returns:
+        ``{"status": "healthy"}`` when every node is Ready (or the cluster has
+        no nodes), ``{"status": "unhealthy"}`` when at least one node is not,
+        ``{"status": "error", "detail": ...}`` when the API call fails, and
+        ``{"status": "healthy", "mode": "dry-run"}`` when
+        ``settings.KUBERNETES_DRY_RUN`` is set.
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        return {"status": "healthy", "mode": "dry-run"}
+    try:
+        v1, _, _, _ = get_k8s_client()
+        nodes = v1.list_node()
+        if all(_node_is_ready(node) for node in nodes.items):
+            return {"status": "healthy"}
+        return {"status": "unhealthy"}
+    except Exception as e:
+        logger.error("Error checking cluster health: %s", e)
+        return {"status": "error", "detail": str(e)}
+
+
+def get_pod_logs(namespace: str, pod: str):
+    """Fetch the logs of ``pod`` in ``namespace``.
+
+    Returns:
+        A dict with ``pod``, ``namespace`` and ``logs``. When the API call
+        fails, ``logs`` carries the error text instead; in dry-run mode it
+        carries a placeholder message and the API is not called.
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        return {"pod": pod, "namespace": namespace, "logs": "dry-run: Kubernetes API not called"}
+    try:
+        v1, _, _, _ = get_k8s_client()
+        logs = v1.read_namespaced_pod_log(name=pod, namespace=namespace)
+        return {"pod": pod, "namespace": namespace, "logs": logs}
+    except Exception as e:
+        logger.error("Error fetching logs for pod %s: %s", pod, e)
+        return {"pod": pod, "namespace": namespace, "logs": str(e)}
diff --git a/services/service_service.py b/services/service_service.py
new file mode 100644
index 0000000..be38ba4
--- /dev/null
+++ b/services/service_service.py
@@ -0,0 +1,49 @@
+import logging
+
+from kubernetes import client
+
+from app.config import settings
+from services.kubernetes_client import get_k8s_client
+
+logger = logging.getLogger(__name__)
+
+
+def expose_service(namespace: str, name: str, port: int, target_port: int, type_: str = "ClusterIP"):
+    """Create a Service ``name`` that selects pods labelled ``app=<name>``.
+
+    ``port`` is the Service port, ``target_port`` the port it forwards to and
+    ``type_`` the Service type.
+
+    Returns True when the Service was created, when it already exists (HTTP
+    409), or when ``settings.KUBERNETES_DRY_RUN`` is set (nothing is created).
+    Any other failure is logged and reported as False instead of being raised.
+    """
+    if settings.KUBERNETES_DRY_RUN:
+        logger.info("Dry run: service %s/%s would be exposed", namespace, name)
+        return True
+    try:
+        core_api, _, _, _ = get_k8s_client()
+        spec = client.V1ServiceSpec(
+            selector={"app": name},
+            ports=[client.V1ServicePort(port=port, target_port=target_port)],
+            type=type_,
+        )
+        manifest = client.V1Service(
+            api_version="v1",
+            kind="Service",
+            metadata=client.V1ObjectMeta(name=name, namespace=namespace),
+            spec=spec,
+        )
+        core_api.create_namespaced_service(namespace=namespace, body=manifest)
+        logger.info("Service '%s' exposed on port %s in namespace '%s'.", name, port, namespace)
+        return True
+    except client.rest.ApiException as api_error:
+        # 409 Conflict: the Service is already there, which counts as success.
+        if api_error.status != 409:
+            logger.error("Error exposing service: %s", api_error)
+            return False
+        logger.info("Service '%s' already exists.", name)
+        return True
+    except Exception as error:
+        logger.error("Unexpected error: %s", error)
+        return False
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..8837be9
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 120
+extend-ignore = E302,E402
+exclude = .git,.venv,.history,__pycache__
diff --git a/terraform/main.tf.j2 b/terraform/main.tf.j2
new file mode 100644
index 0000000..a832f61
--- /dev/null
+++ b/terraform/main.tf.j2
@@ -0,0 +1,53 @@
+
+terraform {
+  backend "s3" {
+    bucket         = "{{ state_bucket }}"
+    key            = "idp/{{ cluster_name }}/terraform.tfstate"
+    region         = "{{ aws_region }}"
+    encrypt        = true
+    dynamodb_table = "{{ lock_table }}"
+  }
+}
+
+provider "aws" {
+  region = "{{ aws_region }}"
+}
+
+# Availability zones currently usable in the selected region.
+data "aws_availability_zones" "available" {
+  state = "available"
+}
+
+resource "aws_vpc" "main" {
+  cidr_block = "10.0.0.0/16"
+}
+
+# EKS needs subnets in at least two availability zones, so create one
+# subnet in each of the first two zones (10.0.1.0/24 and 10.0.2.0/24).
+resource "aws_subnet" "main" {
+  count             = 2
+  vpc_id            = aws_vpc.main.id
+  cidr_block        = cidrsubnet(aws_vpc.main.cidr_block, 8, count.index + 1)
+  availability_zone = data.aws_availability_zones.available.names[count.index]
+}
+
+# Security group attached to the cluster; allows all outbound traffic.
+resource "aws_security_group" "main" {
+  name_prefix = "{{ cluster_name }}-eks-"
+  vpc_id      = aws_vpc.main.id
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+}
+
+resource "aws_eks_cluster" "main" {
+  name     = "{{ cluster_name }}"
+  role_arn = "{{ eks_role_arn }}"
+  vpc_config {
+    subnet_ids         = aws_subnet.main[*].id
+    security_group_ids = [aws_security_group.main.id]
+  }
+}
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..85f9d0d
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,18 @@
+import os
+import sys
+
+import pytest
+from fastapi.testclient import TestClient
+
+# The repository root must be importable before the application is imported below.
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.insert(0, _REPO_ROOT)
+
+from app.main import app
+
+
+@pytest.fixture(scope="module")
+def client():
+    """Yield a module-scoped ``TestClient`` for the application."""
+    with TestClient(app) as test_client:
+        yield test_client
diff --git a/tests/unit/test_auth.py b/tests/unit/test_auth.py
new file mode 100644
index 0000000..cc172a5
--- /dev/null
+++ b/tests/unit/test_auth.py
@@ -0,0 +1,19 @@
+_CREDENTIALS = {"username": "testuser", "password": "testpass123"}
+
+
+def test_register(client):
+    """Registration returns 200, or 400 with a duplicate-user message."""
+    response = client.post("/auth/register", json=_CREDENTIALS)
+    if response.status_code != 400:
+        assert response.status_code == 200
+    else:
+        assert response.json()["detail"] == "Username already registered"
+
+
+def test_login(client):
+    """Logging in with registered credentials returns an access token."""
+    # Make sure the user exists; the result of this registration is not checked.
+    client.post("/auth/register", json=_CREDENTIALS)
+    response = client.post("/auth/login", json=_CREDENTIALS)
+    assert response.status_code == 200
+    assert "access_token" in response.json()
diff --git a/tests/unit/test_health.py b/tests/unit/test_health.py
new file mode 100644
index 0000000..fade712
--- /dev/null
+++ b/tests/unit/test_health.py
@@ -0,0 +1,6 @@
+def test_health(client):
+    """``/healthz`` answers 200 with ``status`` set to ``"ok"``."""
+    health = client.get("/healthz")
+    assert health.status_code == 200
+    body = health.json()
+    assert body["status"] == "ok"
diff --git a/tests/unit/test_namespace.py b/tests/unit/test_namespace.py
new file mode 100644
index 0000000..e3d3873
--- /dev/null
+++ b/tests/unit/test_namespace.py
@@ -0,0 +1,13 @@
+def test_create_namespace(client):
+    """A registered, logged-in user calls the namespace-creation endpoint."""
+    credentials = {"username": "namespacetest", "password": "testpass123"}
+    client.post("/auth/register", json=credentials)
+    login = client.post("/auth/login", json=credentials)
+    auth_header = {"Authorization": f"Bearer {login.json()['access_token']}"}
+    response = client.post(
+        "/kubernetes/namespace/create",
+        json={"name": "test-ns"},
+        headers=auth_header,
+    )
+    # NOTE(review): accepting 500 means this test also passes when creation fails.
+    assert response.status_code in [200, 500]  # 500 if cluster not available